module Encoding (
utf8DecodeChar#,
utf8PrevChar,
utf8CharStart,
utf8DecodeChar,
utf8DecodeString,
utf8EncodeChar,
utf8EncodeString,
utf8EncodedLength,
countUTF8Chars,
zEncodeString,
zDecodeString
) where
#include "HsVersions.h"
import Foreign
import Data.Char
import Numeric
import GHC.Ptr ( Ptr(..) )
import GHC.Base
utf8DecodeChar# :: Addr# -> (# Char#, Addr# #)
utf8DecodeChar# a# =
let !ch0 = word2Int# (indexWord8OffAddr# a# 0#) in
case () of
_ | ch0 <=# 0x7F# -> (# chr# ch0, a# `plusAddr#` 1# #)
| ch0 >=# 0xC0# && ch0 <=# 0xDF# ->
let !ch1 = word2Int# (indexWord8OffAddr# a# 1#) in
if ch1 <# 0x80# || ch1 >=# 0xC0# then fail 1# else
(# chr# (((ch0 -# 0xC0#) `uncheckedIShiftL#` 6#) +#
(ch1 -# 0x80#)),
a# `plusAddr#` 2# #)
| ch0 >=# 0xE0# && ch0 <=# 0xEF# ->
let !ch1 = word2Int# (indexWord8OffAddr# a# 1#) in
if ch1 <# 0x80# || ch1 >=# 0xC0# then fail 1# else
let !ch2 = word2Int# (indexWord8OffAddr# a# 2#) in
if ch2 <# 0x80# || ch2 >=# 0xC0# then fail 2# else
(# chr# (((ch0 -# 0xE0#) `uncheckedIShiftL#` 12#) +#
((ch1 -# 0x80#) `uncheckedIShiftL#` 6#) +#
(ch2 -# 0x80#)),
a# `plusAddr#` 3# #)
| ch0 >=# 0xF0# && ch0 <=# 0xF8# ->
let !ch1 = word2Int# (indexWord8OffAddr# a# 1#) in
if ch1 <# 0x80# || ch1 >=# 0xC0# then fail 1# else
let !ch2 = word2Int# (indexWord8OffAddr# a# 2#) in
if ch2 <# 0x80# || ch2 >=# 0xC0# then fail 2# else
let !ch3 = word2Int# (indexWord8OffAddr# a# 3#) in
if ch3 <# 0x80# || ch3 >=# 0xC0# then fail 3# else
(# chr# (((ch0 -# 0xF0#) `uncheckedIShiftL#` 18#) +#
((ch1 -# 0x80#) `uncheckedIShiftL#` 12#) +#
((ch2 -# 0x80#) `uncheckedIShiftL#` 6#) +#
(ch3 -# 0x80#)),
a# `plusAddr#` 4# #)
| otherwise -> fail 1#
where
fail n = (# '\0'#, a# `plusAddr#` n #)
utf8DecodeChar :: Ptr Word8 -> (Char, Ptr Word8)
utf8DecodeChar (Ptr a#) =
case utf8DecodeChar# a# of (# c#, b# #) -> ( C# c#, Ptr b# )
utf8PrevChar :: Ptr Word8 -> IO (Ptr Word8)
utf8PrevChar p = utf8CharStart (p `plusPtr` (1))
utf8CharStart :: Ptr Word8 -> IO (Ptr Word8)
utf8CharStart p = go p
where go p = do w <- peek p
if w >= 0x80 && w < 0xC0
then go (p `plusPtr` (1))
else return p
utf8DecodeString :: Ptr Word8 -> Int -> IO [Char]
STRICT2(utf8DecodeString)
utf8DecodeString (Ptr a#) (I# len#)
= unpack a#
where
!end# = addr2Int# (a# `plusAddr#` len#)
unpack p#
| addr2Int# p# >=# end# = return []
| otherwise =
case utf8DecodeChar# p# of
(# c#, q# #) -> do
chs <- unpack q#
return (C# c# : chs)
countUTF8Chars :: Ptr Word8 -> Int -> IO Int
countUTF8Chars ptr bytes = go ptr 0
where
end = ptr `plusPtr` bytes
STRICT2(go)
go ptr n
| ptr >= end = return n
| otherwise = do
case utf8DecodeChar# (unPtr ptr) of
(# _, a #) -> go (Ptr a) (n+1)
unPtr :: Ptr a -> Addr#
unPtr (Ptr a) = a
utf8EncodeChar :: Char -> Ptr Word8 -> IO (Ptr Word8)
utf8EncodeChar c ptr =
let x = ord c in
case () of
_ | x > 0 && x <= 0x007f -> do
poke ptr (fromIntegral x)
return (ptr `plusPtr` 1)
| x <= 0x07ff -> do
poke ptr (fromIntegral (0xC0 .|. ((x `shiftR` 6) .&. 0x1F)))
pokeElemOff ptr 1 (fromIntegral (0x80 .|. (x .&. 0x3F)))
return (ptr `plusPtr` 2)
| x <= 0xffff -> do
poke ptr (fromIntegral (0xE0 .|. (x `shiftR` 12) .&. 0x0F))
pokeElemOff ptr 1 (fromIntegral (0x80 .|. (x `shiftR` 6) .&. 0x3F))
pokeElemOff ptr 2 (fromIntegral (0x80 .|. (x .&. 0x3F)))
return (ptr `plusPtr` 3)
| otherwise -> do
poke ptr (fromIntegral (0xF0 .|. (x `shiftR` 18)))
pokeElemOff ptr 1 (fromIntegral (0x80 .|. ((x `shiftR` 12) .&. 0x3F)))
pokeElemOff ptr 2 (fromIntegral (0x80 .|. ((x `shiftR` 6) .&. 0x3F)))
pokeElemOff ptr 3 (fromIntegral (0x80 .|. (x .&. 0x3F)))
return (ptr `plusPtr` 4)
utf8EncodeString :: Ptr Word8 -> String -> IO ()
utf8EncodeString ptr str = go ptr str
where STRICT2(go)
go _ [] = return ()
go ptr (c:cs) = do
ptr' <- utf8EncodeChar c ptr
go ptr' cs
utf8EncodedLength :: String -> Int
utf8EncodedLength str = go 0 str
where STRICT2(go)
go n [] = n
go n (c:cs)
| ord c > 0 && ord c <= 0x007f = go (n+1) cs
| ord c <= 0x07ff = go (n+2) cs
| ord c <= 0xffff = go (n+3) cs
| otherwise = go (n+4) cs
type UserString = String
type EncodedString = String
zEncodeString :: UserString -> EncodedString
zEncodeString cs = case maybe_tuple cs of
Just n -> n
Nothing -> go cs
where
go [] = []
go (c:cs) = encode_digit_ch c ++ go' cs
go' [] = []
go' (c:cs) = encode_ch c ++ go' cs
unencodedChar :: Char -> Bool
unencodedChar 'Z' = False
unencodedChar 'z' = False
unencodedChar c = c >= 'a' && c <= 'z'
|| c >= 'A' && c <= 'Z'
|| c >= '0' && c <= '9'
encode_digit_ch :: Char -> EncodedString
encode_digit_ch c | c >= '0' && c <= '9' = encode_as_unicode_char c
encode_digit_ch c | otherwise = encode_ch c
encode_ch :: Char -> EncodedString
encode_ch c | unencodedChar c = [c]
encode_ch '(' = "ZL"
encode_ch ')' = "ZR"
encode_ch '[' = "ZM"
encode_ch ']' = "ZN"
encode_ch ':' = "ZC"
encode_ch 'Z' = "ZZ"
encode_ch 'z' = "zz"
encode_ch '&' = "za"
encode_ch '|' = "zb"
encode_ch '^' = "zc"
encode_ch '$' = "zd"
encode_ch '=' = "ze"
encode_ch '>' = "zg"
encode_ch '#' = "zh"
encode_ch '.' = "zi"
encode_ch '<' = "zl"
encode_ch '-' = "zm"
encode_ch '!' = "zn"
encode_ch '+' = "zp"
encode_ch '\'' = "zq"
encode_ch '\\' = "zr"
encode_ch '/' = "zs"
encode_ch '*' = "zt"
encode_ch '_' = "zu"
encode_ch '%' = "zv"
encode_ch c = encode_as_unicode_char c
encode_as_unicode_char :: Char -> EncodedString
encode_as_unicode_char c = 'z' : if isDigit (head hex_str) then hex_str
else '0':hex_str
where hex_str = showHex (ord c) "U"
zDecodeString :: EncodedString -> UserString
zDecodeString [] = []
zDecodeString ('Z' : d : rest)
| isDigit d = decode_tuple d rest
| otherwise = decode_upper d : zDecodeString rest
zDecodeString ('z' : d : rest)
| isDigit d = decode_num_esc d rest
| otherwise = decode_lower d : zDecodeString rest
zDecodeString (c : rest) = c : zDecodeString rest
decode_upper, decode_lower :: Char -> Char
decode_upper 'L' = '('
decode_upper 'R' = ')'
decode_upper 'M' = '['
decode_upper 'N' = ']'
decode_upper 'C' = ':'
decode_upper 'Z' = 'Z'
decode_upper ch = ch
decode_lower 'z' = 'z'
decode_lower 'a' = '&'
decode_lower 'b' = '|'
decode_lower 'c' = '^'
decode_lower 'd' = '$'
decode_lower 'e' = '='
decode_lower 'g' = '>'
decode_lower 'h' = '#'
decode_lower 'i' = '.'
decode_lower 'l' = '<'
decode_lower 'm' = '-'
decode_lower 'n' = '!'
decode_lower 'p' = '+'
decode_lower 'q' = '\''
decode_lower 'r' = '\\'
decode_lower 's' = '/'
decode_lower 't' = '*'
decode_lower 'u' = '_'
decode_lower 'v' = '%'
decode_lower ch = ch
decode_num_esc :: Char -> EncodedString -> UserString
decode_num_esc d rest
= go (digitToInt d) rest
where
go n (c : rest) | isHexDigit c = go (16*n + digitToInt c) rest
go n ('U' : rest) = chr n : zDecodeString rest
go n other = error ("decode_num_esc: " ++ show n ++ ' ':other)
decode_tuple :: Char -> EncodedString -> UserString
decode_tuple d rest
= go (digitToInt d) rest
where
go n (c : rest) | isDigit c = go (10*n + digitToInt c) rest
go 0 ('T':rest) = "()" ++ zDecodeString rest
go n ('T':rest) = '(' : replicate (n1) ',' ++ ")" ++ zDecodeString rest
go 1 ('H':rest) = "(# #)" ++ zDecodeString rest
go n ('H':rest) = '(' : '#' : replicate (n1) ',' ++ "#)" ++ zDecodeString rest
go n other = error ("decode_tuple: " ++ show n ++ ' ':other)
maybe_tuple :: UserString -> Maybe EncodedString
maybe_tuple "(# #)" = Just("Z1H")
maybe_tuple ('(' : '#' : cs) = case count_commas (0::Int) cs of
(n, '#' : ')' : _) -> Just ('Z' : shows (n+1) "H")
_ -> Nothing
maybe_tuple "()" = Just("Z0T")
maybe_tuple ('(' : cs) = case count_commas (0::Int) cs of
(n, ')' : _) -> Just ('Z' : shows (n+1) "T")
_ -> Nothing
maybe_tuple _ = Nothing
count_commas :: Int -> String -> (Int, String)
count_commas n (',' : cs) = count_commas (n+1) cs
count_commas n cs = (n,cs)