module GHC.Utils.Encoding (
utf8DecodeCharAddr#,
utf8PrevChar,
utf8CharStart,
utf8DecodeChar,
utf8DecodeByteString,
utf8DecodeShortByteString,
utf8DecodeStringLazy,
utf8EncodeChar,
utf8EncodeString,
utf8EncodeShortByteString,
utf8EncodedLength,
countUTF8Chars,
zEncodeString,
zDecodeString,
toBase62,
toBase62Padded
) where
import GHC.Prelude
import Foreign
import Foreign.ForeignPtr.Unsafe (unsafeForeignPtrToPtr)
import Data.Char
import qualified Data.Char as Char
import Numeric
import GHC.IO
import GHC.ST
import Data.ByteString (ByteString)
import qualified Data.ByteString.Internal as BS
import Data.ByteString.Short.Internal (ShortByteString(..))
import GHC.Exts
utf8DecodeChar# :: (Int# -> Word#) -> (# Char#, Int# #)
utf8DecodeChar# indexWord8# =
let !ch0 = word2Int# (indexWord8# 0#) in
case () of
_ | isTrue# (ch0 <=# 0x7F#) -> (# chr# ch0, 1# #)
| isTrue# ((ch0 >=# 0xC0#) `andI#` (ch0 <=# 0xDF#)) ->
let !ch1 = word2Int# (indexWord8# 1#) in
if isTrue# ((ch1 <# 0x80#) `orI#` (ch1 >=# 0xC0#)) then fail 1# else
(# chr# (((ch0 -# 0xC0#) `uncheckedIShiftL#` 6#) +#
(ch1 -# 0x80#)),
2# #)
| isTrue# ((ch0 >=# 0xE0#) `andI#` (ch0 <=# 0xEF#)) ->
let !ch1 = word2Int# (indexWord8# 1#) in
if isTrue# ((ch1 <# 0x80#) `orI#` (ch1 >=# 0xC0#)) then fail 1# else
let !ch2 = word2Int# (indexWord8# 2#) in
if isTrue# ((ch2 <# 0x80#) `orI#` (ch2 >=# 0xC0#)) then fail 2# else
(# chr# (((ch0 -# 0xE0#) `uncheckedIShiftL#` 12#) +#
((ch1 -# 0x80#) `uncheckedIShiftL#` 6#) +#
(ch2 -# 0x80#)),
3# #)
| isTrue# ((ch0 >=# 0xF0#) `andI#` (ch0 <=# 0xF8#)) ->
let !ch1 = word2Int# (indexWord8# 1#) in
if isTrue# ((ch1 <# 0x80#) `orI#` (ch1 >=# 0xC0#)) then fail 1# else
let !ch2 = word2Int# (indexWord8# 2#) in
if isTrue# ((ch2 <# 0x80#) `orI#` (ch2 >=# 0xC0#)) then fail 2# else
let !ch3 = word2Int# (indexWord8# 3#) in
if isTrue# ((ch3 <# 0x80#) `orI#` (ch3 >=# 0xC0#)) then fail 3# else
(# chr# (((ch0 -# 0xF0#) `uncheckedIShiftL#` 18#) +#
((ch1 -# 0x80#) `uncheckedIShiftL#` 12#) +#
((ch2 -# 0x80#) `uncheckedIShiftL#` 6#) +#
(ch3 -# 0x80#)),
4# #)
| otherwise -> fail 1#
where
fail :: Int# -> (# Char#, Int# #)
fail nBytes# = (# '\0'#, nBytes# #)
utf8DecodeCharAddr# :: Addr# -> Int# -> (# Char#, Int# #)
utf8DecodeCharAddr# a# off# =
utf8DecodeChar# (\i# -> indexWord8OffAddr# a# (i# +# off#))
utf8DecodeCharByteArray# :: ByteArray# -> Int# -> (# Char#, Int# #)
utf8DecodeCharByteArray# ba# off# =
utf8DecodeChar# (\i# -> indexWord8Array# ba# (i# +# off#))
utf8DecodeChar :: Ptr Word8 -> (Char, Int)
utf8DecodeChar !(Ptr a#) =
case utf8DecodeCharAddr# a# 0# of
(# c#, nBytes# #) -> ( C# c#, I# nBytes# )
utf8PrevChar :: Ptr Word8 -> IO (Ptr Word8)
utf8PrevChar p = utf8CharStart (p `plusPtr` (1))
utf8CharStart :: Ptr Word8 -> IO (Ptr Word8)
utf8CharStart p = go p
where go p = do w <- peek p
if w >= 0x80 && w < 0xC0
then go (p `plusPtr` (1))
else return p
utf8DecodeLazy# :: (IO ()) -> (Int# -> (# Char#, Int# #)) -> Int# -> IO [Char]
utf8DecodeLazy# retain decodeChar# len#
= unpack 0#
where
unpack i#
| isTrue# (i# >=# len#) = retain >> return []
| otherwise =
case decodeChar# i# of
(# c#, nBytes# #) -> do
rest <- unsafeDupableInterleaveIO $ unpack (i# +# nBytes#)
return (C# c# : rest)
utf8DecodeByteString :: ByteString -> [Char]
utf8DecodeByteString (BS.PS fptr offset len)
= utf8DecodeStringLazy fptr offset len
utf8DecodeStringLazy :: ForeignPtr Word8 -> Int -> Int -> [Char]
utf8DecodeStringLazy fp offset (I# len#)
= unsafeDupablePerformIO $ do
let !(Ptr a#) = unsafeForeignPtrToPtr fp `plusPtr` offset
utf8DecodeLazy# (touchForeignPtr fp) (utf8DecodeCharAddr# a#) len#
utf8DecodeShortByteString :: ShortByteString -> [Char]
utf8DecodeShortByteString (SBS ba#)
= unsafeDupablePerformIO $
let len# = sizeofByteArray# ba# in
utf8DecodeLazy# (return ()) (utf8DecodeCharByteArray# ba#) len#
countUTF8Chars :: ShortByteString -> IO Int
countUTF8Chars (SBS ba) = go 0# 0#
where
len# = sizeofByteArray# ba
go i# n#
| isTrue# (i# >=# len#) =
return (I# n#)
| otherwise = do
case utf8DecodeCharByteArray# ba i# of
(# _, nBytes# #) -> go (i# +# nBytes#) (n# +# 1#)
utf8EncodeChar :: (Int# -> Word# -> State# s -> State# s)
-> Char -> ST s Int
utf8EncodeChar write# c =
let x = ord c in
case () of
_ | x > 0 && x <= 0x007f -> do
write 0 x
return 1
| x <= 0x07ff -> do
write 0 (0xC0 .|. ((x `shiftR` 6) .&. 0x1F))
write 1 (0x80 .|. (x .&. 0x3F))
return 2
| x <= 0xffff -> do
write 0 (0xE0 .|. (x `shiftR` 12) .&. 0x0F)
write 1 (0x80 .|. (x `shiftR` 6) .&. 0x3F)
write 2 (0x80 .|. (x .&. 0x3F))
return 3
| otherwise -> do
write 0 (0xF0 .|. (x `shiftR` 18))
write 1 (0x80 .|. ((x `shiftR` 12) .&. 0x3F))
write 2 (0x80 .|. ((x `shiftR` 6) .&. 0x3F))
write 3 (0x80 .|. (x .&. 0x3F))
return 4
where
write (I# off#) (I# c#) = ST $ \s ->
case write# off# (int2Word# c#) s of
s -> (# s, () #)
utf8EncodeString :: Ptr Word8 -> String -> IO ()
utf8EncodeString (Ptr a#) str = go a# str
where go !_ [] = return ()
go a# (c:cs) = do
I# off# <- stToIO $ utf8EncodeChar (writeWord8OffAddr# a#) c
go (a# `plusAddr#` off#) cs
utf8EncodeShortByteString :: String -> IO ShortByteString
utf8EncodeShortByteString str = IO $ \s ->
case utf8EncodedLength str of { I# len# ->
case newByteArray# len# s of { (# s, mba# #) ->
case go mba# 0# str of { ST f_go ->
case f_go s of { (# s, () #) ->
case unsafeFreezeByteArray# mba# s of { (# s, ba# #) ->
(# s, SBS ba# #) }}}}}
where
go _ _ [] = return ()
go mba# i# (c:cs) = do
I# off# <- utf8EncodeChar (\j# -> writeWord8Array# mba# (i# +# j#)) c
go mba# (i# +# off#) cs
utf8EncodedLength :: String -> Int
utf8EncodedLength str = go 0 str
where go !n [] = n
go n (c:cs)
| ord c > 0 && ord c <= 0x007f = go (n+1) cs
| ord c <= 0x07ff = go (n+2) cs
| ord c <= 0xffff = go (n+3) cs
| otherwise = go (n+4) cs
type UserString = String
type EncodedString = String
zEncodeString :: UserString -> EncodedString
zEncodeString cs = case maybe_tuple cs of
Just n -> n
Nothing -> go cs
where
go [] = []
go (c:cs) = encode_digit_ch c ++ go' cs
go' [] = []
go' (c:cs) = encode_ch c ++ go' cs
unencodedChar :: Char -> Bool
unencodedChar 'Z' = False
unencodedChar 'z' = False
unencodedChar c = c >= 'a' && c <= 'z'
|| c >= 'A' && c <= 'Z'
|| c >= '0' && c <= '9'
encode_digit_ch :: Char -> EncodedString
encode_digit_ch c | c >= '0' && c <= '9' = encode_as_unicode_char c
encode_digit_ch c | otherwise = encode_ch c
encode_ch :: Char -> EncodedString
encode_ch c | unencodedChar c = [c]
encode_ch '(' = "ZL"
encode_ch ')' = "ZR"
encode_ch '[' = "ZM"
encode_ch ']' = "ZN"
encode_ch ':' = "ZC"
encode_ch 'Z' = "ZZ"
encode_ch 'z' = "zz"
encode_ch '&' = "za"
encode_ch '|' = "zb"
encode_ch '^' = "zc"
encode_ch '$' = "zd"
encode_ch '=' = "ze"
encode_ch '>' = "zg"
encode_ch '#' = "zh"
encode_ch '.' = "zi"
encode_ch '<' = "zl"
encode_ch '-' = "zm"
encode_ch '!' = "zn"
encode_ch '+' = "zp"
encode_ch '\'' = "zq"
encode_ch '\\' = "zr"
encode_ch '/' = "zs"
encode_ch '*' = "zt"
encode_ch '_' = "zu"
encode_ch '%' = "zv"
encode_ch c = encode_as_unicode_char c
encode_as_unicode_char :: Char -> EncodedString
encode_as_unicode_char c = 'z' : if isDigit (head hex_str) then hex_str
else '0':hex_str
where hex_str = showHex (ord c) "U"
zDecodeString :: EncodedString -> UserString
zDecodeString [] = []
zDecodeString ('Z' : d : rest)
| isDigit d = decode_tuple d rest
| otherwise = decode_upper d : zDecodeString rest
zDecodeString ('z' : d : rest)
| isDigit d = decode_num_esc d rest
| otherwise = decode_lower d : zDecodeString rest
zDecodeString (c : rest) = c : zDecodeString rest
decode_upper, decode_lower :: Char -> Char
decode_upper 'L' = '('
decode_upper 'R' = ')'
decode_upper 'M' = '['
decode_upper 'N' = ']'
decode_upper 'C' = ':'
decode_upper 'Z' = 'Z'
decode_upper ch = ch
decode_lower 'z' = 'z'
decode_lower 'a' = '&'
decode_lower 'b' = '|'
decode_lower 'c' = '^'
decode_lower 'd' = '$'
decode_lower 'e' = '='
decode_lower 'g' = '>'
decode_lower 'h' = '#'
decode_lower 'i' = '.'
decode_lower 'l' = '<'
decode_lower 'm' = '-'
decode_lower 'n' = '!'
decode_lower 'p' = '+'
decode_lower 'q' = '\''
decode_lower 'r' = '\\'
decode_lower 's' = '/'
decode_lower 't' = '*'
decode_lower 'u' = '_'
decode_lower 'v' = '%'
decode_lower ch = ch
decode_num_esc :: Char -> EncodedString -> UserString
decode_num_esc d rest
= go (digitToInt d) rest
where
go n (c : rest) | isHexDigit c = go (16*n + digitToInt c) rest
go n ('U' : rest) = chr n : zDecodeString rest
go n other = error ("decode_num_esc: " ++ show n ++ ' ':other)
decode_tuple :: Char -> EncodedString -> UserString
decode_tuple d rest
= go (digitToInt d) rest
where
go n (c : rest) | isDigit c = go (10*n + digitToInt c) rest
go 0 ('T':rest) = "()" ++ zDecodeString rest
go n ('T':rest) = '(' : replicate (n1) ',' ++ ")" ++ zDecodeString rest
go 1 ('H':rest) = "(# #)" ++ zDecodeString rest
go n ('H':rest) = '(' : '#' : replicate (n1) ',' ++ "#)" ++ zDecodeString rest
go n other = error ("decode_tuple: " ++ show n ++ ' ':other)
maybe_tuple :: UserString -> Maybe EncodedString
maybe_tuple "(# #)" = Just("Z1H")
maybe_tuple ('(' : '#' : cs) = case count_commas (0::Int) cs of
(n, '#' : ')' : _) -> Just ('Z' : shows (n+1) "H")
_ -> Nothing
maybe_tuple "()" = Just("Z0T")
maybe_tuple ('(' : cs) = case count_commas (0::Int) cs of
(n, ')' : _) -> Just ('Z' : shows (n+1) "T")
_ -> Nothing
maybe_tuple _ = Nothing
count_commas :: Int -> String -> (Int, String)
count_commas n (',' : cs) = count_commas (n+1) cs
count_commas n cs = (n,cs)
word64Base62Len :: Int
word64Base62Len = 11
toBase62Padded :: Word64 -> String
toBase62Padded w = pad ++ str
where
pad = replicate len '0'
len = word64Base62Len length str
str = toBase62 w
toBase62 :: Word64 -> String
toBase62 w = showIntAtBase 62 represent w ""
where
represent :: Int -> Char
represent x
| x < 10 = Char.chr (48 + x)
| x < 36 = Char.chr (65 + x 10)
| x < 62 = Char.chr (97 + x 36)
| otherwise = error "represent (base 62): impossible!"