module GHC.Utils.Encoding (
utf8DecodeCharAddr#,
utf8PrevChar,
utf8CharStart,
utf8DecodeChar,
utf8DecodeByteString,
utf8UnconsByteString,
utf8DecodeShortByteString,
utf8CompareShortByteString,
utf8DecodeStringLazy,
utf8EncodeChar,
utf8EncodeString,
utf8EncodeStringPtr,
utf8EncodeShortByteString,
utf8EncodedLength,
countUTF8Chars,
zEncodeString,
zDecodeString,
toBase62,
toBase62Padded
) where
import Prelude
import Foreign
import Foreign.ForeignPtr.Unsafe (unsafeForeignPtrToPtr)
import Data.Char
import qualified Data.Char as Char
import Numeric
import GHC.IO
import GHC.ST
import Data.ByteString (ByteString)
import qualified Data.ByteString.Internal as BS
import Data.ByteString.Short.Internal (ShortByteString(..))
import GHC.Exts
utf8DecodeChar# :: (Int# -> Word#) -> (# Char#, Int# #)
utf8DecodeChar# indexWord8# =
let !ch0 = word2Int# (indexWord8# 0#) in
case () of
_ | isTrue# (ch0 <=# 0x7F#) -> (# chr# ch0, 1# #)
| isTrue# ((ch0 >=# 0xC0#) `andI#` (ch0 <=# 0xDF#)) ->
let !ch1 = word2Int# (indexWord8# 1#) in
if isTrue# ((ch1 <# 0x80#) `orI#` (ch1 >=# 0xC0#)) then fail 1# else
(# chr# (((ch0 -# 0xC0#) `uncheckedIShiftL#` 6#) +#
(ch1 -# 0x80#)),
2# #)
| isTrue# ((ch0 >=# 0xE0#) `andI#` (ch0 <=# 0xEF#)) ->
let !ch1 = word2Int# (indexWord8# 1#) in
if isTrue# ((ch1 <# 0x80#) `orI#` (ch1 >=# 0xC0#)) then fail 1# else
let !ch2 = word2Int# (indexWord8# 2#) in
if isTrue# ((ch2 <# 0x80#) `orI#` (ch2 >=# 0xC0#)) then fail 2# else
(# chr# (((ch0 -# 0xE0#) `uncheckedIShiftL#` 12#) +#
((ch1 -# 0x80#) `uncheckedIShiftL#` 6#) +#
(ch2 -# 0x80#)),
3# #)
| isTrue# ((ch0 >=# 0xF0#) `andI#` (ch0 <=# 0xF8#)) ->
let !ch1 = word2Int# (indexWord8# 1#) in
if isTrue# ((ch1 <# 0x80#) `orI#` (ch1 >=# 0xC0#)) then fail 1# else
let !ch2 = word2Int# (indexWord8# 2#) in
if isTrue# ((ch2 <# 0x80#) `orI#` (ch2 >=# 0xC0#)) then fail 2# else
let !ch3 = word2Int# (indexWord8# 3#) in
if isTrue# ((ch3 <# 0x80#) `orI#` (ch3 >=# 0xC0#)) then fail 3# else
(# chr# (((ch0 -# 0xF0#) `uncheckedIShiftL#` 18#) +#
((ch1 -# 0x80#) `uncheckedIShiftL#` 12#) +#
((ch2 -# 0x80#) `uncheckedIShiftL#` 6#) +#
(ch3 -# 0x80#)),
4# #)
| otherwise -> fail 1#
where
fail :: Int# -> (# Char#, Int# #)
fail nBytes# = (# '\0'#, nBytes# #)
utf8DecodeCharAddr# :: Addr# -> Int# -> (# Char#, Int# #)
utf8DecodeCharAddr# a# off# =
#if !MIN_VERSION_base(4,16,0)
utf8DecodeChar# (\i# -> indexWord8OffAddr# a# (i# +# off#))
#else
utf8DecodeChar# (\i# -> word8ToWord# (indexWord8OffAddr# a# (i# +# off#)))
#endif
utf8DecodeCharByteArray# :: ByteArray# -> Int# -> (# Char#, Int# #)
utf8DecodeCharByteArray# ba# off# =
#if !MIN_VERSION_base(4,16,0)
utf8DecodeChar# (\i# -> indexWord8Array# ba# (i# +# off#))
#else
utf8DecodeChar# (\i# -> word8ToWord# (indexWord8Array# ba# (i# +# off#)))
#endif
utf8DecodeChar :: Ptr Word8 -> (Char, Int)
utf8DecodeChar !(Ptr a#) =
case utf8DecodeCharAddr# a# 0# of
(# c#, nBytes# #) -> ( C# c#, I# nBytes# )
utf8PrevChar :: Ptr Word8 -> IO (Ptr Word8)
utf8PrevChar p = utf8CharStart (p `plusPtr` (1))
utf8CharStart :: Ptr Word8 -> IO (Ptr Word8)
utf8CharStart p = go p
where go p = do w <- peek p
if w >= 0x80 && w < 0xC0
then go (p `plusPtr` (1))
else return p
utf8DecodeLazy# :: (IO ()) -> (Int# -> (# Char#, Int# #)) -> Int# -> IO [Char]
utf8DecodeLazy# retain decodeChar# len#
= unpack 0#
where
unpack i#
| isTrue# (i# >=# len#) = retain >> return []
| otherwise =
case decodeChar# i# of
(# c#, nBytes# #) -> do
rest <- unsafeDupableInterleaveIO $ unpack (i# +# nBytes#)
return (C# c# : rest)
utf8DecodeByteString :: ByteString -> [Char]
utf8DecodeByteString (BS.PS fptr offset len)
= utf8DecodeStringLazy fptr offset len
utf8UnconsByteString :: ByteString -> Maybe (Char, ByteString)
utf8UnconsByteString (BS.PS _ _ 0) = Nothing
utf8UnconsByteString (BS.PS fptr offset len)
= unsafeDupablePerformIO $
withForeignPtr fptr $ \ptr -> do
let (c,n) = utf8DecodeChar (ptr `plusPtr` offset)
return $ Just (c, BS.PS fptr (offset + n) (len n))
utf8DecodeStringLazy :: ForeignPtr Word8 -> Int -> Int -> [Char]
utf8DecodeStringLazy fp offset (I# len#)
= unsafeDupablePerformIO $ do
let !(Ptr a#) = unsafeForeignPtrToPtr fp `plusPtr` offset
utf8DecodeLazy# (touchForeignPtr fp) (utf8DecodeCharAddr# a#) len#
utf8CompareShortByteString :: ShortByteString -> ShortByteString -> Ordering
utf8CompareShortByteString (SBS a1) (SBS a2) = go 0# 0#
where
!sz1 = sizeofByteArray# a1
!sz2 = sizeofByteArray# a2
go off1 off2
| isTrue# ((off1 >=# sz1) `andI#` (off2 >=# sz2)) = EQ
| isTrue# (off1 >=# sz1) = LT
| isTrue# (off2 >=# sz2) = GT
| otherwise =
#if !MIN_VERSION_base(4,16,0)
let !b1_1 = indexWord8Array# a1 off1
!b2_1 = indexWord8Array# a2 off2
#else
let !b1_1 = word8ToWord# (indexWord8Array# a1 off1)
!b2_1 = word8ToWord# (indexWord8Array# a2 off2)
#endif
in case b1_1 of
0xC0## -> case b2_1 of
0xC0## -> go (off1 +# 1#) (off2 +# 1#)
#if !MIN_VERSION_base(4,16,0)
_ -> case indexWord8Array# a1 (off1 +# 1#) of
#else
_ -> case word8ToWord# (indexWord8Array# a1 (off1 +# 1#)) of
#endif
0x80## -> LT
_ -> go (off1 +# 1#) (off2 +# 1#)
_ -> case b2_1 of
#if !MIN_VERSION_base(4,16,0)
0xC0## -> case indexWord8Array# a2 (off2 +# 1#) of
#else
0xC0## -> case word8ToWord# (indexWord8Array# a2 (off2 +# 1#)) of
#endif
0x80## -> GT
_ -> go (off1 +# 1#) (off2 +# 1#)
_ | isTrue# (b1_1 `gtWord#` b2_1) -> GT
| isTrue# (b1_1 `ltWord#` b2_1) -> LT
| otherwise -> go (off1 +# 1#) (off2 +# 1#)
utf8DecodeShortByteString :: ShortByteString -> [Char]
utf8DecodeShortByteString (SBS ba#)
= unsafeDupablePerformIO $
let len# = sizeofByteArray# ba# in
utf8DecodeLazy# (return ()) (utf8DecodeCharByteArray# ba#) len#
countUTF8Chars :: ShortByteString -> IO Int
countUTF8Chars (SBS ba) = go 0# 0#
where
len# = sizeofByteArray# ba
go i# n#
| isTrue# (i# >=# len#) =
return (I# n#)
| otherwise = do
case utf8DecodeCharByteArray# ba i# of
(# _, nBytes# #) -> go (i# +# nBytes#) (n# +# 1#)
utf8EncodeChar :: (Int# -> Word8# -> State# s -> State# s)
-> Char -> ST s Int
utf8EncodeChar write# c =
let x = fromIntegral (ord c) in
case () of
_ | x > 0 && x <= 0x007f -> do
write 0 x
return 1
| x <= 0x07ff -> do
write 0 (0xC0 .|. ((x `shiftR` 6) .&. 0x1F))
write 1 (0x80 .|. (x .&. 0x3F))
return 2
| x <= 0xffff -> do
write 0 (0xE0 .|. (x `shiftR` 12) .&. 0x0F)
write 1 (0x80 .|. (x `shiftR` 6) .&. 0x3F)
write 2 (0x80 .|. (x .&. 0x3F))
return 3
| otherwise -> do
write 0 (0xF0 .|. (x `shiftR` 18))
write 1 (0x80 .|. ((x `shiftR` 12) .&. 0x3F))
write 2 (0x80 .|. ((x `shiftR` 6) .&. 0x3F))
write 3 (0x80 .|. (x .&. 0x3F))
return 4
where
write (I# off#) (W# c#) = ST $ \s ->
#if !MIN_VERSION_base(4,16,0)
case write# off# (narrowWord8# c#) s of
#else
case write# off# (wordToWord8# c#) s of
#endif
s -> (# s, () #)
utf8EncodeString :: String -> ByteString
utf8EncodeString s =
unsafePerformIO $ do
let len = utf8EncodedLength s
buf <- mallocForeignPtrBytes len
withForeignPtr buf $ \ptr -> do
utf8EncodeStringPtr ptr s
pure (BS.fromForeignPtr buf 0 len)
utf8EncodeStringPtr :: Ptr Word8 -> String -> IO ()
utf8EncodeStringPtr (Ptr a#) str = go a# str
where go !_ [] = return ()
go a# (c:cs) = do
#if !MIN_VERSION_base(4,16,0)
I# off# <- stToIO $ utf8EncodeChar (\i w -> writeWord8OffAddr# a# i (extendWord8# w)) c
#else
I# off# <- stToIO $ utf8EncodeChar (writeWord8OffAddr# a#) c
#endif
go (a# `plusAddr#` off#) cs
utf8EncodeShortByteString :: String -> IO ShortByteString
utf8EncodeShortByteString str = IO $ \s ->
case utf8EncodedLength str of { I# len# ->
case newByteArray# len# s of { (# s, mba# #) ->
case go mba# 0# str of { ST f_go ->
case f_go s of { (# s, () #) ->
case unsafeFreezeByteArray# mba# s of { (# s, ba# #) ->
(# s, SBS ba# #) }}}}}
where
go _ _ [] = return ()
go mba# i# (c:cs) = do
#if !MIN_VERSION_base(4,16,0)
I# off# <- utf8EncodeChar (\j# w -> writeWord8Array# mba# (i# +# j#) (extendWord8# w)) c
#else
I# off# <- utf8EncodeChar (\j# -> writeWord8Array# mba# (i# +# j#)) c
#endif
go mba# (i# +# off#) cs
utf8EncodedLength :: String -> Int
utf8EncodedLength str = go 0 str
where go !n [] = n
go n (c:cs)
| ord c > 0 && ord c <= 0x007f = go (n+1) cs
| ord c <= 0x07ff = go (n+2) cs
| ord c <= 0xffff = go (n+3) cs
| otherwise = go (n+4) cs
type UserString = String
type EncodedString = String
zEncodeString :: UserString -> EncodedString
zEncodeString cs = case maybe_tuple cs of
Just n -> n
Nothing -> go cs
where
go [] = []
go (c:cs) = encode_digit_ch c ++ go' cs
go' [] = []
go' (c:cs) = encode_ch c ++ go' cs
unencodedChar :: Char -> Bool
unencodedChar 'Z' = False
unencodedChar 'z' = False
unencodedChar c = c >= 'a' && c <= 'z'
|| c >= 'A' && c <= 'Z'
|| c >= '0' && c <= '9'
encode_digit_ch :: Char -> EncodedString
encode_digit_ch c | c >= '0' && c <= '9' = encode_as_unicode_char c
encode_digit_ch c | otherwise = encode_ch c
encode_ch :: Char -> EncodedString
encode_ch c | unencodedChar c = [c]
encode_ch '(' = "ZL"
encode_ch ')' = "ZR"
encode_ch '[' = "ZM"
encode_ch ']' = "ZN"
encode_ch ':' = "ZC"
encode_ch 'Z' = "ZZ"
encode_ch 'z' = "zz"
encode_ch '&' = "za"
encode_ch '|' = "zb"
encode_ch '^' = "zc"
encode_ch '$' = "zd"
encode_ch '=' = "ze"
encode_ch '>' = "zg"
encode_ch '#' = "zh"
encode_ch '.' = "zi"
encode_ch '<' = "zl"
encode_ch '-' = "zm"
encode_ch '!' = "zn"
encode_ch '+' = "zp"
encode_ch '\'' = "zq"
encode_ch '\\' = "zr"
encode_ch '/' = "zs"
encode_ch '*' = "zt"
encode_ch '_' = "zu"
encode_ch '%' = "zv"
encode_ch c = encode_as_unicode_char c
encode_as_unicode_char :: Char -> EncodedString
encode_as_unicode_char c = 'z' : if isDigit (head hex_str) then hex_str
else '0':hex_str
where hex_str = showHex (ord c) "U"
zDecodeString :: EncodedString -> UserString
zDecodeString [] = []
zDecodeString ('Z' : d : rest)
| isDigit d = decode_tuple d rest
| otherwise = decode_upper d : zDecodeString rest
zDecodeString ('z' : d : rest)
| isDigit d = decode_num_esc d rest
| otherwise = decode_lower d : zDecodeString rest
zDecodeString (c : rest) = c : zDecodeString rest
decode_upper, decode_lower :: Char -> Char
decode_upper 'L' = '('
decode_upper 'R' = ')'
decode_upper 'M' = '['
decode_upper 'N' = ']'
decode_upper 'C' = ':'
decode_upper 'Z' = 'Z'
decode_upper ch = ch
decode_lower 'z' = 'z'
decode_lower 'a' = '&'
decode_lower 'b' = '|'
decode_lower 'c' = '^'
decode_lower 'd' = '$'
decode_lower 'e' = '='
decode_lower 'g' = '>'
decode_lower 'h' = '#'
decode_lower 'i' = '.'
decode_lower 'l' = '<'
decode_lower 'm' = '-'
decode_lower 'n' = '!'
decode_lower 'p' = '+'
decode_lower 'q' = '\''
decode_lower 'r' = '\\'
decode_lower 's' = '/'
decode_lower 't' = '*'
decode_lower 'u' = '_'
decode_lower 'v' = '%'
decode_lower ch = ch
decode_num_esc :: Char -> EncodedString -> UserString
decode_num_esc d rest
= go (digitToInt d) rest
where
go n (c : rest) | isHexDigit c = go (16*n + digitToInt c) rest
go n ('U' : rest) = chr n : zDecodeString rest
go n other = error ("decode_num_esc: " ++ show n ++ ' ':other)
decode_tuple :: Char -> EncodedString -> UserString
decode_tuple d rest
= go (digitToInt d) rest
where
go n (c : rest) | isDigit c = go (10*n + digitToInt c) rest
go 0 ('T':rest) = "()" ++ zDecodeString rest
go n ('T':rest) = '(' : replicate (n1) ',' ++ ")" ++ zDecodeString rest
go 1 ('H':rest) = "(# #)" ++ zDecodeString rest
go n ('H':rest) = '(' : '#' : replicate (n1) ',' ++ "#)" ++ zDecodeString rest
go n other = error ("decode_tuple: " ++ show n ++ ' ':other)
maybe_tuple :: UserString -> Maybe EncodedString
maybe_tuple "(# #)" = Just("Z1H")
maybe_tuple ('(' : '#' : cs) = case count_commas (0::Int) cs of
(n, '#' : ')' : _) -> Just ('Z' : shows (n+1) "H")
_ -> Nothing
maybe_tuple "()" = Just("Z0T")
maybe_tuple ('(' : cs) = case count_commas (0::Int) cs of
(n, ')' : _) -> Just ('Z' : shows (n+1) "T")
_ -> Nothing
maybe_tuple _ = Nothing
count_commas :: Int -> String -> (Int, String)
count_commas n (',' : cs) = count_commas (n+1) cs
count_commas n cs = (n,cs)
word64Base62Len :: Int
word64Base62Len = 11
toBase62Padded :: Word64 -> String
toBase62Padded w = pad ++ str
where
pad = replicate len '0'
len = word64Base62Len length str
str = toBase62 w
toBase62 :: Word64 -> String
toBase62 w = showIntAtBase 62 represent w ""
where
represent :: Int -> Char
represent x
| x < 10 = Char.chr (48 + x)
| x < 36 = Char.chr (65 + x 10)
| x < 62 = Char.chr (97 + x 36)
| otherwise = error "represent (base 62): impossible!"