{-# LANGUAGE CPP #-}
{-# LANGUAGE BangPatterns, MagicHash, UnboxedTuples, MultiWayIf #-}
{-# OPTIONS_GHC -O2 -fno-warn-name-shadowing #-}
module GHC.Utils.Encoding.UTF8
(
utf8DecodeCharAddr#
, utf8DecodeCharPtr
, utf8DecodeCharByteArray#
, utf8PrevChar
, utf8CharStart
, utf8UnconsByteString
, utf8DecodeByteString
, utf8DecodeShortByteString
, utf8DecodeForeignPtr
, utf8DecodeByteArray#
, utf8CountCharsShortByteString
, utf8CountCharsByteArray#
, utf8CompareByteArray#
, utf8CompareShortByteString
, utf8EncodeByteArray#
, utf8EncodePtr
, utf8EncodeByteString
, utf8EncodeShortByteString
, utf8EncodedLength
) where
import Prelude
import Foreign
import GHC.IO
#if MIN_VERSION_base(4,18,0)
import GHC.Encoding.UTF8
#else
import Foreign.ForeignPtr.Unsafe (unsafeForeignPtrToPtr)
import Data.Char
import GHC.Exts
import GHC.ST
#endif
import Data.ByteString (ByteString)
import qualified Data.ByteString.Internal as BS
import Data.ByteString.Short.Internal (ShortByteString(..))
utf8PrevChar :: Ptr Word8 -> IO (Ptr Word8)
utf8PrevChar :: Ptr Word8 -> IO (Ptr Word8)
utf8PrevChar Ptr Word8
p = Ptr Word8 -> IO (Ptr Word8)
utf8CharStart (Ptr Word8
p Ptr Word8 -> Int -> Ptr Word8
forall a b. Ptr a -> Int -> Ptr b
`plusPtr` (-Int
1))
utf8CharStart :: Ptr Word8 -> IO (Ptr Word8)
utf8CharStart :: Ptr Word8 -> IO (Ptr Word8)
utf8CharStart Ptr Word8
p = Ptr Word8 -> IO (Ptr Word8)
forall {b}. (Storable b, Ord b, Num b) => Ptr b -> IO (Ptr b)
go Ptr Word8
p
where go :: Ptr b -> IO (Ptr b)
go Ptr b
p = do b
w <- Ptr b -> IO b
forall a. Storable a => Ptr a -> IO a
peek Ptr b
p
if b
w b -> b -> Bool
forall a. Ord a => a -> a -> Bool
>= b
0x80 Bool -> Bool -> Bool
&& b
w b -> b -> Bool
forall a. Ord a => a -> a -> Bool
< b
0xC0
then Ptr b -> IO (Ptr b)
go (Ptr b
p Ptr b -> Int -> Ptr b
forall a b. Ptr a -> Int -> Ptr b
`plusPtr` (-Int
1))
else Ptr b -> IO (Ptr b)
forall a. a -> IO a
forall (m :: * -> *) a. Monad m => a -> m a
return Ptr b
p
utf8CountCharsShortByteString :: ShortByteString -> Int
(SBS ByteArray#
ba) = ByteArray# -> Int
utf8CountCharsByteArray# ByteArray#
ba
utf8DecodeShortByteString :: ShortByteString -> [Char]
utf8DecodeShortByteString :: ShortByteString -> [Char]
utf8DecodeShortByteString (SBS ByteArray#
ba#) = ByteArray# -> [Char]
utf8DecodeByteArray# ByteArray#
ba#
utf8DecodeByteString :: ByteString -> [Char]
utf8DecodeByteString :: ByteString -> [Char]
utf8DecodeByteString (BS.PS ForeignPtr Word8
fptr Int
offset Int
len)
= ForeignPtr Word8 -> Int -> Int -> [Char]
utf8DecodeForeignPtr ForeignPtr Word8
fptr Int
offset Int
len
utf8EncodeShortByteString :: String -> ShortByteString
utf8EncodeShortByteString :: [Char] -> ShortByteString
utf8EncodeShortByteString [Char]
str = ByteArray# -> ShortByteString
SBS ([Char] -> ByteArray#
utf8EncodeByteArray# [Char]
str)
utf8EncodeByteString :: String -> ByteString
utf8EncodeByteString :: [Char] -> ByteString
utf8EncodeByteString [Char]
s =
IO ByteString -> ByteString
forall a. IO a -> a
unsafePerformIO (IO ByteString -> ByteString) -> IO ByteString -> ByteString
forall a b. (a -> b) -> a -> b
$ do
let len :: Int
len = [Char] -> Int
utf8EncodedLength [Char]
s
ForeignPtr Word8
buf <- Int -> IO (ForeignPtr Word8)
forall a. Int -> IO (ForeignPtr a)
mallocForeignPtrBytes Int
len
ForeignPtr Word8 -> (Ptr Word8 -> IO ByteString) -> IO ByteString
forall a b. ForeignPtr a -> (Ptr a -> IO b) -> IO b
withForeignPtr ForeignPtr Word8
buf ((Ptr Word8 -> IO ByteString) -> IO ByteString)
-> (Ptr Word8 -> IO ByteString) -> IO ByteString
forall a b. (a -> b) -> a -> b
$ \Ptr Word8
ptr -> do
Ptr Word8 -> [Char] -> IO ()
utf8EncodePtr Ptr Word8
ptr [Char]
s
ByteString -> IO ByteString
forall a. a -> IO a
forall (f :: * -> *) a. Applicative f => a -> f a
pure (ForeignPtr Word8 -> Int -> Int -> ByteString
BS.fromForeignPtr ForeignPtr Word8
buf Int
0 Int
len)
utf8UnconsByteString :: ByteString -> Maybe (Char, ByteString)
utf8UnconsByteString :: ByteString -> Maybe (Char, ByteString)
utf8UnconsByteString (BS.PS ForeignPtr Word8
_ Int
_ Int
0) = Maybe (Char, ByteString)
forall a. Maybe a
Nothing
utf8UnconsByteString (BS.PS ForeignPtr Word8
fptr Int
offset Int
len)
= IO (Maybe (Char, ByteString)) -> Maybe (Char, ByteString)
forall a. IO a -> a
unsafeDupablePerformIO (IO (Maybe (Char, ByteString)) -> Maybe (Char, ByteString))
-> IO (Maybe (Char, ByteString)) -> Maybe (Char, ByteString)
forall a b. (a -> b) -> a -> b
$
ForeignPtr Word8
-> (Ptr Word8 -> IO (Maybe (Char, ByteString)))
-> IO (Maybe (Char, ByteString))
forall a b. ForeignPtr a -> (Ptr a -> IO b) -> IO b
withForeignPtr ForeignPtr Word8
fptr ((Ptr Word8 -> IO (Maybe (Char, ByteString)))
-> IO (Maybe (Char, ByteString)))
-> (Ptr Word8 -> IO (Maybe (Char, ByteString)))
-> IO (Maybe (Char, ByteString))
forall a b. (a -> b) -> a -> b
$ \Ptr Word8
ptr -> do
let (Char
c,Int
n) = Ptr Word8 -> (Char, Int)
utf8DecodeCharPtr (Ptr Word8
ptr Ptr Word8 -> Int -> Ptr Word8
forall a b. Ptr a -> Int -> Ptr b
`plusPtr` Int
offset)
Maybe (Char, ByteString) -> IO (Maybe (Char, ByteString))
forall a. a -> IO a
forall (m :: * -> *) a. Monad m => a -> m a
return (Maybe (Char, ByteString) -> IO (Maybe (Char, ByteString)))
-> Maybe (Char, ByteString) -> IO (Maybe (Char, ByteString))
forall a b. (a -> b) -> a -> b
$ (Char, ByteString) -> Maybe (Char, ByteString)
forall a. a -> Maybe a
Just (Char
c, ForeignPtr Word8 -> Int -> Int -> ByteString
BS.PS ForeignPtr Word8
fptr (Int
offset Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
n) (Int
len Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
n))
utf8CompareShortByteString :: ShortByteString -> ShortByteString -> Ordering
utf8CompareShortByteString :: ShortByteString -> ShortByteString -> Ordering
utf8CompareShortByteString (SBS ByteArray#
a1) (SBS ByteArray#
a2) = ByteArray# -> ByteArray# -> Ordering
utf8CompareByteArray# ByteArray#
a1 ByteArray#
a2
#if !MIN_VERSION_base(4,18,0)
{-# INLINE utf8DecodeChar# #-}
utf8DecodeChar# :: (Int# -> Word#) -> (# Char#, Int# #)
utf8DecodeChar# indexWord8# =
let !ch0 = word2Int# (indexWord8# 0#) in
case () of
_ | isTrue# (ch0 <=# 0x7F#) -> (# chr# ch0, 1# #)
| isTrue# ((ch0 >=# 0xC0#) `andI#` (ch0 <=# 0xDF#)) ->
let !ch1 = word2Int# (indexWord8# 1#) in
if isTrue# ((ch1 <# 0x80#) `orI#` (ch1 >=# 0xC0#)) then fail 1# else
(# chr# (((ch0 -# 0xC0#) `uncheckedIShiftL#` 6#) +#
(ch1 -# 0x80#)),
2# #)
| isTrue# ((ch0 >=# 0xE0#) `andI#` (ch0 <=# 0xEF#)) ->
let !ch1 = word2Int# (indexWord8# 1#) in
if isTrue# ((ch1 <# 0x80#) `orI#` (ch1 >=# 0xC0#)) then fail 1# else
let !ch2 = word2Int# (indexWord8# 2#) in
if isTrue# ((ch2 <# 0x80#) `orI#` (ch2 >=# 0xC0#)) then fail 2# else
(# chr# (((ch0 -# 0xE0#) `uncheckedIShiftL#` 12#) +#
((ch1 -# 0x80#) `uncheckedIShiftL#` 6#) +#
(ch2 -# 0x80#)),
3# #)
| isTrue# ((ch0 >=# 0xF0#) `andI#` (ch0 <=# 0xF8#)) ->
let !ch1 = word2Int# (indexWord8# 1#) in
if isTrue# ((ch1 <# 0x80#) `orI#` (ch1 >=# 0xC0#)) then fail 1# else
let !ch2 = word2Int# (indexWord8# 2#) in
if isTrue# ((ch2 <# 0x80#) `orI#` (ch2 >=# 0xC0#)) then fail 2# else
let !ch3 = word2Int# (indexWord8# 3#) in
if isTrue# ((ch3 <# 0x80#) `orI#` (ch3 >=# 0xC0#)) then fail 3# else
(# chr# (((ch0 -# 0xF0#) `uncheckedIShiftL#` 18#) +#
((ch1 -# 0x80#) `uncheckedIShiftL#` 12#) +#
((ch2 -# 0x80#) `uncheckedIShiftL#` 6#) +#
(ch3 -# 0x80#)),
4# #)
| otherwise -> fail 1#
where
fail :: Int# -> (# Char#, Int# #)
fail nBytes# = (# '\0'#, nBytes# #)
utf8DecodeCharAddr# :: Addr# -> Int# -> (# Char#, Int# #)
utf8DecodeCharAddr# a# off# =
#if !MIN_VERSION_base(4,16,0)
utf8DecodeChar# (\i# -> indexWord8OffAddr# a# (i# +# off#))
#else
utf8DecodeChar# (\i# -> word8ToWord# (indexWord8OffAddr# a# (i# +# off#)))
#endif
utf8DecodeCharPtr :: Ptr Word8 -> (Char, Int)
utf8DecodeCharPtr !(Ptr a#) =
case utf8DecodeCharAddr# a# 0# of
(# c#, nBytes# #) -> ( C# c#, I# nBytes# )
utf8DecodeCharByteArray# :: ByteArray# -> Int# -> (# Char#, Int# #)
utf8DecodeCharByteArray# ba# off# =
#if !MIN_VERSION_base(4,16,0)
utf8DecodeChar# (\i# -> indexWord8Array# ba# (i# +# off#))
#else
utf8DecodeChar# (\i# -> word8ToWord# (indexWord8Array# ba# (i# +# off#)))
#endif
{-# INLINE utf8Decode# #-}
utf8Decode# :: (IO ()) -> (Int# -> (# Char#, Int# #)) -> Int# -> IO [Char]
utf8Decode# retain decodeChar# len#
= unpack 0#
where
unpack i#
| isTrue# (i# >=# len#) = retain >> return []
| otherwise =
case decodeChar# i# of
(# c#, nBytes# #) -> do
rest <- unsafeDupableInterleaveIO $ unpack (i# +# nBytes#)
return (C# c# : rest)
utf8DecodeForeignPtr :: ForeignPtr Word8 -> Int -> Int -> [Char]
utf8DecodeForeignPtr fp offset (I# len#)
= unsafeDupablePerformIO $ do
let !(Ptr a#) = unsafeForeignPtrToPtr fp `plusPtr` offset
utf8Decode# (touchForeignPtr fp) (utf8DecodeCharAddr# a#) len#
utf8DecodeByteArray# :: ByteArray# -> [Char]
utf8DecodeByteArray# ba#
= unsafeDupablePerformIO $
let len# = sizeofByteArray# ba# in
utf8Decode# (return ()) (utf8DecodeCharByteArray# ba#) len#
utf8CompareByteArray# :: ByteArray# -> ByteArray# -> Ordering
utf8CompareByteArray# a1 a2 = go 0# 0#
where
!sz1 = sizeofByteArray# a1
!sz2 = sizeofByteArray# a2
go off1 off2
| isTrue# ((off1 >=# sz1) `andI#` (off2 >=# sz2)) = EQ
| isTrue# (off1 >=# sz1) = LT
| isTrue# (off2 >=# sz2) = GT
| otherwise =
#if !MIN_VERSION_base(4,16,0)
let !b1_1 = indexWord8Array# a1 off1
!b2_1 = indexWord8Array# a2 off2
#else
let !b1_1 = word8ToWord# (indexWord8Array# a1 off1)
!b2_1 = word8ToWord# (indexWord8Array# a2 off2)
#endif
in case b1_1 of
0xC0## -> case b2_1 of
0xC0## -> go (off1 +# 1#) (off2 +# 1#)
#if !MIN_VERSION_base(4,16,0)
_ -> case indexWord8Array# a1 (off1 +# 1#) of
#else
_ -> case word8ToWord# (indexWord8Array# a1 (off1 +# 1#)) of
#endif
0x80## -> LT
_ -> go (off1 +# 1#) (off2 +# 1#)
_ -> case b2_1 of
#if !MIN_VERSION_base(4,16,0)
0xC0## -> case indexWord8Array# a2 (off2 +# 1#) of
#else
0xC0## -> case word8ToWord# (indexWord8Array# a2 (off2 +# 1#)) of
#endif
0x80## -> GT
_ -> go (off1 +# 1#) (off2 +# 1#)
_ | isTrue# (b1_1 `gtWord#` b2_1) -> GT
| isTrue# (b1_1 `ltWord#` b2_1) -> LT
| otherwise -> go (off1 +# 1#) (off2 +# 1#)
utf8CountCharsByteArray# :: ByteArray# -> Int
utf8CountCharsByteArray# ba = go 0# 0#
where
len# = sizeofByteArray# ba
go i# n#
| isTrue# (i# >=# len#) = I# n#
| otherwise =
case utf8DecodeCharByteArray# ba i# of
(# _, nBytes# #) -> go (i# +# nBytes#) (n# +# 1#)
{-# INLINE utf8EncodeChar #-}
utf8EncodeChar :: (Int# -> Word8# -> State# s -> State# s)
-> Char -> ST s Int
utf8EncodeChar write# c =
let x = fromIntegral (ord c) in
case () of
_ | x > 0 && x <= 0x007f -> do
write 0 x
return 1
| x <= 0x07ff -> do
write 0 (0xC0 .|. ((x `shiftR` 6) .&. 0x1F))
write 1 (0x80 .|. (x .&. 0x3F))
return 2
| x <= 0xffff -> do
write 0 (0xE0 .|. (x `shiftR` 12) .&. 0x0F)
write 1 (0x80 .|. (x `shiftR` 6) .&. 0x3F)
write 2 (0x80 .|. (x .&. 0x3F))
return 3
| otherwise -> do
write 0 (0xF0 .|. (x `shiftR` 18))
write 1 (0x80 .|. ((x `shiftR` 12) .&. 0x3F))
write 2 (0x80 .|. ((x `shiftR` 6) .&. 0x3F))
write 3 (0x80 .|. (x .&. 0x3F))
return 4
where
{-# INLINE write #-}
write (I# off#) (W# c#) = ST $ \s ->
#if !MIN_VERSION_base(4,16,0)
case write# off# (narrowWord8# c#) s of
#else
case write# off# (wordToWord8# c#) s of
#endif
s -> (# s, () #)
utf8EncodePtr :: Ptr Word8 -> String -> IO ()
utf8EncodePtr (Ptr a#) str = go a# str
where go !_ [] = return ()
go a# (c:cs) = do
#if !MIN_VERSION_base(4,16,0)
I# off# <- stToIO $ utf8EncodeChar (\i w -> writeWord8OffAddr# a# i (extendWord8# w)) c
#else
I# off# <- stToIO $ utf8EncodeChar (writeWord8OffAddr# a#) c
#endif
go (a# `plusAddr#` off#) cs
utf8EncodeByteArray# :: String -> ByteArray#
utf8EncodeByteArray# str = runRW# $ \s ->
case utf8EncodedLength str of { I# len# ->
case newByteArray# len# s of { (# s, mba# #) ->
case go mba# 0# str of { ST f_go ->
case f_go s of { (# s, () #) ->
case unsafeFreezeByteArray# mba# s of { (# _, ba# #) ->
ba# }}}}}
where
go _ _ [] = return ()
go mba# i# (c:cs) = do
#if !MIN_VERSION_base(4,16,0)
I# off# <- utf8EncodeChar (\j# w -> writeWord8Array# mba# (i# +# j#) (extendWord8# w)) c
#else
I# off# <- utf8EncodeChar (\j# -> writeWord8Array# mba# (i# +# j#)) c
#endif
go mba# (i# +# off#) cs
utf8EncodedLength :: String -> Int
utf8EncodedLength str = go 0 str
where go !n [] = n
go n (c:cs)
| ord c > 0 && ord c <= 0x007f = go (n+1) cs
| ord c <= 0x07ff = go (n+2) cs
| ord c <= 0xffff = go (n+3) cs
| otherwise = go (n+4) cs
#endif /* MIN_VERSION_base(4,18,0) */