module GHC.IO.Encoding.UTF8 (
utf8, mkUTF8,
utf8_bom, mkUTF8_bom
) where
import GHC.Base
import GHC.Real
import GHC.Num
import GHC.IORef
import GHC.IO.Buffer
import GHC.IO.Encoding.Failure
import GHC.IO.Encoding.Types
import GHC.Word
import Data.Bits
utf8 :: TextEncoding
utf8 = mkUTF8 ErrorOnCodingFailure
mkUTF8 :: CodingFailureMode -> TextEncoding
mkUTF8 cfm = TextEncoding { textEncodingName = "UTF-8",
mkTextDecoder = utf8_DF cfm,
mkTextEncoder = utf8_EF cfm }
utf8_DF :: CodingFailureMode -> IO (TextDecoder ())
utf8_DF cfm =
return (BufferCodec {
encode = utf8_decode,
recover = recoverDecode cfm,
close = return (),
getState = return (),
setState = const $ return ()
})
utf8_EF :: CodingFailureMode -> IO (TextEncoder ())
utf8_EF cfm =
return (BufferCodec {
encode = utf8_encode,
recover = recoverEncode cfm,
close = return (),
getState = return (),
setState = const $ return ()
})
utf8_bom :: TextEncoding
utf8_bom = mkUTF8_bom ErrorOnCodingFailure
mkUTF8_bom :: CodingFailureMode -> TextEncoding
mkUTF8_bom cfm = TextEncoding { textEncodingName = "UTF-8BOM",
mkTextDecoder = utf8_bom_DF cfm,
mkTextEncoder = utf8_bom_EF cfm }
utf8_bom_DF :: CodingFailureMode -> IO (TextDecoder Bool)
utf8_bom_DF cfm = do
ref <- newIORef True
return (BufferCodec {
encode = utf8_bom_decode ref,
recover = recoverDecode cfm,
close = return (),
getState = readIORef ref,
setState = writeIORef ref
})
utf8_bom_EF :: CodingFailureMode -> IO (TextEncoder Bool)
utf8_bom_EF cfm = do
ref <- newIORef True
return (BufferCodec {
encode = utf8_bom_encode ref,
recover = recoverEncode cfm,
close = return (),
getState = readIORef ref,
setState = writeIORef ref
})
utf8_bom_decode :: IORef Bool -> DecodeBuffer
utf8_bom_decode ref
input@Buffer{ bufRaw=iraw, bufL=ir, bufR=iw, bufSize=_ }
output
= do
first <- readIORef ref
if not first
then utf8_decode input output
else do
let no_bom = do writeIORef ref False; utf8_decode input output
if iw ir < 1 then return (InputUnderflow,input,output) else do
c0 <- readWord8Buf iraw ir
if (c0 /= bom0) then no_bom else do
if iw ir < 2 then return (InputUnderflow,input,output) else do
c1 <- readWord8Buf iraw (ir+1)
if (c1 /= bom1) then no_bom else do
if iw ir < 3 then return (InputUnderflow,input,output) else do
c2 <- readWord8Buf iraw (ir+2)
if (c2 /= bom2) then no_bom else do
writeIORef ref False
utf8_decode input{ bufL = ir + 3 } output
utf8_bom_encode :: IORef Bool -> EncodeBuffer
utf8_bom_encode ref input
output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow, bufSize=os }
= do
b <- readIORef ref
if not b then utf8_encode input output
else if os ow < 3
then return (OutputUnderflow,input,output)
else do
writeIORef ref False
writeWord8Buf oraw ow bom0
writeWord8Buf oraw (ow+1) bom1
writeWord8Buf oraw (ow+2) bom2
utf8_encode input output{ bufR = ow+3 }
bom0, bom1, bom2 :: Word8
bom0 = 0xef
bom1 = 0xbb
bom2 = 0xbf
utf8_decode :: DecodeBuffer
utf8_decode
input@Buffer{ bufRaw=iraw, bufL=ir0, bufR=iw, bufSize=_ }
output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os }
= let
loop !ir !ow
| ow >= os = done OutputUnderflow ir ow
| ir >= iw = done InputUnderflow ir ow
| otherwise = do
c0 <- readWord8Buf iraw ir
case c0 of
_ | c0 <= 0x7f -> do
ow' <- writeCharBuf oraw ow (unsafeChr (fromIntegral c0))
loop (ir+1) ow'
| c0 >= 0xc0 && c0 <= 0xc1 -> invalid
| c0 >= 0xc2 && c0 <= 0xdf ->
if iw ir < 2 then done InputUnderflow ir ow else do
c1 <- readWord8Buf iraw (ir+1)
if (c1 < 0x80 || c1 >= 0xc0) then invalid else do
ow' <- writeCharBuf oraw ow (chr2 c0 c1)
loop (ir+2) ow'
| c0 >= 0xe0 && c0 <= 0xef ->
case iw ir of
1 -> done InputUnderflow ir ow
2 -> do
c1 <- readWord8Buf iraw (ir+1)
if not (validate3 c0 c1 0x80)
then invalid else done InputUnderflow ir ow
_ -> do
c1 <- readWord8Buf iraw (ir+1)
c2 <- readWord8Buf iraw (ir+2)
if not (validate3 c0 c1 c2) then invalid else do
ow' <- writeCharBuf oraw ow (chr3 c0 c1 c2)
loop (ir+3) ow'
| c0 >= 0xf0 ->
case iw ir of
1 -> done InputUnderflow ir ow
2 -> do
c1 <- readWord8Buf iraw (ir+1)
if not (validate4 c0 c1 0x80 0x80)
then invalid else done InputUnderflow ir ow
3 -> do
c1 <- readWord8Buf iraw (ir+1)
c2 <- readWord8Buf iraw (ir+2)
if not (validate4 c0 c1 c2 0x80)
then invalid else done InputUnderflow ir ow
_ -> do
c1 <- readWord8Buf iraw (ir+1)
c2 <- readWord8Buf iraw (ir+2)
c3 <- readWord8Buf iraw (ir+3)
if not (validate4 c0 c1 c2 c3) then invalid else do
ow' <- writeCharBuf oraw ow (chr4 c0 c1 c2 c3)
loop (ir+4) ow'
| otherwise ->
invalid
where
invalid = done InvalidSequence ir ow
done why !ir !ow = return (why,
if ir == iw then input{ bufL=0, bufR=0 }
else input{ bufL=ir },
output{ bufR=ow })
in
loop ir0 ow0
utf8_encode :: EncodeBuffer
utf8_encode
input@Buffer{ bufRaw=iraw, bufL=ir0, bufR=iw, bufSize=_ }
output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os }
= let
done why !ir !ow = return (why,
if ir == iw then input{ bufL=0, bufR=0 }
else input{ bufL=ir },
output{ bufR=ow })
loop !ir !ow
| ow >= os = done OutputUnderflow ir ow
| ir >= iw = done InputUnderflow ir ow
| otherwise = do
(c,ir') <- readCharBuf iraw ir
case ord c of
x | x <= 0x7F -> do
writeWord8Buf oraw ow (fromIntegral x)
loop ir' (ow+1)
| x <= 0x07FF ->
if os ow < 2 then done OutputUnderflow ir ow else do
let (c1,c2) = ord2 c
writeWord8Buf oraw ow c1
writeWord8Buf oraw (ow+1) c2
loop ir' (ow+2)
| x <= 0xFFFF -> if isSurrogate c then done InvalidSequence ir ow else do
if os ow < 3 then done OutputUnderflow ir ow else do
let (c1,c2,c3) = ord3 c
writeWord8Buf oraw ow c1
writeWord8Buf oraw (ow+1) c2
writeWord8Buf oraw (ow+2) c3
loop ir' (ow+3)
| otherwise -> do
if os ow < 4 then done OutputUnderflow ir ow else do
let (c1,c2,c3,c4) = ord4 c
writeWord8Buf oraw ow c1
writeWord8Buf oraw (ow+1) c2
writeWord8Buf oraw (ow+2) c3
writeWord8Buf oraw (ow+3) c4
loop ir' (ow+4)
in
loop ir0 ow0
ord2 :: Char -> (Word8,Word8)
ord2 c = assert (n >= 0x80 && n <= 0x07ff) (x1,x2)
where
n = ord c
x1 = fromIntegral $ (n `shiftR` 6) + 0xC0
x2 = fromIntegral $ (n .&. 0x3F) + 0x80
ord3 :: Char -> (Word8,Word8,Word8)
ord3 c = assert (n >= 0x0800 && n <= 0xffff) (x1,x2,x3)
where
n = ord c
x1 = fromIntegral $ (n `shiftR` 12) + 0xE0
x2 = fromIntegral $ ((n `shiftR` 6) .&. 0x3F) + 0x80
x3 = fromIntegral $ (n .&. 0x3F) + 0x80
ord4 :: Char -> (Word8,Word8,Word8,Word8)
ord4 c = assert (n >= 0x10000) (x1,x2,x3,x4)
where
n = ord c
x1 = fromIntegral $ (n `shiftR` 18) + 0xF0
x2 = fromIntegral $ ((n `shiftR` 12) .&. 0x3F) + 0x80
x3 = fromIntegral $ ((n `shiftR` 6) .&. 0x3F) + 0x80
x4 = fromIntegral $ (n .&. 0x3F) + 0x80
chr2 :: Word8 -> Word8 -> Char
chr2 (W8# x1#) (W8# x2#) = C# (chr# (z1# +# z2#))
where
!y1# = word2Int# x1#
!y2# = word2Int# x2#
!z1# = uncheckedIShiftL# (y1# -# 0xC0#) 6#
!z2# = y2# -# 0x80#
chr3 :: Word8 -> Word8 -> Word8 -> Char
chr3 (W8# x1#) (W8# x2#) (W8# x3#) = C# (chr# (z1# +# z2# +# z3#))
where
!y1# = word2Int# x1#
!y2# = word2Int# x2#
!y3# = word2Int# x3#
!z1# = uncheckedIShiftL# (y1# -# 0xE0#) 12#
!z2# = uncheckedIShiftL# (y2# -# 0x80#) 6#
!z3# = y3# -# 0x80#
chr4 :: Word8 -> Word8 -> Word8 -> Word8 -> Char
chr4 (W8# x1#) (W8# x2#) (W8# x3#) (W8# x4#) =
C# (chr# (z1# +# z2# +# z3# +# z4#))
where
!y1# = word2Int# x1#
!y2# = word2Int# x2#
!y3# = word2Int# x3#
!y4# = word2Int# x4#
!z1# = uncheckedIShiftL# (y1# -# 0xF0#) 18#
!z2# = uncheckedIShiftL# (y2# -# 0x80#) 12#
!z3# = uncheckedIShiftL# (y3# -# 0x80#) 6#
!z4# = y4# -# 0x80#
between :: Word8
-> Word8
-> Word8
-> Bool
between x y z = x >= y && x <= z
validate3 :: Word8 -> Word8 -> Word8 -> Bool
validate3 x1 x2 x3 = validate3_1 ||
validate3_2 ||
validate3_3 ||
validate3_4
where
validate3_1 = (x1 == 0xE0) &&
between x2 0xA0 0xBF &&
between x3 0x80 0xBF
validate3_2 = between x1 0xE1 0xEC &&
between x2 0x80 0xBF &&
between x3 0x80 0xBF
validate3_3 = x1 == 0xED &&
between x2 0x80 0x9F &&
between x3 0x80 0xBF
validate3_4 = between x1 0xEE 0xEF &&
between x2 0x80 0xBF &&
between x3 0x80 0xBF
validate4 :: Word8 -> Word8 -> Word8 -> Word8 -> Bool
validate4 x1 x2 x3 x4 = validate4_1 ||
validate4_2 ||
validate4_3
where
validate4_1 = x1 == 0xF0 &&
between x2 0x90 0xBF &&
between x3 0x80 0xBF &&
between x4 0x80 0xBF
validate4_2 = between x1 0xF1 0xF3 &&
between x2 0x80 0xBF &&
between x3 0x80 0xBF &&
between x4 0x80 0xBF
validate4_3 = x1 == 0xF4 &&
between x2 0x80 0x8F &&
between x3 0x80 0xBF &&
between x4 0x80 0xBF