{-# LANGUAGE Trustworthy #-} {-# LANGUAGE NoImplicitPrelude, PatternGuards #-} ----------------------------------------------------------------------------- -- | -- Module : GHC.IO.Encoding.Failure -- Copyright : (c) The University of Glasgow, 2008-2011 -- License : see libraries/base/LICENSE -- -- Maintainer : libraries@haskell.org -- Stability : internal -- Portability : non-portable -- -- Types for specifying how text encoding/decoding fails -- ----------------------------------------------------------------------------- module GHC.IO.Encoding.Failure ( CodingFailureMode(..), codingFailureModeSuffix, isSurrogate, surrogatifyRoundtripCharacter, desurrogatifyRoundtripCharacter, recoverDecode, recoverEncode ) where import GHC.IO import GHC.IO.Buffer import GHC.IO.Exception import GHC.Base import GHC.Word import GHC.Show import GHC.Num import GHC.Real ( fromIntegral ) --import System.Posix.Internals import Data.Maybe -- | The 'CodingFailureMode' is used to construct 'TextEncoding's, and specifies -- how they handle illegal sequences. data CodingFailureMode = ErrorOnCodingFailure -- ^ Throw an error when an illegal sequence is encountered | IgnoreCodingFailure -- ^ Attempt to ignore and recover if an illegal sequence is encountered | TransliterateCodingFailure -- ^ Replace with the closest visual match upon an illegal sequence | RoundtripFailure -- ^ Use the private-use escape mechanism to attempt to allow illegal sequences to be roundtripped. deriving (Show) -- This will only work properly for those encodings which are strict supersets of ASCII in the sense -- that valid ASCII data is also valid in that encoding. This is not true for e.g. UTF-16, because -- ASCII characters must be padded to two bytes to retain their meaning. -- Note [Roundtripping] -- ~~~~~~~~~~~~~~~~~~~~ -- -- Roundtripping is based on the ideas of PEP383. However, unlike PEP383 we do not wish to use lone surrogate codepoints -- to escape undecodable bytes, because that may confuse Unicode processing software written in Haskell. Instead, we use -- the range of private-use characters from 0xEF80 to 0xEFFF designated for "encoding hacks" by the ConScript Unicode Registery. -- -- This introduces a technical problem when it comes to encoding back to bytes using iconv. The iconv code will not fail when -- it tries to encode a private-use character (as it would if trying to encode a surrogate), which means that we won't get a -- chance to replace it with the byte we originally escaped. -- -- To work around this, when filling the buffer to be encoded (in writeBlocks/withEncodedCString/newEncodedCString), we replace -- the private-use characters with lone surrogates again! Likewise, when reading from a buffer (unpack/unpack_nl/peekEncodedCString) -- we have to do the inverse process. -- -- The user of String should never see these lone surrogates, but it ensures that iconv will throw an error when encountering them. -- We use lone surrogates in the range 0xDC00 to 0xDCFF for this purpose. codingFailureModeSuffix :: CodingFailureMode -> String codingFailureModeSuffix ErrorOnCodingFailure = "" codingFailureModeSuffix IgnoreCodingFailure = "//IGNORE" codingFailureModeSuffix TransliterateCodingFailure = "//TRANSLIT" codingFailureModeSuffix RoundtripFailure = "//ROUNDTRIP" -- | In transliterate mode, we use this character when decoding unknown bytes. -- -- This is the defined Unicode replacement character: <http://www.fileformat.info/info/unicode/char/0fffd/index.htm> unrepresentableChar :: Char unrepresentableChar = '\xFFFD' -- | Some characters are actually "surrogate" codepoints defined for use in UTF-16. We need to signal an -- invalid character if we detect them when encoding a sequence of 'Char's into 'Word8's because they won't -- give valid Unicode. -- -- We may also need to signal an invalid character if we detect them when encoding a sequence of 'Char's into 'Word8's -- because the 'RoundtripFailure' mode creates these to round-trip bytes through our internal UTF-16 encoding. isSurrogate :: Char -> Bool isSurrogate c = (0xD800 <= x && x <= 0xDBFF) || (0xDC00 <= x && x <= 0xDFFF) where x = ord c -- | We use some private-use characters for roundtripping unknown bytes through a String isRoundtripEscapeChar :: Char -> Bool isRoundtripEscapeChar c = 0xEF00 <= x && x < 0xF000 where x = ord c -- | We use some surrogate characters for roundtripping unknown bytes through a String isRoundtripEscapeSurrogateChar :: Char -> Bool isRoundtripEscapeSurrogateChar c = 0xDC00 <= x && x < 0xDD00 where x = ord c -- Private use characters (in Strings) --> lone surrogates (in Buffer CharBufElem) surrogatifyRoundtripCharacter :: Char -> Char surrogatifyRoundtripCharacter c | isRoundtripEscapeChar c = chr (ord c - 0xEF00 + 0xDC00) | otherwise = c -- Lone surrogates (in Buffer CharBufElem) --> private use characters (in Strings) desurrogatifyRoundtripCharacter :: Char -> Char desurrogatifyRoundtripCharacter c | isRoundtripEscapeSurrogateChar c = chr (ord c - 0xDC00 + 0xEF00) | otherwise = c -- Bytes (in Buffer Word8) --> lone surrogates (in Buffer CharBufElem) escapeToRoundtripCharacterSurrogate :: Word8 -> Char escapeToRoundtripCharacterSurrogate b | b < 128 = chr (fromIntegral b) -- Disallow 'smuggling' of ASCII bytes. For roundtripping to work, this assumes encoding is ASCII-superset. | otherwise = chr (0xDC00 + fromIntegral b) -- Lone surrogates (in Buffer CharBufElem) --> bytes (in Buffer Word8) unescapeRoundtripCharacterSurrogate :: Char -> Maybe Word8 unescapeRoundtripCharacterSurrogate c | 0xDC80 <= x && x < 0xDD00 = Just (fromIntegral x) -- Discard high byte | otherwise = Nothing where x = ord c recoverDecode :: CodingFailureMode -> Buffer Word8 -> Buffer Char -> IO (Buffer Word8, Buffer Char) recoverDecode cfm input@Buffer{ bufRaw=iraw, bufL=ir, bufR=_ } output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow } = do --puts $ "recoverDecode " ++ show ir case cfm of ErrorOnCodingFailure -> ioe_decodingError IgnoreCodingFailure -> return (input { bufL=ir+1 }, output) TransliterateCodingFailure -> do ow' <- writeCharBuf oraw ow unrepresentableChar return (input { bufL=ir+1 }, output { bufR=ow' }) RoundtripFailure -> do b <- readWord8Buf iraw ir ow' <- writeCharBuf oraw ow (escapeToRoundtripCharacterSurrogate b) return (input { bufL=ir+1 }, output { bufR=ow' }) recoverEncode :: CodingFailureMode -> Buffer Char -> Buffer Word8 -> IO (Buffer Char, Buffer Word8) recoverEncode cfm input@Buffer{ bufRaw=iraw, bufL=ir, bufR=_ } output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow } = do (c,ir') <- readCharBuf iraw ir --puts $ "recoverEncode " ++ show ir ++ " " ++ show ir' case cfm of IgnoreCodingFailure -> return (input { bufL=ir' }, output) TransliterateCodingFailure -> do if c == '?' then return (input { bufL=ir' }, output) else do -- XXX: evil hack! To implement transliteration, we just poke an -- ASCII ? into the input buffer and tell the caller to try and decode -- again. This is *probably* safe given current uses of TextEncoding. -- -- The "if" test above ensures we skip if the encoding fails to deal with -- the ?, though this should never happen in practice as all encodings are -- in fact capable of reperesenting all ASCII characters. _ir' <- writeCharBuf iraw ir '?' return (input, output) -- This implementation does not work because e.g. UTF-16 requires 2 bytes to -- encode a simple ASCII value --writeWord8Buf oraw ow unrepresentableByte --return (input { bufL=ir' }, output { bufR=ow+1 }) RoundtripFailure | Just x <- unescapeRoundtripCharacterSurrogate c -> do writeWord8Buf oraw ow x return (input { bufL=ir' }, output { bufR=ow+1 }) _ -> ioe_encodingError ioe_decodingError :: IO a ioe_decodingError = ioException (IOError Nothing InvalidArgument "recoverDecode" "invalid byte sequence" Nothing Nothing) ioe_encodingError :: IO a ioe_encodingError = ioException (IOError Nothing InvalidArgument "recoverEncode" "invalid character" Nothing Nothing)