{-# LANGUAGE Trustworthy #-}
{-# LANGUAGE NoImplicitPrelude #-}

-----------------------------------------------------------------------------
-- |
-- Module      :  GHC.IO.Encoding.Failure
-- Copyright   :  (c) The University of Glasgow, 2008-2011
-- License     :  see libraries/base/LICENSE
--
-- Maintainer  :  libraries@haskell.org
-- Stability   :  internal
-- Portability :  non-portable
--
-- Types for specifying how text encoding/decoding fails
--
-----------------------------------------------------------------------------

module GHC.IO.Encoding.Failure (
    CodingFailureMode(..), codingFailureModeSuffix,
    isSurrogate,
    recoverDecode, recoverEncode
  ) where

import GHC.IO
import GHC.IO.Buffer
import GHC.IO.Exception

import GHC.Base
import GHC.Char
import GHC.Word
import GHC.Show
import GHC.Num
import GHC.Real ( fromIntegral )

--import System.Posix.Internals

-- | The 'CodingFailureMode' is used to construct 'System.IO.TextEncoding's,
-- and specifies how they handle illegal sequences.
data CodingFailureMode
  = ErrorOnCodingFailure
       -- ^ Throw an error when an illegal sequence is encountered
  | IgnoreCodingFailure
       -- ^ Attempt to ignore and recover if an illegal sequence is
       -- encountered
  | TransliterateCodingFailure
       -- ^ Replace with the closest visual match upon an illegal
       -- sequence
  | RoundtripFailure
       -- ^ Use the private-use escape mechanism to attempt to allow
       -- illegal sequences to be roundtripped.
  deriving ( Int -> CodingFailureMode -> ShowS
[CodingFailureMode] -> ShowS
CodingFailureMode -> String
(Int -> CodingFailureMode -> ShowS)
-> (CodingFailureMode -> String)
-> ([CodingFailureMode] -> ShowS)
-> Show CodingFailureMode
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
$cshowsPrec :: Int -> CodingFailureMode -> ShowS
showsPrec :: Int -> CodingFailureMode -> ShowS
$cshow :: CodingFailureMode -> String
show :: CodingFailureMode -> String
$cshowList :: [CodingFailureMode] -> ShowS
showList :: [CodingFailureMode] -> ShowS
Show -- ^ @since 4.4.0.0
           )
       -- This will only work properly for those encodings which are
       -- strict supersets of ASCII in the sense that valid ASCII data
       -- is also valid in that encoding. This is not true for
       -- e.g. UTF-16, because ASCII characters must be padded to two
       -- bytes to retain their meaning.

-- Note [Roundtripping]
-- ~~~~~~~~~~~~~~~~~~~~
-- Roundtripping is based on the ideas of PEP383.
--
-- We used to use the range of private-use characters from 0xEF80 to
-- 0xEFFF designated for "encoding hacks" by the ConScript Unicode Registery
-- to encode these characters.
--
-- However, people didn't like this because it means we don't get
-- guaranteed roundtripping for byte sequences that look like a UTF-8
-- encoded codepoint 0xEFxx.
--
-- So now like PEP383 we use lone surrogate codepoints 0xDCxx to escape
-- undecodable bytes, even though that may confuse Unicode processing
-- software written in Haskell. This guarantees roundtripping because
-- unicode input that includes lone surrogate codepoints is invalid by
-- definition.
--
--
-- When we used private-use characters there was a technical problem when it
-- came to encoding back to bytes using iconv. The iconv code will not fail when
-- it tries to encode a private-use character (as it would if trying to encode
-- a surrogate), which means that we wouldn't get a chance to replace it
-- with the byte we originally escaped.
--
-- To work around this, when filling the buffer to be encoded (in
-- writeBlocks/withEncodedCString/newEncodedCString), we replaced the
-- private-use characters with lone surrogates again! Likewise, when
-- reading from a buffer (unpack/unpack_nl/peekEncodedCString) we had
-- to do the inverse process.
--
-- The user of String would never see these lone surrogates, but it
-- ensured that iconv will throw an error when encountering them.  We
-- used lone surrogates in the range 0xDC00 to 0xDCFF for this purpose.

codingFailureModeSuffix :: CodingFailureMode -> String
codingFailureModeSuffix :: CodingFailureMode -> String
codingFailureModeSuffix CodingFailureMode
ErrorOnCodingFailure       = String
""
codingFailureModeSuffix CodingFailureMode
IgnoreCodingFailure        = String
"//IGNORE"
codingFailureModeSuffix CodingFailureMode
TransliterateCodingFailure = String
"//TRANSLIT"
codingFailureModeSuffix CodingFailureMode
RoundtripFailure           = String
"//ROUNDTRIP"

-- | In transliterate mode, we use this character when decoding
-- unknown bytes.
--
-- This is the defined Unicode replacement character:
-- <http://www.fileformat.info/info/unicode/char/0fffd/index.htm>
unrepresentableChar :: Char
unrepresentableChar :: Char
unrepresentableChar = Char
'\xFFFD'

-- It is extraordinarily important that this series of
-- predicates/transformers gets inlined, because they tend to be used
-- in inner loops related to text encoding. In particular,
-- surrogatifyRoundtripCharacter must be inlined (see #5536)

-- | Some characters are actually "surrogate" codepoints defined for
-- use in UTF-16. We need to signal an invalid character if we detect
-- them when encoding a sequence of 'Char's into 'Word8's because they
-- won't give valid Unicode.
--
-- We may also need to signal an invalid character if we detect them
-- when encoding a sequence of 'Char's into 'Word8's because the
-- 'RoundtripFailure' mode creates these to round-trip bytes through
-- our internal UTF-16 encoding.
{-# INLINE isSurrogate #-}
isSurrogate :: Char -> Bool
isSurrogate :: Char -> Bool
isSurrogate Char
c = (Int
0xD800 Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
<= Int
x Bool -> Bool -> Bool
&& Int
x Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
<= Int
0xDBFF)
             Bool -> Bool -> Bool
|| (Int
0xDC00 Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
<= Int
x Bool -> Bool -> Bool
&& Int
x Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
<= Int
0xDFFF)
  where x :: Int
x = Char -> Int
ord Char
c

-- Bytes (in Buffer Word8) --> lone surrogates (in Buffer CharBufElem)
{-# INLINE escapeToRoundtripCharacterSurrogate #-}
escapeToRoundtripCharacterSurrogate :: Word8 -> Char
escapeToRoundtripCharacterSurrogate :: Word8 -> Char
escapeToRoundtripCharacterSurrogate Word8
b
  | Word8
b Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
< Word8
128   = Int -> Char
chr (Word8 -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral Word8
b)
      -- Disallow 'smuggling' of ASCII bytes. For roundtripping to
      -- work, this assumes encoding is ASCII-superset.
  | Bool
otherwise = Int -> Char
chr (Int
0xDC00 Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Word8 -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral Word8
b)

-- Lone surrogates (in Buffer CharBufElem) --> bytes (in Buffer Word8)
{-# INLINE unescapeRoundtripCharacterSurrogate #-}
unescapeRoundtripCharacterSurrogate :: Char -> Maybe Word8
unescapeRoundtripCharacterSurrogate :: Char -> Maybe Word8
unescapeRoundtripCharacterSurrogate Char
c
    | Int
0xDC80 Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
<= Int
x Bool -> Bool -> Bool
&& Int
x Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< Int
0xDD00 = Word8 -> Maybe Word8
forall a. a -> Maybe a
Just (Int -> Word8
forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
x) -- Discard high byte
    | Bool
otherwise                 = Maybe Word8
forall a. Maybe a
Nothing
  where x :: Int
x = Char -> Int
ord Char
c

recoverDecode :: CodingFailureMode -> Buffer Word8 -> Buffer Char
              -> IO (Buffer Word8, Buffer Char)
recoverDecode :: CodingFailureMode
-> Buffer Word8 -> Buffer Char -> IO (Buffer Word8, Buffer Char)
recoverDecode CodingFailureMode
cfm input :: Buffer Word8
input@Buffer{  bufRaw :: forall e. Buffer e -> RawBuffer e
bufRaw=RawBuffer Word8
iraw, bufL :: forall e. Buffer e -> Int
bufL=Int
ir, bufR :: forall e. Buffer e -> Int
bufR=Int
_  }
                  output :: Buffer Char
output@Buffer{ bufRaw :: forall e. Buffer e -> RawBuffer e
bufRaw=RawBuffer Char
oraw, bufL :: forall e. Buffer e -> Int
bufL=Int
_,  bufR :: forall e. Buffer e -> Int
bufR=Int
ow } =
 --puts $ "recoverDecode " ++ show ir
 case CodingFailureMode
cfm of
  CodingFailureMode
ErrorOnCodingFailure       -> IO (Buffer Word8, Buffer Char)
forall a. IO a
ioe_decodingError
  CodingFailureMode
IgnoreCodingFailure        -> (Buffer Word8, Buffer Char) -> IO (Buffer Word8, Buffer Char)
forall a. a -> IO a
forall (m :: * -> *) a. Monad m => a -> m a
return (Buffer Word8
input { bufL :: Int
bufL=Int
irInt -> Int -> Int
forall a. Num a => a -> a -> a
+Int
1 }, Buffer Char
output)
  CodingFailureMode
TransliterateCodingFailure -> do
      Int
ow' <- RawBuffer Char -> Int -> Char -> IO Int
writeCharBuf RawBuffer Char
oraw Int
ow Char
unrepresentableChar
      (Buffer Word8, Buffer Char) -> IO (Buffer Word8, Buffer Char)
forall a. a -> IO a
forall (m :: * -> *) a. Monad m => a -> m a
return (Buffer Word8
input { bufL :: Int
bufL=Int
irInt -> Int -> Int
forall a. Num a => a -> a -> a
+Int
1 }, Buffer Char
output { bufR :: Int
bufR=Int
ow' })
  CodingFailureMode
RoundtripFailure           -> do
      Word8
b <- RawBuffer Word8 -> Int -> IO Word8
readWord8Buf RawBuffer Word8
iraw Int
ir
      Int
ow' <- RawBuffer Char -> Int -> Char -> IO Int
writeCharBuf RawBuffer Char
oraw Int
ow (Word8 -> Char
escapeToRoundtripCharacterSurrogate Word8
b)
      (Buffer Word8, Buffer Char) -> IO (Buffer Word8, Buffer Char)
forall a. a -> IO a
forall (m :: * -> *) a. Monad m => a -> m a
return (Buffer Word8
input { bufL :: Int
bufL=Int
irInt -> Int -> Int
forall a. Num a => a -> a -> a
+Int
1 }, Buffer Char
output { bufR :: Int
bufR=Int
ow' })

recoverEncode :: CodingFailureMode -> Buffer Char -> Buffer Word8
              -> IO (Buffer Char, Buffer Word8)
recoverEncode :: CodingFailureMode
-> Buffer Char -> Buffer Word8 -> IO (Buffer Char, Buffer Word8)
recoverEncode CodingFailureMode
cfm input :: Buffer Char
input@Buffer{  bufRaw :: forall e. Buffer e -> RawBuffer e
bufRaw=RawBuffer Char
iraw, bufL :: forall e. Buffer e -> Int
bufL=Int
ir, bufR :: forall e. Buffer e -> Int
bufR=Int
_  }
                  output :: Buffer Word8
output@Buffer{ bufRaw :: forall e. Buffer e -> RawBuffer e
bufRaw=RawBuffer Word8
oraw, bufL :: forall e. Buffer e -> Int
bufL=Int
_,  bufR :: forall e. Buffer e -> Int
bufR=Int
ow } = do
  (Char
c,Int
ir') <- RawBuffer Char -> Int -> IO (Char, Int)
readCharBuf RawBuffer Char
iraw Int
ir
  --puts $ "recoverEncode " ++ show ir ++ " " ++ show ir'
  case CodingFailureMode
cfm of
    CodingFailureMode
IgnoreCodingFailure        -> (Buffer Char, Buffer Word8) -> IO (Buffer Char, Buffer Word8)
forall a. a -> IO a
forall (m :: * -> *) a. Monad m => a -> m a
return (Buffer Char
input { bufL :: Int
bufL=Int
ir' }, Buffer Word8
output)
    CodingFailureMode
TransliterateCodingFailure -> do
        if Char
c Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
== Char
'?'
         then (Buffer Char, Buffer Word8) -> IO (Buffer Char, Buffer Word8)
forall a. a -> IO a
forall (m :: * -> *) a. Monad m => a -> m a
return (Buffer Char
input { bufL :: Int
bufL=Int
ir' }, Buffer Word8
output)
         else do
          -- XXX: evil hack! To implement transliteration, we just
          -- poke an ASCII ? into the input buffer and tell the caller
          -- to try and decode again. This is *probably* safe given
          -- current uses of TextEncoding.
          --
          -- The "if" test above ensures we skip if the encoding fails
          -- to deal with the ?, though this should never happen in
          -- practice as all encodings are in fact capable of
          -- reperesenting all ASCII characters.
          Int
_ir' <- RawBuffer Char -> Int -> Char -> IO Int
writeCharBuf RawBuffer Char
iraw Int
ir Char
'?'
          (Buffer Char, Buffer Word8) -> IO (Buffer Char, Buffer Word8)
forall a. a -> IO a
forall (m :: * -> *) a. Monad m => a -> m a
return (Buffer Char
input, Buffer Word8
output)

        -- This implementation does not work because e.g. UTF-16
        -- requires 2 bytes to encode a simple ASCII value
        --writeWord8Buf oraw ow unrepresentableByte
        --return (input { bufL=ir' }, output { bufR=ow+1 })
    CodingFailureMode
RoundtripFailure | Just Word8
x <- Char -> Maybe Word8
unescapeRoundtripCharacterSurrogate Char
c -> do
        RawBuffer Word8 -> Int -> Word8 -> IO ()
writeWord8Buf RawBuffer Word8
oraw Int
ow Word8
x
        (Buffer Char, Buffer Word8) -> IO (Buffer Char, Buffer Word8)
forall a. a -> IO a
forall (m :: * -> *) a. Monad m => a -> m a
return (Buffer Char
input { bufL :: Int
bufL=Int
ir' }, Buffer Word8
output { bufR :: Int
bufR=Int
owInt -> Int -> Int
forall a. Num a => a -> a -> a
+Int
1 })
    CodingFailureMode
_                          -> IO (Buffer Char, Buffer Word8)
forall a. IO a
ioe_encodingError

ioe_decodingError :: IO a
ioe_decodingError :: forall a. IO a
ioe_decodingError = IOException -> IO a
forall a. IOException -> IO a
ioException
    (Maybe Handle
-> IOErrorType
-> String
-> String
-> Maybe CInt
-> Maybe String
-> IOException
IOError Maybe Handle
forall a. Maybe a
Nothing IOErrorType
InvalidArgument String
"recoverDecode"
        String
"invalid byte sequence" Maybe CInt
forall a. Maybe a
Nothing Maybe String
forall a. Maybe a
Nothing)

ioe_encodingError :: IO a
ioe_encodingError :: forall a. IO a
ioe_encodingError = IOException -> IO a
forall a. IOException -> IO a
ioException
    (Maybe Handle
-> IOErrorType
-> String
-> String
-> Maybe CInt
-> Maybe String
-> IOException
IOError Maybe Handle
forall a. Maybe a
Nothing IOErrorType
InvalidArgument String
"recoverEncode"
        String
"invalid character" Maybe CInt
forall a. Maybe a
Nothing Maybe String
forall a. Maybe a
Nothing)