{-# LANGUAGE BangPatterns,CPP #-}
{-# LANGUAGE Trustworthy #-}
-- |
-- Module      : Data.Text.Lazy.Encoding
-- Copyright   : (c) 2009, 2010 Bryan O'Sullivan
--
-- License     : BSD-style
-- Maintainer  : bos@serpentine.com
-- Portability : portable
--
-- Functions for converting lazy 'Text' values to and from lazy
-- 'ByteString', using several standard encodings.
--
-- To gain access to a much larger family of encodings, use the
-- <http://hackage.haskell.org/package/text-icu text-icu package>.

module Data.Text.Lazy.Encoding
    (
    -- * Decoding ByteStrings to Text
    -- $strict
      decodeASCII
    , decodeLatin1
    , decodeUtf8
    , decodeUtf16LE
    , decodeUtf16BE
    , decodeUtf32LE
    , decodeUtf32BE

    -- ** Catchable failure
    , decodeUtf8'

    -- ** Controllable error handling
    , decodeUtf8With
    , decodeUtf16LEWith
    , decodeUtf16BEWith
    , decodeUtf32LEWith
    , decodeUtf32BEWith

    -- * Encoding Text to ByteStrings
    , encodeUtf8
    , encodeUtf16LE
    , encodeUtf16BE
    , encodeUtf32LE
    , encodeUtf32BE

    -- * Encoding Text using ByteString Builders
    , encodeUtf8Builder
    , encodeUtf8BuilderEscaped
    ) where

import Control.Exception (evaluate, try)
import Data.Monoid (Monoid(..))
import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode)
import Data.Text.Internal.Lazy (Text(..), chunk, empty, foldrChunks)
import Data.Word (Word8)
import qualified Data.ByteString as S
import qualified Data.ByteString.Builder as B
import qualified Data.ByteString.Builder.Extra as B (safeStrategy, toLazyByteStringWith)
import qualified Data.ByteString.Builder.Prim as BP
import qualified Data.ByteString.Lazy as B
import qualified Data.ByteString.Lazy.Internal as B
import qualified Data.Text as T
import qualified Data.Text.Encoding as TE
import qualified Data.Text.Internal.Lazy.Encoding.Fusion as E
import qualified Data.Text.Internal.Lazy.Fusion as F
import Data.Text.Unsafe (unsafeDupablePerformIO)

-- $strict
--
-- All of the single-parameter functions for decoding bytestrings
-- encoded in one of the Unicode Transformation Formats (UTF) operate
-- in a /strict/ mode: each will throw an exception if given invalid
-- input.
--
-- Each function has a variant, whose name is suffixed with -'With',
-- that gives greater control over the handling of decoding errors.
-- For instance, 'decodeUtf8' will throw an exception, but
-- 'decodeUtf8With' allows the programmer to determine what to do on a
-- decoding error.

-- | /Deprecated/.  Decode a 'ByteString' containing 7-bit ASCII
-- encoded text.
decodeASCII :: B.ByteString -> Text
decodeASCII :: ByteString -> Text
decodeASCII = ByteString -> Text
decodeUtf8
{-# DEPRECATED decodeASCII "Use decodeUtf8 instead" #-}

-- | Decode a 'ByteString' containing Latin-1 (aka ISO-8859-1) encoded text.
decodeLatin1 :: B.ByteString -> Text
decodeLatin1 :: ByteString -> Text
decodeLatin1 = (ByteString -> Text -> Text) -> Text -> [ByteString] -> Text
forall (t :: * -> *) a b.
Foldable t =>
(a -> b -> b) -> b -> t a -> b
foldr (Text -> Text -> Text
chunk (Text -> Text -> Text)
-> (ByteString -> Text) -> ByteString -> Text -> Text
forall b c a. (b -> c) -> (a -> b) -> a -> c
. ByteString -> Text
TE.decodeLatin1) Text
empty ([ByteString] -> Text)
-> (ByteString -> [ByteString]) -> ByteString -> Text
forall b c a. (b -> c) -> (a -> b) -> a -> c
. ByteString -> [ByteString]
B.toChunks

-- | Decode a 'ByteString' containing UTF-8 encoded text.
decodeUtf8With :: OnDecodeError -> B.ByteString -> Text
decodeUtf8With :: OnDecodeError -> ByteString -> Text
decodeUtf8With OnDecodeError
onErr (B.Chunk ByteString
b0 ByteString
bs0) =
    case OnDecodeError -> ByteString -> Decoding
TE.streamDecodeUtf8With OnDecodeError
onErr ByteString
b0 of
      TE.Some Text
t ByteString
l ByteString -> Decoding
f -> Text -> Text -> Text
chunk Text
t ((ByteString -> Decoding) -> ByteString -> ByteString -> Text
go ByteString -> Decoding
f ByteString
l ByteString
bs0)
  where
    go :: (ByteString -> Decoding) -> ByteString -> ByteString -> Text
go ByteString -> Decoding
f0 ByteString
_ (B.Chunk ByteString
b ByteString
bs) =
      case ByteString -> Decoding
f0 ByteString
b of
        TE.Some Text
t ByteString
l ByteString -> Decoding
f -> Text -> Text -> Text
chunk Text
t ((ByteString -> Decoding) -> ByteString -> ByteString -> Text
go ByteString -> Decoding
f ByteString
l ByteString
bs)
    go ByteString -> Decoding
_ ByteString
l ByteString
_
      | ByteString -> Bool
S.null ByteString
l  = Text
empty
      | Bool
otherwise =
        let !t :: Text
t = String -> Text
T.pack (ByteString -> String
skipBytes ByteString
l)
            skipBytes :: ByteString -> String
skipBytes = (Word8 -> String -> String) -> String -> ByteString -> String
forall a. (Word8 -> a -> a) -> a -> ByteString -> a
S.foldr (\Word8
x String
s' ->
                  case OnDecodeError
onErr String
desc (Word8 -> Maybe Word8
forall a. a -> Maybe a
Just Word8
x) of
                    Just Char
c -> Char
c Char -> String -> String
forall a. a -> [a] -> [a]
: String
s'
                    Maybe Char
Nothing -> String
s') [] in
        Text -> Text -> Text
Chunk Text
t Text
Empty
    desc :: String
desc = String
"Data.Text.Lazy.Encoding.decodeUtf8With: Invalid UTF-8 stream"
decodeUtf8With OnDecodeError
_ ByteString
_ = Text
empty

-- | Decode a 'ByteString' containing UTF-8 encoded text that is known
-- to be valid.
--
-- If the input contains any invalid UTF-8 data, an exception will be
-- thrown that cannot be caught in pure code.  For more control over
-- the handling of invalid data, use 'decodeUtf8'' or
-- 'decodeUtf8With'.
decodeUtf8 :: B.ByteString -> Text
decodeUtf8 :: ByteString -> Text
decodeUtf8 = OnDecodeError -> ByteString -> Text
decodeUtf8With OnDecodeError
strictDecode
{-# INLINE[0] decodeUtf8 #-}

-- This rule seems to cause performance loss.
{- RULES "LAZY STREAM stream/decodeUtf8' fusion" [1]
   forall bs. F.stream (decodeUtf8' bs) = E.streamUtf8 strictDecode bs #-}

-- | Decode a 'ByteString' containing UTF-8 encoded text..
--
-- If the input contains any invalid UTF-8 data, the relevant
-- exception will be returned, otherwise the decoded text.
--
-- /Note/: this function is /not/ lazy, as it must decode its entire
-- input before it can return a result.  If you need lazy (streaming)
-- decoding, use 'decodeUtf8With' in lenient mode.
decodeUtf8' :: B.ByteString -> Either UnicodeException Text
decodeUtf8' :: ByteString -> Either UnicodeException Text
decodeUtf8' ByteString
bs = IO (Either UnicodeException Text) -> Either UnicodeException Text
forall a. IO a -> a
unsafeDupablePerformIO (IO (Either UnicodeException Text) -> Either UnicodeException Text)
-> IO (Either UnicodeException Text)
-> Either UnicodeException Text
forall a b. (a -> b) -> a -> b
$ do
                   let t :: Text
t = ByteString -> Text
decodeUtf8 ByteString
bs
                   IO Text -> IO (Either UnicodeException Text)
forall e a. Exception e => IO a -> IO (Either e a)
try (Text -> IO Text
forall a. a -> IO a
evaluate (Text -> ()
rnf Text
t () -> Text -> Text
`seq` Text
t))
  where
    rnf :: Text -> ()
rnf Text
Empty        = ()
    rnf (Chunk Text
_ Text
ts) = Text -> ()
rnf Text
ts
{-# INLINE decodeUtf8' #-}

-- | Encode text using UTF-8 encoding.
encodeUtf8 :: Text -> B.ByteString
encodeUtf8 :: Text -> ByteString
encodeUtf8    Text
Empty       = ByteString
B.empty
encodeUtf8 lt :: Text
lt@(Chunk Text
t Text
_) =
    AllocationStrategy -> ByteString -> Builder -> ByteString
B.toLazyByteStringWith AllocationStrategy
strategy ByteString
B.empty (Builder -> ByteString) -> Builder -> ByteString
forall a b. (a -> b) -> a -> b
$ Text -> Builder
encodeUtf8Builder Text
lt
  where
    -- To improve our small string performance, we use a strategy that
    -- allocates a buffer that is guaranteed to be large enough for the
    -- encoding of the first chunk, but not larger than the default
    -- B.smallChunkSize. We clamp the firstChunkSize to ensure that we don't
    -- generate too large buffers which hamper streaming.
    firstChunkSize :: Int
firstChunkSize  = Int -> Int -> Int
forall a. Ord a => a -> a -> a
min Int
B.smallChunkSize (Int
4 Int -> Int -> Int
forall a. Num a => a -> a -> a
* (Text -> Int
T.length Text
t Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
1))
    strategy :: AllocationStrategy
strategy        = Int -> Int -> AllocationStrategy
B.safeStrategy Int
firstChunkSize Int
B.defaultChunkSize

-- | Encode text to a ByteString 'B.Builder' using UTF-8 encoding.
--
-- @since 1.1.0.0
encodeUtf8Builder :: Text -> B.Builder
encodeUtf8Builder :: Text -> Builder
encodeUtf8Builder =
    (Text -> Builder -> Builder) -> Builder -> Text -> Builder
forall a. (Text -> a -> a) -> a -> Text -> a
foldrChunks (\Text
c Builder
b -> Text -> Builder
TE.encodeUtf8Builder Text
c Builder -> Builder -> Builder
forall a. Monoid a => a -> a -> a
`mappend` Builder
b) Builder
forall a. Monoid a => a
Data.Monoid.mempty

-- | Encode text using UTF-8 encoding and escape the ASCII characters using
-- a 'BP.BoundedPrim'.
--
-- Use this function is to implement efficient encoders for text-based formats
-- like JSON or HTML.
--
-- @since 1.1.0.0
{-# INLINE encodeUtf8BuilderEscaped #-}
encodeUtf8BuilderEscaped :: BP.BoundedPrim Word8 -> Text -> B.Builder
encodeUtf8BuilderEscaped :: BoundedPrim Word8 -> Text -> Builder
encodeUtf8BuilderEscaped BoundedPrim Word8
prim =
    (Text -> Builder -> Builder) -> Builder -> Text -> Builder
forall a. (Text -> a -> a) -> a -> Text -> a
foldrChunks (\Text
c Builder
b -> BoundedPrim Word8 -> Text -> Builder
TE.encodeUtf8BuilderEscaped BoundedPrim Word8
prim Text
c Builder -> Builder -> Builder
forall a. Monoid a => a -> a -> a
`mappend` Builder
b) Builder
forall a. Monoid a => a
mempty

-- | Decode text from little endian UTF-16 encoding.
decodeUtf16LEWith :: OnDecodeError -> B.ByteString -> Text
decodeUtf16LEWith :: OnDecodeError -> ByteString -> Text
decodeUtf16LEWith OnDecodeError
onErr ByteString
bs = Stream Char -> Text
F.unstream (OnDecodeError -> ByteString -> Stream Char
E.streamUtf16LE OnDecodeError
onErr ByteString
bs)
{-# INLINE decodeUtf16LEWith #-}

-- | Decode text from little endian UTF-16 encoding.
--
-- If the input contains any invalid little endian UTF-16 data, an
-- exception will be thrown.  For more control over the handling of
-- invalid data, use 'decodeUtf16LEWith'.
decodeUtf16LE :: B.ByteString -> Text
decodeUtf16LE :: ByteString -> Text
decodeUtf16LE = OnDecodeError -> ByteString -> Text
decodeUtf16LEWith OnDecodeError
strictDecode
{-# INLINE decodeUtf16LE #-}

-- | Decode text from big endian UTF-16 encoding.
decodeUtf16BEWith :: OnDecodeError -> B.ByteString -> Text
decodeUtf16BEWith :: OnDecodeError -> ByteString -> Text
decodeUtf16BEWith OnDecodeError
onErr ByteString
bs = Stream Char -> Text
F.unstream (OnDecodeError -> ByteString -> Stream Char
E.streamUtf16BE OnDecodeError
onErr ByteString
bs)
{-# INLINE decodeUtf16BEWith #-}

-- | Decode text from big endian UTF-16 encoding.
--
-- If the input contains any invalid big endian UTF-16 data, an
-- exception will be thrown.  For more control over the handling of
-- invalid data, use 'decodeUtf16BEWith'.
decodeUtf16BE :: B.ByteString -> Text
decodeUtf16BE :: ByteString -> Text
decodeUtf16BE = OnDecodeError -> ByteString -> Text
decodeUtf16BEWith OnDecodeError
strictDecode
{-# INLINE decodeUtf16BE #-}

-- | Encode text using little endian UTF-16 encoding.
encodeUtf16LE :: Text -> B.ByteString
encodeUtf16LE :: Text -> ByteString
encodeUtf16LE Text
txt = [ByteString] -> ByteString
B.fromChunks ((Text -> [ByteString] -> [ByteString])
-> [ByteString] -> Text -> [ByteString]
forall a. (Text -> a -> a) -> a -> Text -> a
foldrChunks ((:) (ByteString -> [ByteString] -> [ByteString])
-> (Text -> ByteString) -> Text -> [ByteString] -> [ByteString]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Text -> ByteString
TE.encodeUtf16LE) [] Text
txt)
{-# INLINE encodeUtf16LE #-}

-- | Encode text using big endian UTF-16 encoding.
encodeUtf16BE :: Text -> B.ByteString
encodeUtf16BE :: Text -> ByteString
encodeUtf16BE Text
txt = [ByteString] -> ByteString
B.fromChunks ((Text -> [ByteString] -> [ByteString])
-> [ByteString] -> Text -> [ByteString]
forall a. (Text -> a -> a) -> a -> Text -> a
foldrChunks ((:) (ByteString -> [ByteString] -> [ByteString])
-> (Text -> ByteString) -> Text -> [ByteString] -> [ByteString]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Text -> ByteString
TE.encodeUtf16BE) [] Text
txt)
{-# INLINE encodeUtf16BE #-}

-- | Decode text from little endian UTF-32 encoding.
decodeUtf32LEWith :: OnDecodeError -> B.ByteString -> Text
decodeUtf32LEWith :: OnDecodeError -> ByteString -> Text
decodeUtf32LEWith OnDecodeError
onErr ByteString
bs = Stream Char -> Text
F.unstream (OnDecodeError -> ByteString -> Stream Char
E.streamUtf32LE OnDecodeError
onErr ByteString
bs)
{-# INLINE decodeUtf32LEWith #-}

-- | Decode text from little endian UTF-32 encoding.
--
-- If the input contains any invalid little endian UTF-32 data, an
-- exception will be thrown.  For more control over the handling of
-- invalid data, use 'decodeUtf32LEWith'.
decodeUtf32LE :: B.ByteString -> Text
decodeUtf32LE :: ByteString -> Text
decodeUtf32LE = OnDecodeError -> ByteString -> Text
decodeUtf32LEWith OnDecodeError
strictDecode
{-# INLINE decodeUtf32LE #-}

-- | Decode text from big endian UTF-32 encoding.
decodeUtf32BEWith :: OnDecodeError -> B.ByteString -> Text
decodeUtf32BEWith :: OnDecodeError -> ByteString -> Text
decodeUtf32BEWith OnDecodeError
onErr ByteString
bs = Stream Char -> Text
F.unstream (OnDecodeError -> ByteString -> Stream Char
E.streamUtf32BE OnDecodeError
onErr ByteString
bs)
{-# INLINE decodeUtf32BEWith #-}

-- | Decode text from big endian UTF-32 encoding.
--
-- If the input contains any invalid big endian UTF-32 data, an
-- exception will be thrown.  For more control over the handling of
-- invalid data, use 'decodeUtf32BEWith'.
decodeUtf32BE :: B.ByteString -> Text
decodeUtf32BE :: ByteString -> Text
decodeUtf32BE = OnDecodeError -> ByteString -> Text
decodeUtf32BEWith OnDecodeError
strictDecode
{-# INLINE decodeUtf32BE #-}

-- | Encode text using little endian UTF-32 encoding.
encodeUtf32LE :: Text -> B.ByteString
encodeUtf32LE :: Text -> ByteString
encodeUtf32LE Text
txt = [ByteString] -> ByteString
B.fromChunks ((Text -> [ByteString] -> [ByteString])
-> [ByteString] -> Text -> [ByteString]
forall a. (Text -> a -> a) -> a -> Text -> a
foldrChunks ((:) (ByteString -> [ByteString] -> [ByteString])
-> (Text -> ByteString) -> Text -> [ByteString] -> [ByteString]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Text -> ByteString
TE.encodeUtf32LE) [] Text
txt)
{-# INLINE encodeUtf32LE #-}

-- | Encode text using big endian UTF-32 encoding.
encodeUtf32BE :: Text -> B.ByteString
encodeUtf32BE :: Text -> ByteString
encodeUtf32BE Text
txt = [ByteString] -> ByteString
B.fromChunks ((Text -> [ByteString] -> [ByteString])
-> [ByteString] -> Text -> [ByteString]
forall a. (Text -> a -> a) -> a -> Text -> a
foldrChunks ((:) (ByteString -> [ByteString] -> [ByteString])
-> (Text -> ByteString) -> Text -> [ByteString] -> [ByteString]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Text -> ByteString
TE.encodeUtf32BE) [] Text
txt)
{-# INLINE encodeUtf32BE #-}