{-# LANGUAGE CPP, DeriveDataTypeable, UnboxedTuples #-}
{-# OPTIONS_HADDOCK not-home #-}

-- |
-- Module      : Data.Text.Internal
-- Copyright   : (c) 2008, 2009 Tom Harper,
--               (c) 2009, 2010 Bryan O'Sullivan,
--               (c) 2009 Duncan Coutts
--
-- License     : BSD-style
-- Maintainer  : bos@serpentine.com
-- Stability   : experimental
-- Portability : GHC
--
-- A module containing private 'Text' internals. This exposes the
-- 'Text' representation and low level construction functions.
-- Modules which extend the 'Text' system may need to use this module.
--
-- You should not use this module unless you are determined to monkey
-- with the internals, as the functions here do just about nothing to
-- preserve data invariants.  You have been warned!

module Data.Text.Internal
    (
    -- * Types
    -- $internals
      Text(..)
    -- * Construction
    , text
    , textP
    -- * Safety
    , safe
    -- * Code that must be here for accessibility
    , empty
    , empty_
    -- * Utilities
    , firstf
    -- * Checked multiplication
    , mul
    , mul32
    , mul64
    -- * Debugging
    , showText
    ) where

#if defined(ASSERTS)
import Control.Exception (assert)
import GHC.Stack (HasCallStack)
#endif
import Data.Bits
import Data.Int (Int32, Int64)
import Data.Text.Internal.Unsafe.Char (ord)
import Data.Typeable (Typeable)
import qualified Data.Text.Array as A

-- | A space efficient, packed, unboxed Unicode text type.
--
-- A value is a slice of an array of UTF-16 code units: the offset and
-- length fields select the part of the payload that belongs to this
-- 'Text'.  Both are counted in 'Word16' code units, /not/ in 'Char's.
-- All three fields are strict and unpacked into the constructor.
data Text = Text
    {-# UNPACK #-} !A.Array          -- payload (Word16 elements)
    {-# UNPACK #-} !Int              -- offset (units of Word16, not Char)
    {-# UNPACK #-} !Int              -- length (units of Word16, not Char)
    deriving (Typeable)

-- | Smart constructor.
--
-- Wraps an array, an offset and a length in a 'Text'.  When the library
-- is built with @ASSERTS@ defined, it additionally asserts that the
-- length and the offset are non-negative and that a non-empty slice
-- does not begin with a trailing (low) surrogate code unit
-- (0xDC00-0xDFFF).  Without @ASSERTS@ no validation is performed.
text_ ::
#if defined(ASSERTS)
  HasCallStack =>
#endif
  A.Array -> Int -> Int -> Text
#if defined(ASSERTS)
text_ buf start count =
    assert (count >= 0)
  . assert (start >= 0)
  . assert (count == 0 || leadingUnitOk)
  $ Text buf start count
  where
    -- Only demanded when count /= 0, so an empty slice never reads
    -- from the array.
    firstUnit     = A.unsafeIndex buf start
    leadingUnitOk = firstUnit < 0xDC00 || firstUnit > 0xDFFF
#else
text_ buf start count = Text buf start count
#endif
{-# INLINE text_ #-}

-- | /O(1)/ The empty 'Text': a zero-length slice, at offset zero, of
-- the empty array.
--
-- The pragma keeps this definition from being inlined before
-- simplifier phase 1.
empty :: Text
empty = Text A.empty 0 0
{-# INLINE [1] empty #-}

-- | A non-inlined version of 'empty'.
--
-- It denotes the same value (a zero-length slice of the empty array),
-- but is marked @NOINLINE@ so that uses always refer to this binding.
empty_ :: Text
empty_ = Text A.empty 0 0
{-# NOINLINE empty_ #-}

-- | Construct a 'Text' without invisibly pinning its byte array in
-- memory if its length has dwindled to zero.
--
-- A zero-length request yields 'empty' and ignores the supplied array
-- and offset; any other length is passed on to 'text_' unchanged.
text ::
#if defined(ASSERTS)
  HasCallStack =>
#endif
  A.Array -> Int -> Int -> Text
text buf start count = case count of
    0 -> empty
    _ -> text_ buf start count
{-# INLINE text #-}

-- | Deprecated synonym for 'text', kept so that existing callers keep
-- compiling.
textP :: A.Array -> Int -> Int -> Text
textP = text
{-# DEPRECATED textP "Use text instead" #-}

-- | A useful 'show'-like function for debugging purposes.
--
-- The output is the word @Text@, the shown list that 'A.toList'
-- produces for the slice, the offset and the length, separated by
-- single spaces.
showText :: Text -> String
showText (Text buf start count) =
    unwords
      [ "Text"
      , show (A.toList buf start count)
      , show start
      , show count
      ]

-- | Map a 'Char' to a 'Text'-safe value.
--
-- UTF-16 surrogate code points are not included in the set of Unicode
-- scalar values, but are unfortunately admitted as valid 'Char'
-- values by Haskell.  They cannot be represented in a 'Text'.  This
-- function remaps those code points to the Unicode replacement
-- character (U+FFFD, \'&#xfffd;\'), and leaves other code points
-- unchanged.
safe :: Char -> Char
safe c
    | isSurrogate = '\xfffd'
    | otherwise   = c
  where
    -- Every code point in U+D800..U+DFFF, and no other, has exactly
    -- the bit pattern 0xd800 under this mask, so one masked comparison
    -- covers the whole surrogate range.
    isSurrogate = (ord c .&. 0x1ff800) == 0xd800
{-# INLINE [0] safe #-}

-- | Apply a function to the first element of an optional pair.
--
-- 'Nothing' is returned unchanged; the second component of a present
-- pair is passed through untouched.
firstf :: (a -> c) -> Maybe (a,b) -> Maybe (c,b)
firstf f mpair = case mpair of
    Just (x, y) -> Just (f x, y)
    Nothing     -> Nothing

-- | Checked multiplication.  Calls 'error' if the result would
-- overflow.
--
-- The check is delegated to 'mul64' when the machine word is 64 bits
-- wide, and to 'mul32' otherwise.
mul :: Int -> Int -> Int
mul a b = case finiteBitSize (0 :: Word) of
    64 -> int64ToInt (intToInt64 a `mul64` intToInt64 b)
    _  -> int32ToInt (intToInt32 a `mul32` intToInt32 b)
{-# INLINE mul #-}
infixl 7 `mul`

-- | Checked multiplication.  Calls 'error' if the result would
-- overflow.
--
-- The sign is split off so that 'mul64_' only ever sees non-negative
-- operands.  'minBound' has no positive counterpart (negating it
-- yields 'minBound' again), so it is handled before the sign split:
-- @minBound * 0@ is @0@, @minBound * 1@ is 'minBound', and every other
-- product involving 'minBound' is reported as overflow.
--
-- The check remains conservative for other operand pairs whose exact
-- product is 'minBound' (for example @(-2^62) * 2@): those are still
-- reported as overflow.
mul64 :: Int64 -> Int64 -> Int64
mul64 a b
  | a == minBound    = timesMinBound b
  | b == minBound    = timesMinBound a
  | a >= 0 && b >= 0 =  mul64_ a b
  | a >= 0           = -mul64_ a (-b)
  | b >= 0           = -mul64_ (-a) b
  | otherwise        =  mul64_ (-a) (-b)
  where
    -- Product of 'minBound' and the given factor.  Only 0 and 1 give a
    -- representable result.
    timesMinBound 0 = 0
    timesMinBound 1 = minBound
    timesMinBound _ = error "overflow"
{-# INLINE mul64 #-}
infixl 7 `mul64`

-- Overflow-checked product of two /non-negative/ operands.  Each
-- operand is split into a high and a low 32-bit half and the partial
-- products are checked one by one.  Callers must not pass negative
-- values: the checks below are only valid for non-negative halves.
mul64_ :: Int64 -> Int64 -> Int64
mul64_ a b
    -- Two non-zero high halves contribute at least 2^64.
  | ahi > 0 && bhi > 0 = error "overflow"
    -- The cross terms are scaled by 2^32, so they must stay below 2^31.
  | top > 0x7fffffff   = error "overflow"
    -- Adding the low product wrapped around into the sign bit.
  | total < 0          = error "overflow"
  | otherwise          = total
  where
    ahi   = a `shiftR` 32
    alo   = a .&. 0xffffffff
    bhi   = b `shiftR` 32
    blo   = b .&. 0xffffffff
    top   = ahi * blo + alo * bhi
    total = (top `shiftL` 32) + alo * blo
{-# INLINE mul64_ #-}

-- | Checked multiplication.  Calls 'error' if the result would
-- overflow.
--
-- The product is computed exactly in 'Int64' and then compared with
-- the 'Int32' range before it is narrowed back.
mul32 :: Int32 -> Int32 -> Int32
mul32 a b
  | wide < lowest || wide > highest = error "overflow"
  | otherwise                       = int64ToInt32 wide
  where
    wide    = int32ToInt64 a * int32ToInt64 b
    lowest  = -0x80000000 :: Int64
    highest =  0x7fffffff :: Int64
{-# INLINE mul32 #-}
infixl 7 `mul32`

-- Convert an 'Int' to 'Int64' via 'fromIntegral', with both types
-- fixed by the signature.
intToInt64 :: Int -> Int64
intToInt64 = fromIntegral

-- Convert an 'Int64' to 'Int' via 'fromIntegral', with both types
-- fixed by the signature.
int64ToInt :: Int64 -> Int
int64ToInt = fromIntegral

-- Convert an 'Int' to 'Int32' via 'fromIntegral', with both types
-- fixed by the signature.  No range check is performed here.
intToInt32 :: Int -> Int32
intToInt32 = fromIntegral

-- Convert an 'Int32' to 'Int' via 'fromIntegral', with both types
-- fixed by the signature.
int32ToInt :: Int32 -> Int
int32ToInt = fromIntegral

-- Widen an 'Int32' to 'Int64' via 'fromIntegral', with both types
-- fixed by the signature.
int32ToInt64 :: Int32 -> Int64
int32ToInt64 = fromIntegral

-- Narrow an 'Int64' to 'Int32' via 'fromIntegral', with both types
-- fixed by the signature.  No range check is performed here; 'mul32'
-- checks the range before it calls this.
int64ToInt32 :: Int64 -> Int32
int64ToInt32 = fromIntegral

-- $internals
--
-- Internally, the 'Text' type is represented as an array of 'Word16'
-- UTF-16 code units. The offset and length fields in the constructor
-- are in these units, /not/ units of 'Char'.
--
-- Invariants that all functions must maintain:
--
-- * Since the 'Text' type uses UTF-16 internally, it cannot represent
--   characters in the reserved surrogate code point range U+D800 to
--   U+DFFF. To maintain this invariant, the 'safe' function maps
--   'Char' values in this range to the replacement character (U+FFFD,
--   \'&#xfffd;\').
--
-- * A leading (or \"high\") surrogate code unit (0xD800-0xDBFF) must
--   always be followed by a trailing (or \"low\") surrogate code unit
--   (0xDC00-0xDFFF). A trailing surrogate code unit must always be
--   preceded by a leading surrogate code unit.