utils/FastString.lhs

% % (c) The University of Glasgow, 1997-2006 % \begin{code}
{-# LANGUAGE BangPatterns #-}
{-# OPTIONS_GHC -O -funbox-strict-fields #-}
-- We always optimise this, otherwise performance of a non-optimised
-- compiler is severely affected

-- |
-- There are two principal string types used internally by GHC:
--
-- 'FastString':
--               * A compact, hash-consed, representation of character strings.
--               * Comparison is O(1), and you can get a 'Unique.Unique' from them.
--               * Generated by 'fsLit'.
--               * Turn into 'Outputable.SDoc' with 'Outputable.ftext'.
--
-- 'LitString':
--               * Just a wrapper for the @Addr#@ of a C string (@Ptr CChar@).
--               * Practically no operations.
--               * Outputing them is fast.
--               * Generated by 'sLit'.
--               * Turn into 'Outputable.SDoc' with 'Outputable.ptext'
--
-- Use 'LitString' unless you want the facilities of 'FastString'.
module FastString
       (
        -- * FastStrings
        FastString(..),     -- not abstract, for now.

        -- ** Construction
        fsLit,
        mkFastString,
        mkFastStringBytes,
        mkFastStringByteList,
        mkFastStringForeignPtr,
#if defined(__GLASGOW_HASKELL__)
        mkFastString#,
#endif
        mkZFastString,
        mkZFastStringBytes,

        -- ** Deconstruction
        unpackFS,           -- :: FastString -> String
        bytesFS,            -- :: FastString -> [Word8]

        -- ** Encoding
        isZEncoded,
        zEncodeFS,

        -- ** Operations
        uniqueOfFS,
        lengthFS,
        nullFS,
        appendFS,
        headFS,
        tailFS,
        concatFS,
        consFS,
        nilFS,

        -- ** Outputing
        hPutFS,

        -- ** Internal
        getFastStringTable,
        hasZEncoding,

        -- * LitStrings
        LitString,
        
        -- ** Construction
        sLit,
#if defined(__GLASGOW_HASKELL__)
        mkLitString#,
#endif
        mkLitString,
        
        -- ** Deconstruction
        unpackLitString,
        
        -- ** Operations
        lengthLS
       ) where

#include "HsVersions.h"

import Encoding
import FastTypes
import FastFunctions
import Panic
import Util

import Foreign.C
import GHC.Exts
import System.IO
import System.IO.Unsafe ( unsafePerformIO )
import Data.Data
import Data.IORef       ( IORef, newIORef, readIORef, writeIORef )
import Data.Maybe       ( isJust )
import Data.Char        ( ord )

import GHC.IO           ( IO(..) )

#if __GLASGOW_HASKELL__ >= 701
import Foreign.Safe
#else
import Foreign hiding ( unsafePerformIO )
#endif

#if defined(__GLASGOW_HASKELL__)
import GHC.Base         ( unpackCString# )
#endif

#define hASH_TBL_SIZE          4091
#define hASH_TBL_SIZE_UNBOXED  4091#


{-|
A 'FastString' is an array of bytes, hashed to support fast O(1)
comparison.  It is also associated with a character encoding, so that
we know how to convert a 'FastString' to the local encoding, or to the
Z-encoding used by the compiler internally.

'FastString's support a memoized conversion to the Z-encoding via zEncodeFS.
-}

data FastString = FastString {
      uniq    :: {-# UNPACK #-} !Int, -- unique id
      n_bytes :: {-# UNPACK #-} !Int, -- number of bytes
      n_chars :: {-# UNPACK #-} !Int, -- number of chars
      buf     :: {-# UNPACK #-} !(ForeignPtr Word8),
      enc     :: FSEncoding
  } deriving Typeable

data FSEncoding
    -- including strings that don't need any encoding
  = ZEncoded
    -- A UTF-8 string with a memoized Z-encoding
  | UTF8Encoded {-# UNPACK #-} !(IORef (Maybe FastString))

instance Eq FastString where
  f1 == f2  =  uniq f1 == uniq f2

instance Ord FastString where
    -- Compares lexicographically, not by unique
    a <= b = case cmpFS a b of { LT -> True;  EQ -> True;  GT -> False }
    a <  b = case cmpFS a b of { LT -> True;  EQ -> False; GT -> False }
    a >= b = case cmpFS a b of { LT -> False; EQ -> True;  GT -> True  }
    a >  b = case cmpFS a b of { LT -> False; EQ -> False; GT -> True  }
    max x y | x >= y    =  x
            | otherwise =  y
    min x y | x <= y    =  x
            | otherwise =  y
    compare a b = cmpFS a b

instance Show FastString where
   show fs = show (unpackFS fs)

instance Data FastString where
  -- don't traverse?
  toConstr _   = abstractConstr "FastString"
  gunfold _ _  = error "gunfold"
  dataTypeOf _ = mkNoRepType "FastString"

cmpFS :: FastString -> FastString -> Ordering
cmpFS (FastString u1 l1 _ buf1 _) (FastString u2 l2 _ buf2 _) =
  if u1 == u2 then EQ else
  case unsafeMemcmp buf1 buf2 (min l1 l2) `compare` 0 of
     LT -> LT
     EQ -> compare l1 l2
     GT -> GT

unsafeMemcmp :: ForeignPtr a -> ForeignPtr b -> Int -> Int
unsafeMemcmp buf1 buf2 l =
      inlinePerformIO $
        withForeignPtr buf1 $ \p1 ->
        withForeignPtr buf2 $ \p2 ->
          memcmp p1 p2 l

#ifndef __HADDOCK__
foreign import ccall unsafe "ghc_memcmp"
  memcmp :: Ptr a -> Ptr b -> Int -> IO Int
#endif

-- -----------------------------------------------------------------------------
-- Construction

{-
Internally, the compiler will maintain a fast string symbol
table, providing sharing and fast comparison. Creation of
new @FastString@s then covertly does a lookup, re-using the
@FastString@ if there was a hit.
-}

data FastStringTable =
 FastStringTable
    {-# UNPACK #-} !Int
    (MutableArray# RealWorld [FastString])

{-# NOINLINE string_table #-}
string_table :: IORef FastStringTable
string_table =
 unsafePerformIO $ do
   tab <- IO $ \s1# -> case newArray# hASH_TBL_SIZE_UNBOXED [] s1# of
                           (# s2#, arr# #) ->
                               (# s2#, FastStringTable 0 arr# #)
   newIORef tab

lookupTbl :: FastStringTable -> Int -> IO [FastString]
lookupTbl (FastStringTable _ arr#) (I# i#) =
  IO $ \ s# -> readArray# arr# i# s#

updTbl :: IORef FastStringTable -> FastStringTable -> Int -> [FastString] -> IO ()
updTbl fs_table_var (FastStringTable uid arr#) (I# i#) ls = do
  (IO $ \ s# -> case writeArray# arr# i# ls s# of { s2# -> (# s2#, () #) })
  writeIORef fs_table_var (FastStringTable (uid+1) arr#)

mkFastString# :: Addr# -> FastString
mkFastString# a# = mkFastStringBytes ptr (ptrStrLength ptr)
  where ptr = Ptr a#

mkFastStringBytes :: Ptr Word8 -> Int -> FastString
mkFastStringBytes ptr len = unsafePerformIO $ do
  ft@(FastStringTable uid _) <- readIORef string_table
  let
   h = hashStr ptr len
   add_it ls = do
        fs <- copyNewFastString uid ptr len
        updTbl string_table ft h (fs:ls)
        {- _trace ("new: " ++ show f_str)   $ -}
        return fs
  --
  lookup_result <- lookupTbl ft h
  case lookup_result of
    [] -> add_it []
    ls -> do
       b <- bucket_match ls len ptr
       case b of
         Nothing -> add_it ls
         Just v  -> {- _trace ("re-use: "++show v) $ -} return v

mkZFastStringBytes :: Ptr Word8 -> Int -> FastString
mkZFastStringBytes ptr len = unsafePerformIO $ do
  ft@(FastStringTable uid _) <- readIORef string_table
  let
   h = hashStr ptr len
   add_it ls = do
        fs <- copyNewZFastString uid ptr len
        updTbl string_table ft h (fs:ls)
        {- _trace ("new: " ++ show f_str)   $ -}
        return fs
  --
  lookup_result <- lookupTbl ft h
  case lookup_result of
    [] -> add_it []
    ls -> do
       b <- bucket_match ls len ptr
       case b of
         Nothing -> add_it ls
         Just v  -> {- _trace ("re-use: "++show v) $ -} return v

-- | Create a 'FastString' from an existing 'ForeignPtr'; the difference
-- between this and 'mkFastStringBytes' is that we don't have to copy
-- the bytes if the string is new to the table.
mkFastStringForeignPtr :: Ptr Word8 -> ForeignPtr Word8 -> Int -> IO FastString
mkFastStringForeignPtr ptr fp len = do
  ft@(FastStringTable uid _) <- readIORef string_table
--  _trace ("hashed: "++show (I# h)) $
  let
    h = hashStr ptr len
    add_it ls = do
        fs <- mkNewFastString uid ptr fp len
        updTbl string_table ft h (fs:ls)
        {- _trace ("new: " ++ show f_str)   $ -}
        return fs
  --
  lookup_result <- lookupTbl ft h
  case lookup_result of
    [] -> add_it []
    ls -> do
       b <- bucket_match ls len ptr
       case b of
         Nothing -> add_it ls
         Just v  -> {- _trace ("re-use: "++show v) $ -} return v

mkZFastStringForeignPtr :: Ptr Word8 -> ForeignPtr Word8 -> Int -> IO FastString
mkZFastStringForeignPtr ptr fp len = do
  ft@(FastStringTable uid _) <- readIORef string_table
--  _trace ("hashed: "++show (I# h)) $
  let
    h = hashStr ptr len
    add_it ls = do
        fs <- mkNewZFastString uid ptr fp len
        updTbl string_table ft h (fs:ls)
        {- _trace ("new: " ++ show f_str)   $ -}
        return fs
  --
  lookup_result <- lookupTbl ft h
  case lookup_result of
    [] -> add_it []
    ls -> do
       b <- bucket_match ls len ptr
       case b of
         Nothing -> add_it ls
         Just v  -> {- _trace ("re-use: "++show v) $ -} return v


-- | Creates a UTF-8 encoded 'FastString' from a 'String'
mkFastString :: String -> FastString
mkFastString str =
  inlinePerformIO $ do
    let l = utf8EncodedLength str
    buf <- mallocForeignPtrBytes l
    withForeignPtr buf $ \ptr -> do
      utf8EncodeString ptr str
      mkFastStringForeignPtr ptr buf l

-- | Creates a 'FastString' from a UTF-8 encoded @[Word8]@
mkFastStringByteList :: [Word8] -> FastString
mkFastStringByteList str =
  inlinePerformIO $ do
    let l = Prelude.length str
    buf <- mallocForeignPtrBytes l
    withForeignPtr buf $ \ptr -> do
      pokeArray (castPtr ptr) str
      mkFastStringForeignPtr ptr buf l

-- | Creates a Z-encoded 'FastString' from a 'String'
mkZFastString :: String -> FastString
mkZFastString str =
  inlinePerformIO $ do
    let l = Prelude.length str
    buf <- mallocForeignPtrBytes l
    withForeignPtr buf $ \ptr -> do
      pokeCAString (castPtr ptr) str
      mkZFastStringForeignPtr ptr buf l

bucket_match :: [FastString] -> Int -> Ptr Word8 -> IO (Maybe FastString)
bucket_match [] _ _ = return Nothing
bucket_match (v@(FastString _ l _ buf _):ls) len ptr
      | len == l  =  do
         b <- cmpStringPrefix ptr buf len
         if b then return (Just v)
              else bucket_match ls len ptr
      | otherwise =
         bucket_match ls len ptr

mkNewFastString :: Int -> Ptr Word8 -> ForeignPtr Word8 -> Int
                -> IO FastString
mkNewFastString uid ptr fp len = do
  ref <- newIORef Nothing
  n_chars <- countUTF8Chars ptr len
  return (FastString uid len n_chars fp (UTF8Encoded ref))

mkNewZFastString :: Int -> Ptr Word8 -> ForeignPtr Word8 -> Int
                 -> IO FastString
mkNewZFastString uid _ fp len = do
  return (FastString uid len len fp ZEncoded)

copyNewFastString :: Int -> Ptr Word8 -> Int -> IO FastString
copyNewFastString uid ptr len = do
  fp <- copyBytesToForeignPtr ptr len
  ref <- newIORef Nothing
  n_chars <- countUTF8Chars ptr len
  return (FastString uid len n_chars fp (UTF8Encoded ref))

copyNewZFastString :: Int -> Ptr Word8 -> Int -> IO FastString
copyNewZFastString uid ptr len = do
  fp <- copyBytesToForeignPtr ptr len
  return (FastString uid len len fp ZEncoded)

copyBytesToForeignPtr :: Ptr Word8 -> Int -> IO (ForeignPtr Word8)
copyBytesToForeignPtr ptr len = do
  fp <- mallocForeignPtrBytes len
  withForeignPtr fp $ \ptr' -> copyBytes ptr' ptr len
  return fp

cmpStringPrefix :: Ptr Word8 -> ForeignPtr Word8 -> Int -> IO Bool
cmpStringPrefix ptr fp len =
  withForeignPtr fp $ \ptr' -> do
    r <- memcmp ptr ptr' len
    return (r == 0)


hashStr  :: Ptr Word8 -> Int -> Int
 -- use the Addr to produce a hash value between 0 & m (inclusive)
hashStr (Ptr a#) (I# len#) = loop 0# 0#
   where
    loop h n | n GHC.Exts.==# len# = I# h
             | otherwise  = loop h2 (n GHC.Exts.+# 1#)
          where !c = ord# (indexCharOffAddr# a# n)
                !h2 = (c GHC.Exts.+# (h GHC.Exts.*# 128#)) `remInt#`
                      hASH_TBL_SIZE#

-- -----------------------------------------------------------------------------
-- Operations

-- | Returns the length of the 'FastString' in characters
lengthFS :: FastString -> Int
lengthFS f = n_chars f

-- | Returns @True@ if the 'FastString' is Z-encoded
isZEncoded :: FastString -> Bool
isZEncoded fs | ZEncoded <- enc fs = True
              | otherwise          = False

-- | Returns @True@ if this 'FastString' is not Z-encoded but already has
-- a Z-encoding cached (used in producing stats).
hasZEncoding :: FastString -> Bool
hasZEncoding (FastString _ _ _ _ enc) =
  case enc of
    ZEncoded -> False
    UTF8Encoded ref ->
      inlinePerformIO $ do
        m <- readIORef ref
        return (isJust m)

-- | Returns @True@ if the 'FastString' is empty
nullFS :: FastString -> Bool
nullFS f  =  n_bytes f == 0

-- | Unpacks and decodes the FastString
unpackFS :: FastString -> String
unpackFS (FastString _ n_bytes _ buf enc) =
  inlinePerformIO $ withForeignPtr buf $ \ptr ->
    case enc of
        ZEncoded      -> peekCAStringLen (castPtr ptr,n_bytes)
        UTF8Encoded _ -> utf8DecodeString ptr n_bytes

-- | Gives the UTF-8 encoded bytes corresponding to a 'FastString'
bytesFS :: FastString -> [Word8]
bytesFS (FastString _ n_bytes _ buf _) =
  inlinePerformIO $ withForeignPtr buf $ \ptr ->
    peekArray n_bytes ptr

-- | Returns a Z-encoded version of a 'FastString'.  This might be the
-- original, if it was already Z-encoded.  The first time this
-- function is applied to a particular 'FastString', the results are
-- memoized.
--
zEncodeFS :: FastString -> FastString
zEncodeFS fs@(FastString _ _ _ _ enc) =
  case enc of
    ZEncoded -> fs
    UTF8Encoded ref ->
      inlinePerformIO $ do
        m <- readIORef ref
        case m of
          Just fs -> return fs
          Nothing -> do
            let efs = mkZFastString (zEncodeString (unpackFS fs))
            writeIORef ref (Just efs)
            return efs

appendFS :: FastString -> FastString -> FastString
appendFS fs1 fs2 =
  inlinePerformIO $ do
    r <- mallocForeignPtrBytes len
    withForeignPtr r $ \ r' -> do
    withForeignPtr (buf fs1) $ \ fs1Ptr -> do
    withForeignPtr (buf fs2) $ \ fs2Ptr -> do
        copyBytes r' fs1Ptr len1
        copyBytes (advancePtr r' len1) fs2Ptr len2
        mkFastStringForeignPtr r' r len
  where len  = len1 + len2
        len1 = lengthFS fs1
        len2 = lengthFS fs2

concatFS :: [FastString] -> FastString
concatFS ls = mkFastString (Prelude.concat (map unpackFS ls)) -- ToDo: do better

headFS :: FastString -> Char
headFS (FastString _ 0 _ _ _) = panic "headFS: Empty FastString"
headFS (FastString _ _ _ buf enc) =
  inlinePerformIO $ withForeignPtr buf $ \ptr -> do
    case enc of
      ZEncoded -> do
         w <- peek (castPtr ptr)
         return (castCCharToChar w)
      UTF8Encoded _ ->
         return (fst (utf8DecodeChar ptr))

tailFS :: FastString -> FastString
tailFS (FastString _ 0 _ _ _) = panic "tailFS: Empty FastString"
tailFS (FastString _ n_bytes _ buf enc) =
  inlinePerformIO $ withForeignPtr buf $ \ptr -> do
    case enc of
      ZEncoded -> do
        return $! mkZFastStringBytes (ptr `plusPtr` 1) (n_bytes - 1)
      UTF8Encoded _ -> do
         let (_,ptr') = utf8DecodeChar ptr
         let off = ptr' `minusPtr` ptr
         return $! mkFastStringBytes (ptr `plusPtr` off) (n_bytes - off)

consFS :: Char -> FastString -> FastString
consFS c fs = mkFastString (c : unpackFS fs)

uniqueOfFS :: FastString -> FastInt
uniqueOfFS (FastString u _ _ _ _) = iUnbox u

nilFS :: FastString
nilFS = mkFastString ""

-- -----------------------------------------------------------------------------
-- Stats

getFastStringTable :: IO [[FastString]]
getFastStringTable = do
  tbl <- readIORef string_table
  buckets <- mapM (lookupTbl tbl) [0 .. hASH_TBL_SIZE]
  return buckets

-- -----------------------------------------------------------------------------
-- Outputting 'FastString's

-- |Outputs a 'FastString' with /no decoding at all/, that is, you
-- get the actual bytes in the 'FastString' written to the 'Handle'.
hPutFS :: Handle -> FastString -> IO ()
hPutFS handle (FastString _ len _ fp _)
  | len == 0  = return ()
  | otherwise = do withForeignPtr fp $ \ptr -> hPutBuf handle ptr len

-- ToDo: we'll probably want an hPutFSLocal, or something, to output
-- in the current locale's encoding (for error messages and suchlike).

-- -----------------------------------------------------------------------------
-- LitStrings, here for convenience only.

-- hmm, not unboxed (or rather FastPtr), interesting
--a.k.a. Ptr CChar, Ptr Word8, Ptr (), hmph.  We don't
--really care about C types in naming, where we can help it.
type LitString = Ptr Word8
--Why do we recalculate length every time it's requested?
--If it's commonly needed, we should perhaps have
--data LitString = LitString {-#UNPACK#-}!(FastPtr Word8) {-#UNPACK#-}!FastInt

#if defined(__GLASGOW_HASKELL__)
mkLitString# :: Addr# -> LitString
mkLitString# a# = Ptr a#
#endif
--can/should we use FastTypes here?
--Is this likely to be memory-preserving if only used on constant strings?
--should we inline it? If lucky, that would make a CAF that wouldn't
--be computationally repeated... although admittedly we're not
--really intending to use mkLitString when __GLASGOW_HASKELL__...
--(I wonder, is unicode / multi-byte characters allowed in LitStrings
-- at all?)
{-# INLINE mkLitString #-}
mkLitString :: String -> LitString
mkLitString s =
 unsafePerformIO (do
   p <- mallocBytes (length s + 1)
   let
     loop :: Int -> String -> IO ()
     loop n cs | n `seq` null cs = pokeByteOff p n (0 :: Word8)
     loop n (c:cs) = do
        pokeByteOff p n (fromIntegral (ord c) :: Word8)
        loop (1+n) cs
     -- XXX GHC isn't smart enough to know that we have already covered
     -- this case.
     loop _ [] = panic "mkLitString"
   loop 0 s
   return p
 )

unpackLitString :: LitString -> String
unpackLitString p_ = case pUnbox p_ of
 p -> unpack (_ILIT(0))
  where
    unpack n = case indexWord8OffFastPtrAsFastChar p n of
      ch -> if ch `eqFastChar` _CLIT('\0')
            then [] else cBox ch : unpack (n +# _ILIT(1))

lengthLS :: LitString -> Int
lengthLS = ptrStrLength

-- for now, use a simple String representation
--no, let's not do that right now - it's work in other places
#if 0
type LitString = String

mkLitString :: String -> LitString
mkLitString = id

unpackLitString :: LitString -> String
unpackLitString = id

lengthLS :: LitString -> Int
lengthLS = length

#endif

-- -----------------------------------------------------------------------------
-- under the carpet

foreign import ccall unsafe "ghc_strlen"
  ptrStrLength :: Ptr Word8 -> Int

-- NB. does *not* add a '\0'-terminator.
-- We only use CChar here to be parallel to the imported
-- peekC(A)StringLen.
pokeCAString :: Ptr CChar -> String -> IO ()
pokeCAString ptr str =
  let
        go [] _     = return ()
        go (c:cs) n = do pokeElemOff ptr n (castCharToCChar c); go cs (n+1)
  in
  go str 0

{-# NOINLINE sLit #-}
sLit :: String -> LitString
sLit x  = mkLitString x

{-# NOINLINE fsLit #-}
fsLit :: String -> FastString
fsLit x = mkFastString x

{-# RULES "slit"
    forall x . sLit  (unpackCString# x) = mkLitString#  x #-}
{-# RULES "fslit"
    forall x . fsLit (unpackCString# x) = mkFastString# x #-}
\end{code}