{-# LANGUAGE BangPatterns, ScopedTypeVariables #-}
{-# LANGUAGE MagicHash #-}
{-# LANGUAGE UnliftedFFITypes #-}

-- |
-- Module      : Data.Text.Lazy.Search
-- Copyright   : (c) 2009, 2010 Bryan O'Sullivan
--
-- License     : BSD-style
-- Maintainer  : bos@serpentine.com
-- Stability   : experimental
-- Portability : GHC
--
-- /Warning/: this is an internal module, and does not have a stable
-- API or name. Functions in this module may not check or enforce
-- preconditions expected by public modules. Use at your own risk!
--
-- Fast substring search for lazy 'Text', based on work by Boyer,
-- Moore, Horspool, Sunday, and Lundh.  Adapted from the strict
-- implementation.

module Data.Text.Internal.Lazy.Search
    (
      indices
    ) where

import Data.Bits (unsafeShiftL)
import qualified Data.Text.Array as A
import Data.Int (Int64)
import Data.Word (Word8, Word64)
import qualified Data.Text.Internal as T
import qualified Data.Text as T (concat, isPrefixOf)
import Data.Text.Internal.Fusion.Types (PairS(..))
import Data.Text.Internal.Lazy (Text(..), foldrChunks)
import Data.Text.Unsafe (unsafeDupablePerformIO)
import Data.Bits ((.|.), (.&.))
import Foreign.C.Types
import GHC.Exts (ByteArray#)
import System.Posix.Types (CSsize(..))

-- | /O(n+m)/ Find the offsets of all non-overlapping indices of
-- @needle@ within @haystack@.
--
-- This function is strict in @needle@, and lazy (as far as possible)
-- in the chunks of @haystack@.
--
-- In (unlikely) bad cases, this algorithm's complexity degrades
-- towards /O(n*m)/.
indices :: Text              -- ^ Substring to search for (@needle@)
        -> Text              -- ^ Text to search in (@haystack@)
        -> [Int64]
indices :: Text -> Text -> [Int64]
indices Text
needle
    | Int
nlen Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
<= Int
0  = [Int64] -> Text -> [Int64]
forall a b. a -> b -> a
const []
    | Int
nlen Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== Int
1  = Word8 -> Int64 -> Text -> [Int64]
indicesOne (Array -> Int -> Word8
A.unsafeIndex Array
narr Int
noff) Int64
0
    | Bool
otherwise  = Int64 -> Int -> Text -> [Int64]
advance Int64
0 Int
0
  where
    T.Text Array
narr Int
noff Int
nlen = [Text] -> Text
T.concat ((Text -> [Text] -> [Text]) -> [Text] -> Text -> [Text]
forall a. (Text -> a -> a) -> a -> Text -> a
foldrChunks (:) [] Text
needle)

    advance :: Int64 -> Int -> Text -> [Int64]
advance !Int64
_ !Int
_ Text
Empty = []
    advance !(Int64
g :: Int64) !(Int
i :: Int) xxs :: Text
xxs@(Chunk x :: Text
x@(T.Text xarr :: Array
xarr@(A.ByteArray ByteArray#
xarr#) Int
xoff Int
l) Text
xs)
         | Int
i Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
>= Int
l = Int64 -> Int -> Text -> [Int64]
advance Int64
g (Int
i Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
l) Text
xs
         | Int -> Text -> Text -> Bool
lackingHay (Int
i Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
nlen) Text
x Text
xs  = []
         | Word8
c Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
== Word8
z Bool -> Bool -> Bool
&& Bool
candidateMatch    = Int64
g Int64 -> [Int64] -> [Int64]
forall a. a -> [a] -> [a]
: Int64 -> Int -> Text -> [Int64]
advance (Int64
g Int64 -> Int64 -> Int64
forall a. Num a => a -> a -> a
+ Int -> Int64
intToInt64 Int
nlen) (Int
i Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
nlen) Text
xxs
         | Bool
otherwise                   = Int64 -> Int -> Text -> [Int64]
advance (Int64
g Int64 -> Int64 -> Int64
forall a. Num a => a -> a -> a
+ Int -> Int64
intToInt64 Int
delta) (Int
i Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
delta) Text
xxs
       where
         c :: Word8
c = Text -> Int -> Word8
index Text
xxs (Int
i Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
nlast)
         delta :: Int
delta | Bool
nextInPattern = Int
nlen Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
1
               | Word8
c Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
== Word8
z        = Int
skip Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
1
               | Int
l Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
>= Int
i Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
nlen = case IO CSsize -> CSsize
forall a. IO a -> a
unsafeDupablePerformIO (IO CSsize -> CSsize) -> IO CSsize -> CSsize
forall a b. (a -> b) -> a -> b
$
                  ByteArray# -> CSize -> CSize -> Word8 -> IO CSsize
memchr ByteArray#
xarr# (Int -> CSize
intToCSize (Int
xoff Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
i Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
nlen)) (Int -> CSize
intToCSize (Int
l Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
i Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
nlen)) Word8
z of
                    -1 -> Int -> Int -> Int
forall a. Ord a => a -> a -> a
max Int
1 (Int
l Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
i Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
nlen)
                    CSsize
s  -> CSsize -> Int
cSsizeToInt CSsize
s Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
1
                | Bool
otherwise = Int
1
         nextInPattern :: Bool
nextInPattern         = Word64
mask Word64 -> Word64 -> Word64
forall a. Bits a => a -> a -> a
.&. Word8 -> Word64
swizzle (Text -> Int -> Word8
index Text
xxs (Int
i Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
nlen)) Word64 -> Word64 -> Bool
forall a. Eq a => a -> a -> Bool
== Word64
0

         candidateMatch :: Bool
candidateMatch
          | Int
i Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
nlen Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
<= Int
l = Array -> Int -> Array -> Int -> Int -> Bool
A.equal Array
narr Int
noff Array
xarr (Int
xoff Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
i) Int
nlen
          | Bool
otherwise     = Array -> Int -> Array -> Int -> Int -> Bool
A.equal Array
narr Int
noff Array
xarr (Int
xoff Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
i) (Int
l Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
i) Bool -> Bool -> Bool
&&
            Array -> Int -> Int -> Text
T.Text Array
narr (Int
noff Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
l Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
i) (Int
nlen Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
l Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
i) Text -> Text -> Bool
`isPrefixOf` Text
xs

    nlast :: Int
nlast     = Int
nlen Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
1
    z :: Word8
z         = Array -> Int -> Word8
A.unsafeIndex Array
narr (Int
noff Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
nlen Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
1)
    (Word64
mask :: Word64) :*: Int
skip = Int -> Int -> Word64 -> Int -> PairS Word64 Int
buildTable Int
0 Int
0 Word64
0 (Int
nlenInt -> Int -> Int
forall a. Num a => a -> a -> a
-Int
2)

    swizzle :: Word8 -> Word64
    swizzle :: Word8 -> Word64
swizzle Word8
w = Word64
1 Word64 -> Int -> Word64
forall a. Bits a => a -> Int -> a
`unsafeShiftL` (Word8 -> Int
word8ToInt Word8
w Int -> Int -> Int
forall a. Bits a => a -> a -> a
.&. Int
0x3f)

    buildTable :: Int -> Int -> Word64 -> Int -> PairS Word64 Int
buildTable !Int
g !Int
i !Word64
msk !Int
skp
            | Int
i Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
>= Int
nlast = (Word64
msk Word64 -> Word64 -> Word64
forall a. Bits a => a -> a -> a
.|. Word8 -> Word64
swizzle Word8
z) Word64 -> Int -> PairS Word64 Int
forall a b. a -> b -> PairS a b
:*: Int
skp
            | Bool
otherwise = Int -> Int -> Word64 -> Int -> PairS Word64 Int
buildTable (Int
gInt -> Int -> Int
forall a. Num a => a -> a -> a
+Int
1) (Int
iInt -> Int -> Int
forall a. Num a => a -> a -> a
+Int
1) Word64
msk' Int
skp'
            where c :: Word8
c                = Array -> Int -> Word8
A.unsafeIndex Array
narr (Int
noffInt -> Int -> Int
forall a. Num a => a -> a -> a
+Int
i)
                  msk' :: Word64
msk'             = Word64
msk Word64 -> Word64 -> Word64
forall a. Bits a => a -> a -> a
.|. Word8 -> Word64
swizzle Word8
c
                  skp' :: Int
skp' | Word8
c Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
== Word8
z    = Int
nlen Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
g Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
2
                       | Bool
otherwise = Int
skp

    -- | Check whether an attempt to index into the haystack at the
    -- given offset would fail.
    lackingHay :: Int -> T.Text -> Text -> Bool
    lackingHay :: Int -> Text -> Text -> Bool
lackingHay Int
q (T.Text Array
_ Int
_ Int
l) Text
ps = Int
l Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< Int
q Bool -> Bool -> Bool
&& case Text
ps of
      Text
Empty -> Bool
True
      Chunk Text
r Text
rs -> Int -> Text -> Text -> Bool
lackingHay (Int
q Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
l) Text
r Text
rs

-- | Fast index into a partly unpacked 'Text'.  We take into account
-- the possibility that the caller might try to access one element
-- past the end.
index :: Text -> Int -> Word8
index :: Text -> Int -> Word8
index Text
Empty !Int
_ = Word8
0
index (Chunk (T.Text Array
arr Int
off Int
len) Text
xs) !Int
i
    | Int
i Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< Int
len   = Array -> Int -> Word8
A.unsafeIndex Array
arr (Int
off Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
i)
    | Bool
otherwise = Text -> Int -> Word8
index Text
xs (Int
i Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
len)

-- | A variant of 'indices' that scans linearly for a single 'Word8'.
indicesOne :: Word8 -> Int64 -> Text -> [Int64]
indicesOne :: Word8 -> Int64 -> Text -> [Int64]
indicesOne Word8
c = Int64 -> Text -> [Int64]
chunk
  where
    chunk :: Int64 -> Text -> [Int64]
    chunk :: Int64 -> Text -> [Int64]
chunk !Int64
_ Text
Empty = []
    chunk !Int64
i (Chunk (T.Text Array
oarr Int
ooff Int
olen) Text
os) = Int -> [Int64]
go Int
0
      where
        go :: Int -> [Int64]
go Int
h | Int
h Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
>= Int
olen = Int64 -> Text -> [Int64]
chunk (Int64
iInt64 -> Int64 -> Int64
forall a. Num a => a -> a -> a
+Int -> Int64
intToInt64 Int
olen) Text
os
             | Word8
on Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
== Word8
c = Int64
i Int64 -> Int64 -> Int64
forall a. Num a => a -> a -> a
+ Int -> Int64
intToInt64 Int
h Int64 -> [Int64] -> [Int64]
forall a. a -> [a] -> [a]
: Int -> [Int64]
go (Int
hInt -> Int -> Int
forall a. Num a => a -> a -> a
+Int
1)
             | Bool
otherwise = Int -> [Int64]
go (Int
hInt -> Int -> Int
forall a. Num a => a -> a -> a
+Int
1)
             where on :: Word8
on = Array -> Int -> Word8
A.unsafeIndex Array
oarr (Int
ooffInt -> Int -> Int
forall a. Num a => a -> a -> a
+Int
h)

-- | First argument is a strict Text, and second is a lazy one.
isPrefixOf :: T.Text -> Text -> Bool
isPrefixOf :: Text -> Text -> Bool
isPrefixOf (T.Text Array
_ Int
_ Int
xlen) Text
Empty = Int
xlen Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== Int
0
isPrefixOf x :: Text
x@(T.Text Array
xarr Int
xoff Int
xlen) (Chunk y :: Text
y@(T.Text Array
_ Int
_ Int
ylen) Text
ys)
  | Int
xlen Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
<= Int
ylen = Text
x Text -> Text -> Bool
`T.isPrefixOf` Text
y
  | Bool
otherwise = Text
y Text -> Text -> Bool
`T.isPrefixOf` Text
x Bool -> Bool -> Bool
&& Array -> Int -> Int -> Text
T.Text Array
xarr (Int
xoff Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
ylen) (Int
xlen Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
ylen) Text -> Text -> Bool
`isPrefixOf` Text
ys

intToInt64 :: Int -> Int64
intToInt64 :: Int -> Int64
intToInt64 = Int -> Int64
forall a b. (Integral a, Num b) => a -> b
fromIntegral

word8ToInt :: Word8 -> Int
word8ToInt :: Word8 -> Int
word8ToInt = Word8 -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral

intToCSize :: Int -> CSize
intToCSize :: Int -> CSize
intToCSize = Int -> CSize
forall a b. (Integral a, Num b) => a -> b
fromIntegral

cSsizeToInt :: CSsize -> Int
cSsizeToInt :: CSsize -> Int
cSsizeToInt = CSsize -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral

foreign import ccall unsafe "_hs_text_memchr" memchr
    :: ByteArray# -> CSize -> CSize -> Word8 -> IO CSsize