{-# INCLUDE "HsRegexPosixConfig.h" #-} {-# INCLUDE <sys/types.h> #-} {-# INCLUDE "regex.h" #-} {-# LINE 1 "Text/Regex/Posix/Wrap.hsc" #-} {-# OPTIONS_GHC -fglasgow-exts -fffi #-} {-# LINE 2 "Text/Regex/Posix/Wrap.hsc" #-} ----------------------------------------------------------------------------- -- | -- Module : Text.Regex.Posix.Wrap -- Copyright : (c) Chris Kuklewicz 2006 derived from (c) The University of Glasgow 2002 -- License : BSD-style (see the file LICENSE) -- -- Maintainer : libraries@haskell.org, textregexlazy@personal.mightyreason.com -- Stability : experimental -- Portability : non-portable (regex-base needs MPTC+FD) -- -- WrapPosix.hsc exports a wrapped version of the ffi imports. To -- increase type safety, the flags are newtype'd. The other important -- export is a 'Regex' type that is specific to the Posix library -- backend. The flags are documented in "Text.Regex.Posix". The -- 'defaultCompOpt' is @(compExtended .|. compNewline)@. -- -- The 'Regex', 'CompOption', and 'ExecOption' types and their 'RegexOptions' -- instance is declared. The '=~' and '=~~' convenience functions are -- defined. -- -- The exported symbols are the same whether HAVE_REGEX_H is defined, but -- when it is not defined then @getVersion == Nothing@ and all other -- exported values will call error or fail. -- -- This module will fail or error only if allocation fails or a nullPtr -- is passed in. -- ----------------------------------------------------------------------------- module Text.Regex.Posix.Wrap( -- ** High-level API Regex, RegOffset, (=~), (=~~), -- ** Low-level API WrapError, wrapCompile, wrapTest, wrapMatch, wrapMatchAll, wrapCount, -- ** Miscellaneous unusedRegOffset, -- ** Compilation options CompOption(CompOption), compBlank, compExtended, -- use extended regex syntax compIgnoreCase, -- ignore case when matching compNoSub, -- no substring matching needed compNewline, -- '.' doesn't match newline -- ** Execution options ExecOption(ExecOption), execBlank, execNotBOL, -- not at begining of line execNotEOL, -- not at end of line -- ** Return codes ReturnCode(ReturnCode), retBadbr, retBadpat, retBadrpt, retEcollate, retEctype, retEescape, retEsubreg, retEbrack, retEparen, retEbrace, retErange, retEspace ) where {-# LINE 80 "Text/Regex/Posix/Wrap.hsc" #-} {-# LINE 81 "Text/Regex/Posix/Wrap.hsc" #-} {-# LINE 85 "Text/Regex/Posix/Wrap.hsc" #-} {-# LINE 87 "Text/Regex/Posix/Wrap.hsc" #-} {-# LINE 89 "Text/Regex/Posix/Wrap.hsc" #-} {-# LINE 90 "Text/Regex/Posix/Wrap.hsc" #-} {-# LINE 100 "Text/Regex/Posix/Wrap.hsc" #-} import Control.Monad(mapM) import Data.Array(Array,listArray) import Data.Bits(Bits(..)) import Data.Int -- need whatever #regoff_t type will be import Foreign(Ptr, FunPtr, nullPtr, mallocForeignPtrBytes, addForeignPtrFinalizer, Storable(peekByteOff), allocaArray, allocaBytes, withForeignPtr,ForeignPtr,plusPtr) import Foreign.C(CSize,CInt) import Foreign.C.String(peekCAString, CString) import Text.Regex.Base.RegexLike(RegexOptions(..),RegexMaker(..),RegexContext(..),MatchArray) type CRegex = () -- dummy regex_t used below to read out nsub value -- | RegOffset is "typedef int regoff_t" on Linux and ultimately "typedef -- long long __int64_t" on Max OS X. So rather than saying -- 2,147,483,647 is all the length you need, I'll take the larger: -- 9,223,372,036,854,775,807 should be enough bytes for anyone, no -- need for Integer. The alternative is to compile to different sizes -- in a platform dependent manner with "type RegOffset = (#type -- regoff_t)", which I do not want to do. -- -- There is also a special value 'unusedRegOffset' :: 'RegOffset' which is -- (-1) and as a starting index means that the subgroup capture was -- unused. Otherwise the RegOffset indicates a character boundary that -- is before the character at that index offset, with the first -- character at index offset 0. So starting at 1 and ending at 2 means -- to take only the second character. type RegOffset = Int64 -- | A bitmapped 'CInt' containing options for compilation of regular -- expressions. Option values (and their man 3 regcomp names) are -- -- * 'compBlank' which is a completely zero value for all the flags. -- This is also the 'blankCompOpt' value. -- -- * 'compExtended' (REG_EXTENDED) which can be set to use extended instead -- of basic regular expressions. -- This is set in the 'defaultCompOpt' value. -- -- * 'compNewline' (REG_NEWLINE) turns on newline sensitivity: The dot (.) -- and inverted set @[^ ]@ never match newline, and ^ and $ anchors do -- match after and before newlines. -- This is set in the 'defaultCompOpt' value. -- -- * 'compIgnoreCase' (REG_ICASE) which can be set to match ignoring upper -- and lower distinctions. -- -- * 'compNoSub' (REG_NOSUB) which turns off all information from matching -- except whether a match exists. {-# LINE 151 "Text/Regex/Posix/Wrap.hsc" #-} newtype CompOption = CompOption CInt deriving (Eq,Show,Num,Bits) {-# LINE 173 "Text/Regex/Posix/Wrap.hsc" #-} -- | A bitmapped 'CInt' containing options for execution of compiled -- regular expressions. Option values (and their man 3 regexec names) are -- -- * 'execBlank' which is a complete zero value for all the flags. This is -- the blankExecOpt value. -- -- * 'execNotBOL' (REG_NOTBOL) can be set to prevent ^ from matching at the -- start of the input. -- -- * 'execNotEOL' (REG_NOTEOL) can be set to prevent $ from matching at the -- end of the input (before the terminating NUL). {-# LINE 186 "Text/Regex/Posix/Wrap.hsc" #-} newtype ExecOption = ExecOption CInt deriving (Eq,Show,Num,Bits) {-# LINE 208 "Text/Regex/Posix/Wrap.hsc" #-} -- | ReturnCode is an enumerated 'CInt', corresponding to the error codes -- from @man 3 regex@: -- -- * 'retBadbr' (@REG_BADBR@) invalid repetition count(s) in @{ }@ -- -- * 'retBadpat' (@REG_BADPAT@) invalid regular expression -- -- * 'retBadrpt' (@REG_BADRPT@) @?@, @*@, or @+@ operand invalid -- -- * 'retEcollate' (@REG_ECOLLATE@) invalid collating element -- -- * 'retEctype' (@REG_ECTYPE@) invalid character class -- -- * 'retEescape' (@REG_EESCAPE@) @\\@ applied to unescapable character -- -- * 'retEsubreg' (@REG_ESUBREG@) invalid backreference number -- -- * 'retEbrack' (@REG_EBRACK@) brackets @[ ]@ not balanced -- -- * 'retEparen' (@REG_EPAREN@) parentheses @( )@ not balanced -- -- * 'retEbrace' (@REG_EBRACE@) braces @{ }@ not balanced -- -- * 'retErange' (@REG_ERANGE@) invalid character range in @[ ]@ -- -- * 'retEspace' (@REG_ESPACE@) ran out of memory -- -- * 'retNoMatch' (@REG_NOMATCH@) The regexec() function failed to match -- newtype ReturnCode = ReturnCode CInt deriving (Eq,Show) -- | A compiled regular expression. data Regex = Regex (ForeignPtr CRegex) CompOption ExecOption -- | A completely zero value for all the flags. -- This is also the 'blankCompOpt' value. compBlank :: CompOption compBlank = CompOption 0 -- | A completely zero value for all the flags. -- This is also the 'blankExecOpt' value. execBlank :: ExecOption execBlank = ExecOption 0 unusedRegOffset :: RegOffset unusedRegOffset = (-1) -- | The return code will be retOk when it is the Haskell wrapper and -- not the underlying library generating the error message. type WrapError = (ReturnCode,String) wrapCompile :: CompOption -- ^ Flags (bitmapped) -> ExecOption -- ^ Flags (bitmapped) -> CString -- ^ The regular expression to compile (ASCII only, no null bytes) -> IO (Either WrapError Regex) -- ^ Returns: the compiled regular expression wrapTest :: Regex -> CString -> IO (Either WrapError Bool) -- | wrapMatch returns offsets for the begin and end of each capture. -- Unused captures have offsets of unusedRegOffset which is (-1) wrapMatch :: Regex -> CString -> IO (Either WrapError (Maybe [(RegOffset,RegOffset)])) -- | wrapMatchAll returns the offset and length of each capture. -- Unused captures have an offset of unusedRegOffset which is (-1) and -- length of 0. wrapMatchAll :: Regex -> CString -> IO (Either WrapError [MatchArray]) wrapCount :: Regex -> CString -> IO (Either WrapError Int) (=~) :: (RegexMaker Regex CompOption ExecOption source,RegexContext Regex source1 target) => source1 -> source -> target (=~~) :: (RegexMaker Regex CompOption ExecOption source,RegexContext Regex source1 target,Monad m) => source1 -> source -> m target instance RegexOptions Regex CompOption ExecOption where blankCompOpt = compBlank blankExecOpt = execBlank defaultCompOpt = compExtended .|. compNewline defaultExecOpt = execBlank setExecOpts e' (Regex r c _) = Regex r c e' getExecOpts (Regex _ _ e) = e -- (=~) ::(RegexMaker Regex CompOption ExecOption source,RegexContext Regex source1 target) => source1 -> source -> target (=~) x r = let make :: RegexMaker Regex CompOption ExecOption a => a -> Regex make = makeRegex in match (make r) x -- (=~~) ::(RegexMaker Regex CompOption ExecOption source,RegexContext Regex source1 target,Monad m) => source1 -> source -> m target (=~~) x r = let make :: RegexMaker Regex CompOption ExecOption a => a -> Regex make = makeRegex in matchM (make r) x type CRegMatch = () -- dummy regmatch_t used below to read out so and eo values -- ----------------------------------------------------------------------------- -- The POSIX regex C interface {-# LINE 311 "Text/Regex/Posix/Wrap.hsc" #-} foreign import ccall unsafe "regcomp" c_regcomp :: Ptr CRegex -> CString -> CompOption -> IO ReturnCode foreign import ccall unsafe "®free" c_regfree :: FunPtr (Ptr CRegex -> IO ()) foreign import ccall unsafe "regexec" c_regexec :: Ptr CRegex -> CString -> CSize -> Ptr CRegMatch -> ExecOption -> IO ReturnCode foreign import ccall unsafe "regerror" c_regerror :: ReturnCode -> Ptr CRegex -> CString -> CSize -> IO CSize {-# LINE 358 "Text/Regex/Posix/Wrap.hsc" #-} retOk :: ReturnCode retOk = ReturnCode 0 -- Flags for regexec execNotBOL :: ExecOption execNotBOL = ExecOption 1 execNotEOL :: ExecOption execNotEOL = ExecOption 2 {-# LINE 366 "Text/Regex/Posix/Wrap.hsc" #-} -- Flags for regcomp compExtended :: CompOption compExtended = CompOption 1 compIgnoreCase :: CompOption compIgnoreCase = CompOption 2 compNoSub :: CompOption compNoSub = CompOption 8 compNewline :: CompOption compNewline = CompOption 4 {-# LINE 373 "Text/Regex/Posix/Wrap.hsc" #-} -- Return values from regexec (REG_NOMATCH, REG_ESPACE,...) -- Error codes from regcomp (not REG_NOMATCH) -- Though calling retNoMatch an error is rather missing the point... retNoMatch :: ReturnCode retNoMatch = ReturnCode 1 retBadbr :: ReturnCode retBadbr = ReturnCode 10 retBadpat :: ReturnCode retBadpat = ReturnCode 2 retBadrpt :: ReturnCode retBadrpt = ReturnCode 13 retEcollate :: ReturnCode retEcollate = ReturnCode 3 retEctype :: ReturnCode retEctype = ReturnCode 4 retEescape :: ReturnCode retEescape = ReturnCode 5 retEsubreg :: ReturnCode retEsubreg = ReturnCode 6 retEbrack :: ReturnCode retEbrack = ReturnCode 7 retEparen :: ReturnCode retEparen = ReturnCode 8 retEbrace :: ReturnCode retEbrace = ReturnCode 9 retErange :: ReturnCode retErange = ReturnCode 11 retEspace :: ReturnCode retEspace = ReturnCode 12 {-# LINE 391 "Text/Regex/Posix/Wrap.hsc" #-} ---- -- error helpers nullTest :: Ptr a -> String -> IO (Either WrapError b) -> IO (Either WrapError b) {-# INLINE nullTest #-} nullTest ptr msg io = do if nullPtr == ptr then return (Left (retOk,"Ptr parameter was nullPtr in Text.Regex.TRE.Wrap."++msg)) else io {- wrapRC :: ReturnCode -> IO (Either WrapError b) {-# INLINE wrapRC #-} wrapRC r = return (Left (r,"Error in Text.Regex.Posix.Wrap: "++show r)) -} wrapError :: ReturnCode -> Ptr CRegex -> IO (Either WrapError b) wrapError errCode regex_ptr = do -- Call once to compute the error message buffer size errBufSize <- c_regerror errCode regex_ptr nullPtr 0 -- Allocate a temporary buffer to hold the error message allocaArray (fromIntegral errBufSize) $ \errBuf -> do nullTest errBuf "wrapError errBuf" $ do c_regerror errCode regex_ptr errBuf errBufSize msg <- peekCAString errBuf :: IO String return (Left (errCode, msg)) ---------- wrapCompile flags e pattern = do nullTest pattern "wrapCompile pattern" $ do regex_fptr <- mallocForeignPtrBytes (32) {-# LINE 421 "Text/Regex/Posix/Wrap.hsc" #-} withForeignPtr regex_fptr $ \regex_ptr -> do if nullPtr == regex_ptr then return (Left (retOk,"Text.Regex.Posix.Wrap.wrapCompile could not malloc")) else do errCode <- c_regcomp regex_ptr pattern flags if (errCode == retOk) then do -- The SunOS 5.10 man page for regcomp says that if the compilation -- fails then the content of the regex buffer is undefined. Trying to -- free an undefined buffer causes a segv with (at least) GHC 6.10.1 -- on SPARC and x86. Test for this is ghc-regress/lib/Regex/regex001.hs -- addForeignPtrFinalizer c_regfree regex_fptr return . Right $ Regex regex_fptr flags e else wrapError errCode regex_ptr --------- wrapTest (Regex regex_fptr _ flags) cstr = do nullTest cstr "wrapTest" $ do withForeignPtr regex_fptr $ \regex_ptr -> do r <- c_regexec regex_ptr cstr 0 nullPtr flags if r == retOk then return (Right True) else if r == retNoMatch then return (Right False) else wrapError r regex_ptr --------- wrapMatch regex@(Regex regex_fptr compileOptions flags) cstr = do nullTest cstr "wrapMatch cstr" $ do if (0 /= compNoSub .&. compileOptions) then do r <- wrapTest regex cstr case r of Right True -> return (Right (Just [])) -- Source of much "wtf?" crap Right False -> return (Right Nothing) Left err -> return (Left err) else do withForeignPtr regex_fptr $ \regex_ptr -> do nsub <- ((\hsc_ptr -> peekByteOff hsc_ptr 24)) regex_ptr :: IO CSize {-# LINE 463 "Text/Regex/Posix/Wrap.hsc" #-} let nsub_int,nsub_bytes :: Int nsub_int = fromIntegral nsub nsub_bytes = ((1 + nsub_int) * (8)) {-# LINE 466 "Text/Regex/Posix/Wrap.hsc" #-} -- add one because index zero covers the whole match allocaBytes nsub_bytes $ \p_match -> do nullTest p_match "wrapMatch allocaBytes" $ do doMatch regex_ptr cstr nsub p_match flags -- Very very thin wrapper -- Requires, but does not check, that nsub>=0 -- Cannot return (Right (Just [])) doMatch :: Ptr CRegex -> CString -> CSize -> Ptr CRegMatch -> ExecOption -> IO (Either WrapError (Maybe [(RegOffset,RegOffset)])) {-# INLINE doMatch #-} doMatch regex_ptr cstr nsub p_match flags = do r <- c_regexec regex_ptr cstr (1 + nsub) p_match flags if r == retOk then do regions <- mapM getOffsets . take (1+fromIntegral nsub) . iterate (`plusPtr` (8)) $ p_match {-# LINE 483 "Text/Regex/Posix/Wrap.hsc" #-} return (Right (Just regions)) -- regions will not be [] else if r == retNoMatch then return (Right Nothing) else wrapError r regex_ptr where getOffsets :: Ptr CRegMatch -> IO (RegOffset,RegOffset) {-# INLINE getOffsets #-} getOffsets pmatch' = do start <- ((\hsc_ptr -> peekByteOff hsc_ptr 0)) pmatch' :: IO (Int32) {-# LINE 492 "Text/Regex/Posix/Wrap.hsc" #-} end <- ((\hsc_ptr -> peekByteOff hsc_ptr 4)) pmatch' :: IO (Int32) {-# LINE 493 "Text/Regex/Posix/Wrap.hsc" #-} return (fromIntegral start,fromIntegral end) wrapMatchAll regex@(Regex regex_fptr compileOptions flags) cstr = do nullTest cstr "wrapMatchAll cstr" $ do if (0 /= compNoSub .&. compileOptions) then do r <- wrapTest regex cstr case r of Right True -> return (Right [(toMA 0 [])]) -- Source of much "wtf?" crap Right False -> return (Right []) Left err -> return (Left err) else do withForeignPtr regex_fptr $ \regex_ptr -> do nsub <- ((\hsc_ptr -> peekByteOff hsc_ptr 24)) regex_ptr :: IO CSize {-# LINE 507 "Text/Regex/Posix/Wrap.hsc" #-} let nsub_int,nsub_bytes :: Int nsub_int = fromIntegral nsub nsub_bytes = ((1 + nsub_int) * (8)) {-# LINE 510 "Text/Regex/Posix/Wrap.hsc" #-} -- add one because index zero covers the whole match allocaBytes nsub_bytes $ \p_match -> do nullTest cstr "wrapMatchAll p_match" $ do let flags' = (execNotBOL .|. flags) at pos = doMatch regex_ptr (plusPtr cstr pos) nsub p_match flags' loop acc old (s,e) | old `seq` s == e = return (Right (acc [])) | otherwise = do let pos = old + fromIntegral e result <- at pos case result of Right Nothing -> return (Right (acc [])) Right (Just parts@(whole:_)) -> let ma = toMA pos parts in loop (acc.(ma:)) pos whole Left err -> return (Left err) Right (Just []) -> return (Right (acc [(toMA pos [])])) -- should never happen result <- doMatch regex_ptr cstr nsub p_match flags case result of Right Nothing -> return (Right []) Right (Just parts@(whole:_)) -> let ma = toMA 0 parts in loop (ma:) 0 whole Left err -> return (Left err) Right (Just []) -> return (Right [(toMA 0 [])]) -- should never happen where toMA :: Int -> [(RegOffset,RegOffset)] -> Array Int (Int,Int) toMA pos [] = listArray (0,0) [(pos,0)] -- wtf? toMA pos parts = listArray (0,pred (length parts)) . map (\(s,e)-> if s>=0 then (pos+fromIntegral s, fromIntegral (e-s)) else (-1,0)) $ parts --------- wrapCount regex@(Regex regex_fptr compileOptions flags) cstr = do nullTest cstr "wrapCount cstr" $ do if (0 /= compNoSub .&. compileOptions) then do r <- wrapTest regex cstr case r of Right True -> return (Right 1) Right False -> return (Right 0) Left err -> return (Left err) else do withForeignPtr regex_fptr $ \regex_ptr -> do let nsub_bytes = ((8)) {-# LINE 552 "Text/Regex/Posix/Wrap.hsc" #-} allocaBytes nsub_bytes $ \p_match -> do nullTest cstr "wrapCount p_match" $ do let flags' = (execNotBOL .|. flags) at pos = doMatch regex_ptr (plusPtr cstr pos) 0 p_match flags' loop acc old (s,e) | acc `seq` old `seq` False = undefined | s == e = return (Right acc) | otherwise = do let pos = old + fromIntegral e result <- at pos case result of Right Nothing -> return (Right acc) Right (Just (whole:_)) -> loop (succ acc) pos whole Left err -> return (Left err) Right (Just []) -> return (Right acc) -- should never happen result <- doMatch regex_ptr cstr 0 p_match flags case result of Right Nothing -> return (Right 0) Right (Just (whole:_)) -> loop 1 0 whole Left err -> return (Left err) Right (Just []) -> return (Right 0) -- should never happen {- -- This is the slower 0.66 version of the code (91s instead of 79s on 10^6 bytes) wrapMatchAll regex cstr = do let regex' = setExecOpts (execNotBOL .|. (getExecOpts regex)) regex at pos = wrapMatch regex' (plusPtr cstr pos) loop old (s,e) | s == e = return [] | otherwise = do let pos = old + fromIntegral e result <- at pos case unwrap result of Nothing -> return [] Just [] -> return ((toMA pos []):[]) -- wtf? Just parts@(whole:_) -> do rest <- loop pos whole return ((toMA pos parts) : rest) result <- wrapMatch regex cstr case unwrap result of Nothing -> return [] Just [] -> return ((toMA 0 []):[]) -- wtf? Just parts@(whole:_) -> do rest <- loop 0 whole return ((toMA 0 parts) : rest) --------- -- This was also changed to match wrapMatchAll after 0.66 wrapCount regex cstr = do let regex' = setExecOpts (execNotBOL .|. (getExecOpts regex)) regex at pos = wrapMatch regex' (plusPtr cstr pos) loop acc old (s,e) | acc `seq` old `seq` False = undefined | s == e = return acc | otherwise = do let pos = old + fromIntegral e result <- at pos case unwrap result of Nothing -> return acc Just [] -> return (succ acc) -- wtf? Just (whole:_) -> loop (succ acc) pos whole result <- wrapMatch regex cstr case unwrap result of Nothing -> return 0 Just [] -> return 1 -- wtf? Just (whole:_) -> loop 1 0 whole -}