Character classification
\begin{code}
module Ctype
( is_ident
, is_symbol
, is_any
, is_space
, is_lower
, is_upper
, is_digit
, is_alphanum
, is_decdigit, is_hexdigit, is_octdigit
, hexDigit, octDecDigit
) where
#include "HsVersions.h"
import Data.Int ( Int32 )
import Data.Bits ( Bits((.&.)) )
import Data.Char ( ord, chr )
import Panic
\end{code}
Bit masks
\begin{code}
-- Character-class flags.  Each class owns one distinct bit, so several
-- classes can be combined in a single Int and tested with a bit mask.

-- | Flag carried by characters that charType marks as identifier characters.
cIdent :: Int
cIdent = 1

-- | Flag carried by characters that charType marks as symbol characters.
cSymbol :: Int
cSymbol = 2

-- | Flag carried by every character from space up to tilde in charType.
cAny :: Int
cAny = 4

-- | Flag carried by the whitespace characters in charType.
cSpace :: Int
cSpace = 8

-- | Flag carried by lower-case letters (and underscore) in charType.
cLower :: Int
cLower = 16

-- | Flag carried by upper-case letters in charType.
cUpper :: Int
cUpper = 32

-- | Flag carried by the decimal digits in charType.
cDigit :: Int
cDigit = 64
\end{code}
The predicates below look costly, but they are not: GHC and GCC do a great
job of compiling the big case expression in charType below.
\begin{code}
-- | Test whether a character belongs to at least one of the classes
-- selected by the given mask.
--
-- The class bits of the character (from charType) and the mask are both
-- narrowed to Int32 before being and-ed together; the result is True when
-- any selected bit is set.
is_ctype :: Int -> Char -> Bool
is_ctype mask c = (classBits .&. wanted) /= 0
  where
    classBits, wanted :: Int32
    classBits = fromIntegral (charType c)
    wanted    = fromIntegral mask
-- Single-class predicates.  Each one checks one class bit (or, for
-- is_alphanum, a union of bits) through is_ctype.  They inherit the
-- behaviour of charType, which panics on characters above '\127'.

-- | The character has the cIdent bit set.
is_ident :: Char -> Bool
is_ident = is_ctype cIdent

-- | The character has the cSymbol bit set.
is_symbol :: Char -> Bool
is_symbol = is_ctype cSymbol

-- | The character has the cAny bit set.
is_any :: Char -> Bool
is_any = is_ctype cAny

-- | The character has the cSpace bit set.
is_space :: Char -> Bool
is_space = is_ctype cSpace

-- | The character has the cLower bit set.
is_lower :: Char -> Bool
is_lower = is_ctype cLower

-- | The character has the cUpper bit set.
is_upper :: Char -> Bool
is_upper = is_ctype cUpper

-- | The character has the cDigit bit set.
is_digit :: Char -> Bool
is_digit = is_ctype cDigit

-- | The character has at least one of the cLower, cUpper or cDigit bits set.
is_alphanum :: Char -> Bool
is_alphanum = is_ctype alphaNumMask
  where
    -- The bits are disjoint, so addition builds the same mask as bitwise or.
    alphaNumMask = cLower + cUpper + cDigit
\end{code}
Utils
\begin{code}
-- | Numeric value of a hexadecimal digit character.
--
-- Decimal digits map to 0..9.  Any other character is lower-cased and
-- measured from 'a', so 'a'..'f' and 'A'..'F' map to 10..15.  The argument
-- is not validated: the result is meaningless for characters that are not
-- hexadecimal digits.
hexDigit :: Char -> Int
hexDigit c
  | is_decdigit c = ord c - ord '0'
  | otherwise     = ord (to_lower c) - ord 'a' + 10
-- | Numeric value of an octal or decimal digit character, computed as the
-- offset of the character from '0'.  The argument is not validated.
octDecDigit :: Char -> Int
octDecDigit c = ord c - ord '0'
-- | True exactly for the ASCII decimal digits '0' through '9'.
is_decdigit :: Char -> Bool
is_decdigit c = '0' <= c && c <= '9'
-- | True for ASCII hexadecimal digits: '0'..'9', 'a'..'f' and 'A'..'F'.
is_hexdigit :: Char -> Bool
is_hexdigit c = is_decdigit c || between 'a' 'f' || between 'A' 'F'
  where
    -- Inclusive range test on the character being classified.
    between lo hi = c >= lo && c <= hi
-- | True exactly for the ASCII octal digits '0' through '7'.
is_octdigit :: Char -> Bool
is_octdigit c = '0' <= c && c <= '7'
-- | Lower-case an ASCII upper-case letter; every other character is
-- returned unchanged.
to_lower :: Char -> Char
to_lower c
  -- Shift by the (negative) distance between 'A' and 'a'.
  | c >= 'A' && c <= 'Z' = chr (ord c - (ord 'A' - ord 'a'))
  | otherwise            = c
\end{code}
We really mean .|. instead of + below, but GHC currently doesn't do
any constant folding with bitops. *sigh*
\begin{code}
-- | Class bits for a character in the range '\0'..'\127'.
--
-- The result is a sum of the c* flag constants (the flags are disjoint
-- bits, so the sum equals their bitwise or).  Character escapes in the
-- patterns are decimal code points.  Any character above '\127' falls
-- through to the final alternative and panics.
charType :: Char -> Int
charType c = case c of
    -- '\0'..'\8': control characters, no class bits
    '\0' -> 0
    '\1' -> 0
    '\2' -> 0
    '\3' -> 0
    '\4' -> 0
    '\5' -> 0
    '\6' -> 0
    '\7' -> 0
    '\8' -> 0
    -- '\9'..'\13': tab, line feed, vertical tab, form feed, carriage return
    '\9' -> cSpace
    '\10' -> cSpace
    '\11' -> cSpace
    '\12' -> cSpace
    '\13' -> cSpace
    -- '\14'..'\31': remaining control characters, no class bits
    '\14' -> 0
    '\15' -> 0
    '\16' -> 0
    '\17' -> 0
    '\18' -> 0
    '\19' -> 0
    '\20' -> 0
    '\21' -> 0
    '\22' -> 0
    '\23' -> 0
    '\24' -> 0
    '\25' -> 0
    '\26' -> 0
    '\27' -> 0
    '\28' -> 0
    '\29' -> 0
    '\30' -> 0
    '\31' -> 0
    -- '\32'..'\47': space and punctuation; '\39' (apostrophe) is an
    -- identifier character
    '\32' -> cAny + cSpace
    '\33' -> cAny + cSymbol
    '\34' -> cAny
    '\35' -> cAny + cSymbol
    '\36' -> cAny + cSymbol
    '\37' -> cAny + cSymbol
    '\38' -> cAny + cSymbol
    '\39' -> cAny + cIdent
    '\40' -> cAny
    '\41' -> cAny
    '\42' -> cAny + cSymbol
    '\43' -> cAny + cSymbol
    '\44' -> cAny
    '\45' -> cAny + cSymbol
    '\46' -> cAny + cSymbol
    '\47' -> cAny + cSymbol
    -- '\48'..'\57': decimal digits
    '\48' -> cAny + cIdent + cDigit
    '\49' -> cAny + cIdent + cDigit
    '\50' -> cAny + cIdent + cDigit
    '\51' -> cAny + cIdent + cDigit
    '\52' -> cAny + cIdent + cDigit
    '\53' -> cAny + cIdent + cDigit
    '\54' -> cAny + cIdent + cDigit
    '\55' -> cAny + cIdent + cDigit
    '\56' -> cAny + cIdent + cDigit
    '\57' -> cAny + cIdent + cDigit
    -- '\58'..'\64': punctuation between the digits and the letters
    '\58' -> cAny + cSymbol
    '\59' -> cAny
    '\60' -> cAny + cSymbol
    '\61' -> cAny + cSymbol
    '\62' -> cAny + cSymbol
    '\63' -> cAny + cSymbol
    '\64' -> cAny + cSymbol
    -- '\65'..'\90': upper-case letters
    '\65' -> cAny + cIdent + cUpper
    '\66' -> cAny + cIdent + cUpper
    '\67' -> cAny + cIdent + cUpper
    '\68' -> cAny + cIdent + cUpper
    '\69' -> cAny + cIdent + cUpper
    '\70' -> cAny + cIdent + cUpper
    '\71' -> cAny + cIdent + cUpper
    '\72' -> cAny + cIdent + cUpper
    '\73' -> cAny + cIdent + cUpper
    '\74' -> cAny + cIdent + cUpper
    '\75' -> cAny + cIdent + cUpper
    '\76' -> cAny + cIdent + cUpper
    '\77' -> cAny + cIdent + cUpper
    '\78' -> cAny + cIdent + cUpper
    '\79' -> cAny + cIdent + cUpper
    '\80' -> cAny + cIdent + cUpper
    '\81' -> cAny + cIdent + cUpper
    '\82' -> cAny + cIdent + cUpper
    '\83' -> cAny + cIdent + cUpper
    '\84' -> cAny + cIdent + cUpper
    '\85' -> cAny + cIdent + cUpper
    '\86' -> cAny + cIdent + cUpper
    '\87' -> cAny + cIdent + cUpper
    '\88' -> cAny + cIdent + cUpper
    '\89' -> cAny + cIdent + cUpper
    '\90' -> cAny + cIdent + cUpper
    -- '\91'..'\96': punctuation; '\95' (underscore) is classed with the
    -- lower-case identifier characters
    '\91' -> cAny
    '\92' -> cAny + cSymbol
    '\93' -> cAny
    '\94' -> cAny + cSymbol
    '\95' -> cAny + cIdent + cLower
    '\96' -> cAny
    -- '\97'..'\122': lower-case letters
    '\97' -> cAny + cIdent + cLower
    '\98' -> cAny + cIdent + cLower
    '\99' -> cAny + cIdent + cLower
    '\100' -> cAny + cIdent + cLower
    '\101' -> cAny + cIdent + cLower
    '\102' -> cAny + cIdent + cLower
    '\103' -> cAny + cIdent + cLower
    '\104' -> cAny + cIdent + cLower
    '\105' -> cAny + cIdent + cLower
    '\106' -> cAny + cIdent + cLower
    '\107' -> cAny + cIdent + cLower
    '\108' -> cAny + cIdent + cLower
    '\109' -> cAny + cIdent + cLower
    '\110' -> cAny + cIdent + cLower
    '\111' -> cAny + cIdent + cLower
    '\112' -> cAny + cIdent + cLower
    '\113' -> cAny + cIdent + cLower
    '\114' -> cAny + cIdent + cLower
    '\115' -> cAny + cIdent + cLower
    '\116' -> cAny + cIdent + cLower
    '\117' -> cAny + cIdent + cLower
    '\118' -> cAny + cIdent + cLower
    '\119' -> cAny + cIdent + cLower
    '\120' -> cAny + cIdent + cLower
    '\121' -> cAny + cIdent + cLower
    '\122' -> cAny + cIdent + cLower
    -- '\123'..'\126': trailing punctuation
    '\123' -> cAny
    '\124' -> cAny + cSymbol
    '\125' -> cAny
    '\126' -> cAny + cSymbol
    -- '\127': delete, no class bits
    '\127' -> 0
    -- Anything outside the 7-bit range is not classified here.
    _ -> panic ("charType: " ++ show c)
\end{code}