{-# LANGUAGE Trustworthy #-} {-# LANGUAGE NoImplicitPrelude #-} ----------------------------------------------------------------------------- -- | -- Module : Data.Char -- Copyright : (c) The University of Glasgow 2001 -- License : BSD-style (see the file libraries/base/LICENSE) -- -- Maintainer : libraries@haskell.org -- Stability : stable -- Portability : portable -- -- The Char type and associated operations. -- ----------------------------------------------------------------------------- module Data.Char ( Char -- * Character classification -- | Unicode characters are divided into letters, numbers, marks, -- punctuation, symbols, separators (including spaces) and others -- (including control characters). , isControl, isSpace , isLower, isUpper, isAlpha, isAlphaNum, isPrint , isDigit, isOctDigit, isHexDigit , isLetter, isMark, isNumber, isPunctuation, isSymbol, isSeparator -- ** Subranges , isAscii, isLatin1 , isAsciiUpper, isAsciiLower -- ** Unicode general categories , GeneralCategory(..), generalCategory -- * Case conversion , toUpper, toLower, toTitle -- * Single digit characters , digitToInt , intToDigit -- * Numeric representations , ord , chr -- * String representations , showLitChar , lexLitChar , readLitChar ) where import GHC.Base import GHC.Arr (Ix) import GHC.Char import GHC.Real (fromIntegral) import GHC.Show import GHC.Read (Read, readLitChar, lexLitChar) import GHC.Unicode import GHC.Num import GHC.Enum -- $setup -- Allow the use of Prelude in doctests. -- >>> import Prelude -- | Convert a single digit 'Char' to the corresponding 'Int'. This -- function fails unless its argument satisfies 'isHexDigit', but -- recognises both upper- and lower-case hexadecimal digits (that -- is, @\'0\'@..@\'9\'@, @\'a\'@..@\'f\'@, @\'A\'@..@\'F\'@). -- -- ==== __Examples__ -- -- Characters @\'0\'@ through @\'9\'@ are converted properly to -- @0..9@: -- -- >>> map digitToInt ['0'..'9'] -- [0,1,2,3,4,5,6,7,8,9] -- -- Both upper- and lower-case @\'A\'@ through @\'F\'@ are converted -- as well, to @10..15@. -- -- >>> map digitToInt ['a'..'f'] -- [10,11,12,13,14,15] -- >>> map digitToInt ['A'..'F'] -- [10,11,12,13,14,15] -- -- Anything else throws an exception: -- -- >>> digitToInt 'G' -- *** Exception: Char.digitToInt: not a digit 'G' -- >>> digitToInt '♥' -- *** Exception: Char.digitToInt: not a digit '\9829' -- digitToInt :: Char -> Int digitToInt c | (fromIntegral dec::Word) <= 9 = dec | (fromIntegral hexl::Word) <= 5 = hexl + 10 | (fromIntegral hexu::Word) <= 5 = hexu + 10 | otherwise = error ("Char.digitToInt: not a digit " ++ show c) -- sigh where dec = ord c - ord '0' hexl = ord c - ord 'a' hexu = ord c - ord 'A' -- | Unicode General Categories (column 2 of the UnicodeData table) in -- the order they are listed in the Unicode standard (the Unicode -- Character Database, in particular). -- -- ==== __Examples__ -- -- Basic usage: -- -- >>> :t OtherLetter -- OtherLetter :: GeneralCategory -- -- 'Eq' instance: -- -- >>> UppercaseLetter == UppercaseLetter -- True -- >>> UppercaseLetter == LowercaseLetter -- False -- -- 'Ord' instance: -- -- >>> NonSpacingMark <= MathSymbol -- True -- -- 'Enum' instance: -- -- >>> enumFromTo ModifierLetter SpacingCombiningMark -- [ModifierLetter,OtherLetter,NonSpacingMark,SpacingCombiningMark] -- -- 'Read' instance: -- -- >>> read "DashPunctuation" :: GeneralCategory -- DashPunctuation -- >>> read "17" :: GeneralCategory -- *** Exception: Prelude.read: no parse -- -- 'Show' instance: -- -- >>> show EnclosingMark -- "EnclosingMark" -- -- 'Bounded' instance: -- -- >>> minBound :: GeneralCategory -- UppercaseLetter -- >>> maxBound :: GeneralCategory -- NotAssigned -- -- 'Ix' instance: -- -- >>> import Data.Ix ( index ) -- >>> index (OtherLetter,Control) FinalQuote -- 12 -- >>> index (OtherLetter,Control) Format -- *** Exception: Error in array index -- data GeneralCategory = UppercaseLetter -- ^ Lu: Letter, Uppercase | LowercaseLetter -- ^ Ll: Letter, Lowercase | TitlecaseLetter -- ^ Lt: Letter, Titlecase | ModifierLetter -- ^ Lm: Letter, Modifier | OtherLetter -- ^ Lo: Letter, Other | NonSpacingMark -- ^ Mn: Mark, Non-Spacing | SpacingCombiningMark -- ^ Mc: Mark, Spacing Combining | EnclosingMark -- ^ Me: Mark, Enclosing | DecimalNumber -- ^ Nd: Number, Decimal | LetterNumber -- ^ Nl: Number, Letter | OtherNumber -- ^ No: Number, Other | ConnectorPunctuation -- ^ Pc: Punctuation, Connector | DashPunctuation -- ^ Pd: Punctuation, Dash | OpenPunctuation -- ^ Ps: Punctuation, Open | ClosePunctuation -- ^ Pe: Punctuation, Close | InitialQuote -- ^ Pi: Punctuation, Initial quote | FinalQuote -- ^ Pf: Punctuation, Final quote | OtherPunctuation -- ^ Po: Punctuation, Other | MathSymbol -- ^ Sm: Symbol, Math | CurrencySymbol -- ^ Sc: Symbol, Currency | ModifierSymbol -- ^ Sk: Symbol, Modifier | OtherSymbol -- ^ So: Symbol, Other | Space -- ^ Zs: Separator, Space | LineSeparator -- ^ Zl: Separator, Line | ParagraphSeparator -- ^ Zp: Separator, Paragraph | Control -- ^ Cc: Other, Control | Format -- ^ Cf: Other, Format | Surrogate -- ^ Cs: Other, Surrogate | PrivateUse -- ^ Co: Other, Private Use | NotAssigned -- ^ Cn: Other, Not Assigned deriving (Eq, Ord, Enum, Read, Show, Bounded, Ix) -- | The Unicode general category of the character. This relies on the -- 'Enum' instance of 'GeneralCategory', which must remain in the -- same order as the categories are presented in the Unicode -- standard. -- -- ==== __Examples__ -- -- Basic usage: -- -- >>> generalCategory 'a' -- LowercaseLetter -- >>> generalCategory 'A' -- UppercaseLetter -- >>> generalCategory '0' -- DecimalNumber -- >>> generalCategory '%' -- OtherPunctuation -- >>> generalCategory '♥' -- OtherSymbol -- >>> generalCategory '\31' -- Control -- >>> generalCategory ' ' -- Space -- generalCategory :: Char -> GeneralCategory generalCategory c = toEnum $ fromIntegral $ wgencat $ fromIntegral $ ord c -- derived character classifiers -- | Selects alphabetic Unicode characters (lower-case, upper-case and -- title-case letters, plus letters of caseless scripts and -- modifiers letters). This function is equivalent to -- 'Data.Char.isAlpha'. -- -- This function returns 'True' if its argument has one of the -- following 'GeneralCategory's, or 'False' otherwise: -- -- * 'UppercaseLetter' -- * 'LowercaseLetter' -- * 'TitlecaseLetter' -- * 'ModifierLetter' -- * 'OtherLetter' -- -- These classes are defined in the -- <http://www.unicode.org/reports/tr44/tr44-14.html#GC_Values_Table Unicode Character Database>, -- part of the Unicode standard. The same document defines what is -- and is not a \"Letter\". -- -- ==== __Examples__ -- -- Basic usage: -- -- >>> isLetter 'a' -- True -- >>> isLetter 'A' -- True -- >>> isLetter '0' -- False -- >>> isLetter '%' -- False -- >>> isLetter '♥' -- False -- >>> isLetter '\31' -- False -- -- Ensure that 'isLetter' and 'isAlpha' are equivalent. -- -- >>> let chars = [(chr 0)..] -- >>> let letters = map isLetter chars -- >>> let alphas = map isAlpha chars -- >>> letters == alphas -- True -- isLetter :: Char -> Bool isLetter c = case generalCategory c of UppercaseLetter -> True LowercaseLetter -> True TitlecaseLetter -> True ModifierLetter -> True OtherLetter -> True _ -> False -- | Selects Unicode mark characters, for example accents and the -- like, which combine with preceding characters. -- -- This function returns 'True' if its argument has one of the -- following 'GeneralCategory's, or 'False' otherwise: -- -- * 'NonSpacingMark' -- * 'SpacingCombiningMark' -- * 'EnclosingMark' -- -- These classes are defined in the -- <http://www.unicode.org/reports/tr44/tr44-14.html#GC_Values_Table Unicode Character Database>, -- part of the Unicode standard. The same document defines what is -- and is not a \"Mark\". -- -- ==== __Examples__ -- -- Basic usage: -- -- >>> isMark 'a' -- False -- >>> isMark '0' -- False -- -- Combining marks such as accent characters usually need to follow -- another character before they become printable: -- -- >>> map isMark "ò" -- [False,True] -- -- Puns are not necessarily supported: -- -- >>> isMark '✓' -- False -- isMark :: Char -> Bool isMark c = case generalCategory c of NonSpacingMark -> True SpacingCombiningMark -> True EnclosingMark -> True _ -> False -- | Selects Unicode numeric characters, including digits from various -- scripts, Roman numerals, et cetera. -- -- This function returns 'True' if its argument has one of the -- following 'GeneralCategory's, or 'False' otherwise: -- -- * 'DecimalNumber' -- * 'LetterNumber' -- * 'OtherNumber' -- -- These classes are defined in the -- <http://www.unicode.org/reports/tr44/tr44-14.html#GC_Values_Table Unicode Character Database>, -- part of the Unicode standard. The same document defines what is -- and is not a \"Number\". -- -- ==== __Examples__ -- -- Basic usage: -- -- >>> isNumber 'a' -- False -- >>> isNumber '%' -- False -- >>> isNumber '3' -- True -- -- ASCII @\'0\'@ through @\'9\'@ are all numbers: -- -- >>> and $ map isNumber ['0'..'9'] -- True -- -- Unicode Roman numerals are \"numbers\" as well: -- -- >>> isNumber 'Ⅸ' -- True -- isNumber :: Char -> Bool isNumber c = case generalCategory c of DecimalNumber -> True LetterNumber -> True OtherNumber -> True _ -> False -- | Selects Unicode punctuation characters, including various kinds -- of connectors, brackets and quotes. -- -- This function returns 'True' if its argument has one of the -- following 'GeneralCategory's, or 'False' otherwise: -- -- * 'ConnectorPunctuation' -- * 'DashPunctuation' -- * 'OpenPunctuation' -- * 'ClosePunctuation' -- * 'InitialQuote' -- * 'FinalQuote' -- * 'OtherPunctuation' -- -- These classes are defined in the -- <http://www.unicode.org/reports/tr44/tr44-14.html#GC_Values_Table Unicode Character Database>, -- part of the Unicode standard. The same document defines what is -- and is not a \"Punctuation\". -- -- ==== __Examples__ -- -- Basic usage: -- -- >>> isPunctuation 'a' -- False -- >>> isPunctuation '7' -- False -- >>> isPunctuation '♥' -- False -- >>> isPunctuation '"' -- True -- >>> isPunctuation '?' -- True -- >>> isPunctuation '—' -- True -- isPunctuation :: Char -> Bool isPunctuation c = case generalCategory c of ConnectorPunctuation -> True DashPunctuation -> True OpenPunctuation -> True ClosePunctuation -> True InitialQuote -> True FinalQuote -> True OtherPunctuation -> True _ -> False -- | Selects Unicode symbol characters, including mathematical and -- currency symbols. -- -- This function returns 'True' if its argument has one of the -- following 'GeneralCategory's, or 'False' otherwise: -- -- * 'MathSymbol' -- * 'CurrencySymbol' -- * 'ModifierSymbol' -- * 'OtherSymbol' -- -- These classes are defined in the -- <http://www.unicode.org/reports/tr44/tr44-14.html#GC_Values_Table Unicode Character Database>, -- part of the Unicode standard. The same document defines what is -- and is not a \"Symbol\". -- -- ==== __Examples__ -- -- Basic usage: -- -- >>> isSymbol 'a' -- False -- >>> isSymbol '6' -- False -- >>> isSymbol '=' -- True -- -- The definition of \"math symbol\" may be a little -- counter-intuitive depending on one's background: -- -- >>> isSymbol '+' -- True -- >>> isSymbol '-' -- False -- isSymbol :: Char -> Bool isSymbol c = case generalCategory c of MathSymbol -> True CurrencySymbol -> True ModifierSymbol -> True OtherSymbol -> True _ -> False -- | Selects Unicode space and separator characters. -- -- This function returns 'True' if its argument has one of the -- following 'GeneralCategory's, or 'False' otherwise: -- -- * 'Space' -- * 'LineSeparator' -- * 'ParagraphSeparator' -- -- These classes are defined in the -- <http://www.unicode.org/reports/tr44/tr44-14.html#GC_Values_Table Unicode Character Database>, -- part of the Unicode standard. The same document defines what is -- and is not a \"Separator\". -- -- ==== __Examples__ -- -- Basic usage: -- -- >>> isSeparator 'a' -- False -- >>> isSeparator '6' -- False -- >>> isSeparator ' ' -- True -- -- Warning: newlines and tab characters are not considered -- separators. -- -- >>> isSeparator '\n' -- False -- >>> isSeparator '\t' -- False -- -- But some more exotic characters are (like HTML's @ @): -- -- >>> isSeparator '\160' -- True -- isSeparator :: Char -> Bool isSeparator c = case generalCategory c of Space -> True LineSeparator -> True ParagraphSeparator -> True _ -> False