-----------------------------------------------------------------------------
-- |
-- Module      :  Distribution.Simple.PreProcess.Unlit
-- Copyright   :  ...
--
-- Maintainer  :  cabal-devel@haskell.org
-- Portability :  portable
--
-- Remove the \"literal\" markups from a Haskell source file, including
-- \"@>@\", \"@\\begin{code}@\", \"@\\end{code}@\", and \"@#@\"

-- This version is interesting because instead of striping comment lines, it
-- turns them into "-- " style comments. This allows using haddock markup
-- in literate scripts without having to use "> --" prefix.

module Distribution.Simple.PreProcess.Unlit (unlit,plain) where

import Data.Char
import Data.List

data Classified = BirdTrack String | Blank String | Ordinary String
                | Line !Int String | CPP String
                | BeginCode | EndCode
                -- output only:
                | Error String | Comment String

-- | No unliteration.
plain :: String -> String -> String
plain _ hs = hs

classify :: String -> Classified
classify ('>':s) = BirdTrack s
classify ('#':s) = case tokens s of
                     (line:file:_) | all isDigit line
                                  && length file >= 2
                                  && head file == '"'
                                  && last file == '"'
                                -> Line (read line) (tail (init file))
                     _          -> CPP s
  where tokens = unfoldr $ \str -> case lex str of
                                   (t@(_:_), str'):_ -> Just (t, str')
                                   _                 -> Nothing
classify ('\\':s)
  | s `isPrefixOf` "begin{code}" = BeginCode
  | s `isPrefixOf` "end{code}"   = EndCode
classify s | all isSpace s       = Blank s
classify s                       = Ordinary s

-- So the weird exception for comment indenting is to make things work with
-- haddock, see classifyAndCheckForBirdTracks below.
unclassify :: Bool -> Classified -> String
unclassify _     (BirdTrack s) = ' ':s
unclassify _     (Blank s)     = s
unclassify _     (Ordinary s)  = s
unclassify _     (Line n file) = "# " ++ show n ++ " " ++ show file
unclassify _     (CPP s)       = '#':s
unclassify True  (Comment "")  = "  --"
unclassify True  (Comment s)   = "  -- " ++ s
unclassify False (Comment "")  = "--"
unclassify False (Comment s)   = "-- " ++ s
unclassify _     _             = internalError

-- | 'unlit' takes a filename (for error reports), and transforms the
--   given string, to eliminate the literate comments from the program text.
unlit :: FilePath -> String -> Either String String
unlit file input =
  let (usesBirdTracks, classified) = classifyAndCheckForBirdTracks
                                   . inlines
                                   $ input
   in either (Left . unlines . map (unclassify usesBirdTracks))
              Right
    . checkErrors
    . reclassify
    $ classified

  where
    -- So haddock requires comments and code to align, since it treats comments
    -- as following the layout rule. This is a pain for us since bird track
    -- style literate code typically gets indented by two since ">" is replaced
    -- by " " and people usually use one additional space of indent ie
    -- "> then the code". On the other hand we cannot just go and indent all
    -- the comments by two since that does not work for latex style literate
    -- code. So the hacky solution we use here is that if we see any bird track
    -- style code then we'll indent all comments by two, otherwise by none.
    -- Of course this will not work for mixed latex/bird track .lhs files but
    -- nobody does that, it's silly and specifically recommended against in the
    -- H98 unlit spec.
    --
    classifyAndCheckForBirdTracks =
      flip mapAccumL False $ \seenBirdTrack line ->
        let classification = classify line
         in (seenBirdTrack || isBirdTrack classification, classification)

    isBirdTrack (BirdTrack _) = True
    isBirdTrack _             = False

    checkErrors ls = case [ e | Error e <- ls ] of
      []          -> Left  ls
      (message:_) -> Right (f ++ ":" ++ show n ++ ": " ++ message)
        where (f, n) = errorPos file 1 ls
    errorPos f n []              = (f, n)
    errorPos f n (Error _:_)     = (f, n)
    errorPos _ _ (Line n' f':ls) = errorPos f' n' ls
    errorPos f n (_         :ls) = errorPos f  (n+1) ls

-- Here we model a state machine, with each state represented by
-- a local function. We only have four states (well, five,
-- if you count the error state), but the rules
-- to transition between then are not so simple.
-- Would it be simpler to have more states?
--
-- Each state represents the type of line that was last read
-- i.e. are we in a comment section, or a latex-code section,
-- or a bird-code section, etc?
reclassify :: [Classified] -> [Classified]
reclassify = blank -- begin in blank state
  where
    latex []               = []
    latex (EndCode    :ls) = Blank "" : comment ls
    latex (BeginCode  :_ ) = [Error "\\begin{code} in code section"]
    latex (BirdTrack l:ls) = Ordinary ('>':l) : latex ls
    latex (          l:ls) = l : latex ls

    blank []               = []
    blank (EndCode    :_ ) = [Error "\\end{code} without \\begin{code}"]
    blank (BeginCode  :ls) = Blank ""    : latex ls
    blank (BirdTrack l:ls) = BirdTrack l : bird ls
    blank (Ordinary  l:ls) = Comment   l : comment ls
    blank (          l:ls) =           l : blank ls

    bird []              = []
    bird (EndCode   :_ ) = [Error "\\end{code} without \\begin{code}"]
    bird (BeginCode :ls) = Blank "" : latex ls
    bird (Blank l   :ls) = Blank l  : blank ls
    bird (Ordinary _:_ ) = [Error "program line before comment line"]
    bird (         l:ls) = l : bird ls

    comment []               = []
    comment (EndCode    :_ ) = [Error "\\end{code} without \\begin{code}"]
    comment (BeginCode  :ls) = Blank "" : latex ls
    comment (CPP l      :ls) = CPP l : comment ls
    comment (BirdTrack _:_ ) = [Error "comment line before program line"]
    -- a blank line and another ordinary line following a comment
    -- will be treated as continuing the comment. Otherwise it's
    -- then end of the comment, with a blank line.
    comment (Blank     l:ls@(Ordinary  _:_)) = Comment l : comment ls
    comment (Blank     l:ls) = Blank l   : blank ls
    comment (Line n f   :ls) = Line n f  : comment ls
    comment (Ordinary  l:ls) = Comment l : comment ls
    comment (Comment   _: _) = internalError
    comment (Error     _: _) = internalError

-- Re-implementation of 'lines', for better efficiency (but decreased laziness).
-- Also, importantly, accepts non-standard DOS and Mac line ending characters.
inlines :: String -> [String]
inlines xs = lines' xs id
  where
  lines' []             acc = [acc []]
  lines' ('\^M':'\n':s) acc = acc [] : lines' s id    -- DOS
  lines' ('\^M':s)      acc = acc [] : lines' s id    -- MacOS
  lines' ('\n':s)       acc = acc [] : lines' s id    -- Unix
  lines' (c:s)          acc = lines' s (acc . (c:))

internalError :: a
internalError = error "unlit: internal error"