{-# LANGUAGE RecursiveDo, StandaloneDeriving #-}
module Text.Format.Cleartext
    ( Doc, Txt, Exp, BTok(..), LTok(..), MTok(..)
    , Table(..), Colspec(..), Row(..), Cell(..)
    , FMode(..)
    , Metadata, Bibliography, Footnotes, TxtContext(..)
    , metadata, bibliography, footnotes, metatags
    , doctitle, docsubtitle
    , folddoctree, folddoctree_wc
    , p_txt, p_doc, p_exp, parsetxt, parsedoc, parseexp
    , readctxfile
    , doctohtml, txttohtml, exptohtml
    , doctoascii, txttoascii, exptoascii
    ) where

import Text.Parsers.Frisby hiding (text)
import qualified Text.Parsers.Frisby as Frisby
import Text.Parsers.Frisby.Char hiding (space)
import Data.Char (isSpace, ord)
import Control.Monad (liftM)
import Control.Monad.List (mplus)
import Data.Maybe (listToMaybe)
import XmlOut
import UTF8


---- Document representation

type Doc = [BTok]
type Txt = [LTok]
type Exp = [MTok]

-- block tokens
data BTok = TEXT Txt        -- text block
          | PARA Txt        -- new paragraph (w. opt. title)
          | SECT Int Txt    -- new section (of given level, w. title)
          | META String String  -- metadata (key, value) pair
          | FLOA FMode Doc Txt  -- floating body w. caption
          | QUOT Doc        -- blockquote
          | CODE String     -- code block
          | BULL [Doc]      -- bulleted list
          | NUMB [Doc]      -- numbered list
          | TABL (Table Txt) -- table
          | FOOT Footnotes  -- footnote definitions
    deriving (Show, Read)

-- table data
data Table txt = Table [Colspec] [Row txt]
data Colspec   = Vrule | Colleft | Colcenter | Colright
data Row txt   = Hrule | Row [Cell txt]
data Cell txt  = Joinleft | Joinright | Cell txt

deriving instance Show t => Show (Table t)
deriving instance Show Colspec
deriving instance Show t => Show (Row t)
deriving instance Show t => Show (Cell t)

deriving instance Read t => Read (Table t)
deriving instance Read Colspec
deriving instance Read t => Read (Row t)
deriving instance Read t => Read (Cell t)

instance Functor Cell where
    fmap f c = case c of Cell x -> Cell (f x)
                         Joinleft -> Joinleft; Joinright -> Joinright
instance Functor Row where
    fmap f r = case r of Row cs -> Row (map (fmap f) cs); Hrule -> Hrule
instance Functor Table where
    fmap f (Table cs rs) = Table cs (map (fmap f) rs)

-- for floating bodies, should the rest of the document flow around it?
data FMode = FlowRight  -- document will flow around the float to the right
           | FlowLeft   -- doc will flow to the left
           | NoFlow     -- doc will continue below the float
    deriving (Show, Read)

-- line tokens
data LTok = St String       -- character string
          | Hy Txt String   -- hyperlink (linked text, target url)
          | Im String       -- embedded image
          | Em Txt          -- emphasized text
          | Bd Txt          -- boldface
          | Co String       -- inline 'code'
          | Qu Txt          -- quoted text
          | An String       -- reference anchor
          | Xr String       -- cross reference
          | Fn String       -- footnote reference
          | Ma Exp          -- math mode
          | Nl              -- explicit linebreak
    deriving (Show, Read)

-- math tokens
data MTok = Mvar String             -- things like x, i, n, ...
          | Mopr String             -- things like exp, log, sin, ...
          | Mtxt Txt                -- embedded normal text
          | Msym String             -- =, +, =>, ...
          | Msub Exp Exp            -- subscript
          | Msup Exp Exp            -- superscript
          | Mfen String Exp String  -- parens, braces, brackets, ...
    deriving (Show, Read)

type Metadata      = [(String, String)]
type Bibliography  = [(String, Txt)]
type Footnotes     = [(String, Doc)]

metadata, metatags :: Doc -> Metadata
metadata = fst . splitmetas
metatags = metadata

splitmetas :: Doc -> (Metadata, Doc)
splitmetas [] = ([], [])
splitmetas (x:xs) = case x of
    SECT _ _ {-_-} -> ([], x:xs)
    META k v   -> ((k,v):ms, xs')
    _          -> (ms, x:xs')
    where
    (ms, xs') = splitmetas xs

bibliography :: Doc -> Bibliography
--bibliography doc = concat [xs | BIBL xs <- doc]
bibliography = const []

footnotes :: Doc -> Footnotes
footnotes doc = concat [xs | FOOT xs <- doc]

doctitle :: Doc -> Txt
doctitle doc = case lookup "title" (metatags doc) of
    Nothing -> []
    Just x  -> [St x]

docsubtitle :: Doc -> Txt
docsubtitle doc = case lookup "subtitle" (metatags doc) of
    Nothing -> []
    Just x  -> [St x]

data TxtContext doc txt = TxtContext { txtfootnotes    :: [(String, doc)]
                                     , txtbibliography :: [(String, txt)]
                                     }

-- fold over a Doc's block-level (tree) structure
-- TODO: bibliography (in Doc type, then reenable above)
-- TODO: section subtitles (in Doc type, then reenable here)
folddoctree
  ::  (TxtContext doc txt -> Txt -> txt)   -- text segment

      -> (txt -> tok)                -- text block
      -> (String -> tok)             -- code block
      -> (doc -> tok)                -- blockquote
      -> ([doc] -> tok)              -- bulleted list
      -> ([doc] -> tok)              -- numbered list
      -> (Table txt -> tok)          -- table
      -> (FMode-> doc -> txt -> tok) -- floating body w. caption
      -> ([(String,txt)] -> tok)     -- bibliography

      -> (  txt                      -- para. title
            -> [tok]                 -- para. content
            -> para  )               -- paragraph

      -> (  Int                      -- sect. level
            -> txt                   -- sect. title
            -> txt                   -- sect. subtitle
            -> doc                   -- sect. content
            -> sect  )               -- section

      -> (  Metadata                 -- (sub)document metadata
            -> [para]                -- leading paragraphs
            -> [sect]                -- following sections
            -> doc  )                -- document

      -> Doc -> doc

folddoctree fl ft fc fq fb fn ftb ff fr fp fs fd doc =
    fst (folddoctree_wc fl ft fc fq fb fn ftb ff fr fp fs fd doc)
folddoctree_wc fl ft fc fq fb fn ftb ff fr fp fs fd doc
    = let ctx = (txtcontext doc) in (fdt ctx doc, ctx)
  where
  txtcontext d = ctx
    where
    ctx = TxtContext
      { txtfootnotes    = [(x, fdt (excf x ctx) y) | (x,y) <- footnotes d]
      , txtbibliography = [(x, fl  (excb x ctx) y) | (x,y) <- bibliography d]
      }
  exc x = filter (\(k,v) -> k/=x)
  excf x ctx = ctx { txtfootnotes = exc x (txtfootnotes ctx) }
  excb x ctx = ctx { txtbibliography = exc x (txtbibliography ctx) }

  fdt ctx d = fst (snip_doc ctx 0 d)

  snip_doc ctx l doc = (fd metas paras subsects, rest)
    where
    (metas, d) = splitmetas doc

    (paras,xs) = case d of
      []           -> ([], [])
      PARA t : d'  -> snip_paras ctx t d'
      _            -> snip_paras ctx [] d

    (subsects,rest) = case xs of
      []              -> ([], [])
      SECT m t {-st-} : xs'
        | m>l         -> snip_sects ctx l m t []{-st-} xs'
        | otherwise   -> ([], xs)
      _               -> error "shouldn't happen"

  snip_paras ctx t d = (fp (fl ctx t) p : ps, rest)
    where
    (p,xs) = snip_para ctx d

    (ps,rest) = case xs of
       []              -> ([], [])
       PARA s : xs'    -> snip_paras ctx s xs'
       SECT _ _ {-_-} : _  -> ([], xs)
       _               -> error "shouldn't happen"

  snip_para ctx d = (map tok (filter notfoot content), rest)
    where
    (content,rest) = break endpara d

    notfoot x = case x of FOOT _ -> False; _ -> True

    tok (TEXT t)      = ft (fl ctx t)
    tok (CODE s)      = fc s
    tok (QUOT x)      = fq (fdt (txtcontext x) x)
    tok (BULL xs)     = fb (map (fdt ctx) xs)
    tok (NUMB xs)     = fn (map (fdt ctx) xs)
    tok (TABL t)      = ftb (fmap (fl ctx) t)
    tok (FLOA m d c)  = ff m (fdt ctx d) (fl ctx c)
    --tok (BIBL xs)     = fr (map (\(s,t) -> (s, fl ctx t)) xs)
    tok (PARA _)      = error "shouldn't happen: PARA"
    tok (SECT _ _ {-_-})  = error "shouldn't happen: SECT"
    tok (META _ _)    = error "shouldn't happen: META"
    tok (FOOT _)      = error "shouldn't happen: FOOT"

    endpara (PARA _)      = True
    endpara (SECT _ _ {-_-})  = True
    endpara _             = False

  snip_sects ctx l m t st d = (fs m (fl ctx t) (fl ctx st) s : ss, rest)
    where
    (s,xs) = snip_doc ctx m d

    (ss,rest) = case xs of
      []              ->  ([], [])
      SECT n t {-st-} : xs'
        | n>l         -> snip_sects ctx l n t []{-st-} xs'
        | otherwise   -> ([], xs)
      _ -> error "shouldn't happen"


---- Document parser

-- some notation
str      = Frisby.text  -- avoids name clash
p ++> q  = p <> q ## uncurry (:)          -- parser "cons"
opt p    = p //> []
dnm      = doesNotMatch                   -- just a shortcut
block l  = many1 l ## concat

infixr 3 ++>

-- some common parsing expressions
space      = noneOf "\n\r" `onlyIf` isSpace
newline    = str "\r\n" // str "\n\r" // str "\n" // str "\r"
eol        = newline // (eof ##> "")
eow        = eol // (space ## return)
identifier = alpha <> many1 (alphaNum // oneOf "_") ## uncurry (:)


-- frisby helpers
many1Until end p = do loop <- manyUntil end p
                      return (dnm end ->> loop)

sepBy p q = p <> many (q ->> p) ## uncurry (:)

strbetw s t = do loop <- many1Until (str t) anyChar
                 return (str s ->> loop <<- str t)

txtbetw s t = liftM (## parsetxt) (strbetw s t)
strin   s   = strbetw s s
txtin   s   = txtbetw s s

p_txt :: PM s (P s Txt)
p_txt = mdo
    let sps = "/_\"'|[]{}<>^"   -- special characters
        exs = "'/_"             -- exceptions that may appear within words

    -- punctuation strings that are converted to unicode equivalents
    dots        <- newRule $ str "..." ##> '\x2026'
    emdash      <- newRule $ str "--"  ##> '\x2014'
    unipunc     <- newRule $ dots // emdash

    -- plain string parsing
    -- convoluted because of the 'exs' (see above)
    -- also needs to stop after whitespace, to reenter ltok loop below
    sschar      <- newRule $ space // newline ##> '\n'
    sachar      <- newRule $ alphaNum
    sochar      <- newRule $ unipunc // noneOf sps `onlyIf` (not . isSpace)
    sechar      <- newRule $ oneOf exs <<- matches (sachar // oneOf exs)
    stspace     <- newRule $ sschar ++> (stspace //> "")
    stalpha     <- newRule $ sachar ++> (st' // stexcept //> "")
    stother     <- newRule $ sochar ++> (st' //> "")
    stexcept    <- newRule $ sechar ++> (st' // stexcept //> "")

    st'         <- newRule $ stspace // stalpha // stother
    st          <- newRule $ st' ## St

    em1         <- txtin "/"
    em2         <- txtin "_"
    bd1         <- txtin "*"
    qu1         <- txtin "\""
    co1         <- strin "'"
    co2         <- strin "|"
    em          <- newRule $ (em1 // em2) ## Em
    bd          <- newRule $ bd1 ## Bd
    co          <- newRule $ (co1 // co2) ## Co
    qu          <- newRule $ qu1 ## Qu
    an          <- newRule $ char '>' ->> identifier <<- char '<' ## An
    xr          <- newRule $ char '>' ->> identifier ## Xr 
    fn          <- newRule $ char '^' ->> many1 digit ## Fn
    im          <- p_im
    nl          <- newRule $ str "//" ->> many space ->> eol ##> Nl

    group       <- txtbetw "{" "}"
    hyurl       <- strbetw "[" "]"
    hytxt       <- newRule $ group // (linkable ## return)
    hy          <- newRule $ opt hytxt <> hyurl ## uncurry Hy
    lword       <- newRule $ many1 (sachar // sochar // sechar) ## St
    linkable    <- newRule $ lmarkup // lword

    lmarkup     <- newRule $ im // co // qu // em // bd
    ltok        <- newRule $ hy // nl // an // xr // fn // lmarkup // st

    -- basic Txt parser - stops on unrecognized formatting
    txt'        <- newRule $ many1 ltok ## foldr stcomb []

    -- top level Txt parser - always eats entire input
    -- if parsing fails: consume one character as plain and go on
    fallback    <- newRule $ anyChar ## (St . return)
    txt         <- newRule $ many1 (ltok // fallback) ## foldr stcomb []
    return txt
    where
    stcomb (St a) (St b : xs) = St (a++b) : xs
    stcomb x xs = x:xs

-- reused in p_doc below
p_im :: PM s (P s LTok)
p_im = liftM (## Im) (strbetw "<<" ">>")


p_doc :: PM s (P s Doc)
p_doc = mdo
    emptyline   <- newRule $ many space <++> newline
    linerest    <- newRule $ many (noneOf "\n\r") <++> eol

    parathead   <- newRule $ str "**"
    parattail   <- newRule $ eol // str "**" <<- opt emptyline
    parattext   <- manyUntil parattail anyChar
    paratitle   <- newRule $ parathead ->> parattext <<- parattail ## parsetxt
    para        <- newRule $ many1 emptyline ->> opt paratitle ## PARA

    let hd c l = many space ->> many1 (char c) <<- many space <<- eol ##> l
    stxtitle    <- newRule $ textline ## parsetxt
    stxline     <- newRule $ hd '=' 1 // hd '-' 2
    stxhding    <- newRule $ stxtitle <> stxline
    sect        <- newRule $ stxhding ## \(t,l) -> SECT l t

    mmark       <- newRule $ str "--" <<- many space
    mkey        <- newRule $ many1 (alphaNum // oneOf "-_") <<- char ':'
    mval        <- newRule $ many space ->> many (noneOf "\n\r")
    meta        <- newRule $ mmark ->> mkey <> mval <<- eol ## uncurry META

    snip        <- newRule $ str "8<" // str ">8" // str "<" // str ">"
    fmark       <- newRule $ many1 (char '-') ->> snip <<- many (char '-') <<- many space <<- eol
    fdoc        <- many1Until fmark anyChar
    float       <- newRule $ (fmark <> (fdoc <> (fmark <> opt txt))) ## mkfloat
    im          <- p_im
    imline      <- newRule $ many space ->> im <<- many space <<- eol
    floatim     <- newRule $ many1 imline <> opt txt ## mkfloatim
    floattbl    <- newRule $ tabl <> txt ## \(t,c) -> FLOA NoFlow [t] c
    floa        <- newRule $ float // floatim // floattbl

    quotline    <- newRule $ char '>' ->> (eol // char ' ' ->> linerest)
    quot        <- newRule $ block quotline ## QUOT . parsedoc

    codeinden   <- newRule $ str "    " ->> dnm emptyline ->> linerest
    codeblank   <- newRule $ emptyline <<- matches codeinden ##> "\n"
    codeline    <- newRule $ codeblank // codeinden
    code        <- newRule $ block codeline ## CODE

    hanginden   <- newRule $ char '\t' ->> linerest
    hangblank   <- newRule $ emptyline <<- matches hangline ##> "\n"
    hangline    <- newRule $ hangblank // hanginden
    hangblock   <- newRule $ linerest ++> many hangline ## concat

    bullet      <- newRule $ oneOf "*+-"
    bullhead    <- newRule $ many (char ' ') ->> bullet <<- many1 space
    bullitem    <- newRule $ bullhead ->> hangblock ## parsedoc
    bull        <- newRule $ many1 bullitem ## BULL

    number      <- newRule $ many1 digit <<- oneOf ".)"
    numbhead    <- newRule $ many (char ' ') ->> number <<- many1 space
    numbitem    <- newRule $ numbhead ->> hangblock ## parsedoc
    numb        <- newRule $ many1 numbitem ## NUMB

    tmark       <- newRule $ many1 (char '=') ->> char '#' ->> many (oneOf "=#")
    colspecs    <- newRule $ many (oneOf "|lcr") ## map mkcolspec
    tstart      <- newRule $ tmark ->> colspecs <<- many space <<- eol
    tend        <- newRule $ tmark <<- many space <<- eol
    row         <- newRule $ dnm tend ->> linerest ## mkrow
    tabl        <- newRule $ tstart <> many row <<- tend ## mktabl

    foothead    <- newRule $ many space ->> char '^' ->> many1 digit <<- char ':'
    footitem    <- newRule $ foothead <> hangblock ## \(n,d) -> (n, parsedoc d)
    foot        <- newRule $ many1 footitem ## FOOT

    breaker     <- newRule $ para // quot // code // bull // numb // foot // tabl
    textline    <- newRule $ dnm breaker ->> dnm eof ->> linerest
    txt         <- newRule $ block textline ## parsetxt
    text        <- newRule $ txt ## TEXT

    btok        <- newRule $ meta // para // sect // floa // quot // code //
                             bull // numb // tabl // foot // text
    doc         <- newRule $ many btok
    return doc
    where
    mkfloat (x,(d,(y,c))) = FLOA (fmode x y) (parsedoc d) c
    fmode "<" "<" = FlowRight
    fmode ">" ">" = FlowLeft
    fmode _ _     = NoFlow
    mkfloatim (ims,c) = FLOA NoFlow [TEXT ims] c
    mktabl (cs,rs) = TABL (Table cs rs)
    mkrow = Row . map (Cell . parsetxt) . splitOn '\t'
    mkcolspec '|' = Vrule
    mkcolspec 'l' = Colleft
    mkcolspec 'c' = Colcenter
    mkcolspec 'r' = Colright

-- somehow you just always end up needing this thing
splitOn z [] = [[]]
splitOn z (x:xs)
    | x==z      = [] : splitOn z xs
    | otherwise = let (a:as) = splitOn z xs in (x:a) : as

p_exp :: PM s (P s Exp)
p_exp = mdo
    -- TODO: parser for Mtxt, Msub, Msup, Mfen
    ws      <- newRule $ many (anyChar `onlyIf` isSpace)
    opr     <- newRule $ ws ->> choice (map str (words opers)) ## Mopr
    var     <- newRule $ ws ->> alpha ## (\x -> Mvar [x])
    sym     <- newRule $ ws ->> many1 (anyChar `onlyIf` (not . isSpace)) ## Msym
    mtok    <- newRule $ opr // var // sym
    exp     <- newRule $ many mtok
    return exp
    where
    opers = "exp log ln ld sin cos tan arcsin arccos arctan sinh cosh tanh lim\
            \inf sup liminf limsup div grad D d"


parsetxt :: String -> Txt
parsedoc :: String -> Doc
parseexp :: String -> Exp
parsetxt = runPeg p_txt
parsedoc = runPeg p_doc
parseexp = runPeg p_exp


---- reading from files (including UTF-8 handling)

readctxfile :: FilePath -> IO Doc
readctxfile f =
    readFile f >>= (return . parsedoc . decode . map (fromIntegral.ord))


---- XHTML output

txttohtml :: Txt -> String
doctohtml :: Doc -> String
exptohtml :: Exp -> String

txttohtml = xmlout . txttoelems
txttoelems = concatMap ltoktohtml

ltoktohtml :: LTok -> [Element]
ltoktohtml tok = case tok of
    St s    -> [cdata s]
    Hy [] u -> [tag "a" ("href" := u) (txttoelems [St u])]
    Hy t u  -> [tag "a" ("href" := u) (txttoelems t)]
    Im u    -> [tag "img" ("src" := u) ("alt" := basename u)]
    Nl      -> [tag "br"]
    Co s    -> [tag "span" ("style" := "font-family:monospace") s]
    Qu t    -> [tag "q" (txttoelems t)]
    Em t    -> [tag "em" (txttoelems t)]
    Bd t    -> [tag "strong" (txttoelems t)]
    Ma e    -> [tag "math" ("xmlns" := mathml) (exptoelems e)]
    -- fallback
    x       -> [tag "span" ("style" := "font-family:monospace; color:red") (show x)]
    where
    mathml = "http://www.w3.org/1998/Math/MathML"

basename = reverse . takeWhile (/='/') . reverse

doctohtml = xmlout . doctoelems
doctoelems = concatMap btoktohtml

btoktohtml :: BTok -> [Element]
btoktohtml tok = case tok of
    TEXT t      -> txttoelems t
    PARA []     -> [tag "p"]
    PARA t      -> [tag "p", tag "b" (txttoelems t), cdata "\n"]
    SECT l t    -> [tag ('h':show l) (txttoelems t)]
    META _ _    -> [cdata ""]
    FLOA m d c  ->
        let content = tag "div" ("class" := "floatcontent")
                                (doctoelems d)
            caption = tag "div" ("class" := "floatcaption")
                                (txttoelems c) :: Element
        in
        [tag "div" ("class" := "float")
                   ("style" := ("float:" ++ floatmode m))
                   (content : if null c then [] else [caption])
        ]
    QUOT d      -> [tag "blockquote" (doctoelems d)]
    CODE s      -> [tag "pre" [tag "code" s :: Element]]
    BULL ds     -> [tag "ul" (map (tag "li" . doctoelems) ds :: [Element])]
    -- fallback
    x       -> [tag "pre" ("style" := "color:red") (show x)]
    where
    floatmode FlowRight = "left"    -- yes, the meaning is reversed in CSS
    floatmode FlowLeft  = "right"
    floatmode NoFlow    = "none"

exptohtml = xmlout . exptoelems
exptoelems = concatMap mtoktohtml

mtoktohtml :: MTok -> [Element]
mtoktohtml = error "TODO: implement mtoktohtml"


---- ASCII output

txttoascii :: Txt -> String
doctoascii :: Doc -> String
exptoascii :: Exp -> String

txttoascii = concatMap ltoktoascii
doctoascii = concatMap btoktoascii
exptoascii = concatMap mtoktoascii

ltoktoascii :: LTok -> String
ltoktoascii tok = case tok of
    St s    -> s
    Hy t u  -> let t' = txttoascii t
                 in if any isSpace t' then "\""++t'++"\"" else t'
                    ++ "[" ++ u ++ "]"
    Im u    -> "<<" ++ u ++ ">>"
    Nl      -> "//\n"
    Co s    -> "'" ++ s ++ "'"
    Qu t    -> "\"" ++ txttoascii t ++ "\""
    Em t    -> "/" ++ txttoascii t ++ "/"
    Bd t    -> "*" ++ txttoascii t ++ "*"
    Ma e    -> "`" ++ exptoascii e ++ "`"
    -- fallback
    x       -> "{{" ++ show x ++ "}}"

btoktoascii :: BTok -> String
btoktoascii tok = case tok of
    TEXT t      -> txttoascii t
    PARA []     -> "\n"
    PARA t      -> "\n**" ++ txttoascii t ++ "**"
    SECT 1 t    -> let t' = txttoascii t in t' ++ map (const '=') t'
    SECT 2 t    -> let t' = txttoascii t in t' ++ map (const '-') t'
    META k v    -> "-- " ++ k ++ ":\t" ++ v ++ "\n"
    FLOA NoFlow [TEXT [Im u]] c
                -> ltoktoascii (Im u) ++ "\n" ++ txttoascii c ++ "\n"
    FLOA m d c  -> "---" ++ floatmode m ++ "-----\n"
                   ++ doctoascii d ++
                   "---" ++ floatmode m ++ "-----\n"
                   ++ txttoascii c ++ "\n"
    QUOT d      -> unlines $ map ("> "++) $ lines $ doctoascii d
    CODE s      -> unlines $ map ("    "++) $ lines s
    BULL ds     -> unlines $ map (hang " -" . doctoascii) ds
    -- fallback
    x       -> "{{" ++ show x ++ "}}\n"
    where
    floatmode FlowRight = ">>"
    floatmode FlowLeft  = "<<"
    floatmode NoFlow    = "8<"
    hang h s = case lines s of
        []     -> ""
        (x:xs) -> unlines $ (h++"\t"++x) : map ('\t':) xs

mtoktoascii :: MTok -> String
mtoktoascii = error "TODO: implement mtoktoascii"
