{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE TupleSections     #-}

module Text.XmlHtml.HTML.Parse where

import           Control.Applicative
import           Control.Monad
import           Data.Char
import           Data.List
import           Data.Maybe
import           Text.XmlHtml.Common
import           Text.XmlHtml.HTML.Meta
import           Text.XmlHtml.TextParser
import qualified Text.XmlHtml.XML.Parse as XML

import qualified Text.Parsec as P

import qualified Data.HashSet as S
import qualified Data.HashMap.Strict as M
import qualified Data.Map as Map

import           Data.Text (Text)
import qualified Data.Text as T


------------------------------------------------------------------------------
-- | HTML version of document fragment parsing rule  It differs only in that
-- it parses the HTML version of 'content' and returns an 'HtmlDocument'.
docFragment :: Encoding -> Parser Document
docFragment :: Encoding -> Parser Document
docFragment Encoding
e = do
    (dt, nodes1)      <- Parser (Maybe DocType, [Node])
prolog
    (nodes2, Matched) <- content Nothing
    return $ HtmlDocument e dt (nodes1 ++ nodes2)


------------------------------------------------------------------------------
-- Parsing code                                                             --
------------------------------------------------------------------------------

{-
    The following are the differences between this code and the straight XML
    parsing code.

    1. HTML void tags (area, base, etc.) are always treated as empty tags,
       regardless of whether they have the empty-tag slash.

    2. HTML raw text tags (script and style) are parsed as straight text
       with neither markup nor references, except that they end at the first
       syntactically valid matching end tag.

    3. End tags need only match their corresponding start tags in a case
       insensitive comparison.  In case they are different, the start tag is
       used for the element tag name.

    4. Hexadecimal char references may use &#X...; (capital X)  -- DONE

    5. Attribute names are allowed to consist of any text except for control
       characters, space, '\"', '\'', '>', '/', or '='.

    6. Empty attribute syntax is allowed (an attribute not followed by an eq).
       In this case, the attribute value is considered to be the empty string.

    7. Quoted attribute syntax is relaxed to allow any character except for
       the matching quote.  References are allowed.

    8. Attribute values may be unquoted.  In this case, the attribute value
       may not contain space, single or double quotes, '=', '<', '>', or '`',
       and may not be the empty string.  It can still contain references.

    9. There are many more character references available.

    10. Only "ambiguous" ampersands are prohibited in character data.  This
        means ampersands that parse like character or entity references.

    11. Omittable end tags are inserted automatically.

    12. DOCTYPE tags matched with case insensitive keywords.
-}


------------------------------------------------------------------------------
prolog :: Parser (Maybe DocType, [Node])
prolog :: Parser (Maybe DocType, [Node])
prolog = do
    _      <- ParsecT Text () Identity (Maybe Text)
-> ParsecT Text () Identity (Maybe (Maybe Text))
forall (f :: * -> *) a. Alternative f => f a -> f (Maybe a)
optional ParsecT Text () Identity (Maybe Text)
XML.xmlDecl
    nodes1 <- many XML.misc
    rest   <- optional $ do
        dt     <- docTypeDecl
        nodes2 <- many XML.misc
        return (dt, nodes2)
    case rest of
        Maybe (DocType, [Maybe Node])
Nothing           -> (Maybe DocType, [Node]) -> Parser (Maybe DocType, [Node])
forall a. a -> ParsecT Text () Identity a
forall (m :: * -> *) a. Monad m => a -> m a
return (Maybe DocType
forall a. Maybe a
Nothing, [Maybe Node] -> [Node]
forall a. [Maybe a] -> [a]
catMaybes [Maybe Node]
nodes1)
        Just (DocType
dt, [Maybe Node]
nodes2) -> (Maybe DocType, [Node]) -> Parser (Maybe DocType, [Node])
forall a. a -> ParsecT Text () Identity a
forall (m :: * -> *) a. Monad m => a -> m a
return (DocType -> Maybe DocType
forall a. a -> Maybe a
Just DocType
dt, [Maybe Node] -> [Node]
forall a. [Maybe a] -> [a]
catMaybes ([Maybe Node]
nodes1 [Maybe Node] -> [Maybe Node] -> [Maybe Node]
forall a. [a] -> [a] -> [a]
++ [Maybe Node]
nodes2))


------------------------------------------------------------------------------
-- | Internal subset is parsed, but ignored since we don't have data types to
-- store it.
docTypeDecl :: Parser DocType
docTypeDecl :: Parser DocType
docTypeDecl = do
    ParsecT Text () Identity () -> ParsecT Text () Identity ()
forall s u (m :: * -> *) a. ParsecT s u m a -> ParsecT s u m a
P.try (ParsecT Text () Identity () -> ParsecT Text () Identity ())
-> ParsecT Text () Identity () -> ParsecT Text () Identity ()
forall a b. (a -> b) -> a -> b
$ do
        _      <- Text -> Parser Text
text Text
"<!"
        decl   <- XML.name
        when (T.toLower decl /= "doctype") $ fail "Expected DOCTYPE"
    ParsecT Text () Identity ()
XML.whiteSpace
    tag    <- Parser Text
XML.name
    _      <- optional XML.whiteSpace
    extid  <- externalID
    _      <- optional XML.whiteSpace
    intsub <- XML.internalDoctype
    _      <- P.char '>'
    return (DocType tag extid intsub)


------------------------------------------------------------------------------
externalID :: Parser ExternalID
externalID :: Parser ExternalID
externalID = do
    tok  <- Parser Text -> ParsecT Text () Identity (Maybe Text)
forall (f :: * -> *) a. Alternative f => f a -> f (Maybe a)
optional (Parser Text -> ParsecT Text () Identity (Maybe Text))
-> Parser Text -> ParsecT Text () Identity (Maybe Text)
forall a b. (a -> b) -> a -> b
$ Text -> Text
T.toLower (Text -> Text) -> Parser Text -> Parser Text
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Parser Text
XML.name
    case tok of
        Just Text
"system" -> Parser ExternalID
systemID
        Just Text
"public" -> Parser ExternalID
publicID
        Just Text
_        -> String -> Parser ExternalID
forall a. String -> ParsecT Text () Identity a
forall (m :: * -> *) a. MonadFail m => String -> m a
fail String
"Expected SYSTEM or PUBLIC"
        Maybe Text
Nothing       -> ExternalID -> Parser ExternalID
forall a. a -> ParsecT Text () Identity a
forall (m :: * -> *) a. Monad m => a -> m a
return ExternalID
NoExternalID
  where
    systemID :: Parser ExternalID
systemID = do
        ParsecT Text () Identity ()
XML.whiteSpace
        Text -> ExternalID
System (Text -> ExternalID) -> Parser Text -> Parser ExternalID
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Parser Text
XML.systemLiteral
    publicID :: Parser ExternalID
publicID = do
        ParsecT Text () Identity ()
XML.whiteSpace
        pid <- Parser Text
XML.pubIdLiteral
        XML.whiteSpace
        sid <- XML.systemLiteral
        return (Public pid sid)


------------------------------------------------------------------------------
-- | When parsing an element, three things can happen (besides failure):
--
-- (1) The end tag matches the start tag.  This is a Matched.
--
-- (2) The end tag does not match, but the element has an end tag that can be
-- omitted when there is no more content in its parent.  This is an
-- ImplicitLast.  In this case, we need to remember the tag name of the
-- end tag that we did find, so as to match it later.
--
-- (3) A start tag is found such that it implicitly ends the current element.
-- This is an ImplicitNext.  In this case, we parse and remember the
-- entire element that comes next, so that it can be inserted after the
-- element being parsed.
data ElemResult = Matched
                | ImplicitLast Text
                | ImplicitNext Text Text [(Text, Text)] Bool


------------------------------------------------------------------------------
finishElement :: Text -> Text -> [(Text, Text)] -> Bool
              -> Parser (Node, ElemResult)
finishElement :: Text -> Text -> [(Text, Text)] -> Bool -> Parser (Node, ElemResult)
finishElement Text
t Text
tbase [(Text, Text)]
a Bool
b = do
    if Bool
b then (Node, ElemResult) -> Parser (Node, ElemResult)
forall a. a -> ParsecT Text () Identity a
forall (m :: * -> *) a. Monad m => a -> m a
return (Text -> [(Text, Text)] -> [Node] -> Node
Element Text
t [(Text, Text)]
a [], ElemResult
Matched)
         else Parser (Node, ElemResult)
nonEmptyElem
  where
    nonEmptyElem :: Parser (Node, ElemResult)
nonEmptyElem
        | Text -> [(Text, Text)] -> Bool
isRawText Text
tbase [(Text, Text)]
a = do
            c <- String -> Parser ElemResult -> Parser Node
forall a. String -> Parser a -> Parser Node
XML.cdata  String
"<"  (Parser ElemResult -> Parser Node)
-> Parser ElemResult -> Parser Node
forall a b. (a -> b) -> a -> b
$ Parser ElemResult -> Parser ElemResult
forall s u (m :: * -> *) a. ParsecT s u m a -> ParsecT s u m a
P.try (Text -> Parser ElemResult
endTag Text
t)
            return (Element t a [c], Matched)
        | Text
tbase Text -> HashSet Text -> Bool
forall a. (Eq a, Hashable a) => a -> HashSet a -> Bool
`S.member` HashSet Text
endOmittableLast = (Parser ElemResult -> ParsecT Text () Identity (Maybe ElemResult))
-> Parser (Node, ElemResult)
tagContents Parser ElemResult -> ParsecT Text () Identity (Maybe ElemResult)
forall (f :: * -> *) a. Alternative f => f a -> f (Maybe a)
optional
        | Bool
otherwise = (Parser ElemResult -> ParsecT Text () Identity (Maybe ElemResult))
-> Parser (Node, ElemResult)
tagContents ((ElemResult -> Maybe ElemResult)
-> Parser ElemResult -> ParsecT Text () Identity (Maybe ElemResult)
forall a b.
(a -> b)
-> ParsecT Text () Identity a -> ParsecT Text () Identity b
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap ElemResult -> Maybe ElemResult
forall a. a -> Maybe a
Just)
    tagContents :: (Parser ElemResult -> ParsecT Text () Identity (Maybe ElemResult))
-> Parser (Node, ElemResult)
tagContents Parser ElemResult -> ParsecT Text () Identity (Maybe ElemResult)
modifier = do
        (c,r1) <- Maybe Text -> Parser ([Node], ElemResult)
content (Text -> Maybe Text
forall a. a -> Maybe a
Just Text
tbase)
        case r1 of
            ElemResult
Matched -> do
                r2 <- Parser ElemResult -> ParsecT Text () Identity (Maybe ElemResult)
modifier (Text -> Parser ElemResult
endTag Text
t)
                case r2 of
                    Maybe ElemResult
Nothing -> (Node, ElemResult) -> Parser (Node, ElemResult)
forall a. a -> ParsecT Text () Identity a
forall (m :: * -> *) a. Monad m => a -> m a
return (Text -> [(Text, Text)] -> [Node] -> Node
Element Text
t [(Text, Text)]
a [Node]
c, ElemResult
Matched)
                    Just ElemResult
rr -> (Node, ElemResult) -> Parser (Node, ElemResult)
forall a. a -> ParsecT Text () Identity a
forall (m :: * -> *) a. Monad m => a -> m a
return (Text -> [(Text, Text)] -> [Node] -> Node
Element Text
t [(Text, Text)]
a [Node]
c, ElemResult
rr)
            ImplicitLast Text
tag | Text -> Text
T.toCaseFold Text
tag Text -> Text -> Bool
forall a. Eq a => a -> a -> Bool
== Text -> Text
T.toCaseFold Text
t -> do
                (Node, ElemResult) -> Parser (Node, ElemResult)
forall a. a -> ParsecT Text () Identity a
forall (m :: * -> *) a. Monad m => a -> m a
return (Text -> [(Text, Text)] -> [Node] -> Node
Element Text
t [(Text, Text)]
a [Node]
c, ElemResult
Matched)
            ElemResult
end -> do
                (Node, ElemResult) -> Parser (Node, ElemResult)
forall a. a -> ParsecT Text () Identity a
forall (m :: * -> *) a. Monad m => a -> m a
return (Text -> [(Text, Text)] -> [Node] -> Node
Element Text
t [(Text, Text)]
a [Node]
c, ElemResult
end)


------------------------------------------------------------------------------
emptyOrStartTag :: Parser (Text, Text, [(Text, Text)], Bool)
emptyOrStartTag :: Parser (Text, Text, [(Text, Text)], Bool)
emptyOrStartTag = do
    t <- Parser Text -> Parser Text
forall s u (m :: * -> *) a. ParsecT s u m a -> ParsecT s u m a
P.try (Parser Text -> Parser Text) -> Parser Text -> Parser Text
forall a b. (a -> b) -> a -> b
$ Char -> ParsecT Text () Identity Char
forall s (m :: * -> *) u.
Stream s m Char =>
Char -> ParsecT s u m Char
P.char Char
'<' ParsecT Text () Identity Char -> Parser Text -> Parser Text
forall a b.
ParsecT Text () Identity a
-> ParsecT Text () Identity b -> ParsecT Text () Identity b
forall (f :: * -> *) a b. Applicative f => f a -> f b -> f b
*> Parser Text
XML.name
    let tbase = Text -> Text
T.toLower (Text -> Text) -> Text -> Text
forall a b. (a -> b) -> a -> b
$ (Text, Text) -> Text
forall a b. (a, b) -> b
snd ((Text, Text) -> Text) -> (Text, Text) -> Text
forall a b. (a -> b) -> a -> b
$ HasCallStack => Text -> Text -> (Text, Text)
Text -> Text -> (Text, Text)
T.breakOnEnd Text
":" Text
t
    a <- many $ P.try $ do
        XML.whiteSpace
        attribute
    when (hasDups a) $ fail "Duplicate attribute names in element"
    _ <- optional XML.whiteSpace
    e <- fmap isJust $ optional (P.char '/')
    let e' = Bool
e Bool -> Bool -> Bool
|| (Text
tbase Text -> HashSet Text -> Bool
forall a. (Eq a, Hashable a) => a -> HashSet a -> Bool
`S.member` HashSet Text
voidTags)
    _ <- P.char '>'
    return (t, tbase, a, e')
  where
    hasDups :: [(a, b)] -> Bool
hasDups [(a, b)]
a = [a] -> Int
forall a. [a] -> Int
forall (t :: * -> *) a. Foldable t => t a -> Int
length ([a] -> [a]
forall a. Eq a => [a] -> [a]
nub (((a, b) -> a) -> [(a, b)] -> [a]
forall a b. (a -> b) -> [a] -> [b]
map (a, b) -> a
forall a b. (a, b) -> a
fst [(a, b)]
a)) Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< [(a, b)] -> Int
forall a. [a] -> Int
forall (t :: * -> *) a. Foldable t => t a -> Int
length [(a, b)]
a


------------------------------------------------------------------------------
attrName :: Parser Text
attrName :: Parser Text
attrName = (Char -> Bool) -> Parser Text
takeWhile1 Char -> Bool
isAttrName
  where isAttrName :: Char -> Bool
isAttrName Char
c | Char
c Char -> String -> Bool
forall a. Eq a => a -> [a] -> Bool
forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem` [Char
'\0',Char
' ',Char
'"',Char
'\'',Char
'>',Char
'/',Char
'='] = Bool
False
                     | Char -> Bool
isControlChar Char
c       = Bool
False
                     | Bool
otherwise             = Bool
True


------------------------------------------------------------------------------
-- | From 8.2.2.3 of the HTML 5 spec, omitting the very high control
-- characters because they are unlikely to occur and I got tired of typing.
isControlChar :: Char -> Bool
isControlChar :: Char -> Bool
isControlChar Char
c | Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
>= Char
'\x007F' Bool -> Bool -> Bool
&& Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= Char
'\x009F' = Bool
True
                | Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
>= Char
'\xFDD0' Bool -> Bool -> Bool
&& Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= Char
'\xFDEF' = Bool
True
                | Bool
otherwise                      = Bool
False


------------------------------------------------------------------------------
quotedAttrValue :: Parser Text
quotedAttrValue :: Parser Text
quotedAttrValue = Parser Text
singleQuoted Parser Text -> Parser Text -> Parser Text
forall a.
ParsecT Text () Identity a
-> ParsecT Text () Identity a -> ParsecT Text () Identity a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> Parser Text
doubleQuoted
  where
    singleQuoted :: Parser Text
singleQuoted = Char -> ParsecT Text () Identity Char
forall s (m :: * -> *) u.
Stream s m Char =>
Char -> ParsecT s u m Char
P.char Char
'\'' ParsecT Text () Identity Char -> Parser Text -> Parser Text
forall a b.
ParsecT Text () Identity a
-> ParsecT Text () Identity b -> ParsecT Text () Identity b
forall (f :: * -> *) a b. Applicative f => f a -> f b -> f b
*> String -> Parser Text
forall {t :: * -> *}. Foldable t => t Char -> Parser Text
refTill [Char
'&',Char
'\''] Parser Text -> ParsecT Text () Identity Char -> Parser Text
forall a b.
ParsecT Text () Identity a
-> ParsecT Text () Identity b -> ParsecT Text () Identity a
forall (f :: * -> *) a b. Applicative f => f a -> f b -> f a
<* Char -> ParsecT Text () Identity Char
forall s (m :: * -> *) u.
Stream s m Char =>
Char -> ParsecT s u m Char
P.char Char
'\''
    doubleQuoted :: Parser Text
doubleQuoted = Char -> ParsecT Text () Identity Char
forall s (m :: * -> *) u.
Stream s m Char =>
Char -> ParsecT s u m Char
P.char Char
'"'  ParsecT Text () Identity Char -> Parser Text -> Parser Text
forall a b.
ParsecT Text () Identity a
-> ParsecT Text () Identity b -> ParsecT Text () Identity b
forall (f :: * -> *) a b. Applicative f => f a -> f b -> f b
*> String -> Parser Text
forall {t :: * -> *}. Foldable t => t Char -> Parser Text
refTill [Char
'&',Char
'"']  Parser Text -> ParsecT Text () Identity Char -> Parser Text
forall a b.
ParsecT Text () Identity a
-> ParsecT Text () Identity b -> ParsecT Text () Identity a
forall (f :: * -> *) a b. Applicative f => f a -> f b -> f a
<* Char -> ParsecT Text () Identity Char
forall s (m :: * -> *) u.
Stream s m Char =>
Char -> ParsecT s u m Char
P.char Char
'"'
    refTill :: t Char -> Parser Text
refTill t Char
end = [Text] -> Text
T.concat ([Text] -> Text) -> ParsecT Text () Identity [Text] -> Parser Text
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Parser Text -> ParsecT Text () Identity [Text]
forall a.
ParsecT Text () Identity a -> ParsecT Text () Identity [a]
forall (f :: * -> *) a. Alternative f => f a -> f [a]
many
        ((Char -> Bool) -> Parser Text
takeWhile1 (Bool -> Bool
not (Bool -> Bool) -> (Char -> Bool) -> Char -> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Char -> t Char -> Bool
forall a. Eq a => a -> t a -> Bool
forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem` t Char
end)) Parser Text -> Parser Text -> Parser Text
forall a.
ParsecT Text () Identity a
-> ParsecT Text () Identity a -> ParsecT Text () Identity a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> Parser Text
reference)


------------------------------------------------------------------------------
unquotedAttrValue :: Parser Text
unquotedAttrValue :: Parser Text
unquotedAttrValue = String -> Parser Text
forall {t :: * -> *}. Foldable t => t Char -> Parser Text
refTill [Char
' ',Char
'"',Char
'\'',Char
'=',Char
'<',Char
'>',Char
'&',Char
'`']
  where
    refTill :: t Char -> Parser Text
refTill t Char
end = [Text] -> Text
T.concat ([Text] -> Text) -> ParsecT Text () Identity [Text] -> Parser Text
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Parser Text -> ParsecT Text () Identity [Text]
forall a.
ParsecT Text () Identity a -> ParsecT Text () Identity [a]
forall (f :: * -> *) a. Alternative f => f a -> f [a]
some
        ((Char -> Bool) -> Parser Text
takeWhile1 (Bool -> Bool
not (Bool -> Bool) -> (Char -> Bool) -> Char -> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Char -> t Char -> Bool
forall a. Eq a => a -> t a -> Bool
forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem` t Char
end)) Parser Text -> Parser Text -> Parser Text
forall a.
ParsecT Text () Identity a
-> ParsecT Text () Identity a -> ParsecT Text () Identity a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> Parser Text
reference)


------------------------------------------------------------------------------
attrValue :: Parser Text
attrValue :: Parser Text
attrValue = Parser Text
quotedAttrValue Parser Text -> Parser Text -> Parser Text
forall a.
ParsecT Text () Identity a
-> ParsecT Text () Identity a -> ParsecT Text () Identity a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> Parser Text
unquotedAttrValue


------------------------------------------------------------------------------
attribute :: Parser (Text, Text)
attribute :: ParsecT Text () Identity (Text, Text)
attribute = do
    n <- Parser Text
attrName
    v <- optional $ do
        _ <- P.try $ do
            _ <- optional XML.whiteSpace
            P.char '='
        _ <- optional XML.whiteSpace
        attrValue
    return $ maybe (n,"") (n,) v


------------------------------------------------------------------------------
endTag :: Text -> Parser ElemResult
endTag :: Text -> Parser ElemResult
endTag Text
s = do
    _ <- Text -> Parser Text
text Text
"</"
    t <- XML.name
    let sbase = Text -> Text
T.toLower (Text -> Text) -> Text -> Text
forall a b. (a -> b) -> a -> b
$ (Text, Text) -> Text
forall a b. (a, b) -> b
snd ((Text, Text) -> Text) -> (Text, Text) -> Text
forall a b. (a -> b) -> a -> b
$ HasCallStack => Text -> Text -> (Text, Text)
Text -> Text -> (Text, Text)
T.breakOnEnd Text
":" Text
s
    r <- if (T.toCaseFold s == T.toCaseFold t)
            then return Matched
            else if sbase `S.member` endOmittableLast
                then return (ImplicitLast t)
                else fail $ "mismatched tags: </" ++ T.unpack t ++
                            "> found inside <" ++ T.unpack s ++ "> tag"
    _ <- optional XML.whiteSpace
    _ <- text ">"
    return r


------------------------------------------------------------------------------
content :: Maybe Text -> Parser ([Node], ElemResult)
content :: Maybe Text -> Parser ([Node], ElemResult)
content Maybe Text
parent = do
    (ns, end) <- ParsecT Text () Identity ([Maybe Node], ElemResult)
readText
    return (coalesceText (catMaybes ns), end)
  where
    readText :: ParsecT Text () Identity ([Maybe Node], ElemResult)
readText     = do
        s <- Parser Node -> ParsecT Text () Identity (Maybe Node)
forall (f :: * -> *) a. Alternative f => f a -> f (Maybe a)
optional Parser Node
XML.charData
        t <- optional whileMatched
        case t of
            Maybe ([Maybe Node], ElemResult)
Nothing      -> ([Maybe Node], ElemResult)
-> ParsecT Text () Identity ([Maybe Node], ElemResult)
forall a. a -> ParsecT Text () Identity a
forall (m :: * -> *) a. Monad m => a -> m a
return ([Maybe Node
s], ElemResult
Matched)
            Just ([Maybe Node]
tt, ElemResult
m) -> ([Maybe Node], ElemResult)
-> ParsecT Text () Identity ([Maybe Node], ElemResult)
forall a. a -> ParsecT Text () Identity a
forall (m :: * -> *) a. Monad m => a -> m a
return (Maybe Node
sMaybe Node -> [Maybe Node] -> [Maybe Node]
forall a. a -> [a] -> [a]
:[Maybe Node]
tt, ElemResult
m)

    whileMatched :: ParsecT Text () Identity ([Maybe Node], ElemResult)
whileMatched = do
        (n,end) <- (,ElemResult
Matched) ([Maybe Node] -> ([Maybe Node], ElemResult))
-> (Maybe Node -> [Maybe Node])
-> Maybe Node
-> ([Maybe Node], ElemResult)
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> (Maybe Node -> [Maybe Node] -> [Maybe Node]
forall a. a -> [a] -> [a]
:[]) (Maybe Node -> ([Maybe Node], ElemResult))
-> (Node -> Maybe Node) -> Node -> ([Maybe Node], ElemResult)
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Node -> Maybe Node
forall a. a -> Maybe a
Just (Node -> ([Maybe Node], ElemResult))
-> (Text -> Node) -> Text -> ([Maybe Node], ElemResult)
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Text -> Node
TextNode (Text -> ([Maybe Node], ElemResult))
-> Parser Text
-> ParsecT Text () Identity ([Maybe Node], ElemResult)
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Parser Text
reference
               ParsecT Text () Identity ([Maybe Node], ElemResult)
-> ParsecT Text () Identity ([Maybe Node], ElemResult)
-> ParsecT Text () Identity ([Maybe Node], ElemResult)
forall a.
ParsecT Text () Identity a
-> ParsecT Text () Identity a -> ParsecT Text () Identity a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> (,ElemResult
Matched) ([Maybe Node] -> ([Maybe Node], ElemResult))
-> (Maybe Node -> [Maybe Node])
-> Maybe Node
-> ([Maybe Node], ElemResult)
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> (Maybe Node -> [Maybe Node] -> [Maybe Node]
forall a. a -> [a] -> [a]
:[]) (Maybe Node -> ([Maybe Node], ElemResult))
-> ParsecT Text () Identity (Maybe Node)
-> ParsecT Text () Identity ([Maybe Node], ElemResult)
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> ParsecT Text () Identity (Maybe Node)
XML.cdSect
               ParsecT Text () Identity ([Maybe Node], ElemResult)
-> ParsecT Text () Identity ([Maybe Node], ElemResult)
-> ParsecT Text () Identity ([Maybe Node], ElemResult)
forall a.
ParsecT Text () Identity a
-> ParsecT Text () Identity a -> ParsecT Text () Identity a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> (,ElemResult
Matched) ([Maybe Node] -> ([Maybe Node], ElemResult))
-> (Maybe Node -> [Maybe Node])
-> Maybe Node
-> ([Maybe Node], ElemResult)
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> (Maybe Node -> [Maybe Node] -> [Maybe Node]
forall a. a -> [a] -> [a]
:[]) (Maybe Node -> ([Maybe Node], ElemResult))
-> ParsecT Text () Identity (Maybe Node)
-> ParsecT Text () Identity ([Maybe Node], ElemResult)
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> ParsecT Text () Identity (Maybe Node)
XML.processingInstruction
               ParsecT Text () Identity ([Maybe Node], ElemResult)
-> ParsecT Text () Identity ([Maybe Node], ElemResult)
-> ParsecT Text () Identity ([Maybe Node], ElemResult)
forall a.
ParsecT Text () Identity a
-> ParsecT Text () Identity a -> ParsecT Text () Identity a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> (,ElemResult
Matched) ([Maybe Node] -> ([Maybe Node], ElemResult))
-> (Maybe Node -> [Maybe Node])
-> Maybe Node
-> ([Maybe Node], ElemResult)
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> (Maybe Node -> [Maybe Node] -> [Maybe Node]
forall a. a -> [a] -> [a]
:[]) (Maybe Node -> ([Maybe Node], ElemResult))
-> ParsecT Text () Identity (Maybe Node)
-> ParsecT Text () Identity ([Maybe Node], ElemResult)
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> ParsecT Text () Identity (Maybe Node)
XML.comment
               ParsecT Text () Identity ([Maybe Node], ElemResult)
-> ParsecT Text () Identity ([Maybe Node], ElemResult)
-> ParsecT Text () Identity ([Maybe Node], ElemResult)
forall a.
ParsecT Text () Identity a
-> ParsecT Text () Identity a -> ParsecT Text () Identity a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> ParsecT Text () Identity ([Maybe Node], ElemResult)
doElement
        case end of
            ElemResult
Matched -> do
                (ns, end') <- ParsecT Text () Identity ([Maybe Node], ElemResult)
readText
                return (n ++ ns, end')
            ElemResult
_ -> do
                ([Maybe Node], ElemResult)
-> ParsecT Text () Identity ([Maybe Node], ElemResult)
forall a. a -> ParsecT Text () Identity a
forall (m :: * -> *) a. Monad m => a -> m a
return ([Maybe Node]
n, ElemResult
end)

    doElement :: ParsecT Text () Identity ([Maybe Node], ElemResult)
doElement = do
        (t,tb, a,b) <- Parser (Text, Text, [(Text, Text)], Bool)
emptyOrStartTag
        handle t tb a b

    handle :: Text
-> Text
-> [(Text, Text)]
-> Bool
-> ParsecT Text () Identity ([Maybe Node], ElemResult)
handle Text
t Text
tb [(Text, Text)]
a Bool
b = do
        if Text -> Maybe Text -> Bool
breaksTag Text
tb Maybe Text
parent
            then ([Maybe Node], ElemResult)
-> ParsecT Text () Identity ([Maybe Node], ElemResult)
forall a. a -> ParsecT Text () Identity a
forall (m :: * -> *) a. Monad m => a -> m a
return ([Maybe Node
forall a. Maybe a
Nothing], Text -> Text -> [(Text, Text)] -> Bool -> ElemResult
ImplicitNext Text
t Text
tb [(Text, Text)]
a Bool
b)
            else do
                (n,end) <- Text -> Text -> [(Text, Text)] -> Bool -> Parser (Node, ElemResult)
finishElement Text
t Text
tb [(Text, Text)]
a Bool
b
                case end of
                    ImplicitNext Text
t' Text
tb' [(Text, Text)]
a' Bool
b' -> do
                        (ns,end') <- Text
-> Text
-> [(Text, Text)]
-> Bool
-> ParsecT Text () Identity ([Maybe Node], ElemResult)
handle Text
t' Text
tb' [(Text, Text)]
a' Bool
b'
                        return (Just n : ns, end')
                    ElemResult
_ -> ([Maybe Node], ElemResult)
-> ParsecT Text () Identity ([Maybe Node], ElemResult)
forall a. a -> ParsecT Text () Identity a
forall (m :: * -> *) a. Monad m => a -> m a
return ([Node -> Maybe Node
forall a. a -> Maybe a
Just Node
n], ElemResult
end)

    breaksTag :: Text -> Maybe Text -> Bool
breaksTag Text
_     Maybe Text
Nothing       = Bool
False
    breaksTag Text
child (Just Text
tag) = case Text -> HashMap Text (HashSet Text) -> Maybe (HashSet Text)
forall k v. (Eq k, Hashable k) => k -> HashMap k v -> Maybe v
M.lookup Text
tag HashMap Text (HashSet Text)
endOmittableNext of
        Maybe (HashSet Text)
Nothing -> Bool
False
        Just HashSet Text
s  -> Text -> HashSet Text -> Bool
forall a. (Eq a, Hashable a) => a -> HashSet a -> Bool
S.member Text
child HashSet Text
s

    coalesceText :: [Node] -> [Node]
coalesceText (TextNode Text
s : TextNode Text
t : [Node]
ns)
        = [Node] -> [Node]
coalesceText (Text -> Node
TextNode (Text -> Text -> Text
T.append Text
s Text
t) Node -> [Node] -> [Node]
forall a. a -> [a] -> [a]
: [Node]
ns)
    coalesceText (Node
n:[Node]
ns)
        = Node
n Node -> [Node] -> [Node]
forall a. a -> [a] -> [a]
: [Node] -> [Node]
coalesceText [Node]
ns
    coalesceText []
        = []


------------------------------------------------------------------------------
reference :: Parser Text
reference :: Parser Text
reference = do
    _    <- Char -> ParsecT Text () Identity Char
forall s (m :: * -> *) u.
Stream s m Char =>
Char -> ParsecT s u m Char
P.char Char
'&'
    r    <- (Left  <$> P.try finishCharRef)
        <|> (Right <$> P.try finishEntityRef)
        <|> (Left  <$> return '&')
    case r of
        Left Char
c   -> do
            Bool -> ParsecT Text () Identity () -> ParsecT Text () Identity ()
forall (f :: * -> *). Applicative f => Bool -> f () -> f ()
when (Bool -> Bool
not (Char -> Bool
isValidChar Char
c)) (ParsecT Text () Identity () -> ParsecT Text () Identity ())
-> ParsecT Text () Identity () -> ParsecT Text () Identity ()
forall a b. (a -> b) -> a -> b
$ String -> ParsecT Text () Identity ()
forall a. String -> ParsecT Text () Identity a
forall (m :: * -> *) a. MonadFail m => String -> m a
fail (String -> ParsecT Text () Identity ())
-> String -> ParsecT Text () Identity ()
forall a b. (a -> b) -> a -> b
$
                String
"Reference is not a valid character"
            Text -> Parser Text
forall a. a -> ParsecT Text () Identity a
forall (m :: * -> *) a. Monad m => a -> m a
return (Char -> Text
T.singleton Char
c)
        Right Text
nm -> case Text -> Map Text Text -> Maybe Text
forall k a. Ord k => k -> Map k a -> Maybe a
Map.lookup Text
nm Map Text Text
predefinedRefs of
            Maybe Text
Nothing -> String -> Parser Text
forall a. String -> ParsecT Text () Identity a
forall (m :: * -> *) a. MonadFail m => String -> m a
fail (String -> Parser Text) -> String -> Parser Text
forall a b. (a -> b) -> a -> b
$ String
"Unknown entity reference: " String -> String -> String
forall a. [a] -> [a] -> [a]
++ Text -> String
T.unpack Text
nm
            Just Text
t  -> Text -> Parser Text
forall a. a -> ParsecT Text () Identity a
forall (m :: * -> *) a. Monad m => a -> m a
return Text
t


------------------------------------------------------------------------------
finishCharRef :: Parser Char
finishCharRef :: ParsecT Text () Identity Char
finishCharRef = Char -> ParsecT Text () Identity Char
forall s (m :: * -> *) u.
Stream s m Char =>
Char -> ParsecT s u m Char
P.char Char
'#' ParsecT Text () Identity Char
-> ParsecT Text () Identity Char -> ParsecT Text () Identity Char
forall a b.
ParsecT Text () Identity a
-> ParsecT Text () Identity b -> ParsecT Text () Identity b
forall (f :: * -> *) a b. Applicative f => f a -> f b -> f b
*> (ParsecT Text () Identity Char
hexCharRef ParsecT Text () Identity Char
-> ParsecT Text () Identity Char -> ParsecT Text () Identity Char
forall a.
ParsecT Text () Identity a
-> ParsecT Text () Identity a -> ParsecT Text () Identity a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> ParsecT Text () Identity Char
decCharRef)
  where
    decCharRef :: ParsecT Text () Identity Char
decCharRef = do
        ds <- ParsecT Text () Identity Int -> ParsecT Text () Identity [Int]
forall a.
ParsecT Text () Identity a -> ParsecT Text () Identity [a]
forall (f :: * -> *) a. Alternative f => f a -> f [a]
some ParsecT Text () Identity Int
forall {u}. ParsecT Text u Identity Int
digit
        _ <- P.char ';'
        safeChr $ foldl' (\Int
a Int
b -> Int
10 Int -> Int -> Int
forall a. Num a => a -> a -> a
* Int
a Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
b) 0 ds
      where
        digit :: ParsecT Text u Identity Int
digit = do
            d <- (Char -> Bool) -> ParsecT Text u Identity Char
forall s (m :: * -> *) u.
Stream s m Char =>
(Char -> Bool) -> ParsecT s u m Char
P.satisfy (\Char
c -> Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
>= Char
'0' Bool -> Bool -> Bool
&& Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= Char
'9')
            return (ord d - ord '0')
    hexCharRef :: ParsecT Text () Identity Char
hexCharRef = do
        _ <- Char -> ParsecT Text () Identity Char
forall s (m :: * -> *) u.
Stream s m Char =>
Char -> ParsecT s u m Char
P.char Char
'x' ParsecT Text () Identity Char
-> ParsecT Text () Identity Char -> ParsecT Text () Identity Char
forall a.
ParsecT Text () Identity a
-> ParsecT Text () Identity a -> ParsecT Text () Identity a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> Char -> ParsecT Text () Identity Char
forall s (m :: * -> *) u.
Stream s m Char =>
Char -> ParsecT s u m Char
P.char Char
'X'
        ds <- some digit
        _ <- P.char ';'
        safeChr $ foldl' (\Int
a Int
b -> Int
16 Int -> Int -> Int
forall a. Num a => a -> a -> a
* Int
a Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
b) 0 ds
      where
        digit :: ParsecT Text u Identity Int
digit = ParsecT Text u Identity Int
forall {u}. ParsecT Text u Identity Int
num ParsecT Text u Identity Int
-> ParsecT Text u Identity Int -> ParsecT Text u Identity Int
forall a.
ParsecT Text u Identity a
-> ParsecT Text u Identity a -> ParsecT Text u Identity a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> ParsecT Text u Identity Int
forall {u}. ParsecT Text u Identity Int
upper ParsecT Text u Identity Int
-> ParsecT Text u Identity Int -> ParsecT Text u Identity Int
forall a.
ParsecT Text u Identity a
-> ParsecT Text u Identity a -> ParsecT Text u Identity a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> ParsecT Text u Identity Int
forall {u}. ParsecT Text u Identity Int
lower
        num :: ParsecT Text u Identity Int
num = do
            d <- (Char -> Bool) -> ParsecT Text u Identity Char
forall s (m :: * -> *) u.
Stream s m Char =>
(Char -> Bool) -> ParsecT s u m Char
P.satisfy (\Char
c -> Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
>= Char
'0' Bool -> Bool -> Bool
&& Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= Char
'9')
            return (ord d - ord '0')
        upper :: ParsecT Text u Identity Int
upper = do
            d <- (Char -> Bool) -> ParsecT Text u Identity Char
forall s (m :: * -> *) u.
Stream s m Char =>
(Char -> Bool) -> ParsecT s u m Char
P.satisfy (\Char
c -> Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
>= Char
'A' Bool -> Bool -> Bool
&& Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= Char
'F')
            return (10 + ord d - ord 'A')
        lower :: ParsecT Text u Identity Int
lower = do
            d <- (Char -> Bool) -> ParsecT Text u Identity Char
forall s (m :: * -> *) u.
Stream s m Char =>
(Char -> Bool) -> ParsecT s u m Char
P.satisfy (\Char
c -> Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
>= Char
'a' Bool -> Bool -> Bool
&& Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= Char
'f')
            return (10 + ord d - ord 'a')


------------------------------------------------------------------------------
finishEntityRef :: Parser Text
finishEntityRef :: Parser Text
finishEntityRef = Parser Text
XML.name Parser Text -> ParsecT Text () Identity Char -> Parser Text
forall a b.
ParsecT Text () Identity a
-> ParsecT Text () Identity b -> ParsecT Text () Identity a
forall (f :: * -> *) a b. Applicative f => f a -> f b -> f a
<* Char -> ParsecT Text () Identity Char
forall s (m :: * -> *) u.
Stream s m Char =>
Char -> ParsecT s u m Char
P.char Char
';'