diff options
author | rumpf_a@web.de <> | 2010-02-14 09:53:56 +0100 |
---|---|---|
committer | rumpf_a@web.de <> | 2010-02-14 09:53:56 +0100 |
commit | 597d98e7ee9cf60ec1ed601a311975384d65cba8 (patch) | |
tree | d92d710b4b9d841d40a480fb4be9a10085550ee2 /lib/pure | |
parent | 40a5d6c3b9a0fa3b5a7444a00073729fec17dfd8 (diff) | |
download | Nim-597d98e7ee9cf60ec1ed601a311975384d65cba8.tar.gz |
further improvements for the HTML parser
Diffstat (limited to 'lib/pure')
-rw-r--r-- | lib/pure/htmlparser.nim | 37 |
1 files changed, 28 insertions, 9 deletions
diff --git a/lib/pure/htmlparser.nim b/lib/pure/htmlparser.nim index 5c88f211d..4e4af0e4b 100644 --- a/lib/pure/htmlparser.nim +++ b/lib/pure/htmlparser.nim @@ -138,8 +138,7 @@ const "s", "samp", "script", "select", "small", "span", "strike", "strong", "style", "sub", "sup", "table", "tbody", "td", "textarea", "tfoot", "th", "thead", - "title", "tr", "tt", "u", "ul", "var" - ] + "title", "tr", "tt", "u", "ul", "var"] InlineTags* = {tagA, tagAbbr, tagAcronym, tagApplet, tagB, tagBasefont, tagBdo, tagBig, tagBr, tagButton, tagCite, tagCode, tagDel, tagDfn, tagEm, tagFont, tagI, tagImg, tagIns, tagInput, tagIframe, tagKbd, @@ -154,7 +153,7 @@ const tagMenu, tagNoframes} SingleTags* = {tagArea, tagBase, tagBasefont, tagBr, tagCol, tagFrame, tagHr, tagImg, tagInput, tagIsindex, - tagLink, tagMeta, tagParam} # `tagP` can be both! + tagLink, tagMeta, tagParam} Entities = [ ("nbsp", 0x00A0), ("iexcl", 0x00A1), ("cent", 0x00A2), ("pound", 0x00A3), @@ -247,13 +246,17 @@ proc htmlTag*(n: PXmlNode): THtmlTag = n.clientData = binaryStrSearch(tagStrs, n.tag)+1 result = THtmlTag(n.clientData) +proc htmlTag*(s: string): THtmlTag = + ## converts `s` to a ``THtmlTag``. If `s` is no HTML tag, ``tagUnknown`` is + ## returned. + result = THtmlTag(binaryStrSearch(tagStrs, s.toLower)+1) + proc entityToUtf8*(entity: string): string = ## converts an HTML entity name like ``Ü`` to its UTF-8 equivalent. ## "" is returned if the entity name is unknown. The HTML parser ## already converts entities to UTF-8. for name, val in items(entities): - if name == entity: - return toUTF8(TRune(val)) + if name == entity: return toUTF8(TRune(val)) result = "" proc addNode(father, son: PXmlNode) = @@ -261,6 +264,9 @@ proc addNode(father, son: PXmlNode) = proc parse(x: var TXmlParser, errors: var seq[string]): PXmlNode +proc expected(x: var TXmlParser, n: PXmlNode): string = + result = errorMsg(x, "</" & n.tag & "$1> expected") + proc untilElementEnd(x: var TXmlParser, result: PXmlNode, errors: var seq[string]) = if result.htmlTag in singleTags: @@ -268,15 +274,28 @@ proc untilElementEnd(x: var TXmlParser, result: PXmlNode, return while true: case x.kind + of xmlElementStart, xmlElementOpen: + case result.htmlTag + of tagLi, tagP, tagDt, tagDd, tagOption: + if htmlTag(x.elementName) notin InlineTags: + # some tags are common to have no ``</end>``, like ``<li>``: + errors.add(expected(x, result)) + break + of tagTr, tagTd, tagTh: + if htmlTag(x.elementName) in {tagTr, tagTd, tagTh}: + errors.add(expected(x, result)) + break + else: nil + result.addNode(parse(x, errors)) of xmlElementEnd: if cmpIgnoreCase(x.elementName, result.tag) == 0: next(x) else: - errors.add(errorMsg(x, "</" & result.tag & "$1> expected")) + errors.add(expected(x, result)) # do not skip it here! break of xmlEof: - errors.add(errorMsg(x, "</" & result.tag & "$1> expected")) + errors.add(expected(x, result)) break else: result.addNode(parse(x, errors)) @@ -296,13 +315,13 @@ proc parse(x: var TXmlParser, errors: var seq[string]): PXmlNode = errors.add(errorMsg(x)) next(x) of xmlElementStart: - result = newElement(x.elementName) + result = newElement(x.elementName.toLower) next(x) untilElementEnd(x, result, errors) of xmlElementEnd: errors.add(errorMsg(x, "unexpected ending tag: " & x.elementName)) of xmlElementOpen: - result = newElement(x.elementName) + result = newElement(x.elementName.toLower) next(x) result.attr = newStringTable() while true: |