diff options
author | Araq <rumpf_a@web.de> | 2013-06-27 01:04:56 +0200 |
---|---|---|
committer | Araq <rumpf_a@web.de> | 2013-06-27 01:04:56 +0200 |
commit | a674c039fb959166681f13f809670807b89e0b41 (patch) | |
tree | ec4676bf220ed2e49e8fff9b1071eaf2bfcae3d4 /lib/pure | |
parent | ff1d68c50b5df56645eb09a523c08726772bec4d (diff) | |
download | Nim-a674c039fb959166681f13f809670807b89e0b41.tar.gz |
fixed a long-standing XML parser bug; HTML parser bugfixes + speed improvements
Diffstat (limited to 'lib/pure')
-rw-r--r-- | lib/pure/htmlparser.nim | 286 | ||||
-rw-r--r-- | lib/pure/parsexml.nim | 19 | ||||
-rw-r--r-- | lib/pure/xmltree.nim | 10 |
3 files changed, 242 insertions, 73 deletions
diff --git a/lib/pure/htmlparser.nim b/lib/pure/htmlparser.nim index ad952fc41..c94170803 100644 --- a/lib/pure/htmlparser.nim +++ b/lib/pure/htmlparser.nim @@ -1,7 +1,7 @@ # # # Nimrod's Runtime Library -# (c) Copyright 2010 Andreas Rumpf +# (c) Copyright 2013 Andreas Rumpf # # See the file "copying.txt", included in this # distribution, for details about the copyright. @@ -35,8 +35,12 @@ type tagAddress, ## the HTML ``address`` element tagApplet, ## the deprecated HTML ``applet`` element tagArea, ## the HTML ``area`` element + tagArticle, ## the HTML ``article`` element + tagAside, ## the HTML ``aside`` element + tagAudio, ## the HTML ``audio`` element tagB, ## the HTML ``b`` element tagBase, ## the HTML ``base`` element + tagBdi, ## the HTML ``bdi`` element tagBdo, ## the deprecated HTML ``dbo`` element tagBasefont, ## the deprecated HTML ``basefont`` element tagBig, ## the HTML ``big`` element @@ -44,22 +48,31 @@ type tagBody, ## the HTML ``body`` element tagBr, ## the HTML ``br`` element tagButton, ## the HTML ``button`` element + tagCanvas, ## the HTML ``canvas`` element tagCaption, ## the HTML ``caption`` element tagCenter, ## the deprecated HTML ``center`` element tagCite, ## the HTML ``cite`` element tagCode, ## the HTML ``code`` element tagCol, ## the HTML ``col`` element tagColgroup, ## the HTML ``colgroup`` element + tagCommand, ## the HTML ``command`` element + tagDatalist, ## the HTML ``datalist`` element tagDd, ## the HTML ``dd`` element tagDel, ## the HTML ``del`` element + tagDetails, ## the HTML ``details`` element tagDfn, ## the HTML ``dfn`` element + tagDialog, ## the HTML ``dialog`` element tagDiv, ## the HTML ``div`` element tagDir, ## the deprecated HTLM ``dir`` element tagDl, ## the HTML ``dl`` element tagDt, ## the HTML ``dt`` element tagEm, ## the HTML ``em`` element + tagEmbed, ## the HTML ``embed`` element tagFieldset, ## the HTML ``fieldset`` element + tagFigcaption, ## the HTML ``figcaption`` element + tagFigure, ## the HTML ``figure`` 
element tagFont, ## the deprecated HTML ``font`` element + tagFooter, ## the HTML ``footer`` element tagForm, ## the HTML ``form`` element tagFrame, ## the HTML ``frame`` element tagFrameset, ## the deprecated HTML ``frameset`` element @@ -70,6 +83,8 @@ type tagH5, ## the HTML ``h5`` element tagH6, ## the HTML ``h6`` element tagHead, ## the HTML ``head`` element + tagHeader, ## the HTML ``header`` element + tagHgroup, ## the HTML ``hgroup`` element tagHtml, ## the HTML ``html`` element tagHr, ## the HTML ``hr`` element tagI, ## the HTML ``i`` element @@ -79,13 +94,17 @@ type tagIns, ## the HTML ``ins`` element tagIsindex, ## the deprecated HTML ``isindex`` element tagKbd, ## the HTML ``kbd`` element + tagKeygen, ## the HTML ``keygen`` element tagLabel, ## the HTML ``label`` element tagLegend, ## the HTML ``legend`` element tagLi, ## the HTML ``li`` element tagLink, ## the HTML ``link`` element tagMap, ## the HTML ``map`` element + tagMark, ## the HTML ``mark`` element tagMenu, ## the deprecated HTML ``menu`` element tagMeta, ## the HTML ``meta`` element + tagMeter, ## the HTML ``meter`` element + tagNav, ## the HTML ``nav`` element tagNobr, ## the deprecated HTML ``nobr`` element tagNoframes, ## the deprecated HTML ``noframes`` element tagNoscript, ## the HTML ``noscript`` element @@ -93,20 +112,28 @@ type tagOl, ## the HTML ``ol`` element tagOptgroup, ## the HTML ``optgroup`` element tagOption, ## the HTML ``option`` element + tagOutput, ## the HTML ``output`` element tagP, ## the HTML ``p`` element tagParam, ## the HTML ``param`` element tagPre, ## the HTML ``pre`` element + tagProgress, ## the HTML ``progress`` element tagQ, ## the HTML ``q`` element + tagRp, ## the HTML ``rp`` element + tagRt, ## the HTML ``rt`` element + tagRuby, ## the HTML ``ruby`` element tagS, ## the deprecated HTML ``s`` element tagSamp, ## the HTML ``samp`` element tagScript, ## the HTML ``script`` element + tagSection, ## the HTML ``section`` element tagSelect, ## the HTML ``select`` 
element tagSmall, ## the HTML ``small`` element + tagSource, ## the HTML ``source`` element tagSpan, ## the HTML ``span`` element tagStrike, ## the deprecated HTML ``strike`` element tagStrong, ## the HTML ``strong`` element tagStyle, ## the HTML ``style`` element tagSub, ## the HTML ``sub`` element + tagSummary, ## the HTML ``summary`` element tagSup, ## the HTML ``sup`` element tagTable, ## the HTML ``table`` element tagTbody, ## the HTML ``tbody`` element @@ -115,37 +142,46 @@ type tagTfoot, ## the HTML ``tfoot`` element tagTh, ## the HTML ``th`` element tagThead, ## the HTML ``thead`` element + tagTime, ## the HTML ``time`` element tagTitle, ## the HTML ``title`` element tagTr, ## the HTML ``tr`` element + tagTrack, ## the HTML ``track`` element tagTt, ## the HTML ``tt`` element tagU, ## the deprecated HTML ``u`` element tagUl, ## the HTML ``ul`` element - tagVar ## the HTML ``var`` element + tagVar, ## the HTML ``var`` element + tagVideo, ## the HTML ``video`` element + tagWbr ## the HTML ``wbr`` element const tagToStr* = [ - "a", "abbr", "acronym", "address", "applet", "area", - "b", "base", "basefont", "bdo", "big", "blockquote", "body", - "br", "button", "caption", "center", "cite", "code", - "col", "colgroup", "dd", "del", "dfn", "div", - "dir", "dl", "dt", "em", "fieldset", "font", + "a", "abbr", "acronym", "address", "applet", "area", "article", + "aside", "audio", + "b", "base", "basefont", "bdi", "bdo", "big", "blockquote", "body", + "br", "button", "canvas", "caption", "center", "cite", "code", + "col", "colgroup", "command", + "datalist", "dd", "del", "details", "dfn", "dialog", "div", + "dir", "dl", "dt", "em", "embed", "fieldset", + "figcaption", "figure", "font", "footer", "form", "frame", "frameset", "h1", "h2", "h3", - "h4", "h5", "h6", "head", "html", "hr", + "h4", "h5", "h6", "head", "header", "hgroup", "html", "hr", "i", "iframe", "img", "input", "ins", "isindex", - "kbd", "label", "legend", "li", "link", "map", - "menu", "meta", "nobr", 
"noframes", "noscript", "object", "ol", - "optgroup", "option", "p", "param", "pre", "q", - "s", "samp", "script", "select", "small", "span", - "strike", "strong", "style", "sub", "sup", "table", - "tbody", "td", "textarea", "tfoot", "th", "thead", - "title", "tr", "tt", "u", "ul", "var"] + "kbd", "keygen", "label", "legend", "li", "link", "map", "mark", + "menu", "meta", "meter", "nav", "nobr", "noframes", "noscript", + "object", "ol", + "optgroup", "option", "output", "p", "param", "pre", "progress", "q", + "rp", "rt", "ruby", "s", "samp", "script", "section", "select", "small", + "source", "span", "strike", "strong", "style", + "sub", "summary", "sup", "table", + "tbody", "td", "textarea", "tfoot", "th", "thead", "time", + "title", "tr", "track", "tt", "u", "ul", "var", "video", "wbr"] InlineTags* = {tagA, tagAbbr, tagAcronym, tagApplet, tagB, tagBasefont, tagBdo, tagBig, tagBr, tagButton, tagCite, tagCode, tagDel, tagDfn, tagEm, tagFont, tagI, tagImg, tagIns, tagInput, tagIframe, tagKbd, tagLabel, tagMap, tagObject, tagQ, tagSamp, tagScript, tagSelect, tagSmall, tagSpan, tagStrong, tagSub, tagSup, tagTextarea, tagTt, tagVar, tagApplet, tagBasefont, tagFont, tagIframe, tagU, tagS, - tagStrike} + tagStrike, tagWbr} BlockTags* = {tagAddress, tagBlockquote, tagCenter, tagDel, tagDir, tagDiv, tagDl, tagFieldset, tagForm, tagH1, tagH2, tagH3, tagH4, tagH5, tagH6, tagHr, tagIns, tagIsindex, tagMenu, tagNoframes, tagNoscript, @@ -153,7 +189,7 @@ const tagMenu, tagNoframes} SingleTags* = {tagArea, tagBase, tagBasefont, tagBr, tagCol, tagFrame, tagHr, tagImg, tagIsindex, - tagLink, tagMeta, tagParam} + tagLink, tagMeta, tagParam, tagWbr} Entities = [ ("nbsp", 0x00A0), ("iexcl", 0x00A1), ("cent", 0x00A2), ("pound", 0x00A3), @@ -228,28 +264,149 @@ const ("loz", 0x25CA), ("spades", 0x2660), ("clubs", 0x2663), ("hearts", 0x2665), ("diams", 0x2666)] -proc binaryStrSearch(x: openarray[string], y: string): int = - ## XXX put this into the library somewhere! 
- var a = 0 - var b = len(x) - 1 - while a <= b: - var mid = (a + b) div 2 - var c = cmp(x[mid], y) - if c < 0: a = mid + 1 - elif c > 0: b = mid - 1 - else: return mid - result = - 1 +proc allLower(s: string): bool = + for c in s: + if c < 'a' or c > 'z': return false + return true + +proc toHtmlTag(s: string): THtmlTag = + case s + of "a": tagA + of "abbr": tagAbbr + of "acronym": tagAcronym + of "address": tagAddress + of "applet": tagApplet + of "area": tagArea + of "article": tagArticle + of "aside": tagAside + of "audio": tagAudio + of "b": tagB + of "base": tagBase + of "basefont": tagBasefont + of "bdi": tagBdi + of "bdo": tagBdo + of "big": tagBig + of "blockquote": tagBlockquote + of "body": tagBody + of "br": tagBr + of "button": tagButton + of "canvas": tagCanvas + of "caption": tagCaption + of "center": tagCenter + of "cite": tagCite + of "code": tagCode + of "col": tagCol + of "colgroup": tagColgroup + of "command": tagCommand + of "datalist": tagDatalist + of "dd": tagDd + of "del": tagDel + of "details": tagDetails + of "dfn": tagDfn + of "dialog": tagDialog + of "div": tagDiv + of "dir": tagDir + of "dl": tagDl + of "dt": tagDt + of "em": tagEm + of "embed": tagEmbed + of "fieldset": tagFieldset + of "figcaption": tagFigcaption + of "figure": tagFigure + of "font": tagFont + of "footer": tagFooter + of "form": tagForm + of "frame": tagFrame + of "frameset": tagFrameset + of "h1": tagH1 + of "h2": tagH2 + of "h3": tagH3 + of "h4": tagH4 + of "h5": tagH5 + of "h6": tagH6 + of "head": tagHead + of "header": tagHeader + of "hgroup": tagHgroup + of "html": tagHtml + of "hr": tagHr + of "i": tagI + of "iframe": tagIframe + of "img": tagImg + of "input": tagInput + of "ins": tagIns + of "isindex": tagIsindex + of "kbd": tagKbd + of "keygen": tagKeygen + of "label": tagLabel + of "legend": tagLegend + of "li": tagLi + of "link": tagLink + of "map": tagMap + of "mark": tagMark + of "menu": tagMenu + of "meta": tagMeta + of "meter": tagMeter + of "nav": 
tagNav + of "nobr": tagNobr + of "noframes": tagNoframes + of "noscript": tagNoscript + of "object": tagObject + of "ol": tagOl + of "optgroup": tagOptgroup + of "option": tagOption + of "output": tagOutput + of "p": tagP + of "param": tagParam + of "pre": tagPre + of "progress": tagProgress + of "q": tagQ + of "rp": tagRp + of "rt": tagRt + of "ruby": tagRuby + of "s": tagS + of "samp": tagSamp + of "script": tagScript + of "section": tagSection + of "select": tagSelect + of "small": tagSmall + of "source": tagSource + of "span": tagSpan + of "strike": tagStrike + of "strong": tagStrong + of "style": tagStyle + of "sub": tagSub + of "summary": tagSummary + of "sup": tagSup + of "table": tagTable + of "tbody": tagTbody + of "td": tagTd + of "textarea": tagTextarea + of "tfoot": tagTfoot + of "th": tagTh + of "thead": tagThead + of "time": tagTime + of "title": tagTitle + of "tr": tagTr + of "track": tagTrack + of "tt": tagTt + of "u": tagU + of "ul": tagUl + of "var": tagVar + of "video": tagVideo + of "wbr": tagWbr + else: tagUnknown proc htmlTag*(n: PXmlNode): THtmlTag = ## gets `n`'s tag as a ``THtmlTag``. if n.clientData == 0: - n.clientData = binaryStrSearch(tagToStr, n.tag)+1 + n.clientData = toHtmlTag(n.tag).ord result = THtmlTag(n.clientData) proc htmlTag*(s: string): THtmlTag = ## converts `s` to a ``THtmlTag``. If `s` is no HTML tag, ``tagUnknown`` is ## returned. - result = THtmlTag(binaryStrSearch(tagToStr, s.toLower)+1) + let s = if allLower(s): s else: s.toLower + result = toHtmlTag(s) proc entityToUtf8*(entity: string): string = ## converts an HTML entity name like ``Ü`` to its UTF-8 equivalent. @@ -267,11 +424,13 @@ proc parse(x: var TXmlParser, errors: var seq[string]): PXmlNode proc expected(x: var TXmlParser, n: PXmlNode): string = result = errorMsg(x, "</" & n.tag & "> expected") +template elemName(x: expr): expr = rawData(x) + proc untilElementEnd(x: var TXmlParser, result: PXmlNode, errors: var seq[string]) = # we parsed e.g. 
``<br>`` and don't really expect a ``</br>``: if result.htmlTag in singleTags: - if x.kind != xmlElementEnd or cmpIgnoreCase(x.elementName, result.tag) != 0: + if x.kind != xmlElementEnd or cmpIgnoreCase(x.elemName, result.tag) != 0: return while true: case x.kind @@ -279,28 +438,29 @@ proc untilElementEnd(x: var TXmlParser, result: PXmlNode, case result.htmlTag of tagLi, tagP, tagDt, tagDd, tagInput, tagOption: # some tags are common to have no ``</end>``, like ``<li>``: - if htmlTag(x.elementName) in {tagLi, tagP, tagDt, tagDd, tagInput, - tagOption}: + if htmlTag(x.elemName) in {tagLi, tagP, tagDt, tagDd, tagInput, + tagOption}: errors.add(expected(x, result)) break - when false: - if htmlTag(x.elementName) notin InlineTags: - errors.add(expected(x, result)) - break - of tagTr, tagTd, tagTh, tagTfoot, tagThead: - if htmlTag(x.elementName) in {tagTr, tagTd, tagTh, tagTfoot, tagThead}: + of tagTd, tagTh, tagTfoot, tagThead: + if htmlTag(x.elemName) in {tagTr, tagTd, tagTh, tagTfoot, tagThead}: + errors.add(expected(x, result)) + break + of tagTr: + if htmlTag(x.elemName) == tagTr: errors.add(expected(x, result)) break of tagOptgroup: - if htmlTag(x.elementName) in {tagOption, tagOptgroup}: + if htmlTag(x.elemName) in {tagOption, tagOptgroup}: errors.add(expected(x, result)) break else: nil result.addNode(parse(x, errors)) of xmlElementEnd: - if cmpIgnoreCase(x.elementName, result.tag) == 0: + if cmpIgnoreCase(x.elemName, result.tag) == 0: next(x) else: + echo "5; expected: ", result.htmltag, " ", x.elemName errors.add(expected(x, result)) # do not skip it here! 
break @@ -313,10 +473,10 @@ proc untilElementEnd(x: var TXmlParser, result: PXmlNode, proc parse(x: var TXmlParser, errors: var seq[string]): PXmlNode = case x.kind of xmlComment: - result = newComment(x.charData) + result = newComment(x.rawData) next(x) of xmlCharData, xmlWhitespace: - result = newText(x.charData) + result = newText(x.rawData) next(x) of xmlPI, xmlSpecial: # we just ignore processing instructions for now @@ -325,19 +485,19 @@ proc parse(x: var TXmlParser, errors: var seq[string]): PXmlNode = errors.add(errorMsg(x)) next(x) of xmlElementStart: - result = newElement(x.elementName.toLower) + result = newElement(x.elemName.toLower) next(x) untilElementEnd(x, result, errors) of xmlElementEnd: - errors.add(errorMsg(x, "unexpected ending tag: " & x.elementName)) + errors.add(errorMsg(x, "unexpected ending tag: " & x.elemName)) of xmlElementOpen: - result = newElement(x.elementName.toLower) + result = newElement(x.elemName.toLower) next(x) result.attrs = newStringTable() while true: case x.kind of xmlAttribute: - result.attrs[x.attrKey] = x.attrValue + result.attrs[x.rawData] = x.rawData2 next(x) of xmlElementClose: next(x) @@ -355,10 +515,10 @@ proc parse(x: var TXmlParser, errors: var seq[string]): PXmlNode = errors.add(errorMsg(x, "<some_tag> expected")) next(x) of xmlCData: - result = newCData(x.charData) + result = newCData(x.rawData) next(x) of xmlEntity: - var u = entityToUtf8(x.entityName) + var u = entityToUtf8(x.rawData) if u.len != 0: result = newText(u) next(x) of xmlEof: nil @@ -372,15 +532,17 @@ proc parseHtml*(s: PStream, filename: string, next(x) # skip the DOCTYPE: if x.kind == xmlSpecial: next(x) - result = parse(x, errors) - if x.kind != xmlEof: - errors.add(errorMsg(x, "EOF expected")) + + result = newElement("document") + result.addNode(parse(x, errors)) + #if x.kind != xmlEof: + # errors.add(errorMsg(x, "EOF expected")) while x.kind != xmlEof: var oldPos = x.bufpos # little hack to see if we made any progess result.addNode(parse(x, 
errors)) - if x.bufpos == oldPos: + if x.bufpos == oldPos: # force progress! - next(x) + next(x) close(x) proc parseHtml*(s: PStream): PXmlNode = @@ -400,22 +562,9 @@ proc loadHtml*(path: string, errors: var seq[string]): PXmlNode = proc loadHtml*(path: string): PXmlNode = ## Loads and parses HTML from file specified by ``path``, and returns ## a ``PXmlNode``. All parsing errors are ignored. - var errors: seq[string] = @[] + var errors: seq[string] = @[] result = loadHtml(path, errors) -when true: - nil -else: - proc checkHtmlAux(n: PXmlNode, errors: var seq[string]) = - nil - - proc checkHtmlStructure*(n: PXmlNode, errors: var seq[string]) = - ## checks the HTML structure after parsing for other errors like - ## a ``<h1>`` element within a ``<p>`` element. - if n == nil or n.htmlTag != tagHtml: - errors.add("<html> tag expected") - checkHtmlAux(n, errors) - when isMainModule: import os @@ -429,4 +578,3 @@ when isMainModule: f.close() else: quit("cannot write test.txt") - diff --git a/lib/pure/parsexml.nim b/lib/pure/parsexml.nim index c49986087..667b8aed6 100644 --- a/lib/pure/parsexml.nim +++ b/lib/pure/parsexml.nim @@ -131,6 +131,10 @@ proc close*(my: var TXmlParser) {.inline.} = ## closes the parser `my` and its associated input stream. 
lexbase.close(my) +proc kind*(my: TXmlParser): TXmlEventKind {.inline.} = + ## returns the current event type for the XML parser + return my.kind + proc charData*(my: TXmlParser): string {.inline.} = ## returns the character data for the events: ``xmlCharData``, ## ``xmlWhitespace``, ``xmlComment``, ``xmlCData``, ``xmlSpecial`` @@ -138,10 +142,6 @@ proc charData*(my: TXmlParser): string {.inline.} = xmlSpecial}) return my.a -proc kind*(my: TXmlParser): TXmlEventKind {.inline.} = - ## returns the current event type for the XML parser - return my.kind - proc elementName*(my: TXmlParser): string {.inline.} = ## returns the element name for the events: ``xmlElementStart``, ## ``xmlElementEnd``, ``xmlElementOpen`` @@ -173,6 +173,16 @@ proc PIRest*(my: TXmlParser): string {.inline.} = assert(my.kind == xmlPI) return my.b +proc rawData*(my: TXmlParser): string {.inline.} = + ## returns the underlying 'data' string by reference. + ## This is only used for speed hacks. + shallowCopy(result, my.a) + +proc rawData2*(my: TXmlParser): string {.inline.} = + ## returns the underlying second 'data' string by reference. + ## This is only used for speed hacks. + shallowCopy(result, my.b) + proc getColumn*(my: TXmlParser): int {.inline.} = ## get the current column the parser has arrived at. result = getColNumber(my, my.bufPos) @@ -439,6 +449,7 @@ proc parseTag(my: var TXmlParser) = if my.buf[my.bufpos] == '/' and my.buf[my.bufpos+1] == '>': inc(my.bufpos, 2) my.state = stateEmptyElementTag + my.c = nil elif my.buf[my.bufpos] == '>': inc(my.bufpos) else: diff --git a/lib/pure/xmltree.nim b/lib/pure/xmltree.nim index e3fc27237..d5821d8f3 100644 --- a/lib/pure/xmltree.nim +++ b/lib/pure/xmltree.nim @@ -71,6 +71,16 @@ proc text*(n: PXmlNode): string {.inline.} = assert n.k in {xnText, xnComment, xnCData, xnEntity} result = n.fText +proc rawText*(n: PXmlNode): string {.inline.} = + ## returns the underlying 'text' string by reference. + ## This is only used for speed hacks. 
+ shallowCopy(result, n.fText) + +proc rawTag*(n: PXmlNode): string {.inline.} = + ## returns the underlying 'tag' string by reference. + ## This is only used for speed hacks. + shallowCopy(result, n.fTag) + proc innerText*(n: PXmlNode): string = ## gets the inner text of `n`. `n` has to be an ``xnElement`` node. Only ## ``xnText`` and ``xnEntity`` nodes are considered part of `n`'s inner text, |