diff options
Diffstat (limited to 'lib/pure')
-rw-r--r--[-rwxr-xr-x] | lib/pure/browsers.nim | 0 | ||||
-rw-r--r--[-rwxr-xr-x] | lib/pure/cgi.nim | 0 | ||||
-rw-r--r--[-rwxr-xr-x] | lib/pure/complex.nim | 0 | ||||
-rw-r--r--[-rwxr-xr-x] | lib/pure/dynlib.nim | 0 | ||||
-rw-r--r--[-rwxr-xr-x] | lib/pure/hashes.nim | 0 | ||||
-rw-r--r--[-rwxr-xr-x] | lib/pure/hashtabs.nim | 0 | ||||
-rw-r--r--[-rwxr-xr-x] | lib/pure/htmlparser.nim | 327 | ||||
-rw-r--r--[-rwxr-xr-x] | lib/pure/httpclient.nim | 8 | ||||
-rw-r--r--[-rwxr-xr-x] | lib/pure/httpserver.nim | 0 | ||||
-rw-r--r--[-rwxr-xr-x] | lib/pure/lexbase.nim | 0 | ||||
-rw-r--r--[-rwxr-xr-x] | lib/pure/logging.nim | 0 | ||||
-rw-r--r--[-rwxr-xr-x] | lib/pure/macros.nim | 0 | ||||
-rw-r--r--[-rwxr-xr-x] | lib/pure/math.nim | 0 | ||||
-rw-r--r--[-rwxr-xr-x] | lib/pure/md5.nim | 0 | ||||
-rw-r--r--[-rwxr-xr-x] | lib/pure/os.nim | 9 | ||||
-rw-r--r--[-rwxr-xr-x] | lib/pure/osproc.nim | 0 | ||||
-rw-r--r--[-rwxr-xr-x] | lib/pure/parsecfg.nim | 0 | ||||
-rw-r--r--[-rwxr-xr-x] | lib/pure/parsecsv.nim | 0 | ||||
-rw-r--r--[-rwxr-xr-x] | lib/pure/parseopt.nim | 0 | ||||
-rw-r--r--[-rwxr-xr-x] | lib/pure/parsesql.nim | 0 | ||||
-rw-r--r--[-rwxr-xr-x] | lib/pure/parseurl.nim | 0 | ||||
-rw-r--r--[-rwxr-xr-x] | lib/pure/parseutils.nim | 9 | ||||
-rw-r--r--[-rwxr-xr-x] | lib/pure/parsexml.nim | 0 | ||||
-rw-r--r--[-rwxr-xr-x] | lib/pure/pegs.nim | 0 | ||||
-rw-r--r--[-rwxr-xr-x] | lib/pure/re.nim | 0 | ||||
-rw-r--r--[-rwxr-xr-x] | lib/pure/regexprs.nim | 0 | ||||
-rw-r--r--[-rwxr-xr-x] | lib/pure/ropes.nim | 9 | ||||
-rw-r--r--[-rwxr-xr-x] | lib/pure/sockets.nim | 0 | ||||
-rw-r--r--[-rwxr-xr-x] | lib/pure/streams.nim | 0 | ||||
-rw-r--r--[-rwxr-xr-x] | lib/pure/strtabs.nim | 0 | ||||
-rw-r--r--[-rwxr-xr-x] | lib/pure/strutils.nim | 10 | ||||
-rw-r--r--[-rwxr-xr-x] | lib/pure/terminal.nim | 0 | ||||
-rw-r--r--[-rwxr-xr-x] | lib/pure/times.nim | 0 | ||||
-rw-r--r--[-rwxr-xr-x] | lib/pure/unicode.nim | 0 | ||||
-rw-r--r--[-rwxr-xr-x] | lib/pure/unidecode/gen.py | 0 | ||||
-rw-r--r--[-rwxr-xr-x] | lib/pure/unidecode/unidecode.dat | 0 | ||||
-rw-r--r--[-rwxr-xr-x] | lib/pure/unidecode/unidecode.nim | 0 | ||||
-rw-r--r--[-rwxr-xr-x] | lib/pure/variants.nim | 0 | ||||
-rw-r--r--[-rwxr-xr-x] | lib/pure/xmldom.nim | 8 | ||||
-rw-r--r--[-rwxr-xr-x] | lib/pure/xmldomparser.nim | 8 | ||||
-rw-r--r--[-rwxr-xr-x] | lib/pure/xmlgen.nim | 0 | ||||
-rw-r--r--[-rwxr-xr-x] | lib/pure/xmltree.nim | 2 | ||||
-rw-r--r--[-rwxr-xr-x] | lib/pure/xmltreeparser.nim | 52 |
43 files changed, 260 insertions, 182 deletions
diff --git a/lib/pure/browsers.nim b/lib/pure/browsers.nim index 243c07dad..243c07dad 100755..100644 --- a/lib/pure/browsers.nim +++ b/lib/pure/browsers.nim diff --git a/lib/pure/cgi.nim b/lib/pure/cgi.nim index 490ae926d..490ae926d 100755..100644 --- a/lib/pure/cgi.nim +++ b/lib/pure/cgi.nim diff --git a/lib/pure/complex.nim b/lib/pure/complex.nim index f50ff4bd0..f50ff4bd0 100755..100644 --- a/lib/pure/complex.nim +++ b/lib/pure/complex.nim diff --git a/lib/pure/dynlib.nim b/lib/pure/dynlib.nim index 592073e3d..592073e3d 100755..100644 --- a/lib/pure/dynlib.nim +++ b/lib/pure/dynlib.nim diff --git a/lib/pure/hashes.nim b/lib/pure/hashes.nim index 1593119bd..1593119bd 100755..100644 --- a/lib/pure/hashes.nim +++ b/lib/pure/hashes.nim diff --git a/lib/pure/hashtabs.nim b/lib/pure/hashtabs.nim index 68d19d63b..68d19d63b 100755..100644 --- a/lib/pure/hashtabs.nim +++ b/lib/pure/hashtabs.nim diff --git a/lib/pure/htmlparser.nim b/lib/pure/htmlparser.nim index df840e15c..5c88f211d 100755..100644 --- a/lib/pure/htmlparser.nim +++ b/lib/pure/htmlparser.nim @@ -11,7 +11,7 @@ ## It is supposed to handle the *wild* HTML the real world uses. ## ## It can be used to parse a wild HTML document and output it as valid XHTML -## document (if you are lucky): +## document (well, if you are lucky): ## ## .. code-block:: nimrod ## @@ -23,24 +23,29 @@ ## **Note:** The resulting ``PXmlNode``s already use the ``clientData`` field, ## so it cannot be used by clients of this library. -import streams, parsexml, xmltree +import strutils, streams, parsexml, xmltree, unicode, strtabs type THtmlTag* = enum ## list of all supported HTML tags; order will always be ## alphabetically tagUnknown, ## unknown HTML element tagA, ## the HTML ``a`` element + tagAbbr, ## the deprecated HTML ``abbr`` element tagAcronym, ## the HTML ``acronym`` element tagAddress, ## the HTML ``address`` element + tagApplet, ## the deprecated HTML ``applet`` element tagArea, ## the HTML ``area`` element tagB, ## the HTML ``b`` element tagBase, ## the HTML ``base`` element + tagBdo, ## the deprecated HTML ``dbo`` element + tagBasefont, ## the deprecated HTML ``basefont`` element tagBig, ## the HTML ``big`` element tagBlockquote, ## the HTML ``blockquote`` element tagBody, ## the HTML ``body`` element tagBr, ## the HTML ``br`` element tagButton, ## the HTML ``button`` element tagCaption, ## the HTML ``caption`` element + tagCenter, ## the deprecated HTML ``center`` element tagCite, ## the HTML ``cite`` element tagCode, ## the HTML ``code`` element tagCol, ## the HTML ``col`` element @@ -49,11 +54,15 @@ type tagDel, ## the HTML ``del`` element tagDfn, ## the HTML ``dfn`` element tagDiv, ## the HTML ``div`` element + tagDir, ## the deprecated HTLM ``dir`` element tagDl, ## the HTML ``dl`` element tagDt, ## the HTML ``dt`` element tagEm, ## the HTML ``em`` element tagFieldset, ## the HTML ``fieldset`` element + tagFont, ## the deprecated HTML ``font`` element tagForm, ## the HTML ``form`` element + tagFrame, ## the HTML ``frame`` element + tagFrameset, ## the deprecated HTML ``frameset`` element tagH1, ## the HTML ``h1`` element tagH2, ## the HTML ``h2`` element tagH3, ## the HTML ``h3`` element @@ -64,16 +73,21 @@ type tagHtml, ## the HTML ``html`` element tagHr, ## the HTML ``hr`` element tagI, ## the HTML ``i`` element + tagIframe, ## the deprecated HTML ``iframe`` element tagImg, ## the HTML ``img`` element tagInput, ## the HTML ``input`` element tagIns, ## the HTML ``ins`` element + tagIsindex, ## the deprecated HTML ``isindex`` element tagKbd, ## the HTML ``kbd`` element tagLabel, ## the HTML ``label`` element tagLegend, ## the HTML ``legend`` element tagLi, ## the HTML ``li`` element tagLink, ## the HTML ``link`` element tagMap, ## the HTML ``map`` element + tagMenu, ## the deprecated HTML ``menu`` element tagMeta, ## the HTML ``meta`` element + tagNobr, ## the deprecated HTML ``nobr`` element + tagNoframes, ## the deprecated HTML ``noframes`` element tagNoscript, ## the HTML ``noscript`` element tagObject, ## the HTML ``object`` element tagOl, ## the HTML ``ol`` element @@ -83,11 +97,13 @@ type tagParam, ## the HTML ``param`` element tagPre, ## the HTML ``pre`` element tagQ, ## the HTML ``q`` element + tagS, ## the deprecated HTML ``s`` element tagSamp, ## the HTML ``samp`` element tagScript, ## the HTML ``script`` element tagSelect, ## the HTML ``select`` element tagSmall, ## the HTML ``small`` element tagSpan, ## the HTML ``span`` element + tagStrike, ## the deprecated HTML ``strike`` element tagStrong, ## the HTML ``strong`` element tagStyle, ## the HTML ``style`` element tagSub, ## the HTML ``sub`` element @@ -102,21 +118,116 @@ type tagTitle, ## the HTML ``title`` element tagTr, ## the HTML ``tr`` element tagTt, ## the HTML ``tt`` element + tagU, ## the deprecated HTML ``u`` element tagUl, ## the HTML ``ul`` element tagVar ## the HTML ``var`` element -const +const tagStrs = [ - "a", "acronym", "address", "area", "b", "base", "big", "blockquote", - "body", "br", "button", "caption", "cite", "code", "col", "colgroup", - "dd", "del", "dfn", "div", "dl", "dt", "em", "fieldset", - "form", "h1", "h2", "h3", "h4", "h5", "h6", "head", "html", "hr", - "i", "img", "input", "ins", "kbd", "label", "legend", "li", "link", - "map", "meta", "noscript", "object", "ol", "optgroup", "option", - "p", "param", "pre", "q", "samp", "script", "select", "small", - "span", "strong", "style", "sub", "sup", "table", "tbody", "td", - "textarea", "tfoot", "th", "thead", "title", "tr", "tt", "ul", "var" + "a", "abbr", "acronym", "address", "applet", "area", + "b", "base", "basefont", "bdo", "big", "blockquote", "body", + "br", "button", "caption", "center", "cite", "code", + "col", "colgroup", "dd", "del", "dfn", "div", + "dir", "dl", "dt", "em", "fieldset", "font", + "form", "frame", "frameset", "h1", "h2", "h3", + "h4", "h5", "h6", "head", "html", "hr", + "i", "iframe", "img", "input", "ins", "isindex", + "kbd", "label", "legend", "li", "link", "map", + "menu", "meta", "nobr", "noframes", "noscript", "object", "ol", + "optgroup", "option", "p", "param", "pre", "q", + "s", "samp", "script", "select", "small", "span", + "strike", "strong", "style", "sub", "sup", "table", + "tbody", "td", "textarea", "tfoot", "th", "thead", + "title", "tr", "tt", "u", "ul", "var" ] + InlineTags* = {tagA, tagAbbr, tagAcronym, tagApplet, tagB, tagBasefont, + tagBdo, tagBig, tagBr, tagButton, tagCite, tagCode, tagDel, tagDfn, + tagEm, tagFont, tagI, tagImg, tagIns, tagInput, tagIframe, tagKbd, + tagLabel, tagMap, tagObject, tagQ, tagSamp, tagScript, tagSelect, + tagSmall, tagSpan, tagStrong, tagSub, tagSup, tagTextarea, tagTt, + tagVar, tagApplet, tagBasefont, tagFont, tagIframe, tagU, tagS, + tagStrike} + BlockTags* = {tagAddress, tagBlockquote, tagCenter, tagDel, tagDir, tagDiv, + tagDl, tagFieldset, tagForm, tagH1, tagH2, tagH3, tagH4, + tagH5, tagH6, tagHr, tagIns, tagIsindex, tagMenu, tagNoframes, tagNoscript, + tagOl, tagP, tagPre, tagTable, tagUl, tagCenter, tagDir, tagIsindex, + tagMenu, tagNoframes} + SingleTags* = {tagArea, tagBase, tagBasefont, + tagBr, tagCol, tagFrame, tagHr, tagImg, tagInput, tagIsindex, + tagLink, tagMeta, tagParam} # `tagP` can be both! + + Entities = [ + ("nbsp", 0x00A0), ("iexcl", 0x00A1), ("cent", 0x00A2), ("pound", 0x00A3), + ("curren", 0x00A4), ("yen", 0x00A5), ("brvbar", 0x00A6), ("sect", 0x00A7), + ("uml", 0x00A8), ("copy", 0x00A9), ("ordf", 0x00AA), ("laquo", 0x00AB), + ("not", 0x00AC), ("shy", 0x00AD), ("reg", 0x00AE), ("macr", 0x00AF), + ("deg", 0x00B0), ("plusmn", 0x00B1), ("sup2", 0x00B2), ("sup3", 0x00B3), + ("acute", 0x00B4), ("micro", 0x00B5), ("para", 0x00B6), ("middot", 0x00B7), + ("cedil", 0x00B8), ("sup1", 0x00B9), ("ordm", 0x00BA), ("raquo", 0x00BB), + ("frac14", 0x00BC), ("frac12", 0x00BD), ("frac34", 0x00BE), + ("iquest", 0x00BF), ("Agrave", 0x00C0), ("Aacute", 0x00C1), + ("Acirc", 0x00C2), ("Atilde", 0x00C3), ("Auml", 0x00C4), ("Aring", 0x00C5), + ("AElig", 0x00C6), ("Ccedil", 0x00C7), ("Egrave", 0x00C8), + ("Eacute", 0x00C9), ("Ecirc", 0x00CA), ("Euml", 0x00CB), ("Igrave", 0x00CC), + ("Iacute", 0x00CD), ("Icirc", 0x00CE), ("Iuml", 0x00CF), ("ETH", 0x00D0), + ("Ntilde", 0x00D1), ("Ograve", 0x00D2), ("Oacute", 0x00D3), + ("Ocirc", 0x00D4), ("Otilde", 0x00D5), ("Ouml", 0x00D6), ("times", 0x00D7), + ("Oslash", 0x00D8), ("Ugrave", 0x00D9), ("Uacute", 0x00DA), + ("Ucirc", 0x00DB), ("Uuml", 0x00DC), ("Yacute", 0x00DD), ("THORN", 0x00DE), + ("szlig", 0x00DF), ("agrave", 0x00E0), ("aacute", 0x00E1), + ("acirc", 0x00E2), ("atilde", 0x00E3), ("auml", 0x00E4), ("aring", 0x00E5), + ("aelig", 0x00E6), ("ccedil", 0x00E7), ("egrave", 0x00E8), + ("eacute", 0x00E9), ("ecirc", 0x00EA), ("euml", 0x00EB), ("igrave", 0x00EC), + ("iacute", 0x00ED), ("icirc", 0x00EE), ("iuml", 0x00EF), ("eth", 0x00F0), + ("ntilde", 0x00F1), ("ograve", 0x00F2), ("oacute", 0x00F3), + ("ocirc", 0x00F4), ("otilde", 0x00F5), ("ouml", 0x00F6), ("divide", 0x00F7), + ("oslash", 0x00F8), ("ugrave", 0x00F9), ("uacute", 0x00FA), + ("ucirc", 0x00FB), ("uuml", 0x00FC), ("yacute", 0x00FD), ("thorn", 0x00FE), + ("yuml", 0x00FF), ("OElig", 0x0152), ("oelig", 0x0153), ("Scaron", 0x0160), + ("scaron", 0x0161), ("Yuml", 0x0178), ("fnof", 0x0192), ("circ", 0x02C6), + ("tilde", 0x02DC), ("Alpha", 0x0391), ("Beta", 0x0392), ("Gamma", 0x0393), + ("Delta", 0x0394), ("Epsilon", 0x0395), ("Zeta", 0x0396), ("Eta", 0x0397), + ("Theta", 0x0398), ("Iota", 0x0399), ("Kappa", 0x039A), ("Lambda", 0x039B), + ("Mu", 0x039C), ("Nu", 0x039D), ("Xi", 0x039E), ("Omicron", 0x039F), + ("Pi", 0x03A0), ("Rho", 0x03A1), ("Sigma", 0x03A3), ("Tau", 0x03A4), + ("Upsilon", 0x03A5), ("Phi", 0x03A6), ("Chi", 0x03A7), ("Psi", 0x03A8), + ("Omega", 0x03A9), ("alpha", 0x03B1), ("beta", 0x03B2), ("gamma", 0x03B3), + ("delta", 0x03B4), ("epsilon", 0x03B5), ("zeta", 0x03B6), ("eta", 0x03B7), + ("theta", 0x03B8), ("iota", 0x03B9), ("kappa", 0x03BA), ("lambda", 0x03BB), + ("mu", 0x03BC), ("nu", 0x03BD), ("xi", 0x03BE), ("omicron", 0x03BF), + ("pi", 0x03C0), ("rho", 0x03C1), ("sigmaf", 0x03C2), ("sigma", 0x03C3), + ("tau", 0x03C4), ("upsilon", 0x03C5), ("phi", 0x03C6), ("chi", 0x03C7), + ("psi", 0x03C8), ("omega", 0x03C9), ("thetasym", 0x03D1), ("upsih", 0x03D2), + ("piv", 0x03D6), ("ensp", 0x2002), ("emsp", 0x2003), ("thinsp", 0x2009), + ("zwnj", 0x200C), ("zwj", 0x200D), ("lrm", 0x200E), ("rlm", 0x200F), + ("ndash", 0x2013), ("mdash", 0x2014), ("lsquo", 0x2018), ("rsquo", 0x2019), + ("sbquo", 0x201A), ("ldquo", 0x201C), ("rdquo", 0x201D), ("bdquo", 0x201E), + ("dagger", 0x2020), ("Dagger", 0x2021), ("bull", 0x2022), + ("hellip", 0x2026), ("permil", 0x2030), ("prime", 0x2032), + ("Prime", 0x2033), ("lsaquo", 0x2039), ("rsaquo", 0x203A), + ("oline", 0x203E), ("frasl", 0x2044), ("euro", 0x20AC), + ("image", 0x2111), ("weierp", 0x2118), ("real", 0x211C), + ("trade", 0x2122), ("alefsym", 0x2135), ("larr", 0x2190), + ("uarr", 0x2191), ("rarr", 0x2192), ("darr", 0x2193), + ("harr", 0x2194), ("crarr", 0x21B5), ("lArr", 0x21D0), + ("uArr", 0x21D1), ("rArr", 0x21D2), ("dArr", 0x21D3), + ("hArr", 0x21D4), ("forall", 0x2200), ("part", 0x2202), + ("exist", 0x2203), ("empty", 0x2205), ("nabla", 0x2207), + ("isin", 0x2208), ("notin", 0x2209), ("ni", 0x220B), + ("prod", 0x220F), ("sum", 0x2211), ("minus", 0x2212), + ("lowast", 0x2217), ("radic", 0x221A), ("prop", 0x221D), + ("infin", 0x221E), ("ang", 0x2220), ("and", 0x2227), + ("or", 0x2228), ("cap", 0x2229), ("cup", 0x222A), + ("int", 0x222B), ("there4", 0x2234), ("sim", 0x223C), + ("cong", 0x2245), ("asymp", 0x2248), ("ne", 0x2260), + ("equiv", 0x2261), ("le", 0x2264), ("ge", 0x2265), + ("sub", 0x2282), ("sup", 0x2283), ("nsub", 0x2284), + ("sube", 0x2286), ("supe", 0x2287), ("oplus", 0x2295), + ("otimes", 0x2297), ("perp", 0x22A5), ("sdot", 0x22C5), + ("lceil", 0x2308), ("rceil", 0x2309), ("lfloor", 0x230A), + ("rfloor", 0x230B), ("lang", 0x2329), ("rang", 0x232A), + ("loz", 0x25CA), ("spades", 0x2660), ("clubs", 0x2663), + ("hearts", 0x2665), ("diams", 0x2666)] proc binaryStrSearch(x: openarray[string], y: string): int = ## XXX put this into the library somewhere! @@ -125,110 +236,121 @@ proc binaryStrSearch(x: openarray[string], y: string): int = while a <= b: var mid = (a + b) div 2 var c = cmp(x[mid], y) - if c < 0: - a = mid + 1 - elif c > 0: - b = mid - 1 - else: - return mid + if c < 0: a = mid + 1 + elif c > 0: b = mid - 1 + else: return mid result = - 1 proc htmlTag*(n: PXmlNode): THtmlTag = - ## gets `n`'s tag as a ``THtmlTag``. Even though results are cached, this is - ## can be more expensive than comparing ``tag`` directly to a string. + ## gets `n`'s tag as a ``THtmlTag``. if n.clientData == 0: n.clientData = binaryStrSearch(tagStrs, n.tag)+1 result = THtmlTag(n.clientData) -proc parseElement(x: var TXmlParser, doc: var PDocument): PElement = - var n = doc.createElement("") +proc entityToUtf8*(entity: string): string = + ## converts an HTML entity name like ``Ü`` to its UTF-8 equivalent. + ## "" is returned if the entity name is unknown. The HTML parser + ## already converts entities to UTF-8. + for name, val in items(entities): + if name == entity: + return toUTF8(TRune(val)) + result = "" + +proc addNode(father, son: PXmlNode) = + if son != nil: add(father, son) + +proc parse(x: var TXmlParser, errors: var seq[string]): PXmlNode - while True: - case x.kind() +proc untilElementEnd(x: var TXmlParser, result: PXmlNode, + errors: var seq[string]) = + if result.htmlTag in singleTags: + if x.kind != xmlElementEnd or cmpIgnoreCase(x.elementName, result.tag) != 0: + return + while true: + case x.kind + of xmlElementEnd: + if cmpIgnoreCase(x.elementName, result.tag) == 0: + next(x) + else: + errors.add(errorMsg(x, "</" & result.tag & "$1> expected")) + # do not skip it here! + break of xmlEof: + errors.add(errorMsg(x, "</" & result.tag & "$1> expected")) break - of xmlElementStart: - if n.tagName() != "": - n.appendChild(parseElement(x, doc)) - else: - n = doc.createElement(x.elementName) - of xmlElementOpen: - if n.tagName() != "": - n.appendChild(parseElement(x, doc)) - else: - if x.elementName.contains(':'): - #TODO: NamespaceURI - n = doc.createElementNS("nil", x.elementName) - else: - n = doc.createElement(x.elementName) - - of xmlElementEnd: - if x.elementName == n.nodeName: - # n.normalize() # Remove any whitespace etc. - return n - else: #The wrong element is ended - raise newException(EMismatchedTag, "Mismatched tag at line " & - $x.getLine() & " column " & $x.getColumn) - - of xmlCharData: - n.appendChild(parseText(x, doc)) - of xmlAttribute: - if x.attrKey.contains(':'): - #TODO: NamespaceURI - n.setAttributeNS("nil", x.attrKey, x.attrValue) - else: - n.setAttribute(x.attrKey, x.attrValue) - of xmlCData: - n.appendChild(doc.createCDATASection(x.charData())) - of xmlComment: - n.appendChild(doc.createComment(x.charData())) - of xmlPI: - n.appendChild(doc.createProcessingInstruction(x.PIName(), x.PIRest())) - - of xmlWhitespace, xmlElementClose, xmlEntity, xmlSpecial: - # Unused 'events' - else: - raise newException(EParserError, "Unexpected XML Parser event") - x.next() + result.addNode(parse(x, errors)) - raise newException(EMismatchedTag, - "Mismatched tag at line " & $x.getLine() & " column " & $x.getColumn) - - -proc parse*(x: var TXmlParser, father: PXmlNode) = - +proc parse(x: var TXmlParser, errors: var seq[string]): PXmlNode = + case x.kind + of xmlComment: + result = newComment(x.charData) + next(x) + of xmlCharData, xmlWhitespace: + result = newText(x.charData) + next(x) + of xmlPI, xmlSpecial: + # we just ignore processing instructions for now + next(x) + of xmlError: + errors.add(errorMsg(x)) + next(x) + of xmlElementStart: + result = newElement(x.elementName) + next(x) + untilElementEnd(x, result, errors) + of xmlElementEnd: + errors.add(errorMsg(x, "unexpected ending tag: " & x.elementName)) + of xmlElementOpen: + result = newElement(x.elementName) + next(x) + result.attr = newStringTable() + while true: + case x.kind + of xmlAttribute: + result.attr[x.attrKey] = x.attrValue + next(x) + of xmlElementClose: + next(x) + break + of xmlError: + errors.add(errorMsg(x)) + next(x) + break + else: + errors.add(errorMsg(x, "'>' expected")) + next(x) + break + untilElementEnd(x, result, errors) + of xmlAttribute, xmlElementClose: + errors.add(errorMsg(x, "<some_tag> expected")) + next(x) + of xmlCData: + result = newCData(x.charData) + next(x) + of xmlEntity: + var u = entityToUtf8(x.entityName) + if u.len != 0: result = newText(u) + next(x) + of xmlEof: nil proc parseHtml*(s: PStream, filename: string, errors: var seq[string]): PXmlNode = - ## parses the HTML from stream `s` and returns a ``PXmlNode``. Every + ## parses the XML from stream `s` and returns a ``PXmlNode``. Every ## occured parsing error is added to the `errors` sequence. var x: TXmlParser open(x, s, filename, {reportComments}) - - result = newElement("html") - while true: - x.next() - case x.kind - of xmlWhitespace: nil # just skip it - of xmlComment: - result.add(newComment(x.text)) - - while True: - x.next() - case x.kind - of xmlEof: break - of xmlElementStart, xmlElementOpen: - var el: PElement = parseElement(x, XmlDoc) - XmlDoc = dom.createDocument(el) - of xmlWhitespace, xmlElementClose, xmlEntity, xmlSpecial: - # Unused 'events' - else: - raise newException(EParserError, "Unexpected XML Parser event") + next(x) + # skip the DOCTYPE: + if x.kind == xmlSpecial: next(x) + result = parse(x, errors) + while x.kind != xmlEof: + errors.add(errorMsg(x, "EOF expected")) + result.addNode(parse(x, errors)) close(x) proc parseHtml*(s: PStream): PXmlNode = - ## parses the HTML from stream `s` and returns a ``PXmlNode``. All parsing + ## parses the XTML from stream `s` and returns a ``PXmlNode``. All parsing ## errors are ignored. var errors: seq[string] = @[] result = parseHtml(s, "unknown_html_doc", errors) @@ -236,7 +358,7 @@ proc parseHtml*(s: PStream): PXmlNode = proc loadHtml*(path: string, reportErrors = false): PXmlNode = ## Loads and parses HTML from file specified by ``path``, and returns ## a ``PXmlNode``. If `reportErrors` is true, the parsing errors are - ## ``echo``ed. + ## ``echo``ed, otherwise they are ignored. var s = newFileStream(path, fmRead) if s == nil: raise newException(EIO, "Unable to read file: " & path) @@ -245,3 +367,16 @@ proc loadHtml*(path: string, reportErrors = false): PXmlNode = if reportErrors: for msg in items(errors): echo(msg) +when true: + nil +else: + proc checkHtmlAux(n: PXmlNode, errors: var seq[string]) = + nil + + proc checkHtmlStructure*(n: PXmlNode, errors: var seq[string]) = + ## checks the HTML structure after parsing for other errors like + ## a ``<h1>`` element within a ``<p>`` element. + if n == nil or n.htmlTag != tagHtml: + errors.add("<html> tag expected") + checkHtmlAux(n, errors) + \ No newline at end of file diff --git a/lib/pure/httpclient.nim b/lib/pure/httpclient.nim index 43eab0404..0f9054873 100755..100644 --- a/lib/pure/httpclient.nim +++ b/lib/pure/httpclient.nim @@ -60,14 +60,6 @@ type ## and ``postContent`` proc, ## when the server returns an error -template newException(exceptn, message: expr): expr = - block: # open a new scope - var - e: ref exceptn - new(e) - e.msg = message - e - proc httpError(msg: string) = var e: ref EInvalidProtocol new(e) diff --git a/lib/pure/httpserver.nim b/lib/pure/httpserver.nim index 2c85d8137..2c85d8137 100755..100644 --- a/lib/pure/httpserver.nim +++ b/lib/pure/httpserver.nim diff --git a/lib/pure/lexbase.nim b/lib/pure/lexbase.nim index bb207e92a..bb207e92a 100755..100644 --- a/lib/pure/lexbase.nim +++ b/lib/pure/lexbase.nim diff --git a/lib/pure/logging.nim b/lib/pure/logging.nim index 6df39f50b..6df39f50b 100755..100644 --- a/lib/pure/logging.nim +++ b/lib/pure/logging.nim diff --git a/lib/pure/macros.nim b/lib/pure/macros.nim index 677469ed2..677469ed2 100755..100644 --- a/lib/pure/macros.nim +++ b/lib/pure/macros.nim diff --git a/lib/pure/math.nim b/lib/pure/math.nim index cf4b6d95c..cf4b6d95c 100755..100644 --- a/lib/pure/math.nim +++ b/lib/pure/math.nim diff --git a/lib/pure/md5.nim b/lib/pure/md5.nim index e75f80b4c..e75f80b4c 100755..100644 --- a/lib/pure/md5.nim +++ b/lib/pure/md5.nim diff --git a/lib/pure/os.nim b/lib/pure/os.nim index 1879fb5db..ef526993a 100755..100644 --- a/lib/pure/os.nim +++ b/lib/pure/os.nim @@ -26,15 +26,6 @@ else: include "system/ansi_c" -# copied from excpt.nim, because I don't want to make this template public -template newException(exceptn, message: expr): expr = - block: # open a new scope - var - e: ref exceptn - new(e) - e.msg = message - e - const doslike = defined(windows) or defined(OS2) or defined(DOS) # DOS-like filesystem diff --git a/lib/pure/osproc.nim b/lib/pure/osproc.nim index bbdea1eee..bbdea1eee 100755..100644 --- a/lib/pure/osproc.nim +++ b/lib/pure/osproc.nim diff --git a/lib/pure/parsecfg.nim b/lib/pure/parsecfg.nim index c26dab099..c26dab099 100755..100644 --- a/lib/pure/parsecfg.nim +++ b/lib/pure/parsecfg.nim diff --git a/lib/pure/parsecsv.nim b/lib/pure/parsecsv.nim index 5970f2090..5970f2090 100755..100644 --- a/lib/pure/parsecsv.nim +++ b/lib/pure/parsecsv.nim diff --git a/lib/pure/parseopt.nim b/lib/pure/parseopt.nim index 8f4be98f4..8f4be98f4 100755..100644 --- a/lib/pure/parseopt.nim +++ b/lib/pure/parseopt.nim diff --git a/lib/pure/parsesql.nim b/lib/pure/parsesql.nim index 2109c273a..2109c273a 100755..100644 --- a/lib/pure/parsesql.nim +++ b/lib/pure/parsesql.nim diff --git a/lib/pure/parseurl.nim b/lib/pure/parseurl.nim index cd3bc621a..cd3bc621a 100755..100644 --- a/lib/pure/parseurl.nim +++ b/lib/pure/parseurl.nim diff --git a/lib/pure/parseutils.nim b/lib/pure/parseutils.nim index 04d2a7973..0f107793c 100755..100644 --- a/lib/pure/parseutils.nim +++ b/lib/pure/parseutils.nim @@ -14,15 +14,6 @@ {.push debugger:off .} # the user does not want to trace a part # of the standard library! -# copied from excpt.nim, because I don't want to make this template public -template newException(exceptn, message: expr): expr = - block: # open a new scope - var - e: ref exceptn - new(e) - e.msg = message - e - const Whitespace = {' ', '\t', '\v', '\r', '\l', '\f'} IdentChars = {'a'..'z', 'A'..'Z', '0'..'9', '_'} diff --git a/lib/pure/parsexml.nim b/lib/pure/parsexml.nim index 598ae6c68..598ae6c68 100755..100644 --- a/lib/pure/parsexml.nim +++ b/lib/pure/parsexml.nim diff --git a/lib/pure/pegs.nim b/lib/pure/pegs.nim index 5ba0351ad..5ba0351ad 100755..100644 --- a/lib/pure/pegs.nim +++ b/lib/pure/pegs.nim diff --git a/lib/pure/re.nim b/lib/pure/re.nim index 953f9c744..953f9c744 100755..100644 --- a/lib/pure/re.nim +++ b/lib/pure/re.nim diff --git a/lib/pure/regexprs.nim b/lib/pure/regexprs.nim index 43c7f05be..43c7f05be 100755..100644 --- a/lib/pure/regexprs.nim +++ b/lib/pure/regexprs.nim diff --git a/lib/pure/ropes.nim b/lib/pure/ropes.nim index aa793b4f3..df85baf92 100755..100644 --- a/lib/pure/ropes.nim +++ b/lib/pure/ropes.nim @@ -21,15 +21,6 @@ {.push debugger:off .} # the user does not want to trace a part # of the standard library! -# copied from excpt.nim, because I don't want to make this template public -template newException(exceptn, message: expr): expr = - block: # open a new scope - var - e: ref exceptn - new(e) - e.msg = message - e - const countCacheMisses = false diff --git a/lib/pure/sockets.nim b/lib/pure/sockets.nim index 85628db78..85628db78 100755..100644 --- a/lib/pure/sockets.nim +++ b/lib/pure/sockets.nim diff --git a/lib/pure/streams.nim b/lib/pure/streams.nim index f4d2911fc..f4d2911fc 100755..100644 --- a/lib/pure/streams.nim +++ b/lib/pure/streams.nim diff --git a/lib/pure/strtabs.nim b/lib/pure/strtabs.nim index 8ea59637a..8ea59637a 100755..100644 --- a/lib/pure/strtabs.nim +++ b/lib/pure/strtabs.nim diff --git a/lib/pure/strutils.nim b/lib/pure/strutils.nim index fe70130e5..2fd2aaeef 100755..100644 --- a/lib/pure/strutils.nim +++ b/lib/pure/strutils.nim @@ -17,16 +17,6 @@ import parseutils {.push debugger:off .} # the user does not want to trace a part # of the standard library! -# copied from excpt.nim, because I don't want to make this template public -template newException(exceptn, message: expr): expr = - block: # open a new scope - var - e: ref exceptn - new(e) - e.msg = message - e - - type TCharSet* = set[char] # for compatibility with Nim diff --git a/lib/pure/terminal.nim b/lib/pure/terminal.nim index 42bd80cb4..42bd80cb4 100755..100644 --- a/lib/pure/terminal.nim +++ b/lib/pure/terminal.nim diff --git a/lib/pure/times.nim b/lib/pure/times.nim index a54af3254..a54af3254 100755..100644 --- a/lib/pure/times.nim +++ b/lib/pure/times.nim diff --git a/lib/pure/unicode.nim b/lib/pure/unicode.nim index bebbe56c5..bebbe56c5 100755..100644 --- a/lib/pure/unicode.nim +++ b/lib/pure/unicode.nim diff --git a/lib/pure/unidecode/gen.py b/lib/pure/unidecode/gen.py index 8da0136ff..8da0136ff 100755..100644 --- a/lib/pure/unidecode/gen.py +++ b/lib/pure/unidecode/gen.py diff --git a/lib/pure/unidecode/unidecode.dat b/lib/pure/unidecode/unidecode.dat index 9dff0a4a9..9dff0a4a9 100755..100644 --- a/lib/pure/unidecode/unidecode.dat +++ b/lib/pure/unidecode/unidecode.dat diff --git a/lib/pure/unidecode/unidecode.nim b/lib/pure/unidecode/unidecode.nim index a665dd73e..a665dd73e 100755..100644 --- a/lib/pure/unidecode/unidecode.nim +++ b/lib/pure/unidecode/unidecode.nim diff --git a/lib/pure/variants.nim b/lib/pure/variants.nim index f661f81a6..f661f81a6 100755..100644 --- a/lib/pure/variants.nim +++ b/lib/pure/variants.nim diff --git a/lib/pure/xmldom.nim b/lib/pure/xmldom.nim index 4e9d721d7..76c666de0 100755..100644 --- a/lib/pure/xmldom.nim +++ b/lib/pure/xmldom.nim @@ -34,14 +34,6 @@ type ESyntaxErr* = object of EDOMException ## If an invalid or illegal string is specified. EWrongDocumentErr* = object of EDOMException ## If a node is used in a different document than the one that created it (that doesn't support it) -template newException(exceptn, message: expr): expr = - block: # open a new scope - var - e: ref exceptn - new(e) - e.msg = message - e - const ElementNode* = 1 AttributeNode* = 2 diff --git a/lib/pure/xmldomparser.nim b/lib/pure/xmldomparser.nim index 9df60cab8..b73baf1ff 100755..100644 --- a/lib/pure/xmldomparser.nim +++ b/lib/pure/xmldomparser.nim @@ -17,14 +17,6 @@ type #Parsing errors EMismatchedTag* = object of E_Base ## Raised when a tag is not properly closed EParserError* = object of E_Base ## Raised when an unexpected XML Parser event occurs - -template newException(exceptn, message: expr): expr = - block: # open a new scope - var - e: ref exceptn - new(e) - e.msg = message - e proc parseText(x: var TXmlParser, doc: var PDocument): PText = result = doc.createTextNode(x.charData()) diff --git a/lib/pure/xmlgen.nim b/lib/pure/xmlgen.nim index 29f2700f2..29f2700f2 100755..100644 --- a/lib/pure/xmlgen.nim +++ b/lib/pure/xmlgen.nim diff --git a/lib/pure/xmltree.nim b/lib/pure/xmltree.nim index 005969fc4..2b0977874 100755..100644 --- a/lib/pure/xmltree.nim +++ b/lib/pure/xmltree.nim @@ -63,7 +63,7 @@ proc newCData*(cdata: string): PXmlNode = proc newEntity*(entity: string): PXmlNode = ## creates a new ``PXmlNode`` of kind ``xnEntity`` with the text `entity`. result = newXmlNode(xnCData) - result.fText = cdata + result.fText = entity proc text*(n: PXmlNode): string {.inline.} = ## gets the associated text with the node `n`. `n` can be a CDATA, Text, diff --git a/lib/pure/xmltreeparser.nim b/lib/pure/xmltreeparser.nim index b7a9ba54a..bf2c05570 100755..100644 --- a/lib/pure/xmltreeparser.nim +++ b/lib/pure/xmltreeparser.nim @@ -9,7 +9,7 @@ ## This module parses an XML document and creates its XML tree representation. -import streams, parsexml, strtabs, xmltree, hxmlcommon +import streams, parsexml, strtabs, xmltree type EInvalidXml* = object of E_Base ## exception that is raised for invalid XML @@ -25,13 +25,30 @@ proc raiseInvalidXml(errors: seq[string]) = proc addNode(father, son: PXmlNode) = if son != nil: add(father, son) -proc parse*(x: var TXmlParser, errors: var seq[string]): PXmlNode = +proc untilElementEnd(x: var TXmlParser, result: PXmlNode, + errors: var seq[string]) = + while true: + case x.kind + of xmlElementEnd: + if x.elementName == result.tag: + next(x) + else: + errors.add(errorMsg(x, "</" & result.tag & "$1> expected")) + # do not skip it here! + break + of xmlEof: + errors.add(errorMsg(x, "</" & result.tag & "$1> expected")) + break + else: + result.addNode(parse(x, errors)) + +proc parse(x: var TXmlParser, errors: var seq[string]): PXmlNode = case x.kind of xmlComment: - result = newComment(x.text) + result = newComment(x.charData) next(x) of xmlCharData, xmlWhitespace: - result = newText(x.text) + result = newText(x.charData) next(x) of xmlPI, xmlSpecial: # we just ignore processing instructions for now @@ -42,23 +59,10 @@ proc parse*(x: var TXmlParser, errors: var seq[string]): PXmlNode = of xmlElementStart: ## ``<elem>`` result = newElement(x.elementName) next(x) - while true: - case x.kind - of xmlElementEnd: - if x.elementName == result.tag: - next(x) - else: - errors.add(errorMsg(x, "</$1> expected" % result.tag)) - # do not skip it here! - break - of xmlEof: - errors.add(errorMsg(x, "</$1> expected" % result.tag)) - break - else: - result.addNode(parse(x, errors)) - of xmlElementEnd: ## ``</elem>`` + untilElementEnd(x, result, errors) + of xmlElementEnd: errors.add(errorMsg(x, "unexpected ending tag: " & x.elementName)) - of xmlElementOpen: ## ``<elem + of xmlElementOpen: result = newElement(x.elementName) next(x) result.attr = newStringTable() @@ -75,12 +79,12 @@ proc parse*(x: var TXmlParser, errors: var seq[string]): PXmlNode = next(x) break else: - errors.add(errorMsg(x, "'>' expected" % result.tag)) + errors.add(errorMsg(x, "'>' expected")) next(x) break - + untilElementEnd(x, result, errors) of xmlAttribute, xmlElementClose: - errors.add(errorMsg(x, "<some_tag> expected") + errors.add(errorMsg(x, "<some_tag> expected")) next(x) of xmlCData: result = newCData(x.charData) @@ -107,7 +111,7 @@ proc parseXml*(s: PStream, filename: string, of xmlError: errors.add(errorMsg(x)) else: - errors.add(errorMsg(x, "<some_tag> expected") + errors.add(errorMsg(x, "<some_tag> expected")) break close(x) |