diff options
-rw-r--r-- | lib/pure/htmlparser.nim | 2 | ||||
-rw-r--r-- | lib/pure/parsexml.nim | 20 | ||||
-rw-r--r-- | tests/stdlib/thtmlparser.nim | 45 |
3 files changed, 65 insertions, 2 deletions
diff --git a/lib/pure/htmlparser.nim b/lib/pure/htmlparser.nim index fbf2b8e73..9e1a5a101 100644 --- a/lib/pure/htmlparser.nim +++ b/lib/pure/htmlparser.nim @@ -2014,7 +2014,7 @@ proc parseHtml*(s: Stream, filename: string, ## Parses the XML from stream `s` and returns a ``XmlNode``. Every ## occurred parsing error is added to the `errors` sequence. var x: XmlParser - open(x, s, filename, {reportComments, reportWhitespace}) + open(x, s, filename, {reportComments, reportWhitespace, allowUnquotedAttribs}) next(x) # skip the DOCTYPE: if x.kind == xmlSpecial: next(x) diff --git a/lib/pure/parsexml.nim b/lib/pure/parsexml.nim index d8d5a7a2d..39b117d40 100644 --- a/lib/pure/parsexml.nim +++ b/lib/pure/parsexml.nim @@ -180,6 +180,7 @@ type errEqExpected, ## ``=`` expected errQuoteExpected, ## ``"`` or ``'`` expected errEndOfCommentExpected ## ``-->`` expected + errAttributeValueExpected ## non-empty attribute value expected ParserState = enum stateStart, stateNormal, stateAttr, stateEmptyElementTag, stateError @@ -187,6 +188,7 @@ type XmlParseOption* = enum ## options for the XML parser reportWhitespace, ## report whitespace reportComments ## report comments + allowUnquotedAttribs ## allow unquoted attribute values (for HTML) XmlParser* = object of BaseLexer ## the parser object. a, b, c: string @@ -207,7 +209,8 @@ const "'>' expected", "'=' expected", "'\"' or \"'\" expected", - "'-->' expected" + "'-->' expected", + "attribute value expected" ] proc open*(my: var XmlParser, input: Stream, filename: string, @@ -669,6 +672,21 @@ proc parseAttribute(my: var XmlParser) = pendingSpace = false add(my.b, buf[pos]) inc(pos) + elif allowUnquotedAttribs in my.options: + const disallowedChars = {'"', '\'', '`', '=', '<', '>', ' ', + '\0', '\t', '\L', '\F', '\f'} + let startPos = pos + while (let c = buf[pos]; c notin disallowedChars): + if c == '&': + my.bufpos = pos + parseEntity(my, my.b) + my.kind = xmlAttribute # parseEntity overwrites my.kind! 
+ pos = my.bufpos + else: + add(my.b, c) + inc(pos) + if pos == startPos: + markError(my, errAttributeValueExpected) else: markError(my, errQuoteExpected) # error corrections: guess what was meant diff --git a/tests/stdlib/thtmlparser.nim b/tests/stdlib/thtmlparser.nim index d59e8b302..58b2d0377 100644 --- a/tests/stdlib/thtmlparser.nim +++ b/tests/stdlib/thtmlparser.nim @@ -78,3 +78,48 @@ block t2814: echo "case " & ltype[0] & " failed !" quit(2) echo "true" + +block t6154: + let foo = """ + <!DOCTYPE html> + <html> + <head> + <title> foobar </title> + </head> + <body> + <p class=foo id=bar></p> + <p something=&#9;foo&#9;bar²></p> + <p something= &#9;foo&#9;bar² foo =bloo></p> + <p class="foo2" id="bar2"></p> + <p wrong= ></p> + </body> + </html> + """ + + var errors: seq[string] = @[] + let html = parseHtml(newStringStream(foo), "statichtml", errors=errors) + doAssert "statichtml(11, 18) Error: attribute value expected" in errors + let ps = html.findAll("p") + doAssert ps.len == 5 + + doAssert ps[0].attrsLen == 2 + doAssert ps[0].attr("class") == "foo" + doAssert ps[0].attr("id") == "bar" + doassert ps[0].len == 0 + + doAssert ps[1].attrsLen == 1 + doAssert ps[1].attr("something") == "\tfoo\tbar²" + doassert ps[1].len == 0 + + doAssert ps[2].attrsLen == 2 + doAssert ps[2].attr("something") == "\tfoo\tbar²" + doAssert ps[2].attr("foo") == "bloo" + doassert ps[2].len == 0 + + doAssert ps[3].attrsLen == 2 + doAssert ps[3].attr("class") == "foo2" + doAssert ps[3].attr("id") == "bar2" + doassert ps[3].len == 0 + + doAssert ps[4].attrsLen == 1 + doAssert ps[4].attr("wrong") == "" |