summary refs log tree commit diff stats
diff options
context:
space:
mode:
authorrec <44084068+recloser@users.noreply.github.com>2018-10-29 11:10:00 +0100
committerAndreas Rumpf <rumpf_a@web.de>2018-10-29 11:10:00 +0100
commit9fd0a71e4d4c6b3d451a137372973dcb4f4b47cc (patch)
treee60a2b3c247973c47c2446ce3d0b42df64f9d997
parent680f5eeb15583f45dc2dcdbc4f05a6557d09f31a (diff)
downloadNim-9fd0a71e4d4c6b3d451a137372973dcb4f4b47cc.tar.gz
Make htmlparser parse unquoted attrib values (#9537)
Fixes #6154
-rw-r--r--lib/pure/htmlparser.nim2
-rw-r--r--lib/pure/parsexml.nim20
-rw-r--r--tests/stdlib/thtmlparser.nim45
3 files changed, 65 insertions, 2 deletions
diff --git a/lib/pure/htmlparser.nim b/lib/pure/htmlparser.nim
index fbf2b8e73..9e1a5a101 100644
--- a/lib/pure/htmlparser.nim
+++ b/lib/pure/htmlparser.nim
@@ -2014,7 +2014,7 @@ proc parseHtml*(s: Stream, filename: string,
   ## Parses the XML from stream `s` and returns a ``XmlNode``. Every
   ## occurred parsing error is added to the `errors` sequence.
   var x: XmlParser
-  open(x, s, filename, {reportComments, reportWhitespace})
+  open(x, s, filename, {reportComments, reportWhitespace, allowUnquotedAttribs})
   next(x)
   # skip the DOCTYPE:
   if x.kind == xmlSpecial: next(x)
diff --git a/lib/pure/parsexml.nim b/lib/pure/parsexml.nim
index d8d5a7a2d..39b117d40 100644
--- a/lib/pure/parsexml.nim
+++ b/lib/pure/parsexml.nim
@@ -180,6 +180,7 @@ type
     errEqExpected,           ## ``=`` expected
     errQuoteExpected,        ## ``"`` or ``'`` expected
     errEndOfCommentExpected  ## ``-->`` expected
+    errAttributeValueExpected ## non-empty attribute value expected
 
   ParserState = enum
     stateStart, stateNormal, stateAttr, stateEmptyElementTag, stateError
@@ -187,6 +188,7 @@ type
   XmlParseOption* = enum  ## options for the XML parser
     reportWhitespace,      ## report whitespace
     reportComments         ## report comments
+    allowUnquotedAttribs   ## allow unquoted attribute values (for HTML)
 
   XmlParser* = object of BaseLexer ## the parser object.
     a, b, c: string
@@ -207,7 +209,8 @@ const
     "'>' expected",
     "'=' expected",
     "'\"' or \"'\" expected",
-    "'-->' expected"
+    "'-->' expected",
+    "attribute value expected"
   ]
 
 proc open*(my: var XmlParser, input: Stream, filename: string,
@@ -669,6 +672,21 @@ proc parseAttribute(my: var XmlParser) =
             pendingSpace = false
           add(my.b, buf[pos])
           inc(pos)
+  elif allowUnquotedAttribs in my.options:
+    const disallowedChars = {'"', '\'', '`', '=', '<', '>', ' ',
+                             '\0', '\t', '\L', '\F', '\f'}
+    let startPos = pos
+    while (let c = buf[pos]; c notin disallowedChars):
+      if c == '&':
+        my.bufpos = pos
+        parseEntity(my, my.b)
+        my.kind = xmlAttribute # parseEntity overwrites my.kind!
+        pos = my.bufpos
+      else:
+        add(my.b, c)
+        inc(pos)
+    if pos == startPos:
+      markError(my, errAttributeValueExpected)
   else:
     markError(my, errQuoteExpected)
     # error corrections: guess what was meant
diff --git a/tests/stdlib/thtmlparser.nim b/tests/stdlib/thtmlparser.nim
index d59e8b302..58b2d0377 100644
--- a/tests/stdlib/thtmlparser.nim
+++ b/tests/stdlib/thtmlparser.nim
@@ -78,3 +78,48 @@ block t2814:
       echo "case " & ltype[0] & " failed !"
       quit(2)
   echo "true"
+
+block t6154:
+  let foo = """
+  <!DOCTYPE html>
+  <html>
+      <head>
+        <title> foobar </title>
+      </head>
+      <body>
+        <p class=foo id=bar></p>
+        <p something=&#9;foo&#9;bar&#178;></p>
+        <p something=  &#9;foo&#9;bar&#178; foo  =bloo></p>
+        <p class="foo2" id="bar2"></p>
+        <p wrong= ></p>
+      </body>
+  </html>
+  """
+
+  var errors: seq[string] = @[]
+  let html = parseHtml(newStringStream(foo), "statichtml", errors=errors)
+  doAssert "statichtml(11, 18) Error: attribute value expected" in errors
+  let ps = html.findAll("p")
+  doAssert ps.len == 5
+
+  doAssert ps[0].attrsLen == 2
+  doAssert ps[0].attr("class") == "foo"
+  doAssert ps[0].attr("id") == "bar"
+  doassert ps[0].len == 0
+
+  doAssert ps[1].attrsLen == 1
+  doAssert ps[1].attr("something") == "\tfoo\tbar²"
+  doassert ps[1].len == 0
+
+  doAssert ps[2].attrsLen == 2
+  doAssert ps[2].attr("something") == "\tfoo\tbar²"
+  doAssert ps[2].attr("foo") == "bloo"
+  doassert ps[2].len == 0
+
+  doAssert ps[3].attrsLen == 2
+  doAssert ps[3].attr("class") == "foo2"
+  doAssert ps[3].attr("id") == "bar2"
+  doassert ps[3].len == 0
+
+  doAssert ps[4].attrsLen == 1
+  doAssert ps[4].attr("wrong") == ""