about summary refs log tree commit diff stats
diff options
context:
space:
mode:
authorbptato <nincsnevem662@gmail.com>2023-12-28 18:09:35 +0100
committerbptato <nincsnevem662@gmail.com>2023-12-28 18:09:35 +0100
commita24e6467ea2a3bb1260efbfbcb11a98d73b2bcc8 (patch)
treed42f06b9e11563b897eebf0edabf0a25641c8be3
parent897ef4021382de3630bb3d0b6a8c58686b09231f (diff)
downloadchawan-a24e6467ea2a3bb1260efbfbcb11a98d73b2bcc8.tar.gz
Fix tests 10 .. 17, add todo
-rw-r--r--chame/htmlparser.nim126
-rw-r--r--chame/minidom.nim13
-rw-r--r--tests/shared/tree_common.nim15
-rw-r--r--tests/tree.nim23
-rw-r--r--todo10
5 files changed, 139 insertions, 48 deletions
diff --git a/chame/htmlparser.nim b/chame/htmlparser.nim
index 7a9178c0..330961ee 100644
--- a/chame/htmlparser.nim
+++ b/chame/htmlparser.nim
@@ -488,8 +488,16 @@ proc resetInsertionMode(parser: var HTML5Parser) =
     if last:
       switch_insertion_mode_and_return IN_BODY
 
+func currentNodeToken[Handle, Atom](parser: HTML5Parser[Handle, Atom]):
+    OpenElement[Handle, Atom] =
+  return parser.openElements[^1]
+
 func currentNode[Handle, Atom](parser: HTML5Parser[Handle, Atom]): Handle =
-  return parser.openElements[^1].element
+  return parser.currentNodeToken.element
+
+func currentToken[Handle, Atom](parser: HTML5Parser[Handle, Atom]):
+    Token[Atom] =
+  return parser.currentNodeToken.token
 
 func adjustedCurrentNode[Handle, Atom](parser: HTML5Parser[Handle, Atom]):
     Handle =
@@ -503,7 +511,7 @@ func adjustedCurrentNodeToken[Handle, Atom](parser: HTML5Parser[Handle, Atom]):
   if parser.fragment:
     parser.opts.ctx.get
   else:
-    parser.openElements[^1]
+    parser.currentNodeToken
 
 func lastElementOfTag[Handle, Atom](parser: HTML5Parser[Handle, Atom],
     tagType: TagType): tuple[element: Option[Handle], pos: int] =
@@ -575,16 +583,20 @@ func hasElementInSpecificScope[Handle, Atom](parser: HTML5Parser[Handle, Atom],
       return false
     let namespace = parser.getNamespace(element)
     if namespace == Namespace.MATHML:
-      return localName notin [
+      let elements = [
         parser.atomMap[ATOM_MI], parser.atomMap[ATOM_MO],
         parser.atomMap[ATOM_MN], parser.atomMap[ATOM_MS],
         parser.atomMap[ATOM_MTEXT], parser.atomMap[ATOM_ANNOTATION_XML]
       ]
+      if localName in elements:
+        return false
     elif namespace == Namespace.SVG:
-      return localName notin [
+      let elements = [
         parser.atomMap[ATOM_FOREIGNOBJECT], parser.atomMap[ATOM_DESC],
         parser.atomMap[ATOM_TITLE]
       ]
+      if localName in elements:
+        return false
   assert false
 
 func hasElementInSpecificScope[Handle, Atom](parser: HTML5Parser[Handle, Atom],
@@ -599,16 +611,20 @@ func hasElementInSpecificScope[Handle, Atom](parser: HTML5Parser[Handle, Atom],
       return false
     let namespace = parser.getNamespace(element)
     if namespace == Namespace.MATHML:
-      return localName notin [
+      let elements = [
         parser.atomMap[ATOM_MI], parser.atomMap[ATOM_MO],
         parser.atomMap[ATOM_MN], parser.atomMap[ATOM_MS],
         parser.atomMap[ATOM_MTEXT], parser.atomMap[ATOM_ANNOTATION_XML]
       ]
+      if localName in elements:
+        return false
     elif namespace == Namespace.SVG:
-      return localName notin [
+      let elements = [
         parser.atomMap[ATOM_FOREIGNOBJECT], parser.atomMap[ATOM_DESC],
         parser.atomMap[ATOM_TITLE]
       ]
+      if localName in elements:
+        return false
   assert false
 
 func hasElementInSpecificScope[Handle, Atom](parser: HTML5Parser[Handle, Atom],
@@ -677,14 +693,15 @@ func findAttr[Atom](attrs: seq[ParsedAttr[Atom]], atom: Atom): int =
   return -1
 
 func createElement[Handle, Atom](parser: HTML5Parser[Handle, Atom],
-    token: Token, namespace: Namespace, intendedParent: Handle,
+    localName: Atom, namespace: Namespace, intendedParent: Handle,
     attrs: seq[ParsedAttr[Atom]]): Handle =
   #TODO custom elements
-  let element = parser.createElement(token.tagname, namespace, attrs)
-  if token.tagtype in FormAssociatedElements and parser.form.isSome and
+  let element = parser.createElement(localName, namespace, attrs)
+  let tagType = localName.toTagType()
+  if tagType in FormAssociatedElements and parser.form.isSome and
       not parser.hasElement(TAG_TEMPLATE) and
-      (token.tagtype notin ListedElements or
-        token.findAttr(parser.atomMap[ATOM_FORM]) == -1):
+      (tagType notin ListedElements or
+        attrs.findAttr(parser.atomMap[ATOM_FORM]) == -1):
     parser.associateWithForm(element, parser.form.get, intendedParent)
   return element
 
@@ -698,7 +715,7 @@ func createElement[Handle, Atom](parser: HTML5Parser[Handle, Atom],
     token: Token, namespace: Namespace, intendedParent: Handle): Handle =
   # attrs not adjusted
   let attrs = token.attrs.toParsedAttrs()
-  return parser.createElement(token, namespace, intendedParent, attrs)
+  return parser.createElement(token.tagname, namespace, intendedParent, attrs)
 
 proc pushElement[Handle, Atom](parser: var HTML5Parser[Handle, Atom],
     node: Handle, token: Token[Atom]) =
@@ -726,10 +743,11 @@ proc append[Handle, Atom](parser: HTML5Parser[Handle, Atom], parent, node: Handl
   parser.insertBefore(parent, node, none(Handle))
 
 proc insertForeignElement[Handle, Atom](parser: var HTML5Parser[Handle, Atom],
-    token: Token, namespace: Namespace, stackOnly: bool,
+    token: Token, localName: Atom, namespace: Namespace, stackOnly: bool,
     attrs: seq[ParsedAttr[Atom]]): Handle =
   let location = parser.appropriatePlaceForInsert()
-  let element = parser.createElement(token, namespace, location.inside, attrs)
+  let parent = location.inside
+  let element = parser.createElement(localName, namespace, parent, attrs)
   #TODO custom elements
   if not stackOnly:
     parser.insert(location, element)
@@ -739,7 +757,8 @@ proc insertForeignElement[Handle, Atom](parser: var HTML5Parser[Handle, Atom],
 proc insertForeignElement[Handle, Atom](parser: var HTML5Parser[Handle, Atom],
     token: Token, namespace: Namespace, stackOnly: bool): Handle =
   let attrs = token.attrs.toParsedAttrs()
-  parser.insertForeignElement(token, namespace, stackOnly, attrs)
+  let localName = token.tagname
+  parser.insertForeignElement(token, localName, namespace, stackOnly, attrs)
 
 proc insertHTMLElement[Handle, Atom](parser: var HTML5Parser[Handle, Atom],
     token: Token): Handle =
@@ -765,6 +784,10 @@ proc adjustSVGAttributes[Handle, Atom](parser: var HTML5Parser[Handle, Atom],
     parser.adjustedTable.withValue(attr.name, p):
       attr.name = p[]
 
+proc sortAttributes[Handle, Atom](parser: var HTML5Parser[Handle, Atom],
+    attrs: var seq[ParsedAttr[Atom]]) =
+  attrs.sort(func(a, b: ParsedAttr[Atom]): int = cmp(a.name, b.name))
+
 proc insertCharacter(parser: var HTML5Parser, data: string) =
   let location = parser.appropriatePlaceForInsert()
   if location.inside == parser.getDocument():
@@ -911,7 +934,8 @@ proc popElementsIncl(parser: var HTML5Parser, tags: set[TagType]) =
     discard
 
 # Pop all elements, including the specified element.
-proc popElementsIncl[Handle, Atom](parser: var HTML5Parser[Handle, Atom], handle: Handle) =
+proc popElementsIncl[Handle, Atom](parser: var HTML5Parser[Handle, Atom],
+    handle: Handle) =
   while parser.popElement() != handle:
     discard
 
@@ -1008,7 +1032,7 @@ proc reconstructActiveFormatting[Handle, Atom](parser: var HTML5Parser[Handle, A
         state = CREATE
         continue
       dec i
-      if entry.isSome and parser.findOpenElement(entry.get) != -1:
+      if entry.isSome and parser.findOpenElement(entry.get) == -1:
         continue
       state = ADVANCE
     of ADVANCE:
@@ -1052,7 +1076,7 @@ func isHTMLIntegrationPoint[Handle, Atom](parser: HTML5Parser[Handle, Atom],
       let i = token.findAttr(parser.atomMap[ATOM_ENCODING])
       if i != -1:
         return token.attrs[i].value in ["text/html", "application/xhtml+xml"]
-  elif namespace == Namespace.XML:
+  elif namespace == Namespace.SVG:
     let elements = [
       parser.atomMap[ATOM_FOREIGNOBJECT],
       parser.atomMap[ATOM_DESC],
@@ -1775,6 +1799,7 @@ proc processInHTMLContent[Handle, Atom](parser: var HTML5Parser[Handle, Atom],
           while parser.openElements.len > 1:
             pop_current_node
           discard parser.insertHTMLElement(token)
+          parser.insertionMode = IN_FRAMESET
       )
       TokenType.EOF => (block:
         if parser.templateModes.len > 0:
@@ -1909,8 +1934,7 @@ proc processInHTMLContent[Handle, Atom](parser: var HTML5Parser[Handle, Atom],
         if not parser.hasElement(TAG_TEMPLATE):
           let form = parser.form
           parser.form = none(Handle)
-          if form.isNone or
-              not parser.hasElementInScope(parser.getTagType(form.get)):
+          if form.isNone or not parser.hasElementInScope(form.get):
             parse_error ELEMENT_NOT_IN_SCOPE
             return
           let node = form.get
@@ -2097,7 +2121,10 @@ proc processInHTMLContent[Handle, Atom](parser: var HTML5Parser[Handle, Atom],
         parser.reconstructActiveFormatting()
         discard parser.insertHTMLElement(token)
         parser.framesetOk = false
-        if parser.insertionMode in {IN_TABLE, IN_CAPTION, IN_TABLE_BODY, IN_CELL}:
+        const TableInsertionModes = {
+          IN_TABLE, IN_CAPTION, IN_TABLE_BODY, IN_ROW, IN_CELL
+        }
+        if parser.insertionMode in TableInsertionModes:
           parser.insertionMode = IN_SELECT_IN_TABLE
         else:
           parser.insertionMode = IN_SELECT
@@ -2125,8 +2152,10 @@ proc processInHTMLContent[Handle, Atom](parser: var HTML5Parser[Handle, Atom],
         var attrs = token.attrs.toParsedAttrs()
         parser.adjustMathMLAttributes(attrs)
         parser.adjustForeignAttributes(attrs)
+        parser.sortAttributes(attrs)
         const ns = Namespace.MATHML
-        discard parser.insertForeignElement(token, ns, false, attrs)
+        let localName = token.tagname
+        discard parser.insertForeignElement(token, localName, ns, false, attrs)
         if token.selfclosing:
           pop_current_node
       )
@@ -2135,8 +2164,10 @@ proc processInHTMLContent[Handle, Atom](parser: var HTML5Parser[Handle, Atom],
         var attrs = token.attrs.toParsedAttrs()
         parser.adjustSVGAttributes(attrs)
         parser.adjustForeignAttributes(attrs)
+        parser.sortAttributes(attrs)
         const ns = Namespace.SVG
-        discard parser.insertForeignElement(token, ns, false, attrs)
+        let localName = token.tagname
+        discard parser.insertForeignElement(token, localName, ns, false, attrs)
         if token.selfclosing:
           pop_current_node
       )
@@ -2748,22 +2779,24 @@ proc processInForeignContent(parser: var HTML5Parser, token: Token) =
     parser.parseError(e)
 
   template any_other_end_tag() =
-    if parser.getLocalName(parser.currentNode) != token.tagname:
+    if parser.currentToken.tagname != token.tagname:
+      # Compare the start tag token, since it is guaranteed to be lower case.
+      # (The local name might have been adjusted to a non-lower-case string.)
       parse_error UNEXPECTED_END_TAG
-    var i = parser.openElements.high
-    while i >= 0:
-      if i > parser.openElements.high:
-        i = parser.openElements.high
-      let node = parser.openElements[i].element
-      if parser.getLocalName(node) == token.tagname:
-        #TODO TODO TODO node tagname should be converted to ASCII lower
-        while parser.popElement() != node:
-          discard
+    for i in countdown(parser.openElements.high, 0): # loop
+      if i == 0: # fragment case
+        assert parser.fragment
         break
-      if parser.getNamespace(node) == Namespace.HTML:
+      let (node, nodeToken) = parser.openElements[i]
+      if i != parser.openElements.high and
+          parser.getNamespace(node) == Namespace.HTML:
+        parser.processInHTMLContent(token, parser.insertionMode)
+        break
+      if nodeToken.tagname == token.tagname:
+        # Compare the start tag token, since it is guaranteed to be lower case.
+        # (The local name might have been adjusted to a non-lower-case string.)
+        parser.popElementsIncl(node)
         break
-      parser.processInHTMLContent(token, parser.insertionMode)
-      dec i
 
   match token:
     TokenType.CHARACTER_NULL => (block:
@@ -2773,6 +2806,7 @@ proc processInForeignContent(parser: var HTML5Parser, token: Token) =
     TokenType.CHARACTER_WHITESPACE => (block: parser.insertCharacter(token.s))
     TokenType.CHARACTER => (block:
       parser.insertCharacter(token.s)
+      parser.framesetOk = false
     )
     TokenType.DOCTYPE => (block: parse_error UNEXPECTED_DOCTYPE)
     ("<b>", "<big>", "<blockquote>", "<body>", "<br>", "<center>", "<code>",
@@ -2783,7 +2817,7 @@ proc processInForeignContent(parser: var HTML5Parser, token: Token) =
      "<sup>", "<table>", "<tt>", "<u>", "<ul>", "<var>") => (block:
       parse_error UNEXPECTED_START_TAG
       while not parser.isMathMLIntegrationPoint(parser.currentNode) and
-          not parser.isHTMLIntegrationPoint(parser.openElements[^1]) and
+          not parser.isHTMLIntegrationPoint(parser.currentNodeToken) and
           parser.getNamespace(parser.currentNode) != Namespace.HTML:
         pop_current_node
       parser.processInHTMLContent(token, parser.insertionMode)
@@ -2791,14 +2825,17 @@ proc processInForeignContent(parser: var HTML5Parser, token: Token) =
     TokenType.START_TAG => (block:
       let namespace = parser.getNamespace(parser.adjustedCurrentNode)
       var attrs = token.attrs.toParsedAttrs()
+      var tagname = token.tagname
       if namespace == Namespace.SVG:
-        parser.caseTable.withValue(token.tagname, p):
-          token.tagname = p[]
+        parser.caseTable.withValue(tagname, p):
+          tagname = p[]
         parser.adjustSVGAttributes(attrs)
       elif namespace == Namespace.MATHML:
         parser.adjustMathMLAttributes(attrs)
       parser.adjustForeignAttributes(attrs)
-      discard parser.insertForeignElement(token, namespace, false, attrs)
+      parser.sortAttributes(attrs)
+      discard parser.insertForeignElement(token, tagname, namespace, false,
+        attrs)
       if token.selfclosing:
         if namespace == Namespace.SVG:
           script_end_tag
@@ -3016,8 +3053,13 @@ proc createForeignTable[Handle, Atom](parser: var HTML5Parser[Handle, Atom]) =
 
 proc parseHTML*[Handle, Atom](inputStream: Stream,
     dombuilder: DOMBuilder[Handle, Atom], opts: HTML5ParserOpts[Handle, Atom]) =
-  ## Parse an HTML document, using the DOMBuilder object `dombuilder`, and
-  ## parser options `opts`.
+  ## Read and parse an HTML document from input stream `inputStream`, using
+  ## the DOMBuilder object `dombuilder`, and parser options `opts`.
+  ##
+  ## The generic `Handle` must be the node handle type of the DOM builder. The
+  ## generic `Atom` must be the interned string type of the DOM builder.
+  ##
+  ## The input stream does not have to be seekable for this function.
   dombuilder.checkCallbacks()
   let tokstate = opts.initialTokenizerState
   let factory = dombuilder.getAtomFactory(dombuilder)
diff --git a/chame/minidom.nim b/chame/minidom.nim
index f90b0b83..1261c6c6 100644
--- a/chame/minidom.nim
+++ b/chame/minidom.nim
@@ -9,6 +9,7 @@
 ## For a variant that can switch encodings when meta tags are encountered etc.
 ## see `chame/minidom_cs <minidom.html>`.
 
+import std/algorithm
 import std/hashes
 import std/options
 import std/sets
@@ -88,11 +89,9 @@ func tagTypeToAtom(factory: AtomFactory[MAtom], tagType: TagType): MAtom =
 func atomToStr*(factory: MAtomFactory, atom: MAtom): string =
   return factory.atomMap[int(atom)]
 
-# Overload useful for testing in htmlparser:
-#[
+# Overload for debugging htmlparser:
 func atomToStr*(factory: AtomFactory[MAtom], atom: MAtom): string =
   cast[MAtomFactory](factory).atomToStr(atom)
-]#
 
 # Node types
 type
@@ -122,7 +121,7 @@ type
     localName*: MAtom
     namespace*: Namespace
     attrs*: seq[Attribute]
-    document* {.cursor.}: Document
+    document*: Document
 
 type
   MiniDOMBuilder* = ref object of DOMBuilder[Node, MAtom]
@@ -208,6 +207,7 @@ proc createElement(builder: DOMBuilder[Node, MAtom], localName: MAtom,
     document: builder.document,
     attrs: attrs
   )
+  assert element.document != nil and element.document.factory != nil
   for attr in element.attrs.mitems:
     attr.value = attr.value.toValidUTF8()
   return element
@@ -353,6 +353,7 @@ proc addAttrsIfMissing(builder: DOMBuilder[Node, MAtom], element: Node,
     if attr.name notin oldNames:
       let value = attr.value.toValidUTF8()
       element.attrs.add((NO_PREFIX, NO_NAMESPACE, attr.name, value))
+  element.attrs.sort(func(a, b: Attribute): int = cmp(a.name, b.name))
 
 proc initMiniDOMBuilder*(builder: MiniDOMBuilder) =
   builder.getDocument = getDocument
@@ -415,8 +416,10 @@ proc parseHTMLFragment*(inputStream: Stream, element: Element,
   let root = Element(
     nodeType: ELEMENT_NODE,
     localName: htmlAtom,
-    namespace: HTML
+    namespace: HTML,
+    document: document
   )
+  assert root.document != nil and root.document.factory != nil
   let rootToken = Token[MAtom](t: START_TAG, tagname: htmlAtom)
   document.childList = @[Node(root)]
   var opts = opts
diff --git a/tests/shared/tree_common.nim b/tests/shared/tree_common.nim
index a0543426..e1f1ad28 100644
--- a/tests/shared/tree_common.nim
+++ b/tests/shared/tree_common.nim
@@ -1,3 +1,4 @@
+import std/algorithm
 import std/options
 import std/strutils
 import std/tables
@@ -148,6 +149,11 @@ proc parseTestDocument(ctx: var TCTestParser): Document =
   template top: auto = stack[^1]
   var thistext: Text
   var indent = 1
+  template pop_node =
+    let node = stack.pop()
+    if node of Element:
+      Element(node).attrs.sort(proc(a, b: Attribute): int = cmp(a.name, b.name))
+    indent -= 2
   while ctx.has:
     let line = ctx.consumeLine()
     if line == "":
@@ -161,7 +167,9 @@ proc parseTestDocument(ctx: var TCTestParser): Document =
       continue
     assert line[0] == '|' and line[1] == ' '
     while indent >= line.len or not line.startsWith('|' & ' '.repeat(indent)):
-      discard stack.pop()
+      let node = stack.pop()
+      if node of Element:
+        Element(node).attrs.sort(proc(a, b: Attribute): int = cmp(a.name, b.name))
       indent -= 2
     let str = line.substr(indent + 1)
     if str.startsWith("<!DOCTYPE "):
@@ -221,6 +229,8 @@ proc parseTestDocument(ctx: var TCTestParser): Document =
       let na = ctx.factory.strToAtom(name)
       let value = ss[1][1..^2]
       Element(top).attrs.add((prefix, ns, na, value))
+  while indent > 1:
+    pop_node
 
 proc parseTest(ctx: var TCTestParser): TCTest =
   doAssert ctx.consumeLine() == "#data"
@@ -266,6 +276,9 @@ proc checkTest(nodein, nodep: Node) =
     let nodep = Element(nodep)
     check nodein.localName == nodep.localName
     check nodein.namespace == nodep.namespace
+    if nodein.attrs != nodep.attrs:
+      echo "NODEIN", $nodein
+      echo "NODEP", $nodep
     check nodein.attrs == nodep.attrs
   of ATTRIBUTE_NODE, ENTITY_REFERENCE_NODE, ENTITY_NODE,
       DOCUMENT_FRAGMENT_NODE, NOTATION_NODE:
diff --git a/tests/tree.nim b/tests/tree.nim
index 53a03750..ea269367 100644
--- a/tests/tree.nim
+++ b/tests/tree.nim
@@ -71,3 +71,26 @@ test "tests8.dat":
 
 test "tests9.dat":
   runTests("tests9.dat")
+
+test "tests10.dat":
+  runTests("tests10.dat")
+
+test "tests11.dat":
+  runTests("tests11.dat")
+
+test "tests12.dat":
+  runTests("tests12.dat")
+
+# no tests13 in html5lib-tests :)
+
+test "tests14.dat":
+  runTests("tests14.dat")
+
+test "tests15.dat":
+  runTests("tests15.dat")
+
+test "tests16.dat":
+  runTests("tests16.dat")
+
+test "tests17.dat":
+  runTests("tests17.dat")
diff --git a/todo b/todo
new file mode 100644
index 00000000..dca22c29
--- /dev/null
+++ b/todo
@@ -0,0 +1,10 @@
+optimizations:
+* use tagNameEquals/getLocalName less (we could just compare start tags most
+  of the time?)
+* go through TODOs in htmltokenizer
+* replace "bag of function pointers" with mixins
+* on-demand creation of attribute name atoms (most documents don't even
+  need atomMap); use ref array/Option?
+* htmltokenizer: do something with copyBuf, it's ugly and probably broken
+etc:
+* turn test1 into a serializer module