about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author bptato <nincsnevem662@gmail.com> 2023-08-15 10:15:39 +0200
committer bptato <nincsnevem662@gmail.com> 2023-08-15 10:15:39 +0200
commit f73a6683469df17608af51a1e7e582615e43f972 (patch)
tree a361c058418f3e169607b848b024299445c87bbf
parent 29a36a1d898e2b26878a6ea6b78f4a314e595856 (diff)
download chawan-f73a6683469df17608af51a1e7e582615e43f972.tar.gz
Add restart callback, implement setCharacterSet
restart is mainly needed for resetting the document node.
setCharacterSet now works (albeit somewhat differently than previously
specified.)
-rw-r--r-- chame/htmlparser.nim 39
-rw-r--r-- chame/minidom.nim 6
2 files changed, 38 insertions, 7 deletions
diff --git a/chame/htmlparser.nim b/chame/htmlparser.nim
index e23db583..afe96ef1 100644
--- a/chame/htmlparser.nim
+++ b/chame/htmlparser.nim
@@ -24,6 +24,8 @@ type
     ## Must never be nil.
     finish*: DOMBuilderFinish[Handle]
     ## May be nil.
+    restart*: DOMBuilderRestart[Handle]
+    ## May be nil.
     parseError*: DOMBuilderParseError[Handle]
     ## May be nil.
     setQuirksMode*: DOMBuilderSetQuirksMode[Handle]
@@ -76,9 +78,12 @@ type
     ## Note: this only works if inputStream is seekable, i.e.
     ## inputStream.setPosition(0) must work correctly.
     ##
-    ## Note 2: when this canReinterpret is false, confidence is set to
-    ## certain, no BOM sniffing is performed and meta charset tags are
-    ## disregarded. Expect this to change in the future.
+    ## Note 2: when canReinterpret is false, confidence is set to certain,
+    ## no BOM sniffing is performed and meta charset tags are
+    ## disregarded.
+    ## Expect this to change in the future. (In particular, this should not
+    ## be necessary for ASCII-compatible decoders that have only seen ASCII
+    ## characters, i.e. for most <meta charset=...> tags.)
     charsets*: seq[Charset]
     ## Fallback charsets. If empty, UTF-8 is used. In most cases, an empty
     ## sequence or a single-element sequence consisting of a character set
@@ -111,6 +116,12 @@ type
     proc(builder: DOMBuilder[Handle]) {.nimcall.}
       ## Parsing has finished.
 
+  DOMBuilderRestart*[Handle] =
+    proc(builder: DOMBuilder[Handle]) {.nimcall.}
+      ## Parsing has been restarted. This is required if charset switching
+      ## is enabled; in this case, restart must reset all properties of the
+      ## document handle, and remove all of its child nodes.
+
   DOMBuilderParseError*[Handle] =
     proc(builder: DOMBuilder[Handle], message: ParseError) {.nimcall.}
       ## Parse error. `message` is an error code either specified by the
@@ -125,7 +136,9 @@ type
 
   DOMBuilderSetCharacterSet*[Handle] =
     proc(builder: DOMBuilder[Handle], charset: Charset) {.nimcall.}
-      ## Set the recognized charset, if it differs from the initial input.
+      ## Set the charset used in the current parsing attempt.
+      ## Note that this is called for all attempts, i.e. at least once
+      ## for every parse.
 
   DOMBuilderElementPopped*[Handle] =
     proc(builder: DOMBuilder[Handle], element: Handle) {.nimcall.}
@@ -287,6 +300,10 @@ proc finish[Handle](parser: HTML5Parser[Handle]) =
   if parser.dombuilder.finish != nil:
     parser.dombuilder.finish(parser.dombuilder)
 
+proc restart[Handle](parser: HTML5Parser[Handle]) =
+  if parser.dombuilder.restart != nil:
+    parser.dombuilder.restart(parser.dombuilder)
+
 proc parseError(parser: HTML5Parser, e: ParseError) =
   if parser.dombuilder.parseError != nil:
     parser.dombuilder.parseError(parser.dombuilder, e)
@@ -296,6 +313,11 @@ proc setQuirksMode[Handle](parser: var HTML5Parser[Handle], mode: QuirksMode) =
   if parser.dombuilder.setQuirksMode != nil:
     parser.dombuilder.setQuirksMode(parser.dombuilder, mode)
 
+proc setCharacterSet[Handle](parser: var HTML5Parser[Handle],
+    charset: Charset) =
+  if parser.dombuilder.setCharacterSet != nil:
+    parser.dombuilder.setCharacterSet(parser.dombuilder, charset)
+
 func document[Handle](parser: HTML5Parser[Handle]): Handle {.inline.} =
   return parser.dombuilder.document
 
@@ -2713,6 +2735,8 @@ proc parseHTML*[Handle](inputStream: Stream, dombuilder: DOMBuilder[Handle],
       canReinterpret = false
   if charsetStack.len == 0:
     charsetStack.add(DefaultCharset) # UTF-8
+  var previousCharset = CHARSET_UNKNOWN
+  var first = true
   while true:
     let charset = charsetStack.pop()
     var parser = HTML5Parser[Handle](
@@ -2721,6 +2745,13 @@ proc parseHTML*[Handle](inputStream: Stream, dombuilder: DOMBuilder[Handle],
       charset: charset,
       opts: opts
     )
+    if charset != previousCharset:
+      parser.setCharacterSet(charset)
+      previousCharset = charset
+    if first:
+      first = false
+    else:
+      parser.restart()
     confidence = CONFIDENCE_TENTATIVE # used in the next iteration
     if not canReinterpret:
       parser.confidence = CONFIDENCE_CERTAIN
diff --git a/chame/minidom.nim b/chame/minidom.nim
index 8a61f2fa..81fc6b98 100644
--- a/chame/minidom.nim
+++ b/chame/minidom.nim
@@ -44,8 +44,8 @@ type
 type
   MiniDOMBuilder = ref object of DOMBuilder[Node]
 
-proc finish(builder: DOMBuilder[Node]) =
-  let builder = cast[MiniDOMBuilder](builder)
+proc restart(builder: DOMBuilder[Node]) =
+  builder.document = Document(nodeType: DOCUMENT_NODE)
 
 proc getParentNode(builder: DOMBuilder[Node], handle: Node): Option[Node] =
   return option(handle.parentNode)
@@ -194,7 +194,7 @@ proc newMiniDOMBuilder(): MiniDOMBuilder =
   let document = Document(nodeType: DOCUMENT_NODE)
   return MiniDOMBuilder(
     document: document,
-    finish: finish,
+    restart: restart,
     getTagType: getTagType,
     getParentNode: getParentNode,
     getLocalName: getLocalName,