about summary refs log tree commit diff stats
path: root/src/html/chadombuilder.nim
diff options
context:
space:
mode:
Diffstat (limited to 'src/html/chadombuilder.nim')
-rw-r--r--src/html/chadombuilder.nim371
1 files changed, 242 insertions, 129 deletions
diff --git a/src/html/chadombuilder.nim b/src/html/chadombuilder.nim
index fa3b761d..21e94901 100644
--- a/src/html/chadombuilder.nim
+++ b/src/html/chadombuilder.nim
@@ -2,6 +2,7 @@ import std/deques
 import std/options
 import std/streams
 
+import html/catom
 import html/dom
 import html/enums
 import js/error
@@ -10,138 +11,167 @@ import js/javascript
 import types/url
 
 import chakasu/charset
+import chakasu/decoderstream
+import chakasu/encoderstream
 
 import chame/htmlparser
-import chame/htmltokenizer
 import chame/tags
 
 # DOMBuilder implementation for Chawan.
 
+type CharsetConfidence = enum
+  CONFIDENCE_TENTATIVE, CONFIDENCE_CERTAIN, CONFIDENCE_IRRELEVANT
+
 type
-  ChaDOMBuilder = ref object of DOMBuilder[Node]
+  ChaDOMBuilder = ref object of DOMBuilder[Node, CAtom]
+    charset: Charset
+    confidence: CharsetConfidence
+    document: Document
     isFragment: bool
+    factory: CAtomFactory
+    poppedScript: HTMLScriptElement
+
+type
+  DOMBuilderImpl = ChaDOMBuilder
+  HandleImpl = Node
+  AtomImpl = CAtom
+
+include chame/htmlparseriface
 
 type DOMParser = ref object # JS interface
 
 jsDestructor(DOMParser)
 
-template getDocument(dombuilder: ChaDOMBuilder): Document =
-  cast[Document](dombuilder.document)
+proc getDocumentImpl(builder: ChaDOMBuilder): Node =
+  ## Parser interface hook: hands back the document stored on the builder.
+  result = builder.document
+
+proc atomToTagTypeImpl(builder: ChaDOMBuilder, atom: CAtom): TagType =
+  ## Parser interface hook: converts `atom` to a `TagType` by delegating to
+  ## the builder's atom factory.
+  let factory = builder.factory
+  result = factory.toTagType(atom)
+
+proc tagTypeToAtomImpl(builder: ChaDOMBuilder, tagType: TagType): CAtom =
+  ## Parser interface hook: converts `tagType` to an atom by delegating to
+  ## the builder's atom factory.
+  let factory = builder.factory
+  result = factory.toAtom(tagType)
 
-proc finish(builder: DOMBuilder[Node]) =
-  let builder = cast[ChaDOMBuilder](builder)
-  let document = builder.getDocument()
-  while document.scriptsToExecOnLoad.len > 0:
+proc strToAtomImpl(builder: ChaDOMBuilder, s: string): CAtom =
+  ## Parser interface hook: converts the string `s` to an atom by delegating
+  ## to the builder's atom factory.
+  let factory = builder.factory
+  result = factory.toAtom(s)
+
+proc finish(builder: ChaDOMBuilder) =
+  while builder.document.scriptsToExecOnLoad.len > 0:
     #TODO spin event loop
-    let script = document.scriptsToExecOnLoad.popFirst()
+    let script = builder.document.scriptsToExecOnLoad.popFirst()
     script.execute()
   #TODO events
 
-proc restart(builder: DOMBuilder[Node]) =
-  let document = newDocument()
+proc restart(builder: ChaDOMBuilder) =
+  let document = newDocument(builder.factory)
   document.contentType = "text/html"
-  let oldDocument = cast[Document](builder.document)
+  let oldDocument = builder.document
   document.url = oldDocument.url
   let window = oldDocument.window
   if window != nil:
     document.window = window
     window.document = document
   builder.document = document
+  assert document.factory != nil
+
+proc setQuirksModeImpl(builder: ChaDOMBuilder, quirksMode: QuirksMode) =
+  ## Parser interface hook: stores `quirksMode` as the document's mode.
+  ## Does nothing when the document's
+  ## `parser_cannot_change_the_mode_flag` is set.
+  let document = builder.document
+  if document.parser_cannot_change_the_mode_flag:
+    return
+  document.mode = quirksMode
+
+proc setEncodingImpl(builder: ChaDOMBuilder, encoding: string):
+    SetEncodingResult =
+  ## Parser interface hook, called with an encoding label declared by the
+  ## document itself. Follows the HTML standard's "changing the encoding
+  ## while parsing" steps.
+  ##
+  ## Returns `SET_ENCODING_STOP` only when the parse has to be restarted
+  ## with a different charset; in that case `builder.charset` holds the
+  ## charset to restart with. Returns `SET_ENCODING_CONTINUE` otherwise.
+  ##
+  ## `builder.confidence` is raised to `CONFIDENCE_CERTAIN` whenever a known
+  ## label is accepted, whether or not a restart is requested.
+  # A declared encoding may only override a *tentative* guess. Without this
+  # guard, a conflicting declaration seen after a BOM, on a non-seekable
+  # stream or on the forced second pass would stop the parser with no
+  # restart following it, truncating the document. Fragment parsing works
+  # on an already decoded string and cannot handle a stop request at all.
+  if builder.isFragment or builder.confidence != CONFIDENCE_TENTATIVE:
+    return SET_ENCODING_CONTINUE
+  var charset = getCharset(encoding)
+  if charset == CHARSET_UNKNOWN:
+    return SET_ENCODING_CONTINUE
+  if builder.charset in {CHARSET_UTF_16_LE, CHARSET_UTF_16_BE}:
+    # The declaration was read successfully as UTF-16, so the current
+    # charset is evidently right; keep it.
+    builder.confidence = CONFIDENCE_CERTAIN
+    return SET_ENCODING_CONTINUE
+  # Normalise the requested charset *before* comparing it with the current
+  # one, so that an equivalent declaration does not trigger a restart.
+  if charset in {CHARSET_UTF_16_LE, CHARSET_UTF_16_BE}:
+    # The declaration was readable in an ASCII-compatible charset, so the
+    # document cannot really be UTF-16; the standard maps this to UTF-8.
+    charset = CHARSET_UTF_8
+  elif charset == CHARSET_X_USER_DEFINED:
+    charset = CHARSET_WINDOWS_1252
+  builder.confidence = CONFIDENCE_CERTAIN
+  if charset == builder.charset:
+    return SET_ENCODING_CONTINUE
+  builder.charset = charset
+  return SET_ENCODING_STOP
 
-proc parseError(builder: DOMBuilder[Node], message: string) =
-  discard
-
-proc setQuirksMode(builder: DOMBuilder[Node], quirksMode: QuirksMode) =
-  let builder = cast[ChaDOMBuilder](builder)
-  let document = builder.getDocument()
-  if not document.parser_cannot_change_the_mode_flag:
-    document.mode = quirksMode
-
-proc setCharacterSet(builder: DOMBuilder[Node], charset: Charset) =
-  let builder = cast[ChaDOMBuilder](builder)
-  let document = builder.getDocument()
-  document.charset = charset
-
-proc getTemplateContent(builder: DOMBuilder[Node], handle: Node): Node =
+proc getTemplateContentImpl(builder: ChaDOMBuilder, handle: Node): Node =
   return HTMLTemplateElement(handle).content
 
-proc getTagType(builder: DOMBuilder[Node], handle: Node): TagType =
-  return Element(handle).tagType
-
-proc getParentNode(builder: DOMBuilder[Node], handle: Node): Option[Node] =
+proc getParentNodeImpl(builder: ChaDOMBuilder, handle: Node): Option[Node] =
   return option(handle.parentNode)
 
-proc getLocalName(builder: DOMBuilder[Node], handle: Node): string =
+proc getLocalNameImpl(builder: ChaDOMBuilder, handle: Node): CAtom =
   return Element(handle).localName
 
-proc getNamespace(builder: DOMBuilder[Node], handle: Node): Namespace =
+proc getNamespaceImpl(builder: ChaDOMBuilder, handle: Node): Namespace =
   return Element(handle).namespace
 
-proc createElement(builder: DOMBuilder[Node], localName: string,
-    namespace: Namespace, tagType: TagType,
-    attrs: Table[string, string]): Node =
-  let builder = cast[ChaDOMBuilder](builder)
-  let document = builder.getDocument()
-  let element = document.newHTMLElement(localName, namespace,
-    tagType = tagType, attrs = attrs)
-  if element.isResettable():
+proc createHTMLElementImpl(builder: ChaDOMBuilder): Node =
+  ## Parser interface hook: creates a new `TAG_HTML` element through the
+  ## builder's document.
+  let document = builder.document
+  result = document.newHTMLElement(TAG_HTML)
+
+proc createElementForTokenImpl(builder: ChaDOMBuilder, localName: CAtom,
+    namespace: Namespace, intendedParent: Node, htmlAttrs: Table[CAtom, string],
+    xmlAttrs: seq[ParsedAttr[CAtom]]): Node =
+  let document = builder.document
+  let element = document.newHTMLElement(localName, namespace)
+  for k, v in htmlAttrs:
+    element.attr(k, v)
+  for attr in xmlAttrs:
+    element.attrns(attr.name, attr.prefix, attr.namespace, attr.value)
+  if element.tagType in ResettableElements:
     element.resetElement()
-  if tagType == TAG_SCRIPT:
+  if element of HTMLScriptElement:
     let script = HTMLScriptElement(element)
     script.parserDocument = document
     script.forceAsync = false
-    if builder.isFragment:
-      script.alreadyStarted = true
-      #TODO document.write (?)
+    # Note: per standard, we could set already started to true here when we
+    # are parsing from document.write, but that sounds like a horrible idea.
   return element
 
-proc createComment(builder: DOMBuilder[Node], text: string): Node =
-  let builder = cast[ChaDOMBuilder](builder)
-  return builder.getDocument().createComment(text)
+proc createCommentImpl(builder: ChaDOMBuilder, text: string): Node =
+  return builder.document.createComment(text)
 
-proc createDocumentType(builder: DOMBuilder[Node], name, publicId,
+proc createDocumentTypeImpl(builder: ChaDOMBuilder, name, publicId,
     systemId: string): Node =
-  let builder = cast[ChaDOMBuilder](builder)
-  return builder.getDocument().newDocumentType(name, publicId, systemId)
-
-proc insertBefore(builder: DOMBuilder[Node], parent, child,
-    before: Node) =
-  discard parent.insertBefore(child, before)
-
-proc insertText(builder: DOMBuilder[Node], parent: Node, text: string,
-    before: Node) =
-  let builder = cast[ChaDOMBuilder](builder)
-  let prevSibling = if before != nil:
-    before.previousSibling
+  return builder.document.newDocumentType(name, publicId, systemId)
+
+proc insertBeforeImpl(builder: ChaDOMBuilder, parent, child: Node,
+    before: Option[Node]) =
+  discard parent.insertBefore(child, before.get(nil))
+
+proc insertTextImpl(builder: ChaDOMBuilder, parent: Node, text: string,
+    before: Option[Node]) =
+  let prevSibling = if before.isSome:
+    before.get.previousSibling
   else:
     parent.lastChild
-  if prevSibling != nil and prevSibling.nodeType == TEXT_NODE:
+  if prevSibling != nil and prevSibling of Text:
     Text(prevSibling).data &= text
   else:
-    let text = builder.getDocument().createTextNode(text)
-    discard parent.insertBefore(text, before)
+    let text = builder.document.createTextNode(text)
+    discard parent.insertBefore(text, before.get(nil))
 
-proc remove(builder: DOMBuilder[Node], child: Node) =
+proc removeImpl(builder: ChaDOMBuilder, child: Node) =
   child.remove(suppressObservers = true)
 
-proc moveChildren(builder: DOMBuilder[Node], fromNode, toNode: Node) =
+proc moveChildrenImpl(builder: ChaDOMBuilder, fromNode, toNode: Node) =
   var tomove = fromNode.childList
   for node in tomove:
     node.remove(suppressObservers = true)
   for child in tomove:
     toNode.insert(child, nil)
 
-proc addAttrsIfMissing(builder: DOMBuilder[Node], element: Node,
-    attrs: Table[string, string]) =
-  let element = Element(element)
+proc addAttrsIfMissingImpl(builder: ChaDOMBuilder, handle: Node,
+    attrs: Table[CAtom, string]) =
+  let element = Element(handle)
   for k, v in attrs:
     if not element.attrb(k):
       element.attr(k, v)
 
-proc setScriptAlreadyStarted(builder: DOMBuilder[Node], script: Node) =
+proc setScriptAlreadyStartedImpl(builder: ChaDOMBuilder, script: Node) =
   HTMLScriptElement(script).alreadyStarted = true
 
-proc associateWithForm(builder: DOMBuilder[Node], element, form,
+proc associateWithFormImpl(builder: ChaDOMBuilder, element, form,
     intendedParent: Node) =
   if form.inSameTree(intendedParent):
     #TODO remove following test eventually
@@ -150,26 +180,17 @@ proc associateWithForm(builder: DOMBuilder[Node], element, form,
       element.setForm(HTMLFormElement(form))
       element.parserInserted = true
 
-proc elementPopped(builder: DOMBuilder[Node], element: Node) =
-  let builder = cast[ChaDOMBuilder](builder)
-  let document = builder.getDocument()
+proc elementPoppedImpl(builder: ChaDOMBuilder, element: Node) =
   let element = Element(element)
-  if element.tagType == TAG_TEXTAREA:
+  if element of HTMLTextAreaElement:
     element.resetElement()
-  elif element.tagType == TAG_SCRIPT:
-    #TODO microtask (maybe it works here too?)
-    let script = HTMLScriptElement(element)
-    #TODO document.write() (?)
-    script.prepare()
-    while document.parserBlockingScript != nil:
-      let script = document.parserBlockingScript
-      document.parserBlockingScript = nil
-      #TODO style sheet
-      script.execute()
-
-proc newChaDOMBuilder(url: URL, window: Window, isFragment = false):
-    ChaDOMBuilder =
-  let document = newDocument()
+  elif element of HTMLScriptElement:
+    assert builder.poppedScript == nil or not builder.document.scriptingEnabled
+    builder.poppedScript = HTMLScriptElement(element)
+
+proc newChaDOMBuilder(url: URL, window: Window, factory: CAtomFactory,
+    isFragment = false): ChaDOMBuilder =
+  let document = newDocument(factory)
   document.contentType = "text/html"
   document.url = url
   if window != nil:
@@ -177,36 +198,18 @@ proc newChaDOMBuilder(url: URL, window: Window, isFragment = false):
     window.document = document
   return ChaDOMBuilder(
     document: document,
-    finish: finish,
-    restart: restart,
-    setQuirksMode: setQuirksMode,
-    setCharacterSet: setCharacterSet,
-    elementPopped: elementPopped,
-    getTemplateContent: getTemplateContent,
-    getTagType: getTagType,
-    getParentNode: getParentNode,
-    getLocalName: getLocalName,
-    getNamespace: getNamespace,
-    createElement: createElement,
-    createComment: createComment,
-    createDocumentType: createDocumentType,
-    insertBefore: insertBefore,
-    insertText: insertText,
-    remove: remove,
-    moveChildren: moveChildren,
-    addAttrsIfMissing: addAttrsIfMissing,
-    setScriptAlreadyStarted: setScriptAlreadyStarted,
-    associateWithForm: associateWithForm,
-    #TODO isSVGIntegrationPoint (SVG support)
-    isFragment: isFragment
+    isFragment: isFragment,
+    factory: factory
   )
 
 # https://html.spec.whatwg.org/multipage/parsing.html#parsing-html-fragments
 proc parseHTMLFragment*(element: Element, s: string): seq[Node] =
   let url = parseURL("about:blank").get
-  let builder = newChaDOMBuilder(url, nil)
+  let factory = element.document.factory
+  let builder = newChaDOMBuilder(url, nil, factory)
+  let inputStream = newStringStream(s)
   builder.isFragment = true
-  let document = Document(builder.document)
+  let document = builder.document
   document.mode = element.document.mode
   let state = case element.tagType
   of TAG_TITLE, TAG_TEXTAREA: RCDATA
@@ -222,31 +225,139 @@ proc parseHTMLFragment*(element: Element, s: string): seq[Node] =
   else: DATA
   let root = document.newHTMLElement(TAG_HTML)
   document.append(root)
-  let opts = HTML5ParserOpts[Node](
+  let opts = HTML5ParserOpts[Node, CAtom](
     isIframeSrcdoc: false, #TODO?
     scripting: false,
-    canReinterpret: false,
-    charsets: @[CHARSET_UTF_8],
-    ctx: some(Node(element)),
+    ctx: some((Node(element), element.localName)),
     initialTokenizerState: state,
-    openElementsInit: @[Node(root)],
+    openElementsInit: @[(Node(root), root.localName)],
     pushInTemplate: element.tagType == TAG_TEMPLATE
   )
-  let inputStream = newStringStream(s)
-  parseHTML(inputStream, builder, opts)
+  var parser = initHTML5Parser(builder, opts)
+  var buffer: array[4096, char]
+  while true:
+    let n = inputStream.readData(addr buffer[0], buffer.len)
+    if n == 0: break
+    let res = parser.parseChunk(buffer.toOpenArray(0, n - 1))
+    assert res == PRES_CONTINUE # scripting is false, so this must be continue
+  parser.finish()
+  builder.finish()
   return root.childList
 
+#TODO this should be handled by decoderstream
+proc bomSniff(inputStream: Stream): Charset =
+  ## Checks the next bytes of `inputStream` for a byte order mark.
+  ##
+  ## Returns the matching charset and leaves the stream positioned just
+  ## after the mark. When no mark is recognised, seeks the stream back to
+  ## position 0 and returns `CHARSET_UNKNOWN`.
+  case inputStream.readStr(2)
+  of "\xFE\xFF":
+    return CHARSET_UTF_16_BE
+  of "\xFF\xFE":
+    return CHARSET_UTF_16_LE
+  of "\xEF\xBB":
+    # The UTF-8 mark is three bytes long; check the last one separately.
+    if inputStream.readChar() == '\xBF':
+      return CHARSET_UTF_8
+  else:
+    discard
+  inputStream.setPosition(0)
+  return CHARSET_UNKNOWN
+
 proc parseHTML*(inputStream: Stream, window: Window, url: URL,
-    charsets: seq[Charset] = @[], canReinterpret = true): Document =
-  let builder = newChaDOMBuilder(url, window)
-  let opts = HTML5ParserOpts[Node](
+    factory: CAtomFactory, charsets: seq[Charset] = @[],
+    seekable = true): Document =
+  let opts = HTML5ParserOpts[Node, CAtom](
     isIframeSrcdoc: false, #TODO?
-    scripting: window != nil and window.settings.scripting,
-    canReinterpret: canReinterpret,
-    charsets: charsets
+    scripting: window != nil and window.settings.scripting
   )
-  parseHTML(inputStream, builder, opts)
-  return Document(builder.document)
+  let builder = newChaDOMBuilder(url, window, factory)
+  var charsetStack: seq[Charset]
+  for i in countdown(charsets.high, 0):
+    charsetStack.add(charsets[i])
+  var seekable = seekable
+  var inputStream = inputStream
+  if seekable:
+    let scs = inputStream.bomSniff()
+    if scs != CHARSET_UNKNOWN:
+      charsetStack.add(scs)
+      builder.confidence = CONFIDENCE_CERTAIN
+      seekable = false
+  if charsetStack.len == 0:
+    charsetStack.add(DefaultCharset) # UTF-8
+  while true:
+    builder.charset = charsetStack.pop()
+    if seekable:
+      builder.confidence = CONFIDENCE_TENTATIVE # used in the next iteration
+    else:
+      builder.confidence = CONFIDENCE_CERTAIN
+    let em = if charsetStack.len == 0 or not seekable:
+      DECODER_ERROR_MODE_REPLACEMENT
+    else:
+      DECODER_ERROR_MODE_FATAL
+    let decoder = newDecoderStream(inputStream, builder.charset, errormode = em)
+    let encoder = newEncoderStream(decoder, CHARSET_UTF_8,
+      errormode = ENCODER_ERROR_MODE_FATAL)
+    var parser = initHTML5Parser(builder, opts)
+    let document = builder.document
+    var buffer: array[4096, char]
+    while true:
+      let n = encoder.readData(addr buffer[0], buffer.len)
+      if n == 0: break
+      var res = parser.parseChunk(buffer.toOpenArray(0, n - 1))
+      # set insertion point for when it's needed
+      var ip = parser.getInsertionPoint()
+      while res == PRES_SCRIPT:
+        if builder.poppedScript != nil:
+          #TODO microtask
+          document.writeBuffers.add(DocumentWriteBuffer())
+          builder.poppedScript.prepare()
+        while document.parserBlockingScript != nil:
+          let script = document.parserBlockingScript
+          document.parserBlockingScript = nil
+          #TODO style sheet
+          script.execute()
+          assert document.parserBlockingScript != script
+        builder.poppedScript = nil
+        if document.writeBuffers.len == 0:
+          if ip == n:
+            # nothing left to re-parse.
+            break
+          # parse rest of input buffer
+          res = parser.parseChunk(buffer.toOpenArray(ip, n - 1))
+          ip += parser.getInsertionPoint() # move insertion point
+        else:
+          let writeBuffer = document.writeBuffers[^1]
+          let p = writeBuffer.i
+          let n = writeBuffer.data.len
+          res = parser.parseChunk(writeBuffer.data.toOpenArray(p, n - 1))
+          case res
+          of PRES_CONTINUE:
+            discard document.writeBuffers.pop()
+            res = PRES_SCRIPT
+          of PRES_SCRIPT:
+            let pp = p + parser.getInsertionPoint()
+            if pp == writeBuffer.data.len:
+              discard document.writeBuffers.pop()
+            else:
+              writeBuffer.i = pp
+          of PRES_STOP:
+            break
+            {.linearScanEnd.}
+      # PRES_STOP is returned when we return SET_ENCODING_STOP from
+      # setEncodingImpl. We immediately stop parsing in this case.
+      if res == PRES_STOP:
+        break
+    parser.finish()
+    if builder.confidence == CONFIDENCE_CERTAIN and seekable:
+      # A meta tag describing the charset has been found; force use of this
+      # charset.
+      builder.restart()
+      inputStream.setPosition(0)
+      charsetStack.add(builder.charset)
+      seekable = false
+      continue
+    if decoder.failed and seekable:
+      # Retry with another charset.
+      builder.restart()
+      inputStream.setPosition(0)
+      continue
+    break
+  builder.finish()
+  return builder.document
 
 proc newDOMParser(): DOMParser {.jsctor.} =
   return DOMParser()
@@ -265,7 +376,9 @@ proc parseFromString(ctx: JSContext, parser: DOMParser, str, t: string):
       window.document.url
     else:
       newURL("about:blank").get
-    let res = parseHTML(newStringStream(str), Window(nil), url)
+    #TODO this is probably broken in client (or at least sub-optimal)
+    let factory = if window != nil: window.factory else: newCAtomFactory()
+    let res = parseHTML(newStringStream(str), Window(nil), url, factory)
     return ok(res)
   of "text/xml", "application/xml", "application/xhtml+xml", "image/svg+xml":
     return err(newInternalError("XML parsing is not supported yet"))