about summary refs log tree commit diff stats
path: root/src/html
diff options
context:
space:
mode:
author: bptato <nincsnevem662@gmail.com> 2022-06-27 23:53:44 +0200
committer: bptato <nincsnevem662@gmail.com> 2022-07-11 21:08:10 +0200
commit: 62cba694e47a7a1f4bedc7fd48ceac9c26aa3aa1 (patch)
tree: e20a9f39a293c256f707162c46e117d13f3d5621 /src/html
parent: 84882cb8a6f9bca58d178a1f2b8fb5cafa8b3a56 (diff)
download: chawan-62cba694e47a7a1f4bedc7fd48ceac9c26aa3aa1.tar.gz
Implement HTML5 parsing
Completely replaced the previous HTML2 (?) parser, which was a bad
re-implementation of w3m's parser in the first place. Now we have a
(sort of) compliant HTML5 parser.
Needs tests, badly.
Diffstat (limited to 'src/html')
-rw-r--r--  src/html/dom.nim            436
-rw-r--r--  src/html/htmlparser.nim    3887
-rw-r--r--  src/html/htmltokenizer.nim 1525
-rw-r--r--  src/html/tags.nim            75
4 files changed, 3856 insertions, 2067 deletions
diff --git a/src/html/dom.nim b/src/html/dom.nim
index aaa33163..6c63e1ad 100644
--- a/src/html/dom.nim
+++ b/src/html/dom.nim
@@ -1,5 +1,7 @@
 import tables
 import options
+import streams
+import strformat
 import strutils
 
 import css/values
@@ -17,25 +19,33 @@ type
     FORM_ENCODING_TYPE_MULTIPART = "multipart/form-data",
     FORM_ENCODING_TYPE_TEXT_PLAIN = "text/plain"
 
+  QuirksMode* = enum
+    NO_QUIRKS, QUIRKS, LIMITED_QUIRKS
+
+  Namespace* = enum
+    HTML = "http://www.w3.org/1999/xhtml",
+    MATHML = "http://www.w3.org/1998/Math/MathML",
+    SVG = "http://www.w3.org/2000/svg",
+    XLINK = "http://www.w3.org/1999/xlink",
+    XML = "http://www.w3.org/XML/1998/namespace",
+    XMLNS = "http://www.w3.org/2000/xmlns/"
+
 type
-  EventTarget* = ref EventTargetObj
-  EventTargetObj = object of RootObj
+  EventTarget* = ref object of RootObj
 
-  Node* = ref NodeObj
-  NodeObj = object of EventTargetObj
+  Node* = ref object of EventTarget
     nodeType*: NodeType
     childNodes*: seq[Node]
-    children*: seq[Element]
     isConnected*: bool
     nextSibling*: Node
     previousSibling*: Node
     parentNode*: Node
     parentElement*: Element
-    ownerDocument*: Document
+    rootNode: Node
+    document*: Document
     uid*: int # Unique id
 
-  Attr* = ref AttrObj
-  AttrObj = object of NodeObj
+  Attr* = ref object of Node
     namespaceURI*: string
     prefix*: string
     localName*: string
@@ -43,38 +53,40 @@ type
     value*: string
     ownerElement*: Element
 
-  Document* = ref DocumentObj
-  DocumentObj = object of NodeObj
+  Document* = ref object of Node
     location*: Url
     type_elements*: array[TagType, seq[Element]]
-    id_elements*: Table[string, seq[Element]]
     class_elements*: Table[string, seq[Element]]
     all_elements*: seq[Element]
-    head*: HTMLElement
-    body*: HTMLElement
-    root*: Element
+    mode*: QuirksMode
+
+    parser_cannot_change_the_mode_flag*: bool
+    is_iframe_srcdoc*: bool
 
-  CharacterData* = ref CharacterDataObj
-  CharacterDataObj = object of NodeObj
+  CharacterData* = ref object of Node
     data*: string
     length*: int
 
-  Text* = ref TextObj
-  TextObj = object of CharacterDataObj
+  Text* = ref object of CharacterData
     wholeText*: string
 
-  Comment* = ref CommentObj
-  CommentObj = object of CharacterDataObj
+  Comment* = ref object of CharacterData
 
-  Element* = ref ElementObj
-  ElementObj = object of NodeObj
-    namespaceURI*: string
+  DocumentFragment* = ref object of Node
+    host*: Element
+
+  DocumentType* = ref object of Node
+    name*: string
+    publicId*: string
+    systemId*: string
+
+  Element* = ref object of Node
+    namespace*: Namespace
+    namespacePrefix*: Option[string] #TODO namespaces
     prefix*: string
     localName*: string
-    tagName*: string
     tagType*: TagType
 
-    sheets*: seq[CSSStylesheet]
     id*: string
     classList*: seq[string]
     attributes*: Table[string, string]
@@ -84,7 +96,7 @@ type
     cssapplied*: bool
     rendered*: bool
 
-  HTMLElement* = ref object of ElementObj
+  HTMLElement* = ref object of Element
 
   HTMLInputElement* = ref object of HTMLElement
     inputType*: InputType
@@ -131,6 +143,8 @@ type
     ordinalvalue*: int
 
   HTMLStyleElement* = ref object of HTMLElement
+    sheet*: CSSStylesheet
+    sheet_invalid*: bool
 
   HTMLLinkElement* = ref object of HTMLElement
     href*: string
@@ -145,6 +159,22 @@ type
     constructingentrylist*: bool
     inputs*: seq[HTMLInputElement]
 
+  HTMLTemplateElement* = ref object of HTMLElement
+    content*: DocumentFragment
+
+  HTMLUnknownElement* = ref object of HTMLElement
+
+  HTMLScriptElement* = ref object of HTMLElement
+    parserDocument*: Document
+    preparationTimeDocument*: Document
+    forceAsync*: bool
+    fromAnExternalFile*: bool
+    readyToBeParser*: bool
+    alreadyStarted*: bool
+    delayingTheLoadEvent*: bool
+    ctype*: bool
+    #TODO result
+
 # For debugging
 func `$`*(node: Node): string =
   case node.nodeType
@@ -177,7 +207,7 @@ iterator radiogroup*(input: HTMLInputElement): HTMLInputElement {.inline.} =
     for input in input.form.radiogroup:
       yield input
   else:
-    for input in input.ownerDocument.radiogroup:
+    for input in input.document.radiogroup:
       yield input
 
 iterator textNodes*(node: Node): Text {.inline.} =
@@ -197,7 +227,68 @@ iterator branch*(node: Node): Node {.inline.} =
   var node = node
   while node != nil:
     yield node
-    node = node.parentElement
+    node = node.parentNode
+
iterator children*(node: Node): Element {.inline.} =
  ## Yields each child of `node` that is an element, in tree order.
  for n in node.childNodes:
    if n.nodeType == ELEMENT_NODE:
      yield Element(n)
+
func qualifiedName*(element: Element): string =
  ## The element's qualified name: "prefix:localName" when a namespace
  ## prefix is present, otherwise just the local name.
  let prefix = element.namespacePrefix
  if prefix.isNone:
    return element.localName
  return prefix.get & ':' & element.localName
+
func html*(document: Document): HTMLElement =
  ## The document's root <html> element, or nil if there is none.
  for child in document.children:
    if child.tagType == TAG_HTML:
      return HTMLElement(child)
  return nil
+
func head*(document: Document): HTMLElement =
  ## The document's <head> element, or nil if there is none.
  let root = document.html
  if root == nil:
    return nil
  for child in root.children:
    if child.tagType == TAG_HEAD:
      return HTMLElement(child)
  return nil
+
func body*(document: Document): HTMLElement =
  ## The document's <body> element, or nil if there is none.
  let root = document.html
  if root == nil:
    return nil
  for child in root.children:
    if child.tagType == TAG_BODY:
      return HTMLElement(child)
  return nil
+
+
func countChildren(node: Node, nodeType: NodeType): int =
  ## Count the direct children of `node` that have the given node type.
  var count = 0
  for child in node.childNodes:
    if child.nodeType == nodeType:
      inc count
  return count
+
func hasChild(node: Node, nodeType: NodeType): bool =
  ## True if `node` has at least one direct child of the given node type.
  for child in node.childNodes:
    if child.nodeType == nodeType:
      # A match must report true; returning false here made this
      # predicate constant-false.
      return true
  return false
+
func hasNextSibling(node: Node, nodeType: NodeType): bool =
  ## True if any sibling after `node` has the given node type.
  var sibling = node.nextSibling
  while sibling != nil:
    if sibling.nodeType == nodeType:
      return true
    sibling = sibling.nextSibling
  return false
+
func hasPreviousSibling(node: Node, nodeType: NodeType): bool =
  ## True if any sibling before `node` has the given node type.
  var sibling = node.previousSibling
  while sibling != nil:
    if sibling.nodeType == nodeType:
      return true
    sibling = sibling.previousSibling
  return false
+
func inSameTree*(a, b: Node): bool =
  ## True if `a` and `b` share the same non-nil root node.
  if a.rootNode == nil:
    return false
  return a.rootNode == b.rootNode
+
func children*(node: Node): seq[Element] =
  ## The node's element children, collected into a sequence.
  var elems: seq[Element]
  for elem in node.children:
    elems.add(elem)
  return elems
 
 func filterDescendants*(element: Element, predicate: (proc(child: Element): bool)): seq[Element] =
   var stack: seq[Element]
@@ -231,7 +322,7 @@ func firstChild(node: Node): Node =
     return nil
   return node.childNodes[0]
 
-func lastChild(node: Node): Node =
+func lastChild*(node: Node): Node =
   if node.childNodes.len == 0:
     return nil
   return node.childNodes[^1]
@@ -262,27 +353,6 @@ func nextElementSibling*(elem: Element): Element =
     e = e.nextSibling
   return nil
 
-func isTextNode*(node: Node): bool =
-  return node.nodeType == TEXT_NODE
-
-func isElemNode*(node: Node): bool =
-  return node.nodeType == ELEMENT_NODE
-
-func isComment*(node: Node): bool =
-  return node.nodeType == COMMENT_NODE
-
-func isCData*(node: Node): bool =
-  return node.nodeType == CDATA_SECTION_NODE
-
-func isDocument*(node: Node): bool =
-  return node.nodeType == DOCUMENT_NODE
-
-func firstNode*(node: Node): bool =
-  return node.parentElement != nil and node.parentElement.childNodes[0] == node
-
-func lastNode*(node: Node): bool =
-  return node.parentElement != nil and node.parentElement.childNodes[^1] == node
-
 func attr*(element: Element, s: string): string =
   return element.attributes.getOrDefault(s, "")
 
@@ -309,31 +379,13 @@ func textContent*(node: Node): string =
       if child.nodeType != COMMENT_NODE:
         result &= child.textContent
 
-func toInputType*(str: string): InputType =
-  case str
-  of "button": INPUT_BUTTON
-  of "checkbox": INPUT_CHECKBOX
-  of "color": INPUT_COLOR
-  of "date": INPUT_DATE
-  of "datetime_local": INPUT_DATETIME_LOCAL
-  of "email": INPUT_EMAIL
-  of "file": INPUT_FILE
-  of "hidden": INPUT_HIDDEN
-  of "image": INPUT_IMAGE
-  of "month": INPUT_MONTH
-  of "number": INPUT_NUMBER
-  of "password": INPUT_PASSWORD
-  of "radio": INPUT_RADIO
-  of "range": INPUT_RANGE
-  of "reset": INPUT_RESET
-  of "search": INPUT_SEARCH
-  of "submit": INPUT_SUBMIT
-  of "tel": INPUT_TEL
-  of "text": INPUT_TEXT
-  of "time": INPUT_TIME
-  of "url": INPUT_URL
-  of "week": INPUT_WEEK
-  else: INPUT_UNKNOWN
proc sheets*(element: Element): seq[CSSStylesheet] =
  ## Collect the stylesheets of the element's child <style> elements,
  ## re-parsing a child's sheet only when it has been invalidated.
  for child in element.children:
    if child.tagType == TAG_STYLE:
      let style = HTMLStyleElement(child)
      if style.sheet_invalid:
        style.sheet = parseStylesheet(newStringStream(style.textContent))
        # Mark the parsed sheet as valid; leaving the flag set forced a
        # full re-parse on every call.
        style.sheet_invalid = false
      result.add(style.sheet)
 
 func inputString*(input: HTMLInputElement): string =
   var text = case input.inputType
@@ -431,7 +483,7 @@ func formmethod*(element: Element): FormMethod =
 func target*(element: Element): string =
   if element.attrb("target"):
     return element.attr("target")
-  for base in element.ownerDocument.elements(TAG_BASE):
+  for base in element.document.elements(TAG_BASE):
     if base.attrb("target"):
       return base.attr("target")
   return ""
@@ -442,15 +494,27 @@ func findAncestor*(node: Node, tagTypes: set[TagType]): Element =
       return element
   return nil
 
-func newText*(): Text =
func newText*(document: Document, data: string = ""): Text =
  ## Create a Text node owned by `document`, holding `data`.
  ## Until inserted, the node is its own root.
  new(result)
  result.nodeType = TEXT_NODE
  result.data = data
  result.document = document
  result.rootNode = result
 
-func newComment*(): Comment =
func newComment*(document: Document, data: string = ""): Comment =
  ## Create a Comment node owned by `document`, holding `data`.
  ## Until inserted, the node is its own root.
  new(result)
  result.nodeType = COMMENT_NODE
  result.data = data
  result.document = document
  result.rootNode = result
+
func namespace(s: string): Option[Namespace] =
  ## Map a namespace URI string to its Namespace enum value, if any.
  for ns in Namespace:
    if $ns == s:
      return some(ns)
  return none(Namespace)
 
-func newHtmlElement*(document: Document, tagType: TagType): HTMLElement =
+# note: we do not implement custom elements
+func newHTMLElement*(document: Document, tagType: TagType, namespace = Namespace.HTML, prefix = Option[string]): HTMLElement =
   case tagType
   of TAG_INPUT:
     result = new(HTMLInputElement)
@@ -478,10 +542,19 @@ func newHtmlElement*(document: Document, tagType: TagType): HTMLElement =
     result = new(HTMLLIElement)
   of TAG_STYLE:
     result = new(HTMLStyleElement)
+    HTMLStyleElement(result).sheet_invalid = true
   of TAG_LINK:
     result = new(HTMLLinkElement)
   of TAG_FORM:
     result = new(HTMLFormElement)
+  of TAG_TEMPLATE:
+    result = new(HTMLTemplateElement)
+    HTMLTemplateElement(result).content = DocumentFragment(document: document, host: result)
+  of TAG_UNKNOWN:
+    result = new(HTMLUnknownElement)
+  of TAG_SCRIPT:
+    result = new(HTMLScriptElement)
+    HTMLScriptElement(result).forceAsync = true
   else:
     result = new(HTMLElement)
 
@@ -489,26 +562,63 @@ func newHtmlElement*(document: Document, tagType: TagType): HTMLElement =
   result.tagType = tagType
   result.css = rootProperties()
   result.uid = document.all_elements.len
+  result.rootNode = result
+  result.document = document
   document.all_elements.add(result)
 
func newHTMLElement*(document: Document, localName: string, namespace = "", prefix = none[string](), tagType = tagType(localName)): Element =
  ## Create an element from a local name; an empty or unrecognized
  ## namespace URI falls back to the HTML namespace.
  # Qualify the call: the `namespace` parameter shadows the `namespace`
  # proc, so the unqualified `namespace(namespace)` resolves to the
  # string parameter instead of the URI-mapping proc.
  result = document.newHTMLElement(tagType, dom.namespace(namespace).get(HTML))
  result.namespacePrefix = prefix
+
 func newDocument*(): Document =
   new(result)
-  result.root = result.newHtmlElement(TAG_HTML)
-  result.head = result.newHtmlElement(TAG_HEAD)
-  result.body = result.newHtmlElement(TAG_BODY)
   result.nodeType = DOCUMENT_NODE
+  result.rootNode = result
+  result.document = result
+
func newDocumentType*(document: Document, name: string, publicId = "", systemId = ""): DocumentType =
  ## Create a DocumentType node with the given name and identifiers.
  new(result)
  # The node type was previously left at the enum default; doctype checks
  # (e.g. preInsertionValidity) depend on it being DOCUMENT_TYPE_NODE.
  result.nodeType = DOCUMENT_TYPE_NODE
  result.document = document
  result.name = name
  result.publicId = publicId
  result.systemId = systemId
  result.rootNode = result
 
 func newAttr*(parent: Element, key, value: string): Attr =
   new(result)
+  result.document = parent.document
   result.nodeType = ATTRIBUTE_NODE
   result.ownerElement = parent
   result.name = key
   result.value = value
+  result.rootNode = result
 
#TODO optimize?
func getElementById*(document: Document, id: string): Element =
  ## Depth-first search for the first element whose id equals `id`.
  ## Returns nil for the empty string or when nothing matches.
  if id.len == 0:
    return nil
  var stack = document.children
  while stack.len > 0:
    let elem = stack.pop()
    if elem.id == id:
      return elem
    # Push element children in reverse so the stack pops in tree order.
    for i in countdown(elem.childNodes.high, 0):
      if elem.childNodes[i].nodeType == ELEMENT_NODE:
        stack.add(Element(elem.childNodes[i]))
  return nil
+
#TODO optimize?
func getElementsByTag*(document: Document, tag: TagType): seq[Element] =
  ## Collect every element with the given tag type, in tree order,
  ## using an explicit DFS stack.
  var stack = document.children
  while stack.len > 0:
    let elem = stack.pop()
    if elem.tagType == tag:
      result.add(elem)
    # Push element children in reverse so the stack pops in tree order.
    for i in countdown(elem.childNodes.high, 0):
      if elem.childNodes[i].nodeType == ELEMENT_NODE:
        stack.add(Element(elem.childNodes[i]))
 
 func baseUrl*(document: Document): Url =
   var href = ""
@@ -522,8 +632,139 @@ func baseUrl*(document: Document): Url =
     return document.location
   return url.get
 
-func getElementsByTag*(document: Document, tag: TagType): seq[Element] =
-  return document.type_elements[tag]
# Namespace predicates: which XML namespace an element belongs to.
func inHTMLNamespace*(element: Element): bool = element.namespace == Namespace.HTML
func inMathMLNamespace*(element: Element): bool = element.namespace == Namespace.MATHML
func inSVGNamespace*(element: Element): bool = element.namespace == Namespace.SVG
func inXLinkNamespace*(element: Element): bool = element.namespace == Namespace.XLINK
func inXMLNamespace*(element: Element): bool = element.namespace == Namespace.XML
func inXMLNSNamespace*(element: Element): bool = element.namespace == Namespace.XMLNS
+
func isResettable*(element: Element): bool =
  ## Whether the element is a resettable form control.
  element.tagType in {TAG_INPUT, TAG_OUTPUT, TAG_SELECT, TAG_TEXTAREA}
+
func isHostIncludingInclusiveAncestor*(a, b: Node): bool =
  ## DOM spec: `a` is a host-including inclusive ancestor of `b` if `a` is
  ## an inclusive ancestor of `b`, or if `b`'s root is a DocumentFragment
  ## with a non-nil host and `a` is a host-including inclusive ancestor of
  ## that host.
  for parent in b.branch:
    if parent == a:
      return true
  if b.rootNode != nil and b.rootNode.nodeType == DOCUMENT_FRAGMENT_NODE:
    let host = DocumentFragment(b.rootNode).host
    if host != nil:
      # Recurse through the fragment's host, per the spec; walking the
      # root's own branch (as before) only ever visits the root itself,
      # since a root has no parent.
      return a.isHostIncludingInclusiveAncestor(host)
  return false
+
# WARNING the ordering of the arguments in the standard is whack so this doesn't match it
func preInsertionValidity*(parent, node, before: Node): bool =
  ## DOM "ensure pre-insertion validity": returns false wherever the spec
  ## would throw (the exception name is noted in a comment at each check).
  if parent.nodeType notin {DOCUMENT_NODE, DOCUMENT_FRAGMENT_NODE, ELEMENT_NODE}:
    # HierarchyRequestError
    return false
  if node.isHostIncludingInclusiveAncestor(parent):
    # HierarchyRequestError
    return false
  if before != nil and before.parentNode != parent:
    # NotFoundError
    return false
  # Spec: node must be a DocumentFragment, DocumentType, Element or
  # CharacterData node. TEXT_NODE and COMMENT_NODE were missing from this
  # set, so every text/comment insertion was rejected.
  # (ProcessingInstruction is not modeled here — TODO confirm.)
  if node.nodeType notin {DOCUMENT_FRAGMENT_NODE, DOCUMENT_TYPE_NODE, ELEMENT_NODE, TEXT_NODE, COMMENT_NODE, CDATA_SECTION_NODE}:
    # HierarchyRequestError
    return false
  if (node.nodeType == TEXT_NODE and parent.nodeType == DOCUMENT_NODE) or
      (node.nodeType == DOCUMENT_TYPE_NODE and parent.nodeType != DOCUMENT_NODE):
    # HierarchyRequestError
    return false
  if parent.nodeType == DOCUMENT_NODE:
    case node.nodeType
    of DOCUMENT_FRAGMENT_NODE:
      let elems = node.countChildren(ELEMENT_NODE)
      if elems > 1 or node.hasChild(TEXT_NODE):
        # HierarchyRequestError
        return false
      elif elems == 1 and (parent.hasChild(ELEMENT_NODE) or before != nil and (before.nodeType == DOCUMENT_TYPE_NODE or before.hasNextSibling(DOCUMENT_TYPE_NODE))):
        # HierarchyRequestError
        return false
    of ELEMENT_NODE:
      if parent.hasChild(ELEMENT_NODE) or before != nil and (before.nodeType == DOCUMENT_TYPE_NODE or before.hasNextSibling(DOCUMENT_TYPE_NODE)):
        # HierarchyRequestError
        return false
    of DOCUMENT_TYPE_NODE:
      if parent.hasChild(DOCUMENT_TYPE_NODE) or before != nil and before.hasPreviousSibling(ELEMENT_NODE) or before == nil and parent.hasChild(ELEMENT_NODE):
        # HierarchyRequestError
        return false
    else: discard
  return true # no exception reached
+
proc remove*(node: Node) =
  ## Detach `node` from its parent and relink its siblings' pointers.
  ## `node` must currently have a parent.
  let parent = node.parentNode
  assert parent != nil
  let index = parent.childNodes.find(node)
  assert index != -1
  #TODO live ranges
  #TODO NodeIterator
  let oldPreviousSibling = node.previousSibling
  let oldNextSibling = node.nextSibling
  # seq.del is the unordered O(1) delete that swaps the LAST element into
  # `index`, scrambling sibling order; use the order-preserving `delete`.
  parent.childNodes.delete(index)
  if oldPreviousSibling != nil:
    oldPreviousSibling.nextSibling = oldNextSibling
  if oldNextSibling != nil:
    oldNextSibling.previousSibling = oldPreviousSibling
  # NOTE(review): the removed node still points at its old parent and
  # siblings — confirm callers reset parentNode/parentElement as needed.

  #TODO assigned, shadow root, shadow root again, custom nodes, registered observers
  #TODO not surpress observers => queue tree mutation record
+
proc adopt(document: Document, node: Node) =
  ## DOM "adopt": move `node` into `document`, detaching it from its old
  ## parent first.
  let oldDocument = node.document
  if node.parentNode != nil:
    remove(node)
  if oldDocument != document:
    # Re-home node and all of its descendants to the adopting document
    # (spec: "for each inclusiveDescendant, set node document").
    # oldDocument was previously computed but never used.
    var stack = @[node]
    while stack.len > 0:
      let n = stack.pop()
      n.document = document
      for child in n.childNodes:
        stack.add(child)
+
proc applyChildInsert(parent, child: Node, index: int) =
  ## Fix up tree bookkeeping (root, parent and sibling links) after `child`
  ## has been placed at `index` in `parent.childNodes`.
  if parent.rootNode != nil:
    child.rootNode = parent.rootNode
  else:
    child.rootNode = parent
  child.parentNode = parent
  if parent.nodeType == ELEMENT_NODE:
    child.parentElement = Element(parent)
  # NOTE(review): only `child` itself receives the new rootNode; its
  # descendants keep their old root — confirm whether subtree inserts
  # need a recursive update.
  if index - 1 >= 0:
    child.previousSibling = parent.childNodes[index - 1]
    child.previousSibling.nextSibling = child
  if index + 1 < parent.childNodes.len:
    child.nextSibling = parent.childNodes[index + 1]
    child.nextSibling.previousSibling = child
+
# WARNING ditto
proc insert*(parent, node, before: Node) =
  ## DOM "insert": place `node` (or, for a DocumentFragment, its children)
  ## into `parent` before `before`; a nil `before` appends at the end.
  let nodes = if node.nodeType == DOCUMENT_FRAGMENT_NODE: node.childNodes
  else: @[node]
  let count = nodes.len
  if count == 0:
    return
  if node.nodeType == DOCUMENT_FRAGMENT_NODE:
    # Iterate the snapshot: remove() mutates node.childNodes, so looping
    # over the live seq would skip children.
    for child in nodes:
      child.remove()
    #TODO tree mutation record
  if before != nil:
    #TODO live ranges
    discard
  for node in nodes:
    parent.document.adopt(node)
    if before == nil:
      parent.childNodes.add(node)
      parent.applyChildInsert(node, parent.childNodes.high)
    else:
      let index = parent.childNodes.find(before)
      parent.childNodes.insert(node, index)
      parent.applyChildInsert(node, index)
    #TODO shadow root
+
# WARNING ditto
proc preInsert*(parent, node, before: Node) =
  ## DOM "pre-insert": validate, then insert `node` into `parent` before
  ## `before`, adjusting the reference child when it is `node` itself.
  if not parent.preInsertionValidity(node, before):
    return
  var referenceChild = before
  if referenceChild == node:
    referenceChild = node.nextSibling
  parent.insert(node, referenceChild)
+
proc append*(parent, node: Node) =
  ## Append `node` as the last child of `parent` (pre-insert before nil).
  preInsert(parent, node, nil)
 
 proc applyOrdinal*(elem: HTMLLIElement) =
   let val = elem.attri("value")
@@ -549,8 +790,10 @@ proc applyOrdinal*(elem: HTMLLIElement) =
         inc menu.ordinalcounter
       else: discard
 
-proc reset*(form: HTMLFormElement) =
-  for input in form.inputs:
+proc reset*(element: Element) = 
+  case element.tagType
+  of TAG_INPUT:
+    let input = HTMLInputELement(element)
     case input.inputType
     of INPUT_SEARCH, INPUT_TEXT, INPUT_PASSWORD:
       input.value = input.attr("value")
@@ -560,3 +803,20 @@ proc reset*(form: HTMLFormElement) =
       input.file = none(Url)
     else: discard
     input.rendered = false
+  else: discard
+
proc reset*(form: HTMLFormElement) =
  ## Reset every input associated with `form` to its default state.
  for field in form.inputs:
    field.reset()
    field.rendered = false
+
proc appendAttribute*(element: Element, k, v: string) =
  ## Set attribute `k` to value `v`, overwriting any existing value.
  element.attributes[k] = v
+
proc setForm*(element: Element, form: HTMLFormElement) =
  ## Associate a form-associated element with `form`.
  ## Only input elements are actually wired up so far.
  case element.tagType
  of TAG_INPUT:
    HTMLInputElement(element).form = form
  of TAG_BUTTON, TAG_FIELDSET, TAG_OBJECT, TAG_OUTPUT, TAG_SELECT, TAG_TEXTAREA, TAG_IMG:
    discard #TODO
  else: assert false
diff --git a/src/html/htmlparser.nim b/src/html/htmlparser.nim
index d5d5effe..3e962495 100644
--- a/src/html/htmlparser.nim
+++ b/src/html/htmlparser.nim
@@ -1,2009 +1,1962 @@
-import streams
-import unicode
-import strutils
-import tables
-import json
 import macros
 import options
+import sequtils
+import streams
 import strformat
+import tables
+import unicode
 
 import utils/twtstr
-import utils/radixtree
 import html/dom
-import html/entity
 import html/tags
+import html/htmltokenizer
 import css/sheet
 
 type
-  HTMLParseState = object
-    in_comment: bool
-    in_script: bool
-    in_style: bool
-    in_noscript: bool
-    in_body: bool
-    skip_lf: bool
-    elementNode: Element
-    textNode: Text
-    commentNode: Comment
+  HTML5Parser = object
+    case fragment: bool
+    of true: ctx: Element
+    else: discard
+    openElements: seq[Element]
+    insertionMode: InsertionMode
+    oldInsertionMode: InsertionMode
+    templateModes: seq[InsertionMode]
+    head: Element
+    tokenizer: Tokenizer
     document: Document
-    formowners: seq[HTMLFormElement]
+    form: HTMLFormElement
+    fosterParenting: bool
+    scripting: bool
+    activeFormatting: seq[(Element, Token)] # nil => marker
+    framesetok: bool
+    ignoreLF: bool
+    pendingTableChars: string
+    pendingTableCharsWhitespace: bool
+
+  AdjustedInsertionLocation = tuple[inside: Node, before: Node]
+
+# 13.2.4.1
+  InsertionMode = enum
+    INITIAL, BEFORE_HTML, BEFORE_HEAD, IN_HEAD, IN_HEAD_NOSCRIPT, AFTER_HEAD,
+    IN_BODY, TEXT, IN_TABLE, IN_TABLE_TEXT, IN_CAPTION, IN_COLUMN_GROUP,
+    IN_TABLE_BODY, IN_ROW, IN_CELL, IN_SELECT, IN_SELECT_IN_TABLE, IN_TEMPLATE,
+    AFTER_BODY, IN_FRAMESET, AFTER_FRAMESET, AFTER_AFTER_BODY,
+    AFTER_AFTER_FRAMESET
+
proc resetInsertionMode(parser: var HTML5Parser) =
  ## HTML 13.2.4.1 "reset the insertion mode appropriately": derive the
  ## insertion mode from the stack of open elements.
  template switch_insertion_mode_and_return(mode: InsertionMode) =
    parser.insertionMode = mode
    return
  for i in countdown(parser.openElements.high, 0):
    var node = parser.openElements[i]
    let last = i == 0
    if parser.fragment:
      node = parser.ctx
    if node.tagType == TAG_SELECT:
      if not last:
        # Walk node's ancestors on the stack (not the whole stack from the
        # top, as before): a template stops the search, a table selects
        # "in select in table".
        for j in countdown(i - 1, 0):
          let ancestor = parser.openElements[j]
          case ancestor.tagType
          of TAG_TEMPLATE: break
          of TAG_TABLE: switch_insertion_mode_and_return IN_SELECT_IN_TABLE
          else: discard
      switch_insertion_mode_and_return IN_SELECT
    case node.tagType
    of TAG_TD, TAG_TH:
      if not last:
        switch_insertion_mode_and_return IN_CELL
    of TAG_TR: switch_insertion_mode_and_return IN_ROW
    of TAG_TBODY, TAG_THEAD, TAG_TFOOT:
      # Spec: tbody/thead/tfoot select "in table body" (was IN_CAPTION).
      switch_insertion_mode_and_return IN_TABLE_BODY
    of TAG_CAPTION:
      # Spec: a caption element selects "in caption" (case was missing).
      switch_insertion_mode_and_return IN_CAPTION
    of TAG_COLGROUP: switch_insertion_mode_and_return IN_COLUMN_GROUP
    of TAG_TABLE: switch_insertion_mode_and_return IN_TABLE
    of TAG_TEMPLATE: switch_insertion_mode_and_return parser.templateModes[^1]
    of TAG_HEAD:
      if not last:
        switch_insertion_mode_and_return IN_HEAD
    of TAG_BODY: switch_insertion_mode_and_return IN_BODY
    of TAG_FRAMESET: switch_insertion_mode_and_return IN_FRAMESET
    of TAG_HTML:
      # Spec: "before head" while the head pointer is still nil, "after
      # head" otherwise (the condition was inverted).
      if parser.head == nil:
        switch_insertion_mode_and_return BEFORE_HEAD
      else:
        switch_insertion_mode_and_return AFTER_HEAD
    else: discard
    if last:
      switch_insertion_mode_and_return IN_BODY
 
-# Tokenizer
-type
-  Tokenizer = object
-    state: TokenizerState
-    rstate: TokenizerState
-    curr: Rune
-    tmp: string
-    code: int
-    tok: Token
-    laststart: Token
-    attrn: string
-    attrv: string
-    attr: bool
-
-    istream: Stream
-    sbuf: string
-    sbuf_i: int
-    sbuf_ip: int
-    eof_i: int
-
-  TokenType = enum
-    DOCTYPE, START_TAG, END_TAG, COMMENT, CHARACTER, EOF
-
-  TokenizerState = enum
-    DATA, CHARACTER_REFERENCE, TAG_OPEN, RCDATA, RCDATA_LESS_THAN_SIGN,
-    RAWTEXT, RAWTEXT_LESS_THAN_SIGN, SCRIPT_DATA, SCRIPT_DATA_LESS_THAN_SIGN,
-    PLAINTEXT, MARKUP_DECLARATION_OPEN, END_TAG_OPEN, BOGUS_COMMENT, TAG_NAME,
-    BEFORE_ATTRIBUTE_NAME, RCDATA_END_TAG_OPEN, RCDATA_END_TAG_NAME,
-    RAWTEXT_END_TAG_OPEN, RAWTEXT_END_TAG_NAME, SELF_CLOSING_START_TAG,
-    SCRIPT_DATA_END_TAG_OPEN, SCRIPT_DATA_ESCAPE_START,
-    SCRIPT_DATA_END_TAG_NAME, SCRIPT_DATA_ESCAPE_START_DASH,
-    SCRIPT_DATA_ESCAPED_DASH_DASH, SCRIPT_DATA_ESCAPED,
-    SCRIPT_DATA_ESCAPED_DASH, SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN,
-    SCRIPT_DATA_ESCAPED_END_TAG_OPEN, SCRIPT_DATA_DOUBLE_ESCAPE_START,
-    SCRIPT_DATA_ESCAPED_END_TAG_NAME, SCRIPT_DATA_DOUBLE_ESCAPED,
-    SCRIPT_DATA_DOUBLE_ESCAPED_DASH, SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN,
-    SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH, SCRIPT_DATA_DOUBLE_ESCAPE_END,
-    AFTER_ATTRIBUTE_NAME, ATTRIBUTE_NAME, BEFORE_ATTRIBUTE_VALUE,
-    ATTRIBUTE_VALUE_DOUBLE_QUOTED, ATTRIBUTE_VALUE_SINGLE_QUOTED,
-    ATTRIBUTE_VALUE_UNQUOTED, AFTER_ATTRIBUTE_VALUE_QUOTED, COMMENT_START,
-    CDATA_SECTION, COMMENT_START_DASH, COMMENT, COMMENT_END,
-    COMMENT_LESS_THAN_SIGN, COMMENT_END_DASH, COMMENT_LESS_THAN_SIGN_BANG,
-    COMMENT_LESS_THAN_SIGN_BANG_DASH, COMMENT_LESS_THAN_SIGN_BANG_DASH_DASH,
-    COMMENT_END_BANG, DOCTYPE, BEFORE_DOCTYPE_NAME, DOCTYPE_NAME,
-    AFTER_DOCTYPE_NAME, AFTER_DOCTYPE_PUBLIC_KEYWORD,
-    AFTER_DOCTYPE_SYSTEM_KEYWORD, BOGUS_DOCTYPE,
-    BEFORE_DOCTYPE_PUBLIC_IDENTIFIER, DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED,
-    DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED, AFTER_DOCTYPE_PUBLIC_IDENTIFIER,
-    BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS,
-    DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED,
-    DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED, BEFORE_DOCTYPE_SYSTEM_IDENTIFIER,
-    AFTER_DOCTYPE_SYSTEM_IDENTIFIER, CDATA_SECTION_BRACKET, CDATA_SECTION_END,
-    NAMED_CHARACTER_REFERENCE, NUMERIC_CHARACTER_REFERENCE,
-    AMBIGUOUS_AMPERSAND_STATE, HEXADECIMAL_CHARACTER_REFERENCE_START,
-    DECIMAL_CHARACTER_REFERENCE_START, HEXADECIMAL_CHARACTER_REFERENCE,
-    DECIMAL_CHARACTER_REFERENCE, NUMERIC_CHARACTER_REFERENCE_END
-
-  Token = ref object
-    case t: TokenType
-    of DOCTYPE:
-      name: Option[string]
-      pubid: Option[string]
-      sysid: Option[string]
-      quirks: bool
-    of START_TAG, END_TAG:
-      tagname: string
-      selfclosing: bool
-      attrs: Table[string, string]
-    of CHARACTER:
-      r: Rune
-    of COMMENT:
-      data: string
-    of EOF: discard
-
-func `$`*(tok: Token): string =
-  case tok.t
-  of DOCTYPE: fmt"{tok.t} {tok.name} {tok.pubid} {tok.sysid} {tok.quirks}"
-  of START_TAG, END_TAG: fmt"{tok.t} {tok.tagname} {tok.selfclosing} {tok.attrs}"
-  of CHARACTER: fmt"{tok.t} {tok.r}"
-  of COMMENT: fmt"{tok.t} {tok.data}"
-  of EOF: fmt"{tok.t}"
-
-const bufSize = 512
-const copyBufSize = 16
-proc newTokenizer(s: Stream): Tokenizer =
-  result.sbuf = newString(bufSize)
-  result.istream = s
-  if result.istream.atEnd:
-    result.eof_i = 0
+func currentNode(parser: HTML5Parser): Element =
+  if parser.openElements.len == 0:
+    assert false
   else:
-    let n = s.readDataStr(result.sbuf, 0..bufSize-1)
-    result.eof_i = n
-
-func atEof(t: Tokenizer): bool =
-  t.eof_i != -1 and t.sbuf_i >= t.eof_i
-
-proc consume(t: var Tokenizer): char {.inline.} =
-  if t.sbuf_i >= bufSize-copyBufSize:
-    var sbuf2 = newString(copyBufSize)
-    var i = 0
-    while t.sbuf_i + i < bufSize:
-      sbuf2[i] = t.sbuf[t.sbuf_i + i]
-      inc i
-    let n = t.istream.readDataStr(t.sbuf, i..bufSize-1)
-    if n != bufSize - i:
-      t.eof_i = i + n
-    t.sbuf_i = 0
-
-    var j = 0
-    while j < i:
-      t.sbuf[j] = sbuf2[j]
-      inc j
-
-  assert t.eof_i == -1 or t.sbuf_i < t.eof_i # not consuming eof...
-  t.sbuf_ip = t.sbuf_i # save previous pointer for potential reconsume
-
-  # Normalize newlines (\r\n -> \n, single \r -> \n)
-  if t.sbuf[t.sbuf_i] == '\r':
-    inc t.sbuf_i
-    if t.sbuf[t.sbuf_i] != '\n':
-      # \r
-      result = '\n'
-      t.curr = Rune('\n')
-      return
-    # else, \r\n so just return the \n
-
-  result = t.sbuf[t.sbuf_i]
-  fastRuneAt(t.sbuf, t.sbuf_i, t.curr)
-
-proc reconsume(t: var Tokenizer) =
-  t.sbuf_i = t.sbuf_ip
-
-iterator tokenize(tokenizer: var Tokenizer): Token =
-  template emit(tok: Token) =
-    if tok.t == START_TAG:
-      tokenizer.laststart = tok
-    yield tok
-  template emit(tok: TokenType) = emit Token(t: tok)
-  template emit(rn: Rune) = emit Token(t: CHARACTER, r: rn)
-  template emit(ch: char) = emit Token(t: CHARACTER, r: Rune(ch))
-  template emit_eof =
-    emit EOF
-    break
-  template emit_tok =
-    if tokenizer.attr:
-      tokenizer.tok.attrs[tokenizer.attrn] = tokenizer.attrv
-    emit tokenizer.tok
-  template emit_current =
-    if tokenizer.atEof:
-      emit_eof
-    else:
-      emit Token(t: CHARACTER, r: tokenizer.curr)
-  template emit_replacement = emit Rune(0xFFFD)
-  template switch_state(s: TokenizerState) =
-    tokenizer.state = s
-  template switch_state_return(s: TokenizerState) =
-    tokenizer.rstate = tokenizer.state
-    tokenizer.state = s
-  template reconsume_in(s: TokenizerState) =
-    tokenizer.reconsume()
-    switch_state s
-  template parse_error(error: untyped) = discard # does nothing for now... TODO?
-  template is_appropriate_end_tag_token(): bool =
-    tokenizer.laststart != nil and tokenizer.laststart.data == tokenizer.tok.data
-  template start_new_attribute =
-    if tokenizer.attr:
-      tokenizer.tok.attrs[tokenizer.attrn] = tokenizer.attrv
-    tokenizer.attrn = ""
-    tokenizer.attrv = ""
-    tokenizer.attr = true
-  template leave_attribute_name_state =
-    if tokenizer.attrn in tokenizer.tok.attrs:
-      tokenizer.attr = false
-  template append_to_current_attr_value(c: typed) =
-    if tokenizer.attr:
-      tokenizer.attrv &= c
-  template peek_str(s: string): bool =
-    # WARNING: will break on strings with copyBufSize + 4 bytes
-    assert s.len < copyBufSize - 4 and s.len > 0
-    if tokenizer.sbuf_i + s.len > tokenizer.eof_i:
-      false
-    else:
-      let slice = tokenizer.sbuf[tokenizer.sbuf_i..tokenizer.sbuf_i+s.high]
-      s == slice
-  template peek_str_nocase(s: string): bool =
-    # WARNING: will break on strings with copyBufSize + 4 bytes
-    # WARNING: only works with UPPER CASE ascii
-    assert s.len < copyBufSize - 4 and s.len > 0
-    if tokenizer.sbuf_i + s.len > tokenizer.eof_i:
-      false
-    else:
-      let slice = tokenizer.sbuf[tokenizer.sbuf_i..tokenizer.sbuf_i+s.high]
-      s == slice.toUpperAscii()
-  template peek_char(): char = tokenizer.sbuf[tokenizer.sbuf_i]
-  template has_adjusted_current_node(): bool = false #TODO implement this
-  template consume_and_discard(n: int) = #TODO optimize
-    var i = 0
-    while i < n:
-      discard tokenizer.consume()
-      inc i
-  template consumed_as_an_attribute(): bool =
-    tokenizer.rstate in {ATTRIBUTE_VALUE_DOUBLE_QUOTED, ATTRIBUTE_VALUE_SINGLE_QUOTED, ATTRIBUTE_VALUE_UNQUOTED}
-  template flush_code_points_consumed_as_a_character_reference() =
-    if consumed_as_an_attribute:
-      append_to_current_attr_value tokenizer.tmp
-    else:
-      for r in tokenizer.tmp.runes:
-        emit r
-  template new_token(t: Token) =
-    if tokenizer.attr:
-      tokenizer.attr = false
-    tokenizer.tok = t
-
-  # Fake EOF as an actual character. Also replace anything_else with the else
-  # branch.
-  # Yes this is kind of ugly but it works and I'm too lazy to come up with
-  # anything better.
-  macro stateMachine(states: varargs[untyped]): untyped =
-    var maincase = newNimNode(nnkCaseStmt).add(quote do: tokenizer.state)
-    for state in states:
-      if state.kind == nnkOfBranch:
-        let mainstmtlist = findChild(state, it.kind == nnkStmtList)
-        if mainstmtlist[0].kind == nnkIdent and mainstmtlist[0].strVal == "ignore_eof":
-          maincase.add(state)
-          continue
-
-        var hasanythingelse = false
-        if mainstmtlist[0].kind == nnkIdent and mainstmtlist[0].strVal == "has_anything_else":
-          hasanythingelse = true
-
-        let childcase = findChild(mainstmtlist, it.kind == nnkCaseStmt)
-        var haseof = false
-        var eofstmts: NimNode
-        var elsestmts: NimNode
-
-        for i in countdown(childcase.len-1, 0):
-          let childof = childcase[i]
-          if childof.kind == nnkOfBranch:
-            for j in countdown(childof.len-1, 0):
-              if childof[j].kind == nnkIdent and childof[j].strVal == "eof":
-                haseof = true
-                eofstmts = childof.findChild(it.kind == nnkStmtList)
-                if childof.findChild(it.kind == nnkIdent and it.strVal != "eof") != nil:
-                  childof.del(j)
-                else:
-                  childcase.del(i)
-          elif childof.kind == nnkElse:
-            elsestmts = childof.findChild(it.kind == nnkStmtList)
-
-        if not haseof:
-          eofstmts = elsestmts
-        let fake_eof = quote do:
-          if tokenizer.atEof:
-            `eofstmts`
-            continue
-        mainstmtlist.insert(0, fake_eof)
-        if hasanythingelse:
-          let fake_anything_else = quote do:
-            template anything_else =
-              `elsestmts`
-          mainstmtlist.insert(0, fake_anything_else)
-      maincase.add(state)
-    result = newNimNode(nnkStmtList)
-    result.add(maincase)
-
-  template ignore_eof = discard # does nothing
-  template has_anything_else = discard # does nothing
-
-  const null = char(0)
-  const whitespace = {'\t', '\n', '\f', ' '}
+    return parser.openElements[^1]
+
func adjustedCurrentNode(parser: HTML5Parser): Element =
  ## 13.2.2: the context element when parsing a fragment, otherwise the
  ## current (bottommost) node of the stack of open elements.
  if parser.fragment:
    result = parser.ctx
  else:
    result = parser.currentNode
+
+template parse_error() = discard
+
func lastElementOfTag(parser: HTML5Parser, tagType: TagType): tuple[element: Element, pos: int] =
  ## Search the stack of open elements from the top (most recently opened)
  ## downwards for an element of the given tag type.
  ## Returns (nil, -1) when no such element is on the stack.
  var idx = parser.openElements.high
  while idx >= 0:
    let el = parser.openElements[idx]
    if el.tagType == tagType:
      return (el, idx)
    dec idx
  return (nil, -1)
+
# Insertion location meaning "append as the last child of n"
# (before == nil).
template last_child_of(n: Node): AdjustedInsertionLocation =
  (n, nil)
+
# 13.2.6.1
func appropriatePlaceForInsert(parser: HTML5Parser, target: Element): AdjustedInsertionLocation =
  ## "Appropriate place for inserting a node" with `target` as the override
  ## target, implementing foster parenting for content misplaced in tables.
  assert parser.openElements[0].tagType == TAG_HTML
  if parser.fosterParenting and target.tagType in {TAG_TABLE, TAG_TBODY, TAG_TFOOT, TAG_THEAD, TAG_TR}:
    let lastTemplate = parser.lastElementOfTag(TAG_TEMPLATE)
    let lastTable = parser.lastElementOfTag(TAG_TABLE)
    # Step 2.2: if the last template is lower (more recently opened) on the
    # stack than the last table, insert into the template contents.
    if lastTemplate.element != nil and (lastTable.element == nil or lastTable.pos < lastTemplate.pos):
      return last_child_of(HTMLTemplateElement(lastTemplate.element).content)
    # Step 2.3: no table on the stack; insert inside the html element.
    if lastTable.element == nil:
      return last_child_of(parser.openElements[0])
    # Step 2.4: insert into the table's parent, immediately before the table.
    if lastTable.element.parentNode != nil:
      return (lastTable.element.parentNode, lastTable.element)
    # Steps 2.5-2.7: table has no parent (fragment case); insert inside the
    # element just above the table on the stack.
    let previousElement = parser.openElements[lastTable.pos - 1]
    result = last_child_of(previousElement)
  else:
    result = last_child_of(target)
  # Steps 3-4: redirect insertion inside a template element into its
  # template contents fragment.
  if result.inside.nodeType == ELEMENT_NODE and Element(result.inside).tagType == TAG_TEMPLATE:
    result = (HTMLTemplateElement(result.inside).content, nil)
+
func appropriatePlaceForInsert(parser: HTML5Parser): AdjustedInsertionLocation =
  ## Convenience overload: the override target defaults to the current node.
  result = parser.appropriatePlaceForInsert(parser.currentNode)
+
func hasElement(elements: seq[Element], tag: TagType): bool =
  ## True if the stack contains at least one element of the given tag type.
  result = false
  for el in elements:
    if el.tagType == tag:
      result = true
      break
+
func hasElementInSpecificScope(elements: seq[Element], target: Element, list: set[TagType]): bool =
  ## 13.2.4.2 "have an element in a specific scope", matching one specific
  ## element node. Walks the stack from the current node downwards; any tag
  ## type in `list` terminates the search.
  for i in countdown(elements.high, 0):
    if elements[i] == target:
      return true
    if elements[i].tagType in list:
      return false
  # Unreachable: every scope list passed in contains TAG_HTML, and the root
  # of the stack of open elements is the html element.
  assert false

func hasElementInSpecificScope(elements: seq[Element], target: TagType, list: set[TagType]): bool =
  ## As above, but matches any element of the given tag type.
  for i in countdown(elements.high, 0):
    if elements[i].tagType == target:
      return true
    if elements[i].tagType in list:
      return false
  assert false

func hasElementInSpecificScope(elements: seq[Element], target: set[TagType], list: set[TagType]): bool =
  ## As above, but matches any element whose tag type is in `target`.
  for i in countdown(elements.high, 0):
    if elements[i].tagType in target:
      return true
    if elements[i].tagType in list:
      return false
  assert false
+
# Tag types that terminate a plain "in scope" search (13.2.4.2).
const Scope = {TAG_APPLET, TAG_CAPTION, TAG_HTML, TAG_TABLE, TAG_TD, TAG_TH,
               TAG_MARQUEE, TAG_OBJECT, TAG_TEMPLATE} #TODO SVG (NOTE MathML not implemented)
func hasElementInScope(elements: seq[Element], target: TagType): bool =
  ## "Have an element in scope" for a tag type.
  return elements.hasElementInSpecificScope(target, Scope)

func hasElementInScope(elements: seq[Element], target: set[TagType]): bool =
  ## "Have an element in scope" for any of a set of tag types.
  return elements.hasElementInSpecificScope(target, Scope)

func hasElementInScope(elements: seq[Element], target: Element): bool =
  ## "Have an element in scope" for a specific element node.
  return elements.hasElementInSpecificScope(target, Scope)

func hasElementInListItemScope(elements: seq[Element], target: TagType): bool =
  ## List item scope: the base scope list plus ol and ul.
  return elements.hasElementInSpecificScope(target, Scope + {TAG_OL, TAG_UL})

func hasElementInButtonScope(elements: seq[Element], target: TagType): bool =
  ## Button scope: the base scope list plus button.
  return elements.hasElementInSpecificScope(target, Scope + {TAG_BUTTON})

func hasElementInTableScope(elements: seq[Element], target: TagType): bool =
  ## Table scope: only html, table and template terminate the search.
  return elements.hasElementInSpecificScope(target, {TAG_HTML, TAG_TABLE, TAG_TEMPLATE})

func hasElementInTableScope(elements: seq[Element], target: set[TagType]): bool =
  ## Table scope, matching any of a set of tag types.
  return elements.hasElementInSpecificScope(target, {TAG_HTML, TAG_TABLE, TAG_TEMPLATE})
+
func hasElementInSelectScope(elements: seq[Element], target: TagType): bool =
  ## "Have an element in select scope" (13.2.4.2): unlike the other scope
  ## checks, *every* tag type other than option and optgroup terminates
  ## the search.
  var j = elements.high
  while j >= 0:
    let tt = elements[j].tagType
    if tt == target:
      return true
    if tt notin {TAG_OPTION, TAG_OPTGROUP}:
      return false
    dec j
  assert false
+
func createElement(parser: HTML5Parser, token: Token, namespace: string, intendedParent: Node): Element =
  ## 13.2.6.1 "create an element for a token": build the element in the
  ## given namespace, copy the token's attributes onto it, and associate it
  ## with the parser's form element pointer where applicable.
  #TODO custom elements
  let document = intendedParent.document
  let localName = token.tagname
  let element = document.newHTMLElement(localName, namespace, tagType = token.tagtype)
  for k, v in token.attrs:
    element.appendAttribute(k, v)
  if element.isResettable():
    element.reset()

  # Form association: skipped when a template is open, or when the element
  # is a listed element carrying an explicit form attribute, or when it is
  # not in the same tree as the form element pointer.
  if element.tagType in FormAssociatedElements and parser.form != nil and
      not parser.openElements.hasElement(TAG_TEMPLATE) and
      (element.tagType notin ListedElements or not element.attrb("form")) and
      element.inSameTree(parser.form):
    element.setForm(parser.form)
  return element
+
proc insert(location: AdjustedInsertionLocation, node: Node) =
  ## Insert node at the adjusted insertion location: before location.before,
  ## or appended as the last child when location.before is nil.
  let (parent, refChild) = location
  parent.insert(node, refChild)
+
proc insertForeignElement(parser: var HTML5Parser, token: Token, namespace: string): Element =
  ## 13.2.6.1 "insert a foreign element for a token": create the element at
  ## the appropriate insertion place and push it onto the stack of open
  ## elements.
  let location = parser.appropriatePlaceForInsert()
  let element = parser.createElement(token, namespace, location.inside)
  # Only insert when pre-insertion validation passes; per spec the element
  # is pushed onto the stack regardless of whether insertion succeeded.
  if location.inside.preInsertionValidity(element, location.before):
    #TODO custom elements
    location.insert(element)
  parser.openElements.add(element)
  return element
+
proc insertHTMLElement(parser: var HTML5Parser, token: Token): Element =
  ## Insert an element for the token in the HTML namespace (13.2.6.1).
  result = parser.insertForeignElement(token, $Namespace.HTML)
+
template insert_character_impl(parser: var HTML5Parser, data: typed) =
  ## 13.2.6.1 "insert a character": append data to the Text node immediately
  ## preceding the adjusted insertion location, creating a new Text node if
  ## there is none.
  let location = parser.appropriatePlaceForInsert()
  # Character data must never be inserted directly into a Document node.
  if location.inside.nodeType == DOCUMENT_NODE:
    return
  # The node the new data would directly follow: the last child when
  # appending, otherwise the sibling preceding the reference node. A
  # foster-parented location legitimately has a non-nil `before` node
  # (insertion immediately before the table), so both branches are live;
  # the previous `assert location.before == nil` contradicted the else
  # branch and would fire in debug builds on foster-parented text.
  let insertNode = if location.before == nil:
    location.inside.lastChild
  else:
    location.before.previousSibling
  if insertNode != nil and insertNode.nodeType == TEXT_NODE:
    # Coalesce with the existing Text node.
    dom.Text(insertNode).data &= data
  else:
    let text = location.inside.document.newText($data)
    location.insert(text)

  if location.inside.nodeType == ELEMENT_NODE:
    let parent = Element(location.inside)
    if parent.tagType == TAG_STYLE:
      # New character data invalidates the previously parsed stylesheet.
      let parent = HTMLStyleElement(parent)
      parent.sheet_invalid = true
+
proc insertCharacter(parser: var HTML5Parser, data: string) =
  ## Insert a string of character data (13.2.6.1 "insert a character").
  insert_character_impl(parser, data)

proc insertCharacter(parser: var HTML5Parser, data: char) =
  ## Insert a single ASCII character.
  insert_character_impl(parser, data)

proc insertCharacter(parser: var HTML5Parser, data: Rune) =
  ## Insert a single (possibly non-ASCII) code point.
  insert_character_impl(parser, data)
+
proc insertComment(parser: var HTML5Parser, token: Token, position: AdjustedInsertionLocation) =
  ## Insert a comment node created from the token's data at the given
  ## explicit insertion location (13.2.6.1 "insert a comment").
  let comment = position.inside.document.newComment(token.data)
  position.insert(comment)

proc insertComment(parser: var HTML5Parser, token: Token) =
  ## Insert a comment node at the appropriate place for inserting a node.
  let location = parser.appropriatePlaceForInsert()
  let comment = location.inside.document.newComment(token.data)
  location.insert(comment)
+
# DOCTYPE public/system identifiers that control quirks/limited-quirks mode
# selection (WHATWG 13.2.6.4.1 "the initial insertion mode"). Per spec, all
# of these are matched in an ASCII case-insensitive manner.

# Public identifiers that force quirks mode when matched exactly.
const PublicIdentifierEquals = [
  "-//W3O//DTD W3 HTML Strict 3.0//EN//",
  "-/W3C/DTD HTML 4.0 Transitional/EN",
  "HTML"
]

# Public identifier prefixes that force quirks mode.
const PublicIdentifierStartsWith = [
  "+//Silmaril//dtd html Pro v0r11 19970101//",
  "-//AS//DTD HTML 3.0 asWedit + extensions//",
  "-//AdvaSoft Ltd//DTD HTML 3.0 asWedit + extensions//",
  "-//IETF//DTD HTML 2.0 Level 1//",
  "-//IETF//DTD HTML 2.0 Level 2//",
  "-//IETF//DTD HTML 2.0 Strict Level 1//",
  "-//IETF//DTD HTML 2.0 Strict Level 2//",
  "-//IETF//DTD HTML 2.0 Strict//",
  "-//IETF//DTD HTML 2.0//",
  "-//IETF//DTD HTML 2.1E//",
  "-//IETF//DTD HTML 3.0//",
  "-//IETF//DTD HTML 3.2 Final//",
  "-//IETF//DTD HTML 3.2//",
  "-//IETF//DTD HTML 3//",
  "-//IETF//DTD HTML Level 0//",
  "-//IETF//DTD HTML Level 1//",
  "-//IETF//DTD HTML Level 2//",
  "-//IETF//DTD HTML Level 3//",
  "-//IETF//DTD HTML Strict Level 0//",
  "-//IETF//DTD HTML Strict Level 1//",
  "-//IETF//DTD HTML Strict Level 2//",
  "-//IETF//DTD HTML Strict Level 3//",
  "-//IETF//DTD HTML Strict//",
  "-//IETF//DTD HTML//",
  "-//Metrius//DTD Metrius Presentational//",
  "-//Microsoft//DTD Internet Explorer 2.0 HTML Strict//",
  "-//Microsoft//DTD Internet Explorer 2.0 HTML//",
  "-//Microsoft//DTD Internet Explorer 2.0 Tables//",
  "-//Microsoft//DTD Internet Explorer 3.0 HTML Strict//",
  "-//Microsoft//DTD Internet Explorer 3.0 HTML//",
  "-//Microsoft//DTD Internet Explorer 3.0 Tables//",
  "-//Netscape Comm. Corp.//DTD HTML//",
  "-//Netscape Comm. Corp.//DTD Strict HTML//",
  "-//O'Reilly and Associates//DTD HTML 2.0//",
  "-//O'Reilly and Associates//DTD HTML Extended 1.0//",
  "-//O'Reilly and Associates//DTD HTML Extended Relaxed 1.0//",
  "-//SQ//DTD HTML 2.0 HoTMetaL + extensions//",
  "-//SoftQuad Software//DTD HoTMetaL PRO 6.0::19990601::extensions to HTML 4.0//",
  "-//SoftQuad//DTD HoTMetaL PRO 4.0::19971010::extensions to HTML 4.0//",
  "-//Spyglass//DTD HTML 2.0 Extended//",
  "-//Sun Microsystems Corp.//DTD HotJava HTML//",
  "-//Sun Microsystems Corp.//DTD HotJava Strict HTML//",
  "-//W3C//DTD HTML 3 1995-03-24//",
  "-//W3C//DTD HTML 3.2 Draft//",
  "-//W3C//DTD HTML 3.2 Final//",
  "-//W3C//DTD HTML 3.2//",
  "-//W3C//DTD HTML 3.2S Draft//",
  "-//W3C//DTD HTML 4.0 Frameset//",
  "-//W3C//DTD HTML 4.0 Transitional//",
  "-//W3C//DTD HTML Experimental 19960712//",
  "-//W3C//DTD HTML Experimental 970421//",
  "-//W3C//DTD W3 HTML//",
  "-//W3O//DTD W3 HTML 3.0//",
  "-//WebTechs//DTD Mozilla HTML 2.0//",
  "-//WebTechs//DTD Mozilla HTML//",
]

# Public identifier prefixes that force quirks mode only when the system
# identifier is missing.
const SystemIdentifierMissingAndPublicIdentifierStartsWith = [
  "-//W3C//DTD HTML 4.01 Frameset//",
  "-//W3C//DTD HTML 4.01 Transitional//"
]

# Public identifier prefixes that select limited-quirks mode.
const PublicIdentifierStartsWithLimited = [
  "-//W3C//DTD XHTML 1.0 Frameset//",
  "-//W3C//DTD XHTML 1.0 Transitional//"
]

# Public identifier prefixes that select limited-quirks mode only when the
# system identifier is present.
const SystemIdentifierNotMissingAndPublicIdentifierStartsWith = [
  "-//W3C//DTD HTML 4.01 Frameset//",
  "-//W3C//DTD HTML 4.01 Transitional//"
]
+
func matchNoCase(s, t: string): bool =
  ## ASCII case-insensitive string equality, built on the nocase prefix
  ## helper already used for the identifier prefix checks.
  s.len == t.len and s.startsWithNoCase(t)

func quirksConditions(token: Token): bool =
  ## 13.2.6.4.1: whether the DOCTYPE token forces the document into quirks
  ## mode. Per spec, all public/system identifier comparisons must be ASCII
  ## case-insensitive; the previous code used exact `==` for the equality
  ## cases.
  if token.quirks: return true
  # Tokenizer lowercases DOCTYPE names, so an exact compare suffices here.
  if token.name.isnone or token.name.get != "html": return true
  if token.sysid.issome:
    if matchNoCase(token.sysid.get, "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"):
      return true
  if token.pubid.issome:
    let pubid = token.pubid.get
    for id in PublicIdentifierEquals:
      if matchNoCase(pubid, id):
        return true
    for id in PublicIdentifierStartsWith:
      if pubid.startsWithNoCase(id):
        return true
    # These prefixes only trigger quirks when the system id is missing.
    if token.sysid.isnone:
      for id in SystemIdentifierMissingAndPublicIdentifierStartsWith:
        if pubid.startsWithNoCase(id):
          return true
  return false
+
func limitedQuirksConditions(token: Token): bool =
  ## 13.2.6.4.1: whether the DOCTYPE token puts the document into
  ## limited-quirks mode. Prefix comparisons are ASCII case-insensitive.
  if token.pubid.issome:
    let pubid = token.pubid.get
    for prefix in PublicIdentifierStartsWithLimited:
      if pubid.startsWithNoCase(prefix):
        return true
    # The second prefix list only applies when a system id is present.
    if token.sysid.issome:
      for prefix in SystemIdentifierNotMissingAndPublicIdentifierStartsWith:
        if pubid.startsWithNoCase(prefix):
          return true
  return false
+
# 13.2.6.2 Generic text element parsing algorithms: insert the element,
# switch the tokenizer into the given state, then enter the "text"
# insertion mode, remembering the mode to return to afterwards.
template parseTextElement(parser: var HTML5Parser, token: Token, tokState: typed) =
  discard parser.insertHTMLElement(token)
  parser.tokenizer.state = tokState
  parser.oldInsertionMode = parser.insertionMode
  parser.insertionMode = TEXT

proc genericRawtextElementParsingAlgorithm(parser: var HTML5Parser, token: Token) =
  ## Generic raw text element parsing algorithm (e.g. style, noframes).
  parseTextElement(parser, token, RAWTEXT)

proc genericRCDATAElementParsingAlgorithm(parser: var HTML5Parser, token: Token) =
  ## Generic RCDATA element parsing algorithm (e.g. title).
  parseTextElement(parser, token, RCDATA)
+
# 13.2.6.3
# Shared set of tag types whose end tags may be implied; previously this
# literal was duplicated in three places and could silently diverge.
const ImpliedEndTags = {TAG_DD, TAG_DT, TAG_LI, TAG_OPTGROUP, TAG_OPTION,
                        TAG_P, TAG_RB, TAG_RP, TAG_RT, TAG_RTC}

proc generateImpliedEndTags(parser: var HTML5Parser) =
  ## Pop open elements while the current node's end tag may be implied.
  while parser.currentNode.tagType in ImpliedEndTags:
    discard parser.openElements.pop()

proc generateImpliedEndTags(parser: var HTML5Parser, exclude: TagType) =
  ## Same, but `exclude` is exempted from the implied end tag set.
  let tags = ImpliedEndTags - {exclude}
  while parser.currentNode.tagType in tags:
    discard parser.openElements.pop()

proc generateImpliedEndTagsThoroughly(parser: var HTML5Parser) =
  ## "Thorough" variant (used e.g. when closing template contents): also
  ## pops caption, colgroup, and table section/row/cell elements.
  const tags = ImpliedEndTags + {TAG_CAPTION, TAG_COLGROUP, TAG_TBODY,
                                 TAG_TD, TAG_TFOOT, TAG_TH, TAG_THEAD,
                                 TAG_TR}
  while parser.currentNode.tagType in tags:
    discard parser.openElements.pop()
+
# 13.2.4.3
proc pushOntoActiveFormatting(parser: var HTML5Parser, element: Element, token: Token) =
  ## Push onto the list of active formatting elements, applying the
  ## "Noah's Ark clause": at most three entries with the same tag name,
  ## namespace and attributes may exist after the last marker; the earliest
  ## such entry is removed before pushing.
  var count = 0
  for i in countdown(parser.activeFormatting.high, 0):
    let it = parser.activeFormatting[i]
    if it[0] == nil: break # marker: stop scanning
    if it[0].tagType != element.tagType: continue
    if it[0].tagType == TAG_UNKNOWN:
      if it[0].localName != element.localName: continue #TODO local or qualified?
    if it[0].namespace != element.namespace: continue
    var fail = false
    # Attributes must match in both directions (same key set, same values;
    # the first pass already compared values, so the second pass only needs
    # to check for extra keys on the new element).
    for k, v in it[0].attributes:
      if k notin element.attributes:
        fail = true
        break
      if v != element.attributes[k]:
        fail = true
        break
    if fail: continue
    for k, v in element.attributes:
      if k notin it[0].attributes:
        fail = true
        break
    if fail: continue
    inc count
    if count == 3:
      # Remove the earliest matching entry. Note: seq.del is O(1) but
      # *unordered* (it swaps the last element into slot i), which would
      # corrupt the ordering of the active formatting list; use the
      # order-preserving delete instead.
      parser.activeFormatting.delete(i)
      break
  parser.activeFormatting.add((element, token))
 
proc reconstructActiveFormatting(parser: var HTML5Parser) =
  ## 13.2.4.3 "reconstruct the active formatting elements": re-open any
  ## formatting elements that were implicitly closed by re-inserting
  ## elements for their stored tokens. The REWIND/ADVANCE/CREATE states
  ## mirror the spec's step labels.
  type State = enum
    REWIND, ADVANCE, CREATE
  # Step 1: nothing to do if the list is empty.
  if parser.activeFormatting.len == 0:
    return
  # Steps 2-3: nothing to do if the last entry is a marker (nil) or already
  # on the stack of open elements.
  # NOTE(review): the spec checks the entry *element* against the stack;
  # this checks only its tag type — verify equivalence.
  if parser.activeFormatting[^1][0] == nil or parser.openElements.hasElement(parser.activeFormatting[^1][0].tagType):
    return
  var i = parser.activeFormatting.high
  template entry: Element = (parser.activeFormatting[i][0])
  var state = REWIND
  while true:
    {.computedGoto.}
    case state
    of REWIND:
      # Steps 4-6: walk backwards to the first entry that is a marker or
      # already open; stop at the start of the list.
      if i == 0:
        state = CREATE
        continue
      dec i
      if entry != nil and not parser.openElements.hasElement(entry.tagType):
        continue
      state = ADVANCE
    of ADVANCE:
      # Step 7: move to the next (newer) entry.
      inc i
      state = CREATE
    of CREATE:
      # Steps 8-9: insert a new element from the stored token and replace
      # the entry so it refers to the freshly created element.
      parser.activeFormatting[i] = (parser.insertHTMLElement(parser.activeFormatting[i][1]), parser.activeFormatting[i][1])
      # Step 10: loop until the newest entry has been recreated.
      if i != parser.activeFormatting.high:
        state = ADVANCE
        continue
      break
+
proc clearActiveFormattingTillMarker(parser: var HTML5Parser) =
  ## "Clear the list of active formatting elements up to the last marker"
  ## (13.2.4.3): pop entries until a marker (nil element) has been removed,
  ## or the list is exhausted.
  while parser.activeFormatting.len > 0:
    if parser.activeFormatting.pop()[0] == nil:
      break
+
+template pop_current_node = discard parser.openElements.pop()
+
func isHTMLIntegrationPoint(node: Element): bool =
  ## Whether node is an HTML integration point (13.2.6.5). Currently always
  ## false, since foreign content (SVG/MathML) is not implemented.
  return false #TODO SVG (NOTE MathML not implemented)
+
+# Following is an implementation of the state (?) machine defined in
+# https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inhtml
+# It uses the ad-hoc pattern matching macro `match' to apply the following
+# transformations:
+# * First, pairs of patterns and actions are stored in tuples (and `discard'
+#   statements...)
+# * These pairs are then assigned to token types, later mapped to legs of the
+#   first case statement.
+# * Another case statement is constructed where needed, e.g. for switching on
+#   characters/tags/etc.
+# * Finally, the whole thing is wrapped in a named block, to implement a
+#   pseudo-goto by breaking out only when the else statement needn't be
+#   executed.
+# So for example the following code:
+#
+#   match token:
+#     TokenType.COMMENT => (block: echo "comment")
+#     ("<p>", "<a>", "</div>") => (block: echo "p, a or closing div")
+#     ("<div>", "</p>") => (block: anything_else)
+#     (TokenType.START_TAG, TokenType.END_TAG) => (block: assert false, "invalid")
+#     _ => (block: echo "anything else")
+#
+# (effectively) generates this:
+#
+#   block inside_not_else:
+#     case token.t
+#     of TokenType.COMMENT:
+#       echo "comment"
+#       break inside_not_else
+#     of TokenType.START_TAG:
+#       case token.tagtype
+#       of {TAG_P, TAG_A}:
+#         echo "p, a or closing div"
+#         break inside_not_else
+#       of TAG_DIV: discard
+#       else:
+#         assert false
+#         break inside_not_else
+#     of TokenType.END_TAG:
+#       case token.tagtype
+#       of TAG_DIV:
+#         echo "p, a or closing div"
+#         break inside_not_else
+#       of TAG_P: discard
+#       else:
+#         assert false
+#         break inside_not_else
+#     else: discard
+#     echo "anything else"
+#
+# This duplicates any code that applies for several token types, except for the
+# else branch.
macro match(token: Token, body: typed): untyped =
  ## Pattern-matching macro for tree-construction rules; see the large
  ## comment above for the transformation it performs. Patterns may be
  ## token-type enums, character literals/sets, tag strings (tokenized at
  ## compile time), tuples of the above, or `_` (discard) for the default.
  type OfBranchStore = object
    ofBranches: seq[(seq[NimNode], NimNode)]
    defaultBranch: NimNode

  # Stores 'of' branches
  var ofBranches: array[TokenType, OfBranchStore]
  # Stores 'else', 'elif' branches
  var defaultBranch: NimNode

  # Compile-time map from the enum's string form back to the enum value.
  const tokenTypes = (func(): Table[string, TokenType] =
    for tt in TokenType:
      result[$tt] = tt)()

  for disc in body:
    let tup = disc[0] # access actual tuple
    let pattern = `tup`[0]
    let lambda = `tup`[1]
    var action = lambda.findChild(it.kind notin {nnkSym, nnkEmpty, nnkFormalParams})
    # Wrap every action except the default and explicit `anything_else`
    # fall-throughs so it breaks out of the named block after running.
    if pattern.kind != nnkDiscardStmt and not (action.len == 2 and action[1].kind == nnkDiscardStmt and action[1][0] == newStrLitNode("anything_else")):
      action = quote do:
        `action`
        #eprint token #debug
        break inside_not_else

    # Flatten tuple patterns and dispatch each leaf pattern by node kind.
    var patterns = @[pattern]
    while patterns.len > 0:
      let pattern = patterns.pop()
      case pattern.kind
      of nnkSym: # simple symbols; we assume these are the enums
        ofBranches[tokenTypes[pattern.strVal]].defaultBranch = action
      of nnkCharLit:
        ofBranches[CHARACTER_ASCII].ofBranches.add((@[pattern], action))
      of nnkCurly:
        case pattern[0].kind
        of nnkCharLit:
          ofBranches[CHARACTER_ASCII].ofBranches.add((@[pattern], action))
        else: error fmt"Unsupported curly of kind {pattern[0].kind}"
      of nnkStrLit:
        # Tag patterns like "<p>" are run through the real tokenizer at
        # compile time; only the first emitted token is used.
        var tempTokenizer = newTokenizer(newStringStream(pattern.strVal))
        for token in tempTokenizer.tokenize:
          let tt = int(token.tagtype)
          case token.t
          of START_TAG, END_TAG:
            # Merge tags that share an action into one 'of' branch.
            var found = false
            for i in 0..ofBranches[token.t].ofBranches.high:
              if ofBranches[token.t].ofBranches[i][1] == action:
                found = true
                ofBranches[token.t].ofBranches[i][0].add((quote do: TagType(`tt`)))
                break
            if not found:
              ofBranches[token.t].ofBranches.add((@[(quote do: TagType(`tt`))], action))
          else: error fmt"{pattern.strVal}: Unsupported token {token} of kind {token.t}"
          break
      of nnkDiscardStmt:
        defaultBranch = action
      of nnkTupleConstr:
        for child in pattern:
          patterns.add(child)
      else: error fmt"{pattern}: Unsupported pattern of kind {pattern.kind}"

  # The expression the inner case statement switches on, per token type.
  func tokenBranchOn(tok: TokenType): NimNode =
    case tok
    of START_TAG, END_TAG:
      return quote do: token.tagtype
    of CHARACTER:
      return quote do: token.r
    of CHARACTER_ASCII:
      return quote do: token.c
    else: error fmt"Unsupported branching of token {tok}"

  # Append one collected branch to the inner case, merging multi-value
  # branches into a set literal.
  template add_to_case(branch: typed) =
    if branch[0].len == 1:
      tokenCase.add(newNimNode(nnkOfBranch).add(branch[0][0]).add(branch[1]))
    else:
      var curly = newNimNode(nnkCurly)
      for node in branch[0]:
        curly.add(node)
      tokenCase.add(newNimNode(nnkOfBranch).add(curly).add(branch[1]))

  # Build case statements
  var mainCase = newNimNode(nnkCaseStmt).add(quote do: `token`.t)
  for tt in TokenType:
    let ofBranch = newNimNode(nnkOfBranch).add(quote do: TokenType(`tt`))
    let tokenCase = newNimNode(nnkCaseStmt)
    if ofBranches[tt].defaultBranch != nil:
      if ofBranches[tt].ofBranches.len > 0:
        tokenCase.add(tokenBranchOn(tt))
        for branch in ofBranches[tt].ofBranches:
          add_to_case branch
        tokenCase.add(newNimNode(nnkElse).add(ofBranches[tt].defaultBranch))
        ofBranch.add(tokenCase)
        mainCase.add(ofBranch)
      else:
        ofBranch.add(ofBranches[tt].defaultBranch)
        mainCase.add(ofBranch)
    else:
      if ofBranches[tt].ofBranches.len > 0:
        tokenCase.add(tokenBranchOn(tt))
        for branch in ofBranches[tt].ofBranches:
          add_to_case branch
        ofBranch.add(tokenCase)
        # NOTE(review): the else branch is appended to tokenCase *after*
        # tokenCase was added to ofBranch; this works only because NimNode
        # has reference semantics — confirm intended.
        tokenCase.add(newNimNode(nnkElse).add(quote do: discard))
        mainCase.add(ofBranch)
      else:
        discard

  mainCase.add(newNimNode(nnkElse).add(quote do: discard))

  # The default (`_`) action runs after the case unless a branch broke out
  # of the named block first.
  var stmts = newStmtList().add(mainCase)
  for stmt in defaultBranch:
    stmts.add(stmt)
  result = newBlockStmt(ident("inside_not_else"), stmts)
+
+proc processInHTMLContent(parser: var HTML5Parser, token: Token, insertionMode = parser.insertionMode) =
+  template pop_all_nodes =
+    while parser.openElements.len > 1: pop_current_node
+  template anything_else = discard "anything_else"
+  macro `=>`(v: typed, body: untyped): untyped =
+    quote do:
+      discard (`v`, proc() = `body`)
+  template _ = discard
+  template reprocess(tok: Token) =
+    parser.processInHTMLContent(tok)
+
+  case insertionMode
+  of INITIAL:
+    match token:
+      AsciiWhitespace => (block: discard)
+      TokenType.COMMENT => (block: parser.insertComment(token, last_child_of(parser.document)))
+      TokenType.DOCTYPE => (block:
+        if token.name.isnone or token.name.get != "html" or token.pubid.issome or (token.sysid.issome and token.sysid.get != "about:legacy-compat"):
+          parse_error
+        let doctype = parser.document.newDocumentType(token.name.get(""), token.pubid.get(""), token.sysid.get(""))
+        parser.document.append(doctype)
+        if not parser.document.is_iframe_srcdoc and not parser.document.parser_cannot_change_the_mode_flag:
+          if quirksConditions(token):
+            parser.document.mode = QUIRKS
+          elif limitedQuirksConditions(token):
+            parser.document.mode = LIMITED_QUIRKS
+        parser.insertionMode = BEFORE_HTML
+      )
+      _ => (block:
+        if not parser.document.is_iframe_srcdoc:
+          parse_error
+        if not parser.document.parser_cannot_change_the_mode_flag:
+          parser.document.mode = QUIRKS
+        parser.insertionMode = BEFORE_HTML
+        reprocess token
+      )
+
+  of BEFORE_HTML:
+    match token:
+      TokenType.DOCTYPE => (block: parse_error)
+      TokenType.COMMENT => (block: parser.insertComment(token, last_child_of(parser.document)))
+      AsciiWhitespace => (block: discard)
+      "<html>" => (block:
+        let element = parser.createElement(token, $Namespace.HTML, parser.document)
+        parser.document.append(element)
+        parser.openElements.add(element)
+        parser.insertionMode = BEFORE_HEAD
+      )
+      ("</head>", "</body>", "</html>", "</br>") => (block: anything_else)
+      TokenType.END_TAG => (block: parse_error)
+      _ => (block:
+        let element = parser.document.newHTMLElement(TAG_HTML)
+        parser.document.append(element)
+        parser.openElements.add(element)
+        parser.insertionMode = BEFORE_HEAD
+        reprocess token
+      )
+
+  of BEFORE_HEAD:
+    match token:
+      AsciiWhitespace => (block: discard)
+      TokenType.COMMENT => (block: parser.insertComment(token))
+      TokenType.DOCTYPE => (block: parse_error)
+      "<html>" => (block: parser.processInHTMLContent(token, IN_BODY))
+      "<head>" => (block:
+        parser.head = parser.insertHTMLElement(token)
+        parser.insertionMode = IN_HEAD
+      )
+      ("</head>", "</body>", "</html>", "</br>") => (block: anything_else)
+      TokenType.END_TAG => (block: parse_error)
+      _ => (block:
+        parser.head = parser.insertHTMLElement(Token(t: START_TAG, tagtype: TAG_HEAD))
+        parser.insertionMode = IN_HEAD
+        reprocess token
+      )
+
+  of IN_HEAD:
+    match token:
+      AsciiWhitespace => (block: discard)
+      TokenType.COMMENT => (block: parser.insertComment(token))
+      TokenType.DOCTYPE => (block: parse_error)
+      "<html>" => (block: parser.processInHTMLContent(token, IN_BODY))
+      ("<base>", "<basefont>", "<bgsound>", "<link>") => (block:
+        discard parser.insertHTMLElement(token)
+        pop_current_node
+      )
+      "<meta>" => (block:
+        discard parser.insertHTMLElement(token)
+        pop_current_node
+        #TODO encodings
+      )
+      "<title>" => (block: parser.genericRCDATAElementParsingAlgorithm(token))
+      "<noscript>" => (block:
+        if not parser.scripting:
+          discard parser.insertHTMLElement(token)
+          parser.insertionMode = IN_HEAD_NOSCRIPT
         else:
-          anything_else
-      of '/':
-        if is_appropriate_end_tag_token:
-          switch_state SELF_CLOSING_START_TAG
+          parser.genericRawtextElementParsingAlgorithm(token)
+      )
+      ("<noframes>", "<style>") => (block: parser.genericRawtextElementParsingAlgorithm(token))
+      "<script>" => (block:
+        let location = parser.appropriatePlaceForInsert()
+        let element = HTMLScriptElement(parser.createElement(token, $Namespace.HTML, location.inside))
+        element.parserDocument = parser.document
+        element.forceAsync = false
+        if parser.fragment:
+          element.alreadyStarted = true
+        #TODO document.write (?)
+        location.insert(element)
+        parser.openElements.add(element)
+        parser.tokenizer.state = SCRIPT_DATA
+        parser.insertionMode = TEXT
+      )
+      "</head>" => (block:
+        pop_current_node
+        parser.insertionMode = AFTER_HEAD
+      )
+      ("</body>", "</html>", "</br>") => (block: anything_else)
+      "<template>" => (block:
+        discard parser.insertHTMLElement(token)
+        parser.activeFormatting.add((nil, nil))
+        parser.framesetok = false
+        parser.insertionMode = IN_TEMPLATE
+        parser.templateModes.add(IN_TEMPLATE)
+      )
+      "</template>" => (block:
+        if not parser.openElements.hasElement(TAG_TEMPLATE):
+          parse_error
         else:
-          anything_else
-      of '>':
-        if is_appropriate_end_tag_token:
-          switch_state DATA
-      of AsciiAlpha: # note: merged upper & lower
-        tokenizer.tok.tagname &= char(tokenizer.curr).tolower()
-        tokenizer.tmp &= tokenizer.curr
-      else:
-        new_token nil #TODO
-        emit '<'
-        emit '/'
-        for r in tokenizer.tmp.runes:
-          emit r
-        reconsume_in RCDATA
-
-    of RAWTEXT_LESS_THAN_SIGN:
-      case c
-      of '/':
-        tokenizer.tmp = ""
-        switch_state RAWTEXT_END_TAG_OPEN
-      else:
-        emit '<'
-        reconsume_in RAWTEXT
-
-    of RAWTEXT_END_TAG_OPEN:
-      case c
-      of AsciiAlpha:
-        new_token Token(t: END_TAG)
-        reconsume_in RAWTEXT_END_TAG_NAME
-      else:
-        emit '<'
-        emit '/'
-        reconsume_in RAWTEXT
-
-    of RAWTEXT_END_TAG_NAME:
-      has_anything_else
-      case c
-      of whitespace:
-        if is_appropriate_end_tag_token:
-          switch_state BEFORE_ATTRIBUTE_NAME
+          parser.generateImpliedEndTagsThoroughly()
+          if parser.currentNode.tagType != TAG_TEMPLATE:
+            parse_error
+          while parser.openElements.pop().tagType != TAG_TEMPLATE: discard
+          parser.clearActiveFormattingTillMarker()
+          discard parser.templateModes.pop()
+          parser.resetInsertionMode()
+      )
+      ("<head>", TokenType.END_TAG) => (block: parse_error)
+      _ => (block:
+        pop_current_node
+        parser.insertionMode = AFTER_HEAD
+        reprocess token
+      )
+
+  of IN_HEAD_NOSCRIPT:
+    match token:
+      TokenType.DOCTYPE => (block: parse_error)
+      "<html>" => (block: parser.processInHTMLContent(token, IN_BODY))
+      "</noscript>" => (block:
+        pop_current_node
+        parser.insertionMode = IN_HEAD
+      )
+      (AsciiWhitespace,
+       TokenType.COMMENT,
+       "<basefont>", "<bgsound>", "<link>", "<meta>", "<noframes>", "<style>") => (block:
+        parser.processInHTMLContent(token, IN_HEAD))
+      "</br>" => (block: anything_else)
+      ("<head>", "<noscript>") => (block: parse_error)
+      TokenType.END_TAG => (block: parse_error)
+      _ => (block:
+        pop_current_node
+        parser.insertionMode = IN_HEAD
+        reprocess token
+      )
+
+  of AFTER_HEAD:
+    match token:
+      AsciiWhitespace => (block: parser.insertCharacter(token.c))
+      TokenType.COMMENT => (block: parser.insertComment(token))
+      TokenType.DOCTYPE => (block: parse_error)
+      "<html>" => (block: parser.processInHTMLContent(token, IN_BODY))
+      "<body>" => (block:
+        discard parser.insertHTMLElement(token)
+        parser.framesetok = false
+        parser.insertionMode = IN_BODY
+      )
+      "<frameset>" => (block:
+        discard parser.insertHTMLElement(token)
+        parser.insertionMode = IN_FRAMESET
+      )
+      ("<base>", "<basefont>", "<bgsound>", "<link>", "<meta>", "<noframes>", "<script>", "<style>", "<template>", "<title>") => (block:
+        parse_error
+        parser.openElements.add(parser.head)
+        parser.processInHTMLContent(token, IN_HEAD)
+        for i in countdown(parser.openElements.high, 0):
+          if parser.openElements[i] == parser.head:
+            parser.openElements.del(i)
+      )
+      "</template>" => (block: parser.processInHTMLContent(token, IN_HEAD))
+      ("</body>", "</html>", "</br>") => (block: anything_else)
+      ("<head>", TokenType.END_TAG) => (block: parse_error)
+      _ => (block:
+        discard parser.insertHTMLElement(Token(t: START_TAG, tagtype: TAG_BODY))
+        parser.insertionMode = IN_BODY
+        reprocess token
+      )
+
+  of IN_BODY:
+    # "Close a p element" (spec): generate implied end tags except for <p>,
+    # flag a parse error if the current node is then not a <p>, and pop the
+    # stack of open elements up to and including the <p>.
+    proc closeP(parser: var HTML5Parser) =
+      parser.generateImpliedEndTags(TAG_P)
+      if parser.currentNode.tagType != TAG_P: parse_error
+      while parser.openElements.pop().tagType != TAG_P: discard
+
+    # The "adoption agency algorithm" (HTML tree construction, in-body mode):
+    # repairs mis-nested formatting elements such as <b><p>x</b>y</p>.
+    # Returns true when the caller should fall back to the "any other end
+    # tag" steps (call site not visible here -- confirm).
+    proc adoptionAgencyAlgorithm(parser: var HTML5Parser, token: Token): bool =
+      # Shortcut: current node matches the token but is not in the list of
+      # active formatting elements -> just pop it and stop.
+      if parser.currentNode.tagType != TAG_UNKNOWN and parser.currentNode.tagtype == token.tagtype or parser.currentNode.localName == token.tagname: #TODO local or qualified name?
+        var fail = true
+        for it in parser.activeFormatting:
+          if it[0] == parser.currentNode:
+            fail = false
+        if fail:
+          pop_current_node
+          return false
+      var i = 0
+      while true: # outer loop; the spec caps it at 8 iterations
+        if i >= 8: return false
+        inc i
+        if parser.activeFormatting.len == 0: return true
+        # Find the last matching formatting element before the last marker.
+        var formatting: Element
+        var formattingIndex: int
+        for j in countdown(parser.activeFormatting.high, 0):
+          let element = parser.activeFormatting[j][0]
+          if element == nil:
+            return true # marker reached; act as any other end tag
+          if element.tagType != TAG_UNKNOWN and element.tagtype == token.tagtype or element.qualifiedName == token.tagname:
+            formatting = element
+            formattingIndex = j
+            break
+          if j == 0:
+            return true
+        let stackIndex = parser.openElements.find(formatting)
+        if stackIndex < 0:
+          # In the formatting list but no longer on the stack: drop the entry.
+          parse_error
+          parser.activeFormatting.del(formattingIndex)
+          return false
+        if not parser.openElements.hasElementInScope(formatting):
+          parse_error
+          return false
+        if formatting != parser.currentNode: parse_error
+        # Furthest block: the lowest special element above formatting.
+        var furthestBlock: Element = nil
+        var furthestBlockIndex: int
+        for j in countdown(parser.openElements.high, 0):
+          if parser.openElements[j] == formatting:
+            break
+          if parser.openElements[j].tagType in SpecialElements:
+            furthestBlock = parser.openElements[j]
+            furthestBlockIndex = j
+            break
+        if furthestBlock == nil:
+          # No furthest block: pop through formatting and drop its entry.
+          while parser.openElements.pop() != formatting: discard
+          parser.activeFormatting.del(formattingIndex)
+          return false
+        let commonAncestor = parser.openElements[stackIndex - 1]
+        var bookmark = formattingIndex
+        var node = furthestBlock
+        var aboveNode = parser.openElements[furthestBlockIndex - 1]
+        var lastNode = furthestBlock
+        var j = 0
+        while true: # inner loop: clone/reparent between furthest block & formatting
+          inc j
+          node = aboveNode
+          if node == formatting: break
+          var nodeFormattingIndex = -1
+          for i in countdown(parser.activeFormatting.high, 0):
+            if parser.activeFormatting[i][0] == node:
+              nodeFormattingIndex = i
+              break
+          if j > 3 and nodeFormattingIndex >= 0:
+            parser.activeFormatting.del(nodeFormattingIndex)
+            if nodeFormattingIndex < bookmark:
+              dec bookmark # a previous node got deleted, so decrease bookmark by one
+            # fixed: node is no longer in the list, so the branch below must
+            # now remove it from the stack (spec inner loop step 5)
+            nodeFormattingIndex = -1
+          let nodeStackIndex = parser.openElements.find(node)
+          # fixed: remember the element above node *before* any removal or
+          # replacement, so the next iteration continues from the right place
+          aboveNode = parser.openElements[nodeStackIndex - 1]
+          if nodeFormattingIndex < 0:
+            # Not a formatting element: remove it from the stack entirely.
+            parser.openElements.del(nodeStackIndex)
+            if nodeStackIndex < furthestBlockIndex:
+              dec furthestBlockIndex
+            continue
+          # Clone node and swap the clone into both lists.
+          let element = parser.createElement(parser.activeFormatting[nodeFormattingIndex][1], $Namespace.HTML, commonAncestor)
+          parser.activeFormatting[nodeFormattingIndex] = (element, parser.activeFormatting[nodeFormattingIndex][1])
+          # fixed: index the stack with the stack index, not the
+          # active-formatting-list index
+          parser.openElements[nodeStackIndex] = element
+          node = element
+          if lastNode == furthestBlock:
+            # fixed: the bookmark moves to immediately *after* node's entry
+            bookmark = nodeFormattingIndex + 1
+          node.append(lastNode)
+          lastNode = node
+        let location = parser.appropriatePlaceForInsert(commonAncestor)
+        location.inside.insert(lastNode, location.before)
+        let token = parser.activeFormatting[formattingIndex][1]
+        let element = parser.createElement(token, $Namespace.HTML, furthestBlock)
+        for child in furthestBlock.childNodes:
+          child.remove()
+          element.append(child)
+        furthestBlock.append(element)
+        # fixed: remove the old entry *before* inserting at the bookmark, so
+        # the insertion cannot invalidate formattingIndex; adjust the bookmark
+        # when the removal happens below it
+        parser.activeFormatting.del(formattingIndex)
+        if formattingIndex < bookmark:
+          dec bookmark
+        parser.activeFormatting.insert((element, token), bookmark)
+        # fixed: pop formatting from the stack first; furthest block then sits
+        # at furthestBlockIndex - 1, so inserting at furthestBlockIndex places
+        # the clone immediately below it, as the spec requires
+        parser.openElements.del(stackIndex)
+        parser.openElements.insert(element, furthestBlockIndex)
+
+    # Fallback for start tags with no dedicated IN_BODY rule ("any other
+    # start tag"): reconstruct the active formatting elements, then insert
+    # an HTML element for the token.
+    template any_other_start_tag() =
+      parser.reconstructActiveFormatting()
+      discard parser.insertHTMLElement(token)
+
+    # Fallback for end tags with no dedicated IN_BODY rule ("any other end
+    # tag"): walk the stack of open elements from the current node downward.
+    # On a matching element: generate implied end tags and pop up to and
+    # including it.  If a special element is hit first: parse error, token
+    # ignored.
+    template any_other_end_tag() =
+      for i in countdown(parser.openElements.high, 0):
+        let node = parser.openElements[i]
+        # NOTE: `and` binds tighter than `or`, so unknown-tag nodes match by name only.
+        if node.tagType != TAG_UNKNOWN and node.tagType == token.tagtype or node.localName == token.tagname: #TODO local or qualified name?
+          parser.generateImpliedEndTags(token.tagtype)
+          if node != parser.currentNode: parse_error
+          while parser.openElements.pop() != node: discard
+          break
+        elif node.tagType in SpecialElements:
+          parse_error
+          return # NOTE(review): template `return` exits the *enclosing proc* -- confirm intended
+    
+    match token:
+      '\0' => (block: parse_error)
+      AsciiWhitespace => (block:
+        parser.reconstructActiveFormatting()
+        parser.insertCharacter(token.c)
+      )
+      TokenType.CHARACTER_ASCII => (block:
+        parser.reconstructActiveFormatting()
+        parser.insertCharacter(token.c)
+        parser.framesetOk = false
+      )
+      TokenType.CHARACTER => (block:
+        parser.reconstructActiveFormatting()
+        parser.insertCharacter(token.r)
+        parser.framesetOk = false
+      )
+      TokenType.COMMENT => (block: parser.insertComment(token))
+      TokenType.DOCTYPE => (block: parse_error)
+      "<html>" => (block:
+        parse_error
+        if parser.openElements.hasElement(TAG_TEMPLATE):
+          discard
         else:
-          anything_else
-      of '/':
-        if is_appropriate_end_tag_token:
-          switch_state SELF_CLOSING_START_TAG
+          for k, v in token.attrs:
+            if k notin parser.openElements[0].attributes:
+              parser.openElements[0].attributes[k] = v
+      )
+      ("<base>", "<basefont>", "<bgsound>", "<link>", "<meta>", "<noframes>", "<script>", "<style>", "<template>", "<title>",
+       "</template>") => (block: parser.processInHTMLContent(token, IN_HEAD))
+      "<body>" => (block:
+        parse_error
+        if parser.openElements.len == 1 or parser.openElements[1].tagType != TAG_BODY or parser.openElements.hasElement(TAG_TEMPLATE):
+          discard
         else:
-          anything_else
-      of '>':
-        if is_appropriate_end_tag_token:
-          switch_state DATA
-          emit_tok
+          parser.framesetOk = false
+          for k, v in token.attrs:
+            if k notin parser.openElements[1].attributes:
+              parser.openElements[1].attributes[k] = v
+      )
+      "<frameset>" => (block:
+        parse_error
+        if parser.openElements.len == 1 or parser.openElements[1].tagType != TAG_BODY or not parser.framesetOk:
+          discard
         else:
-          anything_else
-      of AsciiAlpha: # note: merged upper & lower
-        tokenizer.tok.tagname &= char(tokenizer.curr).tolower()
-        tokenizer.tmp &= tokenizer.curr
-      else:
-        new_token nil #TODO
-        emit '<'
-        emit '/'
-        for r in tokenizer.tmp.runes:
-          emit r
-        reconsume_in RAWTEXT
-
-    of SCRIPT_DATA_LESS_THAN_SIGN:
-      case c
-      of '/':
-        tokenizer.tmp = ""
-        switch_state SCRIPT_DATA_END_TAG_OPEN
-      of '!':
-        switch_state SCRIPT_DATA_ESCAPE_START
-        emit '<'
-        emit '!'
-      else:
-        emit '<'
-        reconsume_in SCRIPT_DATA
-
-    of SCRIPT_DATA_END_TAG_OPEN:
-      case c
-      of AsciiAlpha:
-        new_token Token(t: END_TAG)
-        reconsume_in SCRIPT_DATA_END_TAG_NAME
-      else:
-        emit '<'
-        emit '/'
-        reconsume_in SCRIPT_DATA
-
-    of SCRIPT_DATA_END_TAG_NAME:
-      has_anything_else
-      case c
-      of whitespace:
-        if is_appropriate_end_tag_token:
-          switch_state BEFORE_ATTRIBUTE_NAME
+          if parser.openElements[1].parentNode != nil:
+            parser.openElements[1].remove()
+            pop_all_nodes
+      )
+      TokenType.EOF => (block:
+        if parser.templateModes.len > 0:
+          parser.processInHTMLContent(token, IN_TEMPLATE)
         else:
-          anything_else
-      of '/':
-        if is_appropriate_end_tag_token:
-          switch_state SELF_CLOSING_START_TAG
+          #NOTE parse error omitted
+          discard # stop
+      )
+      "</body>" => (block:
+        if not parser.openElements.hasElementInScope(TAG_BODY):
+          parse_error
         else:
-          anything_else
-      of '>':
-        if is_appropriate_end_tag_token:
-          switch_state DATA
-          emit_tok
+          #NOTE parse error omitted
+          parser.insertionMode = AFTER_BODY
+      )
+      "</html>" => (block:
+        if not parser.openElements.hasElementInScope(TAG_BODY):
+          parse_error
         else:
-          anything_else
-      of AsciiAlpha: # note: merged upper & lower
-        tokenizer.tok.tagname &= char(tokenizer.curr).tolower()
-        tokenizer.tmp &= tokenizer.curr
-      else:
-        emit '<'
-        emit '/'
-        for r in tokenizer.tmp.runes:
-          emit r
-        reconsume_in SCRIPT_DATA
-
-    of SCRIPT_DATA_ESCAPE_START:
-      case c
-      of '-':
-        switch_state SCRIPT_DATA_ESCAPE_START_DASH
-        emit '-'
-      else:
-        reconsume_in SCRIPT_DATA
-
-    of SCRIPT_DATA_ESCAPE_START_DASH:
-      case c
-      of '-':
-        switch_state SCRIPT_DATA_ESCAPED_DASH_DASH
-        emit '-'
-      else:
-        reconsume_in SCRIPT_DATA
-
-    of SCRIPT_DATA_ESCAPED:
-      case c
-      of '-':
-        switch_state SCRIPT_DATA_ESCAPED_DASH
-        emit '-'
-      of '<':
-        switch_state SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN
-      of null:
-        parse_error unexpected_null_character
-        emit_replacement
-      of eof:
-        parse_error eof_in_script_html_comment_like_text
-        emit_eof
-      else:
-        emit_current
-
-    of SCRIPT_DATA_ESCAPED_DASH:
-      case c
-      of '-':
-        switch_state SCRIPT_DATA_ESCAPED_DASH_DASH
-        emit '-'
-      of '<':
-        switch_state SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN
-      of null:
-        parse_error unexpected_null_character
-        switch_state SCRIPT_DATA_ESCAPED
-      of eof:
-        parse_error eof_in_script_html_comment_like_text
-        emit_eof
-      else:
-        switch_state SCRIPT_DATA_ESCAPED
-        emit_current
-
-    of SCRIPT_DATA_ESCAPED_DASH_DASH:
-      case c
-      of '-':
-        emit '-'
-      of '<':
-        switch_state SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN
-      of '>':
-        switch_state SCRIPT_DATA
-        emit '>'
-      of null:
-        parse_error unexpected_null_character
-        switch_state SCRIPT_DATA_ESCAPED
-      of eof:
-        parse_error eof_in_script_html_comment_like_text
-        emit_eof
-      else:
-        switch_state SCRIPT_DATA_ESCAPED
-        emit_current
-
-    of SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN:
-      case c
-      of '/':
-        tokenizer.tmp = ""
-        switch_state SCRIPT_DATA_ESCAPED_END_TAG_OPEN
-      of AsciiAlpha:
-        tokenizer.tmp = ""
-        emit '<'
-        reconsume_in SCRIPT_DATA_DOUBLE_ESCAPE_START
-      else:
-        emit '<'
-        reconsume_in SCRIPT_DATA_ESCAPED
-
-    of SCRIPT_DATA_ESCAPED_END_TAG_OPEN:
-      case c
-      of AsciiAlpha:
-        new_token Token(t: START_TAG)
-        reconsume_in SCRIPT_DATA_ESCAPED_END_TAG_NAME
-      else:
-        emit '<'
-        emit '/'
-        reconsume_in SCRIPT_DATA_ESCAPED
-
-    of SCRIPT_DATA_ESCAPED_END_TAG_NAME:
-      has_anything_else
-      case c
-      of whitespace:
-        if is_appropriate_end_tag_token:
-          switch_state BEFORE_ATTRIBUTE_NAME
+          #NOTE parse error omitted
+          parser.insertionMode = AFTER_BODY
+          reprocess token
+      )
+      ("<address>", "<article>", "<aside>", "<blockquote>", "<center>",
+      "<details>", "<dialog>", "<dir>", "<div>", "<dl>", "<fieldset>",
+      "<figcaption>", "<figure>", "<footer>", "<header>", "<hgroup>", "<main>",
+      "<menu>", "<nav>", "<ol>", "<p>", "<section>", "<summary>", "<ul>") => (block:
+        if parser.openElements.hasElementInButtonScope(TAG_P):
+          parser.closeP()
+        discard parser.insertHTMLElement(token)
+      )
+      ("<h1>", "<h2>", "<h3>", "<h4>", "<h5>", "<h6>") => (block:
+        if parser.openElements.hasElementInButtonScope(TAG_P):
+          parser.closeP()
+        if parser.currentNode.tagType in HTagTypes:
+          parse_error
+          pop_current_node
+        discard parser.insertHTMLElement(token)
+      )
+      ("<pre>", "<listing>") => (block:
+        if parser.openElements.hasElementInButtonScope(TAG_P):
+          parser.closeP()
+        discard parser.insertHTMLElement(token)
+        parser.ignoreLF = true
+        parser.framesetOk = false
+      )
+      "<form>" => (block:
+        let hasTemplate = parser.openElements.hasElement(TAG_TEMPLATE)
+        if parser.form != nil and not hasTemplate:
+          parse_error
         else:
-          anything_else
-      of '/':
-        if is_appropriate_end_tag_token:
-          switch_state SELF_CLOSING_START_TAG
+          if parser.openElements.hasElementInButtonScope(TAG_P):
+            parser.closeP()
+          let element = parser.insertHTMLElement(token)
+          if not hasTemplate:
+            parser.form = HTMLFormElement(element)
+      )
+      "<li>" => (block:
+        parser.framesetOk = false
+        for i in countdown(parser.openElements.high, 0):
+          let node = parser.openElements[i]
+          case node.tagType
+          of TAG_LI:
+            parser.generateImpliedEndTags(TAG_LI)
+            if parser.currentNode.tagType != TAG_LI: parse_error
+            while parser.openElements.pop().tagType != TAG_LI: discard
+            break
+          of SpecialElements - {TAG_ADDRESS, TAG_DIV, TAG_P, TAG_LI}:
+            break
+          else: discard
+        if parser.openElements.hasElementInButtonScope(TAG_P):
+          parser.closeP()
+        discard parser.insertHTMLElement(token)
+      )
+      ("<dd>", "<dt>") => (block:
+        parser.framesetOk = false
+        for i in countdown(parser.openElements.high, 0):
+          let node = parser.openElements[i]
+          case node.tagType
+          of TAG_DD:
+            parser.generateImpliedEndTags(TAG_DD)
+            if parser.currentNode.tagType != TAG_DD: parse_error
+            while parser.openElements.pop().tagType != TAG_DD: discard
+            break
+          of TAG_DT:
+            parser.generateImpliedEndTags(TAG_DT)
+            if parser.currentNode.tagType != TAG_DT: parse_error
+            while parser.openElements.pop().tagType != TAG_DT: discard
+            break
+          of SpecialElements - {TAG_ADDRESS, TAG_DIV, TAG_P, TAG_DD, TAG_DT}:
+            break
+          else: discard
+        if parser.openElements.hasElementInButtonScope(TAG_P):
+          parser.closeP()
+        discard parser.insertHTMLElement(token)
+      )
+      "<plaintext>" => (block:
+        if parser.openElements.hasElementInButtonScope(TAG_P):
+          parser.closeP()
+        discard parser.insertHTMLElement(token)
+        parser.tokenizer.state = PLAINTEXT
+      )
+      "<button>" => (block:
+        if parser.openElements.hasElementInScope(TAG_BUTTON):
+          parse_error
+          parser.generateImpliedEndTags()
+          while parser.openElements.pop().tagType != TAG_BUTTON: discard
+        parser.reconstructActiveFormatting()
+        discard parser.insertHTMLElement(token)
+        parser.framesetOk = false
+      )
+      ("</address>", "</article>", "</aside>", "</blockquote>", "</button>",
+       "</center>", "</details>", "</dialog>", "</dir>", "</div>", "</dl>",
+       "</fieldset>", "</figcaption>", "</figure>", "</footer>", "</header>",
+       "</hgroup>", "</listing>", "</main>", "</menu>", "</nav>", "</ol>",
+       "</pre>", "</section>", "</summary>", "</ul>") => (block:
+        if not parser.openElements.hasElementInScope(token.tagtype):
+          parse_error
         else:
-          anything_else
-      of '>':
-        if is_appropriate_end_tag_token:
-          switch_state DATA
+          parser.generateImpliedEndTags()
+          if parser.currentNode.tagType != token.tagtype: parse_error
+          while parser.openElements.pop().tagType != token.tagtype: discard
+      )
+      "</form>" => (block:
+        if not parser.openElements.hasElement(TAG_TEMPLATE):
+          let node = parser.form
+          parser.form = nil
+          if node == nil or not parser.openElements.hasElementInScope(node.tagType):
+            parse_error
+            return
+          parser.generateImpliedEndTags()
+          if parser.currentNode != node: parse_error
+          parser.openElements.del(parser.openElements.find(node))
         else:
-          anything_else
-      of AsciiAlpha:
-        tokenizer.tok.tagname &= char(tokenizer.curr).tolower()
-        tokenizer.tmp &= tokenizer.curr
-      else:
-        emit '<'
-        emit '/'
-        for r in tokenizer.tmp.runes:
-          emit r
-        reconsume_in SCRIPT_DATA_ESCAPED
-
-    of SCRIPT_DATA_DOUBLE_ESCAPE_START:
-      case c
-      of whitespace, '/', '>':
-        if tokenizer.tmp == "script":
-          switch_state SCRIPT_DATA_DOUBLE_ESCAPED
+          if not parser.openElements.hasElementInScope(TAG_FORM):
+            parse_error
+            return
+          parser.generateImpliedEndTags()
+          if parser.currentNode.tagType != TAG_FORM: parse_error
+          while parser.openElements.pop().tagType != TAG_FORM: discard
+      )
+      "</p>" => (block:
+        if not parser.openElements.hasElementInButtonScope(TAG_P):
+          parse_error
+          discard parser.insertHTMLElement(Token(t: START_TAG, tagtype: TAG_P))
+        parser.closeP()
+      )
+      "</li>" => (block:
+        if not parser.openElements.hasElementInListItemScope(TAG_LI):
+          parse_error
         else:
-          switch_state SCRIPT_DATA_ESCAPED
-          emit_current
-      of AsciiAlpha: # note: merged upper & lower
-        tokenizer.tmp &= c.tolower()
-        emit_current
-      else: reconsume_in SCRIPT_DATA_ESCAPED
-
-    of SCRIPT_DATA_DOUBLE_ESCAPED:
-      case c
-      of '-':
-        switch_state SCRIPT_DATA_DOUBLE_ESCAPED_DASH
-        emit '-'
-      of '<':
-        switch_state SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN
-        emit '<'
-      of null:
-        parse_error unexpected_null_character
-        emit_replacement
-      of eof:
-        parse_error eof_in_script_html_comment_like_text
-        emit_eof
-      else: emit_current
-
-    of SCRIPT_DATA_DOUBLE_ESCAPED_DASH:
-      case c
-      of '-':
-        switch_state SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH
-        emit '-'
-      of '<':
-        switch_state SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN
-        emit '<'
-      of null:
-        parse_error unexpected_null_character
-        switch_state SCRIPT_DATA_DOUBLE_ESCAPED
-        emit_replacement
-      of eof:
-        parse_error eof_in_script_html_comment_like_text
-        emit_eof
-      else:
-        switch_state SCRIPT_DATA_DOUBLE_ESCAPED
-        emit_current
-
-    of SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH:
-      case c
-      of '-': emit '-'
-      of '<':
-        switch_state SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN
-        emit '<'
-      of '>':
-        switch_state SCRIPT_DATA
-        emit '>'
-      of null:
-        parse_error unexpected_null_character
-        switch_state SCRIPT_DATA_DOUBLE_ESCAPED
-        emit_replacement
-      of eof:
-        parse_error eof_in_script_html_comment_like_text
-        emit_eof
-      else: switch_state SCRIPT_DATA_DOUBLE_ESCAPED
-
-    of SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN:
-      case c
-      of '/':
-        tokenizer.tmp = ""
-        switch_state SCRIPT_DATA_DOUBLE_ESCAPE_END
-        emit '/'
-      else: reconsume_in SCRIPT_DATA_DOUBLE_ESCAPED
-
-    of SCRIPT_DATA_DOUBLE_ESCAPE_END:
-      case c
-      of whitespace, '/', '>':
-        if tokenizer.tmp == "script":
-          switch_state SCRIPT_DATA_ESCAPED
+          parser.generateImpliedEndTags(TAG_LI)
+          if parser.currentNode.tagType != TAG_LI: parse_error
+          while parser.openElements.pop().tagType != TAG_LI: discard
+      )
+      ("</dd>", "</dt>") => (block:
+        if not parser.openElements.hasElementInScope(token.tagtype):
+          parse_error
         else:
-          switch_state SCRIPT_DATA_DOUBLE_ESCAPED
-          emit_current
-      of AsciiAlpha: # note: merged upper & lower
-        tokenizer.tmp &= c.tolower()
-        emit_current
-      else:
-        reconsume_in SCRIPT_DATA_DOUBLE_ESCAPED
-
-    of BEFORE_ATTRIBUTE_NAME:
-      case c
-      of whitespace: discard
-      of '/', '>', eof: reconsume_in AFTER_ATTRIBUTE_NAME
-      of '=':
-        parse_error unexpected_equals_sign_before_attribute_name
-        start_new_attribute
-        switch_state ATTRIBUTE_NAME
-      else:
-        start_new_attribute
-        reconsume_in ATTRIBUTE_NAME
-
-    of ATTRIBUTE_NAME:
-      has_anything_else
-      case c
-      of whitespace, '/', '>', eof:
-        leave_attribute_name_state
-        reconsume_in AFTER_ATTRIBUTE_NAME
-      of '=':
-        leave_attribute_name_state
-        switch_state BEFORE_ATTRIBUTE_VALUE
-      of AsciiUpperAlpha:
-        tokenizer.attrn &= c.tolower()
-      of null:
-        parse_error unexpected_null_character
-        tokenizer.attrn &= Rune(0xFFFD)
-      of '"', '\'', '<':
-        parse_error unexpected_character_in_attribute_name
+          parser.generateImpliedEndTags(token.tagtype)
+          if parser.currentNode.tagType != token.tagtype: parse_error
+          while parser.openElements.pop().tagType != token.tagtype: discard
+      )
+      ("</h1>", "</h2>", "</h3>", "</h4>", "</h5>", "</h6>") => (block:
+        if not parser.openElements.hasElementInScope(HTagTypes):
+          parse_error
+        else:
+          parser.generateImpliedEndTags()
+          if parser.currentNode.tagType != token.tagtype: parse_error
+          while parser.openElements.pop().tagType notin HTagTypes: discard
+      )
+      "</sarcasm>" => (block:
+        #*deep breath*
         anything_else
-      else:
-        tokenizer.attrn &= tokenizer.curr
-
-    of AFTER_ATTRIBUTE_NAME:
-      case c
-      of whitespace: discard
-      of '/': switch_state SELF_CLOSING_START_TAG
-      of '=': switch_state BEFORE_ATTRIBUTE_VALUE
-      of '>':
-        switch_state DATA
-        emit '>'
-      of eof:
-        parse_error eof_in_tag
-        emit_eof
-      else:
-        start_new_attribute
-        reconsume_in ATTRIBUTE_NAME
-
-    of BEFORE_ATTRIBUTE_VALUE:
-      case c
-      of whitespace: discard
-      of '"': switch_state ATTRIBUTE_VALUE_DOUBLE_QUOTED
-      of '\'': switch_state ATTRIBUTE_VALUE_SINGLE_QUOTED
-      of '>':
-        parse_error missing_attribute_value
-        switch_state DATA
-        emit '>'
-      else: reconsume_in ATTRIBUTE_VALUE_UNQUOTED
-
-    of ATTRIBUTE_VALUE_DOUBLE_QUOTED:
-      case c
-      of '"': switch_state AFTER_ATTRIBUTE_VALUE_QUOTED
-      of '&': switch_state_return CHARACTER_REFERENCE
-      of null:
-        parse_error unexpected_null_character
-        append_to_current_attr_value Rune(0xFFFD)
-      of eof:
-        parse_error eof_in_tag
-        emit_eof
-      else: append_to_current_attr_value tokenizer.curr
-
-    of ATTRIBUTE_VALUE_SINGLE_QUOTED:
-      case c
-      of '\'': switch_state AFTER_ATTRIBUTE_VALUE_QUOTED
-      of '&': switch_state_return CHARACTER_REFERENCE
-      of null:
-        parse_error unexpected_null_character
-        append_to_current_attr_value Rune(0xFFFD)
-      of eof:
-        parse_error eof_in_tag
-        emit_eof
-      else: append_to_current_attr_value tokenizer.curr
-
-    of ATTRIBUTE_VALUE_UNQUOTED:
-      case c
-      of whitespace: switch_state BEFORE_ATTRIBUTE_NAME
-      of '&': switch_state_return CHARACTER_REFERENCE
-      of '>': switch_state DATA
-      of null:
-        parse_error unexpected_null_character
-        append_to_current_attr_value Rune(0xFFFD)
-      of '"', '\'', '<', '=', '`':
-        parse_error unexpected_character_in_unquoted_attribute_value
-        append_to_current_attr_value c
-      of eof:
-        parse_error eof_in_tag
-        emit_eof
-      else: append_to_current_attr_value tokenizer.curr
-
-    of AFTER_ATTRIBUTE_VALUE_QUOTED:
-      case c
-      of whitespace:
-        switch_state BEFORE_ATTRIBUTE_NAME
-      of '/':
-        switch_state SELF_CLOSING_START_TAG
-      of '>':
-        switch_state DATA
-        emit_tok
-      of eof:
-        parse_error eof_in_tag
-        emit_eof
-      else: append_to_current_attr_value tokenizer.curr
-
-    of SELF_CLOSING_START_TAG:
-      case c
-      of '>':
-        tokenizer.tok.selfclosing = true
-        switch_state DATA
-        emit '>'
-      of eof:
-        parse_error eof_in_tag
-        emit_eof
-      else:
-        parse_error unexpected_solidus_in_tag
-        reconsume_in BEFORE_ATTRIBUTE_NAME
-
-    of BOGUS_COMMENT:
-      assert tokenizer.tok.t == COMMENT
-      case c
-      of '>':
-        switch_state DATA
-        emit_tok
-      of eof:
-        emit_tok
-        emit_eof
-      of null: parse_error unexpected_null_character
-      else: tokenizer.tok.data &= tokenizer.curr
-
-    of MARKUP_DECLARATION_OPEN: # note: rewritten to fit case model as we consume a char anyway
-      has_anything_else
-      case c
-      of '-':
-        if peek_char == '-':
-          new_token Token(t: COMMENT)
-          tokenizer.state = COMMENT_START
-          consume_and_discard 1
-        else: anything_else
-      of 'D', 'd':
-        if peek_str_nocase("OCTYPE"):
-          consume_and_discard "OCTYPE".len
-          switch_state DOCTYPE
-        else: anything_else
-      of '[':
-        if peek_str("CDATA["):
-          consume_and_discard "CDATA[".len
-          if has_adjusted_current_node: #TODO and it is not an element in the HTML namespace
-            switch_state CDATA_SECTION
-          else:
-            parse_error cdata_in_html_content
-            new_token Token(t: COMMENT, data: "[CDATA[")
-            switch_state BOGUS_COMMENT
-        else: anything_else
-      else:
-        parse_error incorrectly_opened_comment
-        new_token Token(t: COMMENT)
-        reconsume_in BOGUS_COMMENT
-
-    of COMMENT_START:
-      case c
-      of '-': switch_state COMMENT_START_DASH
-      of '>':
-        parse_error abrupt_closing_of_empty_comment
-        switch_state DATA
-        emit_tok
-      else: reconsume_in COMMENT
-
-    of COMMENT_START_DASH:
-      case c
-      of '-': switch_state COMMENT_END
-      of '>':
-        parse_error abrupt_closing_of_empty_comment
-        switch_state DATA
-        emit_tok
-      of eof:
-        parse_error eof_in_comment
-        emit_tok
-        emit_eof
-      else:
-        tokenizer.tok.data &= '-'
-        reconsume_in COMMENT
-
-    of COMMENT:
-      case c
-      of '<':
-        tokenizer.tok.data &= c
-        switch_state COMMENT_LESS_THAN_SIGN
-      of '-': switch_state COMMENT_END_DASH
-      of null:
-        parse_error unexpected_null_character
-        tokenizer.tok.data &= Rune(0xFFFD)
-      of eof:
-        parse_error eof_in_comment
-        emit_tok
-        emit_eof
-      else: tokenizer.tok.data &= tokenizer.curr
-
-    of COMMENT_LESS_THAN_SIGN:
-      case c
-      of '!':
-        tokenizer.tok.data &= c
-        switch_state COMMENT_LESS_THAN_SIGN_BANG
-      of '<': tokenizer.tok.data &= c
-      else: reconsume_in COMMENT
-
-    of COMMENT_LESS_THAN_SIGN_BANG:
-      case c
-      of '-': switch_state COMMENT_LESS_THAN_SIGN_BANG_DASH
-      else: reconsume_in COMMENT
-
-    of COMMENT_LESS_THAN_SIGN_BANG_DASH:
-      case c
-      of '-': switch_state COMMENT_LESS_THAN_SIGN_BANG_DASH_DASH
-      else: reconsume_in COMMENT_END_DASH
-
-    of COMMENT_LESS_THAN_SIGN_BANG_DASH_DASH:
-      case c
-      of '>', eof: reconsume_in COMMENT_END
-      else:
-        parse_error nested_comment
-        reconsume_in COMMENT_END
-
-    of COMMENT_END_DASH:
-      case c
-      of '-': switch_state COMMENT_END
-      of eof:
-        parse_error eof_in_comment
-        emit_tok
-        emit_eof
-      else:
-        tokenizer.tok.data &= '-'
-        reconsume_in COMMENT
-
-    of COMMENT_END:
-      case c
-      of '>': switch_state DATA
-      of '!': switch_state COMMENT_END_BANG
-      of '-': tokenizer.tok.data &= '-'
-      of eof:
-        parse_error eof_in_comment
-        emit_tok
-        emit_eof
-      else:
-        tokenizer.tok.data &= "--"
-        reconsume_in COMMENT
-
-    of COMMENT_END_BANG:
-      case c
-      of '-':
-        tokenizer.tok.data &= "--!"
-        switch_state COMMENT_END_DASH
-      of '>':
-        parse_error incorrectly_closed_comment
-        switch_state DATA
-        emit_tok
-      of eof:
-        parse_error eof_in_comment
-        emit_tok
-        emit_eof
-      else:
-        tokenizer.tok.data &= "--!"
-        reconsume_in COMMENT
-
-    of DOCTYPE:
-      case c
-      of whitespace: switch_state BEFORE_DOCTYPE_NAME
-      of '>': reconsume_in BEFORE_DOCTYPE_NAME
-      of eof:
-        parse_error eof_in_doctype
-        new_token Token(t: DOCTYPE, quirks: true)
-        emit_tok
-        emit_eof
-      else:
-        parse_error missing_whitespace_before_doctype_name
-        reconsume_in BEFORE_DOCTYPE_NAME
-
-    of BEFORE_DOCTYPE_NAME:
-      case c
-      of whitespace: discard
-      of AsciiUpperAlpha:
-        new_token Token(t: DOCTYPE, name: some($c.tolower()))
-        switch_state DOCTYPE_NAME
-      of null:
-        parse_error unexpected_null_character
-        new_token Token(t: DOCTYPE, name: some($Rune(0xFFFD)))
-      of '>':
-        parse_error missing_doctype_name
-        new_token Token(t: DOCTYPE, quirks: true)
-        switch_state DATA
-        emit_tok
-      of eof:
-        parse_error eof_in_doctype
-        new_token Token(t: DOCTYPE, quirks: true)
-        emit_tok
-        emit_eof
-      else:
-        new_token Token(t: DOCTYPE, name: some($tokenizer.curr))
-        switch_state DOCTYPE_NAME
-
-    of DOCTYPE_NAME:
-      case c
-      of whitespace: switch_state AFTER_DOCTYPE_NAME
-      of '>':
-        switch_state DATA
-        emit_tok
-      of AsciiUpperAlpha:
-        tokenizer.tok.name.get &= c.tolower()
-      of null:
-        parse_error unexpected_null_character
-        tokenizer.tok.name.get &= Rune(0xFFFD)
-      of eof:
-        parse_error eof_in_doctype
-        tokenizer.tok.quirks = true
-        emit_tok
-        emit_eof
-      else:
-        tokenizer.tok.name.get &= tokenizer.curr
-
-    of AFTER_DOCTYPE_NAME: # note: rewritten to fit case model as we consume a char anyway
-      has_anything_else
-      case c
-      of whitespace: discard
-      of '>':
-        switch_state DATA
-        emit_tok
-      of eof:
-        parse_error eof_in_doctype
-        tokenizer.tok.quirks = true
-        emit_tok
-        emit_eof
-      of 'p', 'P':
-        if peek_str("UBLIC"):
-          consume_and_discard "UBLIC".len
-          switch_state AFTER_DOCTYPE_PUBLIC_KEYWORD
+      )
+      "<a>" => (block:
+        var element: Element = nil
+        for i in countdown(parser.activeFormatting.high, 0):
+          let format = parser.activeFormatting[i]
+          if format[0] == nil:
+            break
+          if format[0].tagType == TAG_A:
+            element = format[0]
+            break
+        if element != nil:
+          parse_error
+          if parser.adoptionAgencyAlgorithm(token):
+            any_other_end_tag
+            return
+          for i in 0..parser.activeFormatting.high:
+            if parser.activeFormatting[i][0] == element:
+              parser.activeFormatting.del(i)
+              break
+          for i in 0..parser.openElements.high:
+            if parser.openElements[i] == element:
+              parser.openElements.del(i)
+              break
+          parser.reconstructActiveFormatting()
+          let element = parser.insertHTMLElement(token)
+          parser.pushOntoActiveFormatting(element, token)
+      )
+      ("<b>", "<big>", "<code>", "<em>", "<font>", "<i>", "<s>", "<small>",
+       "<strike>", "<strong>", "<tt>", "<u>") => (block:
+        parser.reconstructActiveFormatting()
+        let element = parser.insertHTMLElement(token)
+        parser.pushOntoActiveFormatting(element, token)
+      )
+      "<nobr>" => (block:
+        parser.reconstructActiveFormatting()
+        if parser.openElements.hasElementInScope(TAG_NOBR):
+          parse_error
+          if parser.adoptionAgencyAlgorithm(token):
+            any_other_end_tag
+            return
+          parser.reconstructActiveFormatting()
+        let element = parser.insertHTMLElement(token)
+        parser.pushOntoActiveFormatting(element, token)
+      )
+      ("</a>", "</b>", "</big>", "</code>", "</em>", "</font>", "</i>",
+       "</nobr>", "</s>", "</small>", "</strike>", "</strong>", "</tt>",
+       "</u>") => (block:
+        if parser.adoptionAgencyAlgorithm(token):
+          any_other_end_tag
+          return
+      )
+      ("<applet>", "<marquee>", "<object>") => (block:
+        parser.reconstructActiveFormatting()
+        discard parser.insertHTMLElement(token)
+        parser.activeFormatting.add((nil, nil))
+        parser.framesetOk = false
+      )
+      ("</applet>", "</marquee>", "</object>") => (block:
+        if not parser.openElements.hasElementInScope(token.tagtype):
+          parse_error
         else:
-          anything_else
-      of 's', 'S':
-        if peek_str("YSTEM"):
-          consume_and_discard "YSTEM".len
-          switch_state AFTER_DOCTYPE_SYSTEM_KEYWORD
+          parser.generateImpliedEndTags()
+          if parser.currentNode.tagType != token.tagtype: parse_error
+          while parser.openElements.pop().tagType != token.tagtype: discard
+          parser.clearActiveFormattingTillMarker()
+      )
+      "<table>" => (block:
+        if parser.document.mode != QUIRKS:
+          if parser.openElements.hasElementInButtonScope(TAG_P):
+            parser.closeP()
+        discard parser.insertHTMLElement(token)
+        parser.framesetOk = false
+        parser.insertionMode = IN_TABLE
+      )
+      "</br>" => (block:
+        parse_error
+        parser.processInHTMLContent(Token(t: START_TAG, tagtype: TAG_BR))
+      )
+      ("<area>", "<br>", "<embed>", "<img>", "<keygen>", "<wbr>") => (block:
+        parser.reconstructActiveFormatting()
+        discard parser.insertHTMLElement(token)
+        pop_current_node
+        parser.framesetOk = false
+      )
+      "<input>" => (block:
+        parser.reconstructActiveFormatting()
+        discard parser.insertHTMLElement(token)
+        pop_current_node
+        if not token.attrs.getOrDefault("type").equalsIgnoreCase("hidden"):
+          parser.framesetOk = false
+      )
+      ("<param>", "<source>", "<track>") => (block:
+        discard parser.insertHTMLElement(token)
+        pop_current_node
+      )
+      "<hr>" => (block:
+        if parser.openElements.hasElementInButtonScope(TAG_P):
+          parser.closeP()
+        discard parser.insertHTMLElement(token)
+        pop_current_node
+        parser.framesetOk = false
+      )
+      "<image>" => (block:
+        #TODO ew
+        let token = Token(t: START_TAG, tagtype: TAG_IMG, tagname: "img", selfclosing: token.selfclosing, attrs: token.attrs)
+        reprocess token
+      )
+      "<textarea>" => (block:
+        discard parser.insertHTMLElement(token)
+        parser.ignoreLF = true
+        parser.tokenizer.state = RCDATA
+        parser.oldInsertionMode = parser.insertionMode
+        parser.framesetOk = false
+        parser.insertionMode = TEXT
+      )
+      "<xmp>" => (block:
+        if parser.openElements.hasElementInButtonScope(TAG_P):
+          parser.closeP()
+        parser.reconstructActiveFormatting()
+        parser.framesetOk = false
+        parser.genericRawtextElementParsingAlgorithm(token)
+      )
+      "<iframe>" => (block:
+        parser.framesetOk = false
+        parser.genericRawtextElementParsingAlgorithm(token)
+      )
+      "<noembed>" => (block:
+        parser.genericRawtextElementParsingAlgorithm(token)
+      )
+      "<noscript>" => (block:
+        if parser.scripting:
+          parser.genericRawtextElementParsingAlgorithm(token)
         else:
-          anything_else
-      else:
-        parse_error invalid_character_sequence_after_doctype_name
-        tokenizer.tok.quirks = true
-        reconsume_in BOGUS_DOCTYPE
-
-    of AFTER_DOCTYPE_PUBLIC_KEYWORD:
-      case c
-      of whitespace: switch_state BEFORE_DOCTYPE_PUBLIC_IDENTIFIER
-      of '"':
-        parse_error missing_whitespace_after_doctype_public_keyword
-        tokenizer.tok.pubid = some("")
-        switch_state DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED
-      of '>':
-        parse_error missing_doctype_public_identifier
-        tokenizer.tok.quirks = true
-        switch_state DATA
-        emit_tok
-      of eof:
-        parse_error eof_in_doctype
-        tokenizer.tok.quirks = true
-        emit_tok
-        emit_eof
-      else:
-        parse_error missing_quote_before_doctype_public_identifier
-        tokenizer.tok.quirks = true
-        reconsume_in BOGUS_DOCTYPE
-
-    of BEFORE_DOCTYPE_PUBLIC_IDENTIFIER:
-      case c
-      of whitespace: discard
-      of '"':
-        tokenizer.tok.pubid = some("")
-        switch_state DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED
-      of '\'':
-        tokenizer.tok.pubid = some("")
-        switch_state DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED
-      of '>':
-        parse_error missing_doctype_public_identifier
-        tokenizer.tok.quirks = true
-        switch_state DATA
-        emit_tok
-      of eof:
-        parse_error eof_in_doctype
-        tokenizer.tok.quirks = true
-        emit_tok
-        emit_eof
-      else:
-        parse_error missing_quote_before_doctype_public_identifier
-        tokenizer.tok.quirks = true
-        reconsume_in BOGUS_DOCTYPE
-
-    of DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED:
-      case c
-      of '"': switch_state AFTER_DOCTYPE_PUBLIC_IDENTIFIER
-      of null:
-        parse_error unexpected_null_character
-        tokenizer.tok.pubid.get &= Rune(0xFFFD)
-      of '>':
-        parse_error abrupt_doctype_public_identifier
-        tokenizer.tok.quirks = true
-        switch_state DATA
-        emit_tok
-      of eof:
-        parse_error eof_in_doctype
-        tokenizer.tok.quirks = true
-        emit_tok
-        emit_eof
-      else:
-        tokenizer.tok.pubid.get &= tokenizer.curr
-
-    of DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED:
-      case c
-      of '\'': switch_state AFTER_DOCTYPE_PUBLIC_IDENTIFIER
-      of null:
-        parse_error unexpected_null_character
-        tokenizer.tok.pubid.get &= Rune(0xFFFD)
-      of '>':
-        parse_error abrupt_doctype_public_identifier
-        tokenizer.tok.quirks = true
-        switch_state DATA
-        emit_tok
-      of eof:
-        parse_error eof_in_doctype
-        tokenizer.tok.quirks = true
-        emit_tok
-        emit_eof
-      else:
-        tokenizer.tok.pubid.get &= tokenizer.curr
-
-    of AFTER_DOCTYPE_PUBLIC_IDENTIFIER:
-      case c
-      of whitespace: switch_state BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS
-      of '>':
-        switch_state DATA
-        emit_tok
-      of '"':
-        parse_error missing_whitespace_between_doctype_public_and_system_identifiers
-        tokenizer.tok.sysid = some("")
-        switch_state DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED
-      of '\'':
-        parse_error missing_whitespace_between_doctype_public_and_system_identifiers
-        tokenizer.tok.sysid = some("")
-        switch_state DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED
-      of eof:
-        parse_error eof_in_doctype
-        tokenizer.tok.quirks = true
-        emit_tok
-        emit_eof
-      else:
-        parse_error missing_quote_before_doctype_system_identifier
-        tokenizer.tok.quirks = true
-        reconsume_in BOGUS_DOCTYPE
-
-    of BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS:
-      case c
-      of whitespace: discard
-      of '>':
-        switch_state DATA
-        emit_tok
-      of '"':
-        tokenizer.tok.sysid = some("")
-        switch_state DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED
-      of '\'':
-        tokenizer.tok.sysid = some("")
-        switch_state DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED
-      of eof:
-        parse_error eof_in_doctype
-        tokenizer.tok.quirks = true
-        emit_tok
-        emit_eof
-      else:
-        parse_error missing_quote_before_doctype_system_identifier
-        tokenizer.tok.quirks = true
-        reconsume_in BOGUS_DOCTYPE
-
-    of AFTER_DOCTYPE_SYSTEM_KEYWORD:
-      case c
-      of whitespace: switch_state BEFORE_DOCTYPE_SYSTEM_IDENTIFIER
-      of '"':
-        parse_error missing_whitespace_after_doctype_system_keyword
-        tokenizer.tok.sysid = some("")
-        switch_state DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED
-      of '\'':
-        parse_error missing_whitespace_after_doctype_system_keyword
-        tokenizer.tok.sysid = some("")
-        switch_state DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED
-      of '>':
-        parse_error missing_doctype_system_identifier
-        tokenizer.tok.quirks = true
-        switch_state DATA
-        emit_tok
-      of eof:
-        parse_error eof_in_doctype
-        tokenizer.tok.quirks = true
-        emit_tok
-        emit_eof
-      else:
-        parse_error missing_quote_before_doctype_system_identifier
-        tokenizer.tok.quirks = true
-        reconsume_in BOGUS_DOCTYPE
-
-    of BEFORE_DOCTYPE_SYSTEM_IDENTIFIER:
-      case c
-      of whitespace: discard
-      of '"':
-        tokenizer.tok.pubid = some("")
-        switch_state DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED
-      of '\'':
-        tokenizer.tok.pubid = some("")
-        switch_state DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED
-      of '>':
-        parse_error missing_doctype_system_identifier
-        tokenizer.tok.quirks = true
-        switch_state DATA
-        emit_tok
-      of eof:
-        parse_error eof_in_doctype
-        tokenizer.tok.quirks = true
-        emit_tok
-        emit_eof
-      else:
-        parse_error missing_quote_before_doctype_system_identifier
-        tokenizer.tok.quirks = true
-        reconsume_in BOGUS_DOCTYPE
-
-    of DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED:
-      case c
-      of '"': switch_state AFTER_DOCTYPE_SYSTEM_IDENTIFIER
-      of null:
-        parse_error unexpected_null_character
-        tokenizer.tok.sysid.get &= Rune(0xFFFD)
-      of '>':
-        parse_error abrupt_doctype_system_identifier
-        tokenizer.tok.quirks = true
-        switch_state DATA
-        emit_tok
-      of eof:
-        parse_error eof_in_doctype
-        tokenizer.tok.quirks = true
-        emit_tok
-        emit_eof
-      else:
-        tokenizer.tok.sysid.get &= tokenizer.curr
-
-    of DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED:
-      case c
-      of '\'': switch_state AFTER_DOCTYPE_SYSTEM_IDENTIFIER
-      of null:
-        parse_error unexpected_null_character
-        tokenizer.tok.sysid.get &= Rune(0xFFFD)
-      of '>':
-        parse_error abrupt_doctype_system_identifier
-        tokenizer.tok.quirks = true
-        switch_state DATA
-        emit_tok
-      of eof:
-        parse_error eof_in_doctype
-        tokenizer.tok.quirks = true
-        emit_tok
-        emit_eof
-      else:
-        tokenizer.tok.sysid.get &= tokenizer.curr
-
-    of AFTER_DOCTYPE_SYSTEM_IDENTIFIER:
-      case c
-      of whitespace: discard
-      of '>':
-        switch_state DATA
-        emit_tok
-      of eof:
-        parse_error eof_in_doctype
-        tokenizer.tok.quirks = true
-        emit_tok
-        emit_eof
-      else:
-        parse_error unexpected_character_after_doctype_system_identifier
-        reconsume_in BOGUS_DOCTYPE
-
-    of BOGUS_DOCTYPE:
-      case c
-      of '>':
-        switch_state DATA
-        emit_tok
-      of null: parse_error unexpected_null_character
-      of eof:
-        emit_tok
-        emit_eof
-      else: discard
-
-    of CDATA_SECTION:
-      case c
-      of ']': switch_state CDATA_SECTION_BRACKET
-      of eof:
-        parse_error eof_in_cdata
-        emit_eof
-      else:
-        emit_current
-
-    of CDATA_SECTION_BRACKET:
-      case c
-      of ']': switch_state CDATA_SECTION_END
-      of '>': switch_state DATA
-      else:
-        emit ']'
-        reconsume_in CDATA_SECTION
-
-    of CDATA_SECTION_END:
-      case c
-      of ']': emit ']'
-      of '>': switch_state DATA
-      else:
-        emit ']'
-        emit ']'
-        reconsume_in CDATA_SECTION
-
-    of CHARACTER_REFERENCE:
-      tokenizer.tmp = "&"
-      case c
-      of AsciiAlpha: reconsume_in NAMED_CHARACTER_REFERENCE
-      of '#':
-        tokenizer.tmp &= '#'
-        switch_state NUMERIC_CHARACTER_REFERENCE
-      else:
-        flush_code_points_consumed_as_a_character_reference
-        reconsume_in tokenizer.rstate
-
-    of NAMED_CHARACTER_REFERENCE:
-      ignore_eof # we check for eof ourselves
-      tokenizer.reconsume() #TODO optimize this away
-      var buf = ""
-      var node = entityMap
-      var value = none(string) # last value
-      var match = true
-      #TODO interfacing with RadixNode is suffering
-      # plus this doesn't look very efficient either
-      while not tokenizer.atEof:
-        let c = tokenizer.consume()
-        buf &= c
-        if not node.hasPrefix(buf):
-          break
-        let prevnode = node
-        node = node{buf}
-        if node != prevnode:
-          buf = ""
-          if node.value.issome:
-            value = node.value
-        tokenizer.tmp &= tokenizer.curr
-      if value.issome:
-        if consumed_as_an_attribute and tokenizer.tmp[^1] != ';' and peek_char in {';'} + AsciiAlpha:
-          flush_code_points_consumed_as_a_character_reference
-          switch_state tokenizer.rstate
+          any_other_start_tag
+      )
+      "<select>" => (block:
+        parser.reconstructActiveFormatting()
+        discard parser.insertHTMLElement(token)
+        parser.framesetOk = false
+        if parser.insertionMode in {IN_TABLE, IN_CAPTION, IN_TABLE_BODY, IN_CELL}:
+          parser.insertionMode = IN_SELECT_IN_TABLE
         else:
-          if tokenizer.tmp[^1] != ';':
-            parse_error missing_semicolon_after_character_reference_parse_error
-            tokenizer.tmp = node.value.get
-            flush_code_points_consumed_as_a_character_reference
-            switch_state tokenizer.rstate
-      else:
-        flush_code_points_consumed_as_a_character_reference
-        switch_state AMBIGUOUS_AMPERSAND_STATE
-
-    of AMBIGUOUS_AMPERSAND_STATE:
-      case c
-      of AsciiAlpha:
-        if consumed_as_an_attribute:
-          append_to_current_attr_value c
+          parser.insertionMode = IN_SELECT
+      )
+      ("<optgroup>", "<option>") => (block:
+        if parser.currentNode.tagType == TAG_OPTION:
+          pop_current_node
+        parser.reconstructActiveFormatting()
+        discard parser.insertHTMLElement(token)
+      )
+      ("<rb>", "<rtc>") => (block:
+        if parser.openElements.hasElementInScope(TAG_RUBY):
+          parser.generateImpliedEndTags()
+          if parser.currentNode.tagType != TAG_RUBY: parse_error
+        discard parser.insertHTMLElement(token)
+      )
+      ("<rp>", "<rt>") => (block:
+        if parser.openElements.hasElementInScope(TAG_RUBY):
+          parser.generateImpliedEndTags(TAG_RTC)
+          if parser.currentNode.tagType notin {TAG_RUBY, TAG_RTC}: parse_error
+        discard parser.insertHTMLElement(token)
+      )
+      #NOTE <math> (not implemented)
+      #TODO <svg> (SVG)
+      ("<caption>", "<col>", "<colgroup>", "<frame>", "<head>", "<tbody>",
+       "<td>", "<tfoot>", "<th>", "<thead>", "<tr>") => (block: parse_error)
+      TokenType.START_TAG => (block: any_other_start_tag)
+      TokenType.END_TAG => (block: any_other_end_tag)
+
+  of TEXT:
+    match token:
+      TokenType.CHARACTER_ASCII => (block:
+        assert token.c != '\0'
+        parser.insertCharacter(token.c)
+      )
+      TokenType.CHARACTER => (block:
+        parser.insertCharacter(token.r)
+      )
+      TokenType.EOF => (block:
+        parse_error
+        if parser.currentNode.tagType == TAG_SCRIPT:
+          HTMLScriptElement(parser.currentNode).alreadyStarted = true
+        pop_current_node
+        parser.insertionMode = parser.oldInsertionMode
+        reprocess token
+      )
+      "</script>" => (block:
+        #TODO microtask
+        let script = parser.currentNode
+        pop_current_node
+        parser.insertionMode = parser.oldInsertionMode
+        #TODO document.write() ?
+        #TODO prepare script element
+        #TODO uh implement scripting or something
+      )
+      TokenType.END_TAG => (block:
+        pop_current_node
+        parser.insertionMode = parser.oldInsertionMode
+      )
+
+  of IN_TABLE:
+    template clear_the_stack_back_to_a_table_context() =
+      while parser.currentNode.tagType notin {TAG_TABLE, TAG_TEMPLATE, TAG_HTML}:
+        pop_current_node
+
+    match token:
+      (TokenType.CHARACTER_ASCII, TokenType.CHARACTER) => (block:
+        if parser.currentNode.tagType in {TAG_TABLE, TAG_TBODY, TAG_TFOOT, TAG_THEAD, TAG_TR}:
+          parser.pendingTableChars = ""
+          parser.pendingTableCharsWhitespace = true
+          parser.oldInsertionMode = parser.insertionMode
+          parser.insertionMode = IN_TABLE_TEXT
+          reprocess token
+        else: # anything else
+          parse_error
+          parser.fosterParenting = true
+          parser.processInHTMLContent(token, IN_BODY)
+          parser.fosterParenting = false
+      )
+      TokenType.COMMENT => (block: parser.insertComment(token))
+      TokenType.DOCTYPE => (block: parse_error)
+      "<caption>" => (block: 
+        clear_the_stack_back_to_a_table_context
+        parser.activeFormatting.add((nil, nil))
+        discard parser.insertHTMLElement(token)
+        parser.insertionMode = IN_CAPTION
+      )
+      "<colgroup>" => (block:
+        clear_the_stack_back_to_a_table_context
+        discard parser.insertHTMLElement(Token(t: START_TAG, tagtype: TAG_COLGROUP))
+        parser.insertionMode = IN_COLUMN_GROUP
+      )
+      ("<tbody>", "<tfoot>", "<thead>") => (block:
+        clear_the_stack_back_to_a_table_context
+        discard parser.insertHTMLElement(token)
+        parser.insertionMode = IN_TABLE_BODY
+      )
+      ("<td>", "<th>", "<tr>") => (block:
+        clear_the_stack_back_to_a_table_context
+        discard parser.insertHTMLElement(Token(t: START_TAG, tagtype: TAG_TBODY))
+        parser.insertionMode = IN_TABLE_BODY
+      )
+      "<table>" => (block:
+        parse_error
+        if not parser.openElements.hasElementInScope(TAG_TABLE):
+          discard
         else:
-          emit_current
-      of ';':
-        parse_error unknown_named_character_reference
-        reconsume_in tokenizer.rstate
-      else: reconsume_in tokenizer.rstate
-
-    of NUMERIC_CHARACTER_REFERENCE:
-      tokenizer.code = 0
-      case c
-      of 'x', 'X':
-        tokenizer.tmp &= c
-        switch_state HEXADECIMAL_CHARACTER_REFERENCE_START
-      else: reconsume_in DECIMAL_CHARACTER_REFERENCE_START
-
-    of HEXADECIMAL_CHARACTER_REFERENCE_START:
-      case c
-      of AsciiHexDigit: reconsume_in HEXADECIMAL_CHARACTER_REFERENCE
-      else:
-        parse_error absence_of_digits_in_numeric_character_reference
-        flush_code_points_consumed_as_a_character_reference
-        reconsume_in tokenizer.rstate
-
-    of DECIMAL_CHARACTER_REFERENCE_START:
-      case c
-      of AsciiDigit: reconsume_in DECIMAL_CHARACTER_REFERENCE
-      else:
-        parse_error absence_of_digits_in_numeric_character_reference
-        flush_code_points_consumed_as_a_character_reference
-        reconsume_in tokenizer.rstate
-
-    of HEXADECIMAL_CHARACTER_REFERENCE:
-      case c
-      of AsciiHexDigit: # note: merged digit, upper hex, lower hex
-        tokenizer.code *= 0x10
-        tokenizer.code += hexValue(c)
-      of ';': switch_state NUMERIC_CHARACTER_REFERENCE_END
-      else:
-        parse_error missing_semicolon_after_character_reference
-        reconsume_in NUMERIC_CHARACTER_REFERENCE_END
-
-    of DECIMAL_CHARACTER_REFERENCE:
-      case c
-      of AsciiDigit:
-        tokenizer.code *= 10
-        tokenizer.code += decValue(c)
-      of ';': switch_state NUMERIC_CHARACTER_REFERENCE_END
+          while parser.openElements.pop().tagType != TAG_TABLE: discard
+          parser.resetInsertionMode()
+          reprocess token
+      )
+      "</table>" => (block:
+        if not parser.openElements.hasElementInScope(TAG_TABLE):
+          parse_error
+        else:
+          while parser.openElements.pop().tagType != TAG_TABLE: discard
+          parser.resetInsertionMode()
+      )
+      ("</body>", "</caption>", "</col>", "</colgroup>", "</html>", "</tbody>",
+       "</td>", "</tfoot>", "</th>", "</thead>", "</tr>") => (block:
+        parse_error
+      )
+      ("<style>", "<script>", "<template>", "</template>") => (block:
+        parser.processInHTMLContent(token, IN_HEAD)
+      )
+      "<input>" => (block:
+        if not token.attrs.getOrDefault("type").equalsIgnoreCase("hidden"):
+          # anything else
+          parse_error
+          parser.fosterParenting = true
+          parser.processInHTMLContent(token, IN_BODY)
+          parser.fosterParenting = false
+        else:
+          parse_error
+          discard parser.insertHTMLElement(token)
+          pop_current_node
+      )
+      "<form>" => (block:
+        parse_error
+        if parser.form != nil or parser.openElements.hasElement(TAG_TEMPLATE):
+          discard
+        else:
+          parser.form = HTMLFormElement(parser.insertHTMLElement(token))
+          pop_current_node
+      )
+      TokenType.EOF => (block:
+        parser.processInHTMLContent(token, IN_BODY)
+      )
+      _ => (block:
+        parse_error
+        parser.fosterParenting = true
+        parser.processInHTMLContent(token, IN_BODY)
+        parser.fosterParenting = false
+      )
+
+  of IN_TABLE_TEXT:
+    match token:
+      '\0' => (block: parse_error)
+      TokenType.CHARACTER_ASCII => (block:
+        if token.c notin AsciiWhitespace:
+          parser.pendingTableCharsWhitespace = false
+        parser.pendingTableChars &= token.c
+      )
+      TokenType.CHARACTER => (block:
+        parser.pendingTableChars &= token.r
+        parser.pendingTableCharsWhitespace = false
+      )
+      _ => (block:
+        if not parser.pendingTableCharsWhitespace:
+          # I *think* this is effectively the same thing the specification wants...
+          parse_error
+          parser.fosterParenting = true
+          parser.reconstructActiveFormatting()
+          parser.insertCharacter(token.c)
+          parser.framesetOk = false
+          parser.fosterParenting = false
+        else:
+          parser.insertCharacter(parser.pendingTableChars)
+        parser.insertionMode = parser.oldInsertionMode
+        reprocess token
+      )
+
+  of IN_CAPTION:
+    match token:
+      "</caption>" => (block:
+        if parser.openElements.hasElementInTableScope(TAG_CAPTION):
+          parse_error
+        else:
+          parser.generateImpliedEndTags()
+          if parser.currentNode.tagType != TAG_CAPTION: parse_error
+          while parser.openElements.pop().tagType != TAG_CAPTION: discard
+          parser.clearActiveFormattingTillMarker()
+          parser.insertionMode = IN_TABLE
+      )
+      ("<caption>", "<col>", "<colgroup>", "<tbody>", "<td>", "<tfoot>",
+       "<th>", "<thead>", "<tr>", "</table>") => (block:
+        if not parser.openElements.hasElementInTableScope(TAG_CAPTION):
+          parse_error
+        else:
+          parser.generateImpliedEndTags()
+          if parser.currentNode.tagType != TAG_CAPTION: parse_error
+          parser.clearActiveFormattingTillMarker()
+          parser.insertionMode = IN_TABLE
+          reprocess token
+      )
+      ("</body>", "</col>", "</colgroup>", "</html>", "</tbody>", "</td>",
+       "</tfoot>", "</th>", "</thead>", "</tr>") => (block: parse_error)
+      _ => (block: parser.processInHTMLContent(token, IN_BODY))
+
+  of IN_COLUMN_GROUP:
+    match token:
+      AsciiWhitespace => (block: parser.insertCharacter(token.c))
+      TokenType.COMMENT => (block: parser.insertComment(token))
+      TokenType.DOCTYPE => (block: parse_error)
+      "<html>" => (block: parser.processInHTMLContent(token, IN_BODY))
+      "<col>" => (block:
+        discard parser.insertHTMLElement(token)
+        pop_current_node
+      )
+      "</colgroup>" => (block:
+        if parser.currentNode.tagType != TAG_COLGROUP:
+          parse_error
+        else:
+          pop_current_node
+          parser.insertionMode = IN_TABLE
+      )
+      "</col>" => (block: parse_error)
+      ("<template>", "</template>") => (block:
+        parser.processInHTMLContent(token, IN_HEAD)
+      )
+      TokenType.EOF => (block: parser.processInHTMLContent(token, IN_BODY))
+      _ => (block:
+        if parser.currentNode.tagType != TAG_COLGROUP:
+          parse_error
+        else:
+          pop_current_node
+          parser.insertionMode = IN_TABLE
+          reprocess token
+      )
+
+  of IN_TABLE_BODY:
+    template clear_the_stack_back_to_a_table_body_context() =
+      while parser.currentNode.tagType notin {TAG_TBODY, TAG_TFOOT, TAG_THEAD, TAG_TEMPLATE, TAG_HTML}:
+        pop_current_node
+
+    match token:
+      "<tr>" => (block:
+        clear_the_stack_back_to_a_table_body_context
+        discard parser.insertHTMLElement(token)
+        parser.insertionMode = IN_ROW
+      )
+      ("<th>", "<td>") => (block:
+        parse_error
+        clear_the_stack_back_to_a_table_body_context
+        discard parser.insertHTMLElement(Token(t: START_TAG, tagtype: TAG_TR))
+        parser.insertionMode = IN_ROW
+        reprocess token
+      )
+      ("</tbody>", "</tfoot>", "</thead>") => (block:
+        if not parser.openElements.hasElementInTableScope(token.tagtype):
+          parse_error
+        else:
+          clear_the_stack_back_to_a_table_body_context
+          pop_current_node
+          parser.insertionMode = IN_TABLE
+      )
+      ("<caption>", "<col>", "<colgroup>", "<tbody>", "<tfoot>", "<thead>",
+       "</table>") => (block:
+        if not parser.openElements.hasElementInTableScope({TAG_TBODY, TAG_THEAD, TAG_TFOOT}):
+          parse_error
+        else:
+          clear_the_stack_back_to_a_table_body_context
+          pop_current_node
+          parser.insertionMode = IN_TABLE
+          reprocess token
+      )
+      ("</body>", "</caption>", "</col>", "</colgroup>", "</html>", "</td>",
+       "</th>", "</tr>") => (block:
+        parse_error
+      )
+      _ => (block: parser.processInHTMLContent(token, IN_TABLE))
+
+  of IN_ROW:
+    template clear_the_stack_back_to_a_table_row_context() =
+      while parser.currentNode.tagType notin {TAG_TR, TAG_TEMPLATE, TAG_HTML}:
+        pop_current_node
+
+    match token:
+      ("<th>", "<td>") => (block:
+        clear_the_stack_back_to_a_table_row_context
+        discard parser.insertHTMLElement(token)
+        parser.insertionMode = IN_CELL
+        parser.activeFormatting.add((nil, nil))
+      )
+      "</tr>" => (block:
+        if not parser.openElements.hasElementInTableScope(TAG_TR):
+          parse_error
+        else:
+          clear_the_stack_back_to_a_table_row_context
+          pop_current_node
+          parser.insertionMode = IN_TABLE_BODY
+      )
+      ("<caption>", "<col>", "<colgroup>", "<tbody>", "<tfoot>", "<thead>",
+       "<tr>", "</table>") => (block:
+        if not parser.openElements.hasElementInTableScope(TAG_TR):
+          parse_error
+        else:
+          clear_the_stack_back_to_a_table_row_context
+          pop_current_node
+          parser.insertionMode = IN_TABLE_BODY
+          reprocess token
+      )
+      ("</tbody>", "</tfoot>", "</thead>") => (block:
+        if not parser.openElements.hasElementInTableScope(token.tagtype):
+          parse_error
+        elif not parser.openElements.hasElementInTableScope(TAG_TR):
+          discard
+        else:
+          clear_the_stack_back_to_a_table_row_context
+          pop_current_node
+          parser.insertionMode = IN_BODY
+          reprocess token
+      )
+      ("</body>", "</caption>", "</col>", "</colgroup>", "</html>", "</td>",
+       "</th>") => (block: parse_error)
+      _ => (block: parser.processInHTMLContent(token, IN_TABLE))
+
+  of IN_CELL:
+    template close_cell() =
+      parser.generateImpliedEndTags()
+      if parser.currentNode.tagType notin {TAG_TD, TAG_TH}: parse_error
+      while parser.openElements.pop().tagType notin {TAG_TD, TAG_TH}: discard
+      parser.clearActiveFormattingTillMarker()
+      parser.insertionMode = IN_ROW
+
+    match token:
+      ("</td>", "</th>") => (block:
+        if not parser.openElements.hasElementInTableScope(token.tagtype):
+          parse_error
+        else:
+          parser.generateImpliedEndTags()
+          if parser.currentNode.tagType != token.tagtype: parse_error
+          while parser.openElements.pop().tagType != token.tagtype: discard
+          parser.clearActiveFormattingTillMarker()
+          parser.insertionMode = IN_ROW
+      )
+      ("<caption>", "<col>", "<colgroup>", "<tbody>", "<td>", "<tfoot>",
+       "<thead>", "<tr>") => (block:
+        if not parser.openElements.hasElementInTableScope({TAG_TD, TAG_TH}):
+          parse_error
+        else:
+          close_cell
+      )
+      ("</body>", "</caption>", "</col>", "</colgroup>",
+       "</html>") => (block: parse_error)
+      ("</table>", "</tbody>", "</tfoot>", "</thead>", "</tr>") => (block:
+        if not parser.openElements.hasElementInTableScope(token.tagtype):
+          parse_error
+        else:
+          close_cell
+          reprocess token
+      )
+      _ => (block: parser.processInHTMLContent(token, IN_BODY))
+
+  of IN_SELECT:
+    match token:
+      '\0' => (block: parse_error)
+      TokenType.CHARACTER_ASCII => (block: parser.insertCharacter(token.c))
+      TokenType.CHARACTER => (block: parser.insertCharacter(token.r))
+      TokenType.DOCTYPE => (block: parse_error)
+      "<html>" => (block: parser.processInHTMLContent(token, IN_BODY))
+      "<option>" => (block:
+        if parser.currentNode.tagType == TAG_OPTION:
+          pop_current_node
+        discard parser.insertHTMLElement(token)
+      )
+      "<optgroup>" => (block:
+        if parser.currentNode.tagType == TAG_OPTION:
+          pop_current_node
+        if parser.currentNode.tagType == TAG_OPTGROUP:
+          pop_current_node
+        discard parser.insertHTMLElement(token)
+      )
+      "</optgroup>" => (block:
+        if parser.currentNode.tagType == TAG_OPTION:
+          if parser.openElements.len > 1 and parser.openElements[^2].tagType == TAG_OPTGROUP:
+            pop_current_node
+        if parser.currentNode.tagType == TAG_OPTGROUP:
+          pop_current_node
+        else:
+          parse_error
+      )
+      "</option>" => (block:
+        if parser.currentNode.tagType == TAG_OPTION:
+          pop_current_node
+        else:
+          parse_error
+      )
+      "</select>" => (block:
+        if not parser.openElements.hasElementInSelectScope(TAG_SELECT):
+          parse_error
+        else:
+          while parser.openElements.pop().tagType != TAG_SELECT: discard
+          parser.resetInsertionMode()
+      )
+      ("<input>", "<keygen>", "<textarea>") => (block:
+        parse_error
+        if not parser.openElements.hasElementInSelectScope(TAG_SELECT):
+          discard
+        else:
+          while parser.openElements.pop().tagType != TAG_SELECT: discard
+          parser.resetInsertionMode()
+          reprocess token
+      )
+      ("<script>", "<template>", "</template>") => (block: parser.processInHTMLContent(token, IN_HEAD))
+      TokenType.EOF => (block: parser.processInHTMLContent(token, IN_BODY))
+      _ => (block: parse_error)
+
+  of IN_SELECT_IN_TABLE:
+    match token:
+      ("<caption>", "<table>", "<tbody>", "<tfoot>", "<thead>", "<tr>", "<td>",
+       "<th>") => (block:
+        parse_error
+        while parser.openElements.pop().tagType != TAG_SELECT: discard
+        parser.resetInsertionMode()
+        reprocess token
+      )
+      ("</caption>", "</table>", "</tbody>", "</tfoot>", "</thead>", "</tr>",
+       "</td>", "</th>") => (block:
+        parse_error
+        if not parser.openElements.hasElementInTableScope(token.tagtype):
+          discard
+        else:
+          while parser.openElements.pop().tagType != TAG_SELECT: discard
+          parser.resetInsertionMode()
+          reprocess token
+      )
+      _ => (block: parser.processInHTMLContent(token, IN_SELECT))
+
+  of IN_TEMPLATE:
+    match token:
+      (TokenType.CHARACTER_ASCII, TokenType.CHARACTER, TokenType.DOCTYPE) => (block:
+        parser.processInHTMLContent(token, IN_BODY)
+      )
+      ("<base>", "<basefont>", "<bgsound>", "<link>", "<meta>", "<noframes>",
+       "<script>", "<style>", "<template>", "<title>", "</template>") => (block:
+        parser.processInHTMLContent(token, IN_HEAD)
+      )
+      ("<caption>", "<colgroup>", "<tbody>", "<tfoot>", "<thead>") => (block:
+        discard parser.templateModes.pop()
+        parser.templateModes.add(IN_TABLE)
+        parser.insertionMode = IN_TABLE
+        reprocess token
+      )
+      "<col>" => (block:
+        discard parser.templateModes.pop()
+        parser.templateModes.add(IN_COLUMN_GROUP)
+        parser.insertionMode = IN_COLUMN_GROUP
+        reprocess token
+      )
+      "<tr>" => (block:
+        discard parser.templateModes.pop()
+        parser.templateModes.add(IN_TABLE_BODY)
+        parser.insertionMode = IN_TABLE_BODY
+        reprocess token
+      )
+      ("<td>", "<th>") => (block:
+        discard parser.templateModes.pop()
+        parser.templateModes.add(IN_ROW)
+        parser.insertionMode = IN_ROW
+        reprocess token
+      )
+      TokenType.START_TAG => (block:
+        discard parser.templateModes.pop()
+        parser.templateModes.add(IN_BODY)
+        parser.insertionMode = IN_BODY
+        reprocess token
+      )
+      TokenType.END_TAG => (block: parse_error)
+      TokenType.EOF => (block:
+        if not parser.openElements.hasElement(TAG_TEMPLATE):
+          discard # stop
+        else:
+          parse_error
+          while parser.openElements.pop().tagType != TAG_TEMPLATE: discard
+          parser.clearActiveFormattingTillMarker()
+          discard parser.templateModes.pop()
+          parser.resetInsertionMode()
+          reprocess token
+      )
+
+  of AFTER_BODY:
+    match token:
+      AsciiWhitespace => (block: parser.processInHTMLContent(token, IN_BODY))
+      TokenType.COMMENT => (block: parser.insertComment(token, last_child_of(parser.openElements[0])))
+      TokenType.DOCTYPE => (block: parse_error)
+      "<html>" => (block: parser.processInHTMLContent(token, IN_BODY))
+      "</html>" => (block:
+        if parser.fragment:
+          parse_error
+        else:
+          parser.insertionMode = AFTER_AFTER_BODY
+      )
+      TokenType.EOF => (block: discard) # stop
+      _ => (block:
+        parse_error
+        parser.insertionMode = IN_BODY
+        reprocess token
+      )
+
+  of IN_FRAMESET:
+    match token:
+      AsciiWhitespace => (block: parser.insertCharacter(token.c))
+      TokenType.COMMENT => (block: parser.insertComment(token))
+      TokenType.DOCTYPE => (block: parse_error)
+      "<html>" => (block: parser.processInHTMLContent(token, IN_BODY))
+      "<frameset>" => (block:
+        if parser.currentNode == parser.document.html:
+          parse_error
+        else:
+          pop_current_node
+        if not parser.fragment and parser.currentNode.tagType != TAG_FRAMESET:
+          parser.insertionMode = AFTER_FRAMESET
+      )
+      "<frame>" => (block:
+        discard parser.insertHTMLElement(token)
+        pop_current_node
+      )
+      "<noframes>" => (block: parser.processInHTMLContent(token, IN_HEAD))
+      TokenType.EOF => (block:
+        if parser.currentNode != parser.document.html: parse_error
+        # stop
+      )
+      _ => (block: parse_error)
+
+  of AFTER_FRAMESET:
+    match token:
+      AsciiWhitespace => (block: parser.insertCharacter(token.c))
+      TokenType.COMMENT => (block: parser.insertComment(token))
+      TokenType.DOCTYPE => (block: parse_error)
+      "<html>" => (block: parser.processInHTMLContent(token, IN_BODY))
+      "</html>" => (block: parser.insertionMode = AFTER_AFTER_FRAMESET)
+      "<noframes>" => (block: parser.processInHTMLContent(token, IN_HEAD))
+      TokenType.EOF => (block: discard) # stop
+      _ => (block: parse_error)
+
+  of AFTER_AFTER_BODY:
+    match token:
+      TokenType.COMMENT => (block: parser.insertComment(token, last_child_of(parser.document)))
+      (TokenType.DOCTYPE, AsciiWhitespace, "<html>") => (block: parser.processInHTMLContent(token, IN_BODY))
+      TokenType.EOF => (block: discard) # stop
+      _ => (block:
+        parse_error
+        parser.insertionMode = IN_BODY
+        reprocess token
+      )
+
+  of AFTER_AFTER_FRAMESET:
+    match token:
+      TokenType.COMMENT => (block: parser.insertComment(token, last_child_of(parser.document)))
+      (TokenType.DOCTYPE, AsciiWhitespace, "<html>") => (block: parser.processInHTMLContent(token, IN_BODY))
+      TokenType.EOF => (block: discard) # stop
+      "<noframes>" => (block: parser.processInHTMLContent(token, IN_HEAD))
+      _ => (block: parse_error)
+
+proc processInForeignContent(parser: var HTML5Parser, token: Token) =
+  macro `=>`(v: typed, body: untyped): untyped =
+    quote do:
+      discard (`v`, proc() = `body`)
+  template script_end_tag() =
+    pop_current_node
+    #TODO document.write (?)
+    #TODO SVG
+  template any_other_end_tag() =
+    if parser.currentNode.localName != token.tagname: parse_error
+    for i in countdown(parser.openElements.high, 1):
+      let node = parser.openElements[i]
+      if node.localName == token.tagname:
+        while parser.openElements.pop() != node: discard
+        break
+      if node.namespace == Namespace.HTML: break
+      parser.processInHTMLContent(token)
+
+
+  match token:
+    '\0' => (block:
+      parse_error
+      parser.insertCharacter(Rune(0xFFFD))
+    )
+    AsciiWhitespace => (block: parser.insertCharacter(token.c))
+    TokenType.CHARACTER_ASCII => (block: parser.insertCharacter(token.c))
+    TokenType.CHARACTER => (block: parser.insertCharacter(token.r))
+    TokenType.DOCTYPE => (block: parse_error)
+    ("<b>", "<big>", "<blockquote>", "<body>", "<br>", "<center>", "<code>",
+     "<dd>", "<div>", "<dl>", "<dt>", "<em>", "<embed>", "<h1>", "<h2>", "<h3>",
+     "<h4>", "<h5>", "<h6>", "<head>", "<hr>", "<i>", "<img>", "<li>",
+     "<listing>", "<menu>", "<meta>", "<nobr>", "<ol>", "<p>", "<pre>",
+     "<ruby>", "<s>", "<small>", "<span>", "<strong>", "<strike>", "<sub>",
+     "<sup>", "<table>", "<tt>", "<u>", "<ul>", "<var>") => (block:
+      parse_error
+      #NOTE MathML not implemented
+      while not (parser.currentNode.isHTMLIntegrationPoint() or parser.currentNode.inHTMLNamespace()):
+        pop_current_node
+      parser.processInHTMLContent(token)
+    )
+    TokenType.START_TAG => (block:
+      #NOTE MathML not implemented
+      #TODO SVG
+      #TODO adjust foreign attributes
+      let element = parser.insertForeignElement(token, $parser.adjustedCurrentNode.namespace)
+      if token.selfclosing and element.inSVGNamespace():
+        script_end_tag
       else:
-        parse_error missing_semicolon_after_character_reference
-        reconsume_in NUMERIC_CHARACTER_REFERENCE_END
-
-    of NUMERIC_CHARACTER_REFERENCE_END:
-      ignore_eof # we reconsume anyway
-      case tokenizer.code
-      of 0x00:
-        parse_error null_character_reference
-        tokenizer.code = 0xFFFD
-      elif tokenizer.code > 0x10FFFF:
-        parse_error character_reference_outside_unicode_range
-        tokenizer.code = 0xFFFD
-      elif Rune(tokenizer.code).isSurrogate():
-        parse_error surrogate_character_reference
-        tokenizer.code = 0xFFFD
-      elif Rune(tokenizer.code).isNonCharacter():
-        parse_error noncharacter_character_reference
-        # do nothing
-      elif tokenizer.code in 0..255 and char(tokenizer.code) in ((Controls - AsciiWhitespace) + {chr(0x0D)}):
-        const ControlMapTable = [
-          (0x80, 0x20AC), (0x82, 0x201A), (0x83, 0x0192), (0x84, 0x201E),
-          (0x85, 0x2026), (0x86, 0x2020), (0x87, 0x2021), (0x88, 0x02C6),
-          (0x89, 0x2030), (0x8A, 0x0160), (0x8B, 0x2039), (0x8C, 0x0152),
-          (0x8E, 0x017D), (0x91, 0x2018), (0x92, 0x2019), (0x93, 0x201C),
-          (0x94, 0x201D), (0x95, 0x2022), (0x96, 0x2013), (0x97, 0x2014),
-          (0x98, 0x02DC), (0x99, 0x2122), (0x9A, 0x0161), (0x9B, 0x203A),
-          (0x9C, 0x0153), (0x9E, 0x017E), (0x9F, 0x0178),
-        ].toTable()
-        if ControlMapTable.hasKey(tokenizer.code):
-          tokenizer.code = ControlMapTable[tokenizer.code]
-      tokenizer.tmp = $Rune(tokenizer.code)
-      flush_code_points_consumed_as_a_character_reference #TODO optimize so we flush directly
-      reconsume_in tokenizer.rstate # we unnecessarily consumed once so reconsume
-
-func inputSize*(str: string): int =
-  if str.len == 0:
-    return 20
-  for c in str:
-    if not c.isDigit:
-      return 20
-  return str.parseInt()
-
-#w3m's getescapecmd and parse_tag, transpiled to nim and heavily modified.
-#(C) Copyright 1994-2002 by Akinori Ito
-#(C) Copyright 2002-2011 by Akinori Ito, Hironori Sakamoto, Fumitoshi Ukai
-#
-#Use, modification and redistribution of this software is hereby granted,
-#provided that this entire copyright notice is included on any copies of
-#this software and applications and derivations thereof.
-#
-#This software is provided on an "as is" basis, without warranty of any
-#kind, either expressed or implied, as to any matter including, but not
-#limited to warranty of fitness of purpose, or merchantability, or
-#results obtained from use of this software.
-proc getescapecmd(buf: string, at: var int): string =
-  var i = at
-
-  if buf[i] == '#': #num
-    inc i
-    var num: int
-    if buf[i].tolower() == 'x': #hex
-      inc i
-      if not isdigit(buf[i]):
-        at = i
-        return "&"
-
-      num = hexValue(buf[i])
-      inc i
-      while i < buf.len and hexValue(buf[i]) != -1:
-        num *= 0x10
-        num += hexValue(buf[i])
-        inc i
-    else: #dec
-      if not isDigit(buf[i]):
-        at = i
-        return "&"
-
-      num = decValue(buf[i])
-      inc i
-      while i < buf.len and isDigit(buf[i]):
-        num *= 10
-        num += decValue(buf[i])
-        inc i
-
-    if buf[i] == ';':
-      inc i
-    at = i
-    return $(Rune(num))
-  elif not isAlphaAscii(buf[i]):
-    return "&"
-
-  var n = entityMap
-  var s = ""
-  while true:
-    s &= buf[i]
-    if not n.hasPrefix(s):
-      break
-    let pn = n
-    n = n{s}
-    if n != pn:
-      s = ""
-    inc i
-
-  if n.value.issome:
-    at = i
-    return n.value.get
-
-  return "&"
-
-type
-  DOMParsedTag = object
-    tagid: TagType
-    attrs: Table[string, string]
-    open: bool
-
-proc parse_tag(buf: string, at: var int): DOMParsedTag =
-  var tag = DOMParsedTag()
-  tag.open = true
-
-  #Parse tag name
-  var tagname = ""
-  inc at
-  if buf[at] == '/':
-    inc at
-    tag.open = false
-    at = skipBlanks(buf, at)
-
-  while at < buf.len and not buf[at].isWhitespace() and not (tag.open and buf[at] == '/') and buf[at] != '>' and buf[at].isAscii():
-    tagname &= buf[at].tolower()
-    inc at
-
-  tag.tagid = tagType(tagname)
-  at = skipBlanks(buf, at)
-
-  while at < buf.len and buf[at] != '>':
-    var value = ""
-    var attrname = ""
-    while at < buf.len and buf[at] != '=' and not buf[at].isWhitespace() and buf[at] != '>':
-      var r: Rune
-      fastRuneAt(buf, at, r)
-      if r.isAscii():
-        attrname &= char(r).tolower()
+        pop_current_node
+    )
+    "</script>" => (block:
+      if parser.currentNode.namespace == Namespace.SVG and parser.currentNode.localName == "script": #TODO SVG
+        script_end_tag
       else:
-        attrname &= r
-
-    at = skipBlanks(buf, at)
-    if at < buf.len and buf[at] == '=':
-      inc at
-      at = skipBlanks(buf, at)
-      if at < buf.len and (buf[at] == '"' or buf[at] == '\''):
-        let startc = buf[at]
-        inc at
-        while at < buf.len and buf[at] != startc:
-          if buf[at] == '&':
-            inc at
-            value &= getescapecmd(buf, at)
-          else:
-            value &= buf[at]
-            inc at
-        if at < buf.len:
-          inc at
-      elif at < buf.len:
-        while at < buf.len and not buf[at].isWhitespace() and buf[at] != '>':
-          var r: Rune
-          fastRuneAt(buf, at, r)
-          value &= $r
-
-    if attrname.len > 0:
-      tag.attrs[attrname] = value
-
-  while at < buf.len and buf[at] != '>':
-    inc at
-
-  if at < buf.len and buf[at] == '>':
-    inc at
-  return tag
-
-proc insertNode(parent, node: Node) =
-  parent.childNodes.add(node)
-
-  if parent.childNodes.len > 1:
-    let prevSibling = parent.childNodes[^2]
-    prevSibling.nextSibling = node
-    node.previousSibling = prevSibling
-
-  node.parentNode = parent
-  if parent.nodeType == ELEMENT_NODE:
-    node.parentElement = Element(parent)
-
-  if parent.ownerDocument != nil:
-    node.ownerDocument = parent.ownerDocument
-  elif parent.nodeType == DOCUMENT_NODE:
-    node.ownerDocument = Document(parent)
-
-  if node.nodeType == ELEMENT_NODE:
-    parent.children.add(Element(node))
-
-    let element = (Element(node))
-    if element.ownerDocument != nil:
-      element.ownerDocument.type_elements[element.tagType].add(element)
-      if element.id != "":
-        if not (element.id in element.ownerDocument.id_elements):
-          element.ownerDocument.id_elements[element.id] = newSeq[Element]()
-        element.ownerDocument.id_elements[element.id].add(element)
-
-      for c in element.classList:
-        if not (c in element.ownerDocument.class_elements):
-          element.ownerDocument.class_elements[c] = newSeq[Element]()
-        element.ownerDocument.class_elements[c].add(element)
-
-proc processDocumentBody(state: var HTMLParseState) =
-  if not state.in_body:
-    state.in_body = true
-    if state.elementNode.ownerDocument != nil:
-      state.elementNode = state.elementNode.ownerDocument.body
-
-#TODO this adds text nodes to head
-proc processDocumentAddNode(state: var HTMLParseState, newNode: Node) =
-  if state.elementNode.tagType == TAG_HTML:
-    if state.in_body:
-      state.elementNode = state.elementNode.ownerDocument.body
+        any_other_end_tag
+    )
+    TokenType.END_TAG => (block: any_other_end_tag)
+
+proc constructTree(parser: var HTML5Parser): Document =
+  for token in parser.tokenizer.tokenize:
+    if parser.ignoreLF:
+      parser.ignoreLF = false
+      if token.t == CHARACTER_ASCII and token.c == '\n':
+        continue
+    if parser.openElements.len == 0 or
+       parser.adjustedCurrentNode.inHTMLNamespace() or
+       parser.adjustedCurrentNode.isHTMLIntegrationPoint() and token.t in {START_TAG, CHARACTER, CHARACTER_ASCII} or
+       token.t == EOF:
+      #NOTE MathML not implemented
+      parser.processInHTMLContent(token)
     else:
-      state.elementNode = state.elementNode.ownerDocument.head
+      #TODO disabled path because I'm pretty sure it'd just break things
+      #parser.processInForeignContent(token)
+      pop_current_node
 
-  insertNode(state.elementNode, newNode)
+  #TODO document.write (?)
+  #TODO etc etc...
 
-proc processDocumentEndNode(state: var HTMLParseState) =
-  if state.elementNode == nil or state.elementNode.nodeType == DOCUMENT_NODE:
-    return
-  state.elementNode = state.elementNode.parentElement
-
-proc processDocumentText(state: var HTMLParseState) =
-  if state.textNode == nil:
-    state.textNode = newText()
-    processDocumentAddNode(state, state.textNode)
-
-proc processDocumentStartElement(state: var HTMLParseState, element: Element, tag: DOMParsedTag) =
-  var add = true
-
-  for k, v in tag.attrs:
-    element.attributes[k] = v
-  
-  element.id = element.attr("id")
-  if element.attributes.hasKey("class"):
-    for w in unicode.split(element.attributes["class"], Rune(' ')):
-      element.classList.add(w)
-
-  case element.tagType
-  of TAG_SCRIPT:
-    state.in_script = true
-  of TAG_NOSCRIPT:
-    state.in_noscript = true
-  of TAG_STYLE:
-    state.in_style = true
-  of TAG_SELECT:
-    HTMLSelectElement(element).name = element.attr("name")
-    HTMLSelectElement(element).value = element.attr("value")
-  of TAG_INPUT:
-    let element = HTMLInputElement(element)
-    element.value = element.attr("value")
-    element.inputType = element.attr("type").inputType()
-    element.size = element.attr("size").inputSize()
-    element.checked = element.attrb("checked")
-    if state.formowners.len > 0:
-      element.form = state.formowners[^1]
-      element.form.inputs.add(element)
-  of TAG_A:
-    HTMLAnchorElement(element).href = element.attr("href")
-  of TAG_OPTION:
-    HTMLOptionElement(element).value = element.attr("href")
-  of TAG_OL:
-    HTMLOListElement(element).start = element.attri("start") 
-    HTMLOListElement(element).ordinalcounter = HTMLOListElement(element).start.get(1)
-  of TAG_LI:
-    HTMLLIElement(element).value = element.attri("value")
-  of TAG_HTML:
-    add = false
-  of TAG_HEAD:
-    add = false
-    state.in_body = false
-    if state.elementNode.ownerDocument != nil:
-      state.elementNode = state.elementNode.ownerDocument.head
-  of TAG_BODY:
-    add = false
-  of TAG_PRE:
-    state.skip_lf = true
-  of TAG_H1:
-    HTMLHeadingElement(element).rank = 1
-  of TAG_H2:
-    HTMLHeadingElement(element).rank = 2
-  of TAG_H3:
-    HTMLHeadingElement(element).rank = 3
-  of TAG_H4:
-    HTMLHeadingElement(element).rank = 4
-  of TAG_H5:
-    HTMLHeadingElement(element).rank = 5
-  of TAG_H6:
-    HTMLHeadingElement(element).rank = 6
-  of TAG_LINK:
-    HTMLLinkElement(element).href = element.attr("href")
-    HTMLLinkElement(element).rel = element.attr("rel")
-  of TAG_FORM:
-    let element = HTMLFormElement(element)
-    element.name = element.attr("name")
-    element.smethod = element.attr("method")
-    element.enctype = element.attr("enctype")
-    element.target = element.attr("target")
-    element.novalidate = element.attrb("novalidate")
-    state.formowners.add(element)
-  else: discard
-
-  if not state.in_body and not (element.tagType in HeadTagTypes):
-    processDocumentBody(state)
-
-  if state.elementNode.nodeType == ELEMENT_NODE:
-    if element.tagType in SelfClosingTagTypes:
-      if state.elementNode.tagType == element.tagType:
-        processDocumentEndNode(state)
-
-    if state.elementNode.tagType == TAG_P and element.tagType in PClosingTagTypes:
-      processDocumentEndNode(state)
-
-  if add:
-    processDocumentAddNode(state, element)
-    state.elementNode = element
-
-    case element.tagType
-    of VoidTagTypes:
-      processDocumentEndNode(state)
-    of TAG_LI:
-      HTMLLIElement(element).applyOrdinal() #needs to know parent
-    else: discard
+  return parser.document
 
-proc processDocumentEndElement(state: var HTMLParseState, tag: DOMParsedTag) =
-  if tag.tagid != state.elementNode.tagType:
-    if state.elementNode.tagType in SelfClosingTagTypes:
-      processDocumentEndNode(state)
-      processDocumentEndNode(state)
-  else:
-    case tag.tagid
-    of VoidTagTypes:
-      return
-    of TAG_HEAD:
-      processDocumentBody(state)
-      return
-    of TAG_BODY:
-      return
-    of TAG_FORM:
-      if state.formowners.len > 0:
-        discard state.formowners.pop()
-    of TAG_STYLE:
-      let style = HTMLStyleElement(state.elementNode)
-      var str = ""
-      for child in style.textNodes:
-        str &= child.data
-      let sheet = newStringStream(str).parseStylesheet()
-      style.parentElement.sheets.add(sheet)
-    else: discard
-    processDocumentEndNode(state)
-
-proc processDocumentTag(state: var HTMLParseState, tag: DOMParsedTag) =
-  if state.in_script:
-    if not tag.open and tag.tagid == TAG_SCRIPT:
-      state.in_script = false
-    else:
-      return
-
-  if state.in_style:
-    if not tag.open and tag.tagid == TAG_STYLE:
-      state.in_style = false
-    else:
-      return
-
-  if not tag.open and state.in_noscript:
-    if tag.tagid == TAG_NOSCRIPT:
-      state.in_noscript = false
-    else:
-      return
-
-  if tag.open:
-    processDocumentStartElement(state, state.document.newHtmlElement(tag.tagid), tag)
-  else:
-    processDocumentEndElement(state, tag)
-
-proc processDocumentPart(state: var HTMLParseState, buf: string) =
-  var at = 0
-  var max = 0
-  var was_script = false
-
-  max = buf.len
-
-  template process_char(c: char) =
-    if state.in_comment:
-      state.commentNode.data &= c
-    else:
-      if not c.isWhitespace() and state.elementNode.tagType == TAG_HTML:
-        state.textNode = nil
-        processDocumentBody(state)
-        processDocumentText(state)
-      if not (state.skip_lf and c == '\n'):
-        processDocumentText(state)
-        state.textNode.data &= c
-      state.skip_lf = false
-
-  template process_text(s: string) =
-    if state.in_comment:
-      state.commentNode.data &= s
-    else:
-      if not (state.skip_lf and s[0] == '\n'):
-        processDocumentText(state)
-        state.textNode.data &= s
-      state.skip_lf = false
-
-  template has(buf: string, s: string): bool =
-    (at + s.len < buf.len and buf.substr(at, at + 8) == "</script>")
-
-  while at < max:
-    case buf[at]
-    of '&':
-      inc at
-      let p = getescapecmd(buf, at)
-      process_text(p)
-    of '<':
-      if state.in_comment:
-        state.commentNode.data &= buf[at]
-        inc at
-      else:
-        var p = at
-        inc p
-        if p < max and buf[p] == '!':
-          inc p
-          if p < max and buf[p] == '-':
-            inc p
-            if p < max and buf[p] == '-':
-              inc p
-              at = p
-              state.in_comment = true
-              let comment = newComment()
-              state.commentNode = comment
-              processDocumentAddNode(state, comment)
-              state.textNode = nil
-          else:
-            #TODO for doctype
-            while p < max and buf[p] != '>':
-              inc p
-            at = p + 1
-            continue
-
-        if not state.in_comment:
-          state.textNode = nil
-          p = at
-          if state.in_script:
-            if buf.has("</script>"):
-              var tag = parse_tag(buf, at)
-              processDocumentTag(state, tag)
-            else:
-              process_char(buf[at])
-              inc at
-          else:
-            var tag = parse_tag(buf, at)
-            processDocumentTag(state, tag)
-    elif buf[at] == '-' and state.in_comment:
-      var p = at
-      inc p
-      if p < max and buf[p] == '-':
-        inc p
-        if p < max and buf[p] == '>':
-          inc p
-          at = p
-          state.commentNode = nil
-          state.in_comment = false
-
-      if state.in_comment:
-        state.commentNode.data &= buf[at]
-        inc at
-    else:
-      process_char(buf[at])
-      inc at
-
-proc parseHtml5(inputStream: Stream, savesource: bool, source: var string): Document =
-  #TODO implement HTML5 parsing
-  var tokenizer = inputStream.newTokenizer()
-  for tok in tokenizer.tokenize:
-    eprint tok
-
-proc parseHtml(inputStream: Stream, savesource: bool, source: var string): Document =
-  let document = newDocument()
-  insertNode(document, document.root)
-  insertNode(document.root, document.head)
-  insertNode(document.root, document.body)
-
-  var state = HTMLParseState()
-  state.document = document
-  state.elementNode = document.root
-
-  var till_when = false
-
-  var buf = ""
-  var lineBuf: string
-  while not inputStream.atEnd():
-    lineBuf = inputStream.readLine() & '\n'
-    if savesource:
-      source &= lineBuf
-    buf &= lineBuf
-
-    var at = 0
-    while at < lineBuf.len:
-      case lineBuf[at]
-      of '<':
-        till_when = true
-      of '>':
-        till_when = false
-      else: discard
-      inc at
-
-    if till_when:
-      continue
-
-    processDocumentPart(state, buf)
-    buf = ""
-
-  inputStream.close()
-  return document
-
-proc parseHtml*(inputStream: Stream, source: var string): Document =
-  return parseHtml(inputStream, true, source)
-
-proc parseHtml*(inputStream: Stream): Document =
-  var placeholder = ""
-  return parseHtml(inputStream, false, placeholder)
+proc parseHTML5*(inputStream: Stream): Document =
+  var parser: HTML5Parser
+  parser.document = newDocument()
+  parser.tokenizer = inputStream.newTokenizer()
+  return parser.constructTree()
diff --git a/src/html/htmltokenizer.nim b/src/html/htmltokenizer.nim
new file mode 100644
index 00000000..29680d19
--- /dev/null
+++ b/src/html/htmltokenizer.nim
@@ -0,0 +1,1525 @@
+import options
+import streams
+import strformat
+import strutils
+import macros
+import tables
+import unicode
+
+import html/entity
+import html/tags
+import utils/radixtree
+import utils/twtstr
+
+# Tokenizer
+type
+  Tokenizer* = object
+    state*: TokenizerState
+    rstate: TokenizerState
+    curr: Rune
+    tmp: string
+    code: int
+    tok: Token
+    laststart: Token
+    attrn: string
+    attrv: string
+    attr: bool
+
+    istream: Stream
+    sbuf: string
+    sbuf_i: int
+    sbuf_ip: int
+    eof_i: int
+
+  TokenType* = enum
+    DOCTYPE, START_TAG, END_TAG, COMMENT, CHARACTER, CHARACTER_ASCII, EOF
+
+  TokenizerState* = enum
+    DATA, CHARACTER_REFERENCE, TAG_OPEN, RCDATA, RCDATA_LESS_THAN_SIGN,
+    RAWTEXT, RAWTEXT_LESS_THAN_SIGN, SCRIPT_DATA, SCRIPT_DATA_LESS_THAN_SIGN,
+    PLAINTEXT, MARKUP_DECLARATION_OPEN, END_TAG_OPEN, BOGUS_COMMENT, TAG_NAME,
+    BEFORE_ATTRIBUTE_NAME, RCDATA_END_TAG_OPEN, RCDATA_END_TAG_NAME,
+    RAWTEXT_END_TAG_OPEN, RAWTEXT_END_TAG_NAME, SELF_CLOSING_START_TAG,
+    SCRIPT_DATA_END_TAG_OPEN, SCRIPT_DATA_ESCAPE_START,
+    SCRIPT_DATA_END_TAG_NAME, SCRIPT_DATA_ESCAPE_START_DASH,
+    SCRIPT_DATA_ESCAPED_DASH_DASH, SCRIPT_DATA_ESCAPED,
+    SCRIPT_DATA_ESCAPED_DASH, SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN,
+    SCRIPT_DATA_ESCAPED_END_TAG_OPEN, SCRIPT_DATA_DOUBLE_ESCAPE_START,
+    SCRIPT_DATA_ESCAPED_END_TAG_NAME, SCRIPT_DATA_DOUBLE_ESCAPED,
+    SCRIPT_DATA_DOUBLE_ESCAPED_DASH, SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN,
+    SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH, SCRIPT_DATA_DOUBLE_ESCAPE_END,
+    AFTER_ATTRIBUTE_NAME, ATTRIBUTE_NAME, BEFORE_ATTRIBUTE_VALUE,
+    ATTRIBUTE_VALUE_DOUBLE_QUOTED, ATTRIBUTE_VALUE_SINGLE_QUOTED,
+    ATTRIBUTE_VALUE_UNQUOTED, AFTER_ATTRIBUTE_VALUE_QUOTED, COMMENT_START,
+    CDATA_SECTION, COMMENT_START_DASH, COMMENT, COMMENT_END,
+    COMMENT_LESS_THAN_SIGN, COMMENT_END_DASH, COMMENT_LESS_THAN_SIGN_BANG,
+    COMMENT_LESS_THAN_SIGN_BANG_DASH, COMMENT_LESS_THAN_SIGN_BANG_DASH_DASH,
+    COMMENT_END_BANG, DOCTYPE, BEFORE_DOCTYPE_NAME, DOCTYPE_NAME,
+    AFTER_DOCTYPE_NAME, AFTER_DOCTYPE_PUBLIC_KEYWORD,
+    AFTER_DOCTYPE_SYSTEM_KEYWORD, BOGUS_DOCTYPE,
+    BEFORE_DOCTYPE_PUBLIC_IDENTIFIER, DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED,
+    DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED, AFTER_DOCTYPE_PUBLIC_IDENTIFIER,
+    BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS,
+    DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED,
+    DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED, BEFORE_DOCTYPE_SYSTEM_IDENTIFIER,
+    AFTER_DOCTYPE_SYSTEM_IDENTIFIER, CDATA_SECTION_BRACKET, CDATA_SECTION_END,
+    NAMED_CHARACTER_REFERENCE, NUMERIC_CHARACTER_REFERENCE,
+    AMBIGUOUS_AMPERSAND_STATE, HEXADECIMAL_CHARACTER_REFERENCE_START,
+    DECIMAL_CHARACTER_REFERENCE_START, HEXADECIMAL_CHARACTER_REFERENCE,
+    DECIMAL_CHARACTER_REFERENCE, NUMERIC_CHARACTER_REFERENCE_END
+
+  Token* = ref object
+    case t*: TokenType
+    of DOCTYPE:
+      name*: Option[string]
+      pubid*: Option[string]
+      sysid*: Option[string]
+      quirks*: bool
+    of START_TAG, END_TAG:
+      tagname*: string
+      tagtype*: TagType
+      selfclosing*: bool
+      attrs*: Table[string, string]
+    of CHARACTER:
+      r*: Rune
+    of CHARACTER_ASCII:
+      c*: char
+    of COMMENT:
+      data*: string
+    of EOF: discard
+
+func `$`*(tok: Token): string =
+  case tok.t
+  of DOCTYPE: fmt"{tok.t} {tok.name} {tok.pubid} {tok.sysid} {tok.quirks}"
+  of START_TAG, END_TAG: fmt"{tok.t} {tok.tagname} {tok.selfclosing} {tok.attrs}"
+  of CHARACTER: fmt"{tok.t} {tok.r}"
+  of CHARACTER_ASCII: fmt"{tok.t} {tok.c}"
+  of COMMENT: fmt"{tok.t} {tok.data}"
+  of EOF: fmt"{tok.t}"
+
+const bufSize = 512
+const copyBufSize = 16
+proc newTokenizer*(s: Stream): Tokenizer =
+  result.sbuf = newString(bufSize)
+  result.istream = s
+  result.eof_i = -1
+  if result.istream.atEnd:
+    result.eof_i = 0
+  else:
+    let n = s.readDataStr(result.sbuf, 0..bufSize-1)
+    if n != bufSize:
+      result.eof_i = n
+
+func atEof(t: Tokenizer): bool =
+  t.eof_i != -1 and t.sbuf_i >= t.eof_i
+
+proc consume(t: var Tokenizer): char {.inline.} =
+  if t.eof_i == -1 and t.sbuf_i >= bufSize-copyBufSize:
+    # Workaround to swap buffer without breaking fastRuneAt.
+    var sbuf2 = newString(copyBufSize)
+    var i = 0
+    while t.sbuf_i + i < bufSize:
+      sbuf2[i] = t.sbuf[t.sbuf_i + i]
+      inc i
+    let n = t.istream.readDataStr(t.sbuf, i..bufSize-1)
+    if n != bufSize - i:
+      t.eof_i = i + n
+    t.sbuf_i = 0
+
+    var j = 0
+    while j < i:
+      t.sbuf[j] = sbuf2[j]
+      inc j
+
+  assert t.eof_i == -1 or t.sbuf_i < t.eof_i # not consuming eof...
+  t.sbuf_ip = t.sbuf_i # save previous pointer for potential reconsume
+
+  # Normalize newlines (\r\n -> \n, single \r -> \n)
+  if t.sbuf[t.sbuf_i] == '\r':
+    inc t.sbuf_i
+    if t.sbuf[t.sbuf_i] != '\n':
+      # \r
+      result = '\n'
+      t.curr = Rune('\n')
+      return
+    # else, \r\n so just return the \n
+
+  result = t.sbuf[t.sbuf_i]
+  fastRuneAt(t.sbuf, t.sbuf_i, t.curr)
+
+proc reconsume(t: var Tokenizer) =
+  t.sbuf_i = t.sbuf_ip
+
+iterator tokenize*(tokenizer: var Tokenizer): Token =
+  template emit(tok: Token) =
+    if tok.t == START_TAG:
+      tokenizer.laststart = tok
+    if tok.t in {START_TAG, END_TAG}:
+      tok.tagtype = tagType(tok.tagName)
+    yield tok
+  template emit(tok: TokenType) = emit Token(t: tok)
+  template emit(rn: Rune) = emit Token(t: CHARACTER, r: rn)
+  template emit(ch: char) = emit Token(t: CHARACTER_ASCII, c: ch)
+  template emit_eof =
+    emit EOF
+    break
+  template emit_tok =
+    if tokenizer.attr:
+      tokenizer.tok.attrs[tokenizer.attrn] = tokenizer.attrv
+    emit tokenizer.tok
+  template emit_current =
+    if is_eof:
+      emit_eof
+    elif c in Ascii:
+      emit c
+    else:
+      emit tokenizer.curr
+  template emit_replacement = emit Rune(0xFFFD)
+  template switch_state(s: TokenizerState) =
+    tokenizer.state = s
+  template switch_state_return(s: TokenizerState) =
+    tokenizer.rstate = tokenizer.state
+    tokenizer.state = s
+  template reconsume_in(s: TokenizerState) =
+    tokenizer.reconsume()
+    switch_state s
+  template parse_error(error: untyped) = discard # does nothing for now... TODO?
+  template is_appropriate_end_tag_token(): bool =
+    tokenizer.laststart != nil and tokenizer.laststart.tagname == tokenizer.tok.tagname
+  template start_new_attribute =
+    if tokenizer.attr:
+      tokenizer.tok.attrs[tokenizer.attrn] = tokenizer.attrv
+    tokenizer.attrn = ""
+    tokenizer.attrv = ""
+    tokenizer.attr = true
+  template leave_attribute_name_state =
+    if tokenizer.attrn in tokenizer.tok.attrs:
+      tokenizer.attr = false
+  template append_to_current_attr_value(c: typed) =
+    if tokenizer.attr:
+      tokenizer.attrv &= c
+  template peek_str(s: string): bool =
+    # WARNING: will break on strings with copyBufSize + 4 bytes
+    assert s.len < copyBufSize - 4 and s.len > 0
+    if tokenizer.sbuf_i + s.len > tokenizer.eof_i:
+      false
+    else:
+      let slice = tokenizer.sbuf[tokenizer.sbuf_i..tokenizer.sbuf_i+s.high]
+      s == slice
+  template peek_str_nocase(s: string): bool =
+    # WARNING: will break on strings with copyBufSize + 4 bytes
+    # WARNING: only works with UPPER CASE ascii
+    assert s.len < copyBufSize - 4 and s.len > 0
+    if tokenizer.sbuf_i + s.len > tokenizer.eof_i:
+      false
+    else:
+      let slice = tokenizer.sbuf[tokenizer.sbuf_i..tokenizer.sbuf_i+s.high]
+      s == slice.toUpperAscii()
+  template peek_char(): char = tokenizer.sbuf[tokenizer.sbuf_i]
+  template has_adjusted_current_node(): bool = false #TODO implement this
+  template consume_and_discard(n: int) = #TODO optimize
+    var i = 0
+    while i < n:
+      discard tokenizer.consume()
+      inc i
+  template consumed_as_an_attribute(): bool =
+    tokenizer.rstate in {ATTRIBUTE_VALUE_DOUBLE_QUOTED, ATTRIBUTE_VALUE_SINGLE_QUOTED, ATTRIBUTE_VALUE_UNQUOTED}
+  template emit_tmp() =
+    var i = 0
+    while i < tokenizer.tmp.len:
+      if tokenizer.tmp[i].isAscii():
+        emit tokenizer.tmp[i]
+        inc i
+      else:
+        var r: Rune
+        fastRuneAt(tokenizer.tmp, i, r)
+        emit r
+  template flush_code_points_consumed_as_a_character_reference() =
+    if consumed_as_an_attribute:
+      append_to_current_attr_value tokenizer.tmp
+    else:
+      emit_tmp
+  template new_token(t: Token) =
+    if tokenizer.attr:
+      tokenizer.attr = false
+    tokenizer.tok = t
+
+  # Fake EOF as an actual character. Also replace anything_else with the else
+  # branch.
+  macro stateMachine(states: varargs[untyped]): untyped =
+    var maincase = newNimNode(nnkCaseStmt).add(quote do: tokenizer.state)
+    for state in states:
+      if state.kind == nnkOfBranch:
+        let mainstmtlist = findChild(state, it.kind == nnkStmtList)
+        if mainstmtlist[0].kind == nnkIdent and mainstmtlist[0].strVal == "ignore_eof":
+          maincase.add(state)
+          continue
+
+        var hasanythingelse = false
+        if mainstmtlist[0].kind == nnkIdent and mainstmtlist[0].strVal == "has_anything_else":
+          hasanythingelse = true
+
+        let childcase = findChild(mainstmtlist, it.kind == nnkCaseStmt)
+        var haseof = false
+        var eofstmts: NimNode
+        var elsestmts: NimNode
+
+        for i in countdown(childcase.len-1, 0):
+          let childof = childcase[i]
+          if childof.kind == nnkOfBranch:
+            for j in countdown(childof.len-1, 0):
+              if childof[j].kind == nnkIdent and childof[j].strVal == "eof":
+                haseof = true
+                eofstmts = childof.findChild(it.kind == nnkStmtList)
+                if childof.findChild(it.kind == nnkIdent and it.strVal != "eof") != nil:
+                  childof.del(j)
+                else:
+                  childcase.del(i)
+          elif childof.kind == nnkElse:
+            elsestmts = childof.findChild(it.kind == nnkStmtList)
+
+        if not haseof:
+          eofstmts = elsestmts
+        let fake_eof = quote do:
+          if is_eof:
+            `eofstmts`
+            continue
+        mainstmtlist.insert(0, fake_eof)
+        if hasanythingelse:
+          let fake_anything_else = quote do:
+            template anything_else =
+              `elsestmts`
+          mainstmtlist.insert(0, fake_anything_else)
+      maincase.add(state)
+    result = newNimNode(nnkStmtList)
+    result.add(maincase)
+
+  template ignore_eof = discard # does nothing
+  template has_anything_else = discard # does nothing
+
+  const null = char(0)
+  const whitespace = {'\t', '\n', '\f', ' '}
+
+  while true:
+    {.computedGoto.}
+    #eprint tokenizer.state #debug
+    let is_eof = tokenizer.atEof # set eof here, otherwise we would exit at the last character
+    let c = if not is_eof:
+      tokenizer.consume()
+    else:
+      # avoid consuming eof...
+      null
+    stateMachine: # => case tokenizer.state
+    of DATA:
+      case c
+      of '&': switch_state_return CHARACTER_REFERENCE
+      of '<': switch_state TAG_OPEN
+      of null:
+        parse_error unexpected_null_character
+        emit_current
+      of eof: emit_eof
+      else: emit_current
+
+    of RCDATA:
+      case c
+      of '&': switch_state_return CHARACTER_REFERENCE
+      of '<': switch_state RCDATA_LESS_THAN_SIGN
+      of null: parse_error unexpected_null_character
+      of eof: emit_eof
+      else: emit_current
+
+    of RAWTEXT:
+      case c
+      of '<': switch_state RAWTEXT_LESS_THAN_SIGN
+      of null:
+        parse_error unexpected_null_character
+        emit_replacement
+      of eof: emit_eof
+      else: emit_current
+
+    of SCRIPT_DATA:
+      case c
+      of '<': switch_state SCRIPT_DATA_LESS_THAN_SIGN
+      of null:
+        parse_error unexpected_null_character
+        emit_replacement
+      of eof: emit_eof
+      else: emit_current
+
+    of PLAINTEXT:
+      case c
+      of null:
+        parse_error unexpected_null_character
+        emit_replacement
+      of eof: emit_eof
+      else: emit_current
+
+    of TAG_OPEN:
+      case c
+      of '!': switch_state MARKUP_DECLARATION_OPEN
+      of '/': switch_state END_TAG_OPEN
+      of AsciiAlpha:
+        new_token Token(t: START_TAG)
+        reconsume_in TAG_NAME
+      of '?':
+        parse_error unexpected_question_mark_instead_of_tag_name
+        new_token Token(t: COMMENT)
+        reconsume_in BOGUS_COMMENT
+      of eof:
+        parse_error eof_before_tag_name
+        emit '<'
+        emit_eof
+      else:
+        parse_error invalid_first_character_of_tag_name
+        emit '<'
+        reconsume_in DATA
+
+    of END_TAG_OPEN:
+      case c
+      of AsciiAlpha:
+        new_token Token(t: END_TAG)
+        reconsume_in TAG_NAME
+      of '>':
+        parse_error missing_end_tag_name
+        switch_state DATA
+      of eof:
+        parse_error eof_before_tag_name
+        emit '<'
+        emit '/'
+        emit_eof
+      else:
+        parse_error invalid_first_character_of_tag_name
+        new_token Token(t: COMMENT)
+        reconsume_in BOGUS_COMMENT
+
+    of TAG_NAME:
+      case c
+      of whitespace: switch_state BEFORE_ATTRIBUTE_NAME
+      of '/': switch_state SELF_CLOSING_START_TAG
+      of '>':
+        switch_state DATA
+        emit_tok
+      of AsciiUpperAlpha: tokenizer.tok.tagname &= char(tokenizer.curr).tolower()
+      of null:
+        parse_error unexpected_null_character
+        tokenizer.tok.tagname &= Rune(0xFFFD)
+      of eof:
+        parse_error eof_in_tag
+        emit_eof
+      else: tokenizer.tok.tagname &= tokenizer.curr
+
+    of RCDATA_LESS_THAN_SIGN:
+      case c
+      of '/':
+        tokenizer.tmp = ""
+        switch_state RCDATA_END_TAG_OPEN
+      else:
+        emit '<'
+        reconsume_in RCDATA
+
+    of RCDATA_END_TAG_OPEN:
+      case c
+      of AsciiAlpha:
+        new_token Token(t: END_TAG)
+        reconsume_in RCDATA_END_TAG_NAME
+      else:
+        emit '<'
+        emit '/'
+        reconsume_in RCDATA
+
+    of RCDATA_END_TAG_NAME:
+      has_anything_else
+      case c
+      of whitespace:
+        if is_appropriate_end_tag_token:
+          switch_state BEFORE_ATTRIBUTE_NAME
+        else:
+          anything_else
+      of '/':
+        if is_appropriate_end_tag_token:
+          switch_state SELF_CLOSING_START_TAG
+        else:
+          anything_else
+      of '>':
+        if is_appropriate_end_tag_token:
+          switch_state DATA
+          emit_tok
+        else:
+          anything_else
+      of AsciiAlpha: # note: merged upper & lower
+        tokenizer.tok.tagname &= char(tokenizer.curr).tolower()
+        tokenizer.tmp &= tokenizer.curr
+      else:
+        new_token nil #TODO
+        emit '<'
+        emit '/'
+        emit_tmp
+        reconsume_in RCDATA
+
+    of RAWTEXT_LESS_THAN_SIGN:
+      case c
+      of '/':
+        tokenizer.tmp = ""
+        switch_state RAWTEXT_END_TAG_OPEN
+      else:
+        emit '<'
+        reconsume_in RAWTEXT
+
+    of RAWTEXT_END_TAG_OPEN:
+      case c
+      of AsciiAlpha:
+        new_token Token(t: END_TAG)
+        reconsume_in RAWTEXT_END_TAG_NAME
+      else:
+        emit '<'
+        emit '/'
+        reconsume_in RAWTEXT
+
+    of RAWTEXT_END_TAG_NAME:
+      has_anything_else
+      case c
+      of whitespace:
+        if is_appropriate_end_tag_token:
+          switch_state BEFORE_ATTRIBUTE_NAME
+        else:
+          anything_else
+      of '/':
+        if is_appropriate_end_tag_token:
+          switch_state SELF_CLOSING_START_TAG
+        else:
+          anything_else
+      of '>':
+        if is_appropriate_end_tag_token:
+          switch_state DATA
+          emit_tok
+        else:
+          anything_else
+      of AsciiAlpha: # note: merged upper & lower
+        tokenizer.tok.tagname &= char(tokenizer.curr).tolower()
+        tokenizer.tmp &= tokenizer.curr
+      else:
+        new_token nil #TODO
+        emit '<'
+        emit '/'
+        for r in tokenizer.tmp.runes:
+          emit r
+        reconsume_in RAWTEXT
+
+    of SCRIPT_DATA_LESS_THAN_SIGN:
+      case c
+      of '/':
+        tokenizer.tmp = ""
+        switch_state SCRIPT_DATA_END_TAG_OPEN
+      of '!':
+        switch_state SCRIPT_DATA_ESCAPE_START
+        emit '<'
+        emit '!'
+      else:
+        emit '<'
+        reconsume_in SCRIPT_DATA
+
+    of SCRIPT_DATA_END_TAG_OPEN:
+      case c
+      of AsciiAlpha:
+        new_token Token(t: END_TAG)
+        reconsume_in SCRIPT_DATA_END_TAG_NAME
+      else:
+        emit '<'
+        emit '/'
+        reconsume_in SCRIPT_DATA
+
+    of SCRIPT_DATA_END_TAG_NAME:
+      has_anything_else
+      case c
+      of whitespace:
+        if is_appropriate_end_tag_token:
+          switch_state BEFORE_ATTRIBUTE_NAME
+        else:
+          anything_else
+      of '/':
+        if is_appropriate_end_tag_token:
+          switch_state SELF_CLOSING_START_TAG
+        else:
+          anything_else
+      of '>':
+        if is_appropriate_end_tag_token:
+          switch_state DATA
+          emit_tok
+        else:
+          anything_else
+      of AsciiAlpha: # note: merged upper & lower
+        tokenizer.tok.tagname &= char(tokenizer.curr).tolower()
+        tokenizer.tmp &= tokenizer.curr
+      else:
+        emit '<'
+        emit '/'
+        emit_tmp
+        reconsume_in SCRIPT_DATA
+
+    of SCRIPT_DATA_ESCAPE_START:
+      case c
+      of '-':
+        switch_state SCRIPT_DATA_ESCAPE_START_DASH
+        emit '-'
+      else:
+        reconsume_in SCRIPT_DATA
+
+    of SCRIPT_DATA_ESCAPE_START_DASH:
+      case c
+      of '-':
+        switch_state SCRIPT_DATA_ESCAPED_DASH_DASH
+        emit '-'
+      else:
+        reconsume_in SCRIPT_DATA
+
+    of SCRIPT_DATA_ESCAPED:
+      case c
+      of '-':
+        switch_state SCRIPT_DATA_ESCAPED_DASH
+        emit '-'
+      of '<':
+        switch_state SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN
+      of null:
+        parse_error unexpected_null_character
+        emit_replacement
+      of eof:
+        parse_error eof_in_script_html_comment_like_text
+        emit_eof
+      else:
+        emit_current
+
+    of SCRIPT_DATA_ESCAPED_DASH:
+      case c
+      of '-':
+        switch_state SCRIPT_DATA_ESCAPED_DASH_DASH
+        emit '-'
+      of '<':
+        switch_state SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN
+      of null:
+        parse_error unexpected_null_character
+        switch_state SCRIPT_DATA_ESCAPED
+      of eof:
+        parse_error eof_in_script_html_comment_like_text
+        emit_eof
+      else:
+        switch_state SCRIPT_DATA_ESCAPED
+        emit_current
+
+    of SCRIPT_DATA_ESCAPED_DASH_DASH:
+      case c
+      of '-':
+        emit '-'
+      of '<':
+        switch_state SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN
+      of '>':
+        switch_state SCRIPT_DATA
+        emit '>'
+      of null:
+        parse_error unexpected_null_character
+        switch_state SCRIPT_DATA_ESCAPED
+      of eof:
+        parse_error eof_in_script_html_comment_like_text
+        emit_eof
+      else:
+        switch_state SCRIPT_DATA_ESCAPED
+        emit_current
+
+    of SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN:
+      case c
+      of '/':
+        tokenizer.tmp = ""
+        switch_state SCRIPT_DATA_ESCAPED_END_TAG_OPEN
+      of AsciiAlpha:
+        tokenizer.tmp = ""
+        emit '<'
+        reconsume_in SCRIPT_DATA_DOUBLE_ESCAPE_START
+      else:
+        emit '<'
+        reconsume_in SCRIPT_DATA_ESCAPED
+
+    of SCRIPT_DATA_ESCAPED_END_TAG_OPEN:
+      case c
+      of AsciiAlpha:
+        new_token Token(t: START_TAG)
+        reconsume_in SCRIPT_DATA_ESCAPED_END_TAG_NAME
+      else:
+        emit '<'
+        emit '/'
+        reconsume_in SCRIPT_DATA_ESCAPED
+
+    of SCRIPT_DATA_ESCAPED_END_TAG_NAME:
+      has_anything_else
+      case c
+      of whitespace:
+        if is_appropriate_end_tag_token:
+          switch_state BEFORE_ATTRIBUTE_NAME
+        else:
+          anything_else
+      of '/':
+        if is_appropriate_end_tag_token:
+          switch_state SELF_CLOSING_START_TAG
+        else:
+          anything_else
+      of '>':
+        if is_appropriate_end_tag_token:
+          switch_state DATA
+        else:
+          anything_else
+      of AsciiAlpha:
+        tokenizer.tok.tagname &= char(tokenizer.curr).tolower()
+        tokenizer.tmp &= tokenizer.curr
+      else:
+        emit '<'
+        emit '/'
+        emit_tmp
+        reconsume_in SCRIPT_DATA_ESCAPED
+
+    of SCRIPT_DATA_DOUBLE_ESCAPE_START:
+      case c
+      of whitespace, '/', '>':
+        if tokenizer.tmp == "script":
+          switch_state SCRIPT_DATA_DOUBLE_ESCAPED
+        else:
+          switch_state SCRIPT_DATA_ESCAPED
+          emit_current
+      of AsciiAlpha: # note: merged upper & lower
+        tokenizer.tmp &= c.tolower()
+        emit_current
+      else: reconsume_in SCRIPT_DATA_ESCAPED
+
+    of SCRIPT_DATA_DOUBLE_ESCAPED:
+      case c
+      of '-':
+        switch_state SCRIPT_DATA_DOUBLE_ESCAPED_DASH
+        emit '-'
+      of '<':
+        switch_state SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN
+        emit '<'
+      of null:
+        parse_error unexpected_null_character
+        emit_replacement
+      of eof:
+        parse_error eof_in_script_html_comment_like_text
+        emit_eof
+      else: emit_current
+
+    of SCRIPT_DATA_DOUBLE_ESCAPED_DASH:
+      case c
+      of '-':
+        switch_state SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH
+        emit '-'
+      of '<':
+        switch_state SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN
+        emit '<'
+      of null:
+        parse_error unexpected_null_character
+        switch_state SCRIPT_DATA_DOUBLE_ESCAPED
+        emit_replacement
+      of eof:
+        parse_error eof_in_script_html_comment_like_text
+        emit_eof
+      else:
+        switch_state SCRIPT_DATA_DOUBLE_ESCAPED
+        emit_current
+
+    of SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH:
+      case c
+      of '-': emit '-'
+      of '<':
+        switch_state SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN
+        emit '<'
+      of '>':
+        switch_state SCRIPT_DATA
+        emit '>'
+      of null:
+        parse_error unexpected_null_character
+        switch_state SCRIPT_DATA_DOUBLE_ESCAPED
+        emit_replacement
+      of eof:
+        parse_error eof_in_script_html_comment_like_text
+        emit_eof
+      else: switch_state SCRIPT_DATA_DOUBLE_ESCAPED
+
+    of SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN:
+      case c
+      of '/':
+        tokenizer.tmp = ""
+        switch_state SCRIPT_DATA_DOUBLE_ESCAPE_END
+        emit '/'
+      else: reconsume_in SCRIPT_DATA_DOUBLE_ESCAPED
+
+    of SCRIPT_DATA_DOUBLE_ESCAPE_END:
+      case c
+      of whitespace, '/', '>':
+        if tokenizer.tmp == "script":
+          switch_state SCRIPT_DATA_ESCAPED
+        else:
+          switch_state SCRIPT_DATA_DOUBLE_ESCAPED
+          emit_current
+      of AsciiAlpha: # note: merged upper & lower
+        tokenizer.tmp &= c.tolower()
+        emit_current
+      else:
+        reconsume_in SCRIPT_DATA_DOUBLE_ESCAPED
+
+    of BEFORE_ATTRIBUTE_NAME:
+      case c
+      of whitespace: discard
+      of '/', '>', eof: reconsume_in AFTER_ATTRIBUTE_NAME
+      of '=':
+        parse_error unexpected_equals_sign_before_attribute_name
+        start_new_attribute
+        switch_state ATTRIBUTE_NAME
+      else:
+        start_new_attribute
+        reconsume_in ATTRIBUTE_NAME
+
+    of ATTRIBUTE_NAME:
+      has_anything_else
+      case c
+      of whitespace, '/', '>', eof:
+        leave_attribute_name_state
+        reconsume_in AFTER_ATTRIBUTE_NAME
+      of '=':
+        leave_attribute_name_state
+        switch_state BEFORE_ATTRIBUTE_VALUE
+      of AsciiUpperAlpha:
+        tokenizer.attrn &= c.tolower()
+      of null:
+        parse_error unexpected_null_character
+        tokenizer.attrn &= Rune(0xFFFD)
+      of '"', '\'', '<':
+        parse_error unexpected_character_in_attribute_name
+        anything_else
+      else:
+        tokenizer.attrn &= tokenizer.curr
+
+    of AFTER_ATTRIBUTE_NAME:
+      case c
+      of whitespace: discard
+      of '/': switch_state SELF_CLOSING_START_TAG
+      of '=': switch_state BEFORE_ATTRIBUTE_VALUE
+      of '>':
+        switch_state DATA
+        emit '>'
+      of eof:
+        parse_error eof_in_tag
+        emit_eof
+      else:
+        start_new_attribute
+        reconsume_in ATTRIBUTE_NAME
+
+    of BEFORE_ATTRIBUTE_VALUE:
+      case c
+      of whitespace: discard
+      of '"': switch_state ATTRIBUTE_VALUE_DOUBLE_QUOTED
+      of '\'': switch_state ATTRIBUTE_VALUE_SINGLE_QUOTED
+      of '>':
+        parse_error missing_attribute_value
+        switch_state DATA
+        emit '>'
+      else: reconsume_in ATTRIBUTE_VALUE_UNQUOTED
+
+    of ATTRIBUTE_VALUE_DOUBLE_QUOTED:
+      case c
+      of '"': switch_state AFTER_ATTRIBUTE_VALUE_QUOTED
+      of '&': switch_state_return CHARACTER_REFERENCE
+      of null:
+        parse_error unexpected_null_character
+        append_to_current_attr_value Rune(0xFFFD)
+      of eof:
+        parse_error eof_in_tag
+        emit_eof
+      else: append_to_current_attr_value tokenizer.curr
+
+    of ATTRIBUTE_VALUE_SINGLE_QUOTED:
+      case c
+      of '\'': switch_state AFTER_ATTRIBUTE_VALUE_QUOTED
+      of '&': switch_state_return CHARACTER_REFERENCE
+      of null:
+        parse_error unexpected_null_character
+        append_to_current_attr_value Rune(0xFFFD)
+      of eof:
+        parse_error eof_in_tag
+        emit_eof
+      else: append_to_current_attr_value tokenizer.curr
+
+    of ATTRIBUTE_VALUE_UNQUOTED:
+      case c
+      of whitespace: switch_state BEFORE_ATTRIBUTE_NAME
+      of '&': switch_state_return CHARACTER_REFERENCE
+      of '>': switch_state DATA
+      of null:
+        parse_error unexpected_null_character
+        append_to_current_attr_value Rune(0xFFFD)
+      of '"', '\'', '<', '=', '`':
+        parse_error unexpected_character_in_unquoted_attribute_value
+        append_to_current_attr_value c
+      of eof:
+        parse_error eof_in_tag
+        emit_eof
+      else: append_to_current_attr_value tokenizer.curr
+
+    of AFTER_ATTRIBUTE_VALUE_QUOTED:
+      case c
+      of whitespace:
+        switch_state BEFORE_ATTRIBUTE_NAME
+      of '/':
+        switch_state SELF_CLOSING_START_TAG
+      of '>':
+        switch_state DATA
+        emit_tok
+      of eof:
+        parse_error eof_in_tag
+        emit_eof
+      else: append_to_current_attr_value tokenizer.curr
+
+    of SELF_CLOSING_START_TAG:
+      case c
+      of '>':
+        tokenizer.tok.selfclosing = true
+        switch_state DATA
+        emit_tok
+      of eof:
+        parse_error eof_in_tag
+        emit_eof
+      else:
+        parse_error unexpected_solidus_in_tag
+        reconsume_in BEFORE_ATTRIBUTE_NAME
+
+    of BOGUS_COMMENT:
+      assert tokenizer.tok.t == COMMENT
+      case c
+      of '>':
+        switch_state DATA
+        emit_tok
+      of eof:
+        emit_tok
+        emit_eof
+      of null: parse_error unexpected_null_character
+      else: tokenizer.tok.data &= tokenizer.curr
+
+    of MARKUP_DECLARATION_OPEN: # note: rewritten to fit case model as we consume a char anyway
+      has_anything_else
+      case c
+      of '-':
+        if peek_char == '-':
+          new_token Token(t: COMMENT)
+          tokenizer.state = COMMENT_START
+          consume_and_discard 1
+        else: anything_else
+      of 'D', 'd':
+        if peek_str_nocase("OCTYPE"):
+          consume_and_discard "OCTYPE".len
+          switch_state DOCTYPE
+        else: anything_else
+      of '[':
+        if peek_str("CDATA["):
+          consume_and_discard "CDATA[".len
+          if has_adjusted_current_node: #TODO and it is not an element in the HTML namespace
+            switch_state CDATA_SECTION
+          else:
+            parse_error cdata_in_html_content
+            new_token Token(t: COMMENT, data: "[CDATA[")
+            switch_state BOGUS_COMMENT
+        else: anything_else
+      else:
+        parse_error incorrectly_opened_comment
+        new_token Token(t: COMMENT)
+        reconsume_in BOGUS_COMMENT
+
+    of COMMENT_START:
+      case c
+      of '-': switch_state COMMENT_START_DASH
+      of '>':
+        parse_error abrupt_closing_of_empty_comment
+        switch_state DATA
+        emit_tok
+      else: reconsume_in COMMENT
+
+    of COMMENT_START_DASH:
+      case c
+      of '-': switch_state COMMENT_END
+      of '>':
+        parse_error abrupt_closing_of_empty_comment
+        switch_state DATA
+        emit_tok
+      of eof:
+        parse_error eof_in_comment
+        emit_tok
+        emit_eof
+      else:
+        tokenizer.tok.data &= '-'
+        reconsume_in COMMENT
+
+    of COMMENT:
+      case c
+      of '<':
+        tokenizer.tok.data &= c
+        switch_state COMMENT_LESS_THAN_SIGN
+      of '-': switch_state COMMENT_END_DASH
+      of null:
+        parse_error unexpected_null_character
+        tokenizer.tok.data &= Rune(0xFFFD)
+      of eof:
+        parse_error eof_in_comment
+        emit_tok
+        emit_eof
+      else: tokenizer.tok.data &= tokenizer.curr
+
+    of COMMENT_LESS_THAN_SIGN:
+      case c
+      of '!':
+        tokenizer.tok.data &= c
+        switch_state COMMENT_LESS_THAN_SIGN_BANG
+      of '<': tokenizer.tok.data &= c
+      else: reconsume_in COMMENT
+
+    of COMMENT_LESS_THAN_SIGN_BANG:
+      case c
+      of '-': switch_state COMMENT_LESS_THAN_SIGN_BANG_DASH
+      else: reconsume_in COMMENT
+
+    of COMMENT_LESS_THAN_SIGN_BANG_DASH:
+      case c
+      of '-': switch_state COMMENT_LESS_THAN_SIGN_BANG_DASH_DASH
+      else: reconsume_in COMMENT_END_DASH
+
+    of COMMENT_LESS_THAN_SIGN_BANG_DASH_DASH:
+      case c
+      of '>', eof: reconsume_in COMMENT_END
+      else:
+        parse_error nested_comment
+        reconsume_in COMMENT_END
+
+    of COMMENT_END_DASH:
+      case c
+      of '-': switch_state COMMENT_END
+      of eof:
+        parse_error eof_in_comment
+        emit_tok
+        emit_eof
+      else:
+        tokenizer.tok.data &= '-'
+        reconsume_in COMMENT
+
+    of COMMENT_END:
+      case c
+      of '>': switch_state DATA
+      of '!': switch_state COMMENT_END_BANG
+      of '-': tokenizer.tok.data &= '-'
+      of eof:
+        parse_error eof_in_comment
+        emit_tok
+        emit_eof
+      else:
+        tokenizer.tok.data &= "--"
+        reconsume_in COMMENT
+
+    of COMMENT_END_BANG:
+      case c
+      of '-':
+        tokenizer.tok.data &= "--!"
+        switch_state COMMENT_END_DASH
+      of '>':
+        parse_error incorrectly_closed_comment
+        switch_state DATA
+        emit_tok
+      of eof:
+        parse_error eof_in_comment
+        emit_tok
+        emit_eof
+      else:
+        tokenizer.tok.data &= "--!"
+        reconsume_in COMMENT
+
+    of DOCTYPE:
+      case c
+      of whitespace: switch_state BEFORE_DOCTYPE_NAME
+      of '>': reconsume_in BEFORE_DOCTYPE_NAME
+      of eof:
+        parse_error eof_in_doctype
+        new_token Token(t: DOCTYPE, quirks: true)
+        emit_tok
+        emit_eof
+      else:
+        parse_error missing_whitespace_before_doctype_name
+        reconsume_in BEFORE_DOCTYPE_NAME
+
+    of BEFORE_DOCTYPE_NAME:
+      case c
+      of whitespace: discard
+      of AsciiUpperAlpha:
+        new_token Token(t: DOCTYPE, name: some($c.tolower()))
+        switch_state DOCTYPE_NAME
+      of null:
+        parse_error unexpected_null_character
+        new_token Token(t: DOCTYPE, name: some($Rune(0xFFFD)))
+      of '>':
+        parse_error missing_doctype_name
+        new_token Token(t: DOCTYPE, quirks: true)
+        switch_state DATA
+        emit_tok
+      of eof:
+        parse_error eof_in_doctype
+        new_token Token(t: DOCTYPE, quirks: true)
+        emit_tok
+        emit_eof
+      else:
+        new_token Token(t: DOCTYPE, name: some($tokenizer.curr))
+        switch_state DOCTYPE_NAME
+
+    of DOCTYPE_NAME:
+      case c
+      of whitespace: switch_state AFTER_DOCTYPE_NAME
+      of '>':
+        switch_state DATA
+        emit_tok
+      of AsciiUpperAlpha:
+        tokenizer.tok.name.get &= c.tolower()
+      of null:
+        parse_error unexpected_null_character
+        tokenizer.tok.name.get &= Rune(0xFFFD)
+      of eof:
+        parse_error eof_in_doctype
+        tokenizer.tok.quirks = true
+        emit_tok
+        emit_eof
+      else:
+        tokenizer.tok.name.get &= tokenizer.curr
+
+    of AFTER_DOCTYPE_NAME: # note: rewritten to fit case model as we consume a char anyway
+      has_anything_else
+      case c
+      of whitespace: discard
+      of '>':
+        switch_state DATA
+        emit_tok
+      of eof:
+        parse_error eof_in_doctype
+        tokenizer.tok.quirks = true
+        emit_tok
+        emit_eof
+      of 'p', 'P':
+        if peek_str("UBLIC"):
+          consume_and_discard "UBLIC".len
+          switch_state AFTER_DOCTYPE_PUBLIC_KEYWORD
+        else:
+          anything_else
+      of 's', 'S':
+        if peek_str("YSTEM"):
+          consume_and_discard "YSTEM".len
+          switch_state AFTER_DOCTYPE_SYSTEM_KEYWORD
+        else:
+          anything_else
+      else:
+        parse_error invalid_character_sequence_after_doctype_name
+        tokenizer.tok.quirks = true
+        reconsume_in BOGUS_DOCTYPE
+
+    of AFTER_DOCTYPE_PUBLIC_KEYWORD:
+      case c
+      of whitespace: switch_state BEFORE_DOCTYPE_PUBLIC_IDENTIFIER
+      of '"':
+        parse_error missing_whitespace_after_doctype_public_keyword
+        tokenizer.tok.pubid = some("")
+        switch_state DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED
+      of '>':
+        parse_error missing_doctype_public_identifier
+        tokenizer.tok.quirks = true
+        switch_state DATA
+        emit_tok
+      of eof:
+        parse_error eof_in_doctype
+        tokenizer.tok.quirks = true
+        emit_tok
+        emit_eof
+      else:
+        parse_error missing_quote_before_doctype_public_identifier
+        tokenizer.tok.quirks = true
+        reconsume_in BOGUS_DOCTYPE
+
+    of BEFORE_DOCTYPE_PUBLIC_IDENTIFIER:
+      case c
+      of whitespace: discard
+      of '"':
+        tokenizer.tok.pubid = some("")
+        switch_state DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED
+      of '\'':
+        tokenizer.tok.pubid = some("")
+        switch_state DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED
+      of '>':
+        parse_error missing_doctype_public_identifier
+        tokenizer.tok.quirks = true
+        switch_state DATA
+        emit_tok
+      of eof:
+        parse_error eof_in_doctype
+        tokenizer.tok.quirks = true
+        emit_tok
+        emit_eof
+      else:
+        parse_error missing_quote_before_doctype_public_identifier
+        tokenizer.tok.quirks = true
+        reconsume_in BOGUS_DOCTYPE
+
+    of DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED:
+      case c
+      of '"': switch_state AFTER_DOCTYPE_PUBLIC_IDENTIFIER
+      of null:
+        parse_error unexpected_null_character
+        tokenizer.tok.pubid.get &= Rune(0xFFFD)
+      of '>':
+        parse_error abrupt_doctype_public_identifier
+        tokenizer.tok.quirks = true
+        switch_state DATA
+        emit_tok
+      of eof:
+        parse_error eof_in_doctype
+        tokenizer.tok.quirks = true
+        emit_tok
+        emit_eof
+      else:
+        tokenizer.tok.pubid.get &= tokenizer.curr
+
+    of DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED:
+      case c
+      of '\'': switch_state AFTER_DOCTYPE_PUBLIC_IDENTIFIER
+      of null:
+        parse_error unexpected_null_character
+        tokenizer.tok.pubid.get &= Rune(0xFFFD)
+      of '>':
+        parse_error abrupt_doctype_public_identifier
+        tokenizer.tok.quirks = true
+        switch_state DATA
+        emit_tok
+      of eof:
+        parse_error eof_in_doctype
+        tokenizer.tok.quirks = true
+        emit_tok
+        emit_eof
+      else:
+        tokenizer.tok.pubid.get &= tokenizer.curr
+
+    of AFTER_DOCTYPE_PUBLIC_IDENTIFIER:
+      case c
+      of whitespace: switch_state BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS
+      of '>':
+        switch_state DATA
+        emit_tok
+      of '"':
+        parse_error missing_whitespace_between_doctype_public_and_system_identifiers
+        tokenizer.tok.sysid = some("")
+        switch_state DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED
+      of '\'':
+        parse_error missing_whitespace_between_doctype_public_and_system_identifiers
+        tokenizer.tok.sysid = some("")
+        switch_state DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED
+      of eof:
+        parse_error eof_in_doctype
+        tokenizer.tok.quirks = true
+        emit_tok
+        emit_eof
+      else:
+        parse_error missing_quote_before_doctype_system_identifier
+        tokenizer.tok.quirks = true
+        reconsume_in BOGUS_DOCTYPE
+
+    of BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS:
+      case c
+      of whitespace: discard
+      of '>':
+        switch_state DATA
+        emit_tok
+      of '"':
+        tokenizer.tok.sysid = some("")
+        switch_state DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED
+      of '\'':
+        tokenizer.tok.sysid = some("")
+        switch_state DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED
+      of eof:
+        parse_error eof_in_doctype
+        tokenizer.tok.quirks = true
+        emit_tok
+        emit_eof
+      else:
+        parse_error missing_quote_before_doctype_system_identifier
+        tokenizer.tok.quirks = true
+        reconsume_in BOGUS_DOCTYPE
+
+    of AFTER_DOCTYPE_SYSTEM_KEYWORD:
+      case c
+      of whitespace: switch_state BEFORE_DOCTYPE_SYSTEM_IDENTIFIER
+      of '"':
+        parse_error missing_whitespace_after_doctype_system_keyword
+        tokenizer.tok.sysid = some("")
+        switch_state DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED
+      of '\'':
+        parse_error missing_whitespace_after_doctype_system_keyword
+        tokenizer.tok.sysid = some("")
+        switch_state DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED
+      of '>':
+        parse_error missing_doctype_system_identifier
+        tokenizer.tok.quirks = true
+        switch_state DATA
+        emit_tok
+      of eof:
+        parse_error eof_in_doctype
+        tokenizer.tok.quirks = true
+        emit_tok
+        emit_eof
+      else:
+        parse_error missing_quote_before_doctype_system_identifier
+        tokenizer.tok.quirks = true
+        reconsume_in BOGUS_DOCTYPE
+
+    of BEFORE_DOCTYPE_SYSTEM_IDENTIFIER:
+      case c
+      of whitespace: discard
+      of '"':
+        tokenizer.tok.sysid = some("")
+        switch_state DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED
+      of '\'':
+        tokenizer.tok.sysid = some("")
+        switch_state DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED
+      of '>':
+        parse_error missing_doctype_system_identifier
+        tokenizer.tok.quirks = true
+        switch_state DATA
+        emit_tok
+      of eof:
+        parse_error eof_in_doctype
+        tokenizer.tok.quirks = true
+        emit_tok
+        emit_eof
+      else:
+        parse_error missing_quote_before_doctype_system_identifier
+        tokenizer.tok.quirks = true
+        reconsume_in BOGUS_DOCTYPE
+
+    of DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED:
+      case c
+      of '"': switch_state AFTER_DOCTYPE_SYSTEM_IDENTIFIER
+      of null:
+        parse_error unexpected_null_character
+        tokenizer.tok.sysid.get &= Rune(0xFFFD)
+      of '>':
+        parse_error abrupt_doctype_system_identifier
+        tokenizer.tok.quirks = true
+        switch_state DATA
+        emit_tok
+      of eof:
+        parse_error eof_in_doctype
+        tokenizer.tok.quirks = true
+        emit_tok
+        emit_eof
+      else:
+        tokenizer.tok.sysid.get &= tokenizer.curr
+
+    of DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED:
+      case c
+      of '\'': switch_state AFTER_DOCTYPE_SYSTEM_IDENTIFIER
+      of null:
+        parse_error unexpected_null_character
+        tokenizer.tok.sysid.get &= Rune(0xFFFD)
+      of '>':
+        parse_error abrupt_doctype_system_identifier
+        tokenizer.tok.quirks = true
+        switch_state DATA
+        emit_tok
+      of eof:
+        parse_error eof_in_doctype
+        tokenizer.tok.quirks = true
+        emit_tok
+        emit_eof
+      else:
+        tokenizer.tok.sysid.get &= tokenizer.curr
+
+    of AFTER_DOCTYPE_SYSTEM_IDENTIFIER:
+      case c
+      of whitespace: discard
+      of '>':
+        switch_state DATA
+        emit_tok
+      of eof:
+        parse_error eof_in_doctype
+        tokenizer.tok.quirks = true
+        emit_tok
+        emit_eof
+      else:
+        parse_error unexpected_character_after_doctype_system_identifier
+        reconsume_in BOGUS_DOCTYPE
+
+    of BOGUS_DOCTYPE:
+      case c
+      of '>':
+        switch_state DATA
+        emit_tok
+      of null: parse_error unexpected_null_character
+      of eof:
+        emit_tok
+        emit_eof
+      else: discard
+
+    of CDATA_SECTION:
+      case c
+      of ']': switch_state CDATA_SECTION_BRACKET
+      of eof:
+        parse_error eof_in_cdata
+        emit_eof
+      else:
+        emit_current
+
+    of CDATA_SECTION_BRACKET:
+      case c
+      of ']': switch_state CDATA_SECTION_END
+      else:
+        emit ']'
+        reconsume_in CDATA_SECTION
+
+    of CDATA_SECTION_END:
+      case c
+      of ']': emit ']'
+      of '>': switch_state DATA
+      else:
+        emit ']'
+        emit ']'
+        reconsume_in CDATA_SECTION
+
+    of CHARACTER_REFERENCE:
+      tokenizer.tmp = "&"
+      case c
+      of AsciiAlpha: reconsume_in NAMED_CHARACTER_REFERENCE
+      of '#':
+        tokenizer.tmp &= '#'
+        switch_state NUMERIC_CHARACTER_REFERENCE
+      else:
+        flush_code_points_consumed_as_a_character_reference
+        reconsume_in tokenizer.rstate
+
+    of NAMED_CHARACTER_REFERENCE:
+      ignore_eof # we check for eof ourselves
+      tokenizer.reconsume()
+      when nimVm:
+        eprint "Cannot evaluate character references at compile time"
+      else:
+        var buf = ""
+        var node = entityMap
+        var value = none(string) # last value
+        #TODO interfacing with RadixNode is suffering
+        # plus this doesn't look very efficient either
+        while not tokenizer.atEof:
+          let c = tokenizer.consume()
+          buf &= c
+          if not node.hasPrefix(buf):
+            tokenizer.reconsume()
+            break
+          let prevnode = node
+          node = node{buf}
+          if node != prevnode:
+            buf = ""
+            if node.value.issome:
+              value = node.value
+          tokenizer.tmp &= tokenizer.curr
+        if value.issome:
+          if consumed_as_an_attribute and tokenizer.tmp[^1] != ';' and peek_char in {'='} + AsciiAlpha + AsciiDigit:
+            flush_code_points_consumed_as_a_character_reference
+            switch_state tokenizer.rstate
+          else:
+            if tokenizer.tmp[^1] != ';':
+              parse_error missing_semicolon_after_character_reference
+            tokenizer.tmp = value.get
+            flush_code_points_consumed_as_a_character_reference
+            switch_state tokenizer.rstate
+        else:
+          flush_code_points_consumed_as_a_character_reference
+          switch_state AMBIGUOUS_AMPERSAND_STATE
+
+    of AMBIGUOUS_AMPERSAND_STATE:
+      case c
+      of AsciiAlpha, AsciiDigit:
+        if consumed_as_an_attribute:
+          append_to_current_attr_value c
+        else:
+          emit_current
+      of ';':
+        parse_error unknown_named_character_reference
+        reconsume_in tokenizer.rstate
+      else: reconsume_in tokenizer.rstate
+
+    of NUMERIC_CHARACTER_REFERENCE:
+      tokenizer.code = 0
+      case c
+      of 'x', 'X':
+        tokenizer.tmp &= c
+        switch_state HEXADECIMAL_CHARACTER_REFERENCE_START
+      else: reconsume_in DECIMAL_CHARACTER_REFERENCE_START
+
+    of HEXADECIMAL_CHARACTER_REFERENCE_START:
+      case c
+      of AsciiHexDigit: reconsume_in HEXADECIMAL_CHARACTER_REFERENCE
+      else:
+        parse_error absence_of_digits_in_numeric_character_reference
+        flush_code_points_consumed_as_a_character_reference
+        reconsume_in tokenizer.rstate
+
+    of DECIMAL_CHARACTER_REFERENCE_START:
+      case c
+      of AsciiDigit: reconsume_in DECIMAL_CHARACTER_REFERENCE
+      else:
+        parse_error absence_of_digits_in_numeric_character_reference
+        flush_code_points_consumed_as_a_character_reference
+        reconsume_in tokenizer.rstate
+
+    of HEXADECIMAL_CHARACTER_REFERENCE:
+      case c
+      of AsciiHexDigit: # note: merged digit, upper hex, lower hex
+        tokenizer.code *= 0x10
+        tokenizer.code += hexValue(c)
+      of ';': switch_state NUMERIC_CHARACTER_REFERENCE_END
+      else:
+        parse_error missing_semicolon_after_character_reference
+        reconsume_in NUMERIC_CHARACTER_REFERENCE_END
+
+    of DECIMAL_CHARACTER_REFERENCE:
+      case c
+      of AsciiDigit:
+        tokenizer.code *= 10
+        tokenizer.code += decValue(c)
+      of ';': switch_state NUMERIC_CHARACTER_REFERENCE_END
+      else:
+        parse_error missing_semicolon_after_character_reference
+        reconsume_in NUMERIC_CHARACTER_REFERENCE_END
+
+    of NUMERIC_CHARACTER_REFERENCE_END:
+      ignore_eof # we reconsume anyway
+      case tokenizer.code
+      of 0x00:
+        parse_error null_character_reference
+        tokenizer.code = 0xFFFD
+      elif tokenizer.code > 0x10FFFF:
+        parse_error character_reference_outside_unicode_range
+        tokenizer.code = 0xFFFD
+      elif Rune(tokenizer.code).isSurrogate():
+        parse_error surrogate_character_reference
+        tokenizer.code = 0xFFFD
+      elif Rune(tokenizer.code).isNonCharacter():
+        parse_error noncharacter_character_reference
+        # do nothing
+      elif tokenizer.code in 0..255 and char(tokenizer.code) in ((Controls - AsciiWhitespace) + {chr(0x0D)}):
+        const ControlMapTable = [
+          (0x80, 0x20AC), (0x82, 0x201A), (0x83, 0x0192), (0x84, 0x201E),
+          (0x85, 0x2026), (0x86, 0x2020), (0x87, 0x2021), (0x88, 0x02C6),
+          (0x89, 0x2030), (0x8A, 0x0160), (0x8B, 0x2039), (0x8C, 0x0152),
+          (0x8E, 0x017D), (0x91, 0x2018), (0x92, 0x2019), (0x93, 0x201C),
+          (0x94, 0x201D), (0x95, 0x2022), (0x96, 0x2013), (0x97, 0x2014),
+          (0x98, 0x02DC), (0x99, 0x2122), (0x9A, 0x0161), (0x9B, 0x203A),
+          (0x9C, 0x0153), (0x9E, 0x017E), (0x9F, 0x0178),
+        ].toTable()
+        if ControlMapTable.hasKey(tokenizer.code):
+          tokenizer.code = ControlMapTable[tokenizer.code]
+      tokenizer.tmp = $Rune(tokenizer.code)
+      flush_code_points_consumed_as_a_character_reference #TODO optimize so we flush directly
+      reconsume_in tokenizer.rstate # we unnecessarily consumed once so reconsume
+
diff --git a/src/html/tags.nim b/src/html/tags.nim
index 9e356444..d3bd7f6b 100644
--- a/src/html/tags.nim
+++ b/src/html/tags.nim
@@ -25,24 +25,27 @@ type
     INPUT_URL, INPUT_WEEK
 
   TagType* = enum
-    TAG_UNKNOWN, TAG_HTML, TAG_BASE, TAG_HEAD, TAG_LINK, TAG_META, TAG_STYLE,
+    TAG_UNKNOWN, TAG_APPLET, TAG_BIG, TAG_HTML, TAG_BASE, TAG_BASEFONT,
+    TAG_BGSOUND, TAG_HEAD, TAG_LINK, TAG_LISTING, TAG_META, TAG_STYLE,
     TAG_TITLE, TAG_BODY, TAG_ADDRESS, TAG_ARTICLE, TAG_ASIDE, TAG_FOOTER,
     TAG_HEADER, TAG_H1, TAG_H2, TAG_H3, TAG_H4, TAG_H5, TAG_H6, TAG_HGROUP,
     TAG_MAIN, TAG_NAV, TAG_SECTION, TAG_BLOCKQUOTE, TAG_DD, TAG_DIV, TAG_DL,
     TAG_DT, TAG_FIGCAPTION, TAG_FIGURE, TAG_HR, TAG_LI, TAG_OL, TAG_P, TAG_PRE,
-    TAG_UL, TAG_A, TAG_ABBR, TAG_B, TAG_BDI, TAG_BDO, TAG_BR, TAG_CITE,
-    TAG_CODE, TAG_DATA, TAG_DFN, TAG_EM, TAG_I, TAG_KBD, TAG_MARK, TAG_Q,
-    TAG_RB, TAG_RP, TAG_RT, TAG_RTC, TAG_RUBY, TAG_S, TAG_SAMP, TAG_SMALL,
-    TAG_SPAN, TAG_STRONG, TAG_SUB, TAG_SUP, TAG_TIME, TAG_U, TAG_VAR, TAG_WBR,
-    TAG_AREA, TAG_AUDIO, TAG_IMG, TAG_MAP, TAG_TRACK, TAG_VIDEO,
-    TAG_IFRAME, TAG_OBJECT, TAG_PARAM, TAG_PICTURE, TAG_PORTAL, TAG_SOURCE,
-    TAG_CANVAS, TAG_NOSCRIPT, TAG_SCRIPT, TAG_DEL, TAG_INS, TAG_CAPTION,
-    TAG_COL, TAG_COLGROUP, TAG_TABLE, TAG_TBODY, TAG_TD, TAG_TFOOT, TAG_TH,
-    TAG_THEAD, TAG_TR, TAG_BUTTON, TAG_DATALIST, TAG_FIELDSET, TAG_FORM,
-    TAG_INPUT, TAG_LABEL, TAG_LEGEND, TAG_METER, TAG_OPTGROUP, TAG_OPTION,
+    TAG_UL, TAG_A, TAG_ABBR, TAG_B, TAG_BDI, TAG_BDO, TAG_BR, TAG_NOBR,
+    TAG_CITE, TAG_CODE, TAG_DATA, TAG_DFN, TAG_EM, TAG_EMBED, TAG_I, TAG_KBD,
+    TAG_MARK, TAG_MARQUEE, TAG_Q, TAG_RB, TAG_RP, TAG_RT, TAG_RTC, TAG_RUBY,
+    TAG_S, TAG_SAMP, TAG_SMALL, TAG_SPAN, TAG_STRONG, TAG_SUB, TAG_SUP,
+    TAG_TIME, TAG_U, TAG_VAR, TAG_WBR, TAG_AREA, TAG_AUDIO, TAG_IMG, TAG_IMAGE,
+    TAG_MAP, TAG_TRACK, TAG_VIDEO, TAG_IFRAME, TAG_OBJECT, TAG_PARAM,
+    TAG_PICTURE, TAG_PORTAL, TAG_SOURCE, TAG_CANVAS, TAG_NOSCRIPT, TAG_NOEMBED,
+    TAG_PLAINTEXT, TAG_XMP, TAG_SCRIPT, TAG_DEL, TAG_INS, TAG_CAPTION, TAG_COL,
+    TAG_COLGROUP, TAG_TABLE, TAG_TBODY, TAG_TD, TAG_TFOOT, TAG_TH, TAG_THEAD,
+    TAG_TR, TAG_BUTTON, TAG_DATALIST, TAG_FIELDSET, TAG_FORM, TAG_INPUT,
+    TAG_KEYGEN, TAG_LABEL, TAG_LEGEND, TAG_METER, TAG_OPTGROUP, TAG_OPTION,
     TAG_OUTPUT, TAG_PROGRESS, TAG_SELECT, TAG_TEXTAREA, TAG_DETAILS,
     TAG_DIALOG, TAG_MENU, TAG_SUMMARY, TAG_BLINK, TAG_CENTER, TAG_CONTENT,
-    TAG_DIR, TAG_FONT, TAG_FRAME, TAG_NOFRAMES, TAG_FRAMESET, TAG_STRIKE, TAG_TT
+    TAG_DIR, TAG_FONT, TAG_FRAME, TAG_NOFRAMES, TAG_FRAMESET, TAG_STRIKE,
+    TAG_TT, TAG_TEMPLATE, TAG_SARCASM
 
 func getTagTypeMap(): Table[string, TagType] =
   for i in TagType:
@@ -88,6 +91,54 @@ const PClosingTagTypes* = {
   TAG_TABLE, TAG_UL
 }
 
+const HTagTypes* = {
+  TAG_H1, TAG_H2, TAG_H3, TAG_H4, TAG_H5, TAG_H6
+}
+
 const HeadTagTypes* = {
   TAG_BASE, TAG_LINK, TAG_META, TAG_TITLE, TAG_NOSCRIPT, TAG_SCRIPT, TAG_NOFRAMES, TAG_STYLE, TAG_HEAD
 }
+
+# 4.10.2 Categories
+const FormAssociatedElements* = {
+  TAG_BUTTON, TAG_FIELDSET, TAG_INPUT, TAG_OBJECT, TAG_OUTPUT, TAG_SELECT, TAG_TEXTAREA, TAG_IMG
+}
+
+const ListedElements* = {
+  TAG_BUTTON, TAG_FIELDSET, TAG_INPUT, TAG_OBJECT, TAG_OUTPUT, TAG_SELECT, TAG_TEXTAREA
+}
+
+const SubmittableElements* = {
+  TAG_BUTTON, TAG_INPUT, TAG_SELECT, TAG_TEXTAREA
+}
+
+const ResettableElements* = {
+  TAG_INPUT, TAG_OUTPUT, TAG_SELECT, TAG_TEXTAREA
+}
+
+const AutocapitalizeInheritingElements* = {
+  TAG_BUTTON, TAG_FIELDSET, TAG_INPUT, TAG_OUTPUT, TAG_SELECT, TAG_TEXTAREA
+}
+
+const LabelableElements* = {
+  # input only if type not hidden
+  TAG_BUTTON, TAG_INPUT, TAG_METER, TAG_OUTPUT, TAG_PROGRESS, TAG_SELECT, TAG_TEXTAREA
+}
+
+#https://html.spec.whatwg.org/multipage/parsing.html#the-stack-of-open-elements
+#NOTE MathML not implemented
+#TODO SVG foreignObject, SVG desc, SVG title
+const SpecialElements* = {
+ TAG_ADDRESS, TAG_APPLET, TAG_AREA, TAG_ARTICLE, TAG_ASIDE, TAG_BASE,
+ TAG_BASEFONT, TAG_BGSOUND, TAG_BLOCKQUOTE, TAG_BODY, TAG_BR, TAG_BUTTON,
+ TAG_CAPTION, TAG_CENTER, TAG_COL, TAG_COLGROUP, TAG_DD, TAG_DETAILS, TAG_DIR,
+ TAG_DIV, TAG_DL, TAG_DT, TAG_EMBED, TAG_FIELDSET, TAG_FIGCAPTION, TAG_FIGURE,
+ TAG_FOOTER, TAG_FORM, TAG_FRAME, TAG_FRAMESET, TAG_H1, TAG_H2, TAG_H3, TAG_H4,
+ TAG_H5, TAG_H6, TAG_HEAD, TAG_HEADER, TAG_HGROUP, TAG_HR, TAG_HTML,
+ TAG_IFRAME, TAG_IMG, TAG_INPUT, TAG_KEYGEN, TAG_LI, TAG_LINK, TAG_LISTING,
+ TAG_MAIN, TAG_MARQUEE, TAG_MENU, TAG_META, TAG_NAV, TAG_NOEMBED, TAG_NOFRAMES,
+ TAG_NOSCRIPT, TAG_OBJECT, TAG_OL, TAG_P, TAG_PARAM, TAG_PLAINTEXT, TAG_PRE,
+ TAG_SCRIPT, TAG_SECTION, TAG_SELECT, TAG_SOURCE, TAG_STYLE, TAG_SUMMARY,
+ TAG_TABLE, TAG_TBODY, TAG_TD, TAG_TEMPLATE, TAG_TEXTAREA, TAG_TFOOT, TAG_TH,
+ TAG_THEAD, TAG_TITLE, TAG_TR, TAG_TRACK, TAG_UL, TAG_WBR, TAG_XMP
+}