diff options
author | bptato <nincsnevem662@gmail.com> | 2022-06-27 23:53:44 +0200 |
---|---|---|
committer | bptato <nincsnevem662@gmail.com> | 2022-07-11 21:08:10 +0200 |
commit | 62cba694e47a7a1f4bedc7fd48ceac9c26aa3aa1 (patch) | |
tree | e20a9f39a293c256f707162c46e117d13f3d5621 /src/html | |
parent | 84882cb8a6f9bca58d178a1f2b8fb5cafa8b3a56 (diff) | |
download | chawan-62cba694e47a7a1f4bedc7fd48ceac9c26aa3aa1.tar.gz |
Implement HTML5 parsing
Completely replaced the previous HTML2 (?) parser, which was a bad re-implementation of w3m's parser in the first place. Now we have a (sort of) compliant HTML5 parser. Needs tests, badly.
Diffstat (limited to 'src/html')
-rw-r--r-- | src/html/dom.nim | 436 | ||||
-rw-r--r-- | src/html/htmlparser.nim | 3887 | ||||
-rw-r--r-- | src/html/htmltokenizer.nim | 1525 | ||||
-rw-r--r-- | src/html/tags.nim | 75 |
4 files changed, 3856 insertions, 2067 deletions
diff --git a/src/html/dom.nim b/src/html/dom.nim index aaa33163..6c63e1ad 100644 --- a/src/html/dom.nim +++ b/src/html/dom.nim @@ -1,5 +1,7 @@ import tables import options +import streams +import strformat import strutils import css/values @@ -17,25 +19,33 @@ type FORM_ENCODING_TYPE_MULTIPART = "multipart/form-data", FORM_ENCODING_TYPE_TEXT_PLAIN = "text/plain" + QuirksMode* = enum + NO_QUIRKS, QUIRKS, LIMITED_QUIRKS + + Namespace* = enum + HTML = "http://www.w3.org/1999/xhtml", + MATHML = "http://www.w3.org/1998/Math/MathML", + SVG = "http://www.w3.org/2000/svg", + XLINK = "http://www.w3.org/1999/xlink", + XML = "http://www.w3.org/XML/1998/namespace", + XMLNS = "http://www.w3.org/2000/xmlns/" + type - EventTarget* = ref EventTargetObj - EventTargetObj = object of RootObj + EventTarget* = ref object of RootObj - Node* = ref NodeObj - NodeObj = object of EventTargetObj + Node* = ref object of EventTarget nodeType*: NodeType childNodes*: seq[Node] - children*: seq[Element] isConnected*: bool nextSibling*: Node previousSibling*: Node parentNode*: Node parentElement*: Element - ownerDocument*: Document + rootNode: Node + document*: Document uid*: int # Unique id - Attr* = ref AttrObj - AttrObj = object of NodeObj + Attr* = ref object of Node namespaceURI*: string prefix*: string localName*: string @@ -43,38 +53,40 @@ type value*: string ownerElement*: Element - Document* = ref DocumentObj - DocumentObj = object of NodeObj + Document* = ref object of Node location*: Url type_elements*: array[TagType, seq[Element]] - id_elements*: Table[string, seq[Element]] class_elements*: Table[string, seq[Element]] all_elements*: seq[Element] - head*: HTMLElement - body*: HTMLElement - root*: Element + mode*: QuirksMode + + parser_cannot_change_the_mode_flag*: bool + is_iframe_srcdoc*: bool - CharacterData* = ref CharacterDataObj - CharacterDataObj = object of NodeObj + CharacterData* = ref object of Node data*: string length*: int - Text* = ref TextObj - TextObj = object of 
CharacterDataObj + Text* = ref object of CharacterData wholeText*: string - Comment* = ref CommentObj - CommentObj = object of CharacterDataObj + Comment* = ref object of CharacterData - Element* = ref ElementObj - ElementObj = object of NodeObj - namespaceURI*: string + DocumentFragment* = ref object of Node + host*: Element + + DocumentType* = ref object of Node + name*: string + publicId*: string + systemId*: string + + Element* = ref object of Node + namespace*: Namespace + namespacePrefix*: Option[string] #TODO namespaces prefix*: string localName*: string - tagName*: string tagType*: TagType - sheets*: seq[CSSStylesheet] id*: string classList*: seq[string] attributes*: Table[string, string] @@ -84,7 +96,7 @@ type cssapplied*: bool rendered*: bool - HTMLElement* = ref object of ElementObj + HTMLElement* = ref object of Element HTMLInputElement* = ref object of HTMLElement inputType*: InputType @@ -131,6 +143,8 @@ type ordinalvalue*: int HTMLStyleElement* = ref object of HTMLElement + sheet*: CSSStylesheet + sheet_invalid*: bool HTMLLinkElement* = ref object of HTMLElement href*: string @@ -145,6 +159,22 @@ type constructingentrylist*: bool inputs*: seq[HTMLInputElement] + HTMLTemplateElement* = ref object of HTMLElement + content*: DocumentFragment + + HTMLUnknownElement* = ref object of HTMLElement + + HTMLScriptElement* = ref object of HTMLElement + parserDocument*: Document + preparationTimeDocument*: Document + forceAsync*: bool + fromAnExternalFile*: bool + readyToBeParser*: bool + alreadyStarted*: bool + delayingTheLoadEvent*: bool + ctype*: bool + #TODO result + # For debugging func `$`*(node: Node): string = case node.nodeType @@ -177,7 +207,7 @@ iterator radiogroup*(input: HTMLInputElement): HTMLInputElement {.inline.} = for input in input.form.radiogroup: yield input else: - for input in input.ownerDocument.radiogroup: + for input in input.document.radiogroup: yield input iterator textNodes*(node: Node): Text {.inline.} = @@ -197,7 +227,68 @@ 
iterator branch*(node: Node): Node {.inline.} = var node = node while node != nil: yield node - node = node.parentElement + node = node.parentNode + +iterator children*(node: Node): Element {.inline.} = + for child in node.childNodes: + if child.nodeType == ELEMENT_NODE: + yield Element(child) + +func qualifiedName*(element: Element): string = + if element.namespacePrefix.issome: element.namespacePrefix.get & ':' & element.localName + else: element.localName + +func html*(document: Document): HTMLElement = + for element in document.children: + if element.tagType == TAG_HTML: + return HTMLElement(element) + return nil + +func head*(document: Document): HTMLElement = + if document.html != nil: + for element in document.html.children: + if element.tagType == TAG_HEAD: + return HTMLElement(element) + return nil + +func body*(document: Document): HTMLElement = + if document.html != nil: + for element in document.html.children: + if element.tagType == TAG_BODY: + return HTMLElement(element) + return nil + + +func countChildren(node: Node, nodeType: NodeType): int = + for child in node.childNodes: + if child.nodeType == nodeType: + inc result + +func hasChild(node: Node, nodeType: NodeType): bool = + for child in node.childNodes: + if child.nodeType == nodeType: + return false + +func hasNextSibling(node: Node, nodeType: NodeType): bool = + var node = node.nextSibling + while node != nil: + if node.nodeType == nodeType: return true + node = node.nextSibling + return false + +func hasPreviousSibling(node: Node, nodeType: NodeType): bool = + var node = node.previousSibling + while node != nil: + if node.nodeType == nodeType: return true + node = node.previousSibling + return false + +func inSameTree*(a, b: Node): bool = + a.rootNode == b.rootNode and (a.rootNode != nil or b.rootNode != nil) + +func children*(node: Node): seq[Element] = + for child in node.children: + result.add(child) func filterDescendants*(element: Element, predicate: (proc(child: Element): bool)): 
seq[Element] = var stack: seq[Element] @@ -231,7 +322,7 @@ func firstChild(node: Node): Node = return nil return node.childNodes[0] -func lastChild(node: Node): Node = +func lastChild*(node: Node): Node = if node.childNodes.len == 0: return nil return node.childNodes[^1] @@ -262,27 +353,6 @@ func nextElementSibling*(elem: Element): Element = e = e.nextSibling return nil -func isTextNode*(node: Node): bool = - return node.nodeType == TEXT_NODE - -func isElemNode*(node: Node): bool = - return node.nodeType == ELEMENT_NODE - -func isComment*(node: Node): bool = - return node.nodeType == COMMENT_NODE - -func isCData*(node: Node): bool = - return node.nodeType == CDATA_SECTION_NODE - -func isDocument*(node: Node): bool = - return node.nodeType == DOCUMENT_NODE - -func firstNode*(node: Node): bool = - return node.parentElement != nil and node.parentElement.childNodes[0] == node - -func lastNode*(node: Node): bool = - return node.parentElement != nil and node.parentElement.childNodes[^1] == node - func attr*(element: Element, s: string): string = return element.attributes.getOrDefault(s, "") @@ -309,31 +379,13 @@ func textContent*(node: Node): string = if child.nodeType != COMMENT_NODE: result &= child.textContent -func toInputType*(str: string): InputType = - case str - of "button": INPUT_BUTTON - of "checkbox": INPUT_CHECKBOX - of "color": INPUT_COLOR - of "date": INPUT_DATE - of "datetime_local": INPUT_DATETIME_LOCAL - of "email": INPUT_EMAIL - of "file": INPUT_FILE - of "hidden": INPUT_HIDDEN - of "image": INPUT_IMAGE - of "month": INPUT_MONTH - of "number": INPUT_NUMBER - of "password": INPUT_PASSWORD - of "radio": INPUT_RADIO - of "range": INPUT_RANGE - of "reset": INPUT_RESET - of "search": INPUT_SEARCH - of "submit": INPUT_SUBMIT - of "tel": INPUT_TEL - of "text": INPUT_TEXT - of "time": INPUT_TIME - of "url": INPUT_URL - of "week": INPUT_WEEK - else: INPUT_UNKNOWN +proc sheets*(element: Element): seq[CSSStylesheet] = + for child in element.children: + if 
child.tagType == TAG_STYLE: + let child = HTMLStyleElement(child) + if child.sheet_invalid: + child.sheet = parseStylesheet(newStringStream(child.textContent)) + result.add(child.sheet) func inputString*(input: HTMLInputElement): string = var text = case input.inputType @@ -431,7 +483,7 @@ func formmethod*(element: Element): FormMethod = func target*(element: Element): string = if element.attrb("target"): return element.attr("target") - for base in element.ownerDocument.elements(TAG_BASE): + for base in element.document.elements(TAG_BASE): if base.attrb("target"): return base.attr("target") return "" @@ -442,15 +494,27 @@ func findAncestor*(node: Node, tagTypes: set[TagType]): Element = return element return nil -func newText*(): Text = +func newText*(document: Document, data: string = ""): Text = new(result) result.nodeType = TEXT_NODE + result.document = document + result.data = data + result.rootNode = result -func newComment*(): Comment = +func newComment*(document: Document, data: string = ""): Comment = new(result) result.nodeType = COMMENT_NODE + result.document = document + result.data = data + result.rootNode = result + +func namespace(s: string): Option[Namespace] = + for n in Namespace: + if s == $n: + return some(n) -func newHtmlElement*(document: Document, tagType: TagType): HTMLElement = +# note: we do not implement custom elements +func newHTMLElement*(document: Document, tagType: TagType, namespace = Namespace.HTML, prefix = Option[string]): HTMLElement = case tagType of TAG_INPUT: result = new(HTMLInputElement) @@ -478,10 +542,19 @@ func newHtmlElement*(document: Document, tagType: TagType): HTMLElement = result = new(HTMLLIElement) of TAG_STYLE: result = new(HTMLStyleElement) + HTMLStyleElement(result).sheet_invalid = true of TAG_LINK: result = new(HTMLLinkElement) of TAG_FORM: result = new(HTMLFormElement) + of TAG_TEMPLATE: + result = new(HTMLTemplateElement) + HTMLTemplateElement(result).content = DocumentFragment(document: document, host: 
result) + of TAG_UNKNOWN: + result = new(HTMLUnknownElement) + of TAG_SCRIPT: + result = new(HTMLScriptElement) + HTMLScriptElement(result).forceAsync = true else: result = new(HTMLElement) @@ -489,26 +562,63 @@ func newHtmlElement*(document: Document, tagType: TagType): HTMLElement = result.tagType = tagType result.css = rootProperties() result.uid = document.all_elements.len + result.rootNode = result + result.document = document document.all_elements.add(result) +func newHTMLElement*(document: Document, localName: string, namespace = "", prefix = none[string](), tagType = tagType(localName)): Element = + result = document.newHTMLElement(tagType, namespace(namespace).get(HTML)) + result.namespacePrefix = prefix + func newDocument*(): Document = new(result) - result.root = result.newHtmlElement(TAG_HTML) - result.head = result.newHtmlElement(TAG_HEAD) - result.body = result.newHtmlElement(TAG_BODY) result.nodeType = DOCUMENT_NODE + result.rootNode = result + result.document = result + +func newDocumentType*(document: Document, name: string, publicId = "", systemId = ""): DocumentType = + new(result) + result.document = document + result.name = name + result.publicId = publicId + result.systemId = systemId + result.rootNode = result func newAttr*(parent: Element, key, value: string): Attr = new(result) + result.document = parent.document result.nodeType = ATTRIBUTE_NODE result.ownerElement = parent result.name = key result.value = value + result.rootNode = result +#TODO optimize? func getElementById*(document: Document, id: string): Element = - if id.len == 0 or id notin document.id_elements: + if id.len == 0: return nil - return document.id_elements[id][0] + var stack = document.children + while stack.len > 0: + let element = stack.pop() + if element.id == id: + return element + for i in countdown(element.childNodes.high, 0): + let child = element.childNodes[i] + if child.nodeType == ELEMENT_NODE: + stack.add(Element(child)) + return nil + +#TODO optimize? 
+func getElementsByTag*(document: Document, tag: TagType): seq[Element] = + var stack = document.children + while stack.len > 0: + let element = stack.pop() + if element.tagType == tag: + result.add(element) + for i in countdown(element.childNodes.high, 0): + let child = element.childNodes[i] + if child.nodeType == ELEMENT_NODE: + stack.add(Element(child)) func baseUrl*(document: Document): Url = var href = "" @@ -522,8 +632,139 @@ func baseUrl*(document: Document): Url = return document.location return url.get -func getElementsByTag*(document: Document, tag: TagType): seq[Element] = - return document.type_elements[tag] +func inHTMLNamespace*(element: Element): bool = element.namespace == Namespace.HTML +func inMathMLNamespace*(element: Element): bool = element.namespace == Namespace.MATHML +func inSVGNamespace*(element: Element): bool = element.namespace == Namespace.SVG +func inXLinkNamespace*(element: Element): bool = element.namespace == Namespace.XLINK +func inXMLNamespace*(element: Element): bool = element.namespace == Namespace.XML +func inXMLNSNamespace*(element: Element): bool = element.namespace == Namespace.XMLNS + +func isResettable*(element: Element): bool = + return element.tagType in {TAG_INPUT, TAG_OUTPUT, TAG_SELECT, TAG_TEXTAREA} + +func isHostIncludingInclusiveAncestor*(a, b: Node): bool = + for parent in b.branch: + if parent == a: + return true + if b.rootNode.nodeType == DOCUMENT_FRAGMENT_NODE and DocumentFragment(b.rootNode).host != nil: + for parent in b.rootNode.branch: + if parent == a: + return true + return false + +# WARNING the ordering of the arguments in the standard is whack so this doesn't match it +func preInsertionValidity*(parent, node, before: Node): bool = + if parent.nodeType notin {DOCUMENT_NODE, DOCUMENT_FRAGMENT_NODE, ELEMENT_NODE}: + # HierarchyRequestError + return false + if node.isHostIncludingInclusiveAncestor(parent): + # HierarchyRequestError + return false + if before != nil and before.parentNode != parent: + # 
NotFoundError + return false + if node.nodeType notin {DOCUMENT_FRAGMENT_NODE, DOCUMENT_TYPE_NODE, ELEMENT_NODE, CDATA_SECTION_NODE}: + # HierarchyRequestError + return false + if (node.nodeType == TEXT_NODE and parent.nodeType == DOCUMENT_NODE) or + (node.nodeType == DOCUMENT_TYPE_NODE and parent.nodeType != DOCUMENT_NODE): + # HierarchyRequestError + return false + if parent.nodeType == DOCUMENT_NODE: + case node.nodeType + of DOCUMENT_FRAGMENT_NODE: + let elems = node.countChildren(ELEMENT_NODE) + if elems > 1 or node.hasChild(TEXT_NODE): + # HierarchyRequestError + return false + elif elems == 1 and (parent.hasChild(ELEMENT_NODE) or before != nil and (before.nodeType == DOCUMENT_TYPE_NODE or before.hasNextSibling(DOCUMENT_TYPE_NODE))): + # HierarchyRequestError + return false + of ELEMENT_NODE: + if parent.hasChild(ELEMENT_NODE) or before != nil and (before.nodeType == DOCUMENT_TYPE_NODE or before.hasNextSibling(DOCUMENT_TYPE_NODE)): + # HierarchyRequestError + return false + of DOCUMENT_TYPE_NODE: + if parent.hasChild(DOCUMENT_TYPE_NODE) or before != nil and before.hasPreviousSibling(ELEMENT_NODE) or before == nil and parent.hasChild(ELEMENT_NODE): + # HierarchyRequestError + return false + else: discard + return true # no exception reached + +proc remove*(node: Node) = + let parent = node.parentNode + assert parent != nil + let index = parent.childNodes.find(node) + assert index != -1 + #TODO live ranges + #TODO NodeIterator + let oldPreviousSibling = node.previousSibling + let oldNextSibling = node.nextSibling + parent.childNodes.del(index) + if oldPreviousSibling != nil: + oldPreviousSibling.nextSibling = oldNextSibling + if oldNextSibling != nil: + oldNextSibling.previousSibling = oldPreviousSibling + + #TODO assigned, shadow root, shadow root again, custom nodes, registered observers + #TODO not surpress observers => queue tree mutation record + +proc adopt(document: Document, node: Node) = + let oldDocument = node.document + if node.parentNode != nil: + 
remove(node) + +proc applyChildInsert(parent, child: Node, index: int) = + if parent.rootNode != nil: + child.rootNode = parent.rootNode + else: + child.rootNode = parent + child.parentNode = parent + if parent.nodeType == ELEMENT_NODE: + child.parentElement = Element(parent) + if index - 1 >= 0: + child.previousSibling = parent.childNodes[index - 1] + child.previousSibling.nextSibling = child + if index + 1 < parent.childNodes.len: + child.nextSibling = parent.childNodes[index + 1] + child.nextSibling.previousSibling = child + +# WARNING ditto +proc insert*(parent, node, before: Node) = + let nodes = if node.nodeType == DOCUMENT_FRAGMENT_NODE: node.childNodes + else: @[node] + let count = nodes.len + if count == 0: + return + if node.nodeType == DOCUMENT_FRAGMENT_NODE: + for child in node.childNodes: + child.remove() + #TODO tree mutation record + if before != nil: + #TODO live ranges + discard + let previousSibling = if before == nil: parent.lastChild + else: before.previousSibling + for node in nodes: + parent.document.adopt(node) + if before == nil: + parent.childNodes.add(node) + parent.applyChildInsert(node, parent.childNodes.high) + else: + let index = parent.childNodes.find(before) + parent.childNodes.insert(node, index) + parent.applyChildInsert(node, index) + #TODO shadow root + +# WARNING ditto +proc preInsert*(parent, node, before: Node) = + if parent.preInsertionValidity(node, before): + let referenceChild = if before == node: node.nextSibling + else: before + parent.insert(node, referenceChild) + +proc append*(parent, node: Node) = + parent.preInsert(node, nil) proc applyOrdinal*(elem: HTMLLIElement) = let val = elem.attri("value") @@ -549,8 +790,10 @@ proc applyOrdinal*(elem: HTMLLIElement) = inc menu.ordinalcounter else: discard -proc reset*(form: HTMLFormElement) = - for input in form.inputs: +proc reset*(element: Element) = + case element.tagType + of TAG_INPUT: + let input = HTMLInputELement(element) case input.inputType of INPUT_SEARCH, 
INPUT_TEXT, INPUT_PASSWORD: input.value = input.attr("value") @@ -560,3 +803,20 @@ proc reset*(form: HTMLFormElement) = input.file = none(Url) else: discard input.rendered = false + else: discard + +proc reset*(form: HTMLFormElement) = + for input in form.inputs: + input.reset() + input.rendered = false + +proc appendAttribute*(element: Element, k, v: string) = + element.attributes[k] = v + +proc setForm*(element: Element, form: HTMLFormElement) = + case element.tagType + of TAG_INPUT: + HTMLInputElement(element).form = form + of TAG_BUTTON, TAG_FIELDSET, TAG_OBJECT, TAG_OUTPUT, TAG_SELECT, TAG_TEXTAREA, TAG_IMG: + discard #TODO + else: assert false diff --git a/src/html/htmlparser.nim b/src/html/htmlparser.nim index d5d5effe..3e962495 100644 --- a/src/html/htmlparser.nim +++ b/src/html/htmlparser.nim @@ -1,2009 +1,1962 @@ -import streams -import unicode -import strutils -import tables -import json import macros import options +import sequtils +import streams import strformat +import tables +import unicode import utils/twtstr -import utils/radixtree import html/dom -import html/entity import html/tags +import html/htmltokenizer import css/sheet type - HTMLParseState = object - in_comment: bool - in_script: bool - in_style: bool - in_noscript: bool - in_body: bool - skip_lf: bool - elementNode: Element - textNode: Text - commentNode: Comment + HTML5Parser = object + case fragment: bool + of true: ctx: Element + else: discard + openElements: seq[Element] + insertionMode: InsertionMode + oldInsertionMode: InsertionMode + templateModes: seq[InsertionMode] + head: Element + tokenizer: Tokenizer document: Document - formowners: seq[HTMLFormElement] + form: HTMLFormElement + fosterParenting: bool + scripting: bool + activeFormatting: seq[(Element, Token)] # nil => marker + framesetok: bool + ignoreLF: bool + pendingTableChars: string + pendingTableCharsWhitespace: bool + + AdjustedInsertionLocation = tuple[inside: Node, before: Node] + +# 13.2.4.1 + InsertionMode = enum + 
INITIAL, BEFORE_HTML, BEFORE_HEAD, IN_HEAD, IN_HEAD_NOSCRIPT, AFTER_HEAD, + IN_BODY, TEXT, IN_TABLE, IN_TABLE_TEXT, IN_CAPTION, IN_COLUMN_GROUP, + IN_TABLE_BODY, IN_ROW, IN_CELL, IN_SELECT, IN_SELECT_IN_TABLE, IN_TEMPLATE, + AFTER_BODY, IN_FRAMESET, AFTER_FRAMESET, AFTER_AFTER_BODY, + AFTER_AFTER_FRAMESET + +proc resetInsertionMode(parser: var HTML5Parser) = + template switch_insertion_mode_and_return(mode: InsertionMode) = + parser.insertionMode = mode + return + for i in countdown(parser.openElements.high, 0): + var node = parser.openElements[i] + let last = i == 0 + if parser.fragment: + node = parser.ctx + if node.tagType == TAG_SELECT: + if not last: + var ancestor = node + for j in countdown(parser.openElements.high, 1): + let ancestor = parser.openElements[j] + case ancestor.tagType + of TAG_TEMPLATE: break + of TAG_TABLE: switch_insertion_mode_and_return IN_SELECT_IN_TABLE + else: discard + switch_insertion_mode_and_return IN_SELECT + case node.tagType + of TAG_TD, TAG_TH: + if not last: + switch_insertion_mode_and_return IN_CELL + of TAG_TR: switch_insertion_mode_and_return IN_ROW + of TAG_TBODY, TAG_THEAD, TAG_TFOOT: switch_insertion_mode_and_return IN_CAPTION + of TAG_COLGROUP: switch_insertion_mode_and_return IN_COLUMN_GROUP + of TAG_TABLE: switch_insertion_mode_and_return IN_TABLE + of TAG_TEMPLATE: switch_insertion_mode_and_return parser.templateModes[^1] + of TAG_HEAD: + if not last: + switch_insertion_mode_and_return IN_HEAD + of TAG_BODY: switch_insertion_mode_and_return IN_BODY + of TAG_FRAMESET: switch_insertion_mode_and_return IN_FRAMESET + of TAG_HTML: + if parser.head != nil: + switch_insertion_mode_and_return BEFORE_HEAD + else: + switch_insertion_mode_and_return AFTER_HEAD + else: discard + if last: + switch_insertion_mode_and_return IN_BODY -# Tokenizer -type - Tokenizer = object - state: TokenizerState - rstate: TokenizerState - curr: Rune - tmp: string - code: int - tok: Token - laststart: Token - attrn: string - attrv: string - attr: 
bool - - istream: Stream - sbuf: string - sbuf_i: int - sbuf_ip: int - eof_i: int - - TokenType = enum - DOCTYPE, START_TAG, END_TAG, COMMENT, CHARACTER, EOF - - TokenizerState = enum - DATA, CHARACTER_REFERENCE, TAG_OPEN, RCDATA, RCDATA_LESS_THAN_SIGN, - RAWTEXT, RAWTEXT_LESS_THAN_SIGN, SCRIPT_DATA, SCRIPT_DATA_LESS_THAN_SIGN, - PLAINTEXT, MARKUP_DECLARATION_OPEN, END_TAG_OPEN, BOGUS_COMMENT, TAG_NAME, - BEFORE_ATTRIBUTE_NAME, RCDATA_END_TAG_OPEN, RCDATA_END_TAG_NAME, - RAWTEXT_END_TAG_OPEN, RAWTEXT_END_TAG_NAME, SELF_CLOSING_START_TAG, - SCRIPT_DATA_END_TAG_OPEN, SCRIPT_DATA_ESCAPE_START, - SCRIPT_DATA_END_TAG_NAME, SCRIPT_DATA_ESCAPE_START_DASH, - SCRIPT_DATA_ESCAPED_DASH_DASH, SCRIPT_DATA_ESCAPED, - SCRIPT_DATA_ESCAPED_DASH, SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN, - SCRIPT_DATA_ESCAPED_END_TAG_OPEN, SCRIPT_DATA_DOUBLE_ESCAPE_START, - SCRIPT_DATA_ESCAPED_END_TAG_NAME, SCRIPT_DATA_DOUBLE_ESCAPED, - SCRIPT_DATA_DOUBLE_ESCAPED_DASH, SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN, - SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH, SCRIPT_DATA_DOUBLE_ESCAPE_END, - AFTER_ATTRIBUTE_NAME, ATTRIBUTE_NAME, BEFORE_ATTRIBUTE_VALUE, - ATTRIBUTE_VALUE_DOUBLE_QUOTED, ATTRIBUTE_VALUE_SINGLE_QUOTED, - ATTRIBUTE_VALUE_UNQUOTED, AFTER_ATTRIBUTE_VALUE_QUOTED, COMMENT_START, - CDATA_SECTION, COMMENT_START_DASH, COMMENT, COMMENT_END, - COMMENT_LESS_THAN_SIGN, COMMENT_END_DASH, COMMENT_LESS_THAN_SIGN_BANG, - COMMENT_LESS_THAN_SIGN_BANG_DASH, COMMENT_LESS_THAN_SIGN_BANG_DASH_DASH, - COMMENT_END_BANG, DOCTYPE, BEFORE_DOCTYPE_NAME, DOCTYPE_NAME, - AFTER_DOCTYPE_NAME, AFTER_DOCTYPE_PUBLIC_KEYWORD, - AFTER_DOCTYPE_SYSTEM_KEYWORD, BOGUS_DOCTYPE, - BEFORE_DOCTYPE_PUBLIC_IDENTIFIER, DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED, - DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED, AFTER_DOCTYPE_PUBLIC_IDENTIFIER, - BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS, - DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED, - DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED, BEFORE_DOCTYPE_SYSTEM_IDENTIFIER, - AFTER_DOCTYPE_SYSTEM_IDENTIFIER, 
CDATA_SECTION_BRACKET, CDATA_SECTION_END, - NAMED_CHARACTER_REFERENCE, NUMERIC_CHARACTER_REFERENCE, - AMBIGUOUS_AMPERSAND_STATE, HEXADECIMAL_CHARACTER_REFERENCE_START, - DECIMAL_CHARACTER_REFERENCE_START, HEXADECIMAL_CHARACTER_REFERENCE, - DECIMAL_CHARACTER_REFERENCE, NUMERIC_CHARACTER_REFERENCE_END - - Token = ref object - case t: TokenType - of DOCTYPE: - name: Option[string] - pubid: Option[string] - sysid: Option[string] - quirks: bool - of START_TAG, END_TAG: - tagname: string - selfclosing: bool - attrs: Table[string, string] - of CHARACTER: - r: Rune - of COMMENT: - data: string - of EOF: discard - -func `$`*(tok: Token): string = - case tok.t - of DOCTYPE: fmt"{tok.t} {tok.name} {tok.pubid} {tok.sysid} {tok.quirks}" - of START_TAG, END_TAG: fmt"{tok.t} {tok.tagname} {tok.selfclosing} {tok.attrs}" - of CHARACTER: fmt"{tok.t} {tok.r}" - of COMMENT: fmt"{tok.t} {tok.data}" - of EOF: fmt"{tok.t}" - -const bufSize = 512 -const copyBufSize = 16 -proc newTokenizer(s: Stream): Tokenizer = - result.sbuf = newString(bufSize) - result.istream = s - if result.istream.atEnd: - result.eof_i = 0 +func currentNode(parser: HTML5Parser): Element = + if parser.openElements.len == 0: + assert false else: - let n = s.readDataStr(result.sbuf, 0..bufSize-1) - result.eof_i = n - -func atEof(t: Tokenizer): bool = - t.eof_i != -1 and t.sbuf_i >= t.eof_i - -proc consume(t: var Tokenizer): char {.inline.} = - if t.sbuf_i >= bufSize-copyBufSize: - var sbuf2 = newString(copyBufSize) - var i = 0 - while t.sbuf_i + i < bufSize: - sbuf2[i] = t.sbuf[t.sbuf_i + i] - inc i - let n = t.istream.readDataStr(t.sbuf, i..bufSize-1) - if n != bufSize - i: - t.eof_i = i + n - t.sbuf_i = 0 - - var j = 0 - while j < i: - t.sbuf[j] = sbuf2[j] - inc j - - assert t.eof_i == -1 or t.sbuf_i < t.eof_i # not consuming eof... 
- t.sbuf_ip = t.sbuf_i # save previous pointer for potential reconsume - - # Normalize newlines (\r\n -> \n, single \r -> \n) - if t.sbuf[t.sbuf_i] == '\r': - inc t.sbuf_i - if t.sbuf[t.sbuf_i] != '\n': - # \r - result = '\n' - t.curr = Rune('\n') - return - # else, \r\n so just return the \n - - result = t.sbuf[t.sbuf_i] - fastRuneAt(t.sbuf, t.sbuf_i, t.curr) - -proc reconsume(t: var Tokenizer) = - t.sbuf_i = t.sbuf_ip - -iterator tokenize(tokenizer: var Tokenizer): Token = - template emit(tok: Token) = - if tok.t == START_TAG: - tokenizer.laststart = tok - yield tok - template emit(tok: TokenType) = emit Token(t: tok) - template emit(rn: Rune) = emit Token(t: CHARACTER, r: rn) - template emit(ch: char) = emit Token(t: CHARACTER, r: Rune(ch)) - template emit_eof = - emit EOF - break - template emit_tok = - if tokenizer.attr: - tokenizer.tok.attrs[tokenizer.attrn] = tokenizer.attrv - emit tokenizer.tok - template emit_current = - if tokenizer.atEof: - emit_eof - else: - emit Token(t: CHARACTER, r: tokenizer.curr) - template emit_replacement = emit Rune(0xFFFD) - template switch_state(s: TokenizerState) = - tokenizer.state = s - template switch_state_return(s: TokenizerState) = - tokenizer.rstate = tokenizer.state - tokenizer.state = s - template reconsume_in(s: TokenizerState) = - tokenizer.reconsume() - switch_state s - template parse_error(error: untyped) = discard # does nothing for now... TODO? 
- template is_appropriate_end_tag_token(): bool = - tokenizer.laststart != nil and tokenizer.laststart.data == tokenizer.tok.data - template start_new_attribute = - if tokenizer.attr: - tokenizer.tok.attrs[tokenizer.attrn] = tokenizer.attrv - tokenizer.attrn = "" - tokenizer.attrv = "" - tokenizer.attr = true - template leave_attribute_name_state = - if tokenizer.attrn in tokenizer.tok.attrs: - tokenizer.attr = false - template append_to_current_attr_value(c: typed) = - if tokenizer.attr: - tokenizer.attrv &= c - template peek_str(s: string): bool = - # WARNING: will break on strings with copyBufSize + 4 bytes - assert s.len < copyBufSize - 4 and s.len > 0 - if tokenizer.sbuf_i + s.len > tokenizer.eof_i: - false - else: - let slice = tokenizer.sbuf[tokenizer.sbuf_i..tokenizer.sbuf_i+s.high] - s == slice - template peek_str_nocase(s: string): bool = - # WARNING: will break on strings with copyBufSize + 4 bytes - # WARNING: only works with UPPER CASE ascii - assert s.len < copyBufSize - 4 and s.len > 0 - if tokenizer.sbuf_i + s.len > tokenizer.eof_i: - false - else: - let slice = tokenizer.sbuf[tokenizer.sbuf_i..tokenizer.sbuf_i+s.high] - s == slice.toUpperAscii() - template peek_char(): char = tokenizer.sbuf[tokenizer.sbuf_i] - template has_adjusted_current_node(): bool = false #TODO implement this - template consume_and_discard(n: int) = #TODO optimize - var i = 0 - while i < n: - discard tokenizer.consume() - inc i - template consumed_as_an_attribute(): bool = - tokenizer.rstate in {ATTRIBUTE_VALUE_DOUBLE_QUOTED, ATTRIBUTE_VALUE_SINGLE_QUOTED, ATTRIBUTE_VALUE_UNQUOTED} - template flush_code_points_consumed_as_a_character_reference() = - if consumed_as_an_attribute: - append_to_current_attr_value tokenizer.tmp - else: - for r in tokenizer.tmp.runes: - emit r - template new_token(t: Token) = - if tokenizer.attr: - tokenizer.attr = false - tokenizer.tok = t - - # Fake EOF as an actual character. Also replace anything_else with the else - # branch. 
- # Yes this is kind of ugly but it works and I'm too lazy to come up with - # anything better. - macro stateMachine(states: varargs[untyped]): untyped = - var maincase = newNimNode(nnkCaseStmt).add(quote do: tokenizer.state) - for state in states: - if state.kind == nnkOfBranch: - let mainstmtlist = findChild(state, it.kind == nnkStmtList) - if mainstmtlist[0].kind == nnkIdent and mainstmtlist[0].strVal == "ignore_eof": - maincase.add(state) - continue - - var hasanythingelse = false - if mainstmtlist[0].kind == nnkIdent and mainstmtlist[0].strVal == "has_anything_else": - hasanythingelse = true - - let childcase = findChild(mainstmtlist, it.kind == nnkCaseStmt) - var haseof = false - var eofstmts: NimNode - var elsestmts: NimNode - - for i in countdown(childcase.len-1, 0): - let childof = childcase[i] - if childof.kind == nnkOfBranch: - for j in countdown(childof.len-1, 0): - if childof[j].kind == nnkIdent and childof[j].strVal == "eof": - haseof = true - eofstmts = childof.findChild(it.kind == nnkStmtList) - if childof.findChild(it.kind == nnkIdent and it.strVal != "eof") != nil: - childof.del(j) - else: - childcase.del(i) - elif childof.kind == nnkElse: - elsestmts = childof.findChild(it.kind == nnkStmtList) - - if not haseof: - eofstmts = elsestmts - let fake_eof = quote do: - if tokenizer.atEof: - `eofstmts` - continue - mainstmtlist.insert(0, fake_eof) - if hasanythingelse: - let fake_anything_else = quote do: - template anything_else = - `elsestmts` - mainstmtlist.insert(0, fake_anything_else) - maincase.add(state) - result = newNimNode(nnkStmtList) - result.add(maincase) - - template ignore_eof = discard # does nothing - template has_anything_else = discard # does nothing - - const null = char(0) - const whitespace = {'\t', '\n', '\f', ' '} + return parser.openElements[^1] + +func adjustedCurrentNode(parser: HTML5Parser): Element = + if parser.fragment: parser.ctx + else: parser.currentNode + +template parse_error() = discard + +func 
lastElementOfTag(parser: HTML5Parser, tagType: TagType): tuple[element: Element, pos: int] = + for i in countdown(parser.openElements.high, 0): + if parser.openElements[i].tagType == tagType: + return (parser.openElements[i], i) + return (nil, -1) + +template last_child_of(n: Node): AdjustedInsertionLocation = + (n, nil) + +# 13.2.6.1 +func appropriatePlaceForInsert(parser: HTML5Parser, target: Element): AdjustedInsertionLocation = + assert parser.openElements[0].tagType == TAG_HTML + if parser.fosterParenting and target.tagType in {TAG_TABLE, TAG_TBODY, TAG_TFOOT, TAG_THEAD, TAG_TR}: + let lastTemplate = parser.lastElementOfTag(TAG_TEMPLATE) + let lastTable = parser.lastElementOfTag(TAG_TABLE) + if lastTemplate.element != nil and (lastTable.element == nil or lastTable.pos < lastTemplate.pos): + return last_child_of(HTMLTemplateElement(lastTemplate.element).content) + if lastTable.element == nil: + return last_child_of(parser.openElements[0]) + if lastTable.element.parentNode != nil: + return (lastTable.element.parentNode, lastTable.element) + let previousElement = parser.openElements[lastTable.pos - 1] + result = last_child_of(previousElement) + else: + result = last_child_of(target) + if result.inside.nodeType == ELEMENT_NODE and Element(result.inside).tagType == TAG_TEMPLATE: + result = (HTMLTemplateElement(result.inside).content, nil) + +func appropriatePlaceForInsert(parser: HTML5Parser): AdjustedInsertionLocation = + parser.appropriatePlaceForInsert(parser.currentNode) + +func hasElement(elements: seq[Element], tag: TagType): bool = + for element in elements: + if element.tagType == tag: + return true + return false + +func hasElementInSpecificScope(elements: seq[Element], target: Element, list: set[TagType]): bool = + for i in countdown(elements.high, 0): + if elements[i] == target: + return true + if elements[i].tagType in list: + return false + assert false + +func hasElementInSpecificScope(elements: seq[Element], target: TagType, list: set[TagType]): bool 
= + for i in countdown(elements.high, 0): + if elements[i].tagType == target: + return true + if elements[i].tagType in list: + return false + assert false + +func hasElementInSpecificScope(elements: seq[Element], target: set[TagType], list: set[TagType]): bool = + for i in countdown(elements.high, 0): + if elements[i].tagType in target: + return true + if elements[i].tagType in list: + return false + assert false + +const Scope = {TAG_APPLET, TAG_CAPTION, TAG_HTML, TAG_TABLE, TAG_TD, TAG_TH, + TAG_MARQUEE, TAG_OBJECT, TAG_TEMPLATE} #TODO SVG (NOTE MathML not implemented) +func hasElementInScope(elements: seq[Element], target: TagType): bool = + return elements.hasElementInSpecificScope(target, Scope) + +func hasElementInScope(elements: seq[Element], target: set[TagType]): bool = + return elements.hasElementInSpecificScope(target, Scope) + +func hasElementInScope(elements: seq[Element], target: Element): bool = + return elements.hasElementInSpecificScope(target, Scope) + +func hasElementInListItemScope(elements: seq[Element], target: TagType): bool = + return elements.hasElementInSpecificScope(target, Scope + {TAG_OL, TAG_UL}) + +func hasElementInButtonScope(elements: seq[Element], target: TagType): bool = + return elements.hasElementInSpecificScope(target, Scope + {TAG_BUTTON}) + +func hasElementInTableScope(elements: seq[Element], target: TagType): bool = + return elements.hasElementInSpecificScope(target, {TAG_HTML, TAG_TABLE, TAG_TEMPLATE}) + +func hasElementInTableScope(elements: seq[Element], target: set[TagType]): bool = + return elements.hasElementInSpecificScope(target, {TAG_HTML, TAG_TABLE, TAG_TEMPLATE}) + +func hasElementInSelectScope(elements: seq[Element], target: TagType): bool = + for i in countdown(elements.high, 0): + if elements[i].tagType == target: + return true + if elements[i].tagType notin {TAG_OPTION, TAG_OPTGROUP}: + return false + assert false + +func createElement(parser: HTML5Parser, token: Token, namespace: string, intendedParent: 
Node): Element = + #TODO custom elements + let document = intendedParent.document + let localName = token.tagname + let element = document.newHTMLElement(localName, namespace, tagType = token.tagtype) + for k, v in token.attrs: + element.appendAttribute(k, v) + if element.isResettable(): + element.reset() + + if element.tagType in FormAssociatedElements and parser.form != nil and + not parser.openElements.hasElement(TAG_TEMPLATE) and + (element.tagType notin ListedElements or not element.attrb("form")) and + element.inSameTree(parser.form): + element.setForm(parser.form) + return element + +proc insert(location: AdjustedInsertionLocation, node: Node) = + location.inside.insert(node, location.before) + +proc insertForeignElement(parser: var HTML5Parser, token: Token, namespace: string): Element = + let location = parser.appropriatePlaceForInsert() + let element = parser.createElement(token, namespace, location.inside) + if location.inside.preInsertionValidity(element, location.before): + #TODO custom elements + location.insert(element) + parser.openElements.add(element) + return element + +proc insertHTMLElement(parser: var HTML5Parser, token: Token): Element = + return parser.insertForeignElement(token, $Namespace.HTML) + +template insert_character_impl(parser: var HTML5Parser, data: typed) = + let location = parser.appropriatePlaceForInsert() + if location.inside.nodeType == DOCUMENT_NODE: + return + let insertNode = if location.before == nil: + location.inside.lastChild + else: + location.before.previousSibling + assert location.before == nil + if insertNode != nil and insertNode.nodeType == TEXT_NODE: + dom.Text(insertNode).data &= data + else: + let text = location.inside.document.newText($data) + location.insert(text) + + if location.inside.nodeType == ELEMENT_NODE: + let parent = Element(location.inside) + if parent.tagType == TAG_STYLE: + let parent = HTMLStyleElement(parent) + parent.sheet_invalid = true + +proc insertCharacter(parser: var HTML5Parser, 
data: string) = + insert_character_impl(parser, data) + +proc insertCharacter(parser: var HTML5Parser, data: char) = + insert_character_impl(parser, data) + +proc insertCharacter(parser: var HTML5Parser, data: Rune) = + insert_character_impl(parser, data) + +proc insertComment(parser: var HTML5Parser, token: Token, position: AdjustedInsertionLocation) = + position.insert(position.inside.document.newComment(token.data)) + +proc insertComment(parser: var HTML5Parser, token: Token) = + let position = parser.appropriatePlaceForInsert() + position.insert(position.inside.document.newComment(token.data)) + +const PublicIdentifierEquals = [ + "-//W3O//DTD W3 HTML Strict 3.0//EN//", + "-/W3C/DTD HTML 4.0 Transitional/EN", + "HTML" +] + +const PublicIdentifierStartsWith = [ + "+//Silmaril//dtd html Pro v0r11 19970101//", + "-//AS//DTD HTML 3.0 asWedit + extensions//", + "-//AdvaSoft Ltd//DTD HTML 3.0 asWedit + extensions//", + "-//IETF//DTD HTML 2.0 Level 1//", + "-//IETF//DTD HTML 2.0 Level 2//", + "-//IETF//DTD HTML 2.0 Strict Level 1//", + "-//IETF//DTD HTML 2.0 Strict Level 2//", + "-//IETF//DTD HTML 2.0 Strict//", + "-//IETF//DTD HTML 2.0//", + "-//IETF//DTD HTML 2.1E//", + "-//IETF//DTD HTML 3.0//", + "-//IETF//DTD HTML 3.2 Final//", + "-//IETF//DTD HTML 3.2//", + "-//IETF//DTD HTML 3//", + "-//IETF//DTD HTML Level 0//", + "-//IETF//DTD HTML Level 1//", + "-//IETF//DTD HTML Level 2//", + "-//IETF//DTD HTML Level 3//", + "-//IETF//DTD HTML Strict Level 0//", + "-//IETF//DTD HTML Strict Level 1//", + "-//IETF//DTD HTML Strict Level 2//", + "-//IETF//DTD HTML Strict Level 3//", + "-//IETF//DTD HTML Strict//", + "-//IETF//DTD HTML//", + "-//Metrius//DTD Metrius Presentational//", + "-//Microsoft//DTD Internet Explorer 2.0 HTML Strict//", + "-//Microsoft//DTD Internet Explorer 2.0 HTML//", + "-//Microsoft//DTD Internet Explorer 2.0 Tables//", + "-//Microsoft//DTD Internet Explorer 3.0 HTML Strict//", + "-//Microsoft//DTD Internet Explorer 3.0 HTML//", + "-//Microsoft//DTD 
Internet Explorer 3.0 Tables//", + "-//Netscape Comm. Corp.//DTD HTML//", + "-//Netscape Comm. Corp.//DTD Strict HTML//", + "-//O'Reilly and Associates//DTD HTML 2.0//", + "-//O'Reilly and Associates//DTD HTML Extended 1.0//", + "-//O'Reilly and Associates//DTD HTML Extended Relaxed 1.0//", + "-//SQ//DTD HTML 2.0 HoTMetaL + extensions//", + "-//SoftQuad Software//DTD HoTMetaL PRO 6.0::19990601::extensions to HTML 4.0//", + "-//SoftQuad//DTD HoTMetaL PRO 4.0::19971010::extensions to HTML 4.0//", + "-//Spyglass//DTD HTML 2.0 Extended//", + "-//Sun Microsystems Corp.//DTD HotJava HTML//", + "-//Sun Microsystems Corp.//DTD HotJava Strict HTML//", + "-//W3C//DTD HTML 3 1995-03-24//", + "-//W3C//DTD HTML 3.2 Draft//", + "-//W3C//DTD HTML 3.2 Final//", + "-//W3C//DTD HTML 3.2//", + "-//W3C//DTD HTML 3.2S Draft//", + "-//W3C//DTD HTML 4.0 Frameset//", + "-//W3C//DTD HTML 4.0 Transitional//", + "-//W3C//DTD HTML Experimental 19960712//", + "-//W3C//DTD HTML Experimental 970421//", + "-//W3C//DTD W3 HTML//", + "-//W3O//DTD W3 HTML 3.0//", + "-//WebTechs//DTD Mozilla HTML 2.0//", + "-//WebTechs//DTD Mozilla HTML//", +] + +const SystemIdentifierMissingAndPublicIdentifierStartsWith = [ + "-//W3C//DTD HTML 4.01 Frameset//", + "-//W3C//DTD HTML 4.01 Transitional//" +] + +const PublicIdentifierStartsWithLimited = [ + "-//W3C//DTD XHTML 1.0 Frameset//", + "-//W3C//DTD XHTML 1.0 Transitional//" +] + +const SystemIdentifierNotMissingAndPublicIdentifierStartsWith = [ + "-//W3C//DTD HTML 4.01 Frameset//", + "-//W3C//DTD HTML 4.01 Transitional//" +] + +func quirksConditions(token: Token): bool = + if token.quirks: return true + if token.name.isnone or token.name.get != "html": return true + if token.sysid.issome: + if token.sysid.get == "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd": + return true + if token.pubid.issome: + if token.pubid.get in PublicIdentifierEquals: + return true + for id in PublicIdentifierStartsWith: + if token.pubid.get.startsWithNoCase(id): + return 
true + if token.sysid.isnone: + for id in SystemIdentifierMissingAndPublicIdentifierStartsWith: + if token.pubid.get.startsWithNoCase(id): + return true + return false + +func limitedQuirksConditions(token: Token): bool = + if token.pubid.isnone: return false + for id in PublicIdentifierStartsWithLimited: + if token.pubid.get.startsWithNoCase(id): + return true + if token.sysid.isnone: return false + for id in SystemIdentifierNotMissingAndPublicIdentifierStartsWith: + if token.pubid.get.startsWithNoCase(id): + return true + return false + +# 13.2.6.2 +proc genericRawtextElementParsingAlgorithm(parser: var HTML5Parser, token: Token) = + discard parser.insertHTMLElement(token) + parser.tokenizer.state = RAWTEXT + parser.oldInsertionMode = parser.insertionMode + parser.insertionMode = TEXT + +proc genericRCDATAElementParsingAlgorithm(parser: var HTML5Parser, token: Token) = + discard parser.insertHTMLElement(token) + parser.tokenizer.state = RCDATA + parser.oldInsertionMode = parser.insertionMode + parser.insertionMode = TEXT + +# 13.2.6.3 +proc generateImpliedEndTags(parser: var HTML5Parser) = + const tags = {TAG_DD, TAG_DT, TAG_LI, TAG_OPTGROUP, TAG_OPTION, TAG_P, + TAG_RB, TAG_RP, TAG_RT, TAG_RTC} + while parser.currentNode.tagType in tags: + discard parser.openElements.pop() + +proc generateImpliedEndTags(parser: var HTML5Parser, exclude: TagType) = + let tags = {TAG_DD, TAG_DT, TAG_LI, TAG_OPTGROUP, TAG_OPTION, TAG_P, + TAG_RB, TAG_RP, TAG_RT, TAG_RTC} - {exclude} + while parser.currentNode.tagType in tags: + discard parser.openElements.pop() + +proc generateImpliedEndTagsThoroughly(parser: var HTML5Parser) = + const tags = {TAG_CAPTION, TAG_COLGROUP, TAG_DD, TAG_DT, TAG_LI, + TAG_OPTGROUP, TAG_OPTION, TAG_P, TAG_RB, TAG_RP, TAG_RT, + TAG_RTC, TAG_TBODY, TAG_TD, TAG_TFOOT, TAG_TH, TAG_THEAD, + TAG_TR} + while parser.currentNode.tagType in tags: + discard parser.openElements.pop() + +# 13.2.4.3 +proc pushOntoActiveFormatting(parser: var HTML5Parser, element: 
Element, token: Token) = + var count = 0 + for i in countdown(parser.activeFormatting.high, 0): + let it = parser.activeFormatting[i] + if it[0] == nil: break + if it[0].tagType != element.tagType: continue + if it[0].tagType == TAG_UNKNOWN: + if it[0].localName != element.localName: continue #TODO local or qualified? + if it[0].namespace != element.namespace: continue + var fail = false + for k, v in it[0].attributes: + if k notin element.attributes: + fail = true + break + if v != element.attributes[k]: + fail = true + break + if fail: continue + for k, v in element.attributes: + if k notin it[0].attributes: + fail = true + break + if fail: continue + inc count + if count == 3: + parser.activeFormatting.del(i) + break + parser.activeFormatting.add((element, token)) +proc reconstructActiveFormatting(parser: var HTML5Parser) = + type State = enum + REWIND, ADVANCE, CREATE + if parser.activeFormatting.len == 0: + return + if parser.activeFormatting[^1][0] == nil or parser.openElements.hasElement(parser.activeFormatting[^1][0].tagType): + return + var i = parser.activeFormatting.high + template entry: Element = (parser.activeFormatting[i][0]) + var state = REWIND while true: {.computedGoto.} - let c = tokenizer.consume() - stateMachine: # => case tokenizer.state - of DATA: - case c - of '&': switch_state_return CHARACTER_REFERENCE - of '<': switch_state TAG_OPEN - of null: - parse_error unexpected_null_character - emit_current - of eof: emit_eof - else: emit_current - - of RCDATA: - case c - of '&': switch_state_return CHARACTER_REFERENCE - of '<': switch_state RCDATA_LESS_THAN_SIGN - of null: parse_error unexpected_null_character - of eof: emit_eof - else: emit_current - - of RAWTEXT: - case c - of '<': switch_state RAWTEXT_LESS_THAN_SIGN - of null: - parse_error unexpected_null_character - emit_replacement - of eof: emit_eof - else: emit_current - - of SCRIPT_DATA: - case c - of '<': switch_state SCRIPT_DATA_LESS_THAN_SIGN - of null: - parse_error 
unexpected_null_character - emit_replacement - of eof: emit_eof - else: emit_current - - of PLAINTEXT: - case c - of null: - parse_error unexpected_null_character - emit_replacement - of eof: emit_eof - else: emit_current - - of TAG_OPEN: - case c - of '!': switch_state MARKUP_DECLARATION_OPEN - of '/': switch_state END_TAG_OPEN - of AsciiAlpha: - new_token Token(t: START_TAG) - reconsume_in TAG_NAME - of '?': - parse_error unexpected_question_mark_instead_of_tag_name - new_token Token(t: COMMENT) - reconsume_in BOGUS_COMMENT - of eof: - parse_error eof_before_tag_name - emit '<' - emit_eof - else: - parse_error invalid_first_character_of_tag_name - emit '<' - reconsume_in DATA - - of END_TAG_OPEN: - case c - of AsciiAlpha: new_token Token(t: END_TAG) - of '>': - parse_error missing_end_tag_name - switch_state DATA - of eof: - parse_error eof_before_tag_name - emit '<' - emit '/' - emit_eof - else: - parse_error invalid_first_character_of_tag_name - new_token Token(t: COMMENT) - reconsume_in BOGUS_COMMENT - - of TAG_NAME: - case c - of whitespace: switch_state BEFORE_ATTRIBUTE_NAME - of '/': switch_state SELF_CLOSING_START_TAG - of '>': - switch_state DATA - emit_tok - of AsciiUpperAlpha: tokenizer.tok.tagname &= char(tokenizer.curr).tolower() - of null: - parse_error unexpected_null_character - tokenizer.tok.tagname &= Rune(0xFFFD) - of eof: - parse_error eof_in_tag - emit_eof - else: tokenizer.tok.tagname &= tokenizer.curr - - of RCDATA_LESS_THAN_SIGN: - case c - of '/': - tokenizer.tmp = "" - switch_state RCDATA_END_TAG_OPEN + case state + of REWIND: + if i == 0: + state = CREATE + continue + dec i + if entry != nil and not parser.openElements.hasElement(entry.tagType): + continue + state = ADVANCE + of ADVANCE: + inc i + state = CREATE + of CREATE: + parser.activeFormatting[i] = (parser.insertHTMLElement(parser.activeFormatting[i][1]), parser.activeFormatting[i][1]) + if i != parser.activeFormatting.high: + state = ADVANCE + continue + break + +proc 
clearActiveFormattingTillMarker(parser: var HTML5Parser) = + while parser.activeFormatting.len > 0 and parser.activeFormatting.pop()[0] != nil: discard + +template pop_current_node = discard parser.openElements.pop() + +func isHTMLIntegrationPoint(node: Element): bool = + return false #TODO SVG (NOTE MathML not implemented) + +# Following is an implementation of the state (?) machine defined in +# https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inhtml +# It uses the ad-hoc pattern matching macro `match' to apply the following +# transformations: +# * First, pairs of patterns and actions are stored in tuples (and `discard' +# statements...) +# * These pairs are then assigned to token types, later mapped to legs of the +# first case statement. +# * Another case statement is constructed where needed, e.g. for switching on +# characters/tags/etc. +# * Finally, the whole thing is wrapped in a named block, to implement a +# pseudo-goto by breaking out only when the else statement needn't be +# executed. 
+# So for example the following code: +# +# match token: +# TokenType.COMMENT => (block: echo "comment") +# ("<p>", "<a>", "</div>") => (block: echo "p, a or closing div") +# ("<div>", "</p>") => (block: anything_else) +# (TokenType.START_TAG, TokenType.END_TAG) => (block: assert false, "invalid") +# _ => (block: echo "anything else") +# +# (effectively) generates this: +# +# block inside_not_else: +# case token.t +# of TokenType.COMMENT: +# echo "comment" +# break inside_not_else +# of TokenType.START_TAG: +# case token.tagtype +# of {TAG_P, TAG_A}: +# echo "p, a or closing div" +# break inside_not_else +# of TAG_DIV: discard +# else: +# assert false +# break inside_not_else +# of TokenType.END_TAG: +# case token.tagtype +# of TAG_DIV: +# echo "p, a or closing div" +# break inside_not_else +# of TAG_P: discard +# else: +# assert false +# break inside_not_else +# else: discard +# echo "anything else" +# +# This duplicates any code that applies for several token types, except for the +# else branch. 
+macro match(token: Token, body: typed): untyped = + type OfBranchStore = object + ofBranches: seq[(seq[NimNode], NimNode)] + defaultBranch: NimNode + + # Stores 'of' branches + var ofBranches: array[TokenType, OfBranchStore] + # Stores 'else', 'elif' branches + var defaultBranch: NimNode + + const tokenTypes = (func(): Table[string, TokenType] = + for tt in TokenType: + result[$tt] = tt)() + + for disc in body: + let tup = disc[0] # access actual tuple + let pattern = `tup`[0] + let lambda = `tup`[1] + var action = lambda.findChild(it.kind notin {nnkSym, nnkEmpty, nnkFormalParams}) + if pattern.kind != nnkDiscardStmt and not (action.len == 2 and action[1].kind == nnkDiscardStmt and action[1][0] == newStrLitNode("anything_else")): + action = quote do: + `action` + #eprint token #debug + break inside_not_else + + var patterns = @[pattern] + while patterns.len > 0: + let pattern = patterns.pop() + case pattern.kind + of nnkSym: # simple symbols; we assume these are the enums + ofBranches[tokenTypes[pattern.strVal]].defaultBranch = action + of nnkCharLit: + ofBranches[CHARACTER_ASCII].ofBranches.add((@[pattern], action)) + of nnkCurly: + case pattern[0].kind + of nnkCharLit: + ofBranches[CHARACTER_ASCII].ofBranches.add((@[pattern], action)) + else: error fmt"Unsupported curly of kind {pattern[0].kind}" + of nnkStrLit: + var tempTokenizer = newTokenizer(newStringStream(pattern.strVal)) + for token in tempTokenizer.tokenize: + let tt = int(token.tagtype) + case token.t + of START_TAG, END_TAG: + var found = false + for i in 0..ofBranches[token.t].ofBranches.high: + if ofBranches[token.t].ofBranches[i][1] == action: + found = true + ofBranches[token.t].ofBranches[i][0].add((quote do: TagType(`tt`))) + break + if not found: + ofBranches[token.t].ofBranches.add((@[(quote do: TagType(`tt`))], action)) + else: error fmt"{pattern.strVal}: Unsupported token {token} of kind {token.t}" + break + of nnkDiscardStmt: + defaultBranch = action + of nnkTupleConstr: + for child in 
pattern: + patterns.add(child) + else: error fmt"{pattern}: Unsupported pattern of kind {pattern.kind}" + + func tokenBranchOn(tok: TokenType): NimNode = + case tok + of START_TAG, END_TAG: + return quote do: token.tagtype + of CHARACTER: + return quote do: token.r + of CHARACTER_ASCII: + return quote do: token.c + else: error fmt"Unsupported branching of token {tok}" + + template add_to_case(branch: typed) = + if branch[0].len == 1: + tokenCase.add(newNimNode(nnkOfBranch).add(branch[0][0]).add(branch[1])) + else: + var curly = newNimNode(nnkCurly) + for node in branch[0]: + curly.add(node) + tokenCase.add(newNimNode(nnkOfBranch).add(curly).add(branch[1])) + + # Build case statements + var mainCase = newNimNode(nnkCaseStmt).add(quote do: `token`.t) + for tt in TokenType: + let ofBranch = newNimNode(nnkOfBranch).add(quote do: TokenType(`tt`)) + let tokenCase = newNimNode(nnkCaseStmt) + if ofBranches[tt].defaultBranch != nil: + if ofBranches[tt].ofBranches.len > 0: + tokenCase.add(tokenBranchOn(tt)) + for branch in ofBranches[tt].ofBranches: + add_to_case branch + tokenCase.add(newNimNode(nnkElse).add(ofBranches[tt].defaultBranch)) + ofBranch.add(tokenCase) + mainCase.add(ofBranch) else: - emit '<' - reconsume_in RCDATA - - of RCDATA_END_TAG_OPEN: - case c - of AsciiAlpha: - new_token Token(t: END_TAG) - reconsume_in RCDATA_END_TAG_NAME + ofBranch.add(ofBranches[tt].defaultBranch) + mainCase.add(ofBranch) + else: + if ofBranches[tt].ofBranches.len > 0: + tokenCase.add(tokenBranchOn(tt)) + for branch in ofBranches[tt].ofBranches: + add_to_case branch + ofBranch.add(tokenCase) + tokenCase.add(newNimNode(nnkElse).add(quote do: discard)) + mainCase.add(ofBranch) else: - emit '<' - emit '/' - reconsume_in RCDATA - - of RCDATA_END_TAG_NAME: - has_anything_else - case c - of whitespace: - if is_appropriate_end_tag_token: - switch_state BEFORE_ATTRIBUTE_NAME + discard + + mainCase.add(newNimNode(nnkElse).add(quote do: discard)) + + var stmts = newStmtList().add(mainCase) + 
for stmt in defaultBranch: + stmts.add(stmt) + result = newBlockStmt(ident("inside_not_else"), stmts) + +proc processInHTMLContent(parser: var HTML5Parser, token: Token, insertionMode = parser.insertionMode) = + template pop_all_nodes = + while parser.openElements.len > 1: pop_current_node + template anything_else = discard "anything_else" + macro `=>`(v: typed, body: untyped): untyped = + quote do: + discard (`v`, proc() = `body`) + template _ = discard + template reprocess(tok: Token) = + parser.processInHTMLContent(tok) + + case insertionMode + of INITIAL: + match token: + AsciiWhitespace => (block: discard) + TokenType.COMMENT => (block: parser.insertComment(token, last_child_of(parser.document))) + TokenType.DOCTYPE => (block: + if token.name.isnone or token.name.get != "html" or token.pubid.issome or (token.sysid.issome and token.sysid.get != "about:legacy-compat"): + parse_error + let doctype = parser.document.newDocumentType(token.name.get(""), token.pubid.get(""), token.sysid.get("")) + parser.document.append(doctype) + if not parser.document.is_iframe_srcdoc and not parser.document.parser_cannot_change_the_mode_flag: + if quirksConditions(token): + parser.document.mode = QUIRKS + elif limitedQuirksConditions(token): + parser.document.mode = LIMITED_QUIRKS + parser.insertionMode = BEFORE_HTML + ) + _ => (block: + if not parser.document.is_iframe_srcdoc: + parse_error + if not parser.document.parser_cannot_change_the_mode_flag: + parser.document.mode = QUIRKS + parser.insertionMode = BEFORE_HTML + reprocess token + ) + + of BEFORE_HTML: + match token: + TokenType.DOCTYPE => (block: parse_error) + TokenType.COMMENT => (block: parser.insertComment(token, last_child_of(parser.document))) + AsciiWhitespace => (block: discard) + "<html>" => (block: + let element = parser.createElement(token, $Namespace.HTML, parser.document) + parser.document.append(element) + parser.openElements.add(element) + parser.insertionMode = BEFORE_HEAD + ) + ("</head>", "</body>", 
"</html>", "</br>") => (block: anything_else) + TokenType.END_TAG => (block: parse_error) + _ => (block: + let element = parser.document.newHTMLElement(TAG_HTML) + parser.document.append(element) + parser.openElements.add(element) + parser.insertionMode = BEFORE_HEAD + reprocess token + ) + + of BEFORE_HEAD: + match token: + AsciiWhitespace => (block: discard) + TokenType.COMMENT => (block: parser.insertComment(token)) + TokenType.DOCTYPE => (block: parse_error) + "<html>" => (block: parser.processInHTMLContent(token, IN_BODY)) + "<head>" => (block: + parser.head = parser.insertHTMLElement(token) + parser.insertionMode = IN_HEAD + ) + ("</head>", "</body>", "</html>", "</br>") => (block: anything_else) + TokenType.END_TAG => (block: parse_error) + _ => (block: + parser.head = parser.insertHTMLElement(Token(t: START_TAG, tagtype: TAG_HEAD)) + parser.insertionMode = IN_HEAD + reprocess token + ) + + of IN_HEAD: + match token: + AsciiWhitespace => (block: discard) + TokenType.COMMENT => (block: parser.insertComment(token)) + TokenType.DOCTYPE => (block: parse_error) + "<html>" => (block: parser.processInHTMLContent(token, IN_BODY)) + ("<base>", "<basefont>", "<bgsound>", "<link>") => (block: + discard parser.insertHTMLElement(token) + pop_current_node + ) + "<meta>" => (block: + discard parser.insertHTMLElement(token) + pop_current_node + #TODO encodings + ) + "<title>" => (block: parser.genericRCDATAElementParsingAlgorithm(token)) + "<noscript>" => (block: + if not parser.scripting: + discard parser.insertHTMLElement(token) + parser.insertionMode = IN_HEAD_NOSCRIPT else: - anything_else - of '/': - if is_appropriate_end_tag_token: - switch_state SELF_CLOSING_START_TAG + parser.genericRawtextElementParsingAlgorithm(token) + ) + ("<noframes>", "<style>") => (block: parser.genericRawtextElementParsingAlgorithm(token)) + "<script>" => (block: + let location = parser.appropriatePlaceForInsert() + let element = HTMLScriptElement(parser.createElement(token, $Namespace.HTML, 
location.inside)) + element.parserDocument = parser.document + element.forceAsync = false + if parser.fragment: + element.alreadyStarted = true + #TODO document.write (?) + location.insert(element) + parser.openElements.add(element) + parser.tokenizer.state = SCRIPT_DATA + parser.insertionMode = TEXT + ) + "</head>" => (block: + pop_current_node + parser.insertionMode = AFTER_HEAD + ) + ("</body>", "</html>", "</br>") => (block: anything_else) + "<template>" => (block: + discard parser.insertHTMLElement(token) + parser.activeFormatting.add((nil, nil)) + parser.framesetok = false + parser.insertionMode = IN_TEMPLATE + parser.templateModes.add(IN_TEMPLATE) + ) + "</template>" => (block: + if not parser.openElements.hasElement(TAG_TEMPLATE): + parse_error else: - anything_else - of '>': - if is_appropriate_end_tag_token: - switch_state DATA - of AsciiAlpha: # note: merged upper & lower - tokenizer.tok.tagname &= char(tokenizer.curr).tolower() - tokenizer.tmp &= tokenizer.curr - else: - new_token nil #TODO - emit '<' - emit '/' - for r in tokenizer.tmp.runes: - emit r - reconsume_in RCDATA - - of RAWTEXT_LESS_THAN_SIGN: - case c - of '/': - tokenizer.tmp = "" - switch_state RAWTEXT_END_TAG_OPEN - else: - emit '<' - reconsume_in RAWTEXT - - of RAWTEXT_END_TAG_OPEN: - case c - of AsciiAlpha: - new_token Token(t: END_TAG) - reconsume_in RAWTEXT_END_TAG_NAME - else: - emit '<' - emit '/' - reconsume_in RAWTEXT - - of RAWTEXT_END_TAG_NAME: - has_anything_else - case c - of whitespace: - if is_appropriate_end_tag_token: - switch_state BEFORE_ATTRIBUTE_NAME + parser.generateImpliedEndTagsThoroughly() + if parser.currentNode.tagType != TAG_TEMPLATE: + parse_error + while parser.openElements.pop().tagType != TAG_TEMPLATE: discard + parser.clearActiveFormattingTillMarker() + discard parser.templateModes.pop() + parser.resetInsertionMode() + ) + ("<head>", TokenType.END_TAG) => (block: parse_error) + _ => (block: + pop_current_node + parser.insertionMode = AFTER_HEAD + reprocess 
token + ) + + of IN_HEAD_NOSCRIPT: + match token: + TokenType.DOCTYPE => (block: parse_error) + "<html>" => (block: parser.processInHTMLContent(token, IN_BODY)) + "</noscript>" => (block: + pop_current_node + parser.insertionMode = IN_HEAD + ) + (AsciiWhitespace, + TokenType.COMMENT, + "<basefont>", "<bgsound>", "<link>", "<meta>", "<noframes>", "<style>") => (block: + parser.processInHTMLContent(token, IN_HEAD)) + "</br>" => (block: anything_else) + ("<head>", "<noscript>") => (block: parse_error) + TokenType.END_TAG => (block: parse_error) + _ => (block: + pop_current_node + parser.insertionMode = IN_HEAD + reprocess token + ) + + of AFTER_HEAD: + match token: + AsciiWhitespace => (block: parser.insertCharacter(token.c)) + TokenType.COMMENT => (block: parser.insertComment(token)) + TokenType.DOCTYPE => (block: parse_error) + "<html>" => (block: parser.processInHTMLContent(token, IN_BODY)) + "<body>" => (block: + discard parser.insertHTMLElement(token) + parser.framesetok = false + parser.insertionMode = IN_BODY + ) + "<frameset>" => (block: + discard parser.insertHTMLElement(token) + parser.insertionMode = IN_FRAMESET + ) + ("<base>", "<basefont>", "<bgsound>", "<link>", "<meta>", "<noframes>", "<script>", "<style>", "<template>", "<title>") => (block: + parse_error + parser.openElements.add(parser.head) + parser.processInHTMLContent(token, IN_HEAD) + for i in countdown(parser.openElements.high, 0): + if parser.openElements[i] == parser.head: + parser.openElements.del(i) + ) + "</template>" => (block: parser.processInHTMLContent(token, IN_HEAD)) + ("</body>", "</html>", "</br>") => (block: anything_else) + ("<head>", TokenType.END_TAG) => (block: parse_error) + _ => (block: + discard parser.insertHTMLElement(Token(t: START_TAG, tagtype: TAG_BODY)) + parser.insertionMode = IN_BODY + reprocess token + ) + + of IN_BODY: + proc closeP(parser: var HTML5Parser) = + parser.generateImpliedEndTags(TAG_P) + if parser.currentNode.tagType != TAG_P: parse_error + while 
parser.openElements.pop().tagType != TAG_P: discard + + proc adoptionAgencyAlgorithm(parser: var HTML5Parser, token: Token): bool = + if parser.currentNode.tagType != TAG_UNKNOWN and parser.currentNode.tagtype == token.tagtype or parser.currentNode.localName == token.tagname: #TODO local or qualified name? + var fail = true + for it in parser.activeFormatting: + if it[0] == parser.currentNode: + fail = false + if fail: + pop_current_node + return false + var i = 0 + while true: + if i >= 8: return false + inc i + if parser.activeFormatting.len == 0: return true + var formatting: Element + var formattingIndex: int + for j in countdown(parser.activeFormatting.high, 0): + let element = parser.activeFormatting[j][0] + if element == nil: + return true + if element.tagType != TAG_UNKNOWN and element.tagtype == token.tagtype or element.qualifiedName == token.tagname: + formatting = element + formattingIndex = j + break + if j == 0: + return true + let stackIndex = parser.openElements.find(formatting) + if stackIndex < 0: + parse_error + parser.activeFormatting.del(formattingIndex) + return false + if not parser.openElements.hasElementInScope(formatting): + parse_error + return false + if formatting != parser.currentNode: parse_error + var furthestBlock: Element = nil + var furthestBlockIndex: int + for j in countdown(parser.openElements.high, 0): + if parser.openElements[j] == formatting: + break + if parser.openElements[j].tagType in SpecialElements: + furthestBlock = parser.openElements[j] + furthestBlockIndex = j + break + if furthestBlock == nil: + while parser.openElements.pop() != formatting: discard + parser.activeFormatting.del(formattingIndex) + return false + let commonAncestor = parser.openElements[stackIndex - 1] + var bookmark = formattingIndex + var node = furthestBlock + var aboveNode = parser.openElements[furthestBlockIndex - 1] + var lastNode = furthestBlock + var j = 0 + while true: + inc j + node = aboveNode + if node == formatting: break + var 
nodeFormattingIndex = -1 + for i in countdown(parser.activeFormatting.high, 0): + if parser.activeFormatting[i][0] == node: + nodeFormattingIndex = i + break + if j > 3 and nodeFormattingIndex >= 0: + parser.activeFormatting.del(nodeFormattingIndex) + if nodeFormattingIndex < bookmark: + dec bookmark # a previous node got deleted, so decrease bookmark by one + let nodeStackIndex = parser.openElements.find(node) + if nodeFormattingIndex < 0: + parser.openElements.del(nodeStackIndex) + if nodeStackIndex < furthestBlockIndex: + dec furthestBlockIndex + continue + let element = parser.createElement(parser.activeFormatting[nodeFormattingIndex][1], $Namespace.HTML, commonAncestor) + parser.activeFormatting[nodeFormattingIndex] = (element, parser.activeFormatting[nodeFormattingIndex][1]) + parser.openElements[nodeFormattingIndex] = element + aboveNode = parser.openElements[nodeFormattingIndex - 1] + node = element + if lastNode == furthestBlock: + bookmark = nodeFormattingIndex + node.append(lastNode) + lastNode = node + let location = parser.appropriatePlaceForInsert(commonAncestor) + location.inside.insert(lastNode, location.before) + let token = parser.activeFormatting[formattingIndex][1] + let element = parser.createElement(token, $Namespace.HTML, furthestBlock) + for child in furthestBlock.childNodes: + child.remove() + element.append(child) + furthestBlock.append(element) + parser.activeFormatting.insert((element, token), bookmark) + parser.activeFormatting.del(formattingIndex) + parser.openElements.insert(element, furthestBlockIndex) + parser.openElements.del(stackIndex) + + template any_other_start_tag() = + parser.reconstructActiveFormatting() + discard parser.insertHTMLElement(token) + + template any_other_end_tag() = + for i in countdown(parser.openElements.high, 0): + let node = parser.openElements[i] + if node.tagType != TAG_UNKNOWN and node.tagType == token.tagtype or node.localName == token.tagname: #TODO local or qualified name? 
+ parser.generateImpliedEndTags(token.tagtype) + if node != parser.currentNode: parse_error + while parser.openElements.pop() != node: discard + break + elif node.tagType in SpecialElements: + parse_error + return + + match token: + '\0' => (block: parse_error) + AsciiWhitespace => (block: + parser.reconstructActiveFormatting() + parser.insertCharacter(token.c) + ) + TokenType.CHARACTER_ASCII => (block: + parser.reconstructActiveFormatting() + parser.insertCharacter(token.c) + parser.framesetOk = false + ) + TokenType.CHARACTER => (block: + parser.reconstructActiveFormatting() + parser.insertCharacter(token.r) + parser.framesetOk = false + ) + TokenType.COMMENT => (block: parser.insertComment(token)) + TokenType.DOCTYPE => (block: parse_error) + "<html>" => (block: + parse_error + if parser.openElements.hasElement(TAG_TEMPLATE): + discard else: - anything_else - of '/': - if is_appropriate_end_tag_token: - switch_state SELF_CLOSING_START_TAG + for k, v in token.attrs: + if k notin parser.openElements[0].attributes: + parser.openElements[0].attributes[k] = v + ) + ("<base>", "<basefont>", "<bgsound>", "<link>", "<meta>", "<noframes>", "<script>", "<style>", "<template>", "<title>", + "</template>") => (block: parser.processInHTMLContent(token, IN_HEAD)) + "<body>" => (block: + parse_error + if parser.openElements.len == 1 or parser.openElements[1].tagType != TAG_BODY or parser.openElements.hasElement(TAG_TEMPLATE): + discard else: - anything_else - of '>': - if is_appropriate_end_tag_token: - switch_state DATA - emit_tok + parser.framesetOk = false + for k, v in token.attrs: + if k notin parser.openElements[1].attributes: + parser.openElements[1].attributes[k] = v + ) + "<frameset>" => (block: + parse_error + if parser.openElements.len == 1 or parser.openElements[1].tagType != TAG_BODY or not parser.framesetOk: + discard else: - anything_else - of AsciiAlpha: # note: merged upper & lower - tokenizer.tok.tagname &= char(tokenizer.curr).tolower() - tokenizer.tmp &= 
tokenizer.curr - else: - new_token nil #TODO - emit '<' - emit '/' - for r in tokenizer.tmp.runes: - emit r - reconsume_in RAWTEXT - - of SCRIPT_DATA_LESS_THAN_SIGN: - case c - of '/': - tokenizer.tmp = "" - switch_state SCRIPT_DATA_END_TAG_OPEN - of '!': - switch_state SCRIPT_DATA_ESCAPE_START - emit '<' - emit '!' - else: - emit '<' - reconsume_in SCRIPT_DATA - - of SCRIPT_DATA_END_TAG_OPEN: - case c - of AsciiAlpha: - new_token Token(t: END_TAG) - reconsume_in SCRIPT_DATA_END_TAG_NAME - else: - emit '<' - emit '/' - reconsume_in SCRIPT_DATA - - of SCRIPT_DATA_END_TAG_NAME: - has_anything_else - case c - of whitespace: - if is_appropriate_end_tag_token: - switch_state BEFORE_ATTRIBUTE_NAME + if parser.openElements[1].parentNode != nil: + parser.openElements[1].remove() + pop_all_nodes + ) + TokenType.EOF => (block: + if parser.templateModes.len > 0: + parser.processInHTMLContent(token, IN_TEMPLATE) else: - anything_else - of '/': - if is_appropriate_end_tag_token: - switch_state SELF_CLOSING_START_TAG + #NOTE parse error omitted + discard # stop + ) + "</body>" => (block: + if not parser.openElements.hasElementInScope(TAG_BODY): + parse_error else: - anything_else - of '>': - if is_appropriate_end_tag_token: - switch_state DATA - emit_tok + #NOTE parse error omitted + parser.insertionMode = AFTER_BODY + ) + "</html>" => (block: + if not parser.openElements.hasElementInScope(TAG_BODY): + parse_error else: - anything_else - of AsciiAlpha: # note: merged upper & lower - tokenizer.tok.tagname &= char(tokenizer.curr).tolower() - tokenizer.tmp &= tokenizer.curr - else: - emit '<' - emit '/' - for r in tokenizer.tmp.runes: - emit r - reconsume_in SCRIPT_DATA - - of SCRIPT_DATA_ESCAPE_START: - case c - of '-': - switch_state SCRIPT_DATA_ESCAPE_START_DASH - emit '-' - else: - reconsume_in SCRIPT_DATA - - of SCRIPT_DATA_ESCAPE_START_DASH: - case c - of '-': - switch_state SCRIPT_DATA_ESCAPED_DASH_DASH - emit '-' - else: - reconsume_in SCRIPT_DATA - - of 
SCRIPT_DATA_ESCAPED: - case c - of '-': - switch_state SCRIPT_DATA_ESCAPED_DASH - emit '-' - of '<': - switch_state SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN - of null: - parse_error unexpected_null_character - emit_replacement - of eof: - parse_error eof_in_script_html_comment_like_text - emit_eof - else: - emit_current - - of SCRIPT_DATA_ESCAPED_DASH: - case c - of '-': - switch_state SCRIPT_DATA_ESCAPED_DASH_DASH - emit '-' - of '<': - switch_state SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN - of null: - parse_error unexpected_null_character - switch_state SCRIPT_DATA_ESCAPED - of eof: - parse_error eof_in_script_html_comment_like_text - emit_eof - else: - switch_state SCRIPT_DATA_ESCAPED - emit_current - - of SCRIPT_DATA_ESCAPED_DASH_DASH: - case c - of '-': - emit '-' - of '<': - switch_state SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN - of '>': - switch_state SCRIPT_DATA - emit '>' - of null: - parse_error unexpected_null_character - switch_state SCRIPT_DATA_ESCAPED - of eof: - parse_error eof_in_script_html_comment_like_text - emit_eof - else: - switch_state SCRIPT_DATA_ESCAPED - emit_current - - of SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN: - case c - of '/': - tokenizer.tmp = "" - switch_state SCRIPT_DATA_ESCAPED_END_TAG_OPEN - of AsciiAlpha: - tokenizer.tmp = "" - emit '<' - reconsume_in SCRIPT_DATA_DOUBLE_ESCAPE_START - else: - emit '<' - reconsume_in SCRIPT_DATA_ESCAPED - - of SCRIPT_DATA_ESCAPED_END_TAG_OPEN: - case c - of AsciiAlpha: - new_token Token(t: START_TAG) - reconsume_in SCRIPT_DATA_ESCAPED_END_TAG_NAME - else: - emit '<' - emit '/' - reconsume_in SCRIPT_DATA_ESCAPED - - of SCRIPT_DATA_ESCAPED_END_TAG_NAME: - has_anything_else - case c - of whitespace: - if is_appropriate_end_tag_token: - switch_state BEFORE_ATTRIBUTE_NAME + #NOTE parse error omitted + parser.insertionMode = AFTER_BODY + reprocess token + ) + ("<address>", "<article>", "<aside>", "<blockquote>", "<center>", + "<details>", "<dialog>", "<dir>", "<div>", "<dl>", "<fieldset>", + "<figcaption>", "<figure>", 
"<footer>", "<header>", "<hgroup>", "<main>", + "<menu>", "<nav>", "<ol>", "<p>", "<section>", "<summary>", "<ul>") => (block: + if parser.openElements.hasElementInButtonScope(TAG_P): + parser.closeP() + discard parser.insertHTMLElement(token) + ) + ("<h1>", "<h2>", "<h3>", "<h4>", "<h5>", "<h6>") => (block: + if parser.openElements.hasElementInButtonScope(TAG_P): + parser.closeP() + if parser.currentNode.tagType in HTagTypes: + parse_error + pop_current_node + discard parser.insertHTMLElement(token) + ) + ("<pre>", "<listing>") => (block: + if parser.openElements.hasElementInButtonScope(TAG_P): + parser.closeP() + discard parser.insertHTMLElement(token) + parser.ignoreLF = true + parser.framesetOk = false + ) + "<form>" => (block: + let hasTemplate = parser.openElements.hasElement(TAG_TEMPLATE) + if parser.form != nil and not hasTemplate: + parse_error else: - anything_else - of '/': - if is_appropriate_end_tag_token: - switch_state SELF_CLOSING_START_TAG + if parser.openElements.hasElementInButtonScope(TAG_P): + parser.closeP() + let element = parser.insertHTMLElement(token) + if not hasTemplate: + parser.form = HTMLFormElement(element) + ) + "<li>" => (block: + parser.framesetOk = false + for i in countdown(parser.openElements.high, 0): + let node = parser.openElements[i] + case node.tagType + of TAG_LI: + parser.generateImpliedEndTags(TAG_LI) + if parser.currentNode.tagType != TAG_LI: parse_error + while parser.openElements.pop().tagType != TAG_LI: discard + break + of SpecialElements - {TAG_ADDRESS, TAG_DIV, TAG_P, TAG_LI}: + break + else: discard + if parser.openElements.hasElementInButtonScope(TAG_P): + parser.closeP() + discard parser.insertHTMLElement(token) + ) + ("<dd>", "<dt>") => (block: + parser.framesetOk = false + for i in countdown(parser.openElements.high, 0): + let node = parser.openElements[i] + case node.tagType + of TAG_DD: + parser.generateImpliedEndTags(TAG_DD) + if parser.currentNode.tagType != TAG_DD: parse_error + while 
parser.openElements.pop().tagType != TAG_DD: discard + break + of TAG_DT: + parser.generateImpliedEndTags(TAG_DT) + if parser.currentNode.tagType != TAG_DT: parse_error + while parser.openElements.pop().tagType != TAG_DT: discard + break + of SpecialElements - {TAG_ADDRESS, TAG_DIV, TAG_P, TAG_DD, TAG_DT}: + break + else: discard + if parser.openElements.hasElementInButtonScope(TAG_P): + parser.closeP() + discard parser.insertHTMLElement(token) + ) + "<plaintext>" => (block: + if parser.openElements.hasElementInButtonScope(TAG_P): + parser.closeP() + discard parser.insertHTMLElement(token) + parser.tokenizer.state = PLAINTEXT + ) + "<button>" => (block: + if parser.openElements.hasElementInScope(TAG_BUTTON): + parse_error + parser.generateImpliedEndTags() + while parser.openElements.pop().tagType != TAG_BUTTON: discard + parser.reconstructActiveFormatting() + discard parser.insertHTMLElement(token) + parser.framesetOk = false + ) + ("</address>", "</article>", "</aside>", "</blockquote>", "</button>", + "</center>", "</details>", "</dialog>", "</dir>", "</div>", "</dl>", + "</fieldset>", "</figcaption>", "</figure>", "</footer>", "</header>", + "</hgroup>", "</listing>", "</main>", "</menu>", "</nav>", "</ol>", + "</pre>", "</section>", "</summary>", "</ul>") => (block: + if not parser.openElements.hasElementInScope(token.tagtype): + parse_error else: - anything_else - of '>': - if is_appropriate_end_tag_token: - switch_state DATA + parser.generateImpliedEndTags() + if parser.currentNode.tagType != token.tagtype: parse_error + while parser.openElements.pop().tagType != token.tagtype: discard + ) + "</form>" => (block: + if not parser.openElements.hasElement(TAG_TEMPLATE): + let node = parser.form + parser.form = nil + if node == nil or not parser.openElements.hasElementInScope(node.tagType): + parse_error + return + parser.generateImpliedEndTags() + if parser.currentNode != node: parse_error + parser.openElements.del(parser.openElements.find(node)) else: - 
anything_else - of AsciiAlpha: - tokenizer.tok.tagname &= char(tokenizer.curr).tolower() - tokenizer.tmp &= tokenizer.curr - else: - emit '<' - emit '/' - for r in tokenizer.tmp.runes: - emit r - reconsume_in SCRIPT_DATA_ESCAPED - - of SCRIPT_DATA_DOUBLE_ESCAPE_START: - case c - of whitespace, '/', '>': - if tokenizer.tmp == "script": - switch_state SCRIPT_DATA_DOUBLE_ESCAPED + if not parser.openElements.hasElementInScope(TAG_FORM): + parse_error + return + parser.generateImpliedEndTags() + if parser.currentNode.tagType != TAG_FORM: parse_error + while parser.openElements.pop().tagType != TAG_FORM: discard + ) + "</p>" => (block: + if not parser.openElements.hasElementInButtonScope(TAG_P): + parse_error + discard parser.insertHTMLElement(Token(t: START_TAG, tagtype: TAG_P)) + parser.closeP() + ) + "</li>" => (block: + if not parser.openElements.hasElementInListItemScope(TAG_LI): + parse_error else: - switch_state SCRIPT_DATA_ESCAPED - emit_current - of AsciiAlpha: # note: merged upper & lower - tokenizer.tmp &= c.tolower() - emit_current - else: reconsume_in SCRIPT_DATA_ESCAPED - - of SCRIPT_DATA_DOUBLE_ESCAPED: - case c - of '-': - switch_state SCRIPT_DATA_DOUBLE_ESCAPED_DASH - emit '-' - of '<': - switch_state SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN - emit '<' - of null: - parse_error unexpected_null_character - emit_replacement - of eof: - parse_error eof_in_script_html_comment_like_text - emit_eof - else: emit_current - - of SCRIPT_DATA_DOUBLE_ESCAPED_DASH: - case c - of '-': - switch_state SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH - emit '-' - of '<': - switch_state SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN - emit '<' - of null: - parse_error unexpected_null_character - switch_state SCRIPT_DATA_DOUBLE_ESCAPED - emit_replacement - of eof: - parse_error eof_in_script_html_comment_like_text - emit_eof - else: - switch_state SCRIPT_DATA_DOUBLE_ESCAPED - emit_current - - of SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH: - case c - of '-': emit '-' - of '<': - switch_state 
SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN - emit '<' - of '>': - switch_state SCRIPT_DATA - emit '>' - of null: - parse_error unexpected_null_character - switch_state SCRIPT_DATA_DOUBLE_ESCAPED - emit_replacement - of eof: - parse_error eof_in_script_html_comment_like_text - emit_eof - else: switch_state SCRIPT_DATA_DOUBLE_ESCAPED - - of SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN: - case c - of '/': - tokenizer.tmp = "" - switch_state SCRIPT_DATA_DOUBLE_ESCAPE_END - emit '/' - else: reconsume_in SCRIPT_DATA_DOUBLE_ESCAPED - - of SCRIPT_DATA_DOUBLE_ESCAPE_END: - case c - of whitespace, '/', '>': - if tokenizer.tmp == "script": - switch_state SCRIPT_DATA_ESCAPED + parser.generateImpliedEndTags(TAG_LI) + if parser.currentNode.tagType != TAG_LI: parse_error + while parser.openElements.pop().tagType != TAG_LI: discard + ) + ("</dd>", "</dt>") => (block: + if not parser.openElements.hasElementInScope(token.tagtype): + parse_error else: - switch_state SCRIPT_DATA_DOUBLE_ESCAPED - emit_current - of AsciiAlpha: # note: merged upper & lower - tokenizer.tmp &= c.tolower() - emit_current - else: - reconsume_in SCRIPT_DATA_DOUBLE_ESCAPED - - of BEFORE_ATTRIBUTE_NAME: - case c - of whitespace: discard - of '/', '>', eof: reconsume_in AFTER_ATTRIBUTE_NAME - of '=': - parse_error unexpected_equals_sign_before_attribute_name - start_new_attribute - switch_state ATTRIBUTE_NAME - else: - start_new_attribute - reconsume_in ATTRIBUTE_NAME - - of ATTRIBUTE_NAME: - has_anything_else - case c - of whitespace, '/', '>', eof: - leave_attribute_name_state - reconsume_in AFTER_ATTRIBUTE_NAME - of '=': - leave_attribute_name_state - switch_state BEFORE_ATTRIBUTE_VALUE - of AsciiUpperAlpha: - tokenizer.attrn &= c.tolower() - of null: - parse_error unexpected_null_character - tokenizer.attrn &= Rune(0xFFFD) - of '"', '\'', '<': - parse_error unexpected_character_in_attribute_name + parser.generateImpliedEndTags(token.tagtype) + if parser.currentNode.tagType != token.tagtype: parse_error + while 
parser.openElements.pop().tagType != token.tagtype: discard + ) + ("</h1>", "</h2>", "</h3>", "</h4>", "</h5>", "</h6>") => (block: + if not parser.openElements.hasElementInScope(HTagTypes): + parse_error + else: + parser.generateImpliedEndTags() + if parser.currentNode.tagType != token.tagtype: parse_error + while parser.openElements.pop().tagType notin HTagTypes: discard + ) + "</sarcasm>" => (block: + #*deep breath* anything_else - else: - tokenizer.attrn &= tokenizer.curr - - of AFTER_ATTRIBUTE_NAME: - case c - of whitespace: discard - of '/': switch_state SELF_CLOSING_START_TAG - of '=': switch_state BEFORE_ATTRIBUTE_VALUE - of '>': - switch_state DATA - emit '>' - of eof: - parse_error eof_in_tag - emit_eof - else: - start_new_attribute - reconsume_in ATTRIBUTE_NAME - - of BEFORE_ATTRIBUTE_VALUE: - case c - of whitespace: discard - of '"': switch_state ATTRIBUTE_VALUE_DOUBLE_QUOTED - of '\'': switch_state ATTRIBUTE_VALUE_SINGLE_QUOTED - of '>': - parse_error missing_attribute_value - switch_state DATA - emit '>' - else: reconsume_in ATTRIBUTE_VALUE_UNQUOTED - - of ATTRIBUTE_VALUE_DOUBLE_QUOTED: - case c - of '"': switch_state AFTER_ATTRIBUTE_VALUE_QUOTED - of '&': switch_state_return CHARACTER_REFERENCE - of null: - parse_error unexpected_null_character - append_to_current_attr_value Rune(0xFFFD) - of eof: - parse_error eof_in_tag - emit_eof - else: append_to_current_attr_value tokenizer.curr - - of ATTRIBUTE_VALUE_SINGLE_QUOTED: - case c - of '\'': switch_state AFTER_ATTRIBUTE_VALUE_QUOTED - of '&': switch_state_return CHARACTER_REFERENCE - of null: - parse_error unexpected_null_character - append_to_current_attr_value Rune(0xFFFD) - of eof: - parse_error eof_in_tag - emit_eof - else: append_to_current_attr_value tokenizer.curr - - of ATTRIBUTE_VALUE_UNQUOTED: - case c - of whitespace: switch_state BEFORE_ATTRIBUTE_NAME - of '&': switch_state_return CHARACTER_REFERENCE - of '>': switch_state DATA - of null: - parse_error unexpected_null_character - 
append_to_current_attr_value Rune(0xFFFD) - of '"', '\'', '<', '=', '`': - parse_error unexpected_character_in_unquoted_attribute_value - append_to_current_attr_value c - of eof: - parse_error eof_in_tag - emit_eof - else: append_to_current_attr_value tokenizer.curr - - of AFTER_ATTRIBUTE_VALUE_QUOTED: - case c - of whitespace: - switch_state BEFORE_ATTRIBUTE_NAME - of '/': - switch_state SELF_CLOSING_START_TAG - of '>': - switch_state DATA - emit_tok - of eof: - parse_error eof_in_tag - emit_eof - else: append_to_current_attr_value tokenizer.curr - - of SELF_CLOSING_START_TAG: - case c - of '>': - tokenizer.tok.selfclosing = true - switch_state DATA - emit '>' - of eof: - parse_error eof_in_tag - emit_eof - else: - parse_error unexpected_solidus_in_tag - reconsume_in BEFORE_ATTRIBUTE_NAME - - of BOGUS_COMMENT: - assert tokenizer.tok.t == COMMENT - case c - of '>': - switch_state DATA - emit_tok - of eof: - emit_tok - emit_eof - of null: parse_error unexpected_null_character - else: tokenizer.tok.data &= tokenizer.curr - - of MARKUP_DECLARATION_OPEN: # note: rewritten to fit case model as we consume a char anyway - has_anything_else - case c - of '-': - if peek_char == '-': - new_token Token(t: COMMENT) - tokenizer.state = COMMENT_START - consume_and_discard 1 - else: anything_else - of 'D', 'd': - if peek_str_nocase("OCTYPE"): - consume_and_discard "OCTYPE".len - switch_state DOCTYPE - else: anything_else - of '[': - if peek_str("CDATA["): - consume_and_discard "CDATA[".len - if has_adjusted_current_node: #TODO and it is not an element in the HTML namespace - switch_state CDATA_SECTION - else: - parse_error cdata_in_html_content - new_token Token(t: COMMENT, data: "[CDATA[") - switch_state BOGUS_COMMENT - else: anything_else - else: - parse_error incorrectly_opened_comment - new_token Token(t: COMMENT) - reconsume_in BOGUS_COMMENT - - of COMMENT_START: - case c - of '-': switch_state COMMENT_START_DASH - of '>': - parse_error abrupt_closing_of_empty_comment - 
switch_state DATA - emit_tok - else: reconsume_in COMMENT - - of COMMENT_START_DASH: - case c - of '-': switch_state COMMENT_END - of '>': - parse_error abrupt_closing_of_empty_comment - switch_state DATA - emit_tok - of eof: - parse_error eof_in_comment - emit_tok - emit_eof - else: - tokenizer.tok.data &= '-' - reconsume_in COMMENT - - of COMMENT: - case c - of '<': - tokenizer.tok.data &= c - switch_state COMMENT_LESS_THAN_SIGN - of '-': switch_state COMMENT_END_DASH - of null: - parse_error unexpected_null_character - tokenizer.tok.data &= Rune(0xFFFD) - of eof: - parse_error eof_in_comment - emit_tok - emit_eof - else: tokenizer.tok.data &= tokenizer.curr - - of COMMENT_LESS_THAN_SIGN: - case c - of '!': - tokenizer.tok.data &= c - switch_state COMMENT_LESS_THAN_SIGN_BANG - of '<': tokenizer.tok.data &= c - else: reconsume_in COMMENT - - of COMMENT_LESS_THAN_SIGN_BANG: - case c - of '-': switch_state COMMENT_LESS_THAN_SIGN_BANG_DASH - else: reconsume_in COMMENT - - of COMMENT_LESS_THAN_SIGN_BANG_DASH: - case c - of '-': switch_state COMMENT_LESS_THAN_SIGN_BANG_DASH_DASH - else: reconsume_in COMMENT_END_DASH - - of COMMENT_LESS_THAN_SIGN_BANG_DASH_DASH: - case c - of '>', eof: reconsume_in COMMENT_END - else: - parse_error nested_comment - reconsume_in COMMENT_END - - of COMMENT_END_DASH: - case c - of '-': switch_state COMMENT_END - of eof: - parse_error eof_in_comment - emit_tok - emit_eof - else: - tokenizer.tok.data &= '-' - reconsume_in COMMENT - - of COMMENT_END: - case c - of '>': switch_state DATA - of '!': switch_state COMMENT_END_BANG - of '-': tokenizer.tok.data &= '-' - of eof: - parse_error eof_in_comment - emit_tok - emit_eof - else: - tokenizer.tok.data &= "--" - reconsume_in COMMENT - - of COMMENT_END_BANG: - case c - of '-': - tokenizer.tok.data &= "--!" 
- switch_state COMMENT_END_DASH - of '>': - parse_error incorrectly_closed_comment - switch_state DATA - emit_tok - of eof: - parse_error eof_in_comment - emit_tok - emit_eof - else: - tokenizer.tok.data &= "--!" - reconsume_in COMMENT - - of DOCTYPE: - case c - of whitespace: switch_state BEFORE_DOCTYPE_NAME - of '>': reconsume_in BEFORE_DOCTYPE_NAME - of eof: - parse_error eof_in_doctype - new_token Token(t: DOCTYPE, quirks: true) - emit_tok - emit_eof - else: - parse_error missing_whitespace_before_doctype_name - reconsume_in BEFORE_DOCTYPE_NAME - - of BEFORE_DOCTYPE_NAME: - case c - of whitespace: discard - of AsciiUpperAlpha: - new_token Token(t: DOCTYPE, name: some($c.tolower())) - switch_state DOCTYPE_NAME - of null: - parse_error unexpected_null_character - new_token Token(t: DOCTYPE, name: some($Rune(0xFFFD))) - of '>': - parse_error missing_doctype_name - new_token Token(t: DOCTYPE, quirks: true) - switch_state DATA - emit_tok - of eof: - parse_error eof_in_doctype - new_token Token(t: DOCTYPE, quirks: true) - emit_tok - emit_eof - else: - new_token Token(t: DOCTYPE, name: some($tokenizer.curr)) - switch_state DOCTYPE_NAME - - of DOCTYPE_NAME: - case c - of whitespace: switch_state AFTER_DOCTYPE_NAME - of '>': - switch_state DATA - emit_tok - of AsciiUpperAlpha: - tokenizer.tok.name.get &= c.tolower() - of null: - parse_error unexpected_null_character - tokenizer.tok.name.get &= Rune(0xFFFD) - of eof: - parse_error eof_in_doctype - tokenizer.tok.quirks = true - emit_tok - emit_eof - else: - tokenizer.tok.name.get &= tokenizer.curr - - of AFTER_DOCTYPE_NAME: # note: rewritten to fit case model as we consume a char anyway - has_anything_else - case c - of whitespace: discard - of '>': - switch_state DATA - emit_tok - of eof: - parse_error eof_in_doctype - tokenizer.tok.quirks = true - emit_tok - emit_eof - of 'p', 'P': - if peek_str("UBLIC"): - consume_and_discard "UBLIC".len - switch_state AFTER_DOCTYPE_PUBLIC_KEYWORD + ) + "<a>" => (block: + var element: 
Element = nil + for i in countdown(parser.activeFormatting.high, 0): + let format = parser.activeFormatting[i] + if format[0] == nil: + break + if format[0].tagType == TAG_A: + element = format[0] + break + if element != nil: + parse_error + if parser.adoptionAgencyAlgorithm(token): + any_other_end_tag + return + for i in 0..parser.activeFormatting.high: + if parser.activeFormatting[i][0] == element: + parser.activeFormatting.del(i) + break + for i in 0..parser.openElements.high: + if parser.openElements[i] == element: + parser.openElements.del(i) + break + parser.reconstructActiveFormatting() + let element = parser.insertHTMLElement(token) + parser.pushOntoActiveFormatting(element, token) + ) + ("<b>", "<big>", "<code>", "<em>", "<font>", "<i>", "<s>", "<small>", + "<strike>", "<strong>", "<tt>", "<u>") => (block: + parser.reconstructActiveFormatting() + let element = parser.insertHTMLElement(token) + parser.pushOntoActiveFormatting(element, token) + ) + "<nobr>" => (block: + parser.reconstructActiveFormatting() + if parser.openElements.hasElementInScope(TAG_NOBR): + parse_error + if parser.adoptionAgencyAlgorithm(token): + any_other_end_tag + return + parser.reconstructActiveFormatting() + let element = parser.insertHTMLElement(token) + parser.pushOntoActiveFormatting(element, token) + ) + ("</a>", "</b>", "</big>", "</code>", "</em>", "</font>", "</i>", + "</nobr>", "</s>", "</small>", "</strike>", "</strong>", "</tt>", + "</u>") => (block: + if parser.adoptionAgencyAlgorithm(token): + any_other_end_tag + return + ) + ("<applet>", "<marquee>", "<object>") => (block: + parser.reconstructActiveFormatting() + discard parser.insertHTMLElement(token) + parser.activeFormatting.add((nil, nil)) + parser.framesetOk = false + ) + ("</applet>", "</marquee>", "</object>") => (block: + if not parser.openElements.hasElementInScope(token.tagtype): + parse_error else: - anything_else - of 's', 'S': - if peek_str("YSTEM"): - consume_and_discard "YSTEM".len - switch_state 
AFTER_DOCTYPE_SYSTEM_KEYWORD + parser.generateImpliedEndTags() + if parser.currentNode.tagType != token.tagtype: parse_error + while parser.openElements.pop().tagType != token.tagtype: discard + parser.clearActiveFormattingTillMarker() + ) + "<table>" => (block: + if parser.document.mode != QUIRKS: + if parser.openElements.hasElementInButtonScope(TAG_P): + parser.closeP() + discard parser.insertHTMLElement(token) + parser.framesetOk = false + parser.insertionMode = IN_TABLE + ) + "</br>" => (block: + parse_error + parser.processInHTMLContent(Token(t: START_TAG, tagtype: TAG_BR)) + ) + ("<area>", "<br>", "<embed>", "<img>", "<keygen>", "<wbr>") => (block: + parser.reconstructActiveFormatting() + discard parser.insertHTMLElement(token) + pop_current_node + parser.framesetOk = false + ) + "<input>" => (block: + parser.reconstructActiveFormatting() + discard parser.insertHTMLElement(token) + pop_current_node + if not token.attrs.getOrDefault("type").equalsIgnoreCase("hidden"): + parser.framesetOk = false + ) + ("<param>", "<source>", "<track>") => (block: + discard parser.insertHTMLElement(token) + pop_current_node + ) + "<hr>" => (block: + if parser.openElements.hasElementInButtonScope(TAG_P): + parser.closeP() + discard parser.insertHTMLElement(token) + pop_current_node + parser.framesetOk = false + ) + "<image>" => (block: + #TODO ew + let token = Token(t: START_TAG, tagtype: TAG_IMG, tagname: "img", selfclosing: token.selfclosing, attrs: token.attrs) + reprocess token + ) + "<textarea>" => (block: + discard parser.insertHTMLElement(token) + parser.ignoreLF = true + parser.tokenizer.state = RCDATA + parser.oldInsertionMode = parser.insertionMode + parser.framesetOk = false + parser.insertionMode = TEXT + ) + "<xmp>" => (block: + if parser.openElements.hasElementInButtonScope(TAG_P): + parser.closeP() + parser.reconstructActiveFormatting() + parser.framesetOk = false + parser.genericRawtextElementParsingAlgorithm(token) + ) + "<iframe>" => (block: + parser.framesetOk 
= false + parser.genericRawtextElementParsingAlgorithm(token) + ) + "<noembed>" => (block: + parser.genericRawtextElementParsingAlgorithm(token) + ) + "<noscript>" => (block: + if parser.scripting: + parser.genericRawtextElementParsingAlgorithm(token) else: - anything_else - else: - parse_error invalid_character_sequence_after_doctype_name - tokenizer.tok.quirks = true - reconsume_in BOGUS_DOCTYPE - - of AFTER_DOCTYPE_PUBLIC_KEYWORD: - case c - of whitespace: switch_state BEFORE_DOCTYPE_PUBLIC_IDENTIFIER - of '"': - parse_error missing_whitespace_after_doctype_public_keyword - tokenizer.tok.pubid = some("") - switch_state DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED - of '>': - parse_error missing_doctype_public_identifier - tokenizer.tok.quirks = true - switch_state DATA - emit_tok - of eof: - parse_error eof_in_doctype - tokenizer.tok.quirks = true - emit_tok - emit_eof - else: - parse_error missing_quote_before_doctype_public_identifier - tokenizer.tok.quirks = true - reconsume_in BOGUS_DOCTYPE - - of BEFORE_DOCTYPE_PUBLIC_IDENTIFIER: - case c - of whitespace: discard - of '"': - tokenizer.tok.pubid = some("") - switch_state DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED - of '\'': - tokenizer.tok.pubid = some("") - switch_state DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED - of '>': - parse_error missing_doctype_public_identifier - tokenizer.tok.quirks = true - switch_state DATA - emit_tok - of eof: - parse_error eof_in_doctype - tokenizer.tok.quirks = true - emit_tok - emit_eof - else: - parse_error missing_quote_before_doctype_public_identifier - tokenizer.tok.quirks = true - reconsume_in BOGUS_DOCTYPE - - of DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED: - case c - of '"': switch_state AFTER_DOCTYPE_PUBLIC_IDENTIFIER - of null: - parse_error unexpected_null_character - tokenizer.tok.pubid.get &= Rune(0xFFFD) - of '>': - parse_error abrupt_doctype_public_identifier - tokenizer.tok.quirks = true - switch_state DATA - emit_tok - of eof: - parse_error eof_in_doctype - 
tokenizer.tok.quirks = true - emit_tok - emit_eof - else: - tokenizer.tok.pubid.get &= tokenizer.curr - - of DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED: - case c - of '\'': switch_state AFTER_DOCTYPE_PUBLIC_IDENTIFIER - of null: - parse_error unexpected_null_character - tokenizer.tok.pubid.get &= Rune(0xFFFD) - of '>': - parse_error abrupt_doctype_public_identifier - tokenizer.tok.quirks = true - switch_state DATA - emit_tok - of eof: - parse_error eof_in_doctype - tokenizer.tok.quirks = true - emit_tok - emit_eof - else: - tokenizer.tok.pubid.get &= tokenizer.curr - - of AFTER_DOCTYPE_PUBLIC_IDENTIFIER: - case c - of whitespace: switch_state BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS - of '>': - switch_state DATA - emit_tok - of '"': - parse_error missing_whitespace_between_doctype_public_and_system_identifiers - tokenizer.tok.sysid = some("") - switch_state DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED - of '\'': - parse_error missing_whitespace_between_doctype_public_and_system_identifiers - tokenizer.tok.sysid = some("") - switch_state DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED - of eof: - parse_error eof_in_doctype - tokenizer.tok.quirks = true - emit_tok - emit_eof - else: - parse_error missing_quote_before_doctype_system_identifier - tokenizer.tok.quirks = true - reconsume_in BOGUS_DOCTYPE - - of BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS: - case c - of whitespace: discard - of '>': - switch_state DATA - emit_tok - of '"': - tokenizer.tok.sysid = some("") - switch_state DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED - of '\'': - tokenizer.tok.sysid = some("") - switch_state DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED - of eof: - parse_error eof_in_doctype - tokenizer.tok.quirks = true - emit_tok - emit_eof - else: - parse_error missing_quote_before_doctype_system_identifier - tokenizer.tok.quirks = true - reconsume_in BOGUS_DOCTYPE - - of AFTER_DOCTYPE_SYSTEM_KEYWORD: - case c - of whitespace: switch_state BEFORE_DOCTYPE_SYSTEM_IDENTIFIER - of '"': - parse_error 
missing_whitespace_after_doctype_system_keyword - tokenizer.tok.sysid = some("") - switch_state DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED - of '\'': - parse_error missing_whitespace_after_doctype_system_keyword - tokenizer.tok.sysid = some("") - switch_state DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED - of '>': - parse_error missing_doctype_system_identifier - tokenizer.tok.quirks = true - switch_state DATA - emit_tok - of eof: - parse_error eof_in_doctype - tokenizer.tok.quirks = true - emit_tok - emit_eof - else: - parse_error missing_quote_before_doctype_system_identifier - tokenizer.tok.quirks = true - reconsume_in BOGUS_DOCTYPE - - of BEFORE_DOCTYPE_SYSTEM_IDENTIFIER: - case c - of whitespace: discard - of '"': - tokenizer.tok.pubid = some("") - switch_state DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED - of '\'': - tokenizer.tok.pubid = some("") - switch_state DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED - of '>': - parse_error missing_doctype_system_identifier - tokenizer.tok.quirks = true - switch_state DATA - emit_tok - of eof: - parse_error eof_in_doctype - tokenizer.tok.quirks = true - emit_tok - emit_eof - else: - parse_error missing_quote_before_doctype_system_identifier - tokenizer.tok.quirks = true - reconsume_in BOGUS_DOCTYPE - - of DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED: - case c - of '"': switch_state AFTER_DOCTYPE_SYSTEM_IDENTIFIER - of null: - parse_error unexpected_null_character - tokenizer.tok.sysid.get &= Rune(0xFFFD) - of '>': - parse_error abrupt_doctype_system_identifier - tokenizer.tok.quirks = true - switch_state DATA - emit_tok - of eof: - parse_error eof_in_doctype - tokenizer.tok.quirks = true - emit_tok - emit_eof - else: - tokenizer.tok.sysid.get &= tokenizer.curr - - of DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED: - case c - of '\'': switch_state AFTER_DOCTYPE_SYSTEM_IDENTIFIER - of null: - parse_error unexpected_null_character - tokenizer.tok.sysid.get &= Rune(0xFFFD) - of '>': - parse_error abrupt_doctype_system_identifier - tokenizer.tok.quirks = 
true - switch_state DATA - emit_tok - of eof: - parse_error eof_in_doctype - tokenizer.tok.quirks = true - emit_tok - emit_eof - else: - tokenizer.tok.sysid.get &= tokenizer.curr - - of AFTER_DOCTYPE_SYSTEM_IDENTIFIER: - case c - of whitespace: discard - of '>': - switch_state DATA - emit_tok - of eof: - parse_error eof_in_doctype - tokenizer.tok.quirks = true - emit_tok - emit_eof - else: - parse_error unexpected_character_after_doctype_system_identifier - reconsume_in BOGUS_DOCTYPE - - of BOGUS_DOCTYPE: - case c - of '>': - switch_state DATA - emit_tok - of null: parse_error unexpected_null_character - of eof: - emit_tok - emit_eof - else: discard - - of CDATA_SECTION: - case c - of ']': switch_state CDATA_SECTION_BRACKET - of eof: - parse_error eof_in_cdata - emit_eof - else: - emit_current - - of CDATA_SECTION_BRACKET: - case c - of ']': switch_state CDATA_SECTION_END - of '>': switch_state DATA - else: - emit ']' - reconsume_in CDATA_SECTION - - of CDATA_SECTION_END: - case c - of ']': emit ']' - of '>': switch_state DATA - else: - emit ']' - emit ']' - reconsume_in CDATA_SECTION - - of CHARACTER_REFERENCE: - tokenizer.tmp = "&" - case c - of AsciiAlpha: reconsume_in NAMED_CHARACTER_REFERENCE - of '#': - tokenizer.tmp &= '#' - switch_state NUMERIC_CHARACTER_REFERENCE - else: - flush_code_points_consumed_as_a_character_reference - reconsume_in tokenizer.rstate - - of NAMED_CHARACTER_REFERENCE: - ignore_eof # we check for eof ourselves - tokenizer.reconsume() #TODO optimize this away - var buf = "" - var node = entityMap - var value = none(string) # last value - var match = true - #TODO interfacing with RadixNode is suffering - # plus this doesn't look very efficient either - while not tokenizer.atEof: - let c = tokenizer.consume() - buf &= c - if not node.hasPrefix(buf): - break - let prevnode = node - node = node{buf} - if node != prevnode: - buf = "" - if node.value.issome: - value = node.value - tokenizer.tmp &= tokenizer.curr - if value.issome: - if 
consumed_as_an_attribute and tokenizer.tmp[^1] != ';' and peek_char in {';'} + AsciiAlpha: - flush_code_points_consumed_as_a_character_reference - switch_state tokenizer.rstate + any_other_start_tag + ) + "<select>" => (block: + parser.reconstructActiveFormatting() + discard parser.insertHTMLElement(token) + parser.framesetOk = false + if parser.insertionMode in {IN_TABLE, IN_CAPTION, IN_TABLE_BODY, IN_CELL}: + parser.insertionMode = IN_SELECT_IN_TABLE else: - if tokenizer.tmp[^1] != ';': - parse_error missing_semicolon_after_character_reference_parse_error - tokenizer.tmp = node.value.get - flush_code_points_consumed_as_a_character_reference - switch_state tokenizer.rstate - else: - flush_code_points_consumed_as_a_character_reference - switch_state AMBIGUOUS_AMPERSAND_STATE - - of AMBIGUOUS_AMPERSAND_STATE: - case c - of AsciiAlpha: - if consumed_as_an_attribute: - append_to_current_attr_value c + parser.insertionMode = IN_SELECT + ) + ("<optgroup>", "<option>") => (block: + if parser.currentNode.tagType == TAG_OPTION: + pop_current_node + parser.reconstructActiveFormatting() + discard parser.insertHTMLElement(token) + ) + ("<rb>", "<rtc>") => (block: + if parser.openElements.hasElementInScope(TAG_RUBY): + parser.generateImpliedEndTags() + if parser.currentNode.tagType != TAG_RUBY: parse_error + discard parser.insertHTMLElement(token) + ) + ("<rp>", "<rt>") => (block: + if parser.openElements.hasElementInScope(TAG_RUBY): + parser.generateImpliedEndTags(TAG_RTC) + if parser.currentNode.tagType notin {TAG_RUBY, TAG_RTC}: parse_error + discard parser.insertHTMLElement(token) + ) + #NOTE <math> (not implemented) + #TODO <svg> (SVG) + ("<caption>", "<col>", "<colgroup>", "<frame>", "<head>", "<tbody>", + "<td>", "<tfoot>", "<th>", "<thead>", "<tr>") => (block: parse_error) + TokenType.START_TAG => (block: any_other_start_tag) + TokenType.END_TAG => (block: any_other_end_tag) + + of TEXT: + match token: + TokenType.CHARACTER_ASCII => (block: + assert token.c != '\0' + 
parser.insertCharacter(token.c) + ) + TokenType.CHARACTER => (block: + parser.insertCharacter(token.r) + ) + TokenType.EOF => (block: + parse_error + if parser.currentNode.tagType == TAG_SCRIPT: + HTMLScriptElement(parser.currentNode).alreadyStarted = true + pop_current_node + parser.insertionMode = parser.oldInsertionMode + reprocess token + ) + "</script>" => (block: + #TODO microtask + let script = parser.currentNode + pop_current_node + parser.insertionMode = parser.oldInsertionMode + #TODO document.write() ? + #TODO prepare script element + #TODO uh implement scripting or something + ) + TokenType.END_TAG => (block: + pop_current_node + parser.insertionMode = parser.oldInsertionMode + ) + + of IN_TABLE: + template clear_the_stack_back_to_a_table_context() = + while parser.currentNode.tagType notin {TAG_TABLE, TAG_TEMPLATE, TAG_HTML}: + pop_current_node + + match token: + (TokenType.CHARACTER_ASCII, TokenType.CHARACTER) => (block: + if parser.currentNode.tagType in {TAG_TABLE, TAG_TBODY, TAG_TFOOT, TAG_THEAD, TAG_TR}: + parser.pendingTableChars = "" + parser.pendingTableCharsWhitespace = true + parser.oldInsertionMode = parser.insertionMode + parser.insertionMode = IN_TABLE_TEXT + reprocess token + else: # anything else + parse_error + parser.fosterParenting = true + parser.processInHTMLContent(token, IN_BODY) + parser.fosterParenting = false + ) + TokenType.COMMENT => (block: parser.insertComment(token)) + TokenType.DOCTYPE => (block: parse_error) + "<caption>" => (block: + clear_the_stack_back_to_a_table_context + parser.activeFormatting.add((nil, nil)) + discard parser.insertHTMLElement(token) + parser.insertionMode = IN_CAPTION + ) + "<colgroup>" => (block: + clear_the_stack_back_to_a_table_context + discard parser.insertHTMLElement(Token(t: START_TAG, tagtype: TAG_COLGROUP)) + parser.insertionMode = IN_COLUMN_GROUP + ) + ("<tbody>", "<tfoot>", "<thead>") => (block: + clear_the_stack_back_to_a_table_context + discard parser.insertHTMLElement(token) + 
parser.insertionMode = IN_TABLE_BODY + ) + ("<td>", "<th>", "<tr>") => (block: + clear_the_stack_back_to_a_table_context + discard parser.insertHTMLElement(Token(t: START_TAG, tagtype: TAG_TBODY)) + parser.insertionMode = IN_TABLE_BODY + ) + "<table>" => (block: + parse_error + if not parser.openElements.hasElementInScope(TAG_TABLE): + discard else: - emit_current - of ';': - parse_error unknown_named_character_reference - reconsume_in tokenizer.rstate - else: reconsume_in tokenizer.rstate - - of NUMERIC_CHARACTER_REFERENCE: - tokenizer.code = 0 - case c - of 'x', 'X': - tokenizer.tmp &= c - switch_state HEXADECIMAL_CHARACTER_REFERENCE_START - else: reconsume_in DECIMAL_CHARACTER_REFERENCE_START - - of HEXADECIMAL_CHARACTER_REFERENCE_START: - case c - of AsciiHexDigit: reconsume_in HEXADECIMAL_CHARACTER_REFERENCE - else: - parse_error absence_of_digits_in_numeric_character_reference - flush_code_points_consumed_as_a_character_reference - reconsume_in tokenizer.rstate - - of DECIMAL_CHARACTER_REFERENCE_START: - case c - of AsciiDigit: reconsume_in DECIMAL_CHARACTER_REFERENCE - else: - parse_error absence_of_digits_in_numeric_character_reference - flush_code_points_consumed_as_a_character_reference - reconsume_in tokenizer.rstate - - of HEXADECIMAL_CHARACTER_REFERENCE: - case c - of AsciiHexDigit: # note: merged digit, upper hex, lower hex - tokenizer.code *= 0x10 - tokenizer.code += hexValue(c) - of ';': switch_state NUMERIC_CHARACTER_REFERENCE_END - else: - parse_error missing_semicolon_after_character_reference - reconsume_in NUMERIC_CHARACTER_REFERENCE_END - - of DECIMAL_CHARACTER_REFERENCE: - case c - of AsciiDigit: - tokenizer.code *= 10 - tokenizer.code += decValue(c) - of ';': switch_state NUMERIC_CHARACTER_REFERENCE_END + while parser.openElements.pop().tagType != TAG_TABLE: discard + parser.resetInsertionMode() + reprocess token + ) + "</table>" => (block: + if not parser.openElements.hasElementInScope(TAG_TABLE): + parse_error + else: + while 
parser.openElements.pop().tagType != TAG_TABLE: discard + parser.resetInsertionMode() + ) + ("</body>", "</caption>", "</col>", "</colgroup>", "</html>", "</tbody>", + "</td>", "</tfoot>", "</th>", "</thead>", "</tr>") => (block: + parse_error + ) + ("<style>", "<script>", "<template>", "</template>") => (block: + parser.processInHTMLContent(token, IN_HEAD) + ) + "<input>" => (block: + if not token.attrs.getOrDefault("type").equalsIgnoreCase("hidden"): + # anything else + parse_error + parser.fosterParenting = true + parser.processInHTMLContent(token, IN_BODY) + parser.fosterParenting = false + else: + parse_error + discard parser.insertHTMLElement(token) + pop_current_node + ) + "<form>" => (block: + parse_error + if parser.form != nil or parser.openElements.hasElement(TAG_TEMPLATE): + discard + else: + parser.form = HTMLFormElement(parser.insertHTMLElement(token)) + pop_current_node + ) + TokenType.EOF => (block: + parser.processInHTMLContent(token, IN_BODY) + ) + _ => (block: + parse_error + parser.fosterParenting = true + parser.processInHTMLContent(token, IN_BODY) + parser.fosterParenting = false + ) + + of IN_TABLE_TEXT: + match token: + '\0' => (block: parse_error) + TokenType.CHARACTER_ASCII => (block: + if token.c notin AsciiWhitespace: + parser.pendingTableCharsWhitespace = false + parser.pendingTableChars &= token.c + ) + TokenType.CHARACTER => (block: + parser.pendingTableChars &= token.r + parser.pendingTableCharsWhitespace = false + ) + _ => (block: + if not parser.pendingTableCharsWhitespace: + # I *think* this is effectively the same thing the specification wants... 
+ parse_error + parser.fosterParenting = true + parser.reconstructActiveFormatting() + parser.insertCharacter(token.c) + parser.framesetOk = false + parser.fosterParenting = false + else: + parser.insertCharacter(parser.pendingTableChars) + parser.insertionMode = parser.oldInsertionMode + reprocess token + ) + + of IN_CAPTION: + match token: + "</caption>" => (block: + if parser.openElements.hasElementInTableScope(TAG_CAPTION): + parse_error + else: + parser.generateImpliedEndTags() + if parser.currentNode.tagType != TAG_CAPTION: parse_error + while parser.openElements.pop().tagType != TAG_CAPTION: discard + parser.clearActiveFormattingTillMarker() + parser.insertionMode = IN_TABLE + ) + ("<caption>", "<col>", "<colgroup>", "<tbody>", "<td>", "<tfoot>", + "<th>", "<thead>", "<tr>", "</table>") => (block: + if not parser.openElements.hasElementInTableScope(TAG_CAPTION): + parse_error + else: + parser.generateImpliedEndTags() + if parser.currentNode.tagType != TAG_CAPTION: parse_error + parser.clearActiveFormattingTillMarker() + parser.insertionMode = IN_TABLE + reprocess token + ) + ("</body>", "</col>", "</colgroup>", "</html>", "</tbody>", "</td>", + "</tfoot>", "</th>", "</thead>", "</tr>") => (block: parse_error) + _ => (block: parser.processInHTMLContent(token, IN_BODY)) + + of IN_COLUMN_GROUP: + match token: + AsciiWhitespace => (block: parser.insertCharacter(token.c)) + TokenType.COMMENT => (block: parser.insertComment(token)) + TokenType.DOCTYPE => (block: parse_error) + "<html>" => (block: parser.processInHTMLContent(token, IN_BODY)) + "<col>" => (block: + discard parser.insertHTMLElement(token) + pop_current_node + ) + "</colgroup>" => (block: + if parser.currentNode.tagType != TAG_COLGROUP: + parse_error + else: + pop_current_node + parser.insertionMode = IN_TABLE + ) + "</col>" => (block: parse_error) + ("<template>", "</template>") => (block: + parser.processInHTMLContent(token, IN_HEAD) + ) + TokenType.EOF => (block: parser.processInHTMLContent(token, 
IN_BODY)) + _ => (block: + if parser.currentNode.tagType != TAG_COLGROUP: + parse_error + else: + pop_current_node + parser.insertionMode = IN_TABLE + reprocess token + ) + + of IN_TABLE_BODY: + template clear_the_stack_back_to_a_table_body_context() = + while parser.currentNode.tagType notin {TAG_TBODY, TAG_TFOOT, TAG_THEAD, TAG_TEMPLATE, TAG_HTML}: + pop_current_node + + match token: + "<tr>" => (block: + clear_the_stack_back_to_a_table_body_context + discard parser.insertHTMLElement(token) + parser.insertionMode = IN_ROW + ) + ("<th>", "<td>") => (block: + parse_error + clear_the_stack_back_to_a_table_body_context + discard parser.insertHTMLElement(Token(t: START_TAG, tagtype: TAG_TR)) + parser.insertionMode = IN_ROW + reprocess token + ) + ("</tbody>", "</tfoot>", "</thead>") => (block: + if not parser.openElements.hasElementInTableScope(token.tagtype): + parse_error + else: + clear_the_stack_back_to_a_table_body_context + pop_current_node + parser.insertionMode = IN_TABLE + ) + ("<caption>", "<col>", "<colgroup>", "<tbody>", "<tfoot>", "<thead>", + "</table>") => (block: + if not parser.openElements.hasElementInTableScope({TAG_TBODY, TAG_THEAD, TAG_TFOOT}): + parse_error + else: + clear_the_stack_back_to_a_table_body_context + pop_current_node + parser.insertionMode = IN_TABLE + reprocess token + ) + ("</body>", "</caption>", "</col>", "</colgroup>", "</html>", "</td>", + "</th>", "</tr>") => (block: + parse_error + ) + _ => (block: parser.processInHTMLContent(token, IN_TABLE)) + + of IN_ROW: + template clear_the_stack_back_to_a_table_row_context() = + while parser.currentNode.tagType notin {TAG_TR, TAG_TEMPLATE, TAG_HTML}: + pop_current_node + + match token: + ("<th>", "<td>") => (block: + clear_the_stack_back_to_a_table_row_context + discard parser.insertHTMLElement(token) + parser.insertionMode = IN_CELL + parser.activeFormatting.add((nil, nil)) + ) + "</tr>" => (block: + if not parser.openElements.hasElementInTableScope(TAG_TR): + parse_error + else: + 
clear_the_stack_back_to_a_table_row_context + pop_current_node + parser.insertionMode = IN_TABLE_BODY + ) + ("<caption>", "<col>", "<colgroup>", "<tbody>", "<tfoot>", "<thead>", + "<tr>", "</table>") => (block: + if not parser.openElements.hasElementInTableScope(TAG_TR): + parse_error + else: + clear_the_stack_back_to_a_table_row_context + pop_current_node + parser.insertionMode = IN_TABLE_BODY + reprocess token + ) + ("</tbody>", "</tfoot>", "</thead>") => (block: + if not parser.openElements.hasElementInTableScope(token.tagtype): + parse_error + elif not parser.openElements.hasElementInTableScope(TAG_TR): + discard + else: + clear_the_stack_back_to_a_table_row_context + pop_current_node + parser.insertionMode = IN_BODY + reprocess token + ) + ("</body>", "</caption>", "</col>", "</colgroup>", "</html>", "</td>", + "</th>") => (block: parse_error) + _ => (block: parser.processInHTMLContent(token, IN_TABLE)) + + of IN_CELL: + template close_cell() = + parser.generateImpliedEndTags() + if parser.currentNode.tagType notin {TAG_TD, TAG_TH}: parse_error + while parser.openElements.pop().tagType notin {TAG_TD, TAG_TH}: discard + parser.clearActiveFormattingTillMarker() + parser.insertionMode = IN_ROW + + match token: + ("</td>", "</th>") => (block: + if not parser.openElements.hasElementInTableScope(token.tagtype): + parse_error + else: + parser.generateImpliedEndTags() + if parser.currentNode.tagType != token.tagtype: parse_error + while parser.openElements.pop().tagType != token.tagtype: discard + parser.clearActiveFormattingTillMarker() + parser.insertionMode = IN_ROW + ) + ("<caption>", "<col>", "<colgroup>", "<tbody>", "<td>", "<tfoot>", + "<thead>", "<tr>") => (block: + if not parser.openElements.hasElementInTableScope({TAG_TD, TAG_TH}): + parse_error + else: + close_cell + ) + ("</body>", "</caption>", "</col>", "</colgroup>", + "</html>") => (block: parse_error) + ("</table>", "</tbody>", "</tfoot>", "</thead>", "</tr>") => (block: + if not 
parser.openElements.hasElementInTableScope(token.tagtype): + parse_error + else: + close_cell + reprocess token + ) + _ => (block: parser.processInHTMLContent(token, IN_BODY)) + + of IN_SELECT: + match token: + '\0' => (block: parse_error) + TokenType.CHARACTER_ASCII => (block: parser.insertCharacter(token.c)) + TokenType.CHARACTER => (block: parser.insertCharacter(token.r)) + TokenType.DOCTYPE => (block: parse_error) + "<html>" => (block: parser.processInHTMLContent(token, IN_BODY)) + "<option>" => (block: + if parser.currentNode.tagType == TAG_OPTION: + pop_current_node + discard parser.insertHTMLElement(token) + ) + "<optgroup>" => (block: + if parser.currentNode.tagType == TAG_OPTION: + pop_current_node + if parser.currentNode.tagType == TAG_OPTGROUP: + pop_current_node + discard parser.insertHTMLElement(token) + ) + "</optgroup>" => (block: + if parser.currentNode.tagType == TAG_OPTION: + if parser.openElements.len > 1 and parser.openElements[^2].tagType == TAG_OPTGROUP: + pop_current_node + if parser.currentNode.tagType == TAG_OPTGROUP: + pop_current_node + else: + parse_error + ) + "</option>" => (block: + if parser.currentNode.tagType == TAG_OPTION: + pop_current_node + else: + parse_error + ) + "</select>" => (block: + if not parser.openElements.hasElementInSelectScope(TAG_SELECT): + parse_error + else: + while parser.openElements.pop().tagType != TAG_SELECT: discard + parser.resetInsertionMode() + ) + ("<input>", "<keygen>", "<textarea>") => (block: + parse_error + if not parser.openElements.hasElementInSelectScope(TAG_SELECT): + discard + else: + while parser.openElements.pop().tagType != TAG_SELECT: discard + parser.resetInsertionMode() + reprocess token + ) + ("<script>", "<template>", "</template>") => (block: parser.processInHTMLContent(token, IN_HEAD)) + TokenType.EOF => (block: parser.processInHTMLContent(token, IN_BODY)) + _ => (block: parse_error) + + of IN_SELECT_IN_TABLE: + match token: + ("<caption>", "<table>", "<tbody>", "<tfoot>", 
"<thead>", "<tr>", "<td>", + "<th>") => (block: + parse_error + while parser.openElements.pop().tagType != TAG_SELECT: discard + parser.resetInsertionMode() + reprocess token + ) + ("</caption>", "</table>", "</tbody>", "</tfoot>", "</thead>", "</tr>", + "</td>", "</th>") => (block: + parse_error + if not parser.openElements.hasElementInTableScope(token.tagtype): + discard + else: + while parser.openElements.pop().tagType != TAG_SELECT: discard + parser.resetInsertionMode() + reprocess token + ) + _ => (block: parser.processInHTMLContent(token, IN_SELECT)) + + of IN_TEMPLATE: + match token: + (TokenType.CHARACTER_ASCII, TokenType.CHARACTER, TokenType.DOCTYPE) => (block: + parser.processInHTMLContent(token, IN_BODY) + ) + ("<base>", "<basefont>", "<bgsound>", "<link>", "<meta>", "<noframes>", + "<script>", "<style>", "<template>", "<title>", "</template>") => (block: + parser.processInHTMLContent(token, IN_HEAD) + ) + ("<caption>", "<colgroup>", "<tbody>", "<tfoot>", "<thead>") => (block: + discard parser.templateModes.pop() + parser.templateModes.add(IN_TABLE) + parser.insertionMode = IN_TABLE + reprocess token + ) + "<col>" => (block: + discard parser.templateModes.pop() + parser.templateModes.add(IN_COLUMN_GROUP) + parser.insertionMode = IN_COLUMN_GROUP + reprocess token + ) + "<tr>" => (block: + discard parser.templateModes.pop() + parser.templateModes.add(IN_TABLE_BODY) + parser.insertionMode = IN_TABLE_BODY + reprocess token + ) + ("<td>", "<th>") => (block: + discard parser.templateModes.pop() + parser.templateModes.add(IN_ROW) + parser.insertionMode = IN_ROW + reprocess token + ) + TokenType.START_TAG => (block: + discard parser.templateModes.pop() + parser.templateModes.add(IN_BODY) + parser.insertionMode = IN_BODY + reprocess token + ) + TokenType.END_TAG => (block: parse_error) + TokenType.EOF => (block: + if not parser.openElements.hasElement(TAG_TEMPLATE): + discard # stop + else: + parse_error + while parser.openElements.pop().tagType != TAG_TEMPLATE: 
discard + parser.clearActiveFormattingTillMarker() + discard parser.templateModes.pop() + parser.resetInsertionMode() + reprocess token + ) + + of AFTER_BODY: + match token: + AsciiWhitespace => (block: parser.processInHTMLContent(token, IN_BODY)) + TokenType.COMMENT => (block: parser.insertComment(token, last_child_of(parser.openElements[0]))) + TokenType.DOCTYPE => (block: parse_error) + "<html>" => (block: parser.processInHTMLContent(token, IN_BODY)) + "</html>" => (block: + if parser.fragment: + parse_error + else: + parser.insertionMode = AFTER_AFTER_BODY + ) + TokenType.EOF => (block: discard) # stop + _ => (block: + parse_error + parser.insertionMode = IN_BODY + reprocess token + ) + + of IN_FRAMESET: + match token: + AsciiWhitespace => (block: parser.insertCharacter(token.c)) + TokenType.COMMENT => (block: parser.insertComment(token)) + TokenType.DOCTYPE => (block: parse_error) + "<html>" => (block: parser.processInHTMLContent(token, IN_BODY)) + "<frameset>" => (block: + if parser.currentNode == parser.document.html: + parse_error + else: + pop_current_node + if not parser.fragment and parser.currentNode.tagType != TAG_FRAMESET: + parser.insertionMode = AFTER_FRAMESET + ) + "<frame>" => (block: + discard parser.insertHTMLElement(token) + pop_current_node + ) + "<noframes>" => (block: parser.processInHTMLContent(token, IN_HEAD)) + TokenType.EOF => (block: + if parser.currentNode != parser.document.html: parse_error + # stop + ) + _ => (block: parse_error) + + of AFTER_FRAMESET: + match token: + AsciiWhitespace => (block: parser.insertCharacter(token.c)) + TokenType.COMMENT => (block: parser.insertComment(token)) + TokenType.DOCTYPE => (block: parse_error) + "<html>" => (block: parser.processInHTMLContent(token, IN_BODY)) + "</html>" => (block: parser.insertionMode = AFTER_AFTER_FRAMESET) + "<noframes>" => (block: parser.processInHTMLContent(token, IN_HEAD)) + TokenType.EOF => (block: discard) # stop + _ => (block: parse_error) + + of AFTER_AFTER_BODY: + 
match token: + TokenType.COMMENT => (block: parser.insertComment(token, last_child_of(parser.document))) + (TokenType.DOCTYPE, AsciiWhitespace, "<html>") => (block: parser.processInHTMLContent(token, IN_BODY)) + TokenType.EOF => (block: discard) # stop + _ => (block: + parse_error + parser.insertionMode = IN_BODY + reprocess token + ) + + of AFTER_AFTER_FRAMESET: + match token: + TokenType.COMMENT => (block: parser.insertComment(token, last_child_of(parser.document))) + (TokenType.DOCTYPE, AsciiWhitespace, "<html>") => (block: parser.processInHTMLContent(token, IN_BODY)) + TokenType.EOF => (block: discard) # stop + "<noframes>" => (block: parser.processInHTMLContent(token, IN_HEAD)) + _ => (block: parse_error) + +proc processInForeignContent(parser: var HTML5Parser, token: Token) = + macro `=>`(v: typed, body: untyped): untyped = + quote do: + discard (`v`, proc() = `body`) + template script_end_tag() = + pop_current_node + #TODO document.write (?) + #TODO SVG + template any_other_end_tag() = + if parser.currentNode.localName != token.tagname: parse_error + for i in countdown(parser.openElements.high, 1): + let node = parser.openElements[i] + if node.localName == token.tagname: + while parser.openElements.pop() != node: discard + break + if node.namespace == Namespace.HTML: break + parser.processInHTMLContent(token) + + + match token: + '\0' => (block: + parse_error + parser.insertCharacter(Rune(0xFFFD)) + ) + AsciiWhitespace => (block: parser.insertCharacter(token.c)) + TokenType.CHARACTER_ASCII => (block: parser.insertCharacter(token.c)) + TokenType.CHARACTER => (block: parser.insertCharacter(token.r)) + TokenType.DOCTYPE => (block: parse_error) + ("<b>", "<big>", "<blockquote>", "<body>", "<br>", "<center>", "<code>", + "<dd>", "<div>", "<dl>", "<dt>", "<em>", "<embed>", "<h1>", "<h2>", "<h3>", + "<h4>", "<h5>", "<h6>", "<head>", "<hr>", "<i>", "<img>", "<li>", + "<listing>", "<menu>", "<meta>", "<nobr>", "<ol>", "<p>", "<pre>", + "<ruby>", "<s>", "<small>", 
"<span>", "<strong>", "<strike>", "<sub>", + "<sup>", "<table>", "<tt>", "<u>", "<ul>", "<var>") => (block: + parse_error + #NOTE MathML not implemented + while not (parser.currentNode.isHTMLIntegrationPoint() or parser.currentNode.inHTMLNamespace()): + pop_current_node + parser.processInHTMLContent(token) + ) + TokenType.START_TAG => (block: + #NOTE MathML not implemented + #TODO SVG + #TODO adjust foreign attributes + let element = parser.insertForeignElement(token, $parser.adjustedCurrentNode.namespace) + if token.selfclosing and element.inSVGNamespace(): + script_end_tag else: - parse_error missing_semicolon_after_character_reference - reconsume_in NUMERIC_CHARACTER_REFERENCE_END - - of NUMERIC_CHARACTER_REFERENCE_END: - ignore_eof # we reconsume anyway - case tokenizer.code - of 0x00: - parse_error null_character_reference - tokenizer.code = 0xFFFD - elif tokenizer.code > 0x10FFFF: - parse_error character_reference_outside_unicode_range - tokenizer.code = 0xFFFD - elif Rune(tokenizer.code).isSurrogate(): - parse_error surrogate_character_reference - tokenizer.code = 0xFFFD - elif Rune(tokenizer.code).isNonCharacter(): - parse_error noncharacter_character_reference - # do nothing - elif tokenizer.code in 0..255 and char(tokenizer.code) in ((Controls - AsciiWhitespace) + {chr(0x0D)}): - const ControlMapTable = [ - (0x80, 0x20AC), (0x82, 0x201A), (0x83, 0x0192), (0x84, 0x201E), - (0x85, 0x2026), (0x86, 0x2020), (0x87, 0x2021), (0x88, 0x02C6), - (0x89, 0x2030), (0x8A, 0x0160), (0x8B, 0x2039), (0x8C, 0x0152), - (0x8E, 0x017D), (0x91, 0x2018), (0x92, 0x2019), (0x93, 0x201C), - (0x94, 0x201D), (0x95, 0x2022), (0x96, 0x2013), (0x97, 0x2014), - (0x98, 0x02DC), (0x99, 0x2122), (0x9A, 0x0161), (0x9B, 0x203A), - (0x9C, 0x0153), (0x9E, 0x017E), (0x9F, 0x0178), - ].toTable() - if ControlMapTable.hasKey(tokenizer.code): - tokenizer.code = ControlMapTable[tokenizer.code] - tokenizer.tmp = $Rune(tokenizer.code) - flush_code_points_consumed_as_a_character_reference #TODO 
optimize so we flush directly - reconsume_in tokenizer.rstate # we unnecessarily consumed once so reconsume - -func inputSize*(str: string): int = - if str.len == 0: - return 20 - for c in str: - if not c.isDigit: - return 20 - return str.parseInt() - -#w3m's getescapecmd and parse_tag, transpiled to nim and heavily modified. -#(C) Copyright 1994-2002 by Akinori Ito -#(C) Copyright 2002-2011 by Akinori Ito, Hironori Sakamoto, Fumitoshi Ukai -# -#Use, modification and redistribution of this software is hereby granted, -#provided that this entire copyright notice is included on any copies of -#this software and applications and derivations thereof. -# -#This software is provided on an "as is" basis, without warranty of any -#kind, either expressed or implied, as to any matter including, but not -#limited to warranty of fitness of purpose, or merchantability, or -#results obtained from use of this software. -proc getescapecmd(buf: string, at: var int): string = - var i = at - - if buf[i] == '#': #num - inc i - var num: int - if buf[i].tolower() == 'x': #hex - inc i - if not isdigit(buf[i]): - at = i - return "&" - - num = hexValue(buf[i]) - inc i - while i < buf.len and hexValue(buf[i]) != -1: - num *= 0x10 - num += hexValue(buf[i]) - inc i - else: #dec - if not isDigit(buf[i]): - at = i - return "&" - - num = decValue(buf[i]) - inc i - while i < buf.len and isDigit(buf[i]): - num *= 10 - num += decValue(buf[i]) - inc i - - if buf[i] == ';': - inc i - at = i - return $(Rune(num)) - elif not isAlphaAscii(buf[i]): - return "&" - - var n = entityMap - var s = "" - while true: - s &= buf[i] - if not n.hasPrefix(s): - break - let pn = n - n = n{s} - if n != pn: - s = "" - inc i - - if n.value.issome: - at = i - return n.value.get - - return "&" - -type - DOMParsedTag = object - tagid: TagType - attrs: Table[string, string] - open: bool - -proc parse_tag(buf: string, at: var int): DOMParsedTag = - var tag = DOMParsedTag() - tag.open = true - - #Parse tag name - var tagname 
= "" - inc at - if buf[at] == '/': - inc at - tag.open = false - at = skipBlanks(buf, at) - - while at < buf.len and not buf[at].isWhitespace() and not (tag.open and buf[at] == '/') and buf[at] != '>' and buf[at].isAscii(): - tagname &= buf[at].tolower() - inc at - - tag.tagid = tagType(tagname) - at = skipBlanks(buf, at) - - while at < buf.len and buf[at] != '>': - var value = "" - var attrname = "" - while at < buf.len and buf[at] != '=' and not buf[at].isWhitespace() and buf[at] != '>': - var r: Rune - fastRuneAt(buf, at, r) - if r.isAscii(): - attrname &= char(r).tolower() + pop_current_node + ) + "</script>" => (block: + if parser.currentNode.namespace == Namespace.SVG and parser.currentNode.localName == "script": #TODO SVG + script_end_tag else: - attrname &= r - - at = skipBlanks(buf, at) - if at < buf.len and buf[at] == '=': - inc at - at = skipBlanks(buf, at) - if at < buf.len and (buf[at] == '"' or buf[at] == '\''): - let startc = buf[at] - inc at - while at < buf.len and buf[at] != startc: - if buf[at] == '&': - inc at - value &= getescapecmd(buf, at) - else: - value &= buf[at] - inc at - if at < buf.len: - inc at - elif at < buf.len: - while at < buf.len and not buf[at].isWhitespace() and buf[at] != '>': - var r: Rune - fastRuneAt(buf, at, r) - value &= $r - - if attrname.len > 0: - tag.attrs[attrname] = value - - while at < buf.len and buf[at] != '>': - inc at - - if at < buf.len and buf[at] == '>': - inc at - return tag - -proc insertNode(parent, node: Node) = - parent.childNodes.add(node) - - if parent.childNodes.len > 1: - let prevSibling = parent.childNodes[^2] - prevSibling.nextSibling = node - node.previousSibling = prevSibling - - node.parentNode = parent - if parent.nodeType == ELEMENT_NODE: - node.parentElement = Element(parent) - - if parent.ownerDocument != nil: - node.ownerDocument = parent.ownerDocument - elif parent.nodeType == DOCUMENT_NODE: - node.ownerDocument = Document(parent) - - if node.nodeType == ELEMENT_NODE: - 
parent.children.add(Element(node)) - - let element = (Element(node)) - if element.ownerDocument != nil: - element.ownerDocument.type_elements[element.tagType].add(element) - if element.id != "": - if not (element.id in element.ownerDocument.id_elements): - element.ownerDocument.id_elements[element.id] = newSeq[Element]() - element.ownerDocument.id_elements[element.id].add(element) - - for c in element.classList: - if not (c in element.ownerDocument.class_elements): - element.ownerDocument.class_elements[c] = newSeq[Element]() - element.ownerDocument.class_elements[c].add(element) - -proc processDocumentBody(state: var HTMLParseState) = - if not state.in_body: - state.in_body = true - if state.elementNode.ownerDocument != nil: - state.elementNode = state.elementNode.ownerDocument.body - -#TODO this adds text nodes to head -proc processDocumentAddNode(state: var HTMLParseState, newNode: Node) = - if state.elementNode.tagType == TAG_HTML: - if state.in_body: - state.elementNode = state.elementNode.ownerDocument.body + any_other_end_tag + ) + TokenType.END_TAG => (block: any_other_end_tag) + +proc constructTree(parser: var HTML5Parser): Document = + for token in parser.tokenizer.tokenize: + if parser.ignoreLF: + parser.ignoreLF = false + if token.t == CHARACTER_ASCII and token.c == '\n': + continue + if parser.openElements.len == 0 or + parser.adjustedCurrentNode.inHTMLNamespace() or + parser.adjustedCurrentNode.isHTMLIntegrationPoint() and token.t in {START_TAG, CHARACTER, CHARACTER_ASCII} or + token.t == EOF: + #NOTE MathML not implemented + parser.processInHTMLContent(token) else: - state.elementNode = state.elementNode.ownerDocument.head + #TODO disabled path because I'm pretty sure it'd just break things + #parser.processInForeignContent(token) + pop_current_node - insertNode(state.elementNode, newNode) + #TODO document.write (?) + #TODO etc etc... 
-proc processDocumentEndNode(state: var HTMLParseState) = - if state.elementNode == nil or state.elementNode.nodeType == DOCUMENT_NODE: - return - state.elementNode = state.elementNode.parentElement - -proc processDocumentText(state: var HTMLParseState) = - if state.textNode == nil: - state.textNode = newText() - processDocumentAddNode(state, state.textNode) - -proc processDocumentStartElement(state: var HTMLParseState, element: Element, tag: DOMParsedTag) = - var add = true - - for k, v in tag.attrs: - element.attributes[k] = v - - element.id = element.attr("id") - if element.attributes.hasKey("class"): - for w in unicode.split(element.attributes["class"], Rune(' ')): - element.classList.add(w) - - case element.tagType - of TAG_SCRIPT: - state.in_script = true - of TAG_NOSCRIPT: - state.in_noscript = true - of TAG_STYLE: - state.in_style = true - of TAG_SELECT: - HTMLSelectElement(element).name = element.attr("name") - HTMLSelectElement(element).value = element.attr("value") - of TAG_INPUT: - let element = HTMLInputElement(element) - element.value = element.attr("value") - element.inputType = element.attr("type").inputType() - element.size = element.attr("size").inputSize() - element.checked = element.attrb("checked") - if state.formowners.len > 0: - element.form = state.formowners[^1] - element.form.inputs.add(element) - of TAG_A: - HTMLAnchorElement(element).href = element.attr("href") - of TAG_OPTION: - HTMLOptionElement(element).value = element.attr("href") - of TAG_OL: - HTMLOListElement(element).start = element.attri("start") - HTMLOListElement(element).ordinalcounter = HTMLOListElement(element).start.get(1) - of TAG_LI: - HTMLLIElement(element).value = element.attri("value") - of TAG_HTML: - add = false - of TAG_HEAD: - add = false - state.in_body = false - if state.elementNode.ownerDocument != nil: - state.elementNode = state.elementNode.ownerDocument.head - of TAG_BODY: - add = false - of TAG_PRE: - state.skip_lf = true - of TAG_H1: - 
HTMLHeadingElement(element).rank = 1 - of TAG_H2: - HTMLHeadingElement(element).rank = 2 - of TAG_H3: - HTMLHeadingElement(element).rank = 3 - of TAG_H4: - HTMLHeadingElement(element).rank = 4 - of TAG_H5: - HTMLHeadingElement(element).rank = 5 - of TAG_H6: - HTMLHeadingElement(element).rank = 6 - of TAG_LINK: - HTMLLinkElement(element).href = element.attr("href") - HTMLLinkElement(element).rel = element.attr("rel") - of TAG_FORM: - let element = HTMLFormElement(element) - element.name = element.attr("name") - element.smethod = element.attr("method") - element.enctype = element.attr("enctype") - element.target = element.attr("target") - element.novalidate = element.attrb("novalidate") - state.formowners.add(element) - else: discard - - if not state.in_body and not (element.tagType in HeadTagTypes): - processDocumentBody(state) - - if state.elementNode.nodeType == ELEMENT_NODE: - if element.tagType in SelfClosingTagTypes: - if state.elementNode.tagType == element.tagType: - processDocumentEndNode(state) - - if state.elementNode.tagType == TAG_P and element.tagType in PClosingTagTypes: - processDocumentEndNode(state) - - if add: - processDocumentAddNode(state, element) - state.elementNode = element - - case element.tagType - of VoidTagTypes: - processDocumentEndNode(state) - of TAG_LI: - HTMLLIElement(element).applyOrdinal() #needs to know parent - else: discard + return parser.document -proc processDocumentEndElement(state: var HTMLParseState, tag: DOMParsedTag) = - if tag.tagid != state.elementNode.tagType: - if state.elementNode.tagType in SelfClosingTagTypes: - processDocumentEndNode(state) - processDocumentEndNode(state) - else: - case tag.tagid - of VoidTagTypes: - return - of TAG_HEAD: - processDocumentBody(state) - return - of TAG_BODY: - return - of TAG_FORM: - if state.formowners.len > 0: - discard state.formowners.pop() - of TAG_STYLE: - let style = HTMLStyleElement(state.elementNode) - var str = "" - for child in style.textNodes: - str &= child.data - let 
sheet = newStringStream(str).parseStylesheet() - style.parentElement.sheets.add(sheet) - else: discard - processDocumentEndNode(state) - -proc processDocumentTag(state: var HTMLParseState, tag: DOMParsedTag) = - if state.in_script: - if not tag.open and tag.tagid == TAG_SCRIPT: - state.in_script = false - else: - return - - if state.in_style: - if not tag.open and tag.tagid == TAG_STYLE: - state.in_style = false - else: - return - - if not tag.open and state.in_noscript: - if tag.tagid == TAG_NOSCRIPT: - state.in_noscript = false - else: - return - - if tag.open: - processDocumentStartElement(state, state.document.newHtmlElement(tag.tagid), tag) - else: - processDocumentEndElement(state, tag) - -proc processDocumentPart(state: var HTMLParseState, buf: string) = - var at = 0 - var max = 0 - var was_script = false - - max = buf.len - - template process_char(c: char) = - if state.in_comment: - state.commentNode.data &= c - else: - if not c.isWhitespace() and state.elementNode.tagType == TAG_HTML: - state.textNode = nil - processDocumentBody(state) - processDocumentText(state) - if not (state.skip_lf and c == '\n'): - processDocumentText(state) - state.textNode.data &= c - state.skip_lf = false - - template process_text(s: string) = - if state.in_comment: - state.commentNode.data &= s - else: - if not (state.skip_lf and s[0] == '\n'): - processDocumentText(state) - state.textNode.data &= s - state.skip_lf = false - - template has(buf: string, s: string): bool = - (at + s.len < buf.len and buf.substr(at, at + 8) == "</script>") - - while at < max: - case buf[at] - of '&': - inc at - let p = getescapecmd(buf, at) - process_text(p) - of '<': - if state.in_comment: - state.commentNode.data &= buf[at] - inc at - else: - var p = at - inc p - if p < max and buf[p] == '!': - inc p - if p < max and buf[p] == '-': - inc p - if p < max and buf[p] == '-': - inc p - at = p - state.in_comment = true - let comment = newComment() - state.commentNode = comment - 
processDocumentAddNode(state, comment) - state.textNode = nil - else: - #TODO for doctype - while p < max and buf[p] != '>': - inc p - at = p + 1 - continue - - if not state.in_comment: - state.textNode = nil - p = at - if state.in_script: - if buf.has("</script>"): - var tag = parse_tag(buf, at) - processDocumentTag(state, tag) - else: - process_char(buf[at]) - inc at - else: - var tag = parse_tag(buf, at) - processDocumentTag(state, tag) - elif buf[at] == '-' and state.in_comment: - var p = at - inc p - if p < max and buf[p] == '-': - inc p - if p < max and buf[p] == '>': - inc p - at = p - state.commentNode = nil - state.in_comment = false - - if state.in_comment: - state.commentNode.data &= buf[at] - inc at - else: - process_char(buf[at]) - inc at - -proc parseHtml5(inputStream: Stream, savesource: bool, source: var string): Document = - #TODO implement HTML5 parsing - var tokenizer = inputStream.newTokenizer() - for tok in tokenizer.tokenize: - eprint tok - -proc parseHtml(inputStream: Stream, savesource: bool, source: var string): Document = - let document = newDocument() - insertNode(document, document.root) - insertNode(document.root, document.head) - insertNode(document.root, document.body) - - var state = HTMLParseState() - state.document = document - state.elementNode = document.root - - var till_when = false - - var buf = "" - var lineBuf: string - while not inputStream.atEnd(): - lineBuf = inputStream.readLine() & '\n' - if savesource: - source &= lineBuf - buf &= lineBuf - - var at = 0 - while at < lineBuf.len: - case lineBuf[at] - of '<': - till_when = true - of '>': - till_when = false - else: discard - inc at - - if till_when: - continue - - processDocumentPart(state, buf) - buf = "" - - inputStream.close() - return document - -proc parseHtml*(inputStream: Stream, source: var string): Document = - return parseHtml(inputStream, true, source) - -proc parseHtml*(inputStream: Stream): Document = - var placeholder = "" - return parseHtml(inputStream, 
false, placeholder) +proc parseHTML5*(inputStream: Stream): Document = + var parser: HTML5Parser + parser.document = newDocument() + parser.tokenizer = inputStream.newTokenizer() + return parser.constructTree() diff --git a/src/html/htmltokenizer.nim b/src/html/htmltokenizer.nim new file mode 100644 index 00000000..29680d19 --- /dev/null +++ b/src/html/htmltokenizer.nim @@ -0,0 +1,1525 @@ +import options +import streams +import strformat +import strutils +import macros +import tables +import unicode + +import html/entity +import html/tags +import utils/radixtree +import utils/twtstr + +# Tokenizer +type + Tokenizer* = object + state*: TokenizerState + rstate: TokenizerState + curr: Rune + tmp: string + code: int + tok: Token + laststart: Token + attrn: string + attrv: string + attr: bool + + istream: Stream + sbuf: string + sbuf_i: int + sbuf_ip: int + eof_i: int + + TokenType* = enum + DOCTYPE, START_TAG, END_TAG, COMMENT, CHARACTER, CHARACTER_ASCII, EOF + + TokenizerState* = enum + DATA, CHARACTER_REFERENCE, TAG_OPEN, RCDATA, RCDATA_LESS_THAN_SIGN, + RAWTEXT, RAWTEXT_LESS_THAN_SIGN, SCRIPT_DATA, SCRIPT_DATA_LESS_THAN_SIGN, + PLAINTEXT, MARKUP_DECLARATION_OPEN, END_TAG_OPEN, BOGUS_COMMENT, TAG_NAME, + BEFORE_ATTRIBUTE_NAME, RCDATA_END_TAG_OPEN, RCDATA_END_TAG_NAME, + RAWTEXT_END_TAG_OPEN, RAWTEXT_END_TAG_NAME, SELF_CLOSING_START_TAG, + SCRIPT_DATA_END_TAG_OPEN, SCRIPT_DATA_ESCAPE_START, + SCRIPT_DATA_END_TAG_NAME, SCRIPT_DATA_ESCAPE_START_DASH, + SCRIPT_DATA_ESCAPED_DASH_DASH, SCRIPT_DATA_ESCAPED, + SCRIPT_DATA_ESCAPED_DASH, SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN, + SCRIPT_DATA_ESCAPED_END_TAG_OPEN, SCRIPT_DATA_DOUBLE_ESCAPE_START, + SCRIPT_DATA_ESCAPED_END_TAG_NAME, SCRIPT_DATA_DOUBLE_ESCAPED, + SCRIPT_DATA_DOUBLE_ESCAPED_DASH, SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN, + SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH, SCRIPT_DATA_DOUBLE_ESCAPE_END, + AFTER_ATTRIBUTE_NAME, ATTRIBUTE_NAME, BEFORE_ATTRIBUTE_VALUE, + ATTRIBUTE_VALUE_DOUBLE_QUOTED, ATTRIBUTE_VALUE_SINGLE_QUOTED, + 
ATTRIBUTE_VALUE_UNQUOTED, AFTER_ATTRIBUTE_VALUE_QUOTED, COMMENT_START, + CDATA_SECTION, COMMENT_START_DASH, COMMENT, COMMENT_END, + COMMENT_LESS_THAN_SIGN, COMMENT_END_DASH, COMMENT_LESS_THAN_SIGN_BANG, + COMMENT_LESS_THAN_SIGN_BANG_DASH, COMMENT_LESS_THAN_SIGN_BANG_DASH_DASH, + COMMENT_END_BANG, DOCTYPE, BEFORE_DOCTYPE_NAME, DOCTYPE_NAME, + AFTER_DOCTYPE_NAME, AFTER_DOCTYPE_PUBLIC_KEYWORD, + AFTER_DOCTYPE_SYSTEM_KEYWORD, BOGUS_DOCTYPE, + BEFORE_DOCTYPE_PUBLIC_IDENTIFIER, DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED, + DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED, AFTER_DOCTYPE_PUBLIC_IDENTIFIER, + BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS, + DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED, + DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED, BEFORE_DOCTYPE_SYSTEM_IDENTIFIER, + AFTER_DOCTYPE_SYSTEM_IDENTIFIER, CDATA_SECTION_BRACKET, CDATA_SECTION_END, + NAMED_CHARACTER_REFERENCE, NUMERIC_CHARACTER_REFERENCE, + AMBIGUOUS_AMPERSAND_STATE, HEXADECIMAL_CHARACTER_REFERENCE_START, + DECIMAL_CHARACTER_REFERENCE_START, HEXADECIMAL_CHARACTER_REFERENCE, + DECIMAL_CHARACTER_REFERENCE, NUMERIC_CHARACTER_REFERENCE_END + + Token* = ref object + case t*: TokenType + of DOCTYPE: + name*: Option[string] + pubid*: Option[string] + sysid*: Option[string] + quirks*: bool + of START_TAG, END_TAG: + tagname*: string + tagtype*: TagType + selfclosing*: bool + attrs*: Table[string, string] + of CHARACTER: + r*: Rune + of CHARACTER_ASCII: + c*: char + of COMMENT: + data*: string + of EOF: discard + +func `$`*(tok: Token): string = + case tok.t + of DOCTYPE: fmt"{tok.t} {tok.name} {tok.pubid} {tok.sysid} {tok.quirks}" + of START_TAG, END_TAG: fmt"{tok.t} {tok.tagname} {tok.selfclosing} {tok.attrs}" + of CHARACTER: fmt"{tok.t} {tok.r}" + of CHARACTER_ASCII: fmt"{tok.t} {tok.c}" + of COMMENT: fmt"{tok.t} {tok.data}" + of EOF: fmt"{tok.t}" + +const bufSize = 512 +const copyBufSize = 16 +proc newTokenizer*(s: Stream): Tokenizer = + result.sbuf = newString(bufSize) + result.istream = s + result.eof_i = -1 + if 
result.istream.atEnd: + result.eof_i = 0 + else: + let n = s.readDataStr(result.sbuf, 0..bufSize-1) + if n != bufSize: + result.eof_i = n + +func atEof(t: Tokenizer): bool = + t.eof_i != -1 and t.sbuf_i >= t.eof_i + +proc consume(t: var Tokenizer): char {.inline.} = + if t.eof_i == -1 and t.sbuf_i >= bufSize-copyBufSize: + # Workaround to swap buffer without breaking fastRuneAt. + var sbuf2 = newString(copyBufSize) + var i = 0 + while t.sbuf_i + i < bufSize: + sbuf2[i] = t.sbuf[t.sbuf_i + i] + inc i + let n = t.istream.readDataStr(t.sbuf, i..bufSize-1) + if n != bufSize - i: + t.eof_i = i + n + t.sbuf_i = 0 + + var j = 0 + while j < i: + t.sbuf[j] = sbuf2[j] + inc j + + assert t.eof_i == -1 or t.sbuf_i < t.eof_i # not consuming eof... + t.sbuf_ip = t.sbuf_i # save previous pointer for potential reconsume + + # Normalize newlines (\r\n -> \n, single \r -> \n) + if t.sbuf[t.sbuf_i] == '\r': + inc t.sbuf_i + if t.sbuf[t.sbuf_i] != '\n': + # \r + result = '\n' + t.curr = Rune('\n') + return + # else, \r\n so just return the \n + + result = t.sbuf[t.sbuf_i] + fastRuneAt(t.sbuf, t.sbuf_i, t.curr) + +proc reconsume(t: var Tokenizer) = + t.sbuf_i = t.sbuf_ip + +iterator tokenize*(tokenizer: var Tokenizer): Token = + template emit(tok: Token) = + if tok.t == START_TAG: + tokenizer.laststart = tok + if tok.t in {START_TAG, END_TAG}: + tok.tagtype = tagType(tok.tagName) + yield tok + template emit(tok: TokenType) = emit Token(t: tok) + template emit(rn: Rune) = emit Token(t: CHARACTER, r: rn) + template emit(ch: char) = emit Token(t: CHARACTER_ASCII, c: ch) + template emit_eof = + emit EOF + break + template emit_tok = + if tokenizer.attr: + tokenizer.tok.attrs[tokenizer.attrn] = tokenizer.attrv + emit tokenizer.tok + template emit_current = + if is_eof: + emit_eof + elif c in Ascii: + emit c + else: + emit tokenizer.curr + template emit_replacement = emit Rune(0xFFFD) + template switch_state(s: TokenizerState) = + tokenizer.state = s + template switch_state_return(s: 
TokenizerState) = + tokenizer.rstate = tokenizer.state + tokenizer.state = s + template reconsume_in(s: TokenizerState) = + tokenizer.reconsume() + switch_state s + template parse_error(error: untyped) = discard # does nothing for now... TODO? + template is_appropriate_end_tag_token(): bool = + tokenizer.laststart != nil and tokenizer.laststart.tagname == tokenizer.tok.tagname + template start_new_attribute = + if tokenizer.attr: + tokenizer.tok.attrs[tokenizer.attrn] = tokenizer.attrv + tokenizer.attrn = "" + tokenizer.attrv = "" + tokenizer.attr = true + template leave_attribute_name_state = + if tokenizer.attrn in tokenizer.tok.attrs: + tokenizer.attr = false + template append_to_current_attr_value(c: typed) = + if tokenizer.attr: + tokenizer.attrv &= c + template peek_str(s: string): bool = + # WARNING: will break on strings with copyBufSize + 4 bytes + assert s.len < copyBufSize - 4 and s.len > 0 + if tokenizer.sbuf_i + s.len > tokenizer.eof_i: + false + else: + let slice = tokenizer.sbuf[tokenizer.sbuf_i..tokenizer.sbuf_i+s.high] + s == slice + template peek_str_nocase(s: string): bool = + # WARNING: will break on strings with copyBufSize + 4 bytes + # WARNING: only works with UPPER CASE ascii + assert s.len < copyBufSize - 4 and s.len > 0 + if tokenizer.sbuf_i + s.len > tokenizer.eof_i: + false + else: + let slice = tokenizer.sbuf[tokenizer.sbuf_i..tokenizer.sbuf_i+s.high] + s == slice.toUpperAscii() + template peek_char(): char = tokenizer.sbuf[tokenizer.sbuf_i] + template has_adjusted_current_node(): bool = false #TODO implement this + template consume_and_discard(n: int) = #TODO optimize + var i = 0 + while i < n: + discard tokenizer.consume() + inc i + template consumed_as_an_attribute(): bool = + tokenizer.rstate in {ATTRIBUTE_VALUE_DOUBLE_QUOTED, ATTRIBUTE_VALUE_SINGLE_QUOTED, ATTRIBUTE_VALUE_UNQUOTED} + template emit_tmp() = + var i = 0 + while i < tokenizer.tmp.len: + if tokenizer.tmp[i].isAscii(): + emit tokenizer.tmp[i] + inc i + else: + var r: 
Rune + fastRuneAt(tokenizer.tmp, i, r) + emit r + template flush_code_points_consumed_as_a_character_reference() = + if consumed_as_an_attribute: + append_to_current_attr_value tokenizer.tmp + else: + emit_tmp + template new_token(t: Token) = + if tokenizer.attr: + tokenizer.attr = false + tokenizer.tok = t + + # Fake EOF as an actual character. Also replace anything_else with the else + # branch. + macro stateMachine(states: varargs[untyped]): untyped = + var maincase = newNimNode(nnkCaseStmt).add(quote do: tokenizer.state) + for state in states: + if state.kind == nnkOfBranch: + let mainstmtlist = findChild(state, it.kind == nnkStmtList) + if mainstmtlist[0].kind == nnkIdent and mainstmtlist[0].strVal == "ignore_eof": + maincase.add(state) + continue + + var hasanythingelse = false + if mainstmtlist[0].kind == nnkIdent and mainstmtlist[0].strVal == "has_anything_else": + hasanythingelse = true + + let childcase = findChild(mainstmtlist, it.kind == nnkCaseStmt) + var haseof = false + var eofstmts: NimNode + var elsestmts: NimNode + + for i in countdown(childcase.len-1, 0): + let childof = childcase[i] + if childof.kind == nnkOfBranch: + for j in countdown(childof.len-1, 0): + if childof[j].kind == nnkIdent and childof[j].strVal == "eof": + haseof = true + eofstmts = childof.findChild(it.kind == nnkStmtList) + if childof.findChild(it.kind == nnkIdent and it.strVal != "eof") != nil: + childof.del(j) + else: + childcase.del(i) + elif childof.kind == nnkElse: + elsestmts = childof.findChild(it.kind == nnkStmtList) + + if not haseof: + eofstmts = elsestmts + let fake_eof = quote do: + if is_eof: + `eofstmts` + continue + mainstmtlist.insert(0, fake_eof) + if hasanythingelse: + let fake_anything_else = quote do: + template anything_else = + `elsestmts` + mainstmtlist.insert(0, fake_anything_else) + maincase.add(state) + result = newNimNode(nnkStmtList) + result.add(maincase) + + template ignore_eof = discard # does nothing + template has_anything_else = discard # does 
nothing + + const null = char(0) + const whitespace = {'\t', '\n', '\f', ' '} + + while true: + {.computedGoto.} + #eprint tokenizer.state #debug + let is_eof = tokenizer.atEof # set eof here, otherwise we would exit at the last character + let c = if not is_eof: + tokenizer.consume() + else: + # avoid consuming eof... + null + stateMachine: # => case tokenizer.state + of DATA: + case c + of '&': switch_state_return CHARACTER_REFERENCE + of '<': switch_state TAG_OPEN + of null: + parse_error unexpected_null_character + emit_current + of eof: emit_eof + else: emit_current + + of RCDATA: + case c + of '&': switch_state_return CHARACTER_REFERENCE + of '<': switch_state RCDATA_LESS_THAN_SIGN + of null: parse_error unexpected_null_character + of eof: emit_eof + else: emit_current + + of RAWTEXT: + case c + of '<': switch_state RAWTEXT_LESS_THAN_SIGN + of null: + parse_error unexpected_null_character + emit_replacement + of eof: emit_eof + else: emit_current + + of SCRIPT_DATA: + case c + of '<': switch_state SCRIPT_DATA_LESS_THAN_SIGN + of null: + parse_error unexpected_null_character + emit_replacement + of eof: emit_eof + else: emit_current + + of PLAINTEXT: + case c + of null: + parse_error unexpected_null_character + emit_replacement + of eof: emit_eof + else: emit_current + + of TAG_OPEN: + case c + of '!': switch_state MARKUP_DECLARATION_OPEN + of '/': switch_state END_TAG_OPEN + of AsciiAlpha: + new_token Token(t: START_TAG) + reconsume_in TAG_NAME + of '?': + parse_error unexpected_question_mark_instead_of_tag_name + new_token Token(t: COMMENT) + reconsume_in BOGUS_COMMENT + of eof: + parse_error eof_before_tag_name + emit '<' + emit_eof + else: + parse_error invalid_first_character_of_tag_name + emit '<' + reconsume_in DATA + + of END_TAG_OPEN: + case c + of AsciiAlpha: + new_token Token(t: END_TAG) + reconsume_in TAG_NAME + of '>': + parse_error missing_end_tag_name + switch_state DATA + of eof: + parse_error eof_before_tag_name + emit '<' + emit '/' + 
emit_eof + else: + parse_error invalid_first_character_of_tag_name + new_token Token(t: COMMENT) + reconsume_in BOGUS_COMMENT + + of TAG_NAME: + case c + of whitespace: switch_state BEFORE_ATTRIBUTE_NAME + of '/': switch_state SELF_CLOSING_START_TAG + of '>': + switch_state DATA + emit_tok + of AsciiUpperAlpha: tokenizer.tok.tagname &= char(tokenizer.curr).tolower() + of null: + parse_error unexpected_null_character + tokenizer.tok.tagname &= Rune(0xFFFD) + of eof: + parse_error eof_in_tag + emit_eof + else: tokenizer.tok.tagname &= tokenizer.curr + + of RCDATA_LESS_THAN_SIGN: + case c + of '/': + tokenizer.tmp = "" + switch_state RCDATA_END_TAG_OPEN + else: + emit '<' + reconsume_in RCDATA + + of RCDATA_END_TAG_OPEN: + case c + of AsciiAlpha: + new_token Token(t: END_TAG) + reconsume_in RCDATA_END_TAG_NAME + else: + emit '<' + emit '/' + reconsume_in RCDATA + + of RCDATA_END_TAG_NAME: + has_anything_else + case c + of whitespace: + if is_appropriate_end_tag_token: + switch_state BEFORE_ATTRIBUTE_NAME + else: + anything_else + of '/': + if is_appropriate_end_tag_token: + switch_state SELF_CLOSING_START_TAG + else: + anything_else + of '>': + if is_appropriate_end_tag_token: + switch_state DATA + emit_tok + else: + anything_else + of AsciiAlpha: # note: merged upper & lower + tokenizer.tok.tagname &= char(tokenizer.curr).tolower() + tokenizer.tmp &= tokenizer.curr + else: + new_token nil #TODO + emit '<' + emit '/' + emit_tmp + reconsume_in RCDATA + + of RAWTEXT_LESS_THAN_SIGN: + case c + of '/': + tokenizer.tmp = "" + switch_state RAWTEXT_END_TAG_OPEN + else: + emit '<' + reconsume_in RAWTEXT + + of RAWTEXT_END_TAG_OPEN: + case c + of AsciiAlpha: + new_token Token(t: END_TAG) + reconsume_in RAWTEXT_END_TAG_NAME + else: + emit '<' + emit '/' + reconsume_in RAWTEXT + + of RAWTEXT_END_TAG_NAME: + has_anything_else + case c + of whitespace: + if is_appropriate_end_tag_token: + switch_state BEFORE_ATTRIBUTE_NAME + else: + anything_else + of '/': + if 
is_appropriate_end_tag_token: + switch_state SELF_CLOSING_START_TAG + else: + anything_else + of '>': + if is_appropriate_end_tag_token: + switch_state DATA + emit_tok + else: + anything_else + of AsciiAlpha: # note: merged upper & lower + tokenizer.tok.tagname &= char(tokenizer.curr).tolower() + tokenizer.tmp &= tokenizer.curr + else: + new_token nil #TODO + emit '<' + emit '/' + for r in tokenizer.tmp.runes: + emit r + reconsume_in RAWTEXT + + of SCRIPT_DATA_LESS_THAN_SIGN: + case c + of '/': + tokenizer.tmp = "" + switch_state SCRIPT_DATA_END_TAG_OPEN + of '!': + switch_state SCRIPT_DATA_ESCAPE_START + emit '<' + emit '!' + else: + emit '<' + reconsume_in SCRIPT_DATA + + of SCRIPT_DATA_END_TAG_OPEN: + case c + of AsciiAlpha: + new_token Token(t: END_TAG) + reconsume_in SCRIPT_DATA_END_TAG_NAME + else: + emit '<' + emit '/' + reconsume_in SCRIPT_DATA + + of SCRIPT_DATA_END_TAG_NAME: + has_anything_else + case c + of whitespace: + if is_appropriate_end_tag_token: + switch_state BEFORE_ATTRIBUTE_NAME + else: + anything_else + of '/': + if is_appropriate_end_tag_token: + switch_state SELF_CLOSING_START_TAG + else: + anything_else + of '>': + if is_appropriate_end_tag_token: + switch_state DATA + emit_tok + else: + anything_else + of AsciiAlpha: # note: merged upper & lower + tokenizer.tok.tagname &= char(tokenizer.curr).tolower() + tokenizer.tmp &= tokenizer.curr + else: + emit '<' + emit '/' + emit_tmp + reconsume_in SCRIPT_DATA + + of SCRIPT_DATA_ESCAPE_START: + case c + of '-': + switch_state SCRIPT_DATA_ESCAPE_START_DASH + emit '-' + else: + reconsume_in SCRIPT_DATA + + of SCRIPT_DATA_ESCAPE_START_DASH: + case c + of '-': + switch_state SCRIPT_DATA_ESCAPED_DASH_DASH + emit '-' + else: + reconsume_in SCRIPT_DATA + + of SCRIPT_DATA_ESCAPED: + case c + of '-': + switch_state SCRIPT_DATA_ESCAPED_DASH + emit '-' + of '<': + switch_state SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN + of null: + parse_error unexpected_null_character + emit_replacement + of eof: + parse_error 
eof_in_script_html_comment_like_text + emit_eof + else: + emit_current + + of SCRIPT_DATA_ESCAPED_DASH: + case c + of '-': + switch_state SCRIPT_DATA_ESCAPED_DASH_DASH + emit '-' + of '<': + switch_state SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN + of null: + parse_error unexpected_null_character + switch_state SCRIPT_DATA_ESCAPED + of eof: + parse_error eof_in_script_html_comment_like_text + emit_eof + else: + switch_state SCRIPT_DATA_ESCAPED + emit_current + + of SCRIPT_DATA_ESCAPED_DASH_DASH: + case c + of '-': + emit '-' + of '<': + switch_state SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN + of '>': + switch_state SCRIPT_DATA + emit '>' + of null: + parse_error unexpected_null_character + switch_state SCRIPT_DATA_ESCAPED + of eof: + parse_error eof_in_script_html_comment_like_text + emit_eof + else: + switch_state SCRIPT_DATA_ESCAPED + emit_current + + of SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN: + case c + of '/': + tokenizer.tmp = "" + switch_state SCRIPT_DATA_ESCAPED_END_TAG_OPEN + of AsciiAlpha: + tokenizer.tmp = "" + emit '<' + reconsume_in SCRIPT_DATA_DOUBLE_ESCAPE_START + else: + emit '<' + reconsume_in SCRIPT_DATA_ESCAPED + + of SCRIPT_DATA_ESCAPED_END_TAG_OPEN: + case c + of AsciiAlpha: + new_token Token(t: START_TAG) + reconsume_in SCRIPT_DATA_ESCAPED_END_TAG_NAME + else: + emit '<' + emit '/' + reconsume_in SCRIPT_DATA_ESCAPED + + of SCRIPT_DATA_ESCAPED_END_TAG_NAME: + has_anything_else + case c + of whitespace: + if is_appropriate_end_tag_token: + switch_state BEFORE_ATTRIBUTE_NAME + else: + anything_else + of '/': + if is_appropriate_end_tag_token: + switch_state SELF_CLOSING_START_TAG + else: + anything_else + of '>': + if is_appropriate_end_tag_token: + switch_state DATA + else: + anything_else + of AsciiAlpha: + tokenizer.tok.tagname &= char(tokenizer.curr).tolower() + tokenizer.tmp &= tokenizer.curr + else: + emit '<' + emit '/' + emit_tmp + reconsume_in SCRIPT_DATA_ESCAPED + + of SCRIPT_DATA_DOUBLE_ESCAPE_START: + case c + of whitespace, '/', '>': + if tokenizer.tmp 
== "script": + switch_state SCRIPT_DATA_DOUBLE_ESCAPED + else: + switch_state SCRIPT_DATA_ESCAPED + emit_current + of AsciiAlpha: # note: merged upper & lower + tokenizer.tmp &= c.tolower() + emit_current + else: reconsume_in SCRIPT_DATA_ESCAPED + + of SCRIPT_DATA_DOUBLE_ESCAPED: + case c + of '-': + switch_state SCRIPT_DATA_DOUBLE_ESCAPED_DASH + emit '-' + of '<': + switch_state SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN + emit '<' + of null: + parse_error unexpected_null_character + emit_replacement + of eof: + parse_error eof_in_script_html_comment_like_text + emit_eof + else: emit_current + + of SCRIPT_DATA_DOUBLE_ESCAPED_DASH: + case c + of '-': + switch_state SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH + emit '-' + of '<': + switch_state SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN + emit '<' + of null: + parse_error unexpected_null_character + switch_state SCRIPT_DATA_DOUBLE_ESCAPED + emit_replacement + of eof: + parse_error eof_in_script_html_comment_like_text + emit_eof + else: + switch_state SCRIPT_DATA_DOUBLE_ESCAPED + emit_current + + of SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH: + case c + of '-': emit '-' + of '<': + switch_state SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN + emit '<' + of '>': + switch_state SCRIPT_DATA + emit '>' + of null: + parse_error unexpected_null_character + switch_state SCRIPT_DATA_DOUBLE_ESCAPED + emit_replacement + of eof: + parse_error eof_in_script_html_comment_like_text + emit_eof + else: switch_state SCRIPT_DATA_DOUBLE_ESCAPED + + of SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN: + case c + of '/': + tokenizer.tmp = "" + switch_state SCRIPT_DATA_DOUBLE_ESCAPE_END + emit '/' + else: reconsume_in SCRIPT_DATA_DOUBLE_ESCAPED + + of SCRIPT_DATA_DOUBLE_ESCAPE_END: + case c + of whitespace, '/', '>': + if tokenizer.tmp == "script": + switch_state SCRIPT_DATA_ESCAPED + else: + switch_state SCRIPT_DATA_DOUBLE_ESCAPED + emit_current + of AsciiAlpha: # note: merged upper & lower + tokenizer.tmp &= c.tolower() + emit_current + else: + reconsume_in 
SCRIPT_DATA_DOUBLE_ESCAPED + + of BEFORE_ATTRIBUTE_NAME: + case c + of whitespace: discard + of '/', '>', eof: reconsume_in AFTER_ATTRIBUTE_NAME + of '=': + parse_error unexpected_equals_sign_before_attribute_name + start_new_attribute + switch_state ATTRIBUTE_NAME + else: + start_new_attribute + reconsume_in ATTRIBUTE_NAME + + of ATTRIBUTE_NAME: + has_anything_else + case c + of whitespace, '/', '>', eof: + leave_attribute_name_state + reconsume_in AFTER_ATTRIBUTE_NAME + of '=': + leave_attribute_name_state + switch_state BEFORE_ATTRIBUTE_VALUE + of AsciiUpperAlpha: + tokenizer.attrn &= c.tolower() + of null: + parse_error unexpected_null_character + tokenizer.attrn &= Rune(0xFFFD) + of '"', '\'', '<': + parse_error unexpected_character_in_attribute_name + anything_else + else: + tokenizer.attrn &= tokenizer.curr + + of AFTER_ATTRIBUTE_NAME: + case c + of whitespace: discard + of '/': switch_state SELF_CLOSING_START_TAG + of '=': switch_state BEFORE_ATTRIBUTE_VALUE + of '>': + switch_state DATA + emit '>' + of eof: + parse_error eof_in_tag + emit_eof + else: + start_new_attribute + reconsume_in ATTRIBUTE_NAME + + of BEFORE_ATTRIBUTE_VALUE: + case c + of whitespace: discard + of '"': switch_state ATTRIBUTE_VALUE_DOUBLE_QUOTED + of '\'': switch_state ATTRIBUTE_VALUE_SINGLE_QUOTED + of '>': + parse_error missing_attribute_value + switch_state DATA + emit '>' + else: reconsume_in ATTRIBUTE_VALUE_UNQUOTED + + of ATTRIBUTE_VALUE_DOUBLE_QUOTED: + case c + of '"': switch_state AFTER_ATTRIBUTE_VALUE_QUOTED + of '&': switch_state_return CHARACTER_REFERENCE + of null: + parse_error unexpected_null_character + append_to_current_attr_value Rune(0xFFFD) + of eof: + parse_error eof_in_tag + emit_eof + else: append_to_current_attr_value tokenizer.curr + + of ATTRIBUTE_VALUE_SINGLE_QUOTED: + case c + of '\'': switch_state AFTER_ATTRIBUTE_VALUE_QUOTED + of '&': switch_state_return CHARACTER_REFERENCE + of null: + parse_error unexpected_null_character + append_to_current_attr_value 
Rune(0xFFFD) + of eof: + parse_error eof_in_tag + emit_eof + else: append_to_current_attr_value tokenizer.curr + + of ATTRIBUTE_VALUE_UNQUOTED: + case c + of whitespace: switch_state BEFORE_ATTRIBUTE_NAME + of '&': switch_state_return CHARACTER_REFERENCE + of '>': switch_state DATA + of null: + parse_error unexpected_null_character + append_to_current_attr_value Rune(0xFFFD) + of '"', '\'', '<', '=', '`': + parse_error unexpected_character_in_unquoted_attribute_value + append_to_current_attr_value c + of eof: + parse_error eof_in_tag + emit_eof + else: append_to_current_attr_value tokenizer.curr + + of AFTER_ATTRIBUTE_VALUE_QUOTED: + case c + of whitespace: + switch_state BEFORE_ATTRIBUTE_NAME + of '/': + switch_state SELF_CLOSING_START_TAG + of '>': + switch_state DATA + emit_tok + of eof: + parse_error eof_in_tag + emit_eof + else: append_to_current_attr_value tokenizer.curr + + of SELF_CLOSING_START_TAG: + case c + of '>': + tokenizer.tok.selfclosing = true + switch_state DATA + emit_tok + of eof: + parse_error eof_in_tag + emit_eof + else: + parse_error unexpected_solidus_in_tag + reconsume_in BEFORE_ATTRIBUTE_NAME + + of BOGUS_COMMENT: + assert tokenizer.tok.t == COMMENT + case c + of '>': + switch_state DATA + emit_tok + of eof: + emit_tok + emit_eof + of null: parse_error unexpected_null_character + else: tokenizer.tok.data &= tokenizer.curr + + of MARKUP_DECLARATION_OPEN: # note: rewritten to fit case model as we consume a char anyway + has_anything_else + case c + of '-': + if peek_char == '-': + new_token Token(t: COMMENT) + tokenizer.state = COMMENT_START + consume_and_discard 1 + else: anything_else + of 'D', 'd': + if peek_str_nocase("OCTYPE"): + consume_and_discard "OCTYPE".len + switch_state DOCTYPE + else: anything_else + of '[': + if peek_str("CDATA["): + consume_and_discard "CDATA[".len + if has_adjusted_current_node: #TODO and it is not an element in the HTML namespace + switch_state CDATA_SECTION + else: + parse_error cdata_in_html_content + 
new_token Token(t: COMMENT, data: "[CDATA[") + switch_state BOGUS_COMMENT + else: anything_else + else: + parse_error incorrectly_opened_comment + new_token Token(t: COMMENT) + reconsume_in BOGUS_COMMENT + + of COMMENT_START: + case c + of '-': switch_state COMMENT_START_DASH + of '>': + parse_error abrupt_closing_of_empty_comment + switch_state DATA + emit_tok + else: reconsume_in COMMENT + + of COMMENT_START_DASH: + case c + of '-': switch_state COMMENT_END + of '>': + parse_error abrupt_closing_of_empty_comment + switch_state DATA + emit_tok + of eof: + parse_error eof_in_comment + emit_tok + emit_eof + else: + tokenizer.tok.data &= '-' + reconsume_in COMMENT + + of COMMENT: + case c + of '<': + tokenizer.tok.data &= c + switch_state COMMENT_LESS_THAN_SIGN + of '-': switch_state COMMENT_END_DASH + of null: + parse_error unexpected_null_character + tokenizer.tok.data &= Rune(0xFFFD) + of eof: + parse_error eof_in_comment + emit_tok + emit_eof + else: tokenizer.tok.data &= tokenizer.curr + + of COMMENT_LESS_THAN_SIGN: + case c + of '!': + tokenizer.tok.data &= c + switch_state COMMENT_LESS_THAN_SIGN_BANG + of '<': tokenizer.tok.data &= c + else: reconsume_in COMMENT + + of COMMENT_LESS_THAN_SIGN_BANG: + case c + of '-': switch_state COMMENT_LESS_THAN_SIGN_BANG_DASH + else: reconsume_in COMMENT + + of COMMENT_LESS_THAN_SIGN_BANG_DASH: + case c + of '-': switch_state COMMENT_LESS_THAN_SIGN_BANG_DASH_DASH + else: reconsume_in COMMENT_END_DASH + + of COMMENT_LESS_THAN_SIGN_BANG_DASH_DASH: + case c + of '>', eof: reconsume_in COMMENT_END + else: + parse_error nested_comment + reconsume_in COMMENT_END + + of COMMENT_END_DASH: + case c + of '-': switch_state COMMENT_END + of eof: + parse_error eof_in_comment + emit_tok + emit_eof + else: + tokenizer.tok.data &= '-' + reconsume_in COMMENT + + of COMMENT_END: + case c + of '>': switch_state DATA + of '!': switch_state COMMENT_END_BANG + of '-': tokenizer.tok.data &= '-' + of eof: + parse_error eof_in_comment + emit_tok + 
emit_eof + else: + tokenizer.tok.data &= "--" + reconsume_in COMMENT + + of COMMENT_END_BANG: + case c + of '-': + tokenizer.tok.data &= "--!" + switch_state COMMENT_END_DASH + of '>': + parse_error incorrectly_closed_comment + switch_state DATA + emit_tok + of eof: + parse_error eof_in_comment + emit_tok + emit_eof + else: + tokenizer.tok.data &= "--!" + reconsume_in COMMENT + + of DOCTYPE: + case c + of whitespace: switch_state BEFORE_DOCTYPE_NAME + of '>': reconsume_in BEFORE_DOCTYPE_NAME + of eof: + parse_error eof_in_doctype + new_token Token(t: DOCTYPE, quirks: true) + emit_tok + emit_eof + else: + parse_error missing_whitespace_before_doctype_name + reconsume_in BEFORE_DOCTYPE_NAME + + of BEFORE_DOCTYPE_NAME: + case c + of whitespace: discard + of AsciiUpperAlpha: + new_token Token(t: DOCTYPE, name: some($c.tolower())) + switch_state DOCTYPE_NAME + of null: + parse_error unexpected_null_character + new_token Token(t: DOCTYPE, name: some($Rune(0xFFFD))) + of '>': + parse_error missing_doctype_name + new_token Token(t: DOCTYPE, quirks: true) + switch_state DATA + emit_tok + of eof: + parse_error eof_in_doctype + new_token Token(t: DOCTYPE, quirks: true) + emit_tok + emit_eof + else: + new_token Token(t: DOCTYPE, name: some($tokenizer.curr)) + switch_state DOCTYPE_NAME + + of DOCTYPE_NAME: + case c + of whitespace: switch_state AFTER_DOCTYPE_NAME + of '>': + switch_state DATA + emit_tok + of AsciiUpperAlpha: + tokenizer.tok.name.get &= c.tolower() + of null: + parse_error unexpected_null_character + tokenizer.tok.name.get &= Rune(0xFFFD) + of eof: + parse_error eof_in_doctype + tokenizer.tok.quirks = true + emit_tok + emit_eof + else: + tokenizer.tok.name.get &= tokenizer.curr + + of AFTER_DOCTYPE_NAME: # note: rewritten to fit case model as we consume a char anyway + has_anything_else + case c + of whitespace: discard + of '>': + switch_state DATA + emit_tok + of eof: + parse_error eof_in_doctype + tokenizer.tok.quirks = true + emit_tok + emit_eof + of 'p', 
'P': + if peek_str("UBLIC"): + consume_and_discard "UBLIC".len + switch_state AFTER_DOCTYPE_PUBLIC_KEYWORD + else: + anything_else + of 's', 'S': + if peek_str("YSTEM"): + consume_and_discard "YSTEM".len + switch_state AFTER_DOCTYPE_SYSTEM_KEYWORD + else: + anything_else + else: + parse_error invalid_character_sequence_after_doctype_name + tokenizer.tok.quirks = true + reconsume_in BOGUS_DOCTYPE + + of AFTER_DOCTYPE_PUBLIC_KEYWORD: + case c + of whitespace: switch_state BEFORE_DOCTYPE_PUBLIC_IDENTIFIER + of '"': + parse_error missing_whitespace_after_doctype_public_keyword + tokenizer.tok.pubid = some("") + switch_state DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED + of '>': + parse_error missing_doctype_public_identifier + tokenizer.tok.quirks = true + switch_state DATA + emit_tok + of eof: + parse_error eof_in_doctype + tokenizer.tok.quirks = true + emit_tok + emit_eof + else: + parse_error missing_quote_before_doctype_public_identifier + tokenizer.tok.quirks = true + reconsume_in BOGUS_DOCTYPE + + of BEFORE_DOCTYPE_PUBLIC_IDENTIFIER: + case c + of whitespace: discard + of '"': + tokenizer.tok.pubid = some("") + switch_state DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED + of '\'': + tokenizer.tok.pubid = some("") + switch_state DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED + of '>': + parse_error missing_doctype_public_identifier + tokenizer.tok.quirks = true + switch_state DATA + emit_tok + of eof: + parse_error eof_in_doctype + tokenizer.tok.quirks = true + emit_tok + emit_eof + else: + parse_error missing_quote_before_doctype_public_identifier + tokenizer.tok.quirks = true + reconsume_in BOGUS_DOCTYPE + + of DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED: + case c + of '"': switch_state AFTER_DOCTYPE_PUBLIC_IDENTIFIER + of null: + parse_error unexpected_null_character + tokenizer.tok.pubid.get &= Rune(0xFFFD) + of '>': + parse_error abrupt_doctype_public_identifier + tokenizer.tok.quirks = true + switch_state DATA + emit_tok + of eof: + parse_error eof_in_doctype + 
tokenizer.tok.quirks = true + emit_tok + emit_eof + else: + tokenizer.tok.pubid.get &= tokenizer.curr + + of DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED: + case c + of '\'': switch_state AFTER_DOCTYPE_PUBLIC_IDENTIFIER + of null: + parse_error unexpected_null_character + tokenizer.tok.pubid.get &= Rune(0xFFFD) + of '>': + parse_error abrupt_doctype_public_identifier + tokenizer.tok.quirks = true + switch_state DATA + emit_tok + of eof: + parse_error eof_in_doctype + tokenizer.tok.quirks = true + emit_tok + emit_eof + else: + tokenizer.tok.pubid.get &= tokenizer.curr + + of AFTER_DOCTYPE_PUBLIC_IDENTIFIER: + case c + of whitespace: switch_state BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS + of '>': + switch_state DATA + emit_tok + of '"': + parse_error missing_whitespace_between_doctype_public_and_system_identifiers + tokenizer.tok.sysid = some("") + switch_state DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED + of '\'': + parse_error missing_whitespace_between_doctype_public_and_system_identifiers + tokenizer.tok.sysid = some("") + switch_state DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED + of eof: + parse_error eof_in_doctype + tokenizer.tok.quirks = true + emit_tok + emit_eof + else: + parse_error missing_quote_before_doctype_system_identifier + tokenizer.tok.quirks = true + reconsume_in BOGUS_DOCTYPE + + of BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS: + case c + of whitespace: discard + of '>': + switch_state DATA + emit_tok + of '"': + tokenizer.tok.sysid = some("") + switch_state DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED + of '\'': + tokenizer.tok.sysid = some("") + switch_state DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED + of eof: + parse_error eof_in_doctype + tokenizer.tok.quirks = true + emit_tok + emit_eof + else: + parse_error missing_quote_before_doctype_system_identifier + tokenizer.tok.quirks = true + reconsume_in BOGUS_DOCTYPE + + of AFTER_DOCTYPE_SYSTEM_KEYWORD: + case c + of whitespace: switch_state BEFORE_DOCTYPE_SYSTEM_IDENTIFIER + of '"': + parse_error 
missing_whitespace_after_doctype_system_keyword + tokenizer.tok.sysid = some("") + switch_state DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED + of '\'': + parse_error missing_whitespace_after_doctype_system_keyword + tokenizer.tok.sysid = some("") + switch_state DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED + of '>': + parse_error missing_doctype_system_identifier + tokenizer.tok.quirks = true + switch_state DATA + emit_tok + of eof: + parse_error eof_in_doctype + tokenizer.tok.quirks = true + emit_tok + emit_eof + else: + parse_error missing_quote_before_doctype_system_identifier + tokenizer.tok.quirks = true + reconsume_in BOGUS_DOCTYPE + + of BEFORE_DOCTYPE_SYSTEM_IDENTIFIER: + case c + of whitespace: discard + of '"': + tokenizer.tok.sysid = some("") # sysid, not pubid: spec sets the *system* identifier here + switch_state DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED + of '\'': + tokenizer.tok.sysid = some("") # sysid, not pubid: the following states append into sysid + switch_state DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED + of '>': + parse_error missing_doctype_system_identifier + tokenizer.tok.quirks = true + switch_state DATA + emit_tok + of eof: + parse_error eof_in_doctype + tokenizer.tok.quirks = true + emit_tok + emit_eof + else: + parse_error missing_quote_before_doctype_system_identifier + tokenizer.tok.quirks = true + reconsume_in BOGUS_DOCTYPE + + of DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED: + case c + of '"': switch_state AFTER_DOCTYPE_SYSTEM_IDENTIFIER + of null: + parse_error unexpected_null_character + tokenizer.tok.sysid.get &= Rune(0xFFFD) + of '>': + parse_error abrupt_doctype_system_identifier + tokenizer.tok.quirks = true + switch_state DATA + emit_tok + of eof: + parse_error eof_in_doctype + tokenizer.tok.quirks = true + emit_tok + emit_eof + else: + tokenizer.tok.sysid.get &= tokenizer.curr + + of DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED: + case c + of '\'': switch_state AFTER_DOCTYPE_SYSTEM_IDENTIFIER + of null: + parse_error unexpected_null_character + tokenizer.tok.sysid.get &= Rune(0xFFFD) + of '>': + parse_error abrupt_doctype_system_identifier + tokenizer.tok.quirks = 
true + switch_state DATA + emit_tok + of eof: + parse_error eof_in_doctype + tokenizer.tok.quirks = true + emit_tok + emit_eof + else: + tokenizer.tok.sysid.get &= tokenizer.curr + + of AFTER_DOCTYPE_SYSTEM_IDENTIFIER: + case c + of whitespace: discard + of '>': + switch_state DATA + emit_tok + of eof: + parse_error eof_in_doctype + tokenizer.tok.quirks = true + emit_tok + emit_eof + else: + parse_error unexpected_character_after_doctype_system_identifier + reconsume_in BOGUS_DOCTYPE + + of BOGUS_DOCTYPE: + case c + of '>': + switch_state DATA + emit_tok + of null: parse_error unexpected_null_character + of eof: + emit_tok + emit_eof + else: discard + + of CDATA_SECTION: + case c + of ']': switch_state CDATA_SECTION_BRACKET + of eof: + parse_error eof_in_cdata + emit_eof + else: + emit_current + + of CDATA_SECTION_BRACKET: + case c + of ']': switch_state CDATA_SECTION_END # spec: only ']' is special here; '>' must fall through to anything-else + else: + emit ']' + reconsume_in CDATA_SECTION + + of CDATA_SECTION_END: + case c + of ']': emit ']' + of '>': switch_state DATA + else: + emit ']' + emit ']' + reconsume_in CDATA_SECTION + + of CHARACTER_REFERENCE: + tokenizer.tmp = "&" + case c + of AsciiAlpha: reconsume_in NAMED_CHARACTER_REFERENCE + of '#': + tokenizer.tmp &= '#' + switch_state NUMERIC_CHARACTER_REFERENCE + else: + flush_code_points_consumed_as_a_character_reference + reconsume_in tokenizer.rstate + + of NAMED_CHARACTER_REFERENCE: + ignore_eof # we check for eof ourselves + tokenizer.reconsume() + when nimVm: + eprint "Cannot evaluate character references at compile time" + else: + var buf = "" + var node = entityMap + var value = none(string) # last value + #TODO interfacing with RadixNode is suffering + # plus this doesn't look very efficient either + while not tokenizer.atEof: + let c = tokenizer.consume() + buf &= c + if not node.hasPrefix(buf): + tokenizer.reconsume() + break + let prevnode = node + node = node{buf} + if node != prevnode: + buf = "" + if node.value.issome: + value = node.value 
+ tokenizer.tmp &= tokenizer.curr + if value.issome: + if consumed_as_an_attribute and tokenizer.tmp[^1] != ';' and peek_char in {'='} + AsciiAlpha: + flush_code_points_consumed_as_a_character_reference + switch_state tokenizer.rstate + else: + if tokenizer.tmp[^1] != ';': + parse_error missing_semicolon_after_character_reference_parse_error + tokenizer.tmp = value.get + flush_code_points_consumed_as_a_character_reference + switch_state tokenizer.rstate + else: + flush_code_points_consumed_as_a_character_reference + switch_state AMBIGUOUS_AMPERSAND_STATE + + of AMBIGUOUS_AMPERSAND_STATE: + case c + of AsciiAlpha: + if consumed_as_an_attribute: + append_to_current_attr_value c + else: + emit_current + of ';': + parse_error unknown_named_character_reference + reconsume_in tokenizer.rstate + else: reconsume_in tokenizer.rstate + + of NUMERIC_CHARACTER_REFERENCE: + tokenizer.code = 0 + case c + of 'x', 'X': + tokenizer.tmp &= c + switch_state HEXADECIMAL_CHARACTER_REFERENCE_START + else: reconsume_in DECIMAL_CHARACTER_REFERENCE_START + + of HEXADECIMAL_CHARACTER_REFERENCE_START: + case c + of AsciiHexDigit: reconsume_in HEXADECIMAL_CHARACTER_REFERENCE + else: + parse_error absence_of_digits_in_numeric_character_reference + flush_code_points_consumed_as_a_character_reference + reconsume_in tokenizer.rstate + + of DECIMAL_CHARACTER_REFERENCE_START: + case c + of AsciiDigit: reconsume_in DECIMAL_CHARACTER_REFERENCE + else: + parse_error absence_of_digits_in_numeric_character_reference + flush_code_points_consumed_as_a_character_reference + reconsume_in tokenizer.rstate + + of HEXADECIMAL_CHARACTER_REFERENCE: + case c + of AsciiHexDigit: # note: merged digit, upper hex, lower hex + tokenizer.code *= 0x10 + tokenizer.code += hexValue(c) + of ';': switch_state NUMERIC_CHARACTER_REFERENCE_END + else: + parse_error missing_semicolon_after_character_reference + reconsume_in NUMERIC_CHARACTER_REFERENCE_END + + of DECIMAL_CHARACTER_REFERENCE: + case c + of AsciiDigit: + 
tokenizer.code *= 10 + tokenizer.code += decValue(c) + of ';': switch_state NUMERIC_CHARACTER_REFERENCE_END + else: + parse_error missing_semicolon_after_character_reference + reconsume_in NUMERIC_CHARACTER_REFERENCE_END + + of NUMERIC_CHARACTER_REFERENCE_END: + ignore_eof # we reconsume anyway + case tokenizer.code + of 0x00: + parse_error null_character_reference + tokenizer.code = 0xFFFD + elif tokenizer.code > 0x10FFFF: + parse_error character_reference_outside_unicode_range + tokenizer.code = 0xFFFD + elif Rune(tokenizer.code).isSurrogate(): + parse_error surrogate_character_reference + tokenizer.code = 0xFFFD + elif Rune(tokenizer.code).isNonCharacter(): + parse_error noncharacter_character_reference + # do nothing + elif tokenizer.code in 0..255 and char(tokenizer.code) in ((Controls - AsciiWhitespace) + {chr(0x0D)}): + const ControlMapTable = [ + (0x80, 0x20AC), (0x82, 0x201A), (0x83, 0x0192), (0x84, 0x201E), + (0x85, 0x2026), (0x86, 0x2020), (0x87, 0x2021), (0x88, 0x02C6), + (0x89, 0x2030), (0x8A, 0x0160), (0x8B, 0x2039), (0x8C, 0x0152), + (0x8E, 0x017D), (0x91, 0x2018), (0x92, 0x2019), (0x93, 0x201C), + (0x94, 0x201D), (0x95, 0x2022), (0x96, 0x2013), (0x97, 0x2014), + (0x98, 0x02DC), (0x99, 0x2122), (0x9A, 0x0161), (0x9B, 0x203A), + (0x9C, 0x0153), (0x9E, 0x017E), (0x9F, 0x0178), + ].toTable() + if ControlMapTable.hasKey(tokenizer.code): + tokenizer.code = ControlMapTable[tokenizer.code] + tokenizer.tmp = $Rune(tokenizer.code) + flush_code_points_consumed_as_a_character_reference #TODO optimize so we flush directly + reconsume_in tokenizer.rstate # we unnecessarily consumed once so reconsume + diff --git a/src/html/tags.nim b/src/html/tags.nim index 9e356444..d3bd7f6b 100644 --- a/src/html/tags.nim +++ b/src/html/tags.nim @@ -25,24 +25,27 @@ type INPUT_URL, INPUT_WEEK TagType* = enum - TAG_UNKNOWN, TAG_HTML, TAG_BASE, TAG_HEAD, TAG_LINK, TAG_META, TAG_STYLE, + TAG_UNKNOWN, TAG_APPLET, TAG_BIG, TAG_HTML, TAG_BASE, TAG_BASEFONT, + TAG_BGSOUND, TAG_HEAD, 
TAG_LINK, TAG_LISTING, TAG_META, TAG_STYLE, TAG_TITLE, TAG_BODY, TAG_ADDRESS, TAG_ARTICLE, TAG_ASIDE, TAG_FOOTER, TAG_HEADER, TAG_H1, TAG_H2, TAG_H3, TAG_H4, TAG_H5, TAG_H6, TAG_HGROUP, TAG_MAIN, TAG_NAV, TAG_SECTION, TAG_BLOCKQUOTE, TAG_DD, TAG_DIV, TAG_DL, TAG_DT, TAG_FIGCAPTION, TAG_FIGURE, TAG_HR, TAG_LI, TAG_OL, TAG_P, TAG_PRE, - TAG_UL, TAG_A, TAG_ABBR, TAG_B, TAG_BDI, TAG_BDO, TAG_BR, TAG_CITE, - TAG_CODE, TAG_DATA, TAG_DFN, TAG_EM, TAG_I, TAG_KBD, TAG_MARK, TAG_Q, - TAG_RB, TAG_RP, TAG_RT, TAG_RTC, TAG_RUBY, TAG_S, TAG_SAMP, TAG_SMALL, - TAG_SPAN, TAG_STRONG, TAG_SUB, TAG_SUP, TAG_TIME, TAG_U, TAG_VAR, TAG_WBR, - TAG_AREA, TAG_AUDIO, TAG_IMG, TAG_MAP, TAG_TRACK, TAG_VIDEO, - TAG_IFRAME, TAG_OBJECT, TAG_PARAM, TAG_PICTURE, TAG_PORTAL, TAG_SOURCE, - TAG_CANVAS, TAG_NOSCRIPT, TAG_SCRIPT, TAG_DEL, TAG_INS, TAG_CAPTION, - TAG_COL, TAG_COLGROUP, TAG_TABLE, TAG_TBODY, TAG_TD, TAG_TFOOT, TAG_TH, - TAG_THEAD, TAG_TR, TAG_BUTTON, TAG_DATALIST, TAG_FIELDSET, TAG_FORM, - TAG_INPUT, TAG_LABEL, TAG_LEGEND, TAG_METER, TAG_OPTGROUP, TAG_OPTION, + TAG_UL, TAG_A, TAG_ABBR, TAG_B, TAG_BDI, TAG_BDO, TAG_BR, TAG_NOBR, + TAG_CITE, TAG_CODE, TAG_DATA, TAG_DFN, TAG_EM, TAG_EMBED, TAG_I, TAG_KBD, + TAG_MARK, TAG_MARQUEE, TAG_Q, TAG_RB, TAG_RP, TAG_RT, TAG_RTC, TAG_RUBY, + TAG_S, TAG_SAMP, TAG_SMALL, TAG_SPAN, TAG_STRONG, TAG_SUB, TAG_SUP, + TAG_TIME, TAG_U, TAG_VAR, TAG_WBR, TAG_AREA, TAG_AUDIO, TAG_IMG, TAG_IMAGE, + TAG_MAP, TAG_TRACK, TAG_VIDEO, TAG_IFRAME, TAG_OBJECT, TAG_PARAM, + TAG_PICTURE, TAG_PORTAL, TAG_SOURCE, TAG_CANVAS, TAG_NOSCRIPT, TAG_NOEMBED, + TAG_PLAINTEXT, TAG_XMP, TAG_SCRIPT, TAG_DEL, TAG_INS, TAG_CAPTION, TAG_COL, + TAG_COLGROUP, TAG_TABLE, TAG_TBODY, TAG_TD, TAG_TFOOT, TAG_TH, TAG_THEAD, + TAG_TR, TAG_BUTTON, TAG_DATALIST, TAG_FIELDSET, TAG_FORM, TAG_INPUT, + TAG_KEYGEN, TAG_LABEL, TAG_LEGEND, TAG_METER, TAG_OPTGROUP, TAG_OPTION, TAG_OUTPUT, TAG_PROGRESS, TAG_SELECT, TAG_TEXTAREA, TAG_DETAILS, TAG_DIALOG, TAG_MENU, TAG_SUMMARY, TAG_BLINK, TAG_CENTER, 
TAG_CONTENT, - TAG_DIR, TAG_FONT, TAG_FRAME, TAG_NOFRAMES, TAG_FRAMESET, TAG_STRIKE, TAG_TT + TAG_DIR, TAG_FONT, TAG_FRAME, TAG_NOFRAMES, TAG_FRAMESET, TAG_STRIKE, + TAG_TT, TAG_TEMPLATE, TAG_SARCASM func getTagTypeMap(): Table[string, TagType] = for i in TagType: @@ -88,6 +91,54 @@ const PClosingTagTypes* = { TAG_TABLE, TAG_UL } +const HTagTypes* = { + TAG_H1, TAG_H2, TAG_H3, TAG_H4, TAG_H5, TAG_H6 +} + const HeadTagTypes* = { TAG_BASE, TAG_LINK, TAG_META, TAG_TITLE, TAG_NOSCRIPT, TAG_SCRIPT, TAG_NOFRAMES, TAG_STYLE, TAG_HEAD } + +# 4.10.2 Categories +const FormAssociatedElements* = { + TAG_BUTTON, TAG_FIELDSET, TAG_INPUT, TAG_OBJECT, TAG_OUTPUT, TAG_SELECT, TAG_TEXTAREA, TAG_IMG +} + +const ListedElements* = { + TAG_BUTTON, TAG_FIELDSET, TAG_INPUT, TAG_OBJECT, TAG_OUTPUT, TAG_SELECT, TAG_TEXTAREA +} + +const SubmittableElements* = { + TAG_BUTTON, TAG_INPUT, TAG_SELECT, TAG_TEXTAREA +} + +const ResettableElements* = { + TAG_INPUT, TAG_OUTPUT, TAG_SELECT, TAG_TEXTAREA +} + +const AutocapitalizeInheritingElements* = { + TAG_BUTTON, TAG_FIELDSET, TAG_INPUT, TAG_OUTPUT, TAG_SELECT, TAG_TEXTAREA +} + +const LabelableElements* = { + # input only if type not hidden + TAG_BUTTON, TAG_INPUT, TAG_METER, TAG_OUTPUT, TAG_PROGRESS, TAG_SELECT, TAG_TEXTAREA +} + +#https://html.spec.whatwg.org/multipage/parsing.html#the-stack-of-open-elements +#NOTE MathML not implemented +#TODO SVG foreignObject, SVG desc, SVG title +const SpecialElements* = { + TAG_ADDRESS, TAG_APPLET, TAG_AREA, TAG_ARTICLE, TAG_ASIDE, TAG_BASE, + TAG_BASEFONT, TAG_BGSOUND, TAG_BLOCKQUOTE, TAG_BODY, TAG_BR, TAG_BUTTON, + TAG_CAPTION, TAG_CENTER, TAG_COL, TAG_COLGROUP, TAG_DD, TAG_DETAILS, TAG_DIR, + TAG_DIV, TAG_DL, TAG_DT, TAG_EMBED, TAG_FIELDSET, TAG_FIGCAPTION, TAG_FIGURE, + TAG_FOOTER, TAG_FORM, TAG_FRAME, TAG_FRAMESET, TAG_H1, TAG_H2, TAG_H3, TAG_H4, + TAG_H5, TAG_H6, TAG_HEAD, TAG_HEADER, TAG_HGROUP, TAG_HR, TAG_HTML, + TAG_IFRAME, TAG_IMG, TAG_INPUT, TAG_KEYGEN, TAG_LI, TAG_LINK, TAG_LISTING, + 
TAG_MAIN, TAG_MARQUEE, TAG_MENU, TAG_META, TAG_NAV, TAG_NOEMBED, TAG_NOFRAMES, + TAG_NOSCRIPT, TAG_OBJECT, TAG_OL, TAG_P, TAG_PARAM, TAG_PLAINTEXT, TAG_PRE, + TAG_SCRIPT, TAG_SECTION, TAG_SELECT, TAG_SOURCE, TAG_STYLE, TAG_SUMMARY, + TAG_TABLE, TAG_TBODY, TAG_TD, TAG_TEMPLATE, TAG_TEXTAREA, TAG_TFOOT, TAG_TH, + TAG_THEAD, TAG_TITLE, TAG_TR, TAG_TRACK, TAG_UL, TAG_WBR, TAG_XMP +} |