diff options
author | bptato <nincsnevem662@gmail.com> | 2022-06-27 23:53:44 +0200 |
---|---|---|
committer | bptato <nincsnevem662@gmail.com> | 2022-07-11 21:08:10 +0200 |
commit | 62cba694e47a7a1f4bedc7fd48ceac9c26aa3aa1 (patch) | |
tree | e20a9f39a293c256f707162c46e117d13f3d5621 /src/html/htmlparser.nim | |
parent | 84882cb8a6f9bca58d178a1f2b8fb5cafa8b3a56 (diff) | |
download | chawan-62cba694e47a7a1f4bedc7fd48ceac9c26aa3aa1.tar.gz |
Implement HTML5 parsing
Completely replaced the previous HTML2 (?) parser, which was a bad re-implementation of w3m's parser in the first place. Now we have a (sort of) compliant HTML5 parser. Needs tests, badly.
Diffstat (limited to 'src/html/htmlparser.nim')
-rw-r--r-- | src/html/htmlparser.nim | 3887 |
1 files changed, 1920 insertions, 1967 deletions
diff --git a/src/html/htmlparser.nim b/src/html/htmlparser.nim
index d5d5effe..3e962495 100644
--- a/src/html/htmlparser.nim
+++ b/src/html/htmlparser.nim
@@ -1,2009 +1,1962 @@
-import streams
-import unicode
-import strutils
-import tables
-import json
 import macros
 import options
+import sequtils
+import streams
 import strformat
+import tables
+import unicode
 import utils/twtstr
-import utils/radixtree
 import html/dom
-import html/entity
 import html/tags
+import html/htmltokenizer
 import css/sheet
 type
-  HTMLParseState = object
-    in_comment: bool
-    in_script: bool
-    in_style: bool
-    in_noscript: bool
-    in_body: bool
-    skip_lf: bool
-    elementNode: Element
-    textNode: Text
-    commentNode: Comment
+  HTML5Parser = object
+    case fragment: bool
+    of true: ctx: Element
+    else: discard
+    openElements: seq[Element]
+    insertionMode: InsertionMode
+    oldInsertionMode: InsertionMode
+    templateModes: seq[InsertionMode]
+    head: Element
+    tokenizer: Tokenizer
     document: Document
-    formowners: seq[HTMLFormElement]
+    form: HTMLFormElement
+    fosterParenting: bool
+    scripting: bool
+    activeFormatting: seq[(Element, Token)] # nil => marker
+    framesetok: bool
+    ignoreLF: bool
+    pendingTableChars: string
+    pendingTableCharsWhitespace: bool
+
+  AdjustedInsertionLocation = tuple[inside: Node, before: Node]
+
+# 13.2.4.1
+  InsertionMode = enum
+    INITIAL, BEFORE_HTML, BEFORE_HEAD, IN_HEAD, IN_HEAD_NOSCRIPT, AFTER_HEAD,
+    IN_BODY, TEXT, IN_TABLE, IN_TABLE_TEXT, IN_CAPTION, IN_COLUMN_GROUP,
+    IN_TABLE_BODY, IN_ROW, IN_CELL, IN_SELECT, IN_SELECT_IN_TABLE, IN_TEMPLATE,
+    AFTER_BODY, IN_FRAMESET, AFTER_FRAMESET, AFTER_AFTER_BODY,
+    AFTER_AFTER_FRAMESET
+
+proc resetInsertionMode(parser: var HTML5Parser) = # 13.2.4.1 "reset the insertion mode appropriately": walks openElements top-down and sets parser.insertionMode from the first matching node
+  template switch_insertion_mode_and_return(mode: InsertionMode) =
+    parser.insertionMode = mode
+    return
+  for i in countdown(parser.openElements.high, 0):
+    var node = parser.openElements[i]
+    let last = i == 0
+    if last and parser.fragment: # the context element only stands in for the bottom-most node of the stack
+      node = parser.ctx
+    if node.tagType == TAG_SELECT:
+      if not last:
+        for j in countdown(i - 1, 1): # ancestors of node only: entries below it, stopping before the first (html) entry
+          let ancestor = parser.openElements[j]
+          case ancestor.tagType
+          of TAG_TEMPLATE: break
+          of TAG_TABLE: switch_insertion_mode_and_return IN_SELECT_IN_TABLE
+          else: discard
+      switch_insertion_mode_and_return IN_SELECT
+    case node.tagType
+    of TAG_TD, TAG_TH:
+      if not last:
+        switch_insertion_mode_and_return IN_CELL
+    of TAG_TR: switch_insertion_mode_and_return IN_ROW
+    of TAG_TBODY, TAG_THEAD, TAG_TFOOT: switch_insertion_mode_and_return IN_TABLE_BODY
+    of TAG_CAPTION: switch_insertion_mode_and_return IN_CAPTION
+    of TAG_COLGROUP: switch_insertion_mode_and_return IN_COLUMN_GROUP
+    of TAG_TABLE: switch_insertion_mode_and_return IN_TABLE
+    of TAG_TEMPLATE: switch_insertion_mode_and_return parser.templateModes[^1]
+    of TAG_HEAD:
+      if not last:
+        switch_insertion_mode_and_return IN_HEAD
+    of TAG_BODY: switch_insertion_mode_and_return IN_BODY
+    of TAG_FRAMESET: switch_insertion_mode_and_return IN_FRAMESET
+    of TAG_HTML:
+      if parser.head == nil: # head element pointer not set yet => still before <head>
+        switch_insertion_mode_and_return BEFORE_HEAD
+      else:
+        switch_insertion_mode_and_return AFTER_HEAD
+    else: discard
+    if last:
+      switch_insertion_mode_and_return IN_BODY
-# Tokenizer
-type
-  Tokenizer = object
-    state: TokenizerState
-    rstate: TokenizerState
-    curr: Rune
-    tmp: string
-    code: int
-    tok: Token
-    laststart: Token
-    attrn: string
-    attrv: string
-    attr: bool
-
-    istream: Stream
-    sbuf: string
-    sbuf_i: int
-    sbuf_ip: int
-    eof_i: int
-
-  TokenType = enum
-    DOCTYPE, START_TAG, END_TAG, COMMENT, CHARACTER, EOF
-
-  TokenizerState = enum
-    DATA, CHARACTER_REFERENCE, TAG_OPEN, RCDATA, RCDATA_LESS_THAN_SIGN,
-    RAWTEXT, RAWTEXT_LESS_THAN_SIGN, SCRIPT_DATA, SCRIPT_DATA_LESS_THAN_SIGN,
-    PLAINTEXT, MARKUP_DECLARATION_OPEN, END_TAG_OPEN, BOGUS_COMMENT, TAG_NAME,
-    BEFORE_ATTRIBUTE_NAME, RCDATA_END_TAG_OPEN, RCDATA_END_TAG_NAME,
-    RAWTEXT_END_TAG_OPEN, RAWTEXT_END_TAG_NAME, SELF_CLOSING_START_TAG,
-    SCRIPT_DATA_END_TAG_OPEN, SCRIPT_DATA_ESCAPE_START,
-    SCRIPT_DATA_END_TAG_NAME, SCRIPT_DATA_ESCAPE_START_DASH,
-
SCRIPT_DATA_ESCAPED_DASH_DASH, SCRIPT_DATA_ESCAPED, - SCRIPT_DATA_ESCAPED_DASH, SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN, - SCRIPT_DATA_ESCAPED_END_TAG_OPEN, SCRIPT_DATA_DOUBLE_ESCAPE_START, - SCRIPT_DATA_ESCAPED_END_TAG_NAME, SCRIPT_DATA_DOUBLE_ESCAPED, - SCRIPT_DATA_DOUBLE_ESCAPED_DASH, SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN, - SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH, SCRIPT_DATA_DOUBLE_ESCAPE_END, - AFTER_ATTRIBUTE_NAME, ATTRIBUTE_NAME, BEFORE_ATTRIBUTE_VALUE, - ATTRIBUTE_VALUE_DOUBLE_QUOTED, ATTRIBUTE_VALUE_SINGLE_QUOTED, - ATTRIBUTE_VALUE_UNQUOTED, AFTER_ATTRIBUTE_VALUE_QUOTED, COMMENT_START, - CDATA_SECTION, COMMENT_START_DASH, COMMENT, COMMENT_END, - COMMENT_LESS_THAN_SIGN, COMMENT_END_DASH, COMMENT_LESS_THAN_SIGN_BANG, - COMMENT_LESS_THAN_SIGN_BANG_DASH, COMMENT_LESS_THAN_SIGN_BANG_DASH_DASH, - COMMENT_END_BANG, DOCTYPE, BEFORE_DOCTYPE_NAME, DOCTYPE_NAME, - AFTER_DOCTYPE_NAME, AFTER_DOCTYPE_PUBLIC_KEYWORD, - AFTER_DOCTYPE_SYSTEM_KEYWORD, BOGUS_DOCTYPE, - BEFORE_DOCTYPE_PUBLIC_IDENTIFIER, DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED, - DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED, AFTER_DOCTYPE_PUBLIC_IDENTIFIER, - BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS, - DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED, - DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED, BEFORE_DOCTYPE_SYSTEM_IDENTIFIER, - AFTER_DOCTYPE_SYSTEM_IDENTIFIER, CDATA_SECTION_BRACKET, CDATA_SECTION_END, - NAMED_CHARACTER_REFERENCE, NUMERIC_CHARACTER_REFERENCE, - AMBIGUOUS_AMPERSAND_STATE, HEXADECIMAL_CHARACTER_REFERENCE_START, - DECIMAL_CHARACTER_REFERENCE_START, HEXADECIMAL_CHARACTER_REFERENCE, - DECIMAL_CHARACTER_REFERENCE, NUMERIC_CHARACTER_REFERENCE_END - - Token = ref object - case t: TokenType - of DOCTYPE: - name: Option[string] - pubid: Option[string] - sysid: Option[string] - quirks: bool - of START_TAG, END_TAG: - tagname: string - selfclosing: bool - attrs: Table[string, string] - of CHARACTER: - r: Rune - of COMMENT: - data: string - of EOF: discard - -func `$`*(tok: Token): string = - case tok.t - of DOCTYPE: 
fmt"{tok.t} {tok.name} {tok.pubid} {tok.sysid} {tok.quirks}" - of START_TAG, END_TAG: fmt"{tok.t} {tok.tagname} {tok.selfclosing} {tok.attrs}" - of CHARACTER: fmt"{tok.t} {tok.r}" - of COMMENT: fmt"{tok.t} {tok.data}" - of EOF: fmt"{tok.t}" - -const bufSize = 512 -const copyBufSize = 16 -proc newTokenizer(s: Stream): Tokenizer = - result.sbuf = newString(bufSize) - result.istream = s - if result.istream.atEnd: - result.eof_i = 0 +func currentNode(parser: HTML5Parser): Element = + if parser.openElements.len == 0: + assert false else: - let n = s.readDataStr(result.sbuf, 0..bufSize-1) - result.eof_i = n - -func atEof(t: Tokenizer): bool = - t.eof_i != -1 and t.sbuf_i >= t.eof_i - -proc consume(t: var Tokenizer): char {.inline.} = - if t.sbuf_i >= bufSize-copyBufSize: - var sbuf2 = newString(copyBufSize) - var i = 0 - while t.sbuf_i + i < bufSize: - sbuf2[i] = t.sbuf[t.sbuf_i + i] - inc i - let n = t.istream.readDataStr(t.sbuf, i..bufSize-1) - if n != bufSize - i: - t.eof_i = i + n - t.sbuf_i = 0 - - var j = 0 - while j < i: - t.sbuf[j] = sbuf2[j] - inc j - - assert t.eof_i == -1 or t.sbuf_i < t.eof_i # not consuming eof... 
- t.sbuf_ip = t.sbuf_i # save previous pointer for potential reconsume - - # Normalize newlines (\r\n -> \n, single \r -> \n) - if t.sbuf[t.sbuf_i] == '\r': - inc t.sbuf_i - if t.sbuf[t.sbuf_i] != '\n': - # \r - result = '\n' - t.curr = Rune('\n') - return - # else, \r\n so just return the \n - - result = t.sbuf[t.sbuf_i] - fastRuneAt(t.sbuf, t.sbuf_i, t.curr) - -proc reconsume(t: var Tokenizer) = - t.sbuf_i = t.sbuf_ip - -iterator tokenize(tokenizer: var Tokenizer): Token = - template emit(tok: Token) = - if tok.t == START_TAG: - tokenizer.laststart = tok - yield tok - template emit(tok: TokenType) = emit Token(t: tok) - template emit(rn: Rune) = emit Token(t: CHARACTER, r: rn) - template emit(ch: char) = emit Token(t: CHARACTER, r: Rune(ch)) - template emit_eof = - emit EOF - break - template emit_tok = - if tokenizer.attr: - tokenizer.tok.attrs[tokenizer.attrn] = tokenizer.attrv - emit tokenizer.tok - template emit_current = - if tokenizer.atEof: - emit_eof - else: - emit Token(t: CHARACTER, r: tokenizer.curr) - template emit_replacement = emit Rune(0xFFFD) - template switch_state(s: TokenizerState) = - tokenizer.state = s - template switch_state_return(s: TokenizerState) = - tokenizer.rstate = tokenizer.state - tokenizer.state = s - template reconsume_in(s: TokenizerState) = - tokenizer.reconsume() - switch_state s - template parse_error(error: untyped) = discard # does nothing for now... TODO? 
- template is_appropriate_end_tag_token(): bool = - tokenizer.laststart != nil and tokenizer.laststart.data == tokenizer.tok.data - template start_new_attribute = - if tokenizer.attr: - tokenizer.tok.attrs[tokenizer.attrn] = tokenizer.attrv - tokenizer.attrn = "" - tokenizer.attrv = "" - tokenizer.attr = true - template leave_attribute_name_state = - if tokenizer.attrn in tokenizer.tok.attrs: - tokenizer.attr = false - template append_to_current_attr_value(c: typed) = - if tokenizer.attr: - tokenizer.attrv &= c - template peek_str(s: string): bool = - # WARNING: will break on strings with copyBufSize + 4 bytes - assert s.len < copyBufSize - 4 and s.len > 0 - if tokenizer.sbuf_i + s.len > tokenizer.eof_i: - false - else: - let slice = tokenizer.sbuf[tokenizer.sbuf_i..tokenizer.sbuf_i+s.high] - s == slice - template peek_str_nocase(s: string): bool = - # WARNING: will break on strings with copyBufSize + 4 bytes - # WARNING: only works with UPPER CASE ascii - assert s.len < copyBufSize - 4 and s.len > 0 - if tokenizer.sbuf_i + s.len > tokenizer.eof_i: - false - else: - let slice = tokenizer.sbuf[tokenizer.sbuf_i..tokenizer.sbuf_i+s.high] - s == slice.toUpperAscii() - template peek_char(): char = tokenizer.sbuf[tokenizer.sbuf_i] - template has_adjusted_current_node(): bool = false #TODO implement this - template consume_and_discard(n: int) = #TODO optimize - var i = 0 - while i < n: - discard tokenizer.consume() - inc i - template consumed_as_an_attribute(): bool = - tokenizer.rstate in {ATTRIBUTE_VALUE_DOUBLE_QUOTED, ATTRIBUTE_VALUE_SINGLE_QUOTED, ATTRIBUTE_VALUE_UNQUOTED} - template flush_code_points_consumed_as_a_character_reference() = - if consumed_as_an_attribute: - append_to_current_attr_value tokenizer.tmp - else: - for r in tokenizer.tmp.runes: - emit r - template new_token(t: Token) = - if tokenizer.attr: - tokenizer.attr = false - tokenizer.tok = t - - # Fake EOF as an actual character. Also replace anything_else with the else - # branch. 
- # Yes this is kind of ugly but it works and I'm too lazy to come up with - # anything better. - macro stateMachine(states: varargs[untyped]): untyped = - var maincase = newNimNode(nnkCaseStmt).add(quote do: tokenizer.state) - for state in states: - if state.kind == nnkOfBranch: - let mainstmtlist = findChild(state, it.kind == nnkStmtList) - if mainstmtlist[0].kind == nnkIdent and mainstmtlist[0].strVal == "ignore_eof": - maincase.add(state) - continue - - var hasanythingelse = false - if mainstmtlist[0].kind == nnkIdent and mainstmtlist[0].strVal == "has_anything_else": - hasanythingelse = true - - let childcase = findChild(mainstmtlist, it.kind == nnkCaseStmt) - var haseof = false - var eofstmts: NimNode - var elsestmts: NimNode - - for i in countdown(childcase.len-1, 0): - let childof = childcase[i] - if childof.kind == nnkOfBranch: - for j in countdown(childof.len-1, 0): - if childof[j].kind == nnkIdent and childof[j].strVal == "eof": - haseof = true - eofstmts = childof.findChild(it.kind == nnkStmtList) - if childof.findChild(it.kind == nnkIdent and it.strVal != "eof") != nil: - childof.del(j) - else: - childcase.del(i) - elif childof.kind == nnkElse: - elsestmts = childof.findChild(it.kind == nnkStmtList) - - if not haseof: - eofstmts = elsestmts - let fake_eof = quote do: - if tokenizer.atEof: - `eofstmts` - continue - mainstmtlist.insert(0, fake_eof) - if hasanythingelse: - let fake_anything_else = quote do: - template anything_else = - `elsestmts` - mainstmtlist.insert(0, fake_anything_else) - maincase.add(state) - result = newNimNode(nnkStmtList) - result.add(maincase) - - template ignore_eof = discard # does nothing - template has_anything_else = discard # does nothing - - const null = char(0) - const whitespace = {'\t', '\n', '\f', ' '} + return parser.openElements[^1] + +func adjustedCurrentNode(parser: HTML5Parser): Element = + if parser.fragment: parser.ctx + else: parser.currentNode + +template parse_error() = discard + +func 
lastElementOfTag(parser: HTML5Parser, tagType: TagType): tuple[element: Element, pos: int] = + for i in countdown(parser.openElements.high, 0): + if parser.openElements[i].tagType == tagType: + return (parser.openElements[i], i) + return (nil, -1) + +template last_child_of(n: Node): AdjustedInsertionLocation = + (n, nil) + +# 13.2.6.1 +func appropriatePlaceForInsert(parser: HTML5Parser, target: Element): AdjustedInsertionLocation = + assert parser.openElements[0].tagType == TAG_HTML + if parser.fosterParenting and target.tagType in {TAG_TABLE, TAG_TBODY, TAG_TFOOT, TAG_THEAD, TAG_TR}: + let lastTemplate = parser.lastElementOfTag(TAG_TEMPLATE) + let lastTable = parser.lastElementOfTag(TAG_TABLE) + if lastTemplate.element != nil and (lastTable.element == nil or lastTable.pos < lastTemplate.pos): + return last_child_of(HTMLTemplateElement(lastTemplate.element).content) + if lastTable.element == nil: + return last_child_of(parser.openElements[0]) + if lastTable.element.parentNode != nil: + return (lastTable.element.parentNode, lastTable.element) + let previousElement = parser.openElements[lastTable.pos - 1] + result = last_child_of(previousElement) + else: + result = last_child_of(target) + if result.inside.nodeType == ELEMENT_NODE and Element(result.inside).tagType == TAG_TEMPLATE: + result = (HTMLTemplateElement(result.inside).content, nil) + +func appropriatePlaceForInsert(parser: HTML5Parser): AdjustedInsertionLocation = + parser.appropriatePlaceForInsert(parser.currentNode) + +func hasElement(elements: seq[Element], tag: TagType): bool = + for element in elements: + if element.tagType == tag: + return true + return false + +func hasElementInSpecificScope(elements: seq[Element], target: Element, list: set[TagType]): bool = + for i in countdown(elements.high, 0): + if elements[i] == target: + return true + if elements[i].tagType in list: + return false + assert false + +func hasElementInSpecificScope(elements: seq[Element], target: TagType, list: set[TagType]): bool 
= + for i in countdown(elements.high, 0): + if elements[i].tagType == target: + return true + if elements[i].tagType in list: + return false + assert false + +func hasElementInSpecificScope(elements: seq[Element], target: set[TagType], list: set[TagType]): bool = + for i in countdown(elements.high, 0): + if elements[i].tagType in target: + return true + if elements[i].tagType in list: + return false + assert false + +const Scope = {TAG_APPLET, TAG_CAPTION, TAG_HTML, TAG_TABLE, TAG_TD, TAG_TH, + TAG_MARQUEE, TAG_OBJECT, TAG_TEMPLATE} #TODO SVG (NOTE MathML not implemented) +func hasElementInScope(elements: seq[Element], target: TagType): bool = + return elements.hasElementInSpecificScope(target, Scope) + +func hasElementInScope(elements: seq[Element], target: set[TagType]): bool = + return elements.hasElementInSpecificScope(target, Scope) + +func hasElementInScope(elements: seq[Element], target: Element): bool = + return elements.hasElementInSpecificScope(target, Scope) + +func hasElementInListItemScope(elements: seq[Element], target: TagType): bool = + return elements.hasElementInSpecificScope(target, Scope + {TAG_OL, TAG_UL}) + +func hasElementInButtonScope(elements: seq[Element], target: TagType): bool = + return elements.hasElementInSpecificScope(target, Scope + {TAG_BUTTON}) + +func hasElementInTableScope(elements: seq[Element], target: TagType): bool = + return elements.hasElementInSpecificScope(target, {TAG_HTML, TAG_TABLE, TAG_TEMPLATE}) + +func hasElementInTableScope(elements: seq[Element], target: set[TagType]): bool = + return elements.hasElementInSpecificScope(target, {TAG_HTML, TAG_TABLE, TAG_TEMPLATE}) + +func hasElementInSelectScope(elements: seq[Element], target: TagType): bool = + for i in countdown(elements.high, 0): + if elements[i].tagType == target: + return true + if elements[i].tagType notin {TAG_OPTION, TAG_OPTGROUP}: + return false + assert false + +func createElement(parser: HTML5Parser, token: Token, namespace: string, intendedParent: 
Node): Element = + #TODO custom elements + let document = intendedParent.document + let localName = token.tagname + let element = document.newHTMLElement(localName, namespace, tagType = token.tagtype) + for k, v in token.attrs: + element.appendAttribute(k, v) + if element.isResettable(): + element.reset() + + if element.tagType in FormAssociatedElements and parser.form != nil and + not parser.openElements.hasElement(TAG_TEMPLATE) and + (element.tagType notin ListedElements or not element.attrb("form")) and + element.inSameTree(parser.form): + element.setForm(parser.form) + return element + +proc insert(location: AdjustedInsertionLocation, node: Node) = + location.inside.insert(node, location.before) + +proc insertForeignElement(parser: var HTML5Parser, token: Token, namespace: string): Element = + let location = parser.appropriatePlaceForInsert() + let element = parser.createElement(token, namespace, location.inside) + if location.inside.preInsertionValidity(element, location.before): + #TODO custom elements + location.insert(element) + parser.openElements.add(element) + return element + +proc insertHTMLElement(parser: var HTML5Parser, token: Token): Element = + return parser.insertForeignElement(token, $Namespace.HTML) + +template insert_character_impl(parser: var HTML5Parser, data: typed) = + let location = parser.appropriatePlaceForInsert() + if location.inside.nodeType == DOCUMENT_NODE: + return + let insertNode = if location.before == nil: + location.inside.lastChild + else: + location.before.previousSibling + assert location.before == nil + if insertNode != nil and insertNode.nodeType == TEXT_NODE: + dom.Text(insertNode).data &= data + else: + let text = location.inside.document.newText($data) + location.insert(text) + + if location.inside.nodeType == ELEMENT_NODE: + let parent = Element(location.inside) + if parent.tagType == TAG_STYLE: + let parent = HTMLStyleElement(parent) + parent.sheet_invalid = true + +proc insertCharacter(parser: var HTML5Parser, 
data: string) = + insert_character_impl(parser, data) + +proc insertCharacter(parser: var HTML5Parser, data: char) = + insert_character_impl(parser, data) + +proc insertCharacter(parser: var HTML5Parser, data: Rune) = + insert_character_impl(parser, data) + +proc insertComment(parser: var HTML5Parser, token: Token, position: AdjustedInsertionLocation) = + position.insert(position.inside.document.newComment(token.data)) + +proc insertComment(parser: var HTML5Parser, token: Token) = + let position = parser.appropriatePlaceForInsert() + position.insert(position.inside.document.newComment(token.data)) + +const PublicIdentifierEquals = [ + "-//W3O//DTD W3 HTML Strict 3.0//EN//", + "-/W3C/DTD HTML 4.0 Transitional/EN", + "HTML" +] + +const PublicIdentifierStartsWith = [ + "+//Silmaril//dtd html Pro v0r11 19970101//", + "-//AS//DTD HTML 3.0 asWedit + extensions//", + "-//AdvaSoft Ltd//DTD HTML 3.0 asWedit + extensions//", + "-//IETF//DTD HTML 2.0 Level 1//", + "-//IETF//DTD HTML 2.0 Level 2//", + "-//IETF//DTD HTML 2.0 Strict Level 1//", + "-//IETF//DTD HTML 2.0 Strict Level 2//", + "-//IETF//DTD HTML 2.0 Strict//", + "-//IETF//DTD HTML 2.0//", + "-//IETF//DTD HTML 2.1E//", + "-//IETF//DTD HTML 3.0//", + "-//IETF//DTD HTML 3.2 Final//", + "-//IETF//DTD HTML 3.2//", + "-//IETF//DTD HTML 3//", + "-//IETF//DTD HTML Level 0//", + "-//IETF//DTD HTML Level 1//", + "-//IETF//DTD HTML Level 2//", + "-//IETF//DTD HTML Level 3//", + "-//IETF//DTD HTML Strict Level 0//", + "-//IETF//DTD HTML Strict Level 1//", + "-//IETF//DTD HTML Strict Level 2//", + "-//IETF//DTD HTML Strict Level 3//", + "-//IETF//DTD HTML Strict//", + "-//IETF//DTD HTML//", + "-//Metrius//DTD Metrius Presentational//", + "-//Microsoft//DTD Internet Explorer 2.0 HTML Strict//", + "-//Microsoft//DTD Internet Explorer 2.0 HTML//", + "-//Microsoft//DTD Internet Explorer 2.0 Tables//", + "-//Microsoft//DTD Internet Explorer 3.0 HTML Strict//", + "-//Microsoft//DTD Internet Explorer 3.0 HTML//", + "-//Microsoft//DTD 
Internet Explorer 3.0 Tables//", + "-//Netscape Comm. Corp.//DTD HTML//", + "-//Netscape Comm. Corp.//DTD Strict HTML//", + "-//O'Reilly and Associates//DTD HTML 2.0//", + "-//O'Reilly and Associates//DTD HTML Extended 1.0//", + "-//O'Reilly and Associates//DTD HTML Extended Relaxed 1.0//", + "-//SQ//DTD HTML 2.0 HoTMetaL + extensions//", + "-//SoftQuad Software//DTD HoTMetaL PRO 6.0::19990601::extensions to HTML 4.0//", + "-//SoftQuad//DTD HoTMetaL PRO 4.0::19971010::extensions to HTML 4.0//", + "-//Spyglass//DTD HTML 2.0 Extended//", + "-//Sun Microsystems Corp.//DTD HotJava HTML//", + "-//Sun Microsystems Corp.//DTD HotJava Strict HTML//", + "-//W3C//DTD HTML 3 1995-03-24//", + "-//W3C//DTD HTML 3.2 Draft//", + "-//W3C//DTD HTML 3.2 Final//", + "-//W3C//DTD HTML 3.2//", + "-//W3C//DTD HTML 3.2S Draft//", + "-//W3C//DTD HTML 4.0 Frameset//", + "-//W3C//DTD HTML 4.0 Transitional//", + "-//W3C//DTD HTML Experimental 19960712//", + "-//W3C//DTD HTML Experimental 970421//", + "-//W3C//DTD W3 HTML//", + "-//W3O//DTD W3 HTML 3.0//", + "-//WebTechs//DTD Mozilla HTML 2.0//", + "-//WebTechs//DTD Mozilla HTML//", +] + +const SystemIdentifierMissingAndPublicIdentifierStartsWith = [ + "-//W3C//DTD HTML 4.01 Frameset//", + "-//W3C//DTD HTML 4.01 Transitional//" +] + +const PublicIdentifierStartsWithLimited = [ + "-//W3C//DTD XHTML 1.0 Frameset//", + "-//W3C//DTD XHTML 1.0 Transitional//" +] + +const SystemIdentifierNotMissingAndPublicIdentifierStartsWith = [ + "-//W3C//DTD HTML 4.01 Frameset//", + "-//W3C//DTD HTML 4.01 Transitional//" +] + +func quirksConditions(token: Token): bool = + if token.quirks: return true + if token.name.isnone or token.name.get != "html": return true + if token.sysid.issome: + if token.sysid.get == "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd": + return true + if token.pubid.issome: + if token.pubid.get in PublicIdentifierEquals: + return true + for id in PublicIdentifierStartsWith: + if token.pubid.get.startsWithNoCase(id): + return 
true + if token.sysid.isnone: + for id in SystemIdentifierMissingAndPublicIdentifierStartsWith: + if token.pubid.get.startsWithNoCase(id): + return true + return false + +func limitedQuirksConditions(token: Token): bool = + if token.pubid.isnone: return false + for id in PublicIdentifierStartsWithLimited: + if token.pubid.get.startsWithNoCase(id): + return true + if token.sysid.isnone: return false + for id in SystemIdentifierNotMissingAndPublicIdentifierStartsWith: + if token.pubid.get.startsWithNoCase(id): + return true + return false + +# 13.2.6.2 +proc genericRawtextElementParsingAlgorithm(parser: var HTML5Parser, token: Token) = + discard parser.insertHTMLElement(token) + parser.tokenizer.state = RAWTEXT + parser.oldInsertionMode = parser.insertionMode + parser.insertionMode = TEXT + +proc genericRCDATAElementParsingAlgorithm(parser: var HTML5Parser, token: Token) = + discard parser.insertHTMLElement(token) + parser.tokenizer.state = RCDATA + parser.oldInsertionMode = parser.insertionMode + parser.insertionMode = TEXT + +# 13.2.6.3 +proc generateImpliedEndTags(parser: var HTML5Parser) = + const tags = {TAG_DD, TAG_DT, TAG_LI, TAG_OPTGROUP, TAG_OPTION, TAG_P, + TAG_RB, TAG_RP, TAG_RT, TAG_RTC} + while parser.currentNode.tagType in tags: + discard parser.openElements.pop() + +proc generateImpliedEndTags(parser: var HTML5Parser, exclude: TagType) = + let tags = {TAG_DD, TAG_DT, TAG_LI, TAG_OPTGROUP, TAG_OPTION, TAG_P, + TAG_RB, TAG_RP, TAG_RT, TAG_RTC} - {exclude} + while parser.currentNode.tagType in tags: + discard parser.openElements.pop() + +proc generateImpliedEndTagsThoroughly(parser: var HTML5Parser) = + const tags = {TAG_CAPTION, TAG_COLGROUP, TAG_DD, TAG_DT, TAG_LI, + TAG_OPTGROUP, TAG_OPTION, TAG_P, TAG_RB, TAG_RP, TAG_RT, + TAG_RTC, TAG_TBODY, TAG_TD, TAG_TFOOT, TAG_TH, TAG_THEAD, + TAG_TR} + while parser.currentNode.tagType in tags: + discard parser.openElements.pop() + +# 13.2.4.3 +proc pushOntoActiveFormatting(parser: var HTML5Parser, element: 
Element, token: Token) =
+  var count = 0
+  for i in countdown(parser.activeFormatting.high, 0):
+    let it = parser.activeFormatting[i]
+    if it[0] == nil: break
+    if it[0].tagType != element.tagType: continue
+    if it[0].tagType == TAG_UNKNOWN:
+      if it[0].localName != element.localName: continue #TODO local or qualified?
+    if it[0].namespace != element.namespace: continue
+    var fail = false
+    for k, v in it[0].attributes:
+      if k notin element.attributes:
+        fail = true
+        break
+      if v != element.attributes[k]:
+        fail = true
+        break
+    if fail: continue
+    for k, v in element.attributes:
+      if k notin it[0].attributes:
+        fail = true
+        break
+    if fail: continue
+    inc count
+    if count == 3:
+      parser.activeFormatting.delete(i) # order-preserving removal of the earliest match; seq.del would move the last entry into slot i
+      break
+  parser.activeFormatting.add((element, token))
+proc reconstructActiveFormatting(parser: var HTML5Parser) =
+  type State = enum
+    REWIND, ADVANCE, CREATE
+  if parser.activeFormatting.len == 0:
+    return
+  if parser.activeFormatting[^1][0] == nil or parser.openElements.hasElement(parser.activeFormatting[^1][0].tagType):
+    return
+  var i = parser.activeFormatting.high
+  template entry: Element = (parser.activeFormatting[i][0])
+  var state = REWIND
   while true:
     {.computedGoto.}
-    let c = tokenizer.consume()
-    stateMachine: # => case tokenizer.state
-    of DATA:
-      case c
-      of '&': switch_state_return CHARACTER_REFERENCE
-      of '<': switch_state TAG_OPEN
-      of null:
-        parse_error unexpected_null_character
-        emit_current
-      of eof: emit_eof
-      else: emit_current
-
-    of RCDATA:
-      case c
-      of '&': switch_state_return CHARACTER_REFERENCE
-      of '<': switch_state RCDATA_LESS_THAN_SIGN
-      of null: parse_error unexpected_null_character
-      of eof: emit_eof
-      else: emit_current
-
-    of RAWTEXT:
-      case c
-      of '<': switch_state RAWTEXT_LESS_THAN_SIGN
-      of null:
-        parse_error unexpected_null_character
-        emit_replacement
-      of eof: emit_eof
-      else: emit_current
-
-    of SCRIPT_DATA:
-      case c
-      of '<': switch_state SCRIPT_DATA_LESS_THAN_SIGN
-      of null:
-        parse_error
unexpected_null_character - emit_replacement - of eof: emit_eof - else: emit_current - - of PLAINTEXT: - case c - of null: - parse_error unexpected_null_character - emit_replacement - of eof: emit_eof - else: emit_current - - of TAG_OPEN: - case c - of '!': switch_state MARKUP_DECLARATION_OPEN - of '/': switch_state END_TAG_OPEN - of AsciiAlpha: - new_token Token(t: START_TAG) - reconsume_in TAG_NAME - of '?': - parse_error unexpected_question_mark_instead_of_tag_name - new_token Token(t: COMMENT) - reconsume_in BOGUS_COMMENT - of eof: - parse_error eof_before_tag_name - emit '<' - emit_eof - else: - parse_error invalid_first_character_of_tag_name - emit '<' - reconsume_in DATA - - of END_TAG_OPEN: - case c - of AsciiAlpha: new_token Token(t: END_TAG) - of '>': - parse_error missing_end_tag_name - switch_state DATA - of eof: - parse_error eof_before_tag_name - emit '<' - emit '/' - emit_eof - else: - parse_error invalid_first_character_of_tag_name - new_token Token(t: COMMENT) - reconsume_in BOGUS_COMMENT - - of TAG_NAME: - case c - of whitespace: switch_state BEFORE_ATTRIBUTE_NAME - of '/': switch_state SELF_CLOSING_START_TAG - of '>': - switch_state DATA - emit_tok - of AsciiUpperAlpha: tokenizer.tok.tagname &= char(tokenizer.curr).tolower() - of null: - parse_error unexpected_null_character - tokenizer.tok.tagname &= Rune(0xFFFD) - of eof: - parse_error eof_in_tag - emit_eof - else: tokenizer.tok.tagname &= tokenizer.curr - - of RCDATA_LESS_THAN_SIGN: - case c - of '/': - tokenizer.tmp = "" - switch_state RCDATA_END_TAG_OPEN + case state + of REWIND: + if i == 0: + state = CREATE + continue + dec i + if entry != nil and not parser.openElements.hasElement(entry.tagType): + continue + state = ADVANCE + of ADVANCE: + inc i + state = CREATE + of CREATE: + parser.activeFormatting[i] = (parser.insertHTMLElement(parser.activeFormatting[i][1]), parser.activeFormatting[i][1]) + if i != parser.activeFormatting.high: + state = ADVANCE + continue + break + +proc 
clearActiveFormattingTillMarker(parser: var HTML5Parser) = + while parser.activeFormatting.len > 0 and parser.activeFormatting.pop()[0] != nil: discard + +template pop_current_node = discard parser.openElements.pop() + +func isHTMLIntegrationPoint(node: Element): bool = + return false #TODO SVG (NOTE MathML not implemented) + +# Following is an implementation of the state (?) machine defined in +# https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inhtml +# It uses the ad-hoc pattern matching macro `match' to apply the following +# transformations: +# * First, pairs of patterns and actions are stored in tuples (and `discard' +# statements...) +# * These pairs are then assigned to token types, later mapped to legs of the +# first case statement. +# * Another case statement is constructed where needed, e.g. for switching on +# characters/tags/etc. +# * Finally, the whole thing is wrapped in a named block, to implement a +# pseudo-goto by breaking out only when the else statement needn't be +# executed. 
+# So for example the following code: +# +# match token: +# TokenType.COMMENT => (block: echo "comment") +# ("<p>", "<a>", "</div>") => (block: echo "p, a or closing div") +# ("<div>", "</p>") => (block: anything_else) +# (TokenType.START_TAG, TokenType.END_TAG) => (block: assert false, "invalid") +# _ => (block: echo "anything else") +# +# (effectively) generates this: +# +# block inside_not_else: +# case token.t +# of TokenType.COMMENT: +# echo "comment" +# break inside_not_else +# of TokenType.START_TAG: +# case token.tagtype +# of {TAG_P, TAG_A}: +# echo "p, a or closing div" +# break inside_not_else +# of TAG_DIV: discard +# else: +# assert false +# break inside_not_else +# of TokenType.END_TAG: +# case token.tagtype +# of TAG_DIV: +# echo "p, a or closing div" +# break inside_not_else +# of TAG_P: discard +# else: +# assert false +# break inside_not_else +# else: discard +# echo "anything else" +# +# This duplicates any code that applies for several token types, except for the +# else branch. 
macro match(token: Token, body: typed): untyped =
  ## Compiles a list of ``pattern => (block: action)`` arms into a
  ## ``block inside_not_else`` holding a ``case`` over ``token.t`` with nested
  ## ``case`` statements on the tag type / character of the token.
  ##
  ## Patterns handled below:
  ## * a symbol naming a ``TokenType`` member: the action becomes the
  ##   fallback (``else``) for that token type;
  ## * a char literal, or a curly set whose first element is a char literal:
  ##   an ``of`` arm for ``CHARACTER_ASCII`` tokens;
  ## * a string literal such as ``"<p>"`` or ``"</div>"``: tokenized with a
  ##   temporary tokenizer; its first token must be a start or end tag and
  ##   becomes an ``of`` arm on the tag type;
  ## * ``_``: its statements are appended after the outer ``case``;
  ## * a tuple of any of the above (flattened).
  ##
  ## Every action except ``_`` and a bare ``anything_else`` gets a trailing
  ## ``break inside_not_else``, so only arms that fall through reach the
  ## statements of the ``_`` arm.
  type OfBranchStore = object
    ofBranches: seq[(seq[NimNode], NimNode)] # (labels, action) per 'of' arm
    defaultBranch: NimNode                   # per-token-type fallback, or nil

  # Stores 'of' branches, indexed by token type
  var ofBranches: array[TokenType, OfBranchStore]
  # Stores the statements of the `_` arm (nil when there is none)
  var defaultBranch: NimNode

  # Name -> enum value lookup, built at compile time
  const tokenTypes = (func(): Table[string, TokenType] =
    for tt in TokenType:
      result[$tt] = tt)()

  for disc in body:
    let tup = disc[0] # access actual tuple
    let pattern = tup[0]
    let lambda = tup[1]
    var action = lambda.findChild(it.kind notin {nnkSym, nnkEmpty, nnkFormalParams})
    # `_` and a bare `anything_else` must fall through to the code after the
    # case statement; everything else leaves the block once it has run.
    if pattern.kind != nnkDiscardStmt and
        not (action.len == 2 and action[1].kind == nnkDiscardStmt and
             action[1][0] == newStrLitNode("anything_else")):
      action = quote do:
        `action`
        break inside_not_else

    # Work list so that (possibly nested) tuples are flattened
    var patterns = @[pattern]
    while patterns.len > 0:
      let pattern = patterns.pop()
      case pattern.kind
      of nnkSym: # simple symbols; we assume these are the enums
        let name = pattern.strVal
        if name notin tokenTypes:
          # Report a proper macro error instead of a KeyError from the table
          error(fmt"{name}: symbol is not a TokenType member", pattern)
        ofBranches[tokenTypes[name]].defaultBranch = action
      of nnkCharLit:
        ofBranches[CHARACTER_ASCII].ofBranches.add((@[pattern], action))
      of nnkCurly:
        case pattern[0].kind
        of nnkCharLit:
          ofBranches[CHARACTER_ASCII].ofBranches.add((@[pattern], action))
        else: error fmt"Unsupported curly of kind {pattern[0].kind}"
      of nnkStrLit:
        var tempTokenizer = newTokenizer(newStringStream(pattern.strVal))
        for parsed in tempTokenizer.tokenize:
          case parsed.t
          of START_TAG, END_TAG:
            # Read the tag type only once we know this is a tag token
            let tt = int(parsed.tagtype)
            var found = false
            # Patterns sharing one action are merged into a single 'of' arm
            for i in 0..ofBranches[parsed.t].ofBranches.high:
              if ofBranches[parsed.t].ofBranches[i][1] == action:
                found = true
                ofBranches[parsed.t].ofBranches[i][0].add((quote do: TagType(`tt`)))
                break
            if not found:
              ofBranches[parsed.t].ofBranches.add((@[(quote do: TagType(`tt`))], action))
          else: error fmt"{pattern.strVal}: Unsupported token {parsed} of kind {parsed.t}"
          break # only the first token of the literal is considered
      of nnkDiscardStmt:
        defaultBranch = action
      of nnkTupleConstr:
        for child in pattern:
          patterns.add(child)
      else:
        # `$` is only defined for a few NimNode kinds, so use repr here
        error(fmt"{pattern.repr}: Unsupported pattern of kind {pattern.kind}", pattern)

  # Expression the nested case statement of a token type switches on
  proc tokenBranchOn(subject: NimNode, tok: TokenType): NimNode =
    case tok
    of START_TAG, END_TAG:
      return newDotExpr(copyNimTree(subject), ident("tagtype"))
    of CHARACTER:
      return newDotExpr(copyNimTree(subject), ident("r"))
    of CHARACTER_ASCII:
      return newDotExpr(copyNimTree(subject), ident("c"))
    else: error fmt"Unsupported branching of token {tok}"

  # Appends `of <labels>: <action>` to `target`; several labels become a set
  proc addOfBranch(target: NimNode, labels: seq[NimNode], action: NimNode) =
    let arm = newNimNode(nnkOfBranch)
    if labels.len == 1:
      arm.add(labels[0])
    else:
      let curly = newNimNode(nnkCurly)
      for label in labels:
        curly.add(label)
      arm.add(curly)
    arm.add(action)
    target.add(arm)

  # Build case statements
  var mainCase = newNimNode(nnkCaseStmt).add(quote do: `token`.t)
  for tt in TokenType:
    let arms = ofBranches[tt].ofBranches
    let fallback = ofBranches[tt].defaultBranch
    if arms.len == 0 and fallback == nil:
      continue # nothing registered; handled by the trailing `else: discard`
    let ofBranch = newNimNode(nnkOfBranch).add(quote do: TokenType(`tt`))
    if arms.len == 0:
      # Only a fallback for this token type: no nested case needed
      ofBranch.add(fallback)
    else:
      let tokenCase = newNimNode(nnkCaseStmt).add(tokenBranchOn(token, tt))
      for arm in arms:
        tokenCase.addOfBranch(arm[0], arm[1])
      if fallback != nil:
        tokenCase.add(newNimNode(nnkElse).add(fallback))
      else:
        tokenCase.add(newNimNode(nnkElse).add(quote do: discard))
      ofBranch.add(tokenCase)
    mainCase.add(ofBranch)

  mainCase.add(newNimNode(nnkElse).add(quote do: discard))

  var stmts = newStmtList().add(mainCase)
  # The `_` arm is optional; without it there is nothing to append
  if defaultBranch != nil:
    for stmt in defaultBranch:
      stmts.add(stmt)
  result = newBlockStmt(ident("inside_not_else"), stmts)

+proc processInHTMLContent(parser: var HTML5Parser, token: Token, insertionMode = parser.insertionMode) = + template pop_all_nodes = + while parser.openElements.len > 1: pop_current_node + template anything_else = discard "anything_else" + macro `=>`(v: typed, body: untyped): untyped = + quote do: + discard (`v`, proc() = `body`) + template _ = discard + template reprocess(tok: Token) = + parser.processInHTMLContent(tok) + + case insertionMode + of INITIAL: + match token: + AsciiWhitespace => (block: discard) + TokenType.COMMENT => (block: parser.insertComment(token, last_child_of(parser.document))) + TokenType.DOCTYPE => (block: + if token.name.isnone or token.name.get != "html" or token.pubid.issome or (token.sysid.issome and token.sysid.get != "about:legacy-compat"): + parse_error + let doctype = parser.document.newDocumentType(token.name.get(""), token.pubid.get(""), token.sysid.get("")) + parser.document.append(doctype) + if not parser.document.is_iframe_srcdoc and not parser.document.parser_cannot_change_the_mode_flag: + if quirksConditions(token): + parser.document.mode = QUIRKS + elif limitedQuirksConditions(token): + parser.document.mode = LIMITED_QUIRKS + parser.insertionMode = BEFORE_HTML + ) + _ => (block: + if not parser.document.is_iframe_srcdoc: + parse_error + if not parser.document.parser_cannot_change_the_mode_flag: + parser.document.mode = QUIRKS + parser.insertionMode = BEFORE_HTML + reprocess token + ) + + of BEFORE_HTML: + match token: + TokenType.DOCTYPE => (block: parse_error) + TokenType.COMMENT => (block: parser.insertComment(token, last_child_of(parser.document))) + AsciiWhitespace => (block: discard) + "<html>" => (block: + let element = parser.createElement(token, $Namespace.HTML, parser.document) + parser.document.append(element) + parser.openElements.add(element) + parser.insertionMode = BEFORE_HEAD + ) + ("</head>", "</body>", 
"</html>", "</br>") => (block: anything_else) + TokenType.END_TAG => (block: parse_error) + _ => (block: + let element = parser.document.newHTMLElement(TAG_HTML) + parser.document.append(element) + parser.openElements.add(element) + parser.insertionMode = BEFORE_HEAD + reprocess token + ) + + of BEFORE_HEAD: + match token: + AsciiWhitespace => (block: discard) + TokenType.COMMENT => (block: parser.insertComment(token)) + TokenType.DOCTYPE => (block: parse_error) + "<html>" => (block: parser.processInHTMLContent(token, IN_BODY)) + "<head>" => (block: + parser.head = parser.insertHTMLElement(token) + parser.insertionMode = IN_HEAD + ) + ("</head>", "</body>", "</html>", "</br>") => (block: anything_else) + TokenType.END_TAG => (block: parse_error) + _ => (block: + parser.head = parser.insertHTMLElement(Token(t: START_TAG, tagtype: TAG_HEAD)) + parser.insertionMode = IN_HEAD + reprocess token + ) + + of IN_HEAD: + match token: + AsciiWhitespace => (block: discard) + TokenType.COMMENT => (block: parser.insertComment(token)) + TokenType.DOCTYPE => (block: parse_error) + "<html>" => (block: parser.processInHTMLContent(token, IN_BODY)) + ("<base>", "<basefont>", "<bgsound>", "<link>") => (block: + discard parser.insertHTMLElement(token) + pop_current_node + ) + "<meta>" => (block: + discard parser.insertHTMLElement(token) + pop_current_node + #TODO encodings + ) + "<title>" => (block: parser.genericRCDATAElementParsingAlgorithm(token)) + "<noscript>" => (block: + if not parser.scripting: + discard parser.insertHTMLElement(token) + parser.insertionMode = IN_HEAD_NOSCRIPT else: - anything_else - of '/': - if is_appropriate_end_tag_token: - switch_state SELF_CLOSING_START_TAG + parser.genericRawtextElementParsingAlgorithm(token) + ) + ("<noframes>", "<style>") => (block: parser.genericRawtextElementParsingAlgorithm(token)) + "<script>" => (block: + let location = parser.appropriatePlaceForInsert() + let element = HTMLScriptElement(parser.createElement(token, $Namespace.HTML, 
location.inside)) + element.parserDocument = parser.document + element.forceAsync = false + if parser.fragment: + element.alreadyStarted = true + #TODO document.write (?) + location.insert(element) + parser.openElements.add(element) + parser.tokenizer.state = SCRIPT_DATA + parser.insertionMode = TEXT + ) + "</head>" => (block: + pop_current_node + parser.insertionMode = AFTER_HEAD + ) + ("</body>", "</html>", "</br>") => (block: anything_else) + "<template>" => (block: + discard parser.insertHTMLElement(token) + parser.activeFormatting.add((nil, nil)) + parser.framesetok = false + parser.insertionMode = IN_TEMPLATE + parser.templateModes.add(IN_TEMPLATE) + ) + "</template>" => (block: + if not parser.openElements.hasElement(TAG_TEMPLATE): + parse_error else: - anything_else - of '>': - if is_appropriate_end_tag_token: - switch_state DATA - of AsciiAlpha: # note: merged upper & lower - tokenizer.tok.tagname &= char(tokenizer.curr).tolower() - tokenizer.tmp &= tokenizer.curr - else: - new_token nil #TODO - emit '<' - emit '/' - for r in tokenizer.tmp.runes: - emit r - reconsume_in RCDATA - - of RAWTEXT_LESS_THAN_SIGN: - case c - of '/': - tokenizer.tmp = "" - switch_state RAWTEXT_END_TAG_OPEN - else: - emit '<' - reconsume_in RAWTEXT - - of RAWTEXT_END_TAG_OPEN: - case c - of AsciiAlpha: - new_token Token(t: END_TAG) - reconsume_in RAWTEXT_END_TAG_NAME - else: - emit '<' - emit '/' - reconsume_in RAWTEXT - - of RAWTEXT_END_TAG_NAME: - has_anything_else - case c - of whitespace: - if is_appropriate_end_tag_token: - switch_state BEFORE_ATTRIBUTE_NAME + parser.generateImpliedEndTagsThoroughly() + if parser.currentNode.tagType != TAG_TEMPLATE: + parse_error + while parser.openElements.pop().tagType != TAG_TEMPLATE: discard + parser.clearActiveFormattingTillMarker() + discard parser.templateModes.pop() + parser.resetInsertionMode() + ) + ("<head>", TokenType.END_TAG) => (block: parse_error) + _ => (block: + pop_current_node + parser.insertionMode = AFTER_HEAD + reprocess 
token + ) + + of IN_HEAD_NOSCRIPT: + match token: + TokenType.DOCTYPE => (block: parse_error) + "<html>" => (block: parser.processInHTMLContent(token, IN_BODY)) + "</noscript>" => (block: + pop_current_node + parser.insertionMode = IN_HEAD + ) + (AsciiWhitespace, + TokenType.COMMENT, + "<basefont>", "<bgsound>", "<link>", "<meta>", "<noframes>", "<style>") => (block: + parser.processInHTMLContent(token, IN_HEAD)) + "</br>" => (block: anything_else) + ("<head>", "<noscript>") => (block: parse_error) + TokenType.END_TAG => (block: parse_error) + _ => (block: + pop_current_node + parser.insertionMode = IN_HEAD + reprocess token + ) + + of AFTER_HEAD: + match token: + AsciiWhitespace => (block: parser.insertCharacter(token.c)) + TokenType.COMMENT => (block: parser.insertComment(token)) + TokenType.DOCTYPE => (block: parse_error) + "<html>" => (block: parser.processInHTMLContent(token, IN_BODY)) + "<body>" => (block: + discard parser.insertHTMLElement(token) + parser.framesetok = false + parser.insertionMode = IN_BODY + ) + "<frameset>" => (block: + discard parser.insertHTMLElement(token) + parser.insertionMode = IN_FRAMESET + ) + ("<base>", "<basefont>", "<bgsound>", "<link>", "<meta>", "<noframes>", "<script>", "<style>", "<template>", "<title>") => (block: + parse_error + parser.openElements.add(parser.head) + parser.processInHTMLContent(token, IN_HEAD) + for i in countdown(parser.openElements.high, 0): + if parser.openElements[i] == parser.head: + parser.openElements.del(i) + ) + "</template>" => (block: parser.processInHTMLContent(token, IN_HEAD)) + ("</body>", "</html>", "</br>") => (block: anything_else) + ("<head>", TokenType.END_TAG) => (block: parse_error) + _ => (block: + discard parser.insertHTMLElement(Token(t: START_TAG, tagtype: TAG_BODY)) + parser.insertionMode = IN_BODY + reprocess token + ) + + of IN_BODY: + proc closeP(parser: var HTML5Parser) = + parser.generateImpliedEndTags(TAG_P) + if parser.currentNode.tagType != TAG_P: parse_error + while 
parser.openElements.pop().tagType != TAG_P: discard + + proc adoptionAgencyAlgorithm(parser: var HTML5Parser, token: Token): bool = + if parser.currentNode.tagType != TAG_UNKNOWN and parser.currentNode.tagtype == token.tagtype or parser.currentNode.localName == token.tagname: #TODO local or qualified name? + var fail = true + for it in parser.activeFormatting: + if it[0] == parser.currentNode: + fail = false + if fail: + pop_current_node + return false + var i = 0 + while true: + if i >= 8: return false + inc i + if parser.activeFormatting.len == 0: return true + var formatting: Element + var formattingIndex: int + for j in countdown(parser.activeFormatting.high, 0): + let element = parser.activeFormatting[j][0] + if element == nil: + return true + if element.tagType != TAG_UNKNOWN and element.tagtype == token.tagtype or element.qualifiedName == token.tagname: + formatting = element + formattingIndex = j + break + if j == 0: + return true + let stackIndex = parser.openElements.find(formatting) + if stackIndex < 0: + parse_error + parser.activeFormatting.del(formattingIndex) + return false + if not parser.openElements.hasElementInScope(formatting): + parse_error + return false + if formatting != parser.currentNode: parse_error + var furthestBlock: Element = nil + var furthestBlockIndex: int + for j in countdown(parser.openElements.high, 0): + if parser.openElements[j] == formatting: + break + if parser.openElements[j].tagType in SpecialElements: + furthestBlock = parser.openElements[j] + furthestBlockIndex = j + break + if furthestBlock == nil: + while parser.openElements.pop() != formatting: discard + parser.activeFormatting.del(formattingIndex) + return false + let commonAncestor = parser.openElements[stackIndex - 1] + var bookmark = formattingIndex + var node = furthestBlock + var aboveNode = parser.openElements[furthestBlockIndex - 1] + var lastNode = furthestBlock + var j = 0 + while true: + inc j + node = aboveNode + if node == formatting: break + var 
nodeFormattingIndex = -1 + for i in countdown(parser.activeFormatting.high, 0): + if parser.activeFormatting[i][0] == node: + nodeFormattingIndex = i + break + if j > 3 and nodeFormattingIndex >= 0: + parser.activeFormatting.del(nodeFormattingIndex) + if nodeFormattingIndex < bookmark: + dec bookmark # a previous node got deleted, so decrease bookmark by one + let nodeStackIndex = parser.openElements.find(node) + if nodeFormattingIndex < 0: + parser.openElements.del(nodeStackIndex) + if nodeStackIndex < furthestBlockIndex: + dec furthestBlockIndex + continue + let element = parser.createElement(parser.activeFormatting[nodeFormattingIndex][1], $Namespace.HTML, commonAncestor) + parser.activeFormatting[nodeFormattingIndex] = (element, parser.activeFormatting[nodeFormattingIndex][1]) + parser.openElements[nodeFormattingIndex] = element + aboveNode = parser.openElements[nodeFormattingIndex - 1] + node = element + if lastNode == furthestBlock: + bookmark = nodeFormattingIndex + node.append(lastNode) + lastNode = node + let location = parser.appropriatePlaceForInsert(commonAncestor) + location.inside.insert(lastNode, location.before) + let token = parser.activeFormatting[formattingIndex][1] + let element = parser.createElement(token, $Namespace.HTML, furthestBlock) + for child in furthestBlock.childNodes: + child.remove() + element.append(child) + furthestBlock.append(element) + parser.activeFormatting.insert((element, token), bookmark) + parser.activeFormatting.del(formattingIndex) + parser.openElements.insert(element, furthestBlockIndex) + parser.openElements.del(stackIndex) + + template any_other_start_tag() = + parser.reconstructActiveFormatting() + discard parser.insertHTMLElement(token) + + template any_other_end_tag() = + for i in countdown(parser.openElements.high, 0): + let node = parser.openElements[i] + if node.tagType != TAG_UNKNOWN and node.tagType == token.tagtype or node.localName == token.tagname: #TODO local or qualified name? 
+ parser.generateImpliedEndTags(token.tagtype) + if node != parser.currentNode: parse_error + while parser.openElements.pop() != node: discard + break + elif node.tagType in SpecialElements: + parse_error + return + + match token: + '\0' => (block: parse_error) + AsciiWhitespace => (block: + parser.reconstructActiveFormatting() + parser.insertCharacter(token.c) + ) + TokenType.CHARACTER_ASCII => (block: + parser.reconstructActiveFormatting() + parser.insertCharacter(token.c) + parser.framesetOk = false + ) + TokenType.CHARACTER => (block: + parser.reconstructActiveFormatting() + parser.insertCharacter(token.r) + parser.framesetOk = false + ) + TokenType.COMMENT => (block: parser.insertComment(token)) + TokenType.DOCTYPE => (block: parse_error) + "<html>" => (block: + parse_error + if parser.openElements.hasElement(TAG_TEMPLATE): + discard else: - anything_else - of '/': - if is_appropriate_end_tag_token: - switch_state SELF_CLOSING_START_TAG + for k, v in token.attrs: + if k notin parser.openElements[0].attributes: + parser.openElements[0].attributes[k] = v + ) + ("<base>", "<basefont>", "<bgsound>", "<link>", "<meta>", "<noframes>", "<script>", "<style>", "<template>", "<title>", + "</template>") => (block: parser.processInHTMLContent(token, IN_HEAD)) + "<body>" => (block: + parse_error + if parser.openElements.len == 1 or parser.openElements[1].tagType != TAG_BODY or parser.openElements.hasElement(TAG_TEMPLATE): + discard else: - anything_else - of '>': - if is_appropriate_end_tag_token: - switch_state DATA - emit_tok + parser.framesetOk = false + for k, v in token.attrs: + if k notin parser.openElements[1].attributes: + parser.openElements[1].attributes[k] = v + ) + "<frameset>" => (block: + parse_error + if parser.openElements.len == 1 or parser.openElements[1].tagType != TAG_BODY or not parser.framesetOk: + discard else: - anything_else - of AsciiAlpha: # note: merged upper & lower - tokenizer.tok.tagname &= char(tokenizer.curr).tolower() - tokenizer.tmp &= 
tokenizer.curr - else: - new_token nil #TODO - emit '<' - emit '/' - for r in tokenizer.tmp.runes: - emit r - reconsume_in RAWTEXT - - of SCRIPT_DATA_LESS_THAN_SIGN: - case c - of '/': - tokenizer.tmp = "" - switch_state SCRIPT_DATA_END_TAG_OPEN - of '!': - switch_state SCRIPT_DATA_ESCAPE_START - emit '<' - emit '!' - else: - emit '<' - reconsume_in SCRIPT_DATA - - of SCRIPT_DATA_END_TAG_OPEN: - case c - of AsciiAlpha: - new_token Token(t: END_TAG) - reconsume_in SCRIPT_DATA_END_TAG_NAME - else: - emit '<' - emit '/' - reconsume_in SCRIPT_DATA - - of SCRIPT_DATA_END_TAG_NAME: - has_anything_else - case c - of whitespace: - if is_appropriate_end_tag_token: - switch_state BEFORE_ATTRIBUTE_NAME + if parser.openElements[1].parentNode != nil: + parser.openElements[1].remove() + pop_all_nodes + ) + TokenType.EOF => (block: + if parser.templateModes.len > 0: + parser.processInHTMLContent(token, IN_TEMPLATE) else: - anything_else - of '/': - if is_appropriate_end_tag_token: - switch_state SELF_CLOSING_START_TAG + #NOTE parse error omitted + discard # stop + ) + "</body>" => (block: + if not parser.openElements.hasElementInScope(TAG_BODY): + parse_error else: - anything_else - of '>': - if is_appropriate_end_tag_token: - switch_state DATA - emit_tok + #NOTE parse error omitted + parser.insertionMode = AFTER_BODY + ) + "</html>" => (block: + if not parser.openElements.hasElementInScope(TAG_BODY): + parse_error else: - anything_else - of AsciiAlpha: # note: merged upper & lower - tokenizer.tok.tagname &= char(tokenizer.curr).tolower() - tokenizer.tmp &= tokenizer.curr - else: - emit '<' - emit '/' - for r in tokenizer.tmp.runes: - emit r - reconsume_in SCRIPT_DATA - - of SCRIPT_DATA_ESCAPE_START: - case c - of '-': - switch_state SCRIPT_DATA_ESCAPE_START_DASH - emit '-' - else: - reconsume_in SCRIPT_DATA - - of SCRIPT_DATA_ESCAPE_START_DASH: - case c - of '-': - switch_state SCRIPT_DATA_ESCAPED_DASH_DASH - emit '-' - else: - reconsume_in SCRIPT_DATA - - of 
SCRIPT_DATA_ESCAPED: - case c - of '-': - switch_state SCRIPT_DATA_ESCAPED_DASH - emit '-' - of '<': - switch_state SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN - of null: - parse_error unexpected_null_character - emit_replacement - of eof: - parse_error eof_in_script_html_comment_like_text - emit_eof - else: - emit_current - - of SCRIPT_DATA_ESCAPED_DASH: - case c - of '-': - switch_state SCRIPT_DATA_ESCAPED_DASH_DASH - emit '-' - of '<': - switch_state SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN - of null: - parse_error unexpected_null_character - switch_state SCRIPT_DATA_ESCAPED - of eof: - parse_error eof_in_script_html_comment_like_text - emit_eof - else: - switch_state SCRIPT_DATA_ESCAPED - emit_current - - of SCRIPT_DATA_ESCAPED_DASH_DASH: - case c - of '-': - emit '-' - of '<': - switch_state SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN - of '>': - switch_state SCRIPT_DATA - emit '>' - of null: - parse_error unexpected_null_character - switch_state SCRIPT_DATA_ESCAPED - of eof: - parse_error eof_in_script_html_comment_like_text - emit_eof - else: - switch_state SCRIPT_DATA_ESCAPED - emit_current - - of SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN: - case c - of '/': - tokenizer.tmp = "" - switch_state SCRIPT_DATA_ESCAPED_END_TAG_OPEN - of AsciiAlpha: - tokenizer.tmp = "" - emit '<' - reconsume_in SCRIPT_DATA_DOUBLE_ESCAPE_START - else: - emit '<' - reconsume_in SCRIPT_DATA_ESCAPED - - of SCRIPT_DATA_ESCAPED_END_TAG_OPEN: - case c - of AsciiAlpha: - new_token Token(t: START_TAG) - reconsume_in SCRIPT_DATA_ESCAPED_END_TAG_NAME - else: - emit '<' - emit '/' - reconsume_in SCRIPT_DATA_ESCAPED - - of SCRIPT_DATA_ESCAPED_END_TAG_NAME: - has_anything_else - case c - of whitespace: - if is_appropriate_end_tag_token: - switch_state BEFORE_ATTRIBUTE_NAME + #NOTE parse error omitted + parser.insertionMode = AFTER_BODY + reprocess token + ) + ("<address>", "<article>", "<aside>", "<blockquote>", "<center>", + "<details>", "<dialog>", "<dir>", "<div>", "<dl>", "<fieldset>", + "<figcaption>", "<figure>", 
"<footer>", "<header>", "<hgroup>", "<main>", + "<menu>", "<nav>", "<ol>", "<p>", "<section>", "<summary>", "<ul>") => (block: + if parser.openElements.hasElementInButtonScope(TAG_P): + parser.closeP() + discard parser.insertHTMLElement(token) + ) + ("<h1>", "<h2>", "<h3>", "<h4>", "<h5>", "<h6>") => (block: + if parser.openElements.hasElementInButtonScope(TAG_P): + parser.closeP() + if parser.currentNode.tagType in HTagTypes: + parse_error + pop_current_node + discard parser.insertHTMLElement(token) + ) + ("<pre>", "<listing>") => (block: + if parser.openElements.hasElementInButtonScope(TAG_P): + parser.closeP() + discard parser.insertHTMLElement(token) + parser.ignoreLF = true + parser.framesetOk = false + ) + "<form>" => (block: + let hasTemplate = parser.openElements.hasElement(TAG_TEMPLATE) + if parser.form != nil and not hasTemplate: + parse_error else: - anything_else - of '/': - if is_appropriate_end_tag_token: - switch_state SELF_CLOSING_START_TAG + if parser.openElements.hasElementInButtonScope(TAG_P): + parser.closeP() + let element = parser.insertHTMLElement(token) + if not hasTemplate: + parser.form = HTMLFormElement(element) + ) + "<li>" => (block: + parser.framesetOk = false + for i in countdown(parser.openElements.high, 0): + let node = parser.openElements[i] + case node.tagType + of TAG_LI: + parser.generateImpliedEndTags(TAG_LI) + if parser.currentNode.tagType != TAG_LI: parse_error + while parser.openElements.pop().tagType != TAG_LI: discard + break + of SpecialElements - {TAG_ADDRESS, TAG_DIV, TAG_P, TAG_LI}: + break + else: discard + if parser.openElements.hasElementInButtonScope(TAG_P): + parser.closeP() + discard parser.insertHTMLElement(token) + ) + ("<dd>", "<dt>") => (block: + parser.framesetOk = false + for i in countdown(parser.openElements.high, 0): + let node = parser.openElements[i] + case node.tagType + of TAG_DD: + parser.generateImpliedEndTags(TAG_DD) + if parser.currentNode.tagType != TAG_DD: parse_error + while 
parser.openElements.pop().tagType != TAG_DD: discard + break + of TAG_DT: + parser.generateImpliedEndTags(TAG_DT) + if parser.currentNode.tagType != TAG_DT: parse_error + while parser.openElements.pop().tagType != TAG_DT: discard + break + of SpecialElements - {TAG_ADDRESS, TAG_DIV, TAG_P, TAG_DD, TAG_DT}: + break + else: discard + if parser.openElements.hasElementInButtonScope(TAG_P): + parser.closeP() + discard parser.insertHTMLElement(token) + ) + "<plaintext>" => (block: + if parser.openElements.hasElementInButtonScope(TAG_P): + parser.closeP() + discard parser.insertHTMLElement(token) + parser.tokenizer.state = PLAINTEXT + ) + "<button>" => (block: + if parser.openElements.hasElementInScope(TAG_BUTTON): + parse_error + parser.generateImpliedEndTags() + while parser.openElements.pop().tagType != TAG_BUTTON: discard + parser.reconstructActiveFormatting() + discard parser.insertHTMLElement(token) + parser.framesetOk = false + ) + ("</address>", "</article>", "</aside>", "</blockquote>", "</button>", + "</center>", "</details>", "</dialog>", "</dir>", "</div>", "</dl>", + "</fieldset>", "</figcaption>", "</figure>", "</footer>", "</header>", + "</hgroup>", "</listing>", "</main>", "</menu>", "</nav>", "</ol>", + "</pre>", "</section>", "</summary>", "</ul>") => (block: + if not parser.openElements.hasElementInScope(token.tagtype): + parse_error else: - anything_else - of '>': - if is_appropriate_end_tag_token: - switch_state DATA + parser.generateImpliedEndTags() + if parser.currentNode.tagType != token.tagtype: parse_error + while parser.openElements.pop().tagType != token.tagtype: discard + ) + "</form>" => (block: + if not parser.openElements.hasElement(TAG_TEMPLATE): + let node = parser.form + parser.form = nil + if node == nil or not parser.openElements.hasElementInScope(node.tagType): + parse_error + return + parser.generateImpliedEndTags() + if parser.currentNode != node: parse_error + parser.openElements.del(parser.openElements.find(node)) else: - 
anything_else - of AsciiAlpha: - tokenizer.tok.tagname &= char(tokenizer.curr).tolower() - tokenizer.tmp &= tokenizer.curr - else: - emit '<' - emit '/' - for r in tokenizer.tmp.runes: - emit r - reconsume_in SCRIPT_DATA_ESCAPED - - of SCRIPT_DATA_DOUBLE_ESCAPE_START: - case c - of whitespace, '/', '>': - if tokenizer.tmp == "script": - switch_state SCRIPT_DATA_DOUBLE_ESCAPED + if not parser.openElements.hasElementInScope(TAG_FORM): + parse_error + return + parser.generateImpliedEndTags() + if parser.currentNode.tagType != TAG_FORM: parse_error + while parser.openElements.pop().tagType != TAG_FORM: discard + ) + "</p>" => (block: + if not parser.openElements.hasElementInButtonScope(TAG_P): + parse_error + discard parser.insertHTMLElement(Token(t: START_TAG, tagtype: TAG_P)) + parser.closeP() + ) + "</li>" => (block: + if not parser.openElements.hasElementInListItemScope(TAG_LI): + parse_error else: - switch_state SCRIPT_DATA_ESCAPED - emit_current - of AsciiAlpha: # note: merged upper & lower - tokenizer.tmp &= c.tolower() - emit_current - else: reconsume_in SCRIPT_DATA_ESCAPED - - of SCRIPT_DATA_DOUBLE_ESCAPED: - case c - of '-': - switch_state SCRIPT_DATA_DOUBLE_ESCAPED_DASH - emit '-' - of '<': - switch_state SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN - emit '<' - of null: - parse_error unexpected_null_character - emit_replacement - of eof: - parse_error eof_in_script_html_comment_like_text - emit_eof - else: emit_current - - of SCRIPT_DATA_DOUBLE_ESCAPED_DASH: - case c - of '-': - switch_state SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH - emit '-' - of '<': - switch_state SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN - emit '<' - of null: - parse_error unexpected_null_character - switch_state SCRIPT_DATA_DOUBLE_ESCAPED - emit_replacement - of eof: - parse_error eof_in_script_html_comment_like_text - emit_eof - else: - switch_state SCRIPT_DATA_DOUBLE_ESCAPED - emit_current - - of SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH: - case c - of '-': emit '-' - of '<': - switch_state 
SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN - emit '<' - of '>': - switch_state SCRIPT_DATA - emit '>' - of null: - parse_error unexpected_null_character - switch_state SCRIPT_DATA_DOUBLE_ESCAPED - emit_replacement - of eof: - parse_error eof_in_script_html_comment_like_text - emit_eof - else: switch_state SCRIPT_DATA_DOUBLE_ESCAPED - - of SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN: - case c - of '/': - tokenizer.tmp = "" - switch_state SCRIPT_DATA_DOUBLE_ESCAPE_END - emit '/' - else: reconsume_in SCRIPT_DATA_DOUBLE_ESCAPED - - of SCRIPT_DATA_DOUBLE_ESCAPE_END: - case c - of whitespace, '/', '>': - if tokenizer.tmp == "script": - switch_state SCRIPT_DATA_ESCAPED + parser.generateImpliedEndTags(TAG_LI) + if parser.currentNode.tagType != TAG_LI: parse_error + while parser.openElements.pop().tagType != TAG_LI: discard + ) + ("</dd>", "</dt>") => (block: + if not parser.openElements.hasElementInScope(token.tagtype): + parse_error else: - switch_state SCRIPT_DATA_DOUBLE_ESCAPED - emit_current - of AsciiAlpha: # note: merged upper & lower - tokenizer.tmp &= c.tolower() - emit_current - else: - reconsume_in SCRIPT_DATA_DOUBLE_ESCAPED - - of BEFORE_ATTRIBUTE_NAME: - case c - of whitespace: discard - of '/', '>', eof: reconsume_in AFTER_ATTRIBUTE_NAME - of '=': - parse_error unexpected_equals_sign_before_attribute_name - start_new_attribute - switch_state ATTRIBUTE_NAME - else: - start_new_attribute - reconsume_in ATTRIBUTE_NAME - - of ATTRIBUTE_NAME: - has_anything_else - case c - of whitespace, '/', '>', eof: - leave_attribute_name_state - reconsume_in AFTER_ATTRIBUTE_NAME - of '=': - leave_attribute_name_state - switch_state BEFORE_ATTRIBUTE_VALUE - of AsciiUpperAlpha: - tokenizer.attrn &= c.tolower() - of null: - parse_error unexpected_null_character - tokenizer.attrn &= Rune(0xFFFD) - of '"', '\'', '<': - parse_error unexpected_character_in_attribute_name + parser.generateImpliedEndTags(token.tagtype) + if parser.currentNode.tagType != token.tagtype: parse_error + while 
parser.openElements.pop().tagType != token.tagtype: discard + ) + ("</h1>", "</h2>", "</h3>", "</h4>", "</h5>", "</h6>") => (block: + if not parser.openElements.hasElementInScope(HTagTypes): + parse_error + else: + parser.generateImpliedEndTags() + if parser.currentNode.tagType != token.tagtype: parse_error + while parser.openElements.pop().tagType notin HTagTypes: discard + ) + "</sarcasm>" => (block: + #*deep breath* anything_else - else: - tokenizer.attrn &= tokenizer.curr - - of AFTER_ATTRIBUTE_NAME: - case c - of whitespace: discard - of '/': switch_state SELF_CLOSING_START_TAG - of '=': switch_state BEFORE_ATTRIBUTE_VALUE - of '>': - switch_state DATA - emit '>' - of eof: - parse_error eof_in_tag - emit_eof - else: - start_new_attribute - reconsume_in ATTRIBUTE_NAME - - of BEFORE_ATTRIBUTE_VALUE: - case c - of whitespace: discard - of '"': switch_state ATTRIBUTE_VALUE_DOUBLE_QUOTED - of '\'': switch_state ATTRIBUTE_VALUE_SINGLE_QUOTED - of '>': - parse_error missing_attribute_value - switch_state DATA - emit '>' - else: reconsume_in ATTRIBUTE_VALUE_UNQUOTED - - of ATTRIBUTE_VALUE_DOUBLE_QUOTED: - case c - of '"': switch_state AFTER_ATTRIBUTE_VALUE_QUOTED - of '&': switch_state_return CHARACTER_REFERENCE - of null: - parse_error unexpected_null_character - append_to_current_attr_value Rune(0xFFFD) - of eof: - parse_error eof_in_tag - emit_eof - else: append_to_current_attr_value tokenizer.curr - - of ATTRIBUTE_VALUE_SINGLE_QUOTED: - case c - of '\'': switch_state AFTER_ATTRIBUTE_VALUE_QUOTED - of '&': switch_state_return CHARACTER_REFERENCE - of null: - parse_error unexpected_null_character - append_to_current_attr_value Rune(0xFFFD) - of eof: - parse_error eof_in_tag - emit_eof - else: append_to_current_attr_value tokenizer.curr - - of ATTRIBUTE_VALUE_UNQUOTED: - case c - of whitespace: switch_state BEFORE_ATTRIBUTE_NAME - of '&': switch_state_return CHARACTER_REFERENCE - of '>': switch_state DATA - of null: - parse_error unexpected_null_character - 
append_to_current_attr_value Rune(0xFFFD) - of '"', '\'', '<', '=', '`': - parse_error unexpected_character_in_unquoted_attribute_value - append_to_current_attr_value c - of eof: - parse_error eof_in_tag - emit_eof - else: append_to_current_attr_value tokenizer.curr - - of AFTER_ATTRIBUTE_VALUE_QUOTED: - case c - of whitespace: - switch_state BEFORE_ATTRIBUTE_NAME - of '/': - switch_state SELF_CLOSING_START_TAG - of '>': - switch_state DATA - emit_tok - of eof: - parse_error eof_in_tag - emit_eof - else: append_to_current_attr_value tokenizer.curr - - of SELF_CLOSING_START_TAG: - case c - of '>': - tokenizer.tok.selfclosing = true - switch_state DATA - emit '>' - of eof: - parse_error eof_in_tag - emit_eof - else: - parse_error unexpected_solidus_in_tag - reconsume_in BEFORE_ATTRIBUTE_NAME - - of BOGUS_COMMENT: - assert tokenizer.tok.t == COMMENT - case c - of '>': - switch_state DATA - emit_tok - of eof: - emit_tok - emit_eof - of null: parse_error unexpected_null_character - else: tokenizer.tok.data &= tokenizer.curr - - of MARKUP_DECLARATION_OPEN: # note: rewritten to fit case model as we consume a char anyway - has_anything_else - case c - of '-': - if peek_char == '-': - new_token Token(t: COMMENT) - tokenizer.state = COMMENT_START - consume_and_discard 1 - else: anything_else - of 'D', 'd': - if peek_str_nocase("OCTYPE"): - consume_and_discard "OCTYPE".len - switch_state DOCTYPE - else: anything_else - of '[': - if peek_str("CDATA["): - consume_and_discard "CDATA[".len - if has_adjusted_current_node: #TODO and it is not an element in the HTML namespace - switch_state CDATA_SECTION - else: - parse_error cdata_in_html_content - new_token Token(t: COMMENT, data: "[CDATA[") - switch_state BOGUS_COMMENT - else: anything_else - else: - parse_error incorrectly_opened_comment - new_token Token(t: COMMENT) - reconsume_in BOGUS_COMMENT - - of COMMENT_START: - case c - of '-': switch_state COMMENT_START_DASH - of '>': - parse_error abrupt_closing_of_empty_comment - 
switch_state DATA - emit_tok - else: reconsume_in COMMENT - - of COMMENT_START_DASH: - case c - of '-': switch_state COMMENT_END - of '>': - parse_error abrupt_closing_of_empty_comment - switch_state DATA - emit_tok - of eof: - parse_error eof_in_comment - emit_tok - emit_eof - else: - tokenizer.tok.data &= '-' - reconsume_in COMMENT - - of COMMENT: - case c - of '<': - tokenizer.tok.data &= c - switch_state COMMENT_LESS_THAN_SIGN - of '-': switch_state COMMENT_END_DASH - of null: - parse_error unexpected_null_character - tokenizer.tok.data &= Rune(0xFFFD) - of eof: - parse_error eof_in_comment - emit_tok - emit_eof - else: tokenizer.tok.data &= tokenizer.curr - - of COMMENT_LESS_THAN_SIGN: - case c - of '!': - tokenizer.tok.data &= c - switch_state COMMENT_LESS_THAN_SIGN_BANG - of '<': tokenizer.tok.data &= c - else: reconsume_in COMMENT - - of COMMENT_LESS_THAN_SIGN_BANG: - case c - of '-': switch_state COMMENT_LESS_THAN_SIGN_BANG_DASH - else: reconsume_in COMMENT - - of COMMENT_LESS_THAN_SIGN_BANG_DASH: - case c - of '-': switch_state COMMENT_LESS_THAN_SIGN_BANG_DASH_DASH - else: reconsume_in COMMENT_END_DASH - - of COMMENT_LESS_THAN_SIGN_BANG_DASH_DASH: - case c - of '>', eof: reconsume_in COMMENT_END - else: - parse_error nested_comment - reconsume_in COMMENT_END - - of COMMENT_END_DASH: - case c - of '-': switch_state COMMENT_END - of eof: - parse_error eof_in_comment - emit_tok - emit_eof - else: - tokenizer.tok.data &= '-' - reconsume_in COMMENT - - of COMMENT_END: - case c - of '>': switch_state DATA - of '!': switch_state COMMENT_END_BANG - of '-': tokenizer.tok.data &= '-' - of eof: - parse_error eof_in_comment - emit_tok - emit_eof - else: - tokenizer.tok.data &= "--" - reconsume_in COMMENT - - of COMMENT_END_BANG: - case c - of '-': - tokenizer.tok.data &= "--!" 
- switch_state COMMENT_END_DASH - of '>': - parse_error incorrectly_closed_comment - switch_state DATA - emit_tok - of eof: - parse_error eof_in_comment - emit_tok - emit_eof - else: - tokenizer.tok.data &= "--!" - reconsume_in COMMENT - - of DOCTYPE: - case c - of whitespace: switch_state BEFORE_DOCTYPE_NAME - of '>': reconsume_in BEFORE_DOCTYPE_NAME - of eof: - parse_error eof_in_doctype - new_token Token(t: DOCTYPE, quirks: true) - emit_tok - emit_eof - else: - parse_error missing_whitespace_before_doctype_name - reconsume_in BEFORE_DOCTYPE_NAME - - of BEFORE_DOCTYPE_NAME: - case c - of whitespace: discard - of AsciiUpperAlpha: - new_token Token(t: DOCTYPE, name: some($c.tolower())) - switch_state DOCTYPE_NAME - of null: - parse_error unexpected_null_character - new_token Token(t: DOCTYPE, name: some($Rune(0xFFFD))) - of '>': - parse_error missing_doctype_name - new_token Token(t: DOCTYPE, quirks: true) - switch_state DATA - emit_tok - of eof: - parse_error eof_in_doctype - new_token Token(t: DOCTYPE, quirks: true) - emit_tok - emit_eof - else: - new_token Token(t: DOCTYPE, name: some($tokenizer.curr)) - switch_state DOCTYPE_NAME - - of DOCTYPE_NAME: - case c - of whitespace: switch_state AFTER_DOCTYPE_NAME - of '>': - switch_state DATA - emit_tok - of AsciiUpperAlpha: - tokenizer.tok.name.get &= c.tolower() - of null: - parse_error unexpected_null_character - tokenizer.tok.name.get &= Rune(0xFFFD) - of eof: - parse_error eof_in_doctype - tokenizer.tok.quirks = true - emit_tok - emit_eof - else: - tokenizer.tok.name.get &= tokenizer.curr - - of AFTER_DOCTYPE_NAME: # note: rewritten to fit case model as we consume a char anyway - has_anything_else - case c - of whitespace: discard - of '>': - switch_state DATA - emit_tok - of eof: - parse_error eof_in_doctype - tokenizer.tok.quirks = true - emit_tok - emit_eof - of 'p', 'P': - if peek_str("UBLIC"): - consume_and_discard "UBLIC".len - switch_state AFTER_DOCTYPE_PUBLIC_KEYWORD + ) + "<a>" => (block: + var element: 
Element = nil + for i in countdown(parser.activeFormatting.high, 0): + let format = parser.activeFormatting[i] + if format[0] == nil: + break + if format[0].tagType == TAG_A: + element = format[0] + break + if element != nil: + parse_error + if parser.adoptionAgencyAlgorithm(token): + any_other_end_tag + return + for i in 0..parser.activeFormatting.high: + if parser.activeFormatting[i][0] == element: + parser.activeFormatting.del(i) + break + for i in 0..parser.openElements.high: + if parser.openElements[i] == element: + parser.openElements.del(i) + break + parser.reconstructActiveFormatting() + let element = parser.insertHTMLElement(token) + parser.pushOntoActiveFormatting(element, token) + ) + ("<b>", "<big>", "<code>", "<em>", "<font>", "<i>", "<s>", "<small>", + "<strike>", "<strong>", "<tt>", "<u>") => (block: + parser.reconstructActiveFormatting() + let element = parser.insertHTMLElement(token) + parser.pushOntoActiveFormatting(element, token) + ) + "<nobr>" => (block: + parser.reconstructActiveFormatting() + if parser.openElements.hasElementInScope(TAG_NOBR): + parse_error + if parser.adoptionAgencyAlgorithm(token): + any_other_end_tag + return + parser.reconstructActiveFormatting() + let element = parser.insertHTMLElement(token) + parser.pushOntoActiveFormatting(element, token) + ) + ("</a>", "</b>", "</big>", "</code>", "</em>", "</font>", "</i>", + "</nobr>", "</s>", "</small>", "</strike>", "</strong>", "</tt>", + "</u>") => (block: + if parser.adoptionAgencyAlgorithm(token): + any_other_end_tag + return + ) + ("<applet>", "<marquee>", "<object>") => (block: + parser.reconstructActiveFormatting() + discard parser.insertHTMLElement(token) + parser.activeFormatting.add((nil, nil)) + parser.framesetOk = false + ) + ("</applet>", "</marquee>", "</object>") => (block: + if not parser.openElements.hasElementInScope(token.tagtype): + parse_error else: - anything_else - of 's', 'S': - if peek_str("YSTEM"): - consume_and_discard "YSTEM".len - switch_state 
AFTER_DOCTYPE_SYSTEM_KEYWORD + parser.generateImpliedEndTags() + if parser.currentNode.tagType != token.tagtype: parse_error + while parser.openElements.pop().tagType != token.tagtype: discard + parser.clearActiveFormattingTillMarker() + ) + "<table>" => (block: + if parser.document.mode != QUIRKS: + if parser.openElements.hasElementInButtonScope(TAG_P): + parser.closeP() + discard parser.insertHTMLElement(token) + parser.framesetOk = false + parser.insertionMode = IN_TABLE + ) + "</br>" => (block: + parse_error + parser.processInHTMLContent(Token(t: START_TAG, tagtype: TAG_BR)) + ) + ("<area>", "<br>", "<embed>", "<img>", "<keygen>", "<wbr>") => (block: + parser.reconstructActiveFormatting() + discard parser.insertHTMLElement(token) + pop_current_node + parser.framesetOk = false + ) + "<input>" => (block: + parser.reconstructActiveFormatting() + discard parser.insertHTMLElement(token) + pop_current_node + if not token.attrs.getOrDefault("type").equalsIgnoreCase("hidden"): + parser.framesetOk = false + ) + ("<param>", "<source>", "<track>") => (block: + discard parser.insertHTMLElement(token) + pop_current_node + ) + "<hr>" => (block: + if parser.openElements.hasElementInButtonScope(TAG_P): + parser.closeP() + discard parser.insertHTMLElement(token) + pop_current_node + parser.framesetOk = false + ) + "<image>" => (block: + #TODO ew + let token = Token(t: START_TAG, tagtype: TAG_IMG, tagname: "img", selfclosing: token.selfclosing, attrs: token.attrs) + reprocess token + ) + "<textarea>" => (block: + discard parser.insertHTMLElement(token) + parser.ignoreLF = true + parser.tokenizer.state = RCDATA + parser.oldInsertionMode = parser.insertionMode + parser.framesetOk = false + parser.insertionMode = TEXT + ) + "<xmp>" => (block: + if parser.openElements.hasElementInButtonScope(TAG_P): + parser.closeP() + parser.reconstructActiveFormatting() + parser.framesetOk = false + parser.genericRawtextElementParsingAlgorithm(token) + ) + "<iframe>" => (block: + parser.framesetOk 
= false + parser.genericRawtextElementParsingAlgorithm(token) + ) + "<noembed>" => (block: + parser.genericRawtextElementParsingAlgorithm(token) + ) + "<noscript>" => (block: + if parser.scripting: + parser.genericRawtextElementParsingAlgorithm(token) else: - anything_else - else: - parse_error invalid_character_sequence_after_doctype_name - tokenizer.tok.quirks = true - reconsume_in BOGUS_DOCTYPE - - of AFTER_DOCTYPE_PUBLIC_KEYWORD: - case c - of whitespace: switch_state BEFORE_DOCTYPE_PUBLIC_IDENTIFIER - of '"': - parse_error missing_whitespace_after_doctype_public_keyword - tokenizer.tok.pubid = some("") - switch_state DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED - of '>': - parse_error missing_doctype_public_identifier - tokenizer.tok.quirks = true - switch_state DATA - emit_tok - of eof: - parse_error eof_in_doctype - tokenizer.tok.quirks = true - emit_tok - emit_eof - else: - parse_error missing_quote_before_doctype_public_identifier - tokenizer.tok.quirks = true - reconsume_in BOGUS_DOCTYPE - - of BEFORE_DOCTYPE_PUBLIC_IDENTIFIER: - case c - of whitespace: discard - of '"': - tokenizer.tok.pubid = some("") - switch_state DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED - of '\'': - tokenizer.tok.pubid = some("") - switch_state DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED - of '>': - parse_error missing_doctype_public_identifier - tokenizer.tok.quirks = true - switch_state DATA - emit_tok - of eof: - parse_error eof_in_doctype - tokenizer.tok.quirks = true - emit_tok - emit_eof - else: - parse_error missing_quote_before_doctype_public_identifier - tokenizer.tok.quirks = true - reconsume_in BOGUS_DOCTYPE - - of DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED: - case c - of '"': switch_state AFTER_DOCTYPE_PUBLIC_IDENTIFIER - of null: - parse_error unexpected_null_character - tokenizer.tok.pubid.get &= Rune(0xFFFD) - of '>': - parse_error abrupt_doctype_public_identifier - tokenizer.tok.quirks = true - switch_state DATA - emit_tok - of eof: - parse_error eof_in_doctype - 
tokenizer.tok.quirks = true - emit_tok - emit_eof - else: - tokenizer.tok.pubid.get &= tokenizer.curr - - of DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED: - case c - of '\'': switch_state AFTER_DOCTYPE_PUBLIC_IDENTIFIER - of null: - parse_error unexpected_null_character - tokenizer.tok.pubid.get &= Rune(0xFFFD) - of '>': - parse_error abrupt_doctype_public_identifier - tokenizer.tok.quirks = true - switch_state DATA - emit_tok - of eof: - parse_error eof_in_doctype - tokenizer.tok.quirks = true - emit_tok - emit_eof - else: - tokenizer.tok.pubid.get &= tokenizer.curr - - of AFTER_DOCTYPE_PUBLIC_IDENTIFIER: - case c - of whitespace: switch_state BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS - of '>': - switch_state DATA - emit_tok - of '"': - parse_error missing_whitespace_between_doctype_public_and_system_identifiers - tokenizer.tok.sysid = some("") - switch_state DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED - of '\'': - parse_error missing_whitespace_between_doctype_public_and_system_identifiers - tokenizer.tok.sysid = some("") - switch_state DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED - of eof: - parse_error eof_in_doctype - tokenizer.tok.quirks = true - emit_tok - emit_eof - else: - parse_error missing_quote_before_doctype_system_identifier - tokenizer.tok.quirks = true - reconsume_in BOGUS_DOCTYPE - - of BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS: - case c - of whitespace: discard - of '>': - switch_state DATA - emit_tok - of '"': - tokenizer.tok.sysid = some("") - switch_state DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED - of '\'': - tokenizer.tok.sysid = some("") - switch_state DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED - of eof: - parse_error eof_in_doctype - tokenizer.tok.quirks = true - emit_tok - emit_eof - else: - parse_error missing_quote_before_doctype_system_identifier - tokenizer.tok.quirks = true - reconsume_in BOGUS_DOCTYPE - - of AFTER_DOCTYPE_SYSTEM_KEYWORD: - case c - of whitespace: switch_state BEFORE_DOCTYPE_SYSTEM_IDENTIFIER - of '"': - parse_error 
missing_whitespace_after_doctype_system_keyword - tokenizer.tok.sysid = some("") - switch_state DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED - of '\'': - parse_error missing_whitespace_after_doctype_system_keyword - tokenizer.tok.sysid = some("") - switch_state DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED - of '>': - parse_error missing_doctype_system_identifier - tokenizer.tok.quirks = true - switch_state DATA - emit_tok - of eof: - parse_error eof_in_doctype - tokenizer.tok.quirks = true - emit_tok - emit_eof - else: - parse_error missing_quote_before_doctype_system_identifier - tokenizer.tok.quirks = true - reconsume_in BOGUS_DOCTYPE - - of BEFORE_DOCTYPE_SYSTEM_IDENTIFIER: - case c - of whitespace: discard - of '"': - tokenizer.tok.pubid = some("") - switch_state DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED - of '\'': - tokenizer.tok.pubid = some("") - switch_state DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED - of '>': - parse_error missing_doctype_system_identifier - tokenizer.tok.quirks = true - switch_state DATA - emit_tok - of eof: - parse_error eof_in_doctype - tokenizer.tok.quirks = true - emit_tok - emit_eof - else: - parse_error missing_quote_before_doctype_system_identifier - tokenizer.tok.quirks = true - reconsume_in BOGUS_DOCTYPE - - of DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED: - case c - of '"': switch_state AFTER_DOCTYPE_SYSTEM_IDENTIFIER - of null: - parse_error unexpected_null_character - tokenizer.tok.sysid.get &= Rune(0xFFFD) - of '>': - parse_error abrupt_doctype_system_identifier - tokenizer.tok.quirks = true - switch_state DATA - emit_tok - of eof: - parse_error eof_in_doctype - tokenizer.tok.quirks = true - emit_tok - emit_eof - else: - tokenizer.tok.sysid.get &= tokenizer.curr - - of DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED: - case c - of '\'': switch_state AFTER_DOCTYPE_SYSTEM_IDENTIFIER - of null: - parse_error unexpected_null_character - tokenizer.tok.sysid.get &= Rune(0xFFFD) - of '>': - parse_error abrupt_doctype_system_identifier - tokenizer.tok.quirks = 
true - switch_state DATA - emit_tok - of eof: - parse_error eof_in_doctype - tokenizer.tok.quirks = true - emit_tok - emit_eof - else: - tokenizer.tok.sysid.get &= tokenizer.curr - - of AFTER_DOCTYPE_SYSTEM_IDENTIFIER: - case c - of whitespace: discard - of '>': - switch_state DATA - emit_tok - of eof: - parse_error eof_in_doctype - tokenizer.tok.quirks = true - emit_tok - emit_eof - else: - parse_error unexpected_character_after_doctype_system_identifier - reconsume_in BOGUS_DOCTYPE - - of BOGUS_DOCTYPE: - case c - of '>': - switch_state DATA - emit_tok - of null: parse_error unexpected_null_character - of eof: - emit_tok - emit_eof - else: discard - - of CDATA_SECTION: - case c - of ']': switch_state CDATA_SECTION_BRACKET - of eof: - parse_error eof_in_cdata - emit_eof - else: - emit_current - - of CDATA_SECTION_BRACKET: - case c - of ']': switch_state CDATA_SECTION_END - of '>': switch_state DATA - else: - emit ']' - reconsume_in CDATA_SECTION - - of CDATA_SECTION_END: - case c - of ']': emit ']' - of '>': switch_state DATA - else: - emit ']' - emit ']' - reconsume_in CDATA_SECTION - - of CHARACTER_REFERENCE: - tokenizer.tmp = "&" - case c - of AsciiAlpha: reconsume_in NAMED_CHARACTER_REFERENCE - of '#': - tokenizer.tmp &= '#' - switch_state NUMERIC_CHARACTER_REFERENCE - else: - flush_code_points_consumed_as_a_character_reference - reconsume_in tokenizer.rstate - - of NAMED_CHARACTER_REFERENCE: - ignore_eof # we check for eof ourselves - tokenizer.reconsume() #TODO optimize this away - var buf = "" - var node = entityMap - var value = none(string) # last value - var match = true - #TODO interfacing with RadixNode is suffering - # plus this doesn't look very efficient either - while not tokenizer.atEof: - let c = tokenizer.consume() - buf &= c - if not node.hasPrefix(buf): - break - let prevnode = node - node = node{buf} - if node != prevnode: - buf = "" - if node.value.issome: - value = node.value - tokenizer.tmp &= tokenizer.curr - if value.issome: - if 
consumed_as_an_attribute and tokenizer.tmp[^1] != ';' and peek_char in {';'} + AsciiAlpha: - flush_code_points_consumed_as_a_character_reference - switch_state tokenizer.rstate + any_other_start_tag + ) + "<select>" => (block: + parser.reconstructActiveFormatting() + discard parser.insertHTMLElement(token) + parser.framesetOk = false + if parser.insertionMode in {IN_TABLE, IN_CAPTION, IN_TABLE_BODY, IN_CELL}: + parser.insertionMode = IN_SELECT_IN_TABLE else: - if tokenizer.tmp[^1] != ';': - parse_error missing_semicolon_after_character_reference_parse_error - tokenizer.tmp = node.value.get - flush_code_points_consumed_as_a_character_reference - switch_state tokenizer.rstate - else: - flush_code_points_consumed_as_a_character_reference - switch_state AMBIGUOUS_AMPERSAND_STATE - - of AMBIGUOUS_AMPERSAND_STATE: - case c - of AsciiAlpha: - if consumed_as_an_attribute: - append_to_current_attr_value c + parser.insertionMode = IN_SELECT + ) + ("<optgroup>", "<option>") => (block: + if parser.currentNode.tagType == TAG_OPTION: + pop_current_node + parser.reconstructActiveFormatting() + discard parser.insertHTMLElement(token) + ) + ("<rb>", "<rtc>") => (block: + if parser.openElements.hasElementInScope(TAG_RUBY): + parser.generateImpliedEndTags() + if parser.currentNode.tagType != TAG_RUBY: parse_error + discard parser.insertHTMLElement(token) + ) + ("<rp>", "<rt>") => (block: + if parser.openElements.hasElementInScope(TAG_RUBY): + parser.generateImpliedEndTags(TAG_RTC) + if parser.currentNode.tagType notin {TAG_RUBY, TAG_RTC}: parse_error + discard parser.insertHTMLElement(token) + ) + #NOTE <math> (not implemented) + #TODO <svg> (SVG) + ("<caption>", "<col>", "<colgroup>", "<frame>", "<head>", "<tbody>", + "<td>", "<tfoot>", "<th>", "<thead>", "<tr>") => (block: parse_error) + TokenType.START_TAG => (block: any_other_start_tag) + TokenType.END_TAG => (block: any_other_end_tag) + + of TEXT: + match token: + TokenType.CHARACTER_ASCII => (block: + assert token.c != '\0' + 
parser.insertCharacter(token.c) + ) + TokenType.CHARACTER => (block: + parser.insertCharacter(token.r) + ) + TokenType.EOF => (block: + parse_error + if parser.currentNode.tagType == TAG_SCRIPT: + HTMLScriptElement(parser.currentNode).alreadyStarted = true + pop_current_node + parser.insertionMode = parser.oldInsertionMode + reprocess token + ) + "</script>" => (block: + #TODO microtask + let script = parser.currentNode + pop_current_node + parser.insertionMode = parser.oldInsertionMode + #TODO document.write() ? + #TODO prepare script element + #TODO uh implement scripting or something + ) + TokenType.END_TAG => (block: + pop_current_node + parser.insertionMode = parser.oldInsertionMode + ) + + of IN_TABLE: + template clear_the_stack_back_to_a_table_context() = + while parser.currentNode.tagType notin {TAG_TABLE, TAG_TEMPLATE, TAG_HTML}: + pop_current_node + + match token: + (TokenType.CHARACTER_ASCII, TokenType.CHARACTER) => (block: + if parser.currentNode.tagType in {TAG_TABLE, TAG_TBODY, TAG_TFOOT, TAG_THEAD, TAG_TR}: + parser.pendingTableChars = "" + parser.pendingTableCharsWhitespace = true + parser.oldInsertionMode = parser.insertionMode + parser.insertionMode = IN_TABLE_TEXT + reprocess token + else: # anything else + parse_error + parser.fosterParenting = true + parser.processInHTMLContent(token, IN_BODY) + parser.fosterParenting = false + ) + TokenType.COMMENT => (block: parser.insertComment(token)) + TokenType.DOCTYPE => (block: parse_error) + "<caption>" => (block: + clear_the_stack_back_to_a_table_context + parser.activeFormatting.add((nil, nil)) + discard parser.insertHTMLElement(token) + parser.insertionMode = IN_CAPTION + ) + "<colgroup>" => (block: + clear_the_stack_back_to_a_table_context + discard parser.insertHTMLElement(Token(t: START_TAG, tagtype: TAG_COLGROUP)) + parser.insertionMode = IN_COLUMN_GROUP + ) + ("<tbody>", "<tfoot>", "<thead>") => (block: + clear_the_stack_back_to_a_table_context + discard parser.insertHTMLElement(token) + 
parser.insertionMode = IN_TABLE_BODY + ) + ("<td>", "<th>", "<tr>") => (block: + clear_the_stack_back_to_a_table_context + discard parser.insertHTMLElement(Token(t: START_TAG, tagtype: TAG_TBODY)) + parser.insertionMode = IN_TABLE_BODY + ) + "<table>" => (block: + parse_error + if not parser.openElements.hasElementInScope(TAG_TABLE): + discard else: - emit_current - of ';': - parse_error unknown_named_character_reference - reconsume_in tokenizer.rstate - else: reconsume_in tokenizer.rstate - - of NUMERIC_CHARACTER_REFERENCE: - tokenizer.code = 0 - case c - of 'x', 'X': - tokenizer.tmp &= c - switch_state HEXADECIMAL_CHARACTER_REFERENCE_START - else: reconsume_in DECIMAL_CHARACTER_REFERENCE_START - - of HEXADECIMAL_CHARACTER_REFERENCE_START: - case c - of AsciiHexDigit: reconsume_in HEXADECIMAL_CHARACTER_REFERENCE - else: - parse_error absence_of_digits_in_numeric_character_reference - flush_code_points_consumed_as_a_character_reference - reconsume_in tokenizer.rstate - - of DECIMAL_CHARACTER_REFERENCE_START: - case c - of AsciiDigit: reconsume_in DECIMAL_CHARACTER_REFERENCE - else: - parse_error absence_of_digits_in_numeric_character_reference - flush_code_points_consumed_as_a_character_reference - reconsume_in tokenizer.rstate - - of HEXADECIMAL_CHARACTER_REFERENCE: - case c - of AsciiHexDigit: # note: merged digit, upper hex, lower hex - tokenizer.code *= 0x10 - tokenizer.code += hexValue(c) - of ';': switch_state NUMERIC_CHARACTER_REFERENCE_END - else: - parse_error missing_semicolon_after_character_reference - reconsume_in NUMERIC_CHARACTER_REFERENCE_END - - of DECIMAL_CHARACTER_REFERENCE: - case c - of AsciiDigit: - tokenizer.code *= 10 - tokenizer.code += decValue(c) - of ';': switch_state NUMERIC_CHARACTER_REFERENCE_END + while parser.openElements.pop().tagType != TAG_TABLE: discard + parser.resetInsertionMode() + reprocess token + ) + "</table>" => (block: + if not parser.openElements.hasElementInScope(TAG_TABLE): + parse_error + else: + while 
parser.openElements.pop().tagType != TAG_TABLE: discard + parser.resetInsertionMode() + ) + ("</body>", "</caption>", "</col>", "</colgroup>", "</html>", "</tbody>", + "</td>", "</tfoot>", "</th>", "</thead>", "</tr>") => (block: + parse_error + ) + ("<style>", "<script>", "<template>", "</template>") => (block: + parser.processInHTMLContent(token, IN_HEAD) + ) + "<input>" => (block: + if not token.attrs.getOrDefault("type").equalsIgnoreCase("hidden"): + # anything else + parse_error + parser.fosterParenting = true + parser.processInHTMLContent(token, IN_BODY) + parser.fosterParenting = false + else: + parse_error + discard parser.insertHTMLElement(token) + pop_current_node + ) + "<form>" => (block: + parse_error + if parser.form != nil or parser.openElements.hasElement(TAG_TEMPLATE): + discard + else: + parser.form = HTMLFormElement(parser.insertHTMLElement(token)) + pop_current_node + ) + TokenType.EOF => (block: + parser.processInHTMLContent(token, IN_BODY) + ) + _ => (block: + parse_error + parser.fosterParenting = true + parser.processInHTMLContent(token, IN_BODY) + parser.fosterParenting = false + ) + + of IN_TABLE_TEXT: + match token: + '\0' => (block: parse_error) + TokenType.CHARACTER_ASCII => (block: + if token.c notin AsciiWhitespace: + parser.pendingTableCharsWhitespace = false + parser.pendingTableChars &= token.c + ) + TokenType.CHARACTER => (block: + parser.pendingTableChars &= token.r + parser.pendingTableCharsWhitespace = false + ) + _ => (block: + if not parser.pendingTableCharsWhitespace: + # I *think* this is effectively the same thing the specification wants... 
+ parse_error + parser.fosterParenting = true + parser.reconstructActiveFormatting() + parser.insertCharacter(token.c) + parser.framesetOk = false + parser.fosterParenting = false + else: + parser.insertCharacter(parser.pendingTableChars) + parser.insertionMode = parser.oldInsertionMode + reprocess token + ) + + of IN_CAPTION: + match token: + "</caption>" => (block: + if parser.openElements.hasElementInTableScope(TAG_CAPTION): + parse_error + else: + parser.generateImpliedEndTags() + if parser.currentNode.tagType != TAG_CAPTION: parse_error + while parser.openElements.pop().tagType != TAG_CAPTION: discard + parser.clearActiveFormattingTillMarker() + parser.insertionMode = IN_TABLE + ) + ("<caption>", "<col>", "<colgroup>", "<tbody>", "<td>", "<tfoot>", + "<th>", "<thead>", "<tr>", "</table>") => (block: + if not parser.openElements.hasElementInTableScope(TAG_CAPTION): + parse_error + else: + parser.generateImpliedEndTags() + if parser.currentNode.tagType != TAG_CAPTION: parse_error + parser.clearActiveFormattingTillMarker() + parser.insertionMode = IN_TABLE + reprocess token + ) + ("</body>", "</col>", "</colgroup>", "</html>", "</tbody>", "</td>", + "</tfoot>", "</th>", "</thead>", "</tr>") => (block: parse_error) + _ => (block: parser.processInHTMLContent(token, IN_BODY)) + + of IN_COLUMN_GROUP: + match token: + AsciiWhitespace => (block: parser.insertCharacter(token.c)) + TokenType.COMMENT => (block: parser.insertComment(token)) + TokenType.DOCTYPE => (block: parse_error) + "<html>" => (block: parser.processInHTMLContent(token, IN_BODY)) + "<col>" => (block: + discard parser.insertHTMLElement(token) + pop_current_node + ) + "</colgroup>" => (block: + if parser.currentNode.tagType != TAG_COLGROUP: + parse_error + else: + pop_current_node + parser.insertionMode = IN_TABLE + ) + "</col>" => (block: parse_error) + ("<template>", "</template>") => (block: + parser.processInHTMLContent(token, IN_HEAD) + ) + TokenType.EOF => (block: parser.processInHTMLContent(token, 
IN_BODY)) + _ => (block: + if parser.currentNode.tagType != TAG_COLGROUP: + parse_error + else: + pop_current_node + parser.insertionMode = IN_TABLE + reprocess token + ) + + of IN_TABLE_BODY: + template clear_the_stack_back_to_a_table_body_context() = + while parser.currentNode.tagType notin {TAG_TBODY, TAG_TFOOT, TAG_THEAD, TAG_TEMPLATE, TAG_HTML}: + pop_current_node + + match token: + "<tr>" => (block: + clear_the_stack_back_to_a_table_body_context + discard parser.insertHTMLElement(token) + parser.insertionMode = IN_ROW + ) + ("<th>", "<td>") => (block: + parse_error + clear_the_stack_back_to_a_table_body_context + discard parser.insertHTMLElement(Token(t: START_TAG, tagtype: TAG_TR)) + parser.insertionMode = IN_ROW + reprocess token + ) + ("</tbody>", "</tfoot>", "</thead>") => (block: + if not parser.openElements.hasElementInTableScope(token.tagtype): + parse_error + else: + clear_the_stack_back_to_a_table_body_context + pop_current_node + parser.insertionMode = IN_TABLE + ) + ("<caption>", "<col>", "<colgroup>", "<tbody>", "<tfoot>", "<thead>", + "</table>") => (block: + if not parser.openElements.hasElementInTableScope({TAG_TBODY, TAG_THEAD, TAG_TFOOT}): + parse_error + else: + clear_the_stack_back_to_a_table_body_context + pop_current_node + parser.insertionMode = IN_TABLE + reprocess token + ) + ("</body>", "</caption>", "</col>", "</colgroup>", "</html>", "</td>", + "</th>", "</tr>") => (block: + parse_error + ) + _ => (block: parser.processInHTMLContent(token, IN_TABLE)) + + of IN_ROW: + template clear_the_stack_back_to_a_table_row_context() = + while parser.currentNode.tagType notin {TAG_TR, TAG_TEMPLATE, TAG_HTML}: + pop_current_node + + match token: + ("<th>", "<td>") => (block: + clear_the_stack_back_to_a_table_row_context + discard parser.insertHTMLElement(token) + parser.insertionMode = IN_CELL + parser.activeFormatting.add((nil, nil)) + ) + "</tr>" => (block: + if not parser.openElements.hasElementInTableScope(TAG_TR): + parse_error + else: + 
clear_the_stack_back_to_a_table_row_context + pop_current_node + parser.insertionMode = IN_TABLE_BODY + ) + ("<caption>", "<col>", "<colgroup>", "<tbody>", "<tfoot>", "<thead>", + "<tr>", "</table>") => (block: + if not parser.openElements.hasElementInTableScope(TAG_TR): + parse_error + else: + clear_the_stack_back_to_a_table_row_context + pop_current_node + parser.insertionMode = IN_TABLE_BODY + reprocess token + ) + ("</tbody>", "</tfoot>", "</thead>") => (block: + if not parser.openElements.hasElementInTableScope(token.tagtype): + parse_error + elif not parser.openElements.hasElementInTableScope(TAG_TR): + discard + else: + clear_the_stack_back_to_a_table_row_context + pop_current_node + parser.insertionMode = IN_BODY + reprocess token + ) + ("</body>", "</caption>", "</col>", "</colgroup>", "</html>", "</td>", + "</th>") => (block: parse_error) + _ => (block: parser.processInHTMLContent(token, IN_TABLE)) + + of IN_CELL: + template close_cell() = + parser.generateImpliedEndTags() + if parser.currentNode.tagType notin {TAG_TD, TAG_TH}: parse_error + while parser.openElements.pop().tagType notin {TAG_TD, TAG_TH}: discard + parser.clearActiveFormattingTillMarker() + parser.insertionMode = IN_ROW + + match token: + ("</td>", "</th>") => (block: + if not parser.openElements.hasElementInTableScope(token.tagtype): + parse_error + else: + parser.generateImpliedEndTags() + if parser.currentNode.tagType != token.tagtype: parse_error + while parser.openElements.pop().tagType != token.tagtype: discard + parser.clearActiveFormattingTillMarker() + parser.insertionMode = IN_ROW + ) + ("<caption>", "<col>", "<colgroup>", "<tbody>", "<td>", "<tfoot>", + "<thead>", "<tr>") => (block: + if not parser.openElements.hasElementInTableScope({TAG_TD, TAG_TH}): + parse_error + else: + close_cell + ) + ("</body>", "</caption>", "</col>", "</colgroup>", + "</html>") => (block: parse_error) + ("</table>", "</tbody>", "</tfoot>", "</thead>", "</tr>") => (block: + if not 
parser.openElements.hasElementInTableScope(token.tagtype): + parse_error + else: + close_cell + reprocess token + ) + _ => (block: parser.processInHTMLContent(token, IN_BODY)) + + of IN_SELECT: + match token: + '\0' => (block: parse_error) + TokenType.CHARACTER_ASCII => (block: parser.insertCharacter(token.c)) + TokenType.CHARACTER => (block: parser.insertCharacter(token.r)) + TokenType.DOCTYPE => (block: parse_error) + "<html>" => (block: parser.processInHTMLContent(token, IN_BODY)) + "<option>" => (block: + if parser.currentNode.tagType == TAG_OPTION: + pop_current_node + discard parser.insertHTMLElement(token) + ) + "<optgroup>" => (block: + if parser.currentNode.tagType == TAG_OPTION: + pop_current_node + if parser.currentNode.tagType == TAG_OPTGROUP: + pop_current_node + discard parser.insertHTMLElement(token) + ) + "</optgroup>" => (block: + if parser.currentNode.tagType == TAG_OPTION: + if parser.openElements.len > 1 and parser.openElements[^2].tagType == TAG_OPTGROUP: + pop_current_node + if parser.currentNode.tagType == TAG_OPTGROUP: + pop_current_node + else: + parse_error + ) + "</option>" => (block: + if parser.currentNode.tagType == TAG_OPTION: + pop_current_node + else: + parse_error + ) + "</select>" => (block: + if not parser.openElements.hasElementInSelectScope(TAG_SELECT): + parse_error + else: + while parser.openElements.pop().tagType != TAG_SELECT: discard + parser.resetInsertionMode() + ) + ("<input>", "<keygen>", "<textarea>") => (block: + parse_error + if not parser.openElements.hasElementInSelectScope(TAG_SELECT): + discard + else: + while parser.openElements.pop().tagType != TAG_SELECT: discard + parser.resetInsertionMode() + reprocess token + ) + ("<script>", "<template>", "</template>") => (block: parser.processInHTMLContent(token, IN_HEAD)) + TokenType.EOF => (block: parser.processInHTMLContent(token, IN_BODY)) + _ => (block: parse_error) + + of IN_SELECT_IN_TABLE: + match token: + ("<caption>", "<table>", "<tbody>", "<tfoot>", 
"<thead>", "<tr>", "<td>", + "<th>") => (block: + parse_error + while parser.openElements.pop().tagType != TAG_SELECT: discard + parser.resetInsertionMode() + reprocess token + ) + ("</caption>", "</table>", "</tbody>", "</tfoot>", "</thead>", "</tr>", + "</td>", "</th>") => (block: + parse_error + if not parser.openElements.hasElementInTableScope(token.tagtype): + discard + else: + while parser.openElements.pop().tagType != TAG_SELECT: discard + parser.resetInsertionMode() + reprocess token + ) + _ => (block: parser.processInHTMLContent(token, IN_SELECT)) + + of IN_TEMPLATE: + match token: + (TokenType.CHARACTER_ASCII, TokenType.CHARACTER, TokenType.DOCTYPE) => (block: + parser.processInHTMLContent(token, IN_BODY) + ) + ("<base>", "<basefont>", "<bgsound>", "<link>", "<meta>", "<noframes>", + "<script>", "<style>", "<template>", "<title>", "</template>") => (block: + parser.processInHTMLContent(token, IN_HEAD) + ) + ("<caption>", "<colgroup>", "<tbody>", "<tfoot>", "<thead>") => (block: + discard parser.templateModes.pop() + parser.templateModes.add(IN_TABLE) + parser.insertionMode = IN_TABLE + reprocess token + ) + "<col>" => (block: + discard parser.templateModes.pop() + parser.templateModes.add(IN_COLUMN_GROUP) + parser.insertionMode = IN_COLUMN_GROUP + reprocess token + ) + "<tr>" => (block: + discard parser.templateModes.pop() + parser.templateModes.add(IN_TABLE_BODY) + parser.insertionMode = IN_TABLE_BODY + reprocess token + ) + ("<td>", "<th>") => (block: + discard parser.templateModes.pop() + parser.templateModes.add(IN_ROW) + parser.insertionMode = IN_ROW + reprocess token + ) + TokenType.START_TAG => (block: + discard parser.templateModes.pop() + parser.templateModes.add(IN_BODY) + parser.insertionMode = IN_BODY + reprocess token + ) + TokenType.END_TAG => (block: parse_error) + TokenType.EOF => (block: + if not parser.openElements.hasElement(TAG_TEMPLATE): + discard # stop + else: + parse_error + while parser.openElements.pop().tagType != TAG_TEMPLATE: 
discard + parser.clearActiveFormattingTillMarker() + discard parser.templateModes.pop() + parser.resetInsertionMode() + reprocess token + ) + + of AFTER_BODY: + match token: + AsciiWhitespace => (block: parser.processInHTMLContent(token, IN_BODY)) + TokenType.COMMENT => (block: parser.insertComment(token, last_child_of(parser.openElements[0]))) + TokenType.DOCTYPE => (block: parse_error) + "<html>" => (block: parser.processInHTMLContent(token, IN_BODY)) + "</html>" => (block: + if parser.fragment: + parse_error + else: + parser.insertionMode = AFTER_AFTER_BODY + ) + TokenType.EOF => (block: discard) # stop + _ => (block: + parse_error + parser.insertionMode = IN_BODY + reprocess token + ) + + of IN_FRAMESET: + match token: + AsciiWhitespace => (block: parser.insertCharacter(token.c)) + TokenType.COMMENT => (block: parser.insertComment(token)) + TokenType.DOCTYPE => (block: parse_error) + "<html>" => (block: parser.processInHTMLContent(token, IN_BODY)) + "<frameset>" => (block: + if parser.currentNode == parser.document.html: + parse_error + else: + pop_current_node + if not parser.fragment and parser.currentNode.tagType != TAG_FRAMESET: + parser.insertionMode = AFTER_FRAMESET + ) + "<frame>" => (block: + discard parser.insertHTMLElement(token) + pop_current_node + ) + "<noframes>" => (block: parser.processInHTMLContent(token, IN_HEAD)) + TokenType.EOF => (block: + if parser.currentNode != parser.document.html: parse_error + # stop + ) + _ => (block: parse_error) + + of AFTER_FRAMESET: + match token: + AsciiWhitespace => (block: parser.insertCharacter(token.c)) + TokenType.COMMENT => (block: parser.insertComment(token)) + TokenType.DOCTYPE => (block: parse_error) + "<html>" => (block: parser.processInHTMLContent(token, IN_BODY)) + "</html>" => (block: parser.insertionMode = AFTER_AFTER_FRAMESET) + "<noframes>" => (block: parser.processInHTMLContent(token, IN_HEAD)) + TokenType.EOF => (block: discard) # stop + _ => (block: parse_error) + + of AFTER_AFTER_BODY: + 
match token: + TokenType.COMMENT => (block: parser.insertComment(token, last_child_of(parser.document))) + (TokenType.DOCTYPE, AsciiWhitespace, "<html>") => (block: parser.processInHTMLContent(token, IN_BODY)) + TokenType.EOF => (block: discard) # stop + _ => (block: + parse_error + parser.insertionMode = IN_BODY + reprocess token + ) + + of AFTER_AFTER_FRAMESET: + match token: + TokenType.COMMENT => (block: parser.insertComment(token, last_child_of(parser.document))) + (TokenType.DOCTYPE, AsciiWhitespace, "<html>") => (block: parser.processInHTMLContent(token, IN_BODY)) + TokenType.EOF => (block: discard) # stop + "<noframes>" => (block: parser.processInHTMLContent(token, IN_HEAD)) + _ => (block: parse_error) + +proc processInForeignContent(parser: var HTML5Parser, token: Token) = + macro `=>`(v: typed, body: untyped): untyped = + quote do: + discard (`v`, proc() = `body`) + template script_end_tag() = + pop_current_node + #TODO document.write (?) + #TODO SVG + template any_other_end_tag() = + if parser.currentNode.localName != token.tagname: parse_error + for i in countdown(parser.openElements.high, 1): + let node = parser.openElements[i] + if node.localName == token.tagname: + while parser.openElements.pop() != node: discard + break + if node.namespace == Namespace.HTML: break + parser.processInHTMLContent(token) + + + match token: + '\0' => (block: + parse_error + parser.insertCharacter(Rune(0xFFFD)) + ) + AsciiWhitespace => (block: parser.insertCharacter(token.c)) + TokenType.CHARACTER_ASCII => (block: parser.insertCharacter(token.c)) + TokenType.CHARACTER => (block: parser.insertCharacter(token.r)) + TokenType.DOCTYPE => (block: parse_error) + ("<b>", "<big>", "<blockquote>", "<body>", "<br>", "<center>", "<code>", + "<dd>", "<div>", "<dl>", "<dt>", "<em>", "<embed>", "<h1>", "<h2>", "<h3>", + "<h4>", "<h5>", "<h6>", "<head>", "<hr>", "<i>", "<img>", "<li>", + "<listing>", "<menu>", "<meta>", "<nobr>", "<ol>", "<p>", "<pre>", + "<ruby>", "<s>", "<small>", 
"<span>", "<strong>", "<strike>", "<sub>", + "<sup>", "<table>", "<tt>", "<u>", "<ul>", "<var>") => (block: + parse_error + #NOTE MathML not implemented + while not (parser.currentNode.isHTMLIntegrationPoint() or parser.currentNode.inHTMLNamespace()): + pop_current_node + parser.processInHTMLContent(token) + ) + TokenType.START_TAG => (block: + #NOTE MathML not implemented + #TODO SVG + #TODO adjust foreign attributes + let element = parser.insertForeignElement(token, $parser.adjustedCurrentNode.namespace) + if token.selfclosing and element.inSVGNamespace(): + script_end_tag else: - parse_error missing_semicolon_after_character_reference - reconsume_in NUMERIC_CHARACTER_REFERENCE_END - - of NUMERIC_CHARACTER_REFERENCE_END: - ignore_eof # we reconsume anyway - case tokenizer.code - of 0x00: - parse_error null_character_reference - tokenizer.code = 0xFFFD - elif tokenizer.code > 0x10FFFF: - parse_error character_reference_outside_unicode_range - tokenizer.code = 0xFFFD - elif Rune(tokenizer.code).isSurrogate(): - parse_error surrogate_character_reference - tokenizer.code = 0xFFFD - elif Rune(tokenizer.code).isNonCharacter(): - parse_error noncharacter_character_reference - # do nothing - elif tokenizer.code in 0..255 and char(tokenizer.code) in ((Controls - AsciiWhitespace) + {chr(0x0D)}): - const ControlMapTable = [ - (0x80, 0x20AC), (0x82, 0x201A), (0x83, 0x0192), (0x84, 0x201E), - (0x85, 0x2026), (0x86, 0x2020), (0x87, 0x2021), (0x88, 0x02C6), - (0x89, 0x2030), (0x8A, 0x0160), (0x8B, 0x2039), (0x8C, 0x0152), - (0x8E, 0x017D), (0x91, 0x2018), (0x92, 0x2019), (0x93, 0x201C), - (0x94, 0x201D), (0x95, 0x2022), (0x96, 0x2013), (0x97, 0x2014), - (0x98, 0x02DC), (0x99, 0x2122), (0x9A, 0x0161), (0x9B, 0x203A), - (0x9C, 0x0153), (0x9E, 0x017E), (0x9F, 0x0178), - ].toTable() - if ControlMapTable.hasKey(tokenizer.code): - tokenizer.code = ControlMapTable[tokenizer.code] - tokenizer.tmp = $Rune(tokenizer.code) - flush_code_points_consumed_as_a_character_reference #TODO 
optimize so we flush directly - reconsume_in tokenizer.rstate # we unnecessarily consumed once so reconsume - -func inputSize*(str: string): int = - if str.len == 0: - return 20 - for c in str: - if not c.isDigit: - return 20 - return str.parseInt() - -#w3m's getescapecmd and parse_tag, transpiled to nim and heavily modified. -#(C) Copyright 1994-2002 by Akinori Ito -#(C) Copyright 2002-2011 by Akinori Ito, Hironori Sakamoto, Fumitoshi Ukai -# -#Use, modification and redistribution of this software is hereby granted, -#provided that this entire copyright notice is included on any copies of -#this software and applications and derivations thereof. -# -#This software is provided on an "as is" basis, without warranty of any -#kind, either expressed or implied, as to any matter including, but not -#limited to warranty of fitness of purpose, or merchantability, or -#results obtained from use of this software. -proc getescapecmd(buf: string, at: var int): string = - var i = at - - if buf[i] == '#': #num - inc i - var num: int - if buf[i].tolower() == 'x': #hex - inc i - if not isdigit(buf[i]): - at = i - return "&" - - num = hexValue(buf[i]) - inc i - while i < buf.len and hexValue(buf[i]) != -1: - num *= 0x10 - num += hexValue(buf[i]) - inc i - else: #dec - if not isDigit(buf[i]): - at = i - return "&" - - num = decValue(buf[i]) - inc i - while i < buf.len and isDigit(buf[i]): - num *= 10 - num += decValue(buf[i]) - inc i - - if buf[i] == ';': - inc i - at = i - return $(Rune(num)) - elif not isAlphaAscii(buf[i]): - return "&" - - var n = entityMap - var s = "" - while true: - s &= buf[i] - if not n.hasPrefix(s): - break - let pn = n - n = n{s} - if n != pn: - s = "" - inc i - - if n.value.issome: - at = i - return n.value.get - - return "&" - -type - DOMParsedTag = object - tagid: TagType - attrs: Table[string, string] - open: bool - -proc parse_tag(buf: string, at: var int): DOMParsedTag = - var tag = DOMParsedTag() - tag.open = true - - #Parse tag name - var tagname 
= "" - inc at - if buf[at] == '/': - inc at - tag.open = false - at = skipBlanks(buf, at) - - while at < buf.len and not buf[at].isWhitespace() and not (tag.open and buf[at] == '/') and buf[at] != '>' and buf[at].isAscii(): - tagname &= buf[at].tolower() - inc at - - tag.tagid = tagType(tagname) - at = skipBlanks(buf, at) - - while at < buf.len and buf[at] != '>': - var value = "" - var attrname = "" - while at < buf.len and buf[at] != '=' and not buf[at].isWhitespace() and buf[at] != '>': - var r: Rune - fastRuneAt(buf, at, r) - if r.isAscii(): - attrname &= char(r).tolower() + pop_current_node + ) + "</script>" => (block: + if parser.currentNode.namespace == Namespace.SVG and parser.currentNode.localName == "script": #TODO SVG + script_end_tag else: - attrname &= r - - at = skipBlanks(buf, at) - if at < buf.len and buf[at] == '=': - inc at - at = skipBlanks(buf, at) - if at < buf.len and (buf[at] == '"' or buf[at] == '\''): - let startc = buf[at] - inc at - while at < buf.len and buf[at] != startc: - if buf[at] == '&': - inc at - value &= getescapecmd(buf, at) - else: - value &= buf[at] - inc at - if at < buf.len: - inc at - elif at < buf.len: - while at < buf.len and not buf[at].isWhitespace() and buf[at] != '>': - var r: Rune - fastRuneAt(buf, at, r) - value &= $r - - if attrname.len > 0: - tag.attrs[attrname] = value - - while at < buf.len and buf[at] != '>': - inc at - - if at < buf.len and buf[at] == '>': - inc at - return tag - -proc insertNode(parent, node: Node) = - parent.childNodes.add(node) - - if parent.childNodes.len > 1: - let prevSibling = parent.childNodes[^2] - prevSibling.nextSibling = node - node.previousSibling = prevSibling - - node.parentNode = parent - if parent.nodeType == ELEMENT_NODE: - node.parentElement = Element(parent) - - if parent.ownerDocument != nil: - node.ownerDocument = parent.ownerDocument - elif parent.nodeType == DOCUMENT_NODE: - node.ownerDocument = Document(parent) - - if node.nodeType == ELEMENT_NODE: - 
parent.children.add(Element(node)) - - let element = (Element(node)) - if element.ownerDocument != nil: - element.ownerDocument.type_elements[element.tagType].add(element) - if element.id != "": - if not (element.id in element.ownerDocument.id_elements): - element.ownerDocument.id_elements[element.id] = newSeq[Element]() - element.ownerDocument.id_elements[element.id].add(element) - - for c in element.classList: - if not (c in element.ownerDocument.class_elements): - element.ownerDocument.class_elements[c] = newSeq[Element]() - element.ownerDocument.class_elements[c].add(element) - -proc processDocumentBody(state: var HTMLParseState) = - if not state.in_body: - state.in_body = true - if state.elementNode.ownerDocument != nil: - state.elementNode = state.elementNode.ownerDocument.body - -#TODO this adds text nodes to head -proc processDocumentAddNode(state: var HTMLParseState, newNode: Node) = - if state.elementNode.tagType == TAG_HTML: - if state.in_body: - state.elementNode = state.elementNode.ownerDocument.body + any_other_end_tag + ) + TokenType.END_TAG => (block: any_other_end_tag) + +proc constructTree(parser: var HTML5Parser): Document = + for token in parser.tokenizer.tokenize: + if parser.ignoreLF: + parser.ignoreLF = false + if token.t == CHARACTER_ASCII and token.c == '\n': + continue + if parser.openElements.len == 0 or + parser.adjustedCurrentNode.inHTMLNamespace() or + parser.adjustedCurrentNode.isHTMLIntegrationPoint() and token.t in {START_TAG, CHARACTER, CHARACTER_ASCII} or + token.t == EOF: + #NOTE MathML not implemented + parser.processInHTMLContent(token) else: - state.elementNode = state.elementNode.ownerDocument.head + #TODO disabled path because I'm pretty sure it'd just break things + #parser.processInForeignContent(token) + pop_current_node - insertNode(state.elementNode, newNode) + #TODO document.write (?) + #TODO etc etc... 
-proc processDocumentEndNode(state: var HTMLParseState) = - if state.elementNode == nil or state.elementNode.nodeType == DOCUMENT_NODE: - return - state.elementNode = state.elementNode.parentElement - -proc processDocumentText(state: var HTMLParseState) = - if state.textNode == nil: - state.textNode = newText() - processDocumentAddNode(state, state.textNode) - -proc processDocumentStartElement(state: var HTMLParseState, element: Element, tag: DOMParsedTag) = - var add = true - - for k, v in tag.attrs: - element.attributes[k] = v - - element.id = element.attr("id") - if element.attributes.hasKey("class"): - for w in unicode.split(element.attributes["class"], Rune(' ')): - element.classList.add(w) - - case element.tagType - of TAG_SCRIPT: - state.in_script = true - of TAG_NOSCRIPT: - state.in_noscript = true - of TAG_STYLE: - state.in_style = true - of TAG_SELECT: - HTMLSelectElement(element).name = element.attr("name") - HTMLSelectElement(element).value = element.attr("value") - of TAG_INPUT: - let element = HTMLInputElement(element) - element.value = element.attr("value") - element.inputType = element.attr("type").inputType() - element.size = element.attr("size").inputSize() - element.checked = element.attrb("checked") - if state.formowners.len > 0: - element.form = state.formowners[^1] - element.form.inputs.add(element) - of TAG_A: - HTMLAnchorElement(element).href = element.attr("href") - of TAG_OPTION: - HTMLOptionElement(element).value = element.attr("href") - of TAG_OL: - HTMLOListElement(element).start = element.attri("start") - HTMLOListElement(element).ordinalcounter = HTMLOListElement(element).start.get(1) - of TAG_LI: - HTMLLIElement(element).value = element.attri("value") - of TAG_HTML: - add = false - of TAG_HEAD: - add = false - state.in_body = false - if state.elementNode.ownerDocument != nil: - state.elementNode = state.elementNode.ownerDocument.head - of TAG_BODY: - add = false - of TAG_PRE: - state.skip_lf = true - of TAG_H1: - 
HTMLHeadingElement(element).rank = 1 - of TAG_H2: - HTMLHeadingElement(element).rank = 2 - of TAG_H3: - HTMLHeadingElement(element).rank = 3 - of TAG_H4: - HTMLHeadingElement(element).rank = 4 - of TAG_H5: - HTMLHeadingElement(element).rank = 5 - of TAG_H6: - HTMLHeadingElement(element).rank = 6 - of TAG_LINK: - HTMLLinkElement(element).href = element.attr("href") - HTMLLinkElement(element).rel = element.attr("rel") - of TAG_FORM: - let element = HTMLFormElement(element) - element.name = element.attr("name") - element.smethod = element.attr("method") - element.enctype = element.attr("enctype") - element.target = element.attr("target") - element.novalidate = element.attrb("novalidate") - state.formowners.add(element) - else: discard - - if not state.in_body and not (element.tagType in HeadTagTypes): - processDocumentBody(state) - - if state.elementNode.nodeType == ELEMENT_NODE: - if element.tagType in SelfClosingTagTypes: - if state.elementNode.tagType == element.tagType: - processDocumentEndNode(state) - - if state.elementNode.tagType == TAG_P and element.tagType in PClosingTagTypes: - processDocumentEndNode(state) - - if add: - processDocumentAddNode(state, element) - state.elementNode = element - - case element.tagType - of VoidTagTypes: - processDocumentEndNode(state) - of TAG_LI: - HTMLLIElement(element).applyOrdinal() #needs to know parent - else: discard + return parser.document -proc processDocumentEndElement(state: var HTMLParseState, tag: DOMParsedTag) = - if tag.tagid != state.elementNode.tagType: - if state.elementNode.tagType in SelfClosingTagTypes: - processDocumentEndNode(state) - processDocumentEndNode(state) - else: - case tag.tagid - of VoidTagTypes: - return - of TAG_HEAD: - processDocumentBody(state) - return - of TAG_BODY: - return - of TAG_FORM: - if state.formowners.len > 0: - discard state.formowners.pop() - of TAG_STYLE: - let style = HTMLStyleElement(state.elementNode) - var str = "" - for child in style.textNodes: - str &= child.data - let 
sheet = newStringStream(str).parseStylesheet() - style.parentElement.sheets.add(sheet) - else: discard - processDocumentEndNode(state) - -proc processDocumentTag(state: var HTMLParseState, tag: DOMParsedTag) = - if state.in_script: - if not tag.open and tag.tagid == TAG_SCRIPT: - state.in_script = false - else: - return - - if state.in_style: - if not tag.open and tag.tagid == TAG_STYLE: - state.in_style = false - else: - return - - if not tag.open and state.in_noscript: - if tag.tagid == TAG_NOSCRIPT: - state.in_noscript = false - else: - return - - if tag.open: - processDocumentStartElement(state, state.document.newHtmlElement(tag.tagid), tag) - else: - processDocumentEndElement(state, tag) - -proc processDocumentPart(state: var HTMLParseState, buf: string) = - var at = 0 - var max = 0 - var was_script = false - - max = buf.len - - template process_char(c: char) = - if state.in_comment: - state.commentNode.data &= c - else: - if not c.isWhitespace() and state.elementNode.tagType == TAG_HTML: - state.textNode = nil - processDocumentBody(state) - processDocumentText(state) - if not (state.skip_lf and c == '\n'): - processDocumentText(state) - state.textNode.data &= c - state.skip_lf = false - - template process_text(s: string) = - if state.in_comment: - state.commentNode.data &= s - else: - if not (state.skip_lf and s[0] == '\n'): - processDocumentText(state) - state.textNode.data &= s - state.skip_lf = false - - template has(buf: string, s: string): bool = - (at + s.len < buf.len and buf.substr(at, at + 8) == "</script>") - - while at < max: - case buf[at] - of '&': - inc at - let p = getescapecmd(buf, at) - process_text(p) - of '<': - if state.in_comment: - state.commentNode.data &= buf[at] - inc at - else: - var p = at - inc p - if p < max and buf[p] == '!': - inc p - if p < max and buf[p] == '-': - inc p - if p < max and buf[p] == '-': - inc p - at = p - state.in_comment = true - let comment = newComment() - state.commentNode = comment - 
processDocumentAddNode(state, comment) - state.textNode = nil - else: - #TODO for doctype - while p < max and buf[p] != '>': - inc p - at = p + 1 - continue - - if not state.in_comment: - state.textNode = nil - p = at - if state.in_script: - if buf.has("</script>"): - var tag = parse_tag(buf, at) - processDocumentTag(state, tag) - else: - process_char(buf[at]) - inc at - else: - var tag = parse_tag(buf, at) - processDocumentTag(state, tag) - elif buf[at] == '-' and state.in_comment: - var p = at - inc p - if p < max and buf[p] == '-': - inc p - if p < max and buf[p] == '>': - inc p - at = p - state.commentNode = nil - state.in_comment = false - - if state.in_comment: - state.commentNode.data &= buf[at] - inc at - else: - process_char(buf[at]) - inc at - -proc parseHtml5(inputStream: Stream, savesource: bool, source: var string): Document = - #TODO implement HTML5 parsing - var tokenizer = inputStream.newTokenizer() - for tok in tokenizer.tokenize: - eprint tok - -proc parseHtml(inputStream: Stream, savesource: bool, source: var string): Document = - let document = newDocument() - insertNode(document, document.root) - insertNode(document.root, document.head) - insertNode(document.root, document.body) - - var state = HTMLParseState() - state.document = document - state.elementNode = document.root - - var till_when = false - - var buf = "" - var lineBuf: string - while not inputStream.atEnd(): - lineBuf = inputStream.readLine() & '\n' - if savesource: - source &= lineBuf - buf &= lineBuf - - var at = 0 - while at < lineBuf.len: - case lineBuf[at] - of '<': - till_when = true - of '>': - till_when = false - else: discard - inc at - - if till_when: - continue - - processDocumentPart(state, buf) - buf = "" - - inputStream.close() - return document - -proc parseHtml*(inputStream: Stream, source: var string): Document = - return parseHtml(inputStream, true, source) - -proc parseHtml*(inputStream: Stream): Document = - var placeholder = "" - return parseHtml(inputStream, 
false, placeholder) +proc parseHTML5*(inputStream: Stream): Document = + var parser: HTML5Parser + parser.document = newDocument() + parser.tokenizer = inputStream.newTokenizer() + return parser.constructTree() |