import macros
import options
import sequtils
import streams
import strutils
import tables
import unicode

import data/charset
import encoding/decoderstream
import html/htmltokenizer
import html/parseerror
import html/tags
import utils/twtstr

# Generics break without exporting macros. Maybe a compiler bug?
export macros

# Heavily inspired by html5ever's TreeSink design.
type
  DOMBuilder*[Handle] = ref object of RootObj
    ## Set of callbacks through which the parser builds a DOM tree out of
    ## opaque `Handle` values. Callbacks documented as "May be nil" are
    ## optional.
    document*: Handle
      ## Must never be nil.
    finish*: DOMBuilderFinish[Handle]
      ## May be nil.
    parseError*: DOMBuilderParseError[Handle]
      ## May be nil.
    setQuirksMode*: DOMBuilderSetQuirksMode[Handle]
      ## May be nil.
    setCharacterSet*: DOMBuilderSetCharacterSet[Handle]
      ## May be nil.
    elementPopped*: DOMBuilderElementPopped[Handle]
      ## May be nil.
    getTemplateContent*: DOMBuilderGetTemplateContent[Handle]
      ## May be nil. (If nil, templates are treated as regular elements.)
    getParentNode*: DOMBuilderGetParentNode[Handle]
      ## Must never be nil.
    getLocalName*: DOMBuilderGetLocalName[Handle]
      ## Must never be nil.
    getTagType*: DOMBuilderGetTagType[Handle]
      ## May be nil. (If nil, the parser falls back to getLocalName.)
    getNamespace*: DOMBuilderGetNamespace[Handle]
      ## May be nil. (If nil, the parser always uses the HTML namespace.)
    createElement*: DOMBuilderCreateElement[Handle]
      ## Must never be nil.
    createComment*: DOMBuilderCreateComment[Handle]
      ## Must never be nil.
    createDocumentType*: DOMBuilderCreateDocumentType[Handle]
      ## Must never be nil.
    insertBefore*: DOMBuilderInsertBefore[Handle]
      ## Must never be nil.
    insertText*: DOMBuilderInsertText[Handle]
      ## Must never be nil.
    remove*: DOMBuilderRemove[Handle]
      ## Must never be nil.
    addAttrsIfMissing*: DOMBuilderAddAttrsIfMissing[Handle]
      ## May be nil. (If nil, some attributes may not be added to the HTML or
      ## BODY element if more than one of their respective opening tags
      ## exist.)
    setScriptAlreadyStarted*: DOMBuilderSetScriptAlreadyStarted[Handle]
      ## May be nil.
    associateWithForm*: DOMBuilderAssociateWithForm[Handle]
      ## May be nil.
    isSVGIntegrationPoint*: DOMBuilderIsSVGIntegrationPoint[Handle]
      ## May be nil. (If nil, the parser considers no Handle an SVG integration
      ## point.)

  HTML5ParserOpts*[Handle] = object
    isIframeSrcdoc*: bool
      ## Is the document an iframe srcdoc?
    scripting*: bool
      ## Is scripting enabled for this document?
    canReinterpret*: bool
      ## Can we try to parse the document again with a different character set?
      ##
      ## Note: this only works if inputStream is seekable, i.e.
      ## inputStream.setPosition(0) must work correctly.
      ##
      ## Note 2: when canReinterpret is false, confidence is set to
      ## certain, no BOM sniffing is performed and meta charset tags are
      ## disregarded. Expect this to change in the future.
    charsets*: seq[Charset]
      ## Fallback charsets. If empty, UTF-8 is used. In most cases, an empty
      ## sequence or a single-element sequence consisting of a character set
      ## chosen based on the user's locale will suffice.
      ##
      ## The parser goes through fallback charsets in the following order:
      ## * A charset stack is initialized to `charsets`, reversed. This
      ##   means that the first charset specified in `charsets` is on top of
      ##   the stack. (e.g. say `charsets = @[CHARSET_UTF_16_LE, CHARSET_UTF_8]`,
      ##   then utf-16-le is tried before utf-8.)
      ## * BOM sniffing is attempted. If successful, confidence is set to
      ##   certain and the resulting charset is used (i.e. other character
      ##   sets will not be tried for decoding this document.)
      ## * If the charset stack is empty, UTF-8 is pushed on top.
      ## * Attempt to parse the document with the first charset on top of
      ##   the stack.
      ## * If BOM sniffing was unsuccessful, and a meta charset tag
      ##   is encountered, parsing is restarted with the specified charset.
      ##   No further attempts are made to detect the encoding, and decoder
      ##   errors are signaled by U+FFFD replacement characters.
      ## * Otherwise, each charset on the charset stack is tried until either no
      ##   decoding errors are encountered, or only one charset is left. For
      ##   the last charset, decoder errors are signaled by U+FFFD replacement
      ##   characters.
    ctx*: Option[Handle]
      ## Context element for fragment parsing. When set to some Handle,
      ## the fragment case is used while parsing.

  DOMBuilderFinish*[Handle] =
    proc(builder: DOMBuilder[Handle]) {.nimcall.}
      ## Parsing has finished.

  DOMBuilderParseError*[Handle] =
    proc(builder: DOMBuilder[Handle], message: ParseError) {.nimcall.}
      ## Parse error. `message` is an error code either specified by the
      ## standard (in this case, message < LAST_SPECIFIED_ERROR) or named
      ## arbitrarily. (At the time of writing, only tokenizer errors have
      ## specified error codes.)

  DOMBuilderSetQuirksMode*[Handle] =
    proc(builder: DOMBuilder[Handle], quirksMode: QuirksMode) {.nimcall.}
      ## Set quirks mode to either QUIRKS or LIMITED_QUIRKS. NO_QUIRKS
      ## is the default and is therefore never used here.

  DOMBuilderSetCharacterSet*[Handle] =
    proc(builder: DOMBuilder[Handle], charset: Charset) {.nimcall.}
      ## Set the recognized charset, if it differs from the initial input.

  DOMBuilderElementPopped*[Handle] =
    proc(builder: DOMBuilder[Handle], element: Handle) {.nimcall.}
      ## Called when an element is popped from the stack of open elements
      ## (i.e. when it has been closed.)

  DOMBuilderGetTemplateContent*[Handle] =
    proc(builder: DOMBuilder[Handle], handle: Handle): Handle {.nimcall.}
      ## Retrieve a handle to the template element's contents.
      ## Note: this function must never return nil.

  DOMBuilderGetParentNode*[Handle] =
    proc(builder: DOMBuilder[Handle], handle: Handle): Handle {.nimcall.}
      ## Retrieve a handle to the parent node.

  DOMBuilderGetTagType*[Handle] =
    proc(builder: DOMBuilder[Handle], handle: Handle): TagType {.nimcall.}
      ## Retrieve the tag type of element.

  DOMBuilderGetLocalName*[Handle] =
    proc(builder: DOMBuilder[Handle], handle: Handle): string {.nimcall.}
      ## Retrieve the local name of element. (This is tagName(getTagType),
      ## unless the tag is unknown.)

  DOMBuilderGetNamespace*[Handle] =
    proc(builder: DOMBuilder[Handle], handle: Handle): Namespace {.nimcall.}
      ## Retrieve the namespace of element.

  DOMBuilderCreateElement*[Handle] =
    proc(builder: DOMBuilder[Handle], localName: string, namespace: Namespace,
        tagType: TagType, attrs: Table[string, string]): Handle {.nimcall.}
      ## Create a new element node.
      ##
      ## localName is the tag name of the token.
      ##
      ## namespace is the namespace passed to the function. (For HTML elements,
      ## it's HTML.)
      ##
      ## tagType is set based on localName. (This saves the consumer from
      ## having to interpret localName again.)
      ##
      ## attrs is a table of the token's attributes.

  DOMBuilderCreateComment*[Handle] =
    proc(builder: DOMBuilder[Handle], text: string): Handle {.nimcall.}
      ## Create a new comment node.

  DOMBuilderInsertText*[Handle] =
    proc(builder: DOMBuilder[Handle], parent: Handle, text: string,
        before: Handle) {.nimcall.}
      ## Insert a text node at the specified location with contents
      ## `text`. If the specified location has a previous sibling that is
      ## a text node, no new text node should be created, but instead `text`
      ## should be appended to the previous sibling's character data.

  DOMBuilderCreateDocumentType*[Handle] =
    proc(builder: DOMBuilder[Handle], name, publicId, systemId: string): Handle {.nimcall.}
      ## Create a new document type node.

  DOMBuilderInsertBefore*[Handle] =
    proc(builder: DOMBuilder[Handle], parent, child, before: Handle) {.nimcall.}
      ## Insert node `child` before the node called `before`.
      ##
      ## If `before` is nil, `child` is expected to be appended to `parent`'s
      ## node list.
      ##
      ## If `child` is a text, and its previous sibling after insertion is a
      ## text as well, then they should be merged. `before` is never a
      ## text node (and thus never has to be merged).
      ##
      ## Note: parent may either be an Element or a Document node.

  DOMBuilderRemove*[Handle] =
    proc(builder: DOMBuilder[Handle], child: Handle) {.nimcall.}
      ## Remove `child` from its parent node, and do nothing if `child`
      ## has no parent node.

  DOMBuilderReparent*[Handle] =
    proc(builder: DOMBuilder[Handle], child, newParent: Handle) {.nimcall.}
      ## Remove `child` from its parent node, and append it to `newParent`.
      ## In terms of DOM operations, this should be equivalent to calling
      ## `child.remove()`, followed by `newParent.append(child)`.

  DOMBuilderAddAttrsIfMissing*[Handle] =
    proc(builder: DOMBuilder[Handle], element: Handle,
        attrs: Table[string, string]) {.nimcall.}
      ## Add the attributes in `attrs` to the element node `element`.
      ## At the time of writing, called for HTML and BODY only. (This may
      ## change in the future.)
      ## An example implementation:
      ## ```nim
      ## for k, v in attrs:
      ##   if k notin element.attrs:
      ##     element.attrs[k] = v
      ## ```

  DOMBuilderSetScriptAlreadyStarted*[Handle] =
    proc(builder: DOMBuilder[Handle], script: Handle) {.nimcall.}
      ## Set the "already started" flag for the script element.
      ##
      ## Note: this flag is not togglable, so this callback should just set it
      ## to true.

  DOMBuilderAssociateWithForm*[Handle] =
    proc(builder: DOMBuilder[Handle], element, form, intendedParent: Handle) {.nimcall.}
      ## Called after createElement. Attempts to set form for form-associated
      ## elements.
      ##
      ## Note: the DOM builder is responsible for checking whether the
      ## intended parent and the form element are in the same tree.

  DOMBuilderIsSVGIntegrationPoint*[Handle] =
    proc(builder: DOMBuilder[Handle], element: Handle): bool {.nimcall.}
      ## Check if element is an SVG integration point.

type
  CharsetConfidence = enum
    CONFIDENCE_TENTATIVE, CONFIDENCE_CERTAIN, CONFIDENCE_IRRELEVANT

  HTML5Parser[Handle] = object
    ## Tree construction state of a single parse.
    quirksMode: QuirksMode
    dombuilder: DOMBuilder[Handle]
    opts: HTML5ParserOpts[Handle]
    ctx: Option[Handle]
    needsreinterpret: bool
    charset: Charset
    confidence: CharsetConfidence
    openElements: seq[Handle]
    insertionMode: InsertionMode
    oldInsertionMode: InsertionMode
    templateModes: seq[InsertionMode]
    head: Option[Handle]
    tokenizer: Tokenizer
    form: Option[Handle]
    fosterParenting: bool
    # Handle is an element. nil => marker
    activeFormatting: seq[(Handle, Token)]
    framesetok: bool
    ignoreLF: bool
    pendingTableChars: string
    pendingTableCharsWhitespace: bool

  AdjustedInsertionLocation[Handle] = tuple[inside, before: Handle]

  # 13.2.4.1
  InsertionMode = enum
    INITIAL, BEFORE_HTML, BEFORE_HEAD, IN_HEAD, IN_HEAD_NOSCRIPT, AFTER_HEAD,
    IN_BODY, TEXT, IN_TABLE, IN_TABLE_TEXT, IN_CAPTION, IN_COLUMN_GROUP,
    IN_TABLE_BODY, IN_ROW, IN_CELL, IN_SELECT, IN_SELECT_IN_TABLE, IN_TEMPLATE,
    AFTER_BODY, IN_FRAMESET, AFTER_FRAMESET, AFTER_AFTER_BODY,
    AFTER_AFTER_FRAMESET

# DOMBuilder interface functions
# Thin wrappers around the DOMBuilder callbacks. Optional callbacks are
# checked for nil before being invoked; mandatory ones are called directly.
proc finish[Handle](parser: HTML5Parser[Handle]) =
  ## Notifies the builder that parsing has finished (if it cares).
  if parser.dombuilder.finish != nil:
    parser.dombuilder.finish(parser.dombuilder)

proc parseError(parser: HTML5Parser, e: ParseError) =
  ## Reports parse error `e` to the builder (if it cares).
  if parser.dombuilder.parseError != nil:
    parser.dombuilder.parseError(parser.dombuilder, e)

proc setQuirksMode[Handle](parser: var HTML5Parser[Handle], mode: QuirksMode) =
  ## Records `mode` on the parser and forwards it to the builder.
  parser.quirksMode = mode
  if parser.dombuilder.setQuirksMode != nil:
    parser.dombuilder.setQuirksMode(parser.dombuilder, mode)

func document[Handle](parser: HTML5Parser[Handle]): Handle {.inline.} =
  return parser.dombuilder.document

func getTemplateContent[Handle](parser: HTML5Parser[Handle],
    handle: Handle): Handle =
  ## Callers must check that the getTemplateContent callback is not nil.
  let dombuilder = parser.dombuilder
  return dombuilder.getTemplateContent(dombuilder, handle)

func getParentNode[Handle](parser: HTML5Parser[Handle],
    handle: Handle): Handle =
  let dombuilder = parser.dombuilder
  return dombuilder.getParentNode(dombuilder, handle)

func getLocalName[Handle](parser: HTML5Parser[Handle], handle: Handle): string =
  return parser.dombuilder.getLocalName(parser.dombuilder, handle)

func getTagType[Handle](parser: HTML5Parser[Handle], handle: Handle): TagType =
  ## Falls back to deriving the tag type from the local name when the builder
  ## provides no getTagType callback.
  if parser.dombuilder.getTagType != nil:
    return parser.dombuilder.getTagType(parser.dombuilder, handle)
  return tagType(parser.getLocalName(handle))

func getNamespace[Handle](parser: HTML5Parser[Handle],
    handle: Handle): Namespace =
  ## Falls back to the HTML namespace when the builder provides no
  ## getNamespace callback.
  if parser.dombuilder.getNamespace != nil:
    return parser.dombuilder.getNamespace(parser.dombuilder, handle)
  return Namespace.HTML

func createElement[Handle](parser: HTML5Parser[Handle], localName: string,
    namespace: Namespace, tagType: TagType,
    attrs: Table[string, string]): Handle =
  return parser.dombuilder.createElement(parser.dombuilder, localName,
    namespace, tagType, attrs)

func createElement[Handle](parser: HTML5Parser[Handle], tagType: TagType,
    namespace: Namespace): Handle =
  ## Creates an attribute-less element of a known tag type.
  return parser.createElement(tagName(tagType), namespace, tagType,
    Table[string, string]())

func createComment[Handle](parser: HTML5Parser[Handle], text: string): Handle =
  let dombuilder = parser.dombuilder
  return dombuilder.createComment(dombuilder, text)

proc createDocumentType[Handle](parser: HTML5Parser[Handle], name, publicId,
    systemId: string): Handle =
  let dombuilder = parser.dombuilder
  return dombuilder.createDocumentType(dombuilder, name, publicId, systemId)

proc insertBefore[Handle](parser: HTML5Parser[Handle], parent, node,
    before: Handle) =
  let dombuilder = parser.dombuilder
  dombuilder.insertBefore(dombuilder, parent, node, before)

proc insertText[Handle](parser: HTML5Parser[Handle], parent: Handle,
    text: string, before: Handle) =
  let dombuilder = parser.dombuilder
  dombuilder.insertText(dombuilder, parent, text, before)

proc remove[Handle](parser: HTML5Parser[Handle], child: Handle) =
  let dombuilder = parser.dombuilder
  dombuilder.remove(dombuilder, child)

proc addAttrsIfMissing[Handle](parser: HTML5Parser, element: Handle,
    attrs: Table[string, string]) =
  let dombuilder = parser.dombuilder
  if dombuilder.addAttrsIfMissing != nil:
    dombuilder.addAttrsIfMissing(dombuilder, element, attrs)

proc setScriptAlreadyStarted[Handle](parser: HTML5Parser, script: Handle) =
  let dombuilder = parser.dombuilder
  if dombuilder.setScriptAlreadyStarted != nil:
    dombuilder.setScriptAlreadyStarted(dombuilder, script)

proc associateWithForm[Handle](parser: HTML5Parser, element, form,
    intendedParent: Handle) =
  let dombuilder = parser.dombuilder
  if dombuilder.associateWithForm != nil:
    dombuilder.associateWithForm(dombuilder, element, form, intendedParent)

func isSVGIntegrationPoint[Handle](parser: HTML5Parser,
    element: Handle): bool =
  ## Without the callback, no element is an SVG integration point.
  let dombuilder = parser.dombuilder
  if dombuilder.isSVGIntegrationPoint != nil:
    return dombuilder.isSVGIntegrationPoint(dombuilder, element)
  return false

# Parser
func hasParseError(parser: HTML5Parser): bool =
  ## True if the builder wants to be told about parse errors.
  return parser.dombuilder.parseError != nil

func tagNameEquals[Handle](parser: HTML5Parser, handle: Handle,
    token: Token): bool =
  ## Compares by tag type when known, by local name otherwise.
  let tagType = parser.getTagType(handle)
  if tagType != TAG_UNKNOWN:
    return tagType == token.tagtype
  let localName = parser.getLocalName(handle)
  return localName == token.tagname

func tagNameEquals[Handle](parser: HTML5Parser, a, b: Handle): bool =
  ## Compares by tag type when known, by local name otherwise.
  let tagType = parser.getTagType(a)
  if tagType != TAG_UNKNOWN:
    return tagType == parser.getTagType(b)
  return parser.getLocalName(a) == parser.getLocalName(b)

func fragment(parser: HTML5Parser): bool =
  ## True when parsing in the fragment case (a context element is set).
  return parser.ctx.isSome

# https://html.spec.whatwg.org/multipage/parsing.html#reset-the-insertion-mode-appropriately
proc resetInsertionMode(parser: var HTML5Parser) =
  ## Walks the stack of open elements from the current node downwards and
  ## picks the insertion mode matching the first element that determines one.
  template switch_insertion_mode_and_return(mode: InsertionMode) =
    parser.insertionMode = mode
    return
  for i in countdown(parser.openElements.high, 0):
    let last = i == 0
    # The context element only stands in for the *first* node of the stack
    # (fragment case); every other node is inspected as-is.
    let node =
      if last and parser.fragment: parser.ctx.get
      else: parser.openElements[i]
    let tagType = parser.getTagType(node)
    if tagType == TAG_SELECT:
      if not last:
        # Inspect the ancestors of the select element, i.e. the nodes below
        # it on the stack.
        for j in countdown(i - 1, 0):
          let ancestor = parser.openElements[j]
          case parser.getTagType(ancestor)
          of TAG_TEMPLATE: break
          of TAG_TABLE: switch_insertion_mode_and_return IN_SELECT_IN_TABLE
          else: discard
      switch_insertion_mode_and_return IN_SELECT
    case tagType
    of TAG_TD, TAG_TH:
      if not last:
        switch_insertion_mode_and_return IN_CELL
    of TAG_TR: switch_insertion_mode_and_return IN_ROW
    of TAG_TBODY, TAG_THEAD, TAG_TFOOT:
      switch_insertion_mode_and_return IN_TABLE_BODY
    of TAG_CAPTION: switch_insertion_mode_and_return IN_CAPTION
    of TAG_COLGROUP: switch_insertion_mode_and_return IN_COLUMN_GROUP
    of TAG_TABLE: switch_insertion_mode_and_return IN_TABLE
    of TAG_TEMPLATE: switch_insertion_mode_and_return parser.templateModes[^1]
    of TAG_HEAD:
      if not last:
        switch_insertion_mode_and_return IN_HEAD
    of TAG_BODY: switch_insertion_mode_and_return IN_BODY
    of TAG_FRAMESET: switch_insertion_mode_and_return IN_FRAMESET
    of TAG_HTML:
      if parser.head.isNone:
        switch_insertion_mode_and_return BEFORE_HEAD
      else:
        switch_insertion_mode_and_return AFTER_HEAD
    else: discard
    if last:
      switch_insertion_mode_and_return IN_BODY

func currentNode[Handle](parser: HTML5Parser[Handle]): Handle =
  ## The bottommost (most recently pushed) open element.
  return parser.openElements[^1]

func adjustedCurrentNode[Handle](parser: HTML5Parser[Handle]): Handle =
  ## The context element in the fragment case, the current node otherwise.
  if parser.fragment: parser.ctx.get
  else: parser.currentNode

func lastElementOfTag[Handle](parser: HTML5Parser[Handle],
    tagType: TagType): tuple[element: Handle, pos: int] =
  ## Returns the most recently opened element of `tagType` with its stack
  ## index, or `(nil, -1)` if there is none.
  for i in countdown(parser.openElements.high, 0):
    if parser.getTagType(parser.openElements[i]) == tagType:
      return (parser.openElements[i], i)
  return (nil, -1)

template last_child_of[Handle](n: Handle): AdjustedInsertionLocation[Handle] =
  (n, nil)

# https://html.spec.whatwg.org/multipage/#appropriate-place-for-inserting-a-node
func appropriatePlaceForInsert[Handle](parser: HTML5Parser[Handle],
    target: Handle): AdjustedInsertionLocation[Handle] =
  ## Computes where a new node should be inserted for override target
  ## `target`, taking foster parenting and template contents into account.
  assert parser.getTagType(parser.openElements[0]) == TAG_HTML
  let targetTagType = parser.getTagType(target)
  const FosterTagTypes = {TAG_TABLE, TAG_TBODY, TAG_TFOOT, TAG_THEAD, TAG_TR}
  if parser.fosterParenting and targetTagType in FosterTagTypes:
    let lastTemplate = parser.lastElementOfTag(TAG_TEMPLATE)
    let lastTable = parser.lastElementOfTag(TAG_TABLE)
    if lastTemplate.element != nil and
        parser.dombuilder.getTemplateContent != nil and
        (lastTable.element == nil or lastTable.pos < lastTemplate.pos):
      let content = parser.getTemplateContent(lastTemplate.element)
      return last_child_of(content)
    if lastTable.element == nil:
      return last_child_of(parser.openElements[0])
    let parentNode = parser.getParentNode(lastTable.element)
    if parentNode != nil:
      return (parentNode, lastTable.element)
    let previousElement = parser.openElements[lastTable.pos - 1]
    result = last_child_of(previousElement)
  else:
    result = last_child_of(target)
  if parser.getTagType(result.inside) == TAG_TEMPLATE and
      parser.dombuilder.getTemplateContent != nil:
    result = (parser.getTemplateContent(result.inside), nil)

func appropriatePlaceForInsert[Handle](parser: HTML5Parser[Handle]):
    AdjustedInsertionLocation[Handle] =
  parser.appropriatePlaceForInsert(parser.currentNode)

func hasElement[Handle](parser: HTML5Parser[Handle], tag: TagType): bool =
  ## True if any open element has tag type `tag`.
  for element in parser.openElements:
    if parser.getTagType(element) == tag:
      return true
  return false

func hasElement[Handle](parser: HTML5Parser[Handle],
    tags: set[TagType]): bool =
  ## True if any open element has a tag type in `tags`.
  for element in parser.openElements:
    if parser.getTagType(element) in tags:
      return true
  return false

# The hasElementInSpecificScope overloads walk the stack from the current
# node downwards; the search succeeds on reaching the target and fails on
# reaching an element whose type is in `list`. One of the two is expected to
# happen before the stack is exhausted.
func hasElementInSpecificScope[Handle](parser: HTML5Parser[Handle],
    target: Handle, list: set[TagType]): bool =
  for i in countdown(parser.openElements.high, 0):
    if parser.openElements[i] == target:
      return true
    if parser.getTagType(parser.openElements[i]) in list:
      return false
  assert false

func hasElementInSpecificScope[Handle](parser: HTML5Parser[Handle],
    target: TagType, list: set[TagType]): bool =
  for i in countdown(parser.openElements.high, 0):
    let tagType = parser.getTagType(parser.openElements[i])
    if tagType == target:
      return true
    if tagType in list:
      return false
  assert false

func hasElementInSpecificScope[Handle](parser: HTML5Parser[Handle],
    target: set[TagType], list: set[TagType]): bool =
  for i in countdown(parser.openElements.high, 0):
    let tagType = parser.getTagType(parser.openElements[i])
    if tagType in target:
      return true
    if tagType in list:
      return false
  assert false

const Scope = {
  TAG_APPLET, TAG_CAPTION, TAG_HTML, TAG_TABLE, TAG_TD, TAG_TH, TAG_MARQUEE,
  TAG_OBJECT, TAG_TEMPLATE #TODO SVG
  # Note: MathML is not implemented
}

func hasElementInScope[Handle](parser: HTML5Parser[Handle],
    target: TagType): bool =
  return parser.hasElementInSpecificScope(target, Scope)

func hasElementInScope[Handle](parser: HTML5Parser[Handle],
    target: set[TagType]): bool =
  return parser.hasElementInSpecificScope(target, Scope)

func hasElementInScope[Handle](parser: HTML5Parser[Handle],
    target: Handle): bool =
  return parser.hasElementInSpecificScope(target, Scope)

func hasElementInListItemScope[Handle](parser: HTML5Parser[Handle],
    target: TagType): bool =
  const ListItemScope = Scope + {TAG_OL, TAG_UL}
  return parser.hasElementInSpecificScope(target, ListItemScope)

func hasElementInButtonScope[Handle](parser: HTML5Parser[Handle],
    target: TagType): bool =
  const ButtonScope = Scope + {TAG_BUTTON}
  return parser.hasElementInSpecificScope(target, ButtonScope)

const TableScope = {TAG_HTML, TAG_TABLE, TAG_TEMPLATE}

func hasElementInTableScope[Handle](parser: HTML5Parser[Handle],
    target: TagType): bool =
  return parser.hasElementInSpecificScope(target, TableScope)

func hasElementInTableScope[Handle](parser: HTML5Parser[Handle],
    target: set[TagType]): bool =
  return parser.hasElementInSpecificScope(target, TableScope)

func hasElementInSelectScope[Handle](parser: HTML5Parser[Handle],
    target: TagType): bool =
  ## Select scope is delimited by every element type except optgroup/option.
  for i in countdown(parser.openElements.high, 0):
    let tagType = parser.getTagType(parser.openElements[i])
    if tagType == target:
      return true
    if tagType notin {TAG_OPTION, TAG_OPTGROUP}:
      return false
  assert false

func createElement[Handle](parser: HTML5Parser[Handle], token: Token,
    namespace: Namespace, intendedParent: Handle): Handle =
  ## Creates an element for `token` and, where applicable, asks the builder
  ## to associate it with the current form element.
  #TODO custom elements
  let localName = token.tagname
  let element = parser.createElement(localName, namespace, token.tagtype,
    token.attrs)
  if token.tagtype in FormAssociatedElements and parser.form.isSome and
      not parser.hasElement(TAG_TEMPLATE) and
      (token.tagtype notin ListedElements or "form" notin token.attrs):
    parser.associateWithForm(element, parser.form.get, intendedParent)
  return element

proc pushElement[Handle](parser: var HTML5Parser[Handle], node: Handle) =
  ## Pushes `node` onto the stack of open elements and tells the tokenizer
  ## whether the adjusted current node is outside the HTML namespace.
  parser.openElements.add(node)
  let node = parser.adjustedCurrentNode()
  parser.tokenizer.hasnonhtml = parser.getNamespace(node) != Namespace.HTML

proc popElement[Handle](parser: var HTML5Parser[Handle]): Handle =
  ## Pops and returns the current node, notifying the builder and updating
  ## the tokenizer's non-HTML flag.
  result = parser.openElements.pop()
  if parser.dombuilder.elementPopped != nil:
    parser.dombuilder.elementPopped(parser.dombuilder, result)
  if parser.openElements.len == 0:
    parser.tokenizer.hasnonhtml = false
  else:
    let node = parser.adjustedCurrentNode()
    parser.tokenizer.hasnonhtml = parser.getNamespace(node) != Namespace.HTML

template pop_current_node = discard parser.popElement()

proc insert[Handle](parser: HTML5Parser[Handle],
    location: AdjustedInsertionLocation[Handle], node: Handle) =
  parser.insertBefore(location.inside, node, location.before)

proc append[Handle](parser: HTML5Parser[Handle], parent, node: Handle) =
  parser.insertBefore(parent, node, nil)

proc insertForeignElement[Handle](parser: var HTML5Parser[Handle],
    token: Token, namespace: Namespace): Handle =
  ## Creates an element for `token`, inserts it at the appropriate place and
  ## pushes it onto the stack of open elements.
  let location = parser.appropriatePlaceForInsert()
  let element = parser.createElement(token, namespace, location.inside)
  #TODO custom elements
  parser.insert(location, element)
  parser.pushElement(element)
  return element

proc insertHTMLElement[Handle](parser: var HTML5Parser[Handle],
    token: Token): Handle =
  return parser.insertForeignElement(token, Namespace.HTML)

proc adjustSVGAttributes(token: Token) =
  ## Renames the lowercased SVG attribute names produced by the tokenizer to
  ## their canonical mixed-case spelling.
  const adjusted = {
    "attributename": "attributeName",
    "attributetype": "attributeType",
    "basefrequency": "baseFrequency",
    "baseprofile": "baseProfile",
    "calcmode": "calcMode",
    "clippathunits": "clipPathUnits",
    "diffuseconstant": "diffuseConstant",
    "edgemode": "edgeMode",
    "filterunits": "filterUnits",
    "glyphref": "glyphRef",
    "gradienttransform": "gradientTransform",
    "gradientunits": "gradientUnits",
    "kernelmatrix": "kernelMatrix",
    "kernelunitlength": "kernelUnitLength",
    "keypoints": "keyPoints",
    "keysplines": "keySplines",
    "keytimes": "keyTimes",
    "lengthadjust": "lengthAdjust",
    "limitingconeangle": "limitingConeAngle",
    "markerheight": "markerHeight",
    "markerunits": "markerUnits",
    "markerwidth": "markerWidth",
    "maskcontentunits": "maskContentUnits",
    "maskunits": "maskUnits",
    "numoctaves": "numOctaves",
    "pathlength": "pathLength",
    "patterncontentunits": "patternContentUnits",
    "patterntransform": "patternTransform",
    "patternunits": "patternUnits",
    "pointsatx": "pointsAtX",
    "pointsaty": "pointsAtY",
    "pointsatz": "pointsAtZ",
    "preservealpha": "preserveAlpha",
    "preserveaspectratio": "preserveAspectRatio",
    "primitiveunits": "primitiveUnits",
    "refx": "refX",
    "refy": "refY",
    "repeatcount": "repeatCount",
    "repeatdur": "repeatDur",
    "requiredextensions": "requiredExtensions",
    "requiredfeatures": "requiredFeatures",
    "specularconstant": "specularConstant",
    "specularexponent": "specularExponent",
    "spreadmethod": "spreadMethod",
    "startoffset": "startOffset",
    "stddeviation": "stdDeviation",
    "stitchtiles": "stitchTiles",
    "surfacescale": "surfaceScale",
    "systemlanguage": "systemLanguage",
    "tablevalues": "tableValues",
    "targetx": "targetX",
    "targety": "targetY",
    "textlength": "textLength",
    "viewbox": "viewBox",
    "viewtarget": "viewTarget",
    "xchannelselector": "xChannelSelector",
    "ychannelselector": "yChannelSelector",
    "zoomandpan": "zoomAndPan",
  }.toTable()
  # Collect first: the table must not be modified while iterating over it.
  var todo: seq[string]
  for k in token.attrs.keys:
    if k in adjusted:
      todo.add(k)
  for s in todo:
    # Rename, i.e. drop the lowercase name instead of keeping both spellings.
    let value = token.attrs[s]
    token.attrs.del(s)
    token.attrs[adjusted[s]] = value

template insert_character_impl(parser: var HTML5Parser, data: typed) =
  let location = parser.appropriatePlaceForInsert()
  # Text is never inserted directly into the document node.
  if location.inside.nodeType == DOCUMENT_NODE:
    return
  insertText(parser, location.inside, $data, location.before)

proc insertCharacter(parser: var HTML5Parser, data: string) =
  insert_character_impl(parser, data)

proc insertCharacter(parser: var HTML5Parser, data: char) =
  insert_character_impl(parser, data)

proc insertCharacter(parser: var HTML5Parser, data: Rune) =
  insert_character_impl(parser, data)

proc insertComment[Handle](parser: var HTML5Parser[Handle], token: Token,
    position: AdjustedInsertionLocation[Handle]) =
  let comment = parser.createComment(token.data)
  parser.insert(position, comment)

proc insertComment(parser: var HTML5Parser, token: Token) =
  let position = parser.appropriatePlaceForInsert()
  parser.insertComment(token, position)

const PublicIdentifierEquals = [
  "-//W3O//DTD W3 HTML Strict 3.0//EN//",
  "-/W3C/DTD HTML 4.0 Transitional/EN",
  "HTML"
]

const PublicIdentifierStartsWith = [
  "+//Silmaril//dtd html Pro v0r11 19970101//",
  "-//AS//DTD HTML 3.0 asWedit + extensions//",
  "-//AdvaSoft Ltd//DTD HTML 3.0 asWedit + extensions//",
  "-//IETF//DTD HTML 2.0 Level 1//",
  "-//IETF//DTD HTML 2.0 Level 2//",
  "-//IETF//DTD HTML 2.0 Strict Level 1//",
  "-//IETF//DTD HTML 2.0 Strict Level 2//",
  "-//IETF//DTD HTML 2.0 Strict//",
  "-//IETF//DTD HTML 2.0//",
  "-//IETF//DTD HTML 2.1E//",
  "-//IETF//DTD HTML 3.0//",
  "-//IETF//DTD HTML 3.2 Final//",
  "-//IETF//DTD HTML 3.2//",
  "-//IETF//DTD HTML 3//",
  "-//IETF//DTD HTML Level 0//",
  "-//IETF//DTD HTML Level 1//",
  "-//IETF//DTD HTML Level 2//",
  "-//IETF//DTD HTML Level 3//",
  "-//IETF//DTD HTML Strict Level 0//",
  "-//IETF//DTD HTML Strict Level 1//",
  "-//IETF//DTD HTML Strict Level 2//",
  "-//IETF//DTD HTML Strict Level 3//",
  "-//IETF//DTD HTML Strict//",
  "-//IETF//DTD HTML//",
  "-//Metrius//DTD Metrius Presentational//",
  "-//Microsoft//DTD Internet Explorer 2.0 HTML Strict//",
  "-//Microsoft//DTD Internet Explorer 2.0 HTML//",
  "-//Microsoft//DTD Internet Explorer 2.0 Tables//",
  "-//Microsoft//DTD Internet Explorer 3.0 HTML Strict//",
  "-//Microsoft//DTD Internet Explorer 3.0 HTML//",
  "-//Microsoft//DTD Internet Explorer 3.0 Tables//",
  "-//Netscape Comm. Corp.//DTD HTML//",
  "-//Netscape Comm. Corp.//DTD Strict HTML//",
  "-//O'Reilly and Associates//DTD HTML 2.0//",
  "-//O'Reilly and Associates//DTD HTML Extended 1.0//",
  "-//O'Reilly and Associates//DTD HTML Extended Relaxed 1.0//",
  "-//SQ//DTD HTML 2.0 HoTMetaL + extensions//",
  "-//SoftQuad Software//DTD HoTMetaL PRO 6.0::19990601::extensions to HTML 4.0//",
  "-//SoftQuad//DTD HoTMetaL PRO 4.0::19971010::extensions to HTML 4.0//",
  "-//Spyglass//DTD HTML 2.0 Extended//",
  "-//Sun Microsystems Corp.//DTD HotJava HTML//",
  "-//Sun Microsystems Corp.//DTD HotJava Strict HTML//",
  "-//W3C//DTD HTML 3 1995-03-24//",
  "-//W3C//DTD HTML 3.2 Draft//",
  "-//W3C//DTD HTML 3.2 Final//",
  "-//W3C//DTD HTML 3.2//",
  "-//W3C//DTD HTML 3.2S Draft//",
  "-//W3C//DTD HTML 4.0 Frameset//",
  "-//W3C//DTD HTML 4.0 Transitional//",
  "-//W3C//DTD HTML Experimental 19960712//",
  "-//W3C//DTD HTML Experimental 970421//",
  "-//W3C//DTD W3 HTML//",
  "-//W3O//DTD W3 HTML 3.0//",
  "-//WebTechs//DTD Mozilla HTML 2.0//",
  "-//WebTechs//DTD Mozilla HTML//",
]

const SystemIdentifierMissingAndPublicIdentifierStartsWith = [
  "-//W3C//DTD HTML 4.01 Frameset//",
  "-//W3C//DTD HTML 4.01 Transitional//"
]

const PublicIdentifierStartsWithLimited = [
  "-//W3C//DTD XHTML 1.0 Frameset//",
  "-//W3C//DTD XHTML 1.0 Transitional//"
]

const SystemIdentifierNotMissingAndPublicIdentifierStartsWith = [
  "-//W3C//DTD HTML 4.01 Frameset//",
  "-//W3C//DTD HTML 4.01 Transitional//"
]

func quirksConditions(token: Token): bool =
  ## True if the DOCTYPE `token` puts the document into quirks mode.
  ## Identifiers are compared ASCII case-insensitively, as the standard
  ## requires for both the exact and the prefix matches.
  const QuirkySystemId =
    "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"
  if token.quirks: return true
  if token.name.isNone or token.name.get != "html": return true
  if token.sysid.isSome:
    if strutils.cmpIgnoreCase(token.sysid.get, QuirkySystemId) == 0:
      return true
  if token.pubid.isSome:
    let pubid = token.pubid.get
    for id in PublicIdentifierEquals:
      if strutils.cmpIgnoreCase(pubid, id) == 0:
        return true
    for id in PublicIdentifierStartsWith:
      if pubid.startsWithNoCase(id):
        return true
    if token.sysid.isNone:
      for id in SystemIdentifierMissingAndPublicIdentifierStartsWith:
        if pubid.startsWithNoCase(id):
          return true
  return false

func limitedQuirksConditions(token: Token): bool =
  ## True if the DOCTYPE `token` puts the document into limited-quirks mode.
  if token.pubid.isNone: return false
  for id in PublicIdentifierStartsWithLimited:
    if token.pubid.get.startsWithNoCase(id):
      return true
  if token.sysid.isNone: return false
  for id in SystemIdentifierNotMissingAndPublicIdentifierStartsWith:
    if token.pubid.get.startsWithNoCase(id):
      return true
  return false

# 13.2.6.2
proc genericRawtextElementParsingAlgorithm(parser: var HTML5Parser,
    token: Token) =
  ## Inserts the element, then switches tokenizer to RAWTEXT and the parser to
  ## the TEXT insertion mode, remembering the mode to return to.
  discard parser.insertHTMLElement(token)
  parser.tokenizer.state = RAWTEXT
  parser.oldInsertionMode = parser.insertionMode
  parser.insertionMode = TEXT

proc genericRCDATAElementParsingAlgorithm(parser: var HTML5Parser,
    token: Token) =
  ## Same as the RAWTEXT variant, but with the RCDATA tokenizer state.
  discard parser.insertHTMLElement(token)
  parser.tokenizer.state = RCDATA
  parser.oldInsertionMode = parser.insertionMode
  parser.insertionMode = TEXT

# Pop all elements, including the specified tag.
proc popElementsIncl(parser: var HTML5Parser, tag: TagType) =
  ## Pops open elements until an element of type `tag` has been popped.
  while true:
    let popped = parser.popElement()
    if parser.getTagType(popped) == tag:
      break

proc popElementsIncl(parser: var HTML5Parser, tags: set[TagType]) =
  ## Pops open elements until an element whose type is in `tags` has been
  ## popped.
  while true:
    let popped = parser.popElement()
    if parser.getTagType(popped) in tags:
      break

# https://html.spec.whatwg.org/multipage/parsing.html#closing-elements-that-have-implied-end-tags
proc generateImpliedEndTags(parser: var HTML5Parser) =
  ## Pops the current node for as long as its end tag may be implied.
  const implied = {
    TAG_DD, TAG_DT, TAG_LI, TAG_OPTGROUP, TAG_OPTION, TAG_P, TAG_RB, TAG_RP,
    TAG_RT, TAG_RTC
  }
  while true:
    if parser.getTagType(parser.currentNode) notin implied:
      break
    discard parser.popElement()

proc generateImpliedEndTags(parser: var HTML5Parser, exclude: TagType) =
  ## Like the overload above, but never pops elements of type `exclude`.
  var implied = {
    TAG_DD, TAG_DT, TAG_LI, TAG_OPTGROUP, TAG_OPTION, TAG_P, TAG_RB, TAG_RP,
    TAG_RT, TAG_RTC
  }
  implied.excl(exclude)
  while true:
    if parser.getTagType(parser.currentNode) notin implied:
      break
    discard parser.popElement()

proc generateImpliedEndTagsThoroughly(parser: var HTML5Parser) =
  ## Variant that additionally closes table-related elements.
  const implied = {
    TAG_CAPTION, TAG_COLGROUP, TAG_DD, TAG_DT, TAG_LI, TAG_OPTGROUP,
    TAG_OPTION, TAG_P, TAG_RB, TAG_RP, TAG_RT, TAG_RTC, TAG_TBODY, TAG_TD,
    TAG_TFOOT, TAG_TH, TAG_THEAD, TAG_TR
  }
  while true:
    if parser.getTagType(parser.currentNode) notin implied:
      break
    discard parser.popElement()

# https://html.spec.whatwg.org/multipage/parsing.html#push-onto-the-list-of-active-formatting-elements
proc pushOntoActiveFormatting[Handle](parser: var HTML5Parser[Handle],
    element: Handle, token: Token) =
  ## Appends `(element, token)` to the list of active formatting elements.
  ## Before doing so, entries after the last marker are scanned from the end;
  ## once three of them match `element` in tag name, namespace and attributes,
  ## the earliest of those three is removed.
  var matches = 0
  var idx = parser.activeFormatting.high
  while idx >= 0:
    let (other, otherToken) = parser.activeFormatting[idx]
    if other == nil:
      break # marker reached
    # Two attribute tables are equal exactly when they hold the same keys
    # with the same values.
    if parser.tagNameEquals(other, element) and
        parser.getNamespace(other) == parser.getNamespace(element) and
        otherToken.attrs == token.attrs:
      inc matches
      if matches == 3:
        parser.activeFormatting.delete(idx)
        break
    dec idx
  parser.activeFormatting.add((element, token))

proc reconstructActiveFormatting[Handle](parser: var HTML5Parser[Handle]) =
  ## Reconstructs the active formatting elements: walks back from the end of
  ## the list to the last marker / the last entry whose tag type is on the
  ## stack of open elements, then re-creates every entry after that point by
  ## inserting a new element for the entry's token.
  type State = enum
    REWIND, ADVANCE, CREATE
  if parser.activeFormatting.len == 0:
    return
  if parser.activeFormatting[^1][0] == nil:
    return # last entry is a marker
  let tagType = parser.getTagType(parser.activeFormatting[^1][0])
  if parser.hasElement(tagType):
    return
  var i = parser.activeFormatting.high
  template entry: Handle = (parser.activeFormatting[i][0])
  var state = REWIND
  while true:
    {.computedGoto.}
    case state
    of REWIND:
      if i == 0:
        state = CREATE
        continue
      dec i
      if entry != nil:
        let tagType = parser.getTagType(entry)
        if not parser.hasElement(tagType):
          continue
      state = ADVANCE
    of ADVANCE:
      inc i
      state = CREATE
    of CREATE:
      let entryToken = parser.activeFormatting[i][1]
      parser.activeFormatting[i] =
        (parser.insertHTMLElement(entryToken), entryToken)
      if i != parser.activeFormatting.high:
        state = ADVANCE
        continue
      break

proc clearActiveFormattingTillMarker(parser: var HTML5Parser) =
  ## Pops entries off the list of active formatting elements until a marker
  ## (an entry with a nil handle) has been popped or the list is empty.
  while parser.activeFormatting.len > 0 and
      parser.activeFormatting.pop()[0] != nil:
    discard

func isHTMLIntegrationPoint[Handle](parser: HTML5Parser[Handle],
    element: Handle): bool =
  ## Delegates to `isSVGIntegrationPoint`.
  return parser.isSVGIntegrationPoint(element) # (NOTE MathML not implemented)

func extractEncFromMeta(s: string): Charset =
  ## Extracts a character encoding from the `content` attribute value of a
  ## `meta http-equiv` element, following the WHATWG "algorithm for
  ## extracting a character encoding from a meta element".
  ##
  ## Returns CHARSET_UNKNOWN when no `charset=` is found, when the value is
  ## missing or empty, or when a quoted value has no closing quote. The
  ## extracted label is resolved with `getCharset`.
  const Needle = "charset"
  var i = 0
  while true: # Loop:
    # Find the next ASCII case-insensitive occurrence of "charset" at or
    # after i. Every start position is tried, so a failed partial match
    # (e.g. "ccharset") cannot hide a later real match.
    var found = false
    while i + Needle.len <= s.len:
      var k = 0
      while k < Needle.len and
          s[i + k] in {Needle[k], Needle[k].toUpperAscii()}:
        inc k
      if k == Needle.len:
        found = true
        break
      inc i
    if not found:
      return CHARSET_UNKNOWN
    i += Needle.len
    while i < s.len and s[i] in AsciiWhitespace:
      inc i
    if i < s.len and s[i] == '=':
      inc i # skip the '='
      break
    # "charset" was not followed by '='; resume the search from here.
  # Skip whitespace between '=' and the value.
  while i < s.len and s[i] in AsciiWhitespace:
    inc i
  if i >= s.len:
    return CHARSET_UNKNOWN
  if s[i] in {'"', '\''}:
    let quote = s[i]
    var j = i + 1
    while j < s.len and s[j] != quote:
      inc j
    if j >= s.len or j == i + 1:
      # unmatched quote, or empty value
      return CHARSET_UNKNOWN
    return getCharset(s.substr(i + 1, j - 1))
  # Unquoted value: runs until the first ASCII whitespace or semicolon.
  return getCharset(s.substr(i).until({';'} + AsciiWhitespace))

proc changeEncoding(parser: var HTML5Parser, cs: Charset) =
  ## Marks the encoding confidence as certain and, unless the current charset
  ## is UTF-16 (LE/BE) or already equals `cs`, switches to `cs` and requests
  ## a reinterpretation of the document. `CHARSET_X_USER_DEFINED` is replaced
  ## by `CHARSET_WINDOWS_1252`.
  if parser.charset in {CHARSET_UTF_16_LE, CHARSET_UTF_16_BE}:
    parser.confidence = CONFIDENCE_CERTAIN
    return
  parser.confidence = CONFIDENCE_CERTAIN
  if cs == parser.charset:
    return
  if cs == CHARSET_X_USER_DEFINED:
    parser.charset = CHARSET_WINDOWS_1252
  else:
    parser.charset = cs
  parser.needsreinterpret = true

proc parseErrorByTokenType(parser: var HTML5Parser, tokenType: TokenType) =
  ## Reports the "unexpected ..." parse error matching `tokenType`. Only
  ## START_TAG, END_TAG and EOF are supported; anything else trips doAssert.
  case tokenType
  of START_TAG:
    parser.parseError UNEXPECTED_START_TAG
  of END_TAG:
    parser.parseError UNEXPECTED_END_TAG
  of EOF:
    parser.parseError UNEXPECTED_EOF
  else:
    doAssert false

proc adoptionAgencyAlgorithm[Handle](parser: var HTML5Parser[Handle],
    token: Token): bool =
  ## Runs the adoption agency algorithm for the tag named by `token`.
  ##
  ## Returns true when the caller must instead act as described for "any
  ## other end tag" (no suitable formatting element was found); returns false
  ## when the algorithm handled the token itself.
  template parse_error(e: ParseError) =
    parser.parseError(e)
  if parser.tagNameEquals(parser.currentNode, token):
    var fail = true
    for it in parser.activeFormatting:
      if it[0] == parser.currentNode:
        fail = false
    if fail:
      pop_current_node
      return false
  var i = 0
  while true:
    if i >= 8:
      return false
    inc i
    if parser.activeFormatting.len == 0:
      return true
    # Formatting element: the last entry after the last marker whose tag name
    # is the subject. The candidate entry itself must be compared, not the
    # current node.
    var formatting: Handle
    var formattingIndex: int
    for j in countdown(parser.activeFormatting.high, 0):
      let element = parser.activeFormatting[j][0]
      if element == nil:
        return true
      if parser.tagNameEquals(element, token):
        formatting = element
        formattingIndex = j
        break
      if j == 0:
        return true
    let stackIndex = parser.openElements.find(formatting)
    if stackIndex < 0:
      parse_error ELEMENT_NOT_IN_OPEN_ELEMENTS
      parser.activeFormatting.delete(formattingIndex)
      return false
    if not parser.hasElementInScope(formatting):
      parse_error ELEMENT_NOT_IN_SCOPE
      return false
    if formatting != parser.currentNode:
      parse_error ELEMENT_NOT_CURRENT_NODE
    # Furthest block: the topmost special element that is lower in the stack
    # than the formatting element, i.e. the first one after stackIndex.
    var furthestBlock: Handle = nil
    var furthestBlockIndex: int
    for j in stackIndex + 1 .. parser.openElements.high:
      if parser.getTagType(parser.openElements[j]) in SpecialElements:
        furthestBlock = parser.openElements[j]
        furthestBlockIndex = j
        break
    if furthestBlock == nil:
      while parser.popElement() != formatting:
        discard
      parser.activeFormatting.delete(formattingIndex)
      return false
    let commonAncestor = parser.openElements[stackIndex - 1]
    var bookmark = formattingIndex
    var node = furthestBlock
    var aboveNode = parser.openElements[furthestBlockIndex - 1]
    var lastNode = furthestBlock
    var j = 0
    while true:
      inc j
      node = aboveNode
      let nodeStackIndex = parser.openElements.find(node)
      if node == formatting:
        break
      var nodeFormattingIndex = -1
      for i in countdown(parser.activeFormatting.high, 0):
        if parser.activeFormatting[i][0] == node:
          nodeFormattingIndex = i
          break
      if j > 3 and nodeFormattingIndex >= 0:
        parser.activeFormatting.delete(nodeFormattingIndex)
        if nodeFormattingIndex < bookmark:
          dec bookmark # a previous node got deleted, so decrease bookmark by one
        # node is no longer in the list; it must take the "not in the list"
        # path below instead of reusing a stale index.
        nodeFormattingIndex = -1
      if nodeFormattingIndex < 0:
        aboveNode = parser.openElements[nodeStackIndex - 1]
        parser.openElements.delete(nodeStackIndex)
        if nodeStackIndex < furthestBlockIndex:
          dec furthestBlockIndex
          furthestBlock = parser.openElements[furthestBlockIndex]
        continue
      let element = parser.createElement(
        parser.activeFormatting[nodeFormattingIndex][1], Namespace.HTML,
        commonAncestor)
      parser.activeFormatting[nodeFormattingIndex] =
        (element, parser.activeFormatting[nodeFormattingIndex][1])
      parser.openElements[nodeStackIndex] = element
      aboveNode = parser.openElements[nodeStackIndex - 1]
      node = element
      if lastNode == furthestBlock:
        bookmark = nodeFormattingIndex + 1
      parser.append(node, lastNode)
      lastNode = node
    let location = parser.appropriatePlaceForInsert(commonAncestor)
    location.inside.insert(lastNode, location.before)
    let token = parser.activeFormatting[formattingIndex][1]
    let element = parser.createElement(token, Namespace.HTML, furthestBlock)
    # Move all children of the furthest block into the new element.
    var tomove: seq[Handle]
    j = furthestBlock.childList.high
    while j >= 0:
      let child = furthestBlock.childList[j]
      tomove.add(child)
      parser.remove(child)
      dec j
    for child in tomove:
      parser.append(element, child)
    parser.append(furthestBlock, element)
    parser.activeFormatting.insert((element, token), bookmark)
    parser.activeFormatting.delete(formattingIndex)
    parser.openElements.insert(element, furthestBlockIndex)
    parser.openElements.delete(stackIndex)

proc closeP(parser: var HTML5Parser) =
  ## Closes a `p` element: generates implied end tags (except for `p`),
  ## reports MISMATCHED_TAGS if the current node is not a `p`, then pops
  ## elements up to and including the first `p`.
  parser.generateImpliedEndTags(TAG_P)
  if parser.getTagType(parser.currentNode) != TAG_P:
    parser.parseError(MISMATCHED_TAGS)
  while parser.getTagType(parser.popElement()) != TAG_P:
    discard

# Following is an implementation of the state (?) machine defined in
# https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inhtml
# It uses the ad-hoc pattern matching macro `match' to apply the following
# transformations:
# * First, pairs of patterns and actions are stored in tuples (and `discard'
#   statements...)
# * These pairs are then assigned to token types, later mapped to legs of the
#   first case statement.
# * Another case statement is constructed where needed, e.g. for switching on
#   characters/tags/etc.
# * Finally, the whole thing is wrapped in a named block, to implement a
#   pseudo-goto by breaking out only when the else statement needn't be
#   executed.
#
# For example, the following code:
#
#   match token:
#     TokenType.COMMENT => (block: echo "comment")
#     ("<p>", "<a>", "</div>") => (block: echo "p, a or closing div")
#     ("<div>", "</p>") => (block: anything_else)
#     (TokenType.START_TAG, TokenType.END_TAG) => (block: assert false, "invalid")
#     _ => (block: echo "anything else")
#
# (effectively) generates this:
#
#   block inside_not_else:
#     case token.t
#     of TokenType.COMMENT:
#       echo "comment"
#       break inside_not_else
#     of TokenType.START_TAG:
#       case token.tagtype
#       of {TAG_P, TAG_A}:
#         echo "p, a or closing div"
#         break inside_not_else
#       of TAG_DIV: discard
#       else:
#         assert false
#         break inside_not_else
#     of TokenType.END_TAG:
#       case token.tagtype
#       of TAG_DIV:
#         echo "p, a or closing div"
#         break inside_not_else
#       of TAG_P: discard
#       else:
#         assert false
#         break inside_not_else
#     else: discard
#   echo "anything else"
#
# This duplicates any code that applies for several token types, except for the
# else branch.
macro match(token: Token, body: typed): untyped =
  ## Builds the nested `case` statement described in the comment above from
  ## a list of `pattern => (block: action)` statements.
  type OfBranchStore = object
    ofBranches: seq[(seq[NimNode], NimNode)]
    defaultBranch: NimNode
    painted: bool

  # Stores 'of' branches
  var ofBranches: array[TokenType, OfBranchStore]
  # Stores 'else', 'elif' branches
  var defaultBranch: NimNode

  # Maps the string form of each TokenType to its value.
  const tokenTypes = (func(): Table[string, TokenType] =
    for tt in TokenType:
      result[$tt] = tt)()

  for disc in body:
    let tup = disc[0] # access actual tuple
    let pattern = `tup`[0]
    let lambda = `tup`[1]
    var action = lambda.findChild(
      it.kind notin {nnkSym, nnkEmpty, nnkFormalParams})
    # Every action except the default one and `anything_else` ends by
    # breaking out of the enclosing block, skipping the default branch.
    if pattern.kind != nnkDiscardStmt and
        not (action.len == 2 and action[1].kind == nnkDiscardStmt and
          action[1][0] == newStrLitNode("anything_else")):
      action = quote do:
        `action`
        #eprint token #debug
        break inside_not_else

    var patterns = @[pattern]
    while patterns.len > 0:
      let pattern = patterns.pop()
      case pattern.kind
      of nnkSym: # simple symbols; we assume these are the enums
        ofBranches[tokenTypes[pattern.strVal]].defaultBranch = action
        ofBranches[tokenTypes[pattern.strVal]].painted = true
      of nnkCharLit:
        ofBranches[CHARACTER_ASCII].ofBranches.add((@[pattern], action))
        ofBranches[CHARACTER_ASCII].painted = true
      of nnkCurly:
        case pattern[0].kind
        of nnkCharLit:
          ofBranches[CHARACTER_ASCII].ofBranches.add((@[pattern], action))
          ofBranches[CHARACTER_ASCII].painted = true
        else:
          error "Unsupported curly of kind " & $pattern[0].kind
      of nnkStrLit:
        # Tag patterns are tokenized; only the first token is used.
        var tempTokenizer = newTokenizer(pattern.strVal)
        for token in tempTokenizer.tokenize:
          let tt = int(token.tagtype)
          case token.t
          of START_TAG, END_TAG:
            # Tags sharing an action are merged into a single 'of' branch.
            var found = false
            for i in 0..ofBranches[token.t].ofBranches.high:
              if ofBranches[token.t].ofBranches[i][1] == action:
                found = true
                ofBranches[token.t].ofBranches[i][0].add(
                  (quote do: TagType(`tt`)))
                ofBranches[token.t].painted = true
                break
            if not found:
              ofBranches[token.t].ofBranches.add(
                (@[(quote do: TagType(`tt`))], action))
              ofBranches[token.t].painted = true
          else:
            error pattern.strVal & ": Unsupported token " & $token &
              " of kind " & $token.t
          break
      of nnkDiscardStmt:
        defaultBranch = action
      of nnkTupleConstr:
        for child in pattern:
          patterns.add(child)
      else:
        error pattern.strVal & ": Unsupported pattern of kind " &
          $pattern.kind

  func tokenBranchOn(tok: TokenType): NimNode =
    case tok
    of START_TAG, END_TAG:
      return quote do: token.tagtype
    of CHARACTER:
      return quote do: token.r
    of CHARACTER_ASCII:
      return quote do: token.c
    else:
      error "Unsupported branching of token " & $tok

  template add_to_case(branch: typed) =
    if branch[0].len == 1:
      tokenCase.add(newNimNode(nnkOfBranch).add(branch[0][0]).add(branch[1]))
    else:
      var curly = newNimNode(nnkCurly)
      for node in branch[0]:
        curly.add(node)
      tokenCase.add(newNimNode(nnkOfBranch).add(curly).add(branch[1]))

  # Build case statements
  var mainCase = newNimNode(nnkCaseStmt).add(quote do: `token`.t)
  for tt in TokenType:
    let ofBranch = newNimNode(nnkOfBranch).add(quote do: TokenType(`tt`))
    let tokenCase = newNimNode(nnkCaseStmt)
    if ofBranches[tt].defaultBranch != nil:
      if ofBranches[tt].ofBranches.len > 0:
        tokenCase.add(tokenBranchOn(tt))
        for branch in ofBranches[tt].ofBranches:
          add_to_case branch
        tokenCase.add(newNimNode(nnkElse).add(ofBranches[tt].defaultBranch))
        ofBranch.add(tokenCase)
        mainCase.add(ofBranch)
      else:
        ofBranch.add(ofBranches[tt].defaultBranch)
        mainCase.add(ofBranch)
    else:
      if ofBranches[tt].ofBranches.len > 0:
        tokenCase.add(tokenBranchOn(tt))
        for branch in ofBranches[tt].ofBranches:
          add_to_case branch
        ofBranch.add(tokenCase)
        tokenCase.add(newNimNode(nnkElse).add(quote do: discard))
        mainCase.add(ofBranch)
      else:
        discard

  # Token types that no pattern touched need a catch-all 'else'.
  for t in TokenType:
    if not ofBranches[t].painted:
      mainCase.add(newNimNode(nnkElse).add(quote do: discard))
      break

  var stmts = newStmtList().add(mainCase)
  for stmt in defaultBranch:
    stmts.add(stmt)
  result = newBlockStmt(ident("inside_not_else"), stmts)

proc processInHTMLContent[Handle](parser: var HTML5Parser[Handle],
    token: Token, insertionMode: InsertionMode) =
  template pop_all_nodes =
    while parser.openElements.len > 1: pop_current_node

  # Marker recognised by `match`: the action falls through to the default.
  template anything_else = discard "anything_else"

  macro `=>`(v: typed, body: untyped): untyped =
    quote do:
      discard (`v`, proc() = `body`)

  template _ = discard

  template reprocess(tok: Token) =
    parser.processInHTMLContent(tok, parser.insertionMode)

  template parse_error(e: ParseError) =
    parser.parseError(e)

  # Reports MISMATCHED_TAGS when the current node is not of the given type.
  template parse_error_if_mismatch(tagtype: TagType) =
    if parser.hasParseError():
      if parser.getTagType(parser.currentNode) != tagtype:
        parse_error MISMATCHED_TAGS

  template parse_error_if_mismatch(tagtypes: set[TagType]) =
    if parser.hasParseError():
      if parser.getTagType(parser.currentNode) notin tagtypes:
        parse_error MISMATCHED_TAGS

  case insertionMode
  of INITIAL:
    match token:
      AsciiWhitespace => (block: discard)
      TokenType.COMMENT => (block:
        parser.insertComment(token, last_child_of(parser.document))
      )
      TokenType.DOCTYPE => (block:
        if token.name.isNone or
            token.name.get != "html" or token.pubid.isSome or
            (token.sysid.isSome and token.sysid.get != "about:legacy-compat"):
          parse_error INVALID_DOCTYPE
        let doctype = parser.createDocumentType(token.name.get(""),
          token.pubid.get(""), token.sysid.get(""))
        parser.append(parser.document, doctype)
        if not parser.opts.isIframeSrcdoc:
          if quirksConditions(token):
            parser.setQuirksMode(QUIRKS)
          elif limitedQuirksConditions(token):
            parser.setQuirksMode(LIMITED_QUIRKS)
        parser.insertionMode = BEFORE_HTML
      )
      _ => (block:
        if not parser.opts.isIframeSrcdoc:
          parse_error UNEXPECTED_INITIAL_TOKEN
        # NOTE(review): the specification only switches to quirks mode here
        # when the document is not an iframe srcdoc document -- confirm
        # whether this call is meant to be inside the check above.
        parser.setQuirksMode(QUIRKS)
        parser.insertionMode = BEFORE_HTML
        reprocess token
      )

  of BEFORE_HTML:
    match token:
      TokenType.DOCTYPE => (block: parse_error UNEXPECTED_DOCTYPE)
      TokenType.COMMENT => (block:
        parser.insertComment(token, last_child_of(parser.document))
      )
      AsciiWhitespace => (block: discard)
      "<html>" => (block:
        let element = parser.createElement(token, Namespace.HTML,
          parser.document)
        parser.append(parser.document, element)
        parser.pushElement(element)
        parser.insertionMode = BEFORE_HEAD
      )
      ("</head>", "</body>", "</html>", "</br>") => (block: anything_else)
      TokenType.END_TAG => (block: parse_error UNEXPECTED_END_TAG)
      _ => (block:
        let element = parser.createElement(TAG_HTML, Namespace.HTML)
        parser.append(parser.document, element)
        parser.pushElement(element)
        parser.insertionMode = BEFORE_HEAD
        reprocess token
      )

  of BEFORE_HEAD:
    match token:
      AsciiWhitespace => (block: discard)
      TokenType.COMMENT => (block: parser.insertComment(token))
      TokenType.DOCTYPE => (block: parse_error UNEXPECTED_DOCTYPE)
      "<html>" => (block: parser.processInHTMLContent(token, IN_BODY))
      "<head>" => (block:
        parser.head = some(parser.insertHTMLElement(token))
        parser.insertionMode = IN_HEAD
      )
      ("</head>", "</body>", "</html>", "</br>") => (block: anything_else)
      TokenType.END_TAG => (block: parse_error UNEXPECTED_END_TAG)
      _ => (block:
        let head = Token(t: START_TAG, tagtype: TAG_HEAD)
        parser.head = some(parser.insertHTMLElement(head))
        parser.insertionMode = IN_HEAD
        reprocess token
      )

  of IN_HEAD:
    match token:
      AsciiWhitespace => (block: discard)
      TokenType.COMMENT => (block: parser.insertComment(token))
      TokenType.DOCTYPE => (block: parse_error UNEXPECTED_DOCTYPE)
      "<html>" => (block: parser.processInHTMLContent(token, IN_BODY))
      ("<base>", "<basefont>", "<bgsound>", "<link>") => (block:
        discard parser.insertHTMLElement(token)
        pop_current_node
      )
      "<meta>" => (block:
        discard parser.insertHTMLElement(token)
        pop_current_node
        if parser.confidence == CONFIDENCE_TENTATIVE:
          let cs = getCharset(token.attrs.getOrDefault("charset", ""))
          if cs != CHARSET_UNKNOWN:
            parser.changeEncoding(cs)
          elif "http-equiv" in token.attrs:
            if token.attrs["http-equiv"].equalsIgnoreCase("Content-Type") and
                "content" in token.attrs:
              let cs = extractEncFromMeta(token.attrs["content"])
              if cs != CHARSET_UNKNOWN:
                parser.changeEncoding(cs)
      )
      "<title>" => (block: parser.genericRCDATAElementParsingAlgorithm(token))
      "<noscript>" => (block:
        if not parser.opts.scripting:
          discard parser.insertHTMLElement(token)
          parser.insertionMode = IN_HEAD_NOSCRIPT
        else:
          parser.genericRawtextElementParsingAlgorithm(token)
      )
      ("<noframes>", "<style>") => (block:
        parser.genericRawtextElementParsingAlgorithm(token)
      )
      "<script>" => (block:
        let location = parser.appropriatePlaceForInsert()
        let element = parser.createElement(token, Namespace.HTML,
          location.inside)
        #TODO document.write (?)
parser.insert(location, element) parser.pushElement(element) parser.tokenizer.state = SCRIPT_DATA parser.oldInsertionMode = parser.insertionMode parser.insertionMode = TEXT ) "</head>" => (block: pop_current_node parser.insertionMode = AFTER_HEAD ) ("</body>", "</html>", "</br>") => (block: anything_else) "<template>" => (block: discard parser.insertHTMLElement(token) parser.activeFormatting.add((nil, nil)) parser.framesetok = false parser.insertionMode = IN_TEMPLATE parser.templateModes.add(IN_TEMPLATE) ) "</template>" => (block: if not parser.hasElement(TAG_TEMPLATE): parse_error ELEMENT_NOT_IN_OPEN_ELEMENTS else: parser.generateImpliedEndTagsThoroughly() if parser.getTagType(parser.currentNode) != TAG_TEMPLATE: parse_error MISMATCHED_TAGS parser.popElementsIncl(TAG_TEMPLATE) parser.clearActiveFormattingTillMarker() discard parser.templateModes.pop() parser.resetInsertionMode() ) ("<head>", TokenType.END_TAG) => (block: parse_error UNEXPECTED_END_TAG) _ => (block: pop_current_node parser.insertionMode = AFTER_HEAD reprocess token ) of IN_HEAD_NOSCRIPT: match token: TokenType.DOCTYPE => (block: parse_error UNEXPECTED_DOCTYPE) "<html>" => (block: parser.processInHTMLContent(token, IN_BODY)) "</noscript>" => (block: pop_current_node parser.insertionMode = IN_HEAD ) (AsciiWhitespace, TokenType.COMMENT, "<basefont>", "<bgsound>", "<link>", "<meta>", "<noframes>", "<style>") => (block: parser.processInHTMLContent(token, IN_HEAD)) "</br>" => (block: anything_else) ("<head>", "<noscript>") => (block: parse_error UNEXPECTED_START_TAG) TokenType.END_TAG => (block: parse_error UNEXPECTED_END_TAG) _ => (block: pop_current_node parser.insertionMode = IN_HEAD reprocess token ) of AFTER_HEAD: match token: AsciiWhitespace => (block: parser.insertCharacter(token.c)) TokenType.COMMENT => (block: parser.insertComment(token)) TokenType.DOCTYPE => (block: parse_error UNEXPECTED_DOCTYPE) "<html>" => (block: parser.processInHTMLContent(token, IN_BODY)) "<body>" => (block: discard 
parser.insertHTMLElement(token) parser.framesetok = false parser.insertionMode = IN_BODY ) "<frameset>" => (block: discard parser.insertHTMLElement(token) parser.insertionMode = IN_FRAMESET ) ("<base>", "<basefont>", "<bgsound>", "<link>", "<meta>", "<noframes>", "<script>", "<style>", "<template>", "<title>") => (block: parse_error UNEXPECTED_START_TAG parser.pushElement(parser.head.get) parser.processInHTMLContent(token, IN_HEAD) for i in countdown(parser.openElements.high, 0): if parser.openElements[i] == parser.head.get: parser.openElements.delete(i) ) "</template>" => (block: parser.processInHTMLContent(token, IN_HEAD)) ("</body>", "</html>", "</br>") => (block: anything_else) ("<head>") => (block: parse_error UNEXPECTED_START_TAG) (TokenType.END_TAG) => (block: parse_error UNEXPECTED_END_TAG) _ => (block: discard parser.insertHTMLElement(Token(t: START_TAG, tagtype: TAG_BODY)) parser.insertionMode = IN_BODY reprocess token ) of IN_BODY: template any_other_start_tag() = parser.reconstructActiveFormatting() discard parser.insertHTMLElement(token) template any_other_end_tag() = for i in countdown(parser.openElements.high, 0): let node = parser.openElements[i] if parser.tagNameEquals(node, token): parser.generateImpliedEndTags(token.tagtype) if node != parser.currentNode: parse_error ELEMENT_NOT_CURRENT_NODE while parser.popElement() != node: discard break elif parser.getTagType(node) in SpecialElements: parse_error UNEXPECTED_SPECIAL_ELEMENT return template parse_error_if_body_has_disallowed_open_elements = if parser.hasParseError(): const Disallowed = AllTagTypes - { TAG_DD, TAG_DT, TAG_LI, TAG_OPTGROUP, TAG_OPTION, TAG_P, TAG_RB, TAG_RP, TAG_RT, TAG_RTC, TAG_TBODY, TAG_TD, TAG_TFOOT, TAG_TH, TAG_THEAD, TAG_TR, TAG_BODY, TAG_HTML } if parser.hasElement(Disallowed): parse_error MISMATCHED_TAGS match token: '\0' => (block: parse_error UNEXPECTED_NULL) AsciiWhitespace => (block: parser.reconstructActiveFormatting() parser.insertCharacter(token.c) ) 
TokenType.CHARACTER_ASCII => (block: parser.reconstructActiveFormatting() parser.insertCharacter(token.c) parser.framesetOk = false ) TokenType.CHARACTER => (block: parser.reconstructActiveFormatting() parser.insertCharacter(token.r) parser.framesetOk = false ) TokenType.COMMENT => (block: parser.insertComment(token)) TokenType.DOCTYPE => (block: parse_error UNEXPECTED_DOCTYPE) "<html>" => (block: parse_error UNEXPECTED_START_TAG if parser.hasElement(TAG_TEMPLATE): discard else: parser.addAttrsIfMissing(parser.openElements[0], token.attrs) ) ("<base>", "<basefont>", "<bgsound>", "<link>", "<meta>", "<noframes>", "<script>", "<style>", "<template>", "<title>", "</template>") => (block: parser.processInHTMLContent(token, IN_HEAD)) "<body>" => (block: parse_error UNEXPECTED_START_TAG if parser.openElements.len == 1 or parser.getTagType(parser.openElements[1]) != TAG_BODY or parser.hasElement(TAG_TEMPLATE): discard else: parser.framesetOk = false parser.addAttrsIfMissing(parser.openElements[1], token.attrs) ) "<frameset>" => (block: parse_error UNEXPECTED_START_TAG if parser.openElements.len == 1 or parser.getTagType(parser.openElements[1]) != TAG_BODY or not parser.framesetOk: discard else: parser.remove(parser.openElements[1]) pop_all_nodes ) TokenType.EOF => (block: if parser.templateModes.len > 0: parser.processInHTMLContent(token, IN_TEMPLATE) else: parse_error_if_body_has_disallowed_open_elements # stop ) "</body>" => (block: if not parser.hasElementInScope(TAG_BODY): parse_error UNEXPECTED_END_TAG else: parse_error_if_body_has_disallowed_open_elements parser.insertionMode = AFTER_BODY ) "</html>" => (block: if not parser.hasElementInScope(TAG_BODY): parse_error UNEXPECTED_END_TAG else: parse_error_if_body_has_disallowed_open_elements parser.insertionMode = AFTER_BODY reprocess token ) ("<address>", "<article>", "<aside>", "<blockquote>", "<center>", "<details>", "<dialog>", "<dir>", "<div>", "<dl>", "<fieldset>", "<figcaption>", "<figure>", "<footer>", 
"<header>", "<hgroup>", "<main>", "<menu>", "<nav>", "<ol>", "<p>", "<section>", "<summary>", "<ul>") => (block: if parser.hasElementInButtonScope(TAG_P): parser.closeP() discard parser.insertHTMLElement(token) ) ("<h1>", "<h2>", "<h3>", "<h4>", "<h5>", "<h6>") => (block: if parser.hasElementInButtonScope(TAG_P): parser.closeP() if parser.getTagType(parser.currentNode) in HTagTypes: parse_error NESTED_TAGS pop_current_node discard parser.insertHTMLElement(token) ) ("<pre>", "<listing>") => (block: if parser.hasElementInButtonScope(TAG_P): parser.closeP() discard parser.insertHTMLElement(token) parser.ignoreLF = true parser.framesetOk = false ) "<form>" => (block: let hasTemplate = parser.hasElement(TAG_TEMPLATE) if parser.form.isSome and not hasTemplate: parse_error NESTED_TAGS else: if parser.hasElementInButtonScope(TAG_P): parser.closeP() let element = parser.insertHTMLElement(token) if not hasTemplate: parser.form = some(element) ) "<li>" => (block: parser.framesetOk = false for i in countdown(parser.openElements.high, 0): let node = parser.openElements[i] let tagType = parser.getTagType(node) case tagType of TAG_LI: parser.generateImpliedEndTags(TAG_LI) parse_error_if_mismatch TAG_LI parser.popElementsIncl(TAG_LI) break of SpecialElements - {TAG_ADDRESS, TAG_DIV, TAG_P, TAG_LI}: break else: discard if parser.hasElementInButtonScope(TAG_P): parser.closeP() discard parser.insertHTMLElement(token) ) ("<dd>", "<dt>") => (block: parser.framesetOk = false for i in countdown(parser.openElements.high, 0): let node = parser.openElements[i] let tagType = parser.getTagType(node) case tagType of TAG_DD: parser.generateImpliedEndTags(TAG_DD) parse_error_if_mismatch TAG_DD parser.popElementsIncl(TAG_DD) break of TAG_DT: parser.generateImpliedEndTags(TAG_DT) parse_error_if_mismatch TAG_DT parser.popElementsIncl(TAG_DT) break of SpecialElements - {TAG_ADDRESS, TAG_DIV, TAG_P, TAG_DD, TAG_DT}: break else: discard if parser.hasElementInButtonScope(TAG_P): parser.closeP() discard 
parser.insertHTMLElement(token) ) "<plaintext>" => (block: if parser.hasElementInButtonScope(TAG_P): parser.closeP() discard parser.insertHTMLElement(token) parser.tokenizer.state = PLAINTEXT ) "<button>" => (block: if parser.hasElementInScope(TAG_BUTTON): parse_error NESTED_TAGS parser.generateImpliedEndTags() parser.popElementsIncl(TAG_BUTTON) parser.reconstructActiveFormatting() discard parser.insertHTMLElement(token) parser.framesetOk = false ) ("</address>", "</article>", "</aside>", "</blockquote>", "</button>", "</center>", "</details>", "</dialog>", "</dir>", "</div>", "</dl>", "</fieldset>", "</figcaption>", "</figure>", "</footer>", "</header>", "</hgroup>", "</listing>", "</main>", "</menu>", "</nav>", "</ol>", "</pre>", "</section>", "</summary>", "</ul>") => (block: if not parser.hasElementInScope(token.tagtype): parse_error ELEMENT_NOT_IN_SCOPE else: parser.generateImpliedEndTags() parse_error_if_mismatch token.tagtype parser.popElementsIncl(token.tagtype) ) "</form>" => (block: if not parser.hasElement(TAG_TEMPLATE): let form = parser.form parser.form = none(Handle) if form.isNone or not parser.hasElementInScope(parser.getTagType(form.get)): parse_error ELEMENT_NOT_IN_SCOPE return let node = form.get parser.generateImpliedEndTags() if parser.currentNode != node: parse_error ELEMENT_NOT_CURRENT_NODE parser.openElements.delete(parser.openElements.find(node)) else: if not parser.hasElementInScope(TAG_FORM): parse_error ELEMENT_NOT_IN_SCOPE else: parser.generateImpliedEndTags() parse_error_if_mismatch TAG_FORM parser.popElementsIncl(TAG_FORM) ) "</p>" => (block: if not parser.hasElementInButtonScope(TAG_P): parse_error ELEMENT_NOT_IN_SCOPE discard parser.insertHTMLElement(Token(t: START_TAG, tagtype: TAG_P)) parser.closeP() ) "</li>" => (block: if not parser.hasElementInListItemScope(TAG_LI): parse_error ELEMENT_NOT_IN_SCOPE else: parser.generateImpliedEndTags(TAG_LI) parse_error_if_mismatch TAG_LI parser.popElementsIncl(TAG_LI) ) ("</dd>", "</dt>") => 
(block: if not parser.hasElementInScope(token.tagtype): parse_error ELEMENT_NOT_IN_SCOPE else: parser.generateImpliedEndTags(token.tagtype) parse_error_if_mismatch token.tagtype parser.popElementsIncl(token.tagtype) ) ("</h1>", "</h2>", "</h3>", "</h4>", "</h5>", "</h6>") => (block: if not parser.hasElementInScope(HTagTypes): parse_error ELEMENT_NOT_IN_SCOPE else: parser.generateImpliedEndTags() parse_error_if_mismatch token.tagtype parser.popElementsIncl(HTagTypes) ) "</sarcasm>" => (block: #*deep breath* anything_else ) "<a>" => (block: var anchor: Option[Handle] for i in countdown(parser.activeFormatting.high, 0): let format = parser.activeFormatting[i] if format[0] == nil: break if parser.getTagType(format[0]) == TAG_A: anchor = some(format[0]) break if anchor.isSome: parse_error NESTED_TAGS if parser.adoptionAgencyAlgorithm(token): any_other_end_tag return for i in 0..parser.activeFormatting.high: if parser.activeFormatting[i][0] == anchor.get: parser.activeFormatting.delete(i) break for i in 0..parser.openElements.high: if parser.openElements[i] == anchor.get: parser.openElements.delete(i) break parser.reconstructActiveFormatting() let element = parser.insertHTMLElement(token) parser.pushOntoActiveFormatting(element, token) ) ("<b>", "<big>", "<code>", "<em>", "<font>", "<i>", "<s>", "<small>", "<strike>", "<strong>", "<tt>", "<u>") => (block: parser.reconstructActiveFormatting() let element = parser.insertHTMLElement(token) parser.pushOntoActiveFormatting(element, token) ) "<nobr>" => (block: parser.reconstructActiveFormatting() if parser.hasElementInScope(TAG_NOBR): parse_error NESTED_TAGS if parser.adoptionAgencyAlgorithm(token): any_other_end_tag return parser.reconstructActiveFormatting() let element = parser.insertHTMLElement(token) parser.pushOntoActiveFormatting(element, token) ) ("</a>", "</b>", "</big>", "</code>", "</em>", "</font>", "</i>", "</nobr>", "</s>", "</small>", "</strike>", "</strong>", "</tt>", "</u>") => (block: if 
parser.adoptionAgencyAlgorithm(token): any_other_end_tag return ) ("<applet>", "<marquee>", "<object>") => (block: parser.reconstructActiveFormatting() discard parser.insertHTMLElement(token) parser.activeFormatting.add((nil, nil)) parser.framesetOk = false ) ("</applet>", "</marquee>", "</object>") => (block: if not parser.hasElementInScope(token.tagtype): parse_error ELEMENT_NOT_IN_SCOPE else: parser.generateImpliedEndTags() parse_error_if_mismatch token.tagtype while parser.getTagType(parser.popElement()) != token.tagtype: discard parser.clearActiveFormattingTillMarker() ) "<table>" => (block: if parser.quirksMode != QUIRKS: if parser.hasElementInButtonScope(TAG_P): parser.closeP() discard parser.insertHTMLElement(token) parser.framesetOk = false parser.insertionMode = IN_TABLE ) "</br>" => (block: parse_error UNEXPECTED_END_TAG reprocess Token(t: START_TAG, tagtype: TAG_BR) ) ("<area>", "<br>", "<embed>", "<img>", "<keygen>", "<wbr>") => (block: parser.reconstructActiveFormatting() discard parser.insertHTMLElement(token) pop_current_node parser.framesetOk = false ) "<input>" => (block: parser.reconstructActiveFormatting() discard parser.insertHTMLElement(token) pop_current_node if not token.attrs.getOrDefault("type").equalsIgnoreCase("hidden"): parser.framesetOk = false ) ("<param>", "<source>", "<track>") => (block: discard parser.insertHTMLElement(token) pop_current_node ) "<hr>" => (block: if parser.hasElementInButtonScope(TAG_P): parser.closeP() discard parser.insertHTMLElement(token) pop_current_node parser.framesetOk = false ) "<image>" => (block: #TODO ew let token = Token(t: START_TAG, tagtype: TAG_IMG, tagname: "img", selfclosing: token.selfclosing, attrs: token.attrs) reprocess token ) "<textarea>" => (block: discard parser.insertHTMLElement(token) parser.ignoreLF = true parser.tokenizer.state = RCDATA parser.oldInsertionMode = parser.insertionMode parser.framesetOk = false parser.insertionMode = TEXT ) "<xmp>" => (block: if 
parser.hasElementInButtonScope(TAG_P): parser.closeP() parser.reconstructActiveFormatting() parser.framesetOk = false parser.genericRawtextElementParsingAlgorithm(token) ) "<iframe>" => (block: parser.framesetOk = false parser.genericRawtextElementParsingAlgorithm(token) ) "<noembed>" => (block: parser.genericRawtextElementParsingAlgorithm(token) ) "<noscript>" => (block: if parser.opts.scripting: parser.genericRawtextElementParsingAlgorithm(token) else: any_other_start_tag ) "<select>" => (block: parser.reconstructActiveFormatting() discard parser.insertHTMLElement(token) parser.framesetOk = false if parser.insertionMode in {IN_TABLE, IN_CAPTION, IN_TABLE_BODY, IN_CELL}: parser.insertionMode = IN_SELECT_IN_TABLE else: parser.insertionMode = IN_SELECT ) ("<optgroup>", "<option>") => (block: if parser.getTagType(parser.currentNode) == TAG_OPTION: pop_current_node parser.reconstructActiveFormatting() discard parser.insertHTMLElement(token) ) ("<rb>", "<rtc>") => (block: if parser.hasElementInScope(TAG_RUBY): parser.generateImpliedEndTags() parse_error_if_mismatch TAG_RUBY discard parser.insertHTMLElement(token) ) ("<rp>", "<rt>") => (block: if parser.hasElementInScope(TAG_RUBY): parser.generateImpliedEndTags(TAG_RTC) parse_error_if_mismatch {TAG_RUBY, TAG_RTC} discard parser.insertHTMLElement(token) ) #NOTE <math> (not implemented) #TODO <svg> (SVG) ("<caption>", "<col>", "<colgroup>", "<frame>", "<head>", "<tbody>", "<td>", "<tfoot>", "<th>", "<thead>", "<tr>") => (block: parse_error UNEXPECTED_START_TAG ) TokenType.START_TAG => (block: any_other_start_tag) TokenType.END_TAG => (block: any_other_end_tag) of TEXT: match token: TokenType.CHARACTER_ASCII => (block: assert token.c != '\0' parser.insertCharacter(token.c) ) TokenType.CHARACTER => (block: parser.insertCharacter(token.r) ) TokenType.EOF => (block: parse_error UNEXPECTED_EOF if parser.getTagType(parser.currentNode) == TAG_SCRIPT: parser.setScriptAlreadyStarted(parser.currentNode) pop_current_node 
parser.insertionMode = parser.oldInsertionMode reprocess token ) "</script>" => (block: #TODO microtask (?) pop_current_node parser.insertionMode = parser.oldInsertionMode ) TokenType.END_TAG => (block: pop_current_node parser.insertionMode = parser.oldInsertionMode ) of IN_TABLE: template clear_the_stack_back_to_a_table_context() = while parser.getTagType(parser.currentNode) notin {TAG_TABLE, TAG_TEMPLATE, TAG_HTML}: pop_current_node match token: (TokenType.CHARACTER_ASCII, TokenType.CHARACTER) => (block: const CanHaveText = { TAG_TABLE, TAG_TBODY, TAG_TFOOT, TAG_THEAD, TAG_TR } if parser.getTagType(parser.currentNode) in CanHaveText: parser.pendingTableChars = "" parser.pendingTableCharsWhitespace = true parser.oldInsertionMode = parser.insertionMode parser.insertionMode = IN_TABLE_TEXT reprocess token else: # anything else parse_error INVALID_TEXT_PARENT parser.fosterParenting = true parser.processInHTMLContent(token, IN_BODY) parser.fosterParenting = false ) TokenType.COMMENT => (block: parser.insertComment(token)) TokenType.DOCTYPE => (block: parse_error UNEXPECTED_DOCTYPE) "<caption>" => (block: clear_the_stack_back_to_a_table_context parser.activeFormatting.add((nil, nil)) discard parser.insertHTMLElement(token) parser.insertionMode = IN_CAPTION ) "<colgroup>" => (block: clear_the_stack_back_to_a_table_context discard parser.insertHTMLElement(Token(t: START_TAG, tagtype: TAG_COLGROUP)) parser.insertionMode = IN_COLUMN_GROUP ) ("<tbody>", "<tfoot>", "<thead>") => (block: clear_the_stack_back_to_a_table_context discard parser.insertHTMLElement(token) parser.insertionMode = IN_TABLE_BODY ) ("<td>", "<th>", "<tr>") => (block: clear_the_stack_back_to_a_table_context discard parser.insertHTMLElement(Token(t: START_TAG, tagtype: TAG_TBODY)) parser.insertionMode = IN_TABLE_BODY reprocess token ) "<table>" => (block: parse_error NESTED_TAGS if not parser.hasElementInScope(TAG_TABLE): discard else: while parser.getTagType(parser.popElement()) != TAG_TABLE: discard 
parser.resetInsertionMode() reprocess token ) "</table>" => (block: if not parser.hasElementInScope(TAG_TABLE): parse_error ELEMENT_NOT_IN_SCOPE else: while parser.getTagType(parser.popElement()) != TAG_TABLE: discard parser.resetInsertionMode() ) ("</body>", "</caption>", "</col>", "</colgroup>", "</html>", "</tbody>", "</td>", "</tfoot>", "</th>", "</thead>", "</tr>") => (block: parse_error UNEXPECTED_END_TAG ) ("<style>", "<script>", "<template>", "</template>") => (block: parser.processInHTMLContent(token, IN_HEAD) ) "<input>" => (block: parse_error UNEXPECTED_START_TAG if not token.attrs.getOrDefault("type").equalsIgnoreCase("hidden"): # anything else parser.fosterParenting = true parser.processInHTMLContent(token, IN_BODY) parser.fosterParenting = false else: discard parser.insertHTMLElement(token) pop_current_node ) "<form>" => (block: parse_error UNEXPECTED_START_TAG if parser.form.isSome or parser.hasElement(TAG_TEMPLATE): discard else: parser.form = some(parser.insertHTMLElement(token)) pop_current_node ) TokenType.EOF => (block: parser.processInHTMLContent(token, IN_BODY) ) _ => (block: parse_error UNEXPECTED_START_TAG parser.fosterParenting = true parser.processInHTMLContent(token, IN_BODY) parser.fosterParenting = false ) of IN_TABLE_TEXT: match token: '\0' => (block: parse_error UNEXPECTED_NULL) TokenType.CHARACTER_ASCII => (block: if token.c notin AsciiWhitespace: parser.pendingTableCharsWhitespace = false parser.pendingTableChars &= token.c ) TokenType.CHARACTER => (block: parser.pendingTableChars &= $token.r parser.pendingTableCharsWhitespace = false ) _ => (block: if not parser.pendingTableCharsWhitespace: # I *think* this is effectively the same thing the specification # wants... 
parse_error NON_SPACE_TABLE_TEXT parser.fosterParenting = true parser.reconstructActiveFormatting() parser.insertCharacter(parser.pendingTableChars) parser.framesetOk = false parser.fosterParenting = false else: parser.insertCharacter(parser.pendingTableChars) parser.insertionMode = parser.oldInsertionMode reprocess token ) of IN_CAPTION: match token: "</caption>" => (block: if not parser.hasElementInTableScope(TAG_CAPTION): parse_error ELEMENT_NOT_IN_SCOPE else: parser.generateImpliedEndTags() parse_error_if_mismatch TAG_CAPTION parser.popElementsIncl(TAG_CAPTION) parser.clearActiveFormattingTillMarker() parser.insertionMode = IN_TABLE ) ("<caption>", "<col>", "<colgroup>", "<tbody>", "<td>", "<tfoot>", "<th>", "<thead>", "<tr>", "</table>") => (block: if not parser.hasElementInTableScope(TAG_CAPTION): parse_error ELEMENT_NOT_IN_SCOPE else: parser.generateImpliedEndTags() parse_error_if_mismatch TAG_CAPTION parser.clearActiveFormattingTillMarker() parser.insertionMode = IN_TABLE reprocess token ) ("</body>", "</col>", "</colgroup>", "</html>", "</tbody>", "</td>", "</tfoot>", "</th>", "</thead>", "</tr>") => (block: parse_error UNEXPECTED_END_TAG ) _ => (block: parser.processInHTMLContent(token, IN_BODY)) of IN_COLUMN_GROUP: match token: AsciiWhitespace => (block: parser.insertCharacter(token.c)) TokenType.COMMENT => (block: parser.insertComment(token)) TokenType.DOCTYPE => (block: parse_error UNEXPECTED_DOCTYPE) "<html>" => (block: parser.processInHTMLContent(token, IN_BODY)) "<col>" => (block: discard parser.insertHTMLElement(token) pop_current_node ) "</colgroup>" => (block: if parser.getTagType(parser.currentNode) != TAG_COLGROUP: parse_error MISMATCHED_TAGS else: pop_current_node parser.insertionMode = IN_TABLE ) "</col>" => (block: parse_error UNEXPECTED_END_TAG) ("<template>", "</template>") => (block: parser.processInHTMLContent(token, IN_HEAD) ) TokenType.EOF => (block: parser.processInHTMLContent(token, IN_BODY)) _ => (block: if 
parser.getTagType(parser.currentNode) != TAG_COLGROUP: parse_error MISMATCHED_TAGS else: pop_current_node parser.insertionMode = IN_TABLE reprocess token ) of IN_TABLE_BODY: template clear_the_stack_back_to_a_table_body_context() = while parser.getTagType(parser.currentNode) notin {TAG_TBODY, TAG_TFOOT, TAG_THEAD, TAG_TEMPLATE, TAG_HTML}: pop_current_node match token: "<tr>" => (block: clear_the_stack_back_to_a_table_body_context discard parser.insertHTMLElement(token) parser.insertionMode = IN_ROW ) ("<th>", "<td>") => (block: parse_error UNEXPECTED_START_TAG clear_the_stack_back_to_a_table_body_context discard parser.insertHTMLElement(Token(t: START_TAG, tagtype: TAG_TR)) parser.insertionMode = IN_ROW reprocess token ) ("</tbody>", "</tfoot>", "</thead>") => (block: if not parser.hasElementInTableScope(token.tagtype): parse_error ELEMENT_NOT_IN_SCOPE else: clear_the_stack_back_to_a_table_body_context pop_current_node parser.insertionMode = IN_TABLE ) ("<caption>", "<col>", "<colgroup>", "<tbody>", "<tfoot>", "<thead>", "</table>") => (block: if not parser.hasElementInTableScope({TAG_TBODY, TAG_THEAD, TAG_TFOOT}): parse_error ELEMENT_NOT_IN_SCOPE else: clear_the_stack_back_to_a_table_body_context pop_current_node parser.insertionMode = IN_TABLE reprocess token ) ("</body>", "</caption>", "</col>", "</colgroup>", "</html>", "</td>", "</th>", "</tr>") => (block: parse_error ELEMENT_NOT_IN_SCOPE ) _ => (block: parser.processInHTMLContent(token, IN_TABLE)) of IN_ROW: template clear_the_stack_back_to_a_table_row_context() = while parser.getTagType(parser.currentNode) notin {TAG_TR, TAG_TEMPLATE, TAG_HTML}: pop_current_node match token: ("<th>", "<td>") => (block: clear_the_stack_back_to_a_table_row_context discard parser.insertHTMLElement(token) parser.insertionMode = IN_CELL parser.activeFormatting.add((nil, nil)) ) "</tr>" => (block: if not parser.hasElementInTableScope(TAG_TR): parse_error ELEMENT_NOT_IN_SCOPE else: clear_the_stack_back_to_a_table_row_context 
pop_current_node parser.insertionMode = IN_TABLE_BODY ) ("<caption>", "<col>", "<colgroup>", "<tbody>", "<tfoot>", "<thead>", "<tr>", "</table>") => (block: if not parser.hasElementInTableScope(TAG_TR): parse_error ELEMENT_NOT_IN_SCOPE else: clear_the_stack_back_to_a_table_row_context pop_current_node parser.insertionMode = IN_TABLE_BODY reprocess token ) ("</tbody>", "</tfoot>", "</thead>") => (block: if not parser.hasElementInTableScope(token.tagtype): parse_error ELEMENT_NOT_IN_SCOPE elif not parser.hasElementInTableScope(TAG_TR): discard else: clear_the_stack_back_to_a_table_row_context pop_current_node parser.insertionMode = IN_BODY reprocess token ) ("</body>", "</caption>", "</col>", "</colgroup>", "</html>", "</td>", "</th>") => (block: parse_error UNEXPECTED_END_TAG) _ => (block: parser.processInHTMLContent(token, IN_TABLE)) of IN_CELL: template close_cell() = parser.generateImpliedEndTags() parse_error_if_mismatch {TAG_TD, TAG_TH} parser.popElementsIncl({TAG_TD, TAG_TH}) parser.clearActiveFormattingTillMarker() parser.insertionMode = IN_ROW match token: ("</td>", "</th>") => (block: if not parser.hasElementInTableScope(token.tagtype): parse_error ELEMENT_NOT_IN_SCOPE else: parser.generateImpliedEndTags() parse_error_if_mismatch token.tagtype parser.popElementsIncl(token.tagtype) parser.clearActiveFormattingTillMarker() parser.insertionMode = IN_ROW ) ("<caption>", "<col>", "<colgroup>", "<tbody>", "<td>", "<tfoot>", "<thead>", "<tr>") => (block: if not parser.hasElementInTableScope({TAG_TD, TAG_TH}): parse_error ELEMENT_NOT_IN_SCOPE else: close_cell reprocess token ) ("</body>", "</caption>", "</col>", "</colgroup>", "</html>") => (block: parse_error UNEXPECTED_END_TAG ) ("</table>", "</tbody>", "</tfoot>", "</thead>", "</tr>") => (block: if not parser.hasElementInTableScope(token.tagtype): parse_error ELEMENT_NOT_IN_SCOPE else: close_cell reprocess token ) _ => (block: parser.processInHTMLContent(token, IN_BODY)) of IN_SELECT: match token: '\0' => 
(block: parse_error UNEXPECTED_NULL) TokenType.CHARACTER_ASCII => (block: parser.insertCharacter(token.c)) TokenType.CHARACTER => (block: parser.insertCharacter(token.r)) TokenType.COMMENT => (block: parser.insertComment(token)) TokenType.DOCTYPE => (block: parse_error UNEXPECTED_DOCTYPE) "<html>" => (block: parser.processInHTMLContent(token, IN_BODY)) "<option>" => (block: if parser.getTagType(parser.currentNode) == TAG_OPTION: pop_current_node discard parser.insertHTMLElement(token) ) "<optgroup>" => (block: if parser.getTagType(parser.currentNode) == TAG_OPTION: pop_current_node if parser.getTagType(parser.currentNode) == TAG_OPTGROUP: pop_current_node discard parser.insertHTMLElement(token) ) "</optgroup>" => (block: if parser.getTagType(parser.currentNode) == TAG_OPTION: if parser.openElements.len > 1 and parser.getTagType(parser.openElements[^2]) == TAG_OPTGROUP: pop_current_node if parser.getTagType(parser.currentNode) == TAG_OPTGROUP: pop_current_node else: parse_error MISMATCHED_TAGS ) "</option>" => (block: if parser.getTagType(parser.currentNode) == TAG_OPTION: pop_current_node else: parse_error MISMATCHED_TAGS ) "</select>" => (block: if not parser.hasElementInSelectScope(TAG_SELECT): parse_error ELEMENT_NOT_IN_SCOPE else: while parser.getTagType(parser.popElement()) != TAG_SELECT: discard parser.resetInsertionMode() ) "<select>" => (block: parse_error NESTED_TAGS if parser.hasElementInSelectScope(TAG_SELECT): while parser.getTagType(parser.popElement()) != TAG_SELECT: discard parser.resetInsertionMode() ) ("<input>", "<keygen>", "<textarea>") => (block: parse_error UNEXPECTED_START_TAG if not parser.hasElementInSelectScope(TAG_SELECT): discard else: while parser.getTagType(parser.popElement()) != TAG_SELECT: discard parser.resetInsertionMode() reprocess token ) ("<script>", "<template>", "</template>") => (block: parser.processInHTMLContent(token, IN_HEAD)) TokenType.EOF => (block: parser.processInHTMLContent(token, IN_BODY)) TokenType.START_TAG => 
(block: parse_error UNEXPECTED_START_TAG) TokenType.END_TAG => (block: parse_error UNEXPECTED_END_TAG) of IN_SELECT_IN_TABLE: match token: ("<caption>", "<table>", "<tbody>", "<tfoot>", "<thead>", "<tr>", "<td>", "<th>") => (block: parse_error UNEXPECTED_START_TAG while parser.getTagType(parser.popElement()) != TAG_SELECT: discard parser.resetInsertionMode() reprocess token ) ("</caption>", "</table>", "</tbody>", "</tfoot>", "</thead>", "</tr>", "</td>", "</th>") => (block: parse_error UNEXPECTED_END_TAG if not parser.hasElementInTableScope(token.tagtype): discard else: parser.popElementsIncl(TAG_SELECT) parser.resetInsertionMode() reprocess token ) _ => (block: parser.processInHTMLContent(token, IN_SELECT)) of IN_TEMPLATE: match token: (TokenType.CHARACTER_ASCII, TokenType.CHARACTER, TokenType.DOCTYPE) => (block: parser.processInHTMLContent(token, IN_BODY) ) ("<base>", "<basefont>", "<bgsound>", "<link>", "<meta>", "<noframes>", "<script>", "<style>", "<template>", "<title>", "</template>") => (block: parser.processInHTMLContent(token, IN_HEAD) ) ("<caption>", "<colgroup>", "<tbody>", "<tfoot>", "<thead>") => (block: discard parser.templateModes.pop() parser.templateModes.add(IN_TABLE) parser.insertionMode = IN_TABLE reprocess token ) "<col>" => (block: discard parser.templateModes.pop() parser.templateModes.add(IN_COLUMN_GROUP) parser.insertionMode = IN_COLUMN_GROUP reprocess token ) "<tr>" => (block: discard parser.templateModes.pop() parser.templateModes.add(IN_TABLE_BODY) parser.insertionMode = IN_TABLE_BODY reprocess token ) ("<td>", "<th>") => (block: discard parser.templateModes.pop() parser.templateModes.add(IN_ROW) parser.insertionMode = IN_ROW reprocess token ) TokenType.START_TAG => (block: discard parser.templateModes.pop() parser.templateModes.add(IN_BODY) parser.insertionMode = IN_BODY reprocess token ) TokenType.END_TAG => (block: parse_error UNEXPECTED_END_TAG) TokenType.EOF => (block: if not parser.hasElement(TAG_TEMPLATE): discard # stop else: 
parse_error UNEXPECTED_EOF parser.popElementsIncl(TAG_TEMPLATE) parser.clearActiveFormattingTillMarker() discard parser.templateModes.pop() parser.resetInsertionMode() reprocess token ) of AFTER_BODY: match token: AsciiWhitespace => (block: parser.processInHTMLContent(token, IN_BODY)) TokenType.COMMENT => (block: parser.insertComment(token, last_child_of(parser.openElements[0]))) TokenType.DOCTYPE => (block: parse_error UNEXPECTED_DOCTYPE) "<html>" => (block: parser.processInHTMLContent(token, IN_BODY)) "</html>" => (block: if parser.fragment: parse_error UNEXPECTED_END_TAG else: parser.insertionMode = AFTER_AFTER_BODY ) TokenType.EOF => (block: discard) # stop _ => (block: parse_error UNEXPECTED_AFTER_BODY_TOKEN parser.insertionMode = IN_BODY reprocess token ) of IN_FRAMESET: match token: AsciiWhitespace => (block: parser.insertCharacter(token.c)) TokenType.COMMENT => (block: parser.insertComment(token)) TokenType.DOCTYPE => (block: parse_error UNEXPECTED_DOCTYPE) "<html>" => (block: parser.processInHTMLContent(token, IN_BODY)) "<frameset>" => (block: if parser.getTagType(parser.currentNode) == TAG_HTML: parse_error UNEXPECTED_START_TAG else: pop_current_node if not parser.fragment and parser.getTagType(parser.currentNode) != TAG_FRAMESET: parser.insertionMode = AFTER_FRAMESET ) "<frame>" => (block: discard parser.insertHTMLElement(token) pop_current_node ) "<noframes>" => (block: parser.processInHTMLContent(token, IN_HEAD)) TokenType.EOF => (block: if parser.getTagType(parser.currentNode) != TAG_HTML: parse_error UNEXPECTED_EOF # stop ) _ => (block: parser.parseErrorByTokenType(token.t)) of AFTER_FRAMESET: match token: AsciiWhitespace => (block: parser.insertCharacter(token.c)) TokenType.COMMENT => (block: parser.insertComment(token)) TokenType.DOCTYPE => (block: parse_error UNEXPECTED_DOCTYPE) "<html>" => (block: parser.processInHTMLContent(token, IN_BODY)) "</html>" => (block: parser.insertionMode = AFTER_AFTER_FRAMESET) "<noframes>" => (block: 
parser.processInHTMLContent(token, IN_HEAD)) TokenType.EOF => (block: discard) # stop _ => (block: parser.parseErrorByTokenType(token.t)) of AFTER_AFTER_BODY: match token: TokenType.COMMENT => (block: parser.insertComment(token, last_child_of(parser.document)) ) (TokenType.DOCTYPE, AsciiWhitespace, "<html>") => (block: parser.processInHTMLContent(token, IN_BODY) ) TokenType.EOF => (block: discard) # stop _ => (block: parser.parseErrorByTokenType(token.t) parser.insertionMode = IN_BODY reprocess token ) of AFTER_AFTER_FRAMESET: match token: TokenType.COMMENT => (block: parser.insertComment(token, last_child_of(parser.document)) ) (TokenType.DOCTYPE, AsciiWhitespace, "<html>") => (block: parser.processInHTMLContent(token, IN_BODY) ) TokenType.EOF => (block: discard) # stop "<noframes>" => (block: parser.processInHTMLContent(token, IN_HEAD)) _ => (block: parser.parseErrorByTokenType(token.t)) const CaseTable = { "altglyph": "altGlyph", "altglyphdef": "altGlyphDef", "altglyphitem": "altGlyphItem", "animatecolor": "animateColor", "animatemotion": "animateMotion", "animatetransform": "animateTransform", "clippath": "clipPath", "feblend": "feBlend", "fecolormatrix": "feColorMatrix", "fecomponenttransfer": "feComponentTransfer", "fecomposite": "feComposite", "feconvolvematrix": "feConvolveMatrix", "fediffuselighting": "feDiffuseLighting", "fedisplacementmap": "feDisplacementMap", "fedistantlight": "feDistantLight", "fedropshadow": "feDropShadow", "feflood": "feFlood", "fefunca": "feFuncA", "fefuncb": "feFuncB", "fefuncg": "feFuncG", "fefuncr": "feFuncR", "fegaussianblur": "feGaussianBlur", "feimage": "feImage", "femerge": "feMerge", "femergenode": "feMergeNode", "femorphology": "feMorphology", "feoffset": "feOffset", "fepointlight": "fePointLight", "fespecularlighting": "feSpecularLighting", "fespotlight": "feSpotLight", "fetile": "feTile", "feturbulence": "feTurbulence", "foreignobject": "foreignObject", "glyphref": "glyphRef", "lineargradient": "linearGradient", 
"radialgradient": "radialGradient", "textpath": "textPath", }.toTable()

proc processInForeignContent(parser: var HTML5Parser, token: Token) =
  ## Process `token` with the tree construction rules for foreign (non-HTML
  ## namespace) content.
  ##
  ## Start tags from the HTML-only list below, and end tags whose search
  ## reaches an HTML-namespace element, are handed back to
  ## `processInHTMLContent` with the current insertion mode.
  # NOTE(review): `=>` is presumably consumed by the `match` DSL below; it is
  # kept exactly as it was.
  macro `=>`(v: typed, body: untyped): untyped =
    quote do:
      discard (`v`, proc() = `body`)

  template script_end_tag() =
    pop_current_node
    #TODO document.write (?)
    #TODO SVG

  template parse_error(e: ParseError) =
    parser.parseError(e)

  template any_other_end_tag() =
    # Local names of SVG elements may be camelCase (see CaseTable), while
    # end tag names are lowercase, so names are compared case-insensitively.
    if not parser.getLocalName(parser.currentNode).equalsIgnoreCase(
        token.tagname):
      parse_error UNEXPECTED_END_TAG
    # Walk the stack from the current node towards the root; the topmost
    # element (index 0) is never considered a match.
    for i in countdown(parser.openElements.high, 1):
      let node = parser.openElements[i]
      # Compare the node being visited, not the (unchanging) current node.
      if parser.getLocalName(node).equalsIgnoreCase(token.tagname):
        while parser.popElement() != node: discard
        break
      # Once the next entry down the stack is an HTML element, the token is
      # handled by the HTML rules and the search stops.
      if parser.getNamespace(parser.openElements[i - 1]) == Namespace.HTML:
        parser.processInHTMLContent(token, parser.insertionMode)
        break

  match token:
    '\0' => (block:
      parse_error UNEXPECTED_NULL
      parser.insertCharacter(Rune(0xFFFD))
    )
    AsciiWhitespace => (block: parser.insertCharacter(token.c))
    TokenType.CHARACTER_ASCII => (block:
      parser.insertCharacter(token.c)
      # Non-whitespace text rules out a later <frameset>.
      parser.framesetOk = false
    )
    TokenType.CHARACTER => (block:
      parser.insertCharacter(token.r)
      parser.framesetOk = false
    )
    TokenType.DOCTYPE => (block: parse_error UNEXPECTED_DOCTYPE)
    ("<b>", "<big>", "<blockquote>", "<body>", "<br>", "<center>", "<code>",
     "<dd>", "<div>", "<dl>", "<dt>", "<em>", "<embed>", "<h1>", "<h2>",
     "<h3>", "<h4>", "<h5>", "<h6>", "<head>", "<hr>", "<i>", "<img>", "<li>",
     "<listing>", "<menu>", "<meta>", "<nobr>", "<ol>", "<p>", "<pre>",
     "<ruby>", "<s>", "<small>", "<span>", "<strong>", "<strike>", "<sub>",
     "<sup>", "<table>", "<tt>", "<u>", "<ul>", "<var>") => (block:
      parse_error UNEXPECTED_START_TAG
      #NOTE MathML not implemented
      # Leave foreign content: pop until an HTML integration point or an
      # HTML-namespace element is the current node, then reprocess as HTML.
      while not parser.isHTMLIntegrationPoint(parser.currentNode) and
          parser.getNamespace(parser.currentNode) != Namespace.HTML:
        pop_current_node
      parser.processInHTMLContent(token, parser.insertionMode)
    )
    TokenType.START_TAG => (block:
      #NOTE MathML not implemented
      let namespace = parser.getNamespace(parser.adjustedCurrentNode)
      if namespace == Namespace.SVG:
        # Restore the canonical camelCase spelling of SVG element names.
        if token.tagname in CaseTable:
          token.tagname = CaseTable[token.tagname]
        adjustSVGAttributes(token)
      #TODO adjust foreign attributes
      discard parser.insertForeignElement(token, namespace)
      # Only self-closing tags are popped right away; any other element has
      # to stay on the stack so that its children are inserted into it.
      if token.selfclosing:
        if namespace == Namespace.SVG and token.tagname == "script":
          script_end_tag
        else:
          pop_current_node
    )
    "</script>" => (block:
      let namespace = parser.getNamespace(parser.currentNode)
      let localName = parser.getLocalName(parser.currentNode)
      if namespace == Namespace.SVG and localName == "script": #TODO SVG
        script_end_tag
      else:
        any_other_end_tag
    )
    TokenType.END_TAG => (block: any_other_end_tag)

proc constructTree[Handle](parser: var HTML5Parser[Handle]) =
  ## Pull tokens from the tokenizer and dispatch each one to the HTML or the
  ## foreign content rules. Stops early once `parser.needsreinterpret` is
  ## set.
  for token in parser.tokenizer.tokenize:
    if parser.ignoreLF:
      # The flag applies to the very next token only; a newline is dropped.
      parser.ignoreLF = false
      if token.t == CHARACTER_ASCII and token.c == '\n':
        continue
    let isTokenHTML = token.t in {START_TAG, CHARACTER, CHARACTER_ASCII}
    if parser.openElements.len == 0 or
        parser.getNamespace(parser.adjustedCurrentNode) == Namespace.HTML or
        parser.isHTMLIntegrationPoint(parser.adjustedCurrentNode) and
        isTokenHTML or
        token.t == EOF:
      #NOTE MathML not implemented
      parser.processInHTMLContent(token, parser.insertionMode)
    else:
      parser.processInForeignContent(token)
    if parser.needsreinterpret:
      break

proc finishParsing(parser: var HTML5Parser) =
  ## Pop every element still on the stack of open elements, then invoke the
  ## DOM builder's `finish` callback if one was provided.
  while parser.openElements.len > 0:
    pop_current_node
  if parser.dombuilder.finish != nil:
    parser.dombuilder.finish(parser.dombuilder)

proc bomSniff(inputStream: Stream): Charset =
  ## Look for a byte order mark in the next bytes of `inputStream`.
  ##
  ## Returns the matching UTF-16 BE/LE or UTF-8 charset, leaving the stream
  ## just past the bytes that were read. If no BOM is found, the stream is
  ## rewound to position 0 and `CHARSET_UNKNOWN` is returned.
  const u8bom = char(0xEF) & char(0xBB) & char(0xBF)
  const bebom = char(0xFE) & char(0xFF)
  const lebom = char(0xFF) & char(0xFE)
  var bom = inputStream.readStr(2)
  if bom == bebom:
    return CHARSET_UTF_16_BE
  elif bom == lebom:
    return CHARSET_UTF_16_LE
  else:
    # The UTF-8 BOM is three bytes long, so one more byte is needed.
    bom &= inputStream.readChar()
    if bom == u8bom:
      return CHARSET_UTF_8
    else:
      inputStream.setPosition(0)
      return CHARSET_UNKNOWN

# Any of these pointers being nil would later result in a crash.
proc checkCallbacks(dombuilder: DOMBuilder) =
  ## Fail fast when a callback that DOMBuilder documents as "must never be
  ## nil" has been left unset.
  doAssert dombuilder.getParentNode != nil
  doAssert dombuilder.getLocalName != nil
  doAssert dombuilder.createElement != nil
  doAssert dombuilder.createComment != nil
  doAssert dombuilder.createDocumentType != nil
  doAssert dombuilder.insertBefore != nil
  doAssert dombuilder.insertText != nil
  doAssert dombuilder.remove != nil

proc parseHTML*[Handle](inputStream: Stream, dombuilder: DOMBuilder[Handle],
    opts: HTML5ParserOpts[Handle]) =
  ## Parse an HTML document, using the DOMBuilder object `dombuilder`, and
  ## parser options `opts`.
  ##
  ## Charsets are tried from a stack; when reinterpretation is allowed, the
  ## input stream is rewound to position 0 before each new attempt.
  dombuilder.checkCallbacks()
  # Push the fallbacks back to front, so that the first configured charset
  # ends up on top of the stack.
  var pending: seq[Charset]
  var idx = opts.charsets.high
  while idx >= 0:
    pending.add(opts.charsets[idx])
    dec idx
  var mayRetry = opts.canReinterpret
  var nextConfidence: CharsetConfidence
  if mayRetry:
    let sniffed = inputStream.bomSniff()
    if sniffed != CHARSET_UNKNOWN:
      # A BOM settles the question: use it and never reinterpret.
      pending.add(sniffed)
      nextConfidence = CONFIDENCE_CERTAIN
      mayRetry = false
  if pending.len == 0:
    pending.add(DefaultCharset) # UTF-8
  while true:
    let charset = pending.pop()
    # Without the option to retry, the charset in use is final.
    let startConfidence =
      if mayRetry: nextConfidence
      else: CONFIDENCE_CERTAIN
    var parser = HTML5Parser[Handle](
      dombuilder: dombuilder,
      confidence: startConfidence,
      charset: charset,
      opts: opts
    )
    nextConfidence = CONFIDENCE_TENTATIVE # used in the next iteration
    # Decoding errors are only fatal while another charset can still be
    # tried; otherwise they are replaced.
    let errorMode =
      if pending.len != 0 and mayRetry: DECODER_ERROR_MODE_FATAL
      else: DECODER_ERROR_MODE_REPLACEMENT
    let decoder = newDecoderStream(inputStream, parser.charset,
      errormode = errorMode)
    proc forwardParseError(e: ParseError) =
      parser.parseError(e)
    let onParseError = if parser.hasParseError(): forwardParseError else: nil
    parser.tokenizer = newTokenizer(decoder, onParseError)
    parser.constructTree()
    if parser.needsreinterpret and mayRetry:
      # Restart once with the charset now stored in the parser.
      inputStream.setPosition(0)
      pending.add(parser.charset)
      mayRetry = false
    elif decoder.failed and mayRetry:
      # Decoding failed: retry with the next charset on the stack.
      inputStream.setPosition(0)
    else:
      parser.finishParsing()
      break