import std/macros
import std/options
import std/strutils
import std/tables

import dombuilder
import htmltokenizer
import tags
import tokstate

# Generics break without exporting macros. Maybe a compiler bug?
export macros

# Export these so that htmlparseriface works seamlessly.
export dombuilder
export options
export tags

# Export tokstate too; it is needed for fragment parsing.
export tokstate

# Heavily inspired by html5ever's TreeSink design.
type
  HTML5ParserOpts*[Handle, Atom] = object
    ## Options the embedder passes in when setting up a parser.
    isIframeSrcdoc*: bool
      ## Is the document an iframe srcdoc?
    scripting*: bool
      ## Is scripting enabled for this document?
    ctx*: Option[OpenElementInit[Handle, Atom]]
      ## Context element for fragment parsing. When set to some
      ## OpenElementInit, the fragment case is used while parsing.
      ##
      ## `startTagName` must be the name of a valid start tag token for
      ## this element.
    initialTokenizerState*: TokenizerState
      ## The initial tokenizer state; by default, this is DATA.
    openElementsInit*: seq[OpenElementInit[Handle, Atom]]
      ## Initial state of the stack of open elements. By default, the stack
      ## starts out empty.
      ## Note: if this is initialized to a non-empty sequence, the parser will
      ## start by resetting the insertion mode appropriately.
    formInit*: Option[Handle]
      ## Initial state of the parser's form pointer.
    pushInTemplate*: bool
      ## When set to true, the "in template" insertion mode is pushed to the
      ## stack of template insertion modes on parser start.

  # An entry of the stack of open elements: the element handle together with
  # the start tag token it was created for.
  OpenElement*[Handle, Atom] = tuple
    element: Handle
    token: Token[Atom] ## the element's start tag token; must not be nil.

  # Embedder-facing description of an open element; only the name of the
  # start tag is supplied instead of a full token.
  OpenElementInit*[Handle, Atom] = tuple
    element: Handle
    startTagName: Atom ## the element's start tag token's name.
type
  # Namespaced attribute name; value type of `HTML5Parser.foreignTable`.
  QualifiedName[Atom] = tuple
    prefix: NamespacePrefix
    namespace: Namespace
    localName: Atom

  HTML5Parser*[Handle, Atom] = object
    ## Tree construction state. `Handle` is the DOM builder's node handle
    ## type and `Atom` its interned-name type.
    quirksMode: QuirksMode
    dombuilder: DOMBuilder[Handle, Atom]
    opts: HTML5ParserOpts[Handle, Atom]
    ctx: Option[OpenElement[Handle, Atom]]
      ## Context element; set only in the fragment case (see `fragment`).
    openElements: seq[OpenElement[Handle, Atom]]
      ## Stack of open elements; the last item is the current node.
    insertionMode: InsertionMode
    oldInsertionMode: InsertionMode
      ## Insertion mode saved before switching to TEXT.
    templateModes: seq[InsertionMode]
      ## Stack of template insertion modes.
    head: Option[OpenElement[Handle, Atom]]
    tokenizer: Tokenizer[Handle, Atom]
    form: Option[Handle]
    fosterParenting: bool
    # List of active formatting elements. The Option[Handle] is the element;
    # none(Handle) (paired with a nil token) denotes a marker.
    activeFormatting: seq[(Option[Handle], Token[Atom])]
    framesetOk: bool
    ignoreLF: bool
    pendingTableChars: string
    pendingTableCharsWhitespace: bool
    caseTable: Table[Atom, Atom]
    adjustedTable: Table[Atom, Atom]
      ## Attribute name -> adjusted attribute name (used for SVG attributes).
    foreignTable: Table[Atom, QualifiedName[Atom]]
      ## Attribute name -> namespaced name ("adjust foreign attributes").

  # Where a node is to be inserted: as a child of `inside`, in front of
  # `before`, or as the last child when `before` is none.
  AdjustedInsertionLocation[Handle] = tuple[
    inside: Handle,
    before: Option[Handle]
  ]

  # 13.2.4.1
  InsertionMode = enum
    INITIAL, BEFORE_HTML, BEFORE_HEAD, IN_HEAD, IN_HEAD_NOSCRIPT, AFTER_HEAD,
    IN_BODY, TEXT, IN_TABLE, IN_TABLE_TEXT, IN_CAPTION, IN_COLUMN_GROUP,
    IN_TABLE_BODY, IN_ROW, IN_CELL, IN_SELECT, IN_SELECT_IN_TABLE, IN_TEMPLATE,
    AFTER_BODY, IN_FRAMESET, AFTER_FRAMESET, AFTER_AFTER_BODY,
    AFTER_AFTER_FRAMESET

type ParseResult* = enum
  ## Result of parsing the passed chunk.
  ## PRES_CONTINUE is returned when it is OK to continue parsing.
  ##
  ## PRES_STOP is returned when the parser has been stopped from
  ## setEncodingImpl.
  ##
  ## PRES_SCRIPT is returned when a script end tag is encountered. For
  ## implementations that do not support scripting, this can be treated
  ## equivalently to PRES_CONTINUE.
  ##
  ## Implementations that *do* support scripting and implement `document.write`
  ## can instead use PRES_SCRIPT to process string injected into the input
  ## stream by `document.write` before continuing with parsing from the
  ## network stream. In this case, script elements should be stored in e.g. the
  ## DOM builder from `elementPoppedImpl`, and processed accordingly after
  ## PRES_SCRIPT has been returned.
  PRES_CONTINUE
  PRES_STOP
  PRES_SCRIPT

# DOMBuilder interface functions
#
# Thin wrappers that forward to the `*Impl` procs of the DOM builder. The
# ones guarded by `when compiles(...)` are optional: if the builder does not
# provide the Impl proc, the wrapper does nothing (or returns a default).

proc strToAtom[Handle, Atom](parser: HTML5Parser[Handle, Atom],
    s: string): Atom =
  ## Forwards to the builder's `strToAtomImpl`.
  mixin strToAtomImpl
  return parser.dombuilder.strToAtomImpl(s)

proc tagTypeToAtom[Handle, Atom](parser: HTML5Parser[Handle, Atom],
    tagType: TagType): Atom =
  ## Forwards to the builder's `tagTypeToAtomImpl`.
  mixin tagTypeToAtomImpl
  return parser.dombuilder.tagTypeToAtomImpl(tagType)

# Exported, because Nim 1.6.10 does not recognize it in macros otherwise.
proc atomToTagType*[Handle, Atom](parser: HTML5Parser[Handle, Atom],
    atom: Atom): TagType =
  ## Forwards to the builder's `atomToTagTypeImpl`.
  mixin atomToTagTypeImpl
  return parser.dombuilder.atomToTagTypeImpl(atom)

proc setQuirksMode[Handle, Atom](parser: var HTML5Parser[Handle, Atom],
    mode: QuirksMode) =
  ## Records `mode` on the parser and, if the builder implements
  ## `setQuirksModeImpl`, notifies it as well.
  mixin setQuirksModeImpl
  parser.quirksMode = mode
  when compiles(parser.dombuilder.setQuirksModeImpl(mode)):
    parser.dombuilder.setQuirksModeImpl(mode)

proc setEncoding(parser: var HTML5Parser, cs: string): SetEncodingResult =
  ## Forwards to the builder's `setEncodingImpl` when it exists; otherwise
  ## returns SET_ENCODING_CONTINUE.
  mixin setEncodingImpl
  when compiles(parser.dombuilder.setEncodingImpl(cs)):
    return parser.dombuilder.setEncodingImpl(cs)
  else:
    return SET_ENCODING_CONTINUE

proc getDocument[Handle, Atom](parser: HTML5Parser[Handle, Atom]): Handle =
  ## Forwards to the builder's `getDocumentImpl`.
  mixin getDocumentImpl
  return parser.dombuilder.getDocumentImpl()

proc getParentNode[Handle, Atom](parser: HTML5Parser[Handle, Atom],
    handle: Handle): Option[Handle] =
  ## Forwards to the builder's `getParentNodeImpl`.
  mixin getParentNodeImpl
  return parser.dombuilder.getParentNodeImpl(handle)

proc getLocalName[Handle, Atom](parser: HTML5Parser[Handle, Atom],
    handle: Handle): Atom =
  ## Forwards to the builder's `getLocalNameImpl`.
  mixin getLocalNameImpl
  return parser.dombuilder.getLocalNameImpl(handle)

proc getNamespace[Handle, Atom](parser: HTML5Parser[Handle, Atom],
    handle: Handle): Namespace =
  ## Forwards to the builder's `getNamespaceImpl`.
  mixin getNamespaceImpl
  return parser.dombuilder.getNamespaceImpl(handle)

proc getTemplateContent[Handle, Atom](parser: HTML5Parser[Handle, Atom],
    handle: Handle): Handle =
  ## Forwards to the builder's `getTemplateContentImpl`.
  mixin getTemplateContentImpl
  return parser.dombuilder.getTemplateContentImpl(handle)

proc getTagType[Handle, Atom](parser: HTML5Parser[Handle, Atom],
    handle: Handle): TagType =
  ## Tag type of an element in the HTML namespace; TAG_UNKNOWN for elements
  ## in any other namespace.
  if parser.getNamespace(handle) != Namespace.HTML:
    return TAG_UNKNOWN
  return parser.atomToTagType(parser.getLocalName(handle))

proc createHTMLElement[Handle, Atom](parser: HTML5Parser[Handle, Atom]):
    Handle =
  ## Forwards to the builder's `createHTMLElementImpl`.
  mixin createHTMLElementImpl
  return parser.dombuilder.createHTMLElementImpl()

proc createComment[Handle, Atom](parser: HTML5Parser[Handle, Atom],
    text: string): Handle =
  ## Forwards to the builder's `createCommentImpl`.
  mixin createCommentImpl
  return parser.dombuilder.createCommentImpl(text)

proc createDocumentType[Handle, Atom](parser: HTML5Parser[Handle, Atom],
    name, publicId, systemId: string): Handle =
  ## Forwards to the builder's `createDocumentTypeImpl`.
  mixin createDocumentTypeImpl
  return parser.dombuilder.createDocumentTypeImpl(name, publicId, systemId)

proc insertBefore[Handle, Atom](parser: HTML5Parser[Handle, Atom],
    parent, child: Handle, before: Option[Handle]) =
  ## Forwards to the builder's `insertBeforeImpl`.
  mixin insertBeforeImpl
  parser.dombuilder.insertBeforeImpl(parent, child, before)

proc insertText[Handle, Atom](parser: HTML5Parser[Handle, Atom],
    parent: Handle, text: string, before: Option[Handle]) =
  ## Forwards to the builder's `insertTextImpl`.
  mixin insertTextImpl
  parser.dombuilder.insertTextImpl(parent, text, before)

proc remove[Handle, Atom](parser: HTML5Parser[Handle, Atom], child: Handle) =
  ## Forwards to the builder's `removeImpl`.
  mixin removeImpl
  parser.dombuilder.removeImpl(child)

proc moveChildren[Handle, Atom](parser: HTML5Parser[Handle, Atom],
    handleFrom, handleTo: Handle) =
  ## Forwards to the builder's `moveChildrenImpl`.
  mixin moveChildrenImpl
  parser.dombuilder.moveChildrenImpl(handleFrom, handleTo)

proc addAttrsIfMissing[Handle, Atom](parser: HTML5Parser[Handle, Atom],
    element: Handle, attrs: Table[Atom, string]) =
  ## Forwards to the builder's `addAttrsIfMissingImpl`.
  mixin addAttrsIfMissingImpl
  parser.dombuilder.addAttrsIfMissingImpl(element, attrs)

proc setScriptAlreadyStarted[Handle, Atom](parser: HTML5Parser[Handle, Atom],
    script: Handle) =
  ## Forwards to the builder's `setScriptAlreadyStartedImpl` if it exists.
  mixin setScriptAlreadyStartedImpl
  when compiles(parser.dombuilder.setScriptAlreadyStartedImpl(script)):
    parser.dombuilder.setScriptAlreadyStartedImpl(script)

proc associateWithForm[Handle, Atom](parser: HTML5Parser[Handle, Atom],
    element, form, intendedParent: Handle) =
  ## Forwards to the builder's `associateWithFormImpl` if it exists.
  mixin associateWithFormImpl
  when compiles(parser.dombuilder.associateWithFormImpl(element, form,
      intendedParent)):
    parser.dombuilder.associateWithFormImpl(element, form, intendedParent)

# Parser
func fragment(parser: HTML5Parser): bool =
  ## True when a context element is set, i.e. in the fragment case.
  return parser.ctx.isSome

# https://html.spec.whatwg.org/multipage/parsing.html#reset-the-insertion-mode-appropriately
proc resetInsertionMode(parser: var HTML5Parser) =
  ## Walks the stack of open elements from the current node downwards and
  ## sets `parser.insertionMode` according to the first element that
  ## determines it. Falls back to IN_BODY at the bottom of the stack.
  template switch_insertion_mode_and_return(mode: InsertionMode) =
    parser.insertionMode = mode
    return
  for i in countdown(parser.openElements.high, 0):
    var node = parser.openElements[i]
    let last = i == 0
    # Only the bottom-most stack entry is replaced by the context element in
    # the fragment case; every other entry must be examined as-is.
    if last and parser.fragment:
      node = parser.ctx.get
    let tagType = parser.getTagType(node.element)
    case tagType
    of TAG_SELECT:
      if not last:
        # Look at the ancestors of the select (entries below it), excluding
        # the first node of the stack.
        for j in countdown(i - 1, 1):
          let ancestor = parser.openElements[j].element
          case parser.getTagType(ancestor)
          of TAG_TEMPLATE: break
          of TAG_TABLE: switch_insertion_mode_and_return IN_SELECT_IN_TABLE
          else: discard
      switch_insertion_mode_and_return IN_SELECT
    of TAG_TD, TAG_TH:
      if not last:
        switch_insertion_mode_and_return IN_CELL
    of TAG_TR: switch_insertion_mode_and_return IN_ROW
    of TAG_TBODY, TAG_THEAD, TAG_TFOOT:
      switch_insertion_mode_and_return IN_TABLE_BODY
    of TAG_CAPTION:
      switch_insertion_mode_and_return IN_CAPTION
    of TAG_COLGROUP: switch_insertion_mode_and_return IN_COLUMN_GROUP
    of TAG_TABLE: switch_insertion_mode_and_return IN_TABLE
    of TAG_TEMPLATE:
      switch_insertion_mode_and_return parser.templateModes[^1]
    of TAG_HEAD:
      if not last:
        switch_insertion_mode_and_return IN_HEAD
    of TAG_BODY: switch_insertion_mode_and_return IN_BODY
    of TAG_FRAMESET: switch_insertion_mode_and_return IN_FRAMESET
    of TAG_HTML:
      if parser.head.isNone:
        switch_insertion_mode_and_return BEFORE_HEAD
      else:
        switch_insertion_mode_and_return AFTER_HEAD
    else: discard
    if last:
      switch_insertion_mode_and_return IN_BODY

func currentNodeToken[Handle, Atom](parser: HTML5Parser[Handle, Atom]):
    OpenElement[Handle, Atom] =
  ## Top entry of the stack of open elements. The stack must not be empty.
  return parser.openElements[^1]

func currentNode[Handle, Atom](parser: HTML5Parser[Handle, Atom]): Handle =
  ## Element handle of the current node.
  return parser.currentNodeToken.element

func currentToken[Handle, Atom](parser: HTML5Parser[Handle, Atom]):
    Token[Atom] =
  ## Start tag token of the current node.
  return parser.currentNodeToken.token

func adjustedCurrentNodeToken[Handle, Atom](parser: HTML5Parser[Handle, Atom]):
    OpenElement[Handle, Atom] =
  ## The context element when in the fragment case with exactly one open
  ## element; the current node otherwise.
  if parser.fragment and parser.openElements.len == 1:
    return parser.ctx.get
  else:
    return parser.currentNodeToken

func adjustedCurrentNode[Handle, Atom](parser: HTML5Parser[Handle, Atom]):
    Handle =
  ## Element handle of the adjusted current node.
  return parser.adjustedCurrentNodeToken.element

proc lastElementOfTag[Handle, Atom](parser: HTML5Parser[Handle, Atom],
    tagType: TagType): tuple[element: Option[Handle], pos: int] =
  ## Finds the open element of type `tagType` closest to the top of the
  ## stack. Returns it with its stack index, or (none, -1) if absent.
  for i in countdown(parser.openElements.high, 0):
    let element = parser.openElements[i].element
    if parser.getTagType(element) == tagType:
      return (some(element), i)
  return (none(Handle), -1)

func last_child_of[Handle](n: Handle): AdjustedInsertionLocation[Handle] =
  ## Insertion location "inside `n`, after its last child".
  (n, none(Handle))

func last_child_of[Handle, Atom](n: OpenElement[Handle, Atom]):
    AdjustedInsertionLocation[Handle] =
  ## Same as above, for a stack entry.
  last_child_of(n.element)

# https://html.spec.whatwg.org/multipage/#appropriate-place-for-inserting-a-node
proc appropriatePlaceForInsert[Handle, Atom](parser: HTML5Parser[Handle, Atom],
    target: Handle): AdjustedInsertionLocation[Handle] =
  ## Computes where a node targeted at `target` is to be inserted, applying
  ## foster parenting when enabled and redirecting into template contents.
  ## Expects the first open element to be the html element.
  assert parser.getTagType(parser.openElements[0].element) == TAG_HTML
  let targetTagType = parser.getTagType(target)
  const FosterTagTypes = {TAG_TABLE, TAG_TBODY, TAG_TFOOT, TAG_THEAD, TAG_TR}
  if parser.fosterParenting and targetTagType in FosterTagTypes:
    let lastTemplate = parser.lastElementOfTag(TAG_TEMPLATE)
    let lastTable = parser.lastElementOfTag(TAG_TABLE)
    # A template above the last table (or no table at all) wins.
    if lastTemplate.element.isSome and
        (lastTable.element.isNone or lastTable.pos < lastTemplate.pos):
      let content = parser.getTemplateContent(lastTemplate.element.get)
      return last_child_of(content)
    if lastTable.element.isNone:
      return last_child_of(parser.openElements[0].element)
    let parentNode = parser.getParentNode(lastTable.element.get)
    if parentNode.isSome:
      # Insert right before the table, inside its parent.
      return (parentNode.get, lastTable.element)
    let previousElement = parser.openElements[lastTable.pos - 1]
    result = last_child_of(previousElement.element)
  else:
    result = last_child_of(target)
  if parser.getTagType(result.inside) == TAG_TEMPLATE:
    result = (parser.getTemplateContent(result.inside), none(Handle))

proc appropriatePlaceForInsert[Handle, Atom](parser: HTML5Parser[Handle, Atom]):
    AdjustedInsertionLocation[Handle] =
  ## Appropriate place for inserting a node, targeting the current node.
  parser.appropriatePlaceForInsert(parser.currentNode)

proc hasElement[Handle, Atom](parser: HTML5Parser[Handle, Atom],
    tags: set[TagType]): bool =
  ## True if any open element has a tag type in `tags`.
  for (element, _) in parser.openElements:
    if parser.getTagType(element) in tags:
      return true
  return false

proc hasElement[Handle, Atom](parser: HTML5Parser[Handle, Atom],
    tag: TagType): bool =
  ## True if any open element has tag type `tag`.
  return parser.hasElement({tag})

const Scope = {
  TAG_APPLET, TAG_CAPTION, TAG_HTML, TAG_TABLE, TAG_TD, TAG_TH, TAG_MARQUEE,
  TAG_OBJECT, TAG_TEMPLATE # (+ SVG, MathML)
}

proc hasElementInScopeWithXML[Handle, Atom](parser: HTML5Parser[Handle, Atom],
    target: Handle, list: set[TagType]): bool =
  ## Scans the stack from the current node downwards. Returns true on
  ## reaching `target`; returns false on reaching an HTML element whose type
  ## is in `list`, or one of the MathML/SVG scope boundary elements.
  for i in countdown(parser.openElements.high, 0):
    let element = parser.openElements[i].element
    if element == target:
      return true
    let localName = parser.getLocalName(element)
    let tagType = parser.atomToTagType(localName)
    case parser.getNamespace(element)
    of Namespace.HTML:
      {.linearScanEnd.}
      if tagType in list:
        return false
    of Namespace.MATHML:
      const elements = {
        TAG_MI, TAG_MO, TAG_MN, TAG_MS, TAG_MTEXT, TAG_ANNOTATION_XML
      }
      if tagType in elements:
        return false
    of Namespace.SVG:
      const elements = {TAG_FOREIGN_OBJECT, TAG_DESC, TAG_TITLE}
      if tagType in elements:
        return false
    else: discard
  return false

proc hasElementInScopeWithXML[Handle, Atom](parser: HTML5Parser[Handle, Atom],
    target: set[TagType], list: set[TagType]): bool =
  ## Same scan as the Handle overload, but succeeds on the first HTML
  ## element whose tag type is in `target`.
  for i in countdown(parser.openElements.high, 0):
    let element = parser.openElements[i].element
    let tagType = parser.atomToTagType(parser.getLocalName(element))
    case parser.getNamespace(element)
    of Namespace.HTML:
      if tagType in target:
        return true
      if tagType in list:
        return false
    of Namespace.MATHML:
      const elements = {
        TAG_MI, TAG_MO, TAG_MN, TAG_MS, TAG_MTEXT, TAG_ANNOTATION_XML
      }
      if tagType in elements:
        return false
    of Namespace.SVG:
      const elements = {TAG_FOREIGN_OBJECT, TAG_DESC, TAG_TITLE}
      if tagType in elements:
        return false
    else: discard
  return false

proc hasElementInScope[Handle, Atom](parser: HTML5Parser[Handle, Atom],
    target: Handle): bool =
  ## "Has an element in scope" for a specific element.
  return parser.hasElementInScopeWithXML(target, Scope)

proc hasElementInScope[Handle, Atom](parser: HTML5Parser[Handle, Atom],
    target: set[TagType]): bool =
  ## "Has an element in scope" for any of the given tag types.
  return parser.hasElementInScopeWithXML(target, Scope)

proc hasElementInScope[Handle, Atom](parser: HTML5Parser[Handle, Atom],
    target: TagType): bool =
  ## "Has an element in scope" for a single tag type.
  return parser.hasElementInScope({target})

proc hasElementInScope[Handle, Atom](parser: HTML5Parser[Handle, Atom],
    localName: Atom): bool =
  ## "Has an element in scope" for the tag type `localName` maps to.
  let target = parser.atomToTagType(localName)
  return parser.hasElementInScope(target)

proc hasElementInListItemScope[Handle, Atom](parser: HTML5Parser[Handle, Atom],
    target: TagType): bool =
  ## Like `hasElementInScope`, with ol and ul as additional boundaries.
  const ListItemScope = Scope + {TAG_OL, TAG_UL}
  return parser.hasElementInScopeWithXML({target}, ListItemScope)

proc hasElementInButtonScope[Handle, Atom](parser: HTML5Parser[Handle, Atom],
    target: TagType): bool =
  ## Like `hasElementInScope`, with button as an additional boundary.
  const ButtonScope = Scope + {TAG_BUTTON}
  return parser.hasElementInScopeWithXML({target}, ButtonScope)

# Note: these do not include the "Scope" tags.
proc hasElementInSpecificScope[Handle, Atom](parser: HTML5Parser[Handle, Atom],
    target: Handle, list: set[TagType]): bool =
  ## Scans the stack from the current node downwards; true on reaching
  ## `target`, false on reaching an element whose tag type is in `list`.
  ## `list` is expected to contain a type that is always on the stack, so
  ## running off the bottom is treated as an invariant violation.
  for i in countdown(parser.openElements.high, 0):
    let element = parser.openElements[i].element
    if element == target:
      return true
    if parser.getTagType(element) in list:
      return false
  assert false
  return false

proc hasElementInSpecificScope[Handle, Atom](parser: HTML5Parser[Handle, Atom],
    target: set[TagType], list: set[TagType]): bool =
  ## Same as the Handle overload, but matches on tag type.
  for i in countdown(parser.openElements.high, 0):
    let tagType = parser.getTagType(parser.openElements[i].element)
    if tagType in target:
      return true
    if tagType in list:
      return false
  assert false
  return false

const TableScope = {TAG_HTML, TAG_TABLE, TAG_TEMPLATE}

proc hasElementInTableScope[Handle, Atom](parser: HTML5Parser[Handle, Atom],
    target: TagType): bool =
  ## "Has an element in table scope" for a single tag type.
  return parser.hasElementInSpecificScope({target}, TableScope)

proc hasElementInTableScope[Handle, Atom](parser: HTML5Parser[Handle, Atom],
    target: set[TagType]): bool =
  ## "Has an element in table scope" for any of the given tag types.
  return parser.hasElementInSpecificScope(target, TableScope)

proc hasElementInSelectScope[Handle, Atom](parser: HTML5Parser[Handle, Atom],
    target: TagType): bool =
  ## "Has an element in select scope": every element other than option and
  ## optgroup is a boundary.
  for i in countdown(parser.openElements.high, 0):
    let tagType = parser.getTagType(parser.openElements[i].element)
    if tagType == target:
      return true
    if tagType notin {TAG_OPTION, TAG_OPTGROUP}:
      return false
  assert false
  return false

proc createElementForToken[Handle, Atom](parser: HTML5Parser[Handle, Atom],
    localName: Atom, namespace: Namespace, intendedParent: Handle,
    htmlAttrs: Table[Atom, string], xmlAttrs: seq[ParsedAttr[Atom]]): Handle =
  ## Creates an element through the DOM builder. HTML form-associated
  ## elements are additionally associated with the form pointer when it is
  ## set, no template is open, and the element is either not listed or has
  ## no form attribute.
  mixin createElementForTokenImpl
  let element = parser.dombuilder.createElementForTokenImpl(
    localName, namespace, intendedParent, htmlAttrs, xmlAttrs
  )
  let tagType = parser.atomToTagType(localName)
  if namespace == Namespace.HTML and tagType in FormAssociatedElements and
      parser.form.isSome and not parser.hasElement(TAG_TEMPLATE) and
      (tagType notin ListedElements or
        parser.tagTypeToAtom(TAG_FORM) notin htmlAttrs):
    parser.associateWithForm(element, parser.form.get, intendedParent)
  return element

proc createElementForToken[Handle, Atom](parser: HTML5Parser[Handle, Atom],
    token: Token, namespace: Namespace, intendedParent: Handle): Handle =
  ## Creates an element for `token` in `namespace`, passing the token's
  ## attributes through unchanged.
  # attrs not adjusted
  return parser.createElementForToken(token.tagname, namespace, intendedParent,
    token.attrs, @[])

proc createHTMLElementForToken[Handle, Atom](parser: HTML5Parser[Handle, Atom],
    token: Token, intendedParent: Handle): Handle =
  ## Creates an HTML-namespace element for `token`.
  # attrs not adjusted
  return parser.createElementForToken(
    token.tagname, Namespace.HTML, intendedParent, token.attrs, @[]
  )

proc pushElement[Handle, Atom](parser: var HTML5Parser[Handle, Atom],
    node: Handle, token: Token[Atom]) =
  ## Pushes onto the stack of open elements and refreshes the tokenizer's
  ## `hasnonhtml` flag from the new adjusted current node.
  parser.openElements.add((node, token))
  let node = parser.adjustedCurrentNode()
  parser.tokenizer.hasnonhtml = parser.getNamespace(node) != Namespace.HTML

proc popElement[Handle, Atom](parser: var HTML5Parser[Handle, Atom]): Handle =
  ## Pops and returns the current node, notifies the builder through
  ## `elementPoppedImpl` if it exists, and refreshes `hasnonhtml`.
  mixin elementPoppedImpl
  result = parser.openElements.pop().element
  when compiles(parser.dombuilder.elementPoppedImpl(result)):
    parser.dombuilder.elementPoppedImpl(result)
  if parser.openElements.len == 0:
    parser.tokenizer.hasnonhtml = false
  else:
    let node = parser.adjustedCurrentNode()
    parser.tokenizer.hasnonhtml = parser.getNamespace(node) != Namespace.HTML

# Expects a `parser` symbol at the call site.
template pop_current_node =
  discard parser.popElement()

proc insert[Handle, Atom](parser: HTML5Parser[Handle, Atom],
    location: AdjustedInsertionLocation[Handle], node: Handle) =
  ## Inserts `node` at `location`.
  parser.insertBefore(location.inside, node, location.before)

proc append[Handle, Atom](parser: HTML5Parser[Handle, Atom],
    parent, node: Handle) =
  ## Inserts `node` as the last child of `parent`.
  parser.insertBefore(parent, node, none(Handle))

proc insertForeignElement[Handle, Atom](parser: var HTML5Parser[Handle, Atom],
    token: Token, localName: Atom, namespace: Namespace, stackOnly: bool,
    xmlAttrs: seq[ParsedAttr[Atom]]): Handle =
  ## Creates an element for `token` at the appropriate place for inserting a
  ## node and pushes it onto the stack of open elements. When `stackOnly` is
  ## set, the element is not inserted into the tree.
  let location = parser.appropriatePlaceForInsert()
  let parent = location.inside
  let element = parser.createElementForToken(localName, namespace, parent,
    token.attrs, xmlAttrs)
  if not stackOnly:
    parser.insert(location, element)
  parser.pushElement(element, token)
  return element

proc insertForeignElement[Handle, Atom](parser: var HTML5Parser[Handle, Atom],
    token: Token, namespace: Namespace, stackOnly: bool): Handle =
  ## Overload using the token's own tag name and no namespaced attributes.
  parser.insertForeignElement(token, token.tagname, namespace, stackOnly, @[])

proc insertHTMLElement[Handle, Atom](parser: var HTML5Parser[Handle, Atom],
    token: Token): Handle =
  ## Inserts an HTML-namespace element for `token`.
  return parser.insertForeignElement(token, Namespace.HTML, false)

# Note: adjustMathMLAttributes and adjustSVGAttributes both include the "adjust
# foreign attributes" step as well.
proc adjustMathMLAttributes[Handle, Atom](parser: var HTML5Parser[Handle, Atom],
    htmlAttrs: var Table[Atom, string], xmlAttrs: var seq[ParsedAttr[Atom]]) =
  ## Moves attributes listed in `foreignTable` from `htmlAttrs` to
  ## `xmlAttrs`, and renames the definitionurl attribute to "definitionURL".
  var deleted: seq[Atom] = @[]
  for k, v in htmlAttrs.mpairs:
    parser.foreignTable.withValue(k, p):
      xmlAttrs.add((p[].prefix, p[].namespace, p[].localName, v))
      deleted.add(k)
  var v: string = ""
  if htmlAttrs.pop(parser.tagTypeToAtom(TAG_DEFINITION_URL), v):
    htmlAttrs[parser.strToAtom("definitionURL")] = v
  # Deletion is deferred so the table is not modified while iterating it.
  for k in deleted:
    htmlAttrs.del(k)

proc adjustSVGAttributes[Handle, Atom](parser: var HTML5Parser[Handle, Atom],
    htmlAttrs: var Table[Atom, string], xmlAttrs: var seq[ParsedAttr[Atom]]) =
  ## Moves attributes listed in `foreignTable` from `htmlAttrs` to
  ## `xmlAttrs`, and renames attributes according to `adjustedTable`.
  var deleted: seq[Atom] = @[]
  for k, v in htmlAttrs:
    parser.foreignTable.withValue(k, p):
      xmlAttrs.add((p[].prefix, p[].namespace, p[].localName, v))
      deleted.add(k)
  for k, ak in parser.adjustedTable:
    var v: string = ""
    if htmlAttrs.pop(k, v):
      htmlAttrs[ak] = v
  # Deletion is deferred so the table is not modified while iterating it.
  for k in deleted:
    htmlAttrs.del(k)

proc insertCharacter(parser: var HTML5Parser, data: string) =
  ## Inserts text at the appropriate place for inserting a node. Text that
  ## would land directly inside the document is dropped.
  let location = parser.appropriatePlaceForInsert()
  if location.inside == parser.getDocument():
    return
  parser.insertText(location.inside, data, location.before)

proc insertComment[Handle, Atom](parser: var HTML5Parser[Handle, Atom],
    token: Token, position: AdjustedInsertionLocation[Handle]) =
  ## Creates a comment from `token.data` and inserts it at `position`.
  let comment = parser.createComment(token.data)
  parser.insert(position, comment)

proc insertComment(parser: var HTML5Parser, token: Token) =
  ## Inserts a comment at the appropriate place for inserting a node.
  let position = parser.appropriatePlaceForInsert()
  parser.insertComment(token, position)

# Public identifiers that select quirks mode on an exact (case-insensitive)
# match.
const PublicIdentifierEquals = [
  "-//W3O//DTD W3 HTML Strict 3.0//EN//",
  "-/W3C/DTD HTML 4.0 Transitional/EN",
  "HTML"
]

# Public identifier prefixes that select quirks mode.
const PublicIdentifierStartsWith = [
  "+//Silmaril//dtd html Pro v0r11 19970101//",
  "-//AS//DTD HTML 3.0 asWedit + extensions//",
  "-//AdvaSoft Ltd//DTD HTML 3.0 asWedit + extensions//",
  "-//IETF//DTD HTML 2.0 Level 1//",
  "-//IETF//DTD HTML 2.0 Level 2//",
  "-//IETF//DTD HTML 2.0 Strict Level 1//",
  "-//IETF//DTD HTML 2.0 Strict Level 2//",
  "-//IETF//DTD HTML 2.0 Strict//",
  "-//IETF//DTD HTML 2.0//",
  "-//IETF//DTD HTML 2.1E//",
  "-//IETF//DTD HTML 3.0//",
  "-//IETF//DTD HTML 3.2 Final//",
  "-//IETF//DTD HTML 3.2//",
  "-//IETF//DTD HTML 3//",
  "-//IETF//DTD HTML Level 0//",
  "-//IETF//DTD HTML Level 1//",
  "-//IETF//DTD HTML Level 2//",
  "-//IETF//DTD HTML Level 3//",
  "-//IETF//DTD HTML Strict Level 0//",
  "-//IETF//DTD HTML Strict Level 1//",
  "-//IETF//DTD HTML Strict Level 2//",
  "-//IETF//DTD HTML Strict Level 3//",
  "-//IETF//DTD HTML Strict//",
  "-//IETF//DTD HTML//",
  "-//Metrius//DTD Metrius Presentational//",
  "-//Microsoft//DTD Internet Explorer 2.0 HTML Strict//",
  "-//Microsoft//DTD Internet Explorer 2.0 HTML//",
  "-//Microsoft//DTD Internet Explorer 2.0 Tables//",
  "-//Microsoft//DTD Internet Explorer 3.0 HTML Strict//",
  "-//Microsoft//DTD Internet Explorer 3.0 HTML//",
  "-//Microsoft//DTD Internet Explorer 3.0 Tables//",
  "-//Netscape Comm. Corp.//DTD HTML//",
  "-//Netscape Comm. Corp.//DTD Strict HTML//",
  "-//O'Reilly and Associates//DTD HTML 2.0//",
  "-//O'Reilly and Associates//DTD HTML Extended 1.0//",
  "-//O'Reilly and Associates//DTD HTML Extended Relaxed 1.0//",
  "-//SQ//DTD HTML 2.0 HoTMetaL + extensions//",
  "-//SoftQuad Software//DTD HoTMetaL PRO 6.0::19990601::extensions to HTML 4.0//",
  "-//SoftQuad//DTD HoTMetaL PRO 4.0::19971010::extensions to HTML 4.0//",
  "-//Spyglass//DTD HTML 2.0 Extended//",
  "-//Sun Microsystems Corp.//DTD HotJava HTML//",
  "-//Sun Microsystems Corp.//DTD HotJava Strict HTML//",
  "-//W3C//DTD HTML 3 1995-03-24//",
  "-//W3C//DTD HTML 3.2 Draft//",
  "-//W3C//DTD HTML 3.2 Final//",
  "-//W3C//DTD HTML 3.2//",
  "-//W3C//DTD HTML 3.2S Draft//",
  "-//W3C//DTD HTML 4.0 Frameset//",
  "-//W3C//DTD HTML 4.0 Transitional//",
  "-//W3C//DTD HTML Experimental 19960712//",
  "-//W3C//DTD HTML Experimental 970421//",
  "-//W3C//DTD W3 HTML//",
  "-//W3O//DTD W3 HTML 3.0//",
  "-//WebTechs//DTD Mozilla HTML 2.0//",
  "-//WebTechs//DTD Mozilla HTML//",
]

# Quirks mode, but only when the system identifier is missing.
const SystemIdentifierMissingAndPublicIdentifierStartsWith = [
  "-//W3C//DTD HTML 4.01 Frameset//",
  "-//W3C//DTD HTML 4.01 Transitional//"
]

# Limited-quirks mode regardless of the system identifier.
const PublicIdentifierStartsWithLimited = [
  "-//W3C//DTD XHTML 1.0 Frameset//",
  "-//W3C//DTD XHTML 1.0 Transitional//"
]

# Limited-quirks mode, but only when the system identifier is present.
const SystemIdentifierNotMissingAndPublicIdentifierStartsWith = [
  "-//W3C//DTD HTML 4.01 Frameset//",
  "-//W3C//DTD HTML 4.01 Transitional//"
]

func startsWithNoCase(str, prefix: string): bool =
  ## ASCII case-insensitive `startsWith`.
  if str.len < prefix.len: return false
  # From here on prefix.len <= str.len, so str[i] is always in bounds.
  var i = 0
  while i != prefix.len:
    if str[i].toLowerAscii() != prefix[i].toLowerAscii(): return false
    inc i
  true

func equalsIgnoreCase(s1, s2: string): bool {.inline.} =
  ## Case-insensitive string equality (via `cmpIgnoreCase`).
  return s1.cmpIgnoreCase(s2) == 0

func quirksConditions(token: Token): bool =
  ## True if the DOCTYPE `token` puts the document into quirks mode.
  ## Public and system identifiers are compared ASCII case-insensitively.
  if token.quirks: return true
  if token.name.get("") != "html": return true
  # The system identifier must be matched case-insensitively, just like the
  # public identifiers below.
  if token.sysid.get("").equalsIgnoreCase(
      "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"):
    return true
  if token.pubid.isSome:
    let pubid = token.pubid.get
    for id in PublicIdentifierEquals:
      if pubid.equalsIgnoreCase(id):
        return true
    for id in PublicIdentifierStartsWith:
      if pubid.startsWithNoCase(id):
        return true
    if token.sysid.isNone:
      for id in SystemIdentifierMissingAndPublicIdentifierStartsWith:
        if pubid.startsWithNoCase(id):
          return true
  return false

func limitedQuirksConditions(token: Token): bool =
  ## True if the DOCTYPE `token` puts the document into limited-quirks mode.
  if token.pubid.isNone: return false
  for id in PublicIdentifierStartsWithLimited:
    if token.pubid.get.startsWithNoCase(id):
      return true
  if token.sysid.isNone: return false
  for id in SystemIdentifierNotMissingAndPublicIdentifierStartsWith:
    if token.pubid.get.startsWithNoCase(id):
      return true
  return false

# 13.2.6.2
proc genericRawtextElementParsingAlgorithm(parser: var HTML5Parser,
    token: Token) =
  ## Inserts an HTML element for `token`, switches the tokenizer to RAWTEXT
  ## and the insertion mode to TEXT, remembering the previous mode.
  discard parser.insertHTMLElement(token)
  parser.tokenizer.state = RAWTEXT
  parser.oldInsertionMode = parser.insertionMode
  parser.insertionMode = TEXT

proc genericRCDATAElementParsingAlgorithm(parser: var HTML5Parser,
    token: Token) =
  ## Inserts an HTML element for `token`, switches the tokenizer to RCDATA
  ## and the insertion mode to TEXT, remembering the previous mode.
  discard parser.insertHTMLElement(token)
  parser.tokenizer.state = RCDATA
  parser.oldInsertionMode = parser.insertionMode
  parser.insertionMode = TEXT

# Pop all elements, including the specified tag.
proc popElementsIncl(parser: var HTML5Parser, tags: set[TagType]) =
  ## Pops until an element with a tag type in `tags` has been popped. Such
  ## an element must be on the stack.
  while parser.getTagType(parser.popElement()) notin tags:
    discard

proc popElementsIncl(parser: var HTML5Parser, tag: TagType) =
  ## Pops until an element of type `tag` has been popped.
  parser.popElementsIncl({tag})

# Pop all elements, including the specified element.
proc popElementsIncl[Handle, Atom](parser: var HTML5Parser[Handle, Atom],
    handle: Handle) =
  ## Pops until `handle` itself has been popped. `handle` must be on the
  ## stack of open elements.
  while parser.popElement() != handle:
    discard

# https://html.spec.whatwg.org/multipage/parsing.html#closing-elements-that-have-implied-end-tags
proc generateImpliedEndTags(parser: var HTML5Parser) =
  ## Pops the current node for as long as it is one of the elements with an
  ## implied end tag.
  const tags = {
    TAG_DD, TAG_DT, TAG_LI, TAG_OPTGROUP, TAG_OPTION, TAG_P, TAG_RB, TAG_RP,
    TAG_RT, TAG_RTC
  }
  while parser.getTagType(parser.currentNode) in tags:
    discard parser.popElement()

proc generateImpliedEndTags(parser: var HTML5Parser, exclude: TagType) =
  ## Same as above, except that elements of type `exclude` are not popped.
  let tags = {
    TAG_DD, TAG_DT, TAG_LI, TAG_OPTGROUP, TAG_OPTION, TAG_P, TAG_RB, TAG_RP,
    TAG_RT, TAG_RTC
  } - {exclude}
  while parser.getTagType(parser.currentNode) in tags:
    discard parser.popElement()

proc generateImpliedEndTagsThoroughly(parser: var HTML5Parser) =
  ## Like `generateImpliedEndTags`, with table-related elements included.
  const tags = {
    TAG_CAPTION, TAG_COLGROUP, TAG_DD, TAG_DT, TAG_LI, TAG_OPTGROUP,
    TAG_OPTION, TAG_P, TAG_RB, TAG_RP, TAG_RT, TAG_RTC, TAG_TBODY, TAG_TD,
    TAG_TFOOT, TAG_TH, TAG_THEAD, TAG_TR
  }
  while parser.getTagType(parser.currentNode) in tags:
    discard parser.popElement()

# https://html.spec.whatwg.org/multipage/parsing.html#push-onto-the-list-of-active-formatting-elements
proc pushOntoActiveFormatting[Handle, Atom](
    parser: var HTML5Parser[Handle, Atom], element: Handle, token: Token) =
  ## Appends `element` to the list of active formatting elements. If three
  ## entries after the last marker already share its tag name, namespace
  ## and attributes, the earliest of those three is removed first.
  var count = 0
  for i in countdown(parser.activeFormatting.high, 0):
    let it = parser.activeFormatting[i]
    if it[0].isNone: # marker
      break
    if it[1].tagname != token.tagname:
      continue
    if parser.getNamespace(it[0].get) != parser.getNamespace(element):
      continue
    if it[1].attrs != token.attrs:
      continue
    inc count
    if count == 3:
      parser.activeFormatting.delete(i)
      break
  parser.activeFormatting.add((some(element), token))

proc findOpenElement[Handle, Atom](parser: var HTML5Parser[Handle, Atom],
    element: Handle): int =
  ## Index of `element` in the stack of open elements, or -1.
  for i, it in parser.openElements:
    if it.element == element:
      return i
  return -1

proc reconstructActiveFormatting[Handle, Atom](
    parser: var HTML5Parser[Handle, Atom]) =
  ## Re-creates and re-opens the trailing active formatting elements that
  ## are no longer on the stack of open elements. Does nothing if the list
  ## is empty, or if its last entry is a marker or is still open.
  type State = enum
    REWIND, ADVANCE, CREATE
  if parser.activeFormatting.len == 0 or
      parser.activeFormatting[^1][0].isNone or
      parser.findOpenElement(parser.activeFormatting[^1][0].get) != -1:
    return
  var i = parser.activeFormatting.high
  template entry: Option[Handle] = (parser.activeFormatting[i][0])
  var state = REWIND
  while true:
    case state
    of REWIND:
      # Walk back until the start of the list, a marker, or an entry that
      # is still open.
      if i == 0:
        state = CREATE
        continue
      dec i
      if entry.isSome and parser.findOpenElement(entry.get) == -1:
        continue
      state = ADVANCE
    of ADVANCE:
      inc i
      state = CREATE
    of CREATE:
      # Replace the entry with a freshly inserted element for its token.
      let element = parser.insertHTMLElement(parser.activeFormatting[i][1])
      parser.activeFormatting[i] = (
        some(element), parser.activeFormatting[i][1]
      )
      if i != parser.activeFormatting.high:
        state = ADVANCE
        continue
      break

proc clearActiveFormattingTillMarker(parser: var HTML5Parser) =
  ## Pops entries off the list of active formatting elements up to and
  ## including the last marker (or until the list is empty).
  while parser.activeFormatting.len > 0 and
      parser.activeFormatting.pop()[0].isSome:
    discard

proc isMathMLIntegrationPoint[Handle, Atom](parser: HTML5Parser[Handle, Atom],
    element: Handle): bool =
  ## True for the MathML elements mi, mo, mn, ms and mtext.
  if parser.getNamespace(element) != Namespace.MATHML:
    return false
  let tagType = parser.atomToTagType(parser.getLocalName(element))
  return tagType in {TAG_MI, TAG_MO, TAG_MN, TAG_MS, TAG_MTEXT}

proc isHTMLIntegrationPoint[Handle, Atom](parser: HTML5Parser[Handle, Atom],
    oe: OpenElement[Handle, Atom]): bool =
  ## True for a MathML annotation-xml element whose start tag had an
  ## encoding attribute of "text/html" or "application/xhtml+xml"
  ## (case-insensitively), and for SVG foreignObject, desc and title.
  let (element, token) = oe
  let localName = parser.getLocalName(element)
  let namespace = parser.getNamespace(element)
  let tagType = parser.atomToTagType(localName)
  if namespace == Namespace.MATHML:
    if tagType == TAG_ANNOTATION_XML:
      token.attrs.withValue(parser.tagTypeToAtom(TAG_ENCODING), p):
        return p[].equalsIgnoreCase("text/html") or
          p[].equalsIgnoreCase("application/xhtml+xml")
  elif namespace == Namespace.SVG:
    return tagType in {TAG_FOREIGN_OBJECT, TAG_DESC, TAG_TITLE}
  return false

const AsciiWhitespace = {' ', '\n', '\r', '\t', '\f'}

func until(s: string, c1, c2: char, starti: int): string =
  ## Returns the part of `s` from `starti` up to, but not including, the
  ## first occurrence of `c1` or `c2` (or up to the end of `s`).
  result = ""
  for i in starti ..< s.len:
    if s[i] == c1 or s[i] == c2:
      break
    result &= s[i]

func extractEncFromMeta(s: string): string =
  ## Extracts the value following `charset=` from `s` (the "algorithm for
  ## extracting a character encoding from a meta element"). Returns the raw
  ## label, or "" when none is found.
  ##
  ## * "charset" is matched ASCII case-insensitively; occurrences not
  ##   followed by `=` (after optional whitespace) are skipped.
  ## * A value starting with `"` or `'` runs up to the next matching quote;
  ##   an unmatched quote yields "".
  ## * An unquoted value runs up to the first ASCII whitespace or `;`.
  const Needle = "charset"
  var i = 0
  while true: # Loop:
    var j = 0 # number of Needle characters matched so far
    while i < s.len and j < Needle.len:
      let c = s[i].toLowerAscii()
      if c == Needle[j]:
        inc j
      elif c == Needle[0]:
        # The mismatching character may itself start a new match
        # (e.g. "ccharset"). No other restart position is possible, since
        # all characters of the needle are distinct.
        j = 1
      else:
        j = 0
      inc i
    if j < Needle.len:
      return ""
    while i < s.len and s[i] in AsciiWhitespace:
      inc i
    if i >= s.len or s[i] != '=':
      # Not an assignment; resume the search from the current position.
      continue
    inc i # skip '='
    while i < s.len and s[i] in AsciiWhitespace:
      inc i
    break
  if i >= s.len:
    return ""
  if s[i] in {'"', '\''}:
    let quote = s[i]
    let s2 = s.until(quote, quote, i + 1)
    # `until` also stops at the end of the string; that means the closing
    # quote is missing.
    if i + 1 + s2.len >= s.len:
      return ""
    return s2
  var k = i
  while k < s.len and s[k] != ';' and s[k] notin AsciiWhitespace:
    inc k
  return s.substr(i, k - 1)

# Find a node in the list of active formatting elements, or return -1.
func findLastActiveFormatting[Handle, Atom](
    parser: var HTML5Parser[Handle, Atom], node: Handle): int =
  ## Searches from the end of the list; markers never match.
  for i in countdown(parser.activeFormatting.high, 0):
    let it = parser.activeFormatting[i][0]
    if it.isSome and it.get == node:
      return i
  return -1

# > the last element in the list of active formatting elements that:
# > is between the end of the list and the last marker in the list, if any,
# > or the start of the list otherwise, and has the tag name subject.
func findLastActiveFormattingAfterMarker(parser: var HTML5Parser,
    token: Token): int =
  ## Scans the list of active formatting elements from its end and returns
  ## the index of the first entry whose token has the same tag name as
  ## `token`. The scan stops at the first entry whose token is nil (a
  ## marker); -1 is returned if no match was found before that point.
  for i in countdown(parser.activeFormatting.high, 0):
    let it = parser.activeFormatting[i][1]
    if it == nil: break # marker
    if it.tagname == token.tagname:
      return i
  return -1

proc findLastActiveFormattingAfterMarker(parser: var HTML5Parser,
    tagType: TagType): int =
  ## Same scan as the token overload, but entries are compared by the tag
  ## type of their element handle, and an entry without a handle is treated
  ## as the marker that ends the scan. Returns -1 if nothing matched.
  for i in countdown(parser.activeFormatting.high, 0):
    let it = parser.activeFormatting[i][0]
    if it.isNone: break
    if parser.getTagType(it.get) == tagType:
      return i
  return -1

#https://html.spec.whatwg.org/multipage/parsing.html#the-stack-of-open-elements
# Tag types of the HTML-namespace elements in the "special" category; used
# by isSpecialElement below.
const SpecialElements = {
  TAG_ADDRESS, TAG_APPLET, TAG_AREA, TAG_ARTICLE, TAG_ASIDE, TAG_BASE,
  TAG_BASEFONT, TAG_BGSOUND, TAG_BLOCKQUOTE, TAG_BODY, TAG_BR, TAG_BUTTON,
  TAG_CAPTION, TAG_CENTER, TAG_COL, TAG_COLGROUP, TAG_DD, TAG_DETAILS, TAG_DIR,
  TAG_DIV, TAG_DL, TAG_DT, TAG_EMBED, TAG_FIELDSET, TAG_FIGCAPTION, TAG_FIGURE,
  TAG_FOOTER, TAG_FORM, TAG_FRAME, TAG_FRAMESET, TAG_H1, TAG_H2, TAG_H3, TAG_H4,
  TAG_H5, TAG_H6, TAG_HEAD, TAG_HEADER, TAG_HGROUP, TAG_HR, TAG_HTML,
  TAG_IFRAME, TAG_IMG, TAG_INPUT, TAG_KEYGEN, TAG_LI, TAG_LINK, TAG_LISTING,
  TAG_MAIN, TAG_MARQUEE, TAG_MENU, TAG_META, TAG_NAV, TAG_NOEMBED, TAG_NOFRAMES,
  TAG_NOSCRIPT, TAG_OBJECT, TAG_OL, TAG_P, TAG_PARAM, TAG_PLAINTEXT, TAG_PRE,
  TAG_SCRIPT, TAG_SEARCH, TAG_SECTION, TAG_SELECT, TAG_SOURCE, TAG_STYLE,
  TAG_SUMMARY, TAG_TABLE, TAG_TBODY, TAG_TD, TAG_TEMPLATE, TAG_TEXTAREA,
  TAG_TFOOT, TAG_TH, TAG_THEAD, TAG_TITLE, TAG_TR, TAG_TRACK, TAG_UL, TAG_WBR,
  TAG_XMP
}

proc isSpecialElement[Handle, Atom](parser: HTML5Parser[Handle, Atom],
    element: Handle): bool =
  ## Returns true if `element` belongs to the "special" category. Which tag
  ## types count depends on the element's namespace: SpecialElements for
  ## HTML, a fixed set of six tag types for MathML and three for SVG.
  ## Elements in any other namespace are never special.
  let tagType = parser.atomToTagType(parser.getLocalName(element))
  case parser.getNamespace(element)
  of Namespace.HTML:
    {.linearScanEnd.}
    return tagType in SpecialElements
  of Namespace.MATHML:
    const elements = {
      TAG_MI, TAG_MO, TAG_MN, TAG_MS, TAG_MTEXT, TAG_ANNOTATION_XML
    }
    return tagType in elements
  of Namespace.SVG:
    return tagType in {TAG_FOREIGN_OBJECT, TAG_DESC, TAG_TITLE}
  else:
    return false

# > Let furthestBlock be the topmost node in the stack of open elements that
# > is lower in the stack than formattingElement, and is an element in the
# > special category. There might not be one.
proc findFurthestBlockAfter(parser: HTML5Parser, stackIndex: int): int =
  ## Returns the index of the first special element in the stack of open
  ## elements at or after `stackIndex`, or -1 if there is none.
  # NOTE(review): the scan starts at stackIndex itself rather than at
  # stackIndex + 1; presumably harmless because the formatting element found
  # at stackIndex is never in the special category - confirm.
  for i in stackIndex ..< parser.openElements.len:
    if parser.isSpecialElement(parser.openElements[i].element):
      return i
  return -1

proc findLastActiveFormatting[Handle, Atom](
    parser: var HTML5Parser[Handle, Atom], tagTypes: set[TagType]): int =
  ## Returns the index of the last entry in the list of active formatting
  ## elements whose element has a tag type contained in `tagTypes`, or -1.
  ## Entries without an element handle are skipped; unlike the
  ## ...AfterMarker variants, the scan does not stop at them.
  for i in countdown(parser.activeFormatting.high, 0):
    let it = parser.activeFormatting[i][0]
    if it.isSome and parser.getTagType(it.get) in tagTypes:
      return i
  return -1

# If true is returned, call "any other end tag".
proc adoptionAgencyAlgorithm[Handle, Atom](
    parser: var HTML5Parser[Handle, Atom], token: Token): bool =
  ## Runs the adoption agency algorithm for the tag name of `token`.
  ## `formattingIndex`, `bookmark` and `nodeFormattingIndex` index into
  ## parser.activeFormatting; `stackIndex`, `furthestBlockIndex` and
  ## `nodeStackIndex` index into parser.openElements. Both sequences are
  ## mutated while the algorithm runs, so the indices are adjusted by hand
  ## after each insert/delete.
  # Shortcut: the current node has the subject tag name and is not in the
  # list of active formatting elements, so it is simply popped.
  if parser.currentToken.tagname == token.tagname and
      parser.findLastActiveFormatting(parser.currentNode) == -1:
    pop_current_node
    return false
  for i in 0 ..< 8: # outer loop
    var formattingIndex = parser.findLastActiveFormattingAfterMarker(token)
    if formattingIndex == -1:
      # no such element
      return true
    let formatting = parser.activeFormatting[formattingIndex][0].get
    let stackIndex = parser.findOpenElement(formatting)
    if stackIndex < 0:
      # The formatting element is not on the stack of open elements: only
      # its active formatting entry is dropped.
      parser.activeFormatting.delete(formattingIndex)
      return false
    if not parser.hasElementInScope(formatting):
      return false
    var furthestBlockIndex = parser.findFurthestBlockAfter(stackIndex)
    if furthestBlockIndex == -1:
      # No furthest block: pop the stack up to and including the formatting
      # element and remove its active formatting entry.
      parser.popElementsIncl(formatting)
      parser.activeFormatting.delete(formattingIndex)
      return false
    let furthestBlock = parser.openElements[furthestBlockIndex].element
    let commonAncestor = parser.openElements[stackIndex - 1].element
    var bookmark = formattingIndex
    var node = furthestBlock
    # aboveNode is tracked separately because node's own stack slot may be
    # deleted or replaced inside the loop below.
    var aboveNode = parser.openElements[furthestBlockIndex - 1].element
    var lastNode = furthestBlock
    var j = 0
    while true: # inner loop: walk up the stack towards the formatting element
      inc j
      node = aboveNode
      if node == formatting:
        break
      let nodeStackIndex = parser.findOpenElement(node)
      var nodeFormattingIndex = parser.findLastActiveFormatting(node)
      if j > 3 and nodeFormattingIndex >= 0:
        parser.activeFormatting.delete(nodeFormattingIndex)
        if nodeFormattingIndex < bookmark:
          dec bookmark # a previous node got deleted, so decrement bookmark
        nodeFormattingIndex = -1 # deleted, so set to -1
      if nodeFormattingIndex < 0:
        # node is not (or no longer) an active formatting element: remove it
        # from the stack of open elements and continue with the one above.
        aboveNode = parser.openElements[nodeStackIndex - 1].element
        parser.openElements.delete(nodeStackIndex)
        if nodeStackIndex < furthestBlockIndex:
          # The deletion shifted furthestBlock down by one slot.
          dec furthestBlockIndex
          let element = parser.openElements[furthestBlockIndex].element
          assert furthestBlock == element
        continue
      # Replace node by a fresh element created from the same token, both in
      # the list of active formatting elements and on the stack.
      let tok = parser.activeFormatting[nodeFormattingIndex][1]
      let element = parser.createHTMLElementForToken(tok, commonAncestor)
      parser.activeFormatting[nodeFormattingIndex] = (some(element), tok)
      parser.openElements[nodeStackIndex] = (element, tok)
      aboveNode = parser.openElements[nodeStackIndex - 1].element
      node = element
      if lastNode == furthestBlock:
        bookmark = nodeFormattingIndex + 1
      # Re-parent lastNode under the replacement element.
      parser.remove(lastNode)
      parser.append(node, lastNode)
      lastNode = node
    # Move lastNode to the insertion location computed for commonAncestor.
    parser.remove(lastNode)
    let location = parser.appropriatePlaceForInsert(commonAncestor)
    parser.insert(location, lastNode)
    # Create a new element from the formatting element's token, move the
    # children of furthestBlock into it and append it to furthestBlock.
    let token = parser.activeFormatting[formattingIndex][1]
    let element = parser.createHTMLElementForToken(token, furthestBlock)
    parser.moveChildren(furthestBlock, element)
    parser.append(furthestBlock, element)
    # Swap the old formatting entry for the new element at the bookmark.
    parser.activeFormatting.insert((some(element), token), bookmark)
    if formattingIndex >= bookmark:
      inc formattingIndex # increment because of insert
    parser.activeFormatting.delete(formattingIndex)
    # On the stack, the new element goes right below furthestBlock. The
    # insert happens at an index greater than stackIndex, so stackIndex
    # still refers to the old formatting element when it is deleted.
    parser.openElements.insert((element, token), furthestBlockIndex + 1)
    parser.openElements.delete(stackIndex)
  return false

proc closeP(parser: var HTML5Parser, sure = false) =
  ## Closes a p element: generates implied end tags (passing TAG_P to
  ## generateImpliedEndTags), then pops elements up to and including the p
  ## element. When `sure` is false, this is only done if a p element is in
  ## button scope; pass `sure = true` to skip that check.
  if sure or parser.hasElementInButtonScope(TAG_P):
    parser.generateImpliedEndTags(TAG_P)
    parser.popElementsIncl(TAG_P)

proc newStartTagToken[Handle, Atom](parser: HTML5Parser[Handle, Atom],
    t: TagType): Token[Atom] =
  ## Creates a START_TAG token whose tag name is the atom for tag type `t`.
  ## No other token field is set explicitly.
  return Token[Atom](t: START_TAG, tagname: parser.tagTypeToAtom(t))

# Following is an implementation of the state (?) machine defined in
# https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inhtml
# It uses the ad-hoc pattern matching macro `match' to apply the following
# transformations:
# * First, pairs of patterns and actions are stored in tuples (and `discard'
#   statements...)
# * These pairs are then assigned to token types, later mapped to legs of the
#   first case statement.
# * Another case statement is constructed where needed, e.g. for switching on
#   characters/tags/etc.
# * Finally, the whole thing is wrapped in a named block, to implement a
#   pseudo-goto by breaking out only when the else statement needn't be
#   executed.
#
# For example, the following code:
#
#   match token:
#     TokenType.COMMENT => (block: echo "comment")
#     ("