import macros
import options
import sequtils
import streams
import strformat
import strutils
import tables
import unicode

import css/sheet
import data/charset
import html/dom
import html/tags
import html/htmltokenizer
import encoding/decoderstream
import js/javascript
import utils/twtstr

type
  CharsetConfidence = enum
    CONFIDENCE_TENTATIVE, CONFIDENCE_CERTAIN, CONFIDENCE_IRRELEVANT

  DOMParser = ref object # JS interface

  OpenElements = seq[Element]

  # State of one tree-construction run. `ctx` only exists for fragment
  # parsing; reading it while `fragment` is false is a field defect.
  HTML5Parser = object
    case fragment: bool
    of true: ctx: Element
    else: discard
    needsreinterpret: bool
    charset: Charset
    confidence: CharsetConfidence
    openElements: OpenElements
    insertionMode: InsertionMode
    oldInsertionMode: InsertionMode
    templateModes: seq[InsertionMode]
    head: Element
    tokenizer: Tokenizer
    document: Document
    form: HTMLFormElement
    fosterParenting: bool
    scripting: bool
    activeFormatting: seq[(Element, Token)] # nil => marker
    framesetok: bool
    ignoreLF: bool
    pendingTableChars: string
    pendingTableCharsWhitespace: bool

  # `before == nil` means "append as the last child of `inside`".
  AdjustedInsertionLocation = tuple[inside: Node, before: Node]

  # 13.2.4.1
  InsertionMode = enum
    INITIAL, BEFORE_HTML, BEFORE_HEAD, IN_HEAD, IN_HEAD_NOSCRIPT, AFTER_HEAD,
    IN_BODY, TEXT, IN_TABLE, IN_TABLE_TEXT, IN_CAPTION, IN_COLUMN_GROUP,
    IN_TABLE_BODY, IN_ROW, IN_CELL, IN_SELECT, IN_SELECT_IN_TABLE, IN_TEMPLATE,
    AFTER_BODY, IN_FRAMESET, AFTER_FRAMESET, AFTER_AFTER_BODY,
    AFTER_AFTER_FRAMESET

proc resetInsertionMode(parser: var HTML5Parser) =
  ## Resets `parser.insertionMode` by walking the stack of open elements from
  ## the most recently pushed entry down to the first one, and switching to
  ## the mode belonging to the first element that determines one.
  template switch_insertion_mode_and_return(mode: InsertionMode) =
    parser.insertionMode = mode
    return
  for i in countdown(parser.openElements.high, 0):
    var node = parser.openElements[i]
    let last = i == 0
    # The context element only stands in for the bottom-most stack entry.
    if last and parser.fragment:
      node = parser.ctx
    if node.tagType == TAG_SELECT:
      if not last:
        # Look at the entries below the select (index 0 needs no check here
        # as it can match neither template nor table when it is <html>).
        for j in countdown(i - 1, 1):
          let ancestor = parser.openElements[j]
          case ancestor.tagType
          of TAG_TEMPLATE: break
          of TAG_TABLE: switch_insertion_mode_and_return IN_SELECT_IN_TABLE
          else: discard
      switch_insertion_mode_and_return IN_SELECT
    case node.tagType
    of TAG_TD, TAG_TH:
      if not last:
        switch_insertion_mode_and_return IN_CELL
    of TAG_TR: switch_insertion_mode_and_return IN_ROW
    of TAG_TBODY, TAG_THEAD, TAG_TFOOT:
      switch_insertion_mode_and_return IN_TABLE_BODY
    of TAG_CAPTION:
      switch_insertion_mode_and_return IN_CAPTION
    of TAG_COLGROUP: switch_insertion_mode_and_return IN_COLUMN_GROUP
    of TAG_TABLE: switch_insertion_mode_and_return IN_TABLE
    of TAG_TEMPLATE:
      switch_insertion_mode_and_return parser.templateModes[^1]
    of TAG_HEAD:
      if not last:
        switch_insertion_mode_and_return IN_HEAD
    of TAG_BODY: switch_insertion_mode_and_return IN_BODY
    of TAG_FRAMESET: switch_insertion_mode_and_return IN_FRAMESET
    of TAG_HTML:
      # No head element seen yet => we are still before the head.
      if parser.head == nil:
        switch_insertion_mode_and_return BEFORE_HEAD
      else:
        switch_insertion_mode_and_return AFTER_HEAD
    else: discard
    if last:
      switch_insertion_mode_and_return IN_BODY

func currentNode(parser: HTML5Parser): Element =
  ## Returns the top of the stack of open elements; asserts when it is empty.
  if parser.openElements.len == 0:
    assert false
  else:
    return parser.openElements[^1]

func adjustedCurrentNode(parser: HTML5Parser): Element =
  ## Returns the context element when parsing a fragment, the current node
  ## otherwise.
  if parser.fragment: parser.ctx
  else: parser.currentNode

# Parse errors are currently ignored.
template parse_error() = discard

func lastElementOfTag(parser: HTML5Parser, tagType: TagType):
    tuple[element: Element, pos: int] =
  ## Returns the most recently pushed open element with the given tag type
  ## together with its stack index, or `(nil, -1)` when there is none.
  for i in countdown(parser.openElements.high, 0):
    if parser.openElements[i].tagType == tagType:
      return (parser.openElements[i], i)
  return (nil, -1)

template last_child_of(n: Node): AdjustedInsertionLocation =
  (n, nil)

# 13.2.6.1
func appropriatePlaceForInsert(parser: HTML5Parser, target: Element):
    AdjustedInsertionLocation =
  ## Computes where a new node should be inserted when `target` is the
  ## override target, taking foster parenting and template contents into
  ## account.
  assert parser.openElements[0].tagType == TAG_HTML
  if parser.fosterParenting and
      target.tagType in {TAG_TABLE, TAG_TBODY, TAG_TFOOT, TAG_THEAD, TAG_TR}:
    let lastTemplate = parser.lastElementOfTag(TAG_TEMPLATE)
    let lastTable = parser.lastElementOfTag(TAG_TABLE)
    if lastTemplate.element != nil and
        (lastTable.element == nil or lastTable.pos < lastTemplate.pos):
      return last_child_of(HTMLTemplateElement(lastTemplate.element).content)
    if lastTable.element == nil:
      return last_child_of(parser.openElements[0])
    if lastTable.element.parentNode != nil:
      # Insert right before the table, inside the table's parent.
      return (lastTable.element.parentNode, lastTable.element)
    let previousElement = parser.openElements[lastTable.pos - 1]
    result = last_child_of(previousElement)
  else:
    result = last_child_of(target)
  if result.inside.nodeType == ELEMENT_NODE and
      Element(result.inside).tagType == TAG_TEMPLATE:
    result = (HTMLTemplateElement(result.inside).content, nil)

func appropriatePlaceForInsert(parser: HTML5Parser): AdjustedInsertionLocation =
  ## Same as above, with the current node as the target.
  parser.appropriatePlaceForInsert(parser.currentNode)

func hasElement(elements: seq[Element], tag: TagType): bool =
  ## True if any element in `elements` has tag type `tag`.
  for element in elements:
    if element.tagType == tag:
      return true
  return false

func hasElementInSpecificScope(elements: seq[Element], target: Element,
    list: set[TagType]): bool =
  ## Scans `elements` from the end: true when `target` is reached first,
  ## false when an element whose tag type is in `list` is reached first.
  ## Asserts if neither is found.
  for i in countdown(elements.high, 0):
    if elements[i] == target:
      return true
    if elements[i].tagType in list:
      return false
  assert false

func hasElementInSpecificScope(elements: seq[Element], target: TagType,
    list: set[TagType]): bool =
  ## Like the `Element` overload, but matches on a tag type.
  for i in countdown(elements.high, 0):
    if elements[i].tagType == target:
      return true
    if elements[i].tagType in list:
      return false
  assert false

func hasElementInSpecificScope(elements: seq[Element], target: set[TagType],
    list: set[TagType]): bool =
  ## Like the `Element` overload, but matches on any tag type in `target`.
  for i in countdown(elements.high, 0):
    if elements[i].tagType in target:
      return true
    if elements[i].tagType in list:
      return false
  assert false

const Scope = {
  TAG_APPLET, TAG_CAPTION, TAG_HTML, TAG_TABLE, TAG_TD, TAG_TH, TAG_MARQUEE,
  TAG_OBJECT, TAG_TEMPLATE
} #TODO SVG (NOTE MathML not implemented)

func hasElementInScope(elements: seq[Element], target: TagType): bool =
  return elements.hasElementInSpecificScope(target, Scope)

func hasElementInScope(elements: seq[Element], target: set[TagType]): bool =
  return elements.hasElementInSpecificScope(target, Scope)

func hasElementInScope(elements: seq[Element], target: Element): bool =
  return elements.hasElementInSpecificScope(target, Scope)

func hasElementInListItemScope(elements: seq[Element], target: TagType): bool =
  return elements.hasElementInSpecificScope(target, Scope + {TAG_OL, TAG_UL})

func hasElementInButtonScope(elements: seq[Element], target: TagType): bool =
  return elements.hasElementInSpecificScope(target, Scope + {TAG_BUTTON})

func hasElementInTableScope(elements: seq[Element], target: TagType): bool =
  return elements.hasElementInSpecificScope(target,
    {TAG_HTML, TAG_TABLE, TAG_TEMPLATE})

func hasElementInTableScope(elements: seq[Element],
    target: set[TagType]): bool =
  return elements.hasElementInSpecificScope(target,
    {TAG_HTML, TAG_TABLE, TAG_TEMPLATE})

func hasElementInSelectScope(elements: seq[Element], target: TagType): bool =
  ## Select scope is inverted: every tag type except option and optgroup
  ## terminates the scan.
  for i in countdown(elements.high, 0):
    if elements[i].tagType == target:
      return true
    if elements[i].tagType notin {TAG_OPTION, TAG_OPTGROUP}:
      return false
  assert false

func createElement(parser: HTML5Parser, token: Token, namespace: Namespace,
    intendedParent: Node): Element =
  ## Creates an element for `token` in the document of `intendedParent`,
  ## copies the token's attributes onto it and, when applicable, associates
  ## it with the parser's form element pointer.
  #TODO custom elements
  let document = intendedParent.document
  let localName = token.tagname
  let element = document.newHTMLElement(localName, namespace,
    tagType = token.tagtype)
  element.appendAttributes(token.attrs)
  #for k, v in token.attrs:
  #  element.appendAttribute(k, v)
  if element.isResettable():
    element.resetElement()

  if element.tagType in SupportedFormAssociatedElements and
      parser.form != nil and
      not parser.openElements.hasElement(TAG_TEMPLATE) and
      (element.tagType notin ListedElements or not element.attrb("form")) and
      intendedParent.inSameTree(parser.form):
    let element = FormAssociatedElement(element)
    element.setForm(parser.form)
    element.parserInserted = true
  return element

proc insert(location: AdjustedInsertionLocation, node: Node) =
  ## Inserts `node` into `location.inside`, before `location.before`.
  location.inside.insert(node, location.before)

proc insertForeignElement(parser: var HTML5Parser, token: Token,
    namespace: Namespace): Element =
  ## Creates an element for `token`, inserts it at the appropriate place if
  ## the pre-insertion validity check passes, and pushes it onto the stack of
  ## open elements in either case.
  let location = parser.appropriatePlaceForInsert()
  let element = parser.createElement(token, namespace, location.inside)
  if location.inside.preInsertionValidity(element, location.before):
    #TODO custom elements
    location.insert(element)
  parser.openElements.add(element)
  return element

proc insertHTMLElement(parser: var HTML5Parser, token: Token): Element =
  return parser.insertForeignElement(token, Namespace.HTML)

proc adjustSVGAttributes(token: Token) =
  ## Renames the lowercased SVG attribute names on `token` to their
  ## mixed-case spelling.
  const adjusted = {
    "attributename": "attributeName",
    "attributetype": "attributeType",
    "basefrequency": "baseFrequency",
    "baseprofile": "baseProfile",
    "calcmode": "calcMode",
    "clippathunits": "clipPathUnits",
    "diffuseconstant": "diffuseConstant",
    "edgemode": "edgeMode",
    "filterunits": "filterUnits",
    "glyphref": "glyphRef",
    "gradienttransform": "gradientTransform",
    "gradientunits": "gradientUnits",
    "kernelmatrix": "kernelMatrix",
    "kernelunitlength": "kernelUnitLength",
    "keypoints": "keyPoints",
    "keysplines": "keySplines",
    "keytimes": "keyTimes",
    "lengthadjust": "lengthAdjust",
    "limitingconeangle": "limitingConeAngle",
    "markerheight": "markerHeight",
    "markerunits": "markerUnits",
    "markerwidth": "markerWidth",
    "maskcontentunits": "maskContentUnits",
    "maskunits": "maskUnits",
    "numoctaves": "numOctaves",
    "pathlength": "pathLength",
    "patterncontentunits": "patternContentUnits",
    "patterntransform": "patternTransform",
    "patternunits": "patternUnits",
    "pointsatx": "pointsAtX",
    "pointsaty": "pointsAtY",
    "pointsatz": "pointsAtZ",
    "preservealpha": "preserveAlpha",
    "preserveaspectratio": "preserveAspectRatio",
    "primitiveunits": "primitiveUnits",
    "refx": "refX",
    "refy": "refY",
    "repeatcount": "repeatCount",
    "repeatdur": "repeatDur",
    "requiredextensions": "requiredExtensions",
    "requiredfeatures": "requiredFeatures",
    "specularconstant": "specularConstant",
    "specularexponent": "specularExponent",
    "spreadmethod": "spreadMethod",
    "startoffset": "startOffset",
    "stddeviation": "stdDeviation",
    "stitchtiles": "stitchTiles",
    "surfacescale": "surfaceScale",
    "systemlanguage": "systemLanguage",
    "tablevalues": "tableValues",
    "targetx": "targetX",
    "targety": "targetY",
    "textlength": "textLength",
    "viewbox": "viewBox",
    "viewtarget": "viewTarget",
    "xchannelselector": "xChannelSelector",
    "ychannelselector": "yChannelSelector",
    "zoomandpan": "zoomAndPan",
  }.toTable()
  # Collect the keys first: the table must not change while it is iterated.
  var todo: seq[string]
  for k in token.attrs.keys:
    if k in adjusted:
      todo.add(k)
  for s in todo:
    token.attrs[adjusted[s]] = token.attrs[s]
    # Drop the lowercase spelling so the attribute is renamed, not duplicated.
    token.attrs.del(s)

template insert_character_impl(parser: var HTML5Parser, data: typed) =
  ## Shared body of the `insertCharacter` overloads: appends `data` to the
  ## text node right before the insertion point, or inserts a new text node.
  ## Returns from the calling proc when the insertion target is a document.
  let location = parser.appropriatePlaceForInsert()
  if location.inside.nodeType == DOCUMENT_NODE:
    return
  let insertNode =
    if location.before == nil:
      location.inside.lastChild
    else:
      location.before.previousSibling
  if insertNode != nil and insertNode.nodeType == TEXT_NODE:
    dom.Text(insertNode).data &= data
  else:
    let text = location.inside.document.newText($data)
    location.insert(text)

  if location.inside.nodeType == ELEMENT_NODE:
    let parent = Element(location.inside)
    if parent.tagType == TAG_STYLE:
      # The text of a style element changed; flag its sheet as invalid.
      let parent = HTMLStyleElement(parent)
      parent.sheet_invalid = true

proc insertCharacter(parser: var HTML5Parser, data: string) =
  insert_character_impl(parser, data)

proc insertCharacter(parser: var HTML5Parser, data: char) =
  insert_character_impl(parser, data)

proc insertCharacter(parser: var HTML5Parser, data: Rune) =
  insert_character_impl(parser, data)

proc insertComment(parser: var HTML5Parser, token: Token,
    position: AdjustedInsertionLocation) =
  ## Inserts a comment node holding `token.data` at `position`.
  position.insert(position.inside.document.newComment(token.data))

proc insertComment(parser: var HTML5Parser, token: Token) =
  ## Inserts a comment node holding `token.data` at the appropriate place for
  ## inserting a node.
  parser.insertComment(token, parser.appropriatePlaceForInsert())

const PublicIdentifierEquals = [
  "-//W3O//DTD W3 HTML Strict 3.0//EN//",
  "-/W3C/DTD HTML 4.0 Transitional/EN",
  "HTML"
]

const PublicIdentifierStartsWith = [
  "+//Silmaril//dtd html Pro v0r11 19970101//",
  "-//AS//DTD HTML 3.0 asWedit + extensions//",
  "-//AdvaSoft Ltd//DTD HTML 3.0 asWedit + extensions//",
  "-//IETF//DTD HTML 2.0 Level 1//",
  "-//IETF//DTD HTML 2.0 Level 2//",
  "-//IETF//DTD HTML 2.0 Strict Level 1//",
  "-//IETF//DTD HTML 2.0 Strict Level 2//",
  "-//IETF//DTD HTML 2.0 Strict//",
  "-//IETF//DTD HTML 2.0//",
  "-//IETF//DTD HTML 2.1E//",
  "-//IETF//DTD HTML 3.0//",
  "-//IETF//DTD HTML 3.2 Final//",
  "-//IETF//DTD HTML 3.2//",
  "-//IETF//DTD HTML 3//",
  "-//IETF//DTD HTML Level 0//",
  "-//IETF//DTD HTML Level 1//",
  "-//IETF//DTD HTML Level 2//",
  "-//IETF//DTD HTML Level 3//",
  "-//IETF//DTD HTML Strict Level 0//",
  "-//IETF//DTD HTML Strict Level 1//",
  "-//IETF//DTD HTML Strict Level 2//",
  "-//IETF//DTD HTML Strict Level 3//",
  "-//IETF//DTD HTML Strict//",
  "-//IETF//DTD HTML//",
  "-//Metrius//DTD Metrius Presentational//",
  "-//Microsoft//DTD Internet Explorer 2.0 HTML Strict//",
  "-//Microsoft//DTD Internet Explorer 2.0 HTML//",
  "-//Microsoft//DTD Internet Explorer 2.0 Tables//",
  "-//Microsoft//DTD Internet Explorer 3.0 HTML Strict//",
  "-//Microsoft//DTD Internet Explorer 3.0 HTML//",
  "-//Microsoft//DTD Internet Explorer 3.0 Tables//",
  "-//Netscape Comm. Corp.//DTD HTML//",
  "-//Netscape Comm. Corp.//DTD Strict HTML//",
  "-//O'Reilly and Associates//DTD HTML 2.0//",
  "-//O'Reilly and Associates//DTD HTML Extended 1.0//",
  "-//O'Reilly and Associates//DTD HTML Extended Relaxed 1.0//",
  "-//SQ//DTD HTML 2.0 HoTMetaL + extensions//",
  "-//SoftQuad Software//DTD HoTMetaL PRO 6.0::19990601::extensions to HTML 4.0//",
  "-//SoftQuad//DTD HoTMetaL PRO 4.0::19971010::extensions to HTML 4.0//",
  "-//Spyglass//DTD HTML 2.0 Extended//",
  "-//Sun Microsystems Corp.//DTD HotJava HTML//",
  "-//Sun Microsystems Corp.//DTD HotJava Strict HTML//",
  "-//W3C//DTD HTML 3 1995-03-24//",
  "-//W3C//DTD HTML 3.2 Draft//",
  "-//W3C//DTD HTML 3.2 Final//",
  "-//W3C//DTD HTML 3.2//",
  "-//W3C//DTD HTML 3.2S Draft//",
  "-//W3C//DTD HTML 4.0 Frameset//",
  "-//W3C//DTD HTML 4.0 Transitional//",
  "-//W3C//DTD HTML Experimental 19960712//",
  "-//W3C//DTD HTML Experimental 970421//",
  "-//W3C//DTD W3 HTML//",
  "-//W3O//DTD W3 HTML 3.0//",
  "-//WebTechs//DTD Mozilla HTML 2.0//",
  "-//WebTechs//DTD Mozilla HTML//",
]

const SystemIdentifierMissingAndPublicIdentifierStartsWith = [
  "-//W3C//DTD HTML 4.01 Frameset//",
  "-//W3C//DTD HTML 4.01 Transitional//"
]

const PublicIdentifierStartsWithLimited = [
  "-//W3C//DTD XHTML 1.0 Frameset//",
  "-//W3C//DTD XHTML 1.0 Transitional//"
]

const SystemIdentifierNotMissingAndPublicIdentifierStartsWith = [
  "-//W3C//DTD HTML 4.01 Frameset//",
  "-//W3C//DTD HTML 4.01 Transitional//"
]

func quirksConditions(token: Token): bool =
  ## True if the DOCTYPE `token` matches one of the conditions listed here
  ## (force-quirks flag, name other than "html", or a listed public/system
  ## identifier). All identifier comparisons ignore ASCII case.
  if token.quirks: return true
  if token.name.isNone or token.name.get != "html": return true
  if token.sysid.isSome:
    if cmpIgnoreCase(token.sysid.get,
        "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd") == 0:
      return true
  if token.pubid.isSome:
    for id in PublicIdentifierEquals:
      if cmpIgnoreCase(token.pubid.get, id) == 0:
        return true
    for id in PublicIdentifierStartsWith:
      if token.pubid.get.startsWithNoCase(id):
        return true
    if token.sysid.isNone:
      for id in SystemIdentifierMissingAndPublicIdentifierStartsWith:
        if token.pubid.get.startsWithNoCase(id):
          return true
  return false

func limitedQuirksConditions(token: Token): bool =
  ## True if the DOCTYPE `token` has a public identifier starting with one of
  ## the "limited" prefixes, or has a system identifier and a public
  ## identifier starting with one of the HTML 4.01 prefixes.
  if token.pubid.isNone: return false
  for id in PublicIdentifierStartsWithLimited:
    if token.pubid.get.startsWithNoCase(id):
      return true
  if token.sysid.isNone: return false
  for id in SystemIdentifierNotMissingAndPublicIdentifierStartsWith:
    if token.pubid.get.startsWithNoCase(id):
      return true
  return false

# 13.2.6.2
proc genericRawtextElementParsingAlgorithm(parser: var HTML5Parser,
    token: Token) =
  ## Inserts an element for `token`, switches the tokenizer to RAWTEXT and
  ## the parser to TEXT mode, remembering the previous insertion mode.
  discard parser.insertHTMLElement(token)
  parser.tokenizer.state = RAWTEXT
  parser.oldInsertionMode = parser.insertionMode
  parser.insertionMode = TEXT

proc genericRCDATAElementParsingAlgorithm(parser: var HTML5Parser,
    token: Token) =
  ## Inserts an element for `token`, switches the tokenizer to RCDATA and
  ## the parser to TEXT mode, remembering the previous insertion mode.
  discard parser.insertHTMLElement(token)
  parser.tokenizer.state = RCDATA
  parser.oldInsertionMode = parser.insertionMode
  parser.insertionMode = TEXT

proc popElement(parser: var HTML5Parser): Element =
  ## Pops and returns the current node; a popped textarea is reset.
  result = parser.openElements.pop()
  if result.tagType == TAG_TEXTAREA:
    result.resetElement()

# 13.2.6.3
proc generateImpliedEndTags(parser: var HTML5Parser) =
  ## Pops elements while the current node is one of the listed tag types.
  const tags = {TAG_DD, TAG_DT, TAG_LI, TAG_OPTGROUP, TAG_OPTION, TAG_P,
                TAG_RB, TAG_RP, TAG_RT, TAG_RTC}
  while parser.currentNode.tagType in tags:
    discard parser.popElement()

proc generateImpliedEndTags(parser: var HTML5Parser, exclude: TagType) =
  ## Same as above, except that `exclude` is never popped.
  let tags = {TAG_DD, TAG_DT, TAG_LI, TAG_OPTGROUP, TAG_OPTION, TAG_P,
              TAG_RB, TAG_RP, TAG_RT, TAG_RTC} - {exclude}
  while parser.currentNode.tagType in tags:
    discard parser.popElement()

proc generateImpliedEndTagsThoroughly(parser: var HTML5Parser) =
  ## Variant of `generateImpliedEndTags` with a larger tag set that also
  ## includes table-related elements.
  const tags = {TAG_CAPTION, TAG_COLGROUP, TAG_DD, TAG_DT, TAG_LI,
                TAG_OPTGROUP, TAG_OPTION, TAG_P, TAG_RB, TAG_RP, TAG_RT,
                TAG_RTC, TAG_TBODY, TAG_TD, TAG_TFOOT, TAG_TH, TAG_THEAD,
                TAG_TR}
  while parser.currentNode.tagType in tags:
    discard parser.popElement()

# 13.2.4.3
proc pushOntoActiveFormatting(parser: var HTML5Parser, element: Element,
    token: Token) =
  ## Appends `(element, token)` to the list of active formatting elements.
  ## If three entries after the last marker already match `element` in tag,
  ## namespace and attributes, the earliest of them is removed first.
  var count = 0
  for i in countdown(parser.activeFormatting.high, 0):
    let it = parser.activeFormatting[i]
    if it[0] == nil: break # marker
    if it[0].tagType != element.tagType: continue
    if it[0].tagType == TAG_UNKNOWN:
      # Unknown tags share one tag type; compare their names instead.
      if it[0].localName != element.localName: continue
    if it[0].namespace != element.namespace: continue
    var fail = false
    for k, v in it[0].attributes:
      if k notin element.attributes:
        fail = true
        break
      if v != element.attributes[k]:
        fail = true
        break
    if fail: continue
    # Values were compared above; only key presence is left to check.
    for k, v in element.attributes:
      if k notin it[0].attributes:
        fail = true
        break
    if fail: continue
    inc count
    if count == 3:
      parser.activeFormatting.delete(i)
      break
  parser.activeFormatting.add((element, token))

proc reconstructActiveFormatting(parser: var HTML5Parser) =
  ## Re-opens the formatting elements at the end of the active formatting
  ## list that are no longer on the stack of open elements, by inserting a
  ## fresh element for each entry's token and storing it back in the entry.
  ## Entries are matched against the stack by element identity.
  type State = enum
    REWIND, ADVANCE, CREATE
  if parser.activeFormatting.len == 0:
    return
  if parser.activeFormatting[^1][0] == nil or
      parser.activeFormatting[^1][0] in parser.openElements:
    return
  var i = parser.activeFormatting.high
  template entry: Element = (parser.activeFormatting[i][0])
  var state = REWIND
  while true:
    {.computedGoto.}
    case state
    of REWIND:
      if i == 0:
        state = CREATE
        continue
      dec i
      # Keep rewinding until a marker or a still-open element is reached.
      if entry != nil and entry notin parser.openElements:
        continue
      state = ADVANCE
    of ADVANCE:
      inc i
      state = CREATE
    of CREATE:
      parser.activeFormatting[i] = (
        parser.insertHTMLElement(parser.activeFormatting[i][1]),
        parser.activeFormatting[i][1]
      )
      if i != parser.activeFormatting.high:
        state = ADVANCE
        continue
      break

proc clearActiveFormattingTillMarker(parser: var HTML5Parser) =
  ## Pops entries off the active formatting list up to and including the
  ## last marker (or until the list is empty).
  while parser.activeFormatting.len > 0 and
      parser.activeFormatting.pop()[0] != nil:
    discard

template pop_current_node = discard parser.popElement()

func isHTMLIntegrationPoint(node: Element): bool =
  return false #TODO SVG (NOTE MathML not implemented)

func extractEncFromMeta(s: string): Charset =
  ## Extracts a character encoding from the value of a meta element's
  ## `content` attribute: finds `charset`, optional whitespace, `=`, optional
  ## whitespace, and then either a quoted value or an unquoted value that
  ## ends at ASCII whitespace or `;`. Returns CHARSET_UNKNOWN when no such
  ## value is present.
  const needle = "charset"
  var i = 0
  while true: # Loop:
    # Find the next ASCII case-insensitive occurrence of "charset".
    var found = false
    while i + needle.len <= s.len:
      if cmpIgnoreCase(s.substr(i, i + needle.high), needle) == 0:
        found = true
        break
      inc i
    if not found:
      return CHARSET_UNKNOWN
    i += needle.len
    while i < s.len and s[i] in AsciiWhitespace:
      inc i
    if i < s.len and s[i] == '=':
      break
    # Not followed by '=': search again from the current position.
  inc i # skip '='
  while i < s.len and s[i] in AsciiWhitespace:
    inc i
  if i >= s.len:
    return CHARSET_UNKNOWN
  if s[i] in {'"', '\''}:
    let close = s.find(s[i], i + 1)
    # -1: unmatched quote; i + 1: empty value.
    if close <= i + 1:
      return CHARSET_UNKNOWN
    return getCharset(s.substr(i + 1, close - 1))
  var j = i
  while j < s.len and s[j] != ';' and s[j] notin AsciiWhitespace:
    inc j
  return getCharset(s.substr(i, j - 1))

proc changeEncoding(parser: var HTML5Parser, cs: Charset) =
  ## Handles a request to switch the document encoding to `cs`. Confidence
  ## becomes certain in every case; `needsreinterpret` is set only when the
  ## effective encoding actually changes.
  if parser.charset in {CHARSET_UTF_16_LE, CHARSET_UTF_16_BE}:
    parser.confidence = CONFIDENCE_CERTAIN
    return
  parser.confidence = CONFIDENCE_CERTAIN
  # x-user-defined is treated as windows-1252; normalize before comparing so
  # that an already-windows-1252 document is not reinterpreted.
  let newcs =
    if cs == CHARSET_X_USER_DEFINED: CHARSET_WINDOWS_1252
    else: cs
  if newcs == parser.charset:
    return
  parser.charset = newcs
  parser.needsreinterpret = true

# Following is an implementation of the state (?) machine defined in
# https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inhtml
# It uses the ad-hoc pattern matching macro `match' to apply the following
# transformations:
# * First, pairs of patterns and actions are stored in tuples (and `discard'
#   statements...)
# * These pairs are then assigned to token types, later mapped to legs of the
#   first case statement.
# * Another case statement is constructed where needed, e.g. for switching on
#   characters/tags/etc.
# * Finally, the whole thing is wrapped in a named block, to implement a
#   pseudo-goto by breaking out only when the else statement needn't be
#   executed.
#
# For example, the following code:
#
#   match token:
#     TokenType.COMMENT => (block: echo "comment")
#     ("