diff options
author | bptato <nincsnevem662@gmail.com> | 2023-08-15 18:35:19 +0200 |
---|---|---|
committer | bptato <nincsnevem662@gmail.com> | 2023-08-15 18:35:19 +0200 |
commit | aea27f52335d203f8acfed3f2113ab0e61cdafd5 (patch) | |
tree | ab3ff3cf553e2a548c590bd25408945612686daa /src | |
parent | 542800305a01587a1cc7402ee429da6417fc1bb8 (diff) | |
download | chawan-aea27f52335d203f8acfed3f2113ab0e61cdafd5.tar.gz |
Move HTML parsing into Chame
Operation "modularize Chawan somewhat" part 2
Diffstat (limited to 'src')
-rw-r--r-- | src/buffer/buffer.nim | 3 | ||||
-rw-r--r-- | src/css/cascade.nim | 3 | ||||
-rw-r--r-- | src/css/match.nim | 3 | ||||
-rw-r--r-- | src/css/selectorparser.nim | 3 | ||||
-rw-r--r-- | src/css/sheet.nim | 3 | ||||
-rw-r--r-- | src/css/stylednode.nim | 3 | ||||
-rw-r--r-- | src/html/chadombuilder.nim | 5 | ||||
-rw-r--r-- | src/html/dom.nim | 3 | ||||
-rw-r--r-- | src/html/entity.nim | 17 | ||||
-rw-r--r-- | src/html/htmlparser.nim | 2749 | ||||
-rw-r--r-- | src/html/htmltokenizer.nim | 1564 | ||||
-rw-r--r-- | src/html/parseerror.nim | 70 | ||||
-rw-r--r-- | src/html/tags.nim | 186 | ||||
-rw-r--r-- | src/render/renderdocument.nim | 3 | ||||
-rw-r--r-- | src/version.nim | 2 | ||||
-rw-r--r-- | src/xhr/formdata.nim | 3 |
16 files changed, 23 insertions, 4597 deletions
diff --git a/src/buffer/buffer.nim b/src/buffer/buffer.nim index 1dde438f..0bb8f547 100644 --- a/src/buffer/buffer.nim +++ b/src/buffer/buffer.nim @@ -20,7 +20,6 @@ import css/values import html/chadombuilder import html/dom import html/env -import html/tags import img/png import io/connecterror import io/loader @@ -51,6 +50,8 @@ import xhr/formdata as formdata_impl import chakasu/charset import chakasu/decoderstream +import chame/tags + type LoadInfo* = enum CONNECT, DOWNLOAD, RENDER, DONE diff --git a/src/css/cascade.nim b/src/css/cascade.nim index 1c711dfb..53eff5cf 100644 --- a/src/css/cascade.nim +++ b/src/css/cascade.nim @@ -11,10 +11,11 @@ import css/sheet import css/stylednode import css/values import html/dom -import html/tags import layout/layoutunit import types/color +import chame/tags + type DeclarationList* = array[PseudoElem, seq[CSSDeclaration]] diff --git a/src/css/match.nim b/src/css/match.nim index 810cdbe4..e8a693eb 100644 --- a/src/css/match.nim +++ b/src/css/match.nim @@ -7,7 +7,8 @@ import css/cssparser import css/selectorparser import css/stylednode import html/dom -import html/tags + +import chame/tags func attrSelectorMatches(elem: Element, sel: Selector): bool = case sel.rel diff --git a/src/css/selectorparser.nim b/src/css/selectorparser.nim index 57532ca3..c8144ffb 100644 --- a/src/css/selectorparser.nim +++ b/src/css/selectorparser.nim @@ -4,7 +4,8 @@ import strutils import unicode import css/cssparser -import html/tags + +import chame/tags type SelectorType* = enum diff --git a/src/css/sheet.nim b/src/css/sheet.nim index d0d03cb0..1f68816a 100644 --- a/src/css/sheet.nim +++ b/src/css/sheet.nim @@ -4,7 +4,8 @@ import tables import css/mediaquery import css/cssparser import css/selectorparser -import html/tags + +import chame/tags type CSSRuleBase* = ref object of RootObj diff --git a/src/css/stylednode.nim b/src/css/stylednode.nim index c9358b90..1b99773a 100644 --- a/src/css/stylednode.nim +++ b/src/css/stylednode.nim @@ -1,7 +1,8 @@ 
import css/selectorparser import css/values import html/dom -import html/tags + +import chame/tags # Container to hold a style and a node. # Pseudo-elements are implemented using StyledNode objects without nodes. Input diff --git a/src/html/chadombuilder.nim b/src/html/chadombuilder.nim index c7a41619..13bad12c 100644 --- a/src/html/chadombuilder.nim +++ b/src/html/chadombuilder.nim @@ -3,13 +3,14 @@ import options import streams import html/dom -import html/htmlparser -import html/tags import js/javascript import types/url import chakasu/charset +import chame/htmlparser +import chame/tags + # DOMBuilder implementation for Chawan. type diff --git a/src/html/dom.nim b/src/html/dom.nim index 00f2a549..c9b6bd6e 100644 --- a/src/html/dom.nim +++ b/src/html/dom.nim @@ -11,7 +11,6 @@ import css/cssparser import css/sheet import css/values import html/event -import html/tags import img/bitmap import img/painter import img/path @@ -34,6 +33,8 @@ import utils/twtstr import chakasu/charset import chakasu/decoderstream +import chame/tags + type FormMethod* = enum FORM_METHOD_GET, FORM_METHOD_POST, FORM_METHOD_DIALOG diff --git a/src/html/entity.nim b/src/html/entity.nim deleted file mode 100644 index f2f55277..00000000 --- a/src/html/entity.nim +++ /dev/null @@ -1,17 +0,0 @@ -import json - -import utils/radixtree - -const entity = staticRead"res/entity.json" -proc genEntityMap(data: seq[tuple[a: cstring, b: cstring]]): RadixNode[string] = - result = newRadixTree[string]() - for pair in data: - result[$pair.a] = $pair.b - -proc genEntityTable(): seq[tuple[a: cstring, b: cstring]] = - let entityJson = parseJson(entity) - - for k, v in entityJson: - result.add((cstring(k.substr(1)), cstring(v{"characters"}.getStr()))) -const entityTable = genEntityTable() -let entityMap* = genEntityMap(entityTable) diff --git a/src/html/htmlparser.nim b/src/html/htmlparser.nim deleted file mode 100644 index 60a9aad9..00000000 --- a/src/html/htmlparser.nim +++ /dev/null @@ -1,2749 +0,0 @@ -import 
macros -import options -import sequtils -import streams -import strutils -import tables -import unicode - -import html/htmltokenizer -import html/parseerror -import html/tags -import utils/twtstr - -import chakasu/charset -import chakasu/decoderstream - -# Generics break without exporting macros. Maybe a compiler bug? -export macros - -# Heavily inspired by html5ever's TreeSink design. -type - DOMBuilder*[Handle] = ref object of RootObj - document*: Handle - ## Must never be nil. - finish*: DOMBuilderFinish[Handle] - ## May be nil. - parseError*: DOMBuilderParseError[Handle] - ## May be nil. - setQuirksMode*: DOMBuilderSetQuirksMode[Handle] - ## May be nil. - setCharacterSet*: DOMBuilderSetCharacterSet[Handle] - ## May be nil. - elementPopped*: DOMBuilderElementPopped[Handle] - ## May be nil. - getTemplateContent*: DOMBuilderGetTemplateContent[Handle] - ## May be nil. (If nil, templates are treated as regular elements.) - getParentNode*: DOMBuilderGetParentNode[Handle] - ## Must never be nil. - getLocalName*: DOMBuilderGetLocalName[Handle] - ## Must never be nil. - getTagType*: DOMBuilderGetTagType[Handle] - ## May be nil. (If nil, the parser falls back to getLocalName.) - getNamespace*: DOMBuilderGetNamespace[Handle] - ## May be nil. (If nil, the parser always uses the HTML namespace.) - createElement*: DOMBuilderCreateElement[Handle] - ## Must never be nil. - createComment*: DOMBuilderCreateComment[Handle] - ## Must never be nil. - createDocumentType*: DOMBuilderCreateDocumentType[Handle] - ## Must never be nil. - insertBefore*: DOMBuilderInsertBefore[Handle] - ## Must never be nil. - insertText*: DOMBuilderInsertText[Handle] - ## Must never be nil. - remove*: DOMBuilderRemove[Handle] - ## Must never be nil. - addAttrsIfMissing*: DOMBuilderAddAttrsIfMissing[Handle] - ## May be nil. (If nil, some attributes may not be added to the HTML or - ## BODY element if more than one of their respecting opening tags exist.) 
- setScriptAlreadyStarted*: DOMBuilderSetScriptAlreadyStarted[Handle] - ## May be nil. - associateWithForm*: DOMBuilderAssociateWithForm[Handle] - ## May be nil. - isSVGIntegrationPoint*: DOMBuilderIsSVGIntegrationPoint[Handle] - ## May be nil. (If nil, the parser considers no Handle an SVG integration - ## point.) - - HTML5ParserOpts*[Handle] = object - isIframeSrcdoc*: bool - ## Is the document an iframe srcdoc? - scripting*: bool - ## Is scripting enabled for this document? - canReinterpret*: bool - ## Can we try to parse the document again with a different character set? - ## - ## Note: this only works if inputStream is seekable, i.e. - ## inputStream.setPosition(0) must work correctly. - ## - ## Note 2: when this canReinterpret is false, confidence is set to - ## certain, no BOM sniffing is performed and meta charset tags are - ## disregarded. Expect this to change in the future. - charsets*: seq[Charset] - ## Fallback charsets. If empty, UTF-8 is used. In most cases, an empty - ## sequence or a single-element sequence consisting of a character set - ## chosen based on the user's locale will suffice. - ## - ## The parser goes through fallback charsets in the following order: - ## * A charset stack is initialized to `charsets`, reversed. This - ## means that the first charset specified in `charsets` is on top of - ## the stack. (e.g. say `charsets = @[CHARSET_UTF_16_LE, CHARSET_UTF_8]`, - ## then utf-16-le is tried before utf-8.) - ## * BOM sniffing is attempted. If successful, confidence is set to - ## certain and the resulting charset is used (i.e. other character - ## sets will not be tried for decoding this document.) - ## * If the charset stack is empty, UTF-8 is pushed on top. - ## * Attempt to parse the document with the first charset on top of - ## the stack. - ## * If BOM sniffing was unsuccessful, and a <meta charset=...> tag - ## is encountered, parsing is restarted with the specified charset. 
- ## No further attempts are made to detect the encoding, and decoder - ## errors are signaled by U+FFFD replacement characters. - ## * Otherwise, each charset on the charset stack is tried until either no - ## decoding errors are encountered, or only one charset is left. For - ## the last charset, decoder errors are signaled by U+FFFD replacement - ## characters. - ctx*: Option[Handle] - ## Context element for fragment parsing. When set to some Handle, - ## the fragment case is used while parsing. - - DOMBuilderFinish*[Handle] = - proc(builder: DOMBuilder[Handle]) {.nimcall.} - ## Parsing has finished. - - DOMBuilderParseError*[Handle] = - proc(builder: DOMBuilder[Handle], message: ParseError) {.nimcall.} - ## Parse error. `message` is an error code either specified by the - ## standard (in this case, message < LAST_SPECIFIED_ERROR) or named - ## arbitrarily. (At the time of writing, only tokenizer errors have - ## specified error codes.) - - DOMBuilderSetQuirksMode*[Handle] = - proc(builder: DOMBuilder[Handle], quirksMode: QuirksMode) {.nimcall.} - ## Set quirks mode to either QUIRKS or LIMITED_QUIRKS. NO_QUIRKS - ## is the default and is therefore never used here. - - DOMBuilderSetCharacterSet*[Handle] = - proc(builder: DOMBuilder[Handle], charset: Charset) {.nimcall.} - ## Set the recognized charset, if it differs from the initial input. - - DOMBuilderElementPopped*[Handle] = - proc(builder: DOMBuilder[Handle], element: Handle) {.nimcall.} - ## Called when an element is popped from the stack of open elements - ## (i.e. when it has been closed.) - - DOMBuilderGetTemplateContent*[Handle] = - proc(builder: DOMBuilder[Handle], handle: Handle): Handle {.nimcall.} - ## Retrieve a handle to the template element's contents. - ## Note: this function must never return nil. - - DOMBuilderGetParentNode*[Handle] = - proc(builder: DOMBuilder[Handle], handle: Handle): Option[Handle] - {.nimcall.} - ## Retrieve a handle to the parent node. 
- ## May return none(Handle) if no parent node exists. - - DOMBuilderGetTagType*[Handle] = - proc(builder: DOMBuilder[Handle], handle: Handle): TagType {.nimcall.} - ## Retrieve the tag type of element. - - DOMBuilderGetLocalName*[Handle] = - proc(builder: DOMBuilder[Handle], handle: Handle): string {.nimcall.} - ## Retrieve the local name of element. (This is tagName(getTagType), - ## unless the tag is unknown. - - DOMBuilderGetNamespace*[Handle] = - proc(builder: DOMBuilder[Handle], handle: Handle): Namespace {.nimcall.} - ## Retrieve the namespace of element. - - DOMBuilderCreateElement*[Handle] = - proc(builder: DOMBuilder[Handle], localName: string, namespace: Namespace, - tagType: TagType, attrs: Table[string, string]): Handle {.nimcall.} - ## Create a new element node. - ## - ## localName is the tag name of the token. - ## - ## namespace is the namespace passed to the function. (For HTML elements, - ## it's HTML.) - ## tagType is set based on localName. (This saves the consumer from - ## having to interpret localName again.) - ## - ## attrs is a table of the token's attributes. - - DOMBuilderCreateComment*[Handle] = - proc(builder: DOMBuilder[Handle], text: string): Handle {.nimcall.} - ## Create a new comment node. - - DOMBuilderInsertText*[Handle] = - proc(builder: DOMBuilder[Handle], parent: Handle, text: string, - before: Handle) {.nimcall.} - ## Insert a text node at the specified location with contents - ## `text`. If the specified location has a previous sibling that is - ## a text node, no new text node should be created, but instead `text` - ## should be appended to the previous sibling's character data. - - DOMBuilderCreateDocumentType*[Handle] = - proc(builder: DOMBuilder[Handle], name, publicId, systemId: string): Handle - {.nimcall.} - ## Create a new document type node. - - DOMBuilderInsertBefore*[Handle] = - proc(builder: DOMBuilder[Handle], parent, child, before: Handle) - {.nimcall.} - ## Insert node `child` before the node called `before`. 
- ## - ## If `before` is nil, `child` is expected to be appended to `parent`'s - ## node list. - ## - ## If `child` is a text, and its previous sibling after insertion is a - ## text as well, then they should be merged. `before` is never a - ## text node (and thus never has to be merged). - ## - ## Note: parent may either be an Element or a Document node. - - DOMBuilderRemove*[Handle] = - proc(builder: DOMBuilder[Handle], child: Handle) {.nimcall.} - ## Remove `child` from its parent node, and do nothing if `child` - ## has no parent node. - - DOMBuilderReparent*[Handle] = - proc(builder: DOMBuilder[Handle], child, newParent: Handle) {.nimcall.} - ## Remove `child` from its parent node, and append it to `newParent`. - ## In terms of DOM operations, this should be equivalent to calling - ## `child.remove()`, followed by `newParent.append(child)`. - - DOMBuilderAddAttrsIfMissing*[Handle] = - proc(builder: DOMBuilder[Handle], element: Handle, - attrs: Table[string, string]) {.nimcall.} - ## Add the attributes in `attrs` to the element node `element`. - ## At the time of writing, called for HTML and BODY only. (This may - ## change in the future.) - ## An example implementation: - ## ```nim - ## for k, v in attrs: - ## if k notin element.attrs: - ## element.attrs[k] = v - ## ``` - - DOMBuilderSetScriptAlreadyStarted*[Handle] = - proc(builder: DOMBuilder[Handle], script: Handle) {.nimcall.} - ## Set the "already started" flag for the script element. - ## - ## Note: this flag is not togglable, so this callback should just set it - ## to true. - - DOMBuilderAssociateWithForm*[Handle] = - proc(builder: DOMBuilder[Handle], element, form, intendedParent: Handle) - {.nimcall.} - ## Called after createElement. Attempts to set form for form-associated - ## elements. - ## - ## Note: the DOM builder is responsible for checking whether the - ## intended parent and the form element are in the same tree. 
- - DOMBuilderIsSVGIntegrationPoint*[Handle] = - proc(builder: DOMBuilder[Handle], element: Handle): bool {.nimcall.} - ## Check if element is an SVG integration point. - -type - CharsetConfidence = enum - CONFIDENCE_TENTATIVE, CONFIDENCE_CERTAIN, CONFIDENCE_IRRELEVANT - - HTML5Parser[Handle] = object - quirksMode: QuirksMode - dombuilder: DOMBuilder[Handle] - opts: HTML5ParserOpts[Handle] - ctx: Option[Handle] - needsreinterpret: bool - charset: Charset - confidence: CharsetConfidence - openElements: seq[Handle] - insertionMode: InsertionMode - oldInsertionMode: InsertionMode - templateModes: seq[InsertionMode] - head: Option[Handle] - tokenizer: Tokenizer - form: Option[Handle] - fosterParenting: bool - # Handle is an element. nil => marker - activeFormatting: seq[(Option[Handle], Token)] - framesetok: bool - ignoreLF: bool - pendingTableChars: string - pendingTableCharsWhitespace: bool - - AdjustedInsertionLocation[Handle] = tuple[inside, before: Handle] - -# 13.2.4.1 - InsertionMode = enum - INITIAL, BEFORE_HTML, BEFORE_HEAD, IN_HEAD, IN_HEAD_NOSCRIPT, AFTER_HEAD, - IN_BODY, TEXT, IN_TABLE, IN_TABLE_TEXT, IN_CAPTION, IN_COLUMN_GROUP, - IN_TABLE_BODY, IN_ROW, IN_CELL, IN_SELECT, IN_SELECT_IN_TABLE, IN_TEMPLATE, - AFTER_BODY, IN_FRAMESET, AFTER_FRAMESET, AFTER_AFTER_BODY, - AFTER_AFTER_FRAMESET - -# DOMBuilder interface functions -proc finish[Handle](parser: HTML5Parser[Handle]) = - if parser.dombuilder.finish != nil: - parser.dombuilder.finish(parser.dombuilder) - -proc parseError(parser: HTML5Parser, e: ParseError) = - if parser.dombuilder.parseError != nil: - parser.dombuilder.parseError(parser.dombuilder, e) - -proc setQuirksMode[Handle](parser: var HTML5Parser[Handle], mode: QuirksMode) = - parser.quirksMode = mode - if parser.dombuilder.setQuirksMode != nil: - parser.dombuilder.setQuirksMode(parser.dombuilder, mode) - -func document[Handle](parser: HTML5Parser[Handle]): Handle {.inline.} = - return parser.dombuilder.document - -func 
getTemplateContent[Handle](parser: HTML5Parser[Handle], - handle: Handle): Handle = - let dombuilder = parser.dombuilder - return dombuilder.getTemplateContent(dombuilder, handle) - -func getParentNode[Handle](parser: HTML5Parser[Handle], - handle: Handle): Option[Handle] = - let dombuilder = parser.dombuilder - return dombuilder.getParentNode(dombuilder, handle) - -func getLocalName[Handle](parser: HTML5Parser[Handle], handle: Handle): - string = - return parser.dombuilder.getLocalName(parser.dombuilder, handle) - -func getTagType[Handle](parser: HTML5Parser[Handle], handle: Handle): TagType = - if parser.dombuilder.getTagType != nil: - return parser.dombuilder.getTagType(parser.dombuilder, handle) - return tagType(parser.getLocalName(handle)) - -func getNamespace[Handle](parser: HTML5Parser[Handle], handle: Handle): - Namespace = - if parser.dombuilder.getNamespace != nil: - return parser.dombuilder.getNamespace(parser.dombuilder, handle) - return Namespace.HTML - -func createElement[Handle](parser: HTML5Parser[Handle], localName: string, - namespace: Namespace, tagType: TagType, attrs: Table[string, string]): - Handle = - return parser.dombuilder.createElement(parser.dombuilder, localName, - namespace, tagType, attrs) - -func createElement[Handle](parser: HTML5Parser[Handle], tagType: TagType, - namespace: Namespace): Handle = - return parser.createElement(tagName(tagType), namespace, tagType, - Table[string, string]()) - -func createComment[Handle](parser: HTML5Parser[Handle], text: string): Handle = - let dombuilder = parser.dombuilder - return dombuilder.createComment(dombuilder, text) - -proc createDocumentType[Handle](parser: HTML5Parser[Handle], name, publicId, - systemId: string): Handle = - let dombuilder = parser.dombuilder - return dombuilder.createDocumentType(dombuilder, name, publicId, systemId) - -proc insertBefore[Handle](parser: HTML5Parser[Handle], - parent, node, before: Handle) = - let dombuilder = parser.dombuilder - 
dombuilder.insertBefore(dombuilder, parent, node, before) - -proc insertText[Handle](parser: HTML5Parser[Handle], parent: Handle, - text: string, before: Handle) = - let dombuilder = parser.dombuilder - dombuilder.insertText(dombuilder, parent, text, before) - -proc remove[Handle](parser: HTML5Parser[Handle], child: Handle) = - let dombuilder = parser.dombuilder - dombuilder.remove(dombuilder, child) - -proc addAttrsIfMissing[Handle](parser: HTML5Parser, element: Handle, - attrs: Table[string, string]) = - let dombuilder = parser.dombuilder - if dombuilder.addAttrsIfMissing != nil: - dombuilder.addAttrsIfMissing(dombuilder, element, attrs) - -proc setScriptAlreadyStarted[Handle](parser: HTML5Parser, script: Handle) = - let dombuilder = parser.dombuilder - if dombuilder.setScriptAlreadyStarted != nil: - dombuilder.setScriptAlreadyStarted(dombuilder, script) - -proc associateWithForm[Handle](parser: HTML5Parser, element, form, - intendedParent: Handle) = - let dombuilder = parser.dombuilder - if dombuilder.associateWithForm != nil: - dombuilder.associateWithForm(dombuilder, element, form, intendedParent) - -func isSVGIntegrationPoint[Handle](parser: HTML5Parser, - element: Handle): bool = - let dombuilder = parser.dombuilder - if dombuilder.isSVGIntegrationPoint != nil: - return dombuilder.isSVGIntegrationPoint(dombuilder, element) - return false - -# Parser -func hasParseError(parser: HTML5Parser): bool = - return parser.dombuilder.parseError != nil - -func tagNameEquals[Handle](parser: HTML5Parser, handle: Handle, - token: Token): bool = - let tagType = parser.getTagType(handle) - if tagType != TAG_UNKNOWN: - return tagType == token.tagtype - let localName = parser.getLocalName(handle) - return localName == token.tagname - -func tagNameEquals[Handle](parser: HTML5Parser, a, b: Handle): bool = - let tagType = parser.getTagType(a) - if tagType != TAG_UNKNOWN: - return tagType == parser.getTagType(b) - return parser.getLocalName(a) == parser.getLocalName(b) - -func 
fragment(parser: HTML5Parser): bool = - return parser.ctx.isSome - -# https://html.spec.whatwg.org/multipage/parsing.html#reset-the-insertion-mode-appropriately -proc resetInsertionMode(parser: var HTML5Parser) = - template switch_insertion_mode_and_return(mode: InsertionMode) = - parser.insertionMode = mode - return - for i in countdown(parser.openElements.high, 0): - var node = parser.openElements[i] - let last = i == 0 - if parser.fragment: - node = parser.ctx.get - let tagType = parser.getTagType(node) - if tagType == TAG_SELECT: - if not last: - for j in countdown(parser.openElements.high, 1): - let ancestor = parser.openElements[j] - case parser.getTagType(ancestor) - of TAG_TEMPLATE: break - of TAG_TABLE: switch_insertion_mode_and_return IN_SELECT_IN_TABLE - else: discard - switch_insertion_mode_and_return IN_SELECT - case tagType - of TAG_TD, TAG_TH: - if not last: - switch_insertion_mode_and_return IN_CELL - of TAG_TR: switch_insertion_mode_and_return IN_ROW - of TAG_TBODY, TAG_THEAD, TAG_TFOOT: - switch_insertion_mode_and_return IN_CAPTION - of TAG_COLGROUP: switch_insertion_mode_and_return IN_COLUMN_GROUP - of TAG_TABLE: switch_insertion_mode_and_return IN_TABLE - of TAG_TEMPLATE: switch_insertion_mode_and_return parser.templateModes[^1] - of TAG_HEAD: - if not last: - switch_insertion_mode_and_return IN_HEAD - of TAG_BODY: switch_insertion_mode_and_return IN_BODY - of TAG_FRAMESET: switch_insertion_mode_and_return IN_FRAMESET - of TAG_HTML: - if parser.head.isNone: - switch_insertion_mode_and_return BEFORE_HEAD - else: - switch_insertion_mode_and_return AFTER_HEAD - else: discard - if last: - switch_insertion_mode_and_return IN_BODY - -func currentNode[Handle](parser: HTML5Parser[Handle]): Handle = - return parser.openElements[^1] - -func adjustedCurrentNode[Handle](parser: HTML5Parser[Handle]): Handle = - if parser.fragment: - parser.ctx.get - else: - parser.currentNode - -func lastElementOfTag[Handle](parser: HTML5Parser[Handle], - tagType: TagType): 
tuple[element: Option[Handle], pos: int] = - for i in countdown(parser.openElements.high, 0): - if parser.getTagType(parser.openElements[i]) == tagType: - return (some(parser.openElements[i]), i) - return (none(Handle), -1) - -template last_child_of[Handle](n: Handle): AdjustedInsertionLocation[Handle] = - (n, nil) - -# https://html.spec.whatwg.org/multipage/#appropriate-place-for-inserting-a-node -func appropriatePlaceForInsert[Handle](parser: HTML5Parser[Handle], - target: Handle): AdjustedInsertionLocation[Handle] = - assert parser.getTagType(parser.openElements[0]) == TAG_HTML - let targetTagType = parser.getTagType(target) - const FosterTagTypes = {TAG_TABLE, TAG_TBODY, TAG_TFOOT, TAG_THEAD, TAG_TR} - if parser.fosterParenting and targetTagType in FosterTagTypes: - let lastTemplate = parser.lastElementOfTag(TAG_TEMPLATE) - let lastTable = parser.lastElementOfTag(TAG_TABLE) - if lastTemplate.element.isSome and - parser.dombuilder.getTemplateContent != nil and - (lastTable.element.isNone or lastTable.pos < lastTemplate.pos): - let content = parser.getTemplateContent(lastTemplate.element.get) - return last_child_of(content) - if lastTable.element.isNone: - return last_child_of(parser.openElements[0]) - let parentNode = parser.getParentNode(lastTable.element.get) - if parentNode.isSome: - return (parentNode.get, lastTable.element.get) - let previousElement = parser.openElements[lastTable.pos - 1] - result = last_child_of(previousElement) - else: - result = last_child_of(target) - if parser.getTagType(result.inside) == TAG_TEMPLATE and - parser.dombuilder.getTemplateContent != nil: - result = (parser.getTemplateContent(result.inside), nil) - -func appropriatePlaceForInsert[Handle](parser: HTML5Parser[Handle]): - AdjustedInsertionLocation[Handle] = - parser.appropriatePlaceForInsert(parser.currentNode) - -func hasElement[Handle](parser: HTML5Parser[Handle], tag: TagType): bool = - for element in parser.openElements: - if parser.getTagType(element) == tag: - return 
true - return false - -func hasElement[Handle](parser: HTML5Parser[Handle], tags: set[TagType]): bool = - for element in parser.openElements: - if parser.getTagType(element) in tags: - return true - return false - -func hasElementInSpecificScope[Handle](parser: HTML5Parser[Handle], - target: Handle, list: set[TagType]): bool = - for i in countdown(parser.openElements.high, 0): - if parser.openElements[i] == target: - return true - if parser.getTagType(parser.openElements[i]) in list: - return false - assert false - -func hasElementInSpecificScope[Handle](parser: HTML5Parser[Handle], - target: TagType, list: set[TagType]): bool = - for i in countdown(parser.openElements.high, 0): - let tagType = parser.getTagType(parser.openElements[i]) - if tagType == target: - return true - if tagType in list: - return false - assert false - -func hasElementInSpecificScope[Handle](parser: HTML5Parser[Handle], - target: set[TagType], list: set[TagType]): bool = - for i in countdown(parser.openElements.high, 0): - let tagType = parser.getTagType(parser.openElements[i]) - if tagType in target: - return true - if tagType in list: - return false - assert false - -const Scope = { - TAG_APPLET, TAG_CAPTION, TAG_HTML, TAG_TABLE, TAG_TD, TAG_TH, TAG_MARQUEE, - TAG_OBJECT, TAG_TEMPLATE #TODO SVG - # Note: MathML is not implemented -} - -func hasElementInScope[Handle](parser: HTML5Parser[Handle], - target: TagType): bool = - return parser.hasElementInSpecificScope(target, Scope) - -func hasElementInScope[Handle](parser: HTML5Parser[Handle], - target: set[TagType]): bool = - return parser.hasElementInSpecificScope(target, Scope) - -func hasElementInScope[Handle](parser: HTML5Parser[Handle], - target: Handle): bool = - return parser.hasElementInSpecificScope(target, Scope) - -func hasElementInListItemScope[Handle](parser: HTML5Parser[Handle], - target: TagType): bool = - const ListItemScope = Scope + {TAG_OL, TAG_UL} - return parser.hasElementInSpecificScope(target, ListItemScope) - -func 
hasElementInButtonScope[Handle](parser: HTML5Parser[Handle], - target: TagType): bool = - const ButtonScope = Scope + {TAG_BUTTON} - return parser.hasElementInSpecificScope(target, ButtonScope) - -const TableScope = {TAG_HTML, TAG_TABLE, TAG_TEMPLATE} -func hasElementInTableScope[Handle](parser: HTML5Parser[Handle], - target: TagType): bool = - return parser.hasElementInSpecificScope(target, TableScope) - -func hasElementInTableScope[Handle](parser: HTML5Parser[Handle], - target: set[TagType]): bool = - return parser.hasElementInSpecificScope(target, TableScope) - -func hasElementInSelectScope[Handle](parser: HTML5Parser[Handle], - target: TagType): bool = - for i in countdown(parser.openElements.high, 0): - let tagType = parser.getTagType(parser.openElements[i]) - if tagType == target: - return true - if tagType notin {TAG_OPTION, TAG_OPTGROUP}: - return false - assert false - -func createElement[Handle](parser: HTML5Parser[Handle], token: Token, - namespace: Namespace, intendedParent: Handle): Handle = - #TODO custom elements - let localName = token.tagname - let element = parser.createElement(localName, namespace, token.tagtype, - token.attrs) - if token.tagtype in FormAssociatedElements and parser.form.isSome and - not parser.hasElement(TAG_TEMPLATE) and - (token.tagtype notin ListedElements or "form" notin token.attrs): - parser.associateWithForm(element, parser.form.get, intendedParent) - return element - -proc pushElement[Handle](parser: var HTML5Parser[Handle], node: Handle) = - parser.openElements.add(node) - let node = parser.adjustedCurrentNode() - parser.tokenizer.hasnonhtml = parser.getNamespace(node) != Namespace.HTML - -proc popElement[Handle](parser: var HTML5Parser[Handle]): Handle = - result = parser.openElements.pop() - if parser.dombuilder.elementPopped != nil: - parser.dombuilder.elementPopped(parser.dombuilder, result) - if parser.openElements.len == 0: - parser.tokenizer.hasnonhtml = false - else: - let node = parser.adjustedCurrentNode() - 
parser.tokenizer.hasnonhtml = parser.getNamespace(node) != Namespace.HTML - -template pop_current_node = discard parser.popElement() - -proc insert[Handle](parser: HTML5Parser[Handle], - location: AdjustedInsertionLocation[Handle], node: Handle) = - parser.insertBefore(location.inside, node, location.before) - -proc append[Handle](parser: HTML5Parser[Handle], parent, node: Handle) = - parser.insertBefore(parent, node, nil) - -proc insertForeignElement[Handle](parser: var HTML5Parser[Handle], token: Token, - namespace: Namespace): Handle = - let location = parser.appropriatePlaceForInsert() - let element = parser.createElement(token, namespace, location.inside) - #TODO custom elements - parser.insert(location, element) - parser.pushElement(element) - return element - -proc insertHTMLElement[Handle](parser: var HTML5Parser[Handle], - token: Token): Handle = - return parser.insertForeignElement(token, Namespace.HTML) - -proc adjustSVGAttributes(token: Token) = - const adjusted = { - "attributename": "attributeName", - "attributetype": "attributeType", - "basefrequency": "baseFrequency", - "baseprofile": "baseProfile", - "calcmode": "calcMode", - "clippathunits": "clipPathUnits", - "diffuseconstant": "diffuseConstant", - "edgemode": "edgeMode", - "filterunits": "filterUnits", - "glyphref": "glyphRef", - "gradienttransform": "gradientTransform", - "gradientunits": "gradientUnits", - "kernelmatrix": "kernelMatrix", - "kernelunitlength": "kernelUnitLength", - "keypoints": "keyPoints", - "keysplines": "keySplines", - "keytimes": "keyTimes", - "lengthadjust": "lengthAdjust", - "limitingconeangle": "limitingConeAngle", - "markerheight": "markerHeight", - "markerunits": "markerUnits", - "markerwidth": "markerWidth", - "maskcontentunits": "maskContentUnits", - "maskunits": "maskUnits", - "numoctaves": "numOctaves", - "pathlength": "pathLength", - "patterncontentunits": "patternContentUnits", - "patterntransform": "patternTransform", - "patternunits": "patternUnits", - 
"pointsatx": "pointsAtX", - "pointsaty": "pointsAtY", - "pointsatz": "pointsAtZ", - "preservealpha": "preserveAlpha", - "preserveaspectratio": "preserveAspectRatio", - "primitiveunits": "primitiveUnits", - "refx": "refX", - "refy": "refY", - "repeatcount": "repeatCount", - "repeatdur": "repeatDur", - "requiredextensions": "requiredExtensions", - "requiredfeatures": "requiredFeatures", - "specularconstant": "specularConstant", - "specularexponent": "specularExponent", - "spreadmethod": "spreadMethod", - "startoffset": "startOffset", - "stddeviation": "stdDeviation", - "stitchtiles": "stitchTiles", - "surfacescale": "surfaceScale", - "systemlanguage": "systemLanguage", - "tablevalues": "tableValues", - "targetx": "targetX", - "targety": "targetY", - "textlength": "textLength", - "viewbox": "viewBox", - "viewtarget": "viewTarget", - "xchannelselector": "xChannelSelector", - "ychannelselector": "yChannelSelector", - "zoomandpan": "zoomAndPan", - }.toTable() - var todo: seq[string] - for k in token.attrs.keys: - if k in adjusted: - todo.add(k) - for s in todo: - token.attrs[adjusted[s]] = token.attrs[s] - -template insert_character_impl(parser: var HTML5Parser, data: typed) = - let location = parser.appropriatePlaceForInsert() - if location.inside.nodeType == DOCUMENT_NODE: - return - insertText(parser, location.inside, $data, location.before) - -proc insertCharacter(parser: var HTML5Parser, data: string) = - insert_character_impl(parser, data) - -proc insertCharacter(parser: var HTML5Parser, data: char) = - insert_character_impl(parser, data) - -proc insertCharacter(parser: var HTML5Parser, data: Rune) = - insert_character_impl(parser, data) - -proc insertComment[Handle](parser: var HTML5Parser[Handle], token: Token, - position: AdjustedInsertionLocation[Handle]) = - let comment = parser.createComment(token.data) - parser.insert(position, comment) - -proc insertComment(parser: var HTML5Parser, token: Token) = - let position = parser.appropriatePlaceForInsert() - 
parser.insertComment(token, position) - -const PublicIdentifierEquals = [ - "-//W3O//DTD W3 HTML Strict 3.0//EN//", - "-/W3C/DTD HTML 4.0 Transitional/EN", - "HTML" -] - -const PublicIdentifierStartsWith = [ - "+//Silmaril//dtd html Pro v0r11 19970101//", - "-//AS//DTD HTML 3.0 asWedit + extensions//", - "-//AdvaSoft Ltd//DTD HTML 3.0 asWedit + extensions//", - "-//IETF//DTD HTML 2.0 Level 1//", - "-//IETF//DTD HTML 2.0 Level 2//", - "-//IETF//DTD HTML 2.0 Strict Level 1//", - "-//IETF//DTD HTML 2.0 Strict Level 2//", - "-//IETF//DTD HTML 2.0 Strict//", - "-//IETF//DTD HTML 2.0//", - "-//IETF//DTD HTML 2.1E//", - "-//IETF//DTD HTML 3.0//", - "-//IETF//DTD HTML 3.2 Final//", - "-//IETF//DTD HTML 3.2//", - "-//IETF//DTD HTML 3//", - "-//IETF//DTD HTML Level 0//", - "-//IETF//DTD HTML Level 1//", - "-//IETF//DTD HTML Level 2//", - "-//IETF//DTD HTML Level 3//", - "-//IETF//DTD HTML Strict Level 0//", - "-//IETF//DTD HTML Strict Level 1//", - "-//IETF//DTD HTML Strict Level 2//", - "-//IETF//DTD HTML Strict Level 3//", - "-//IETF//DTD HTML Strict//", - "-//IETF//DTD HTML//", - "-//Metrius//DTD Metrius Presentational//", - "-//Microsoft//DTD Internet Explorer 2.0 HTML Strict//", - "-//Microsoft//DTD Internet Explorer 2.0 HTML//", - "-//Microsoft//DTD Internet Explorer 2.0 Tables//", - "-//Microsoft//DTD Internet Explorer 3.0 HTML Strict//", - "-//Microsoft//DTD Internet Explorer 3.0 HTML//", - "-//Microsoft//DTD Internet Explorer 3.0 Tables//", - "-//Netscape Comm. Corp.//DTD HTML//", - "-//Netscape Comm. 
Corp.//DTD Strict HTML//", - "-//O'Reilly and Associates//DTD HTML 2.0//", - "-//O'Reilly and Associates//DTD HTML Extended 1.0//", - "-//O'Reilly and Associates//DTD HTML Extended Relaxed 1.0//", - "-//SQ//DTD HTML 2.0 HoTMetaL + extensions//", - "-//SoftQuad Software//DTD HoTMetaL PRO 6.0::19990601::extensions to HTML 4.0//", - "-//SoftQuad//DTD HoTMetaL PRO 4.0::19971010::extensions to HTML 4.0//", - "-//Spyglass//DTD HTML 2.0 Extended//", - "-//Sun Microsystems Corp.//DTD HotJava HTML//", - "-//Sun Microsystems Corp.//DTD HotJava Strict HTML//", - "-//W3C//DTD HTML 3 1995-03-24//", - "-//W3C//DTD HTML 3.2 Draft//", - "-//W3C//DTD HTML 3.2 Final//", - "-//W3C//DTD HTML 3.2//", - "-//W3C//DTD HTML 3.2S Draft//", - "-//W3C//DTD HTML 4.0 Frameset//", - "-//W3C//DTD HTML 4.0 Transitional//", - "-//W3C//DTD HTML Experimental 19960712//", - "-//W3C//DTD HTML Experimental 970421//", - "-//W3C//DTD W3 HTML//", - "-//W3O//DTD W3 HTML 3.0//", - "-//WebTechs//DTD Mozilla HTML 2.0//", - "-//WebTechs//DTD Mozilla HTML//", -] - -const SystemIdentifierMissingAndPublicIdentifierStartsWith = [ - "-//W3C//DTD HTML 4.01 Frameset//", - "-//W3C//DTD HTML 4.01 Transitional//" -] - -const PublicIdentifierStartsWithLimited = [ - "-//W3C//DTD XHTML 1.0 Frameset//", - "-//W3C//DTD XHTML 1.0 Transitional//" -] - -const SystemIdentifierNotMissingAndPublicIdentifierStartsWith = [ - "-//W3C//DTD HTML 4.01 Frameset//", - "-//W3C//DTD HTML 4.01 Transitional//" -] - -func quirksConditions(token: Token): bool = - if token.quirks: return true - if token.name.isnone or token.name.get != "html": return true - if token.sysid.issome: - if token.sysid.get == "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd": - return true - if token.pubid.issome: - if token.pubid.get in PublicIdentifierEquals: - return true - for id in PublicIdentifierStartsWith: - if token.pubid.get.startsWithNoCase(id): - return true - if token.sysid.isnone: - for id in 
SystemIdentifierMissingAndPublicIdentifierStartsWith: - if token.pubid.get.startsWithNoCase(id): - return true - return false - -func limitedQuirksConditions(token: Token): bool = - if token.pubid.isnone: return false - for id in PublicIdentifierStartsWithLimited: - if token.pubid.get.startsWithNoCase(id): - return true - if token.sysid.isnone: return false - for id in SystemIdentifierNotMissingAndPublicIdentifierStartsWith: - if token.pubid.get.startsWithNoCase(id): - return true - return false - -# 13.2.6.2 -proc genericRawtextElementParsingAlgorithm(parser: var HTML5Parser, token: Token) = - discard parser.insertHTMLElement(token) - parser.tokenizer.state = RAWTEXT - parser.oldInsertionMode = parser.insertionMode - parser.insertionMode = TEXT - -proc genericRCDATAElementParsingAlgorithm(parser: var HTML5Parser, token: Token) = - discard parser.insertHTMLElement(token) - parser.tokenizer.state = RCDATA - parser.oldInsertionMode = parser.insertionMode - parser.insertionMode = TEXT - -# Pop all elements, including the specified tag. 
proc popElementsIncl(parser: var HTML5Parser, tag: TagType) =
  ## Pops open elements until one whose tag type equals `tag` has been popped
  ## (that element is popped too). Callers are expected to have verified that
  ## such an element is on the stack.
  while parser.getTagType(parser.popElement()) != tag:
    discard

proc popElementsIncl(parser: var HTML5Parser, tags: set[TagType]) =
  ## Pops open elements until one whose tag type is in `tags` has been popped
  ## (that element is popped too).
  while parser.getTagType(parser.popElement()) notin tags:
    discard

# https://html.spec.whatwg.org/multipage/parsing.html#closing-elements-that-have-implied-end-tags
proc generateImpliedEndTags(parser: var HTML5Parser) =
  ## Pops the current node for as long as its tag type is one of the
  ## implied-end-tag types listed below.
  const tags = {TAG_DD, TAG_DT, TAG_LI, TAG_OPTGROUP, TAG_OPTION, TAG_P,
                TAG_RB, TAG_RP, TAG_RT, TAG_RTC}
  while parser.getTagType(parser.currentNode) in tags:
    discard parser.popElement()

proc generateImpliedEndTags(parser: var HTML5Parser, exclude: TagType) =
  ## Same as the overload above, except that a current node of tag type
  ## `exclude` stops the popping.
  let tags = {
    TAG_DD, TAG_DT, TAG_LI, TAG_OPTGROUP, TAG_OPTION, TAG_P, TAG_RB, TAG_RP,
    TAG_RT, TAG_RTC
  } - {exclude}
  while parser.getTagType(parser.currentNode) in tags:
    discard parser.popElement()

proc generateImpliedEndTagsThoroughly(parser: var HTML5Parser) =
  ## Variant of generateImpliedEndTags with the extended tag list (adds
  ## caption, colgroup and the table section/row/cell tags).
  const tags = {TAG_CAPTION, TAG_COLGROUP, TAG_DD, TAG_DT, TAG_LI,
                TAG_OPTGROUP, TAG_OPTION, TAG_P, TAG_RB, TAG_RP, TAG_RT,
                TAG_RTC, TAG_TBODY, TAG_TD, TAG_TFOOT, TAG_TH, TAG_THEAD,
                TAG_TR}
  while parser.getTagType(parser.currentNode) in tags:
    discard parser.popElement()

# https://html.spec.whatwg.org/multipage/parsing.html#push-onto-the-list-of-active-formatting-elements
proc pushOntoActiveFormatting[Handle](parser: var HTML5Parser[Handle],
    element: Handle, token: Token) =
  ## Appends (element, token) to the list of active formatting elements.
  ## Before doing so, entries after the last marker are scanned from the end;
  ## once three entries with the same tag name, namespace and attribute set as
  ## the new element have been seen, the third one found (i.e. the earliest of
  ## the three) is removed.
  var count = 0
  for i in countdown(parser.activeFormatting.high, 0):
    let it = parser.activeFormatting[i]
    if it[0].isNone: break # marker: stop scanning
    if not parser.tagNameEquals(it[0].get, element):
      continue
    if parser.getNamespace(it[0].get) != parser.getNamespace(element):
      continue
    # The attribute sets must be equal in both directions.
    var fail = false
    for k, v in it[1].attrs:
      if k notin token.attrs or v != token.attrs[k]:
        fail = true
        break
    if fail: continue
    for k, v in token.attrs:
      if k notin it[1].attrs:
        fail = true
        break
    if fail: continue
    inc count
    if count == 3:
      parser.activeFormatting.delete(i)
      break
  parser.activeFormatting.add((some(element), token))

# https://html.spec.whatwg.org/multipage/parsing.html#reconstruct-the-active-formatting-elements
proc reconstructActiveFormatting[Handle](parser: var HTML5Parser[Handle]) =
  ## Re-inserts elements for the trailing entries of the active formatting
  ## list that are no longer on the stack of open elements, replacing each
  ## such entry with the newly inserted element.
  ##
  ## Membership in the stack is tested by element identity. Testing by tag
  ## type (as was done previously) wrongly treats an entry as "still open"
  ## whenever any other element of the same type happens to be open.
  type State = enum
    REWIND, ADVANCE, CREATE
  if parser.activeFormatting.len == 0:
    return
  let last = parser.activeFormatting[^1][0]
  if last.isNone or last.get in parser.openElements:
    return # marker or still open: nothing to reconstruct
  var i = parser.activeFormatting.high
  template entry: Option[Handle] = (parser.activeFormatting[i][0])
  var state = REWIND
  while true:
    {.computedGoto.}
    case state
    of REWIND:
      if i == 0:
        state = CREATE
        continue
      dec i
      if entry.isSome and entry.get notin parser.openElements:
        continue # neither a marker nor open: keep rewinding
      state = ADVANCE
    of ADVANCE:
      inc i
      state = CREATE
    of CREATE:
      let element = parser.insertHTMLElement(parser.activeFormatting[i][1])
      parser.activeFormatting[i] = (
        some(element), parser.activeFormatting[i][1]
      )
      if i != parser.activeFormatting.high:
        state = ADVANCE
        continue
      break

proc clearActiveFormattingTillMarker(parser: var HTML5Parser) =
  ## Pops entries off the active formatting list until a marker has been
  ## popped or the list is empty.
  while parser.activeFormatting.len > 0 and
      parser.activeFormatting.pop()[0].isSome:
    discard

func isHTMLIntegrationPoint[Handle](parser: HTML5Parser[Handle],
    element: Handle): bool =
  ## Currently just forwards to isSVGIntegrationPoint.
  return parser.isSVGIntegrationPoint(element) # (NOTE MathML not implemented)

# https://html.spec.whatwg.org/multipage/urls-and-fetching.html#algorithm-for-extracting-a-character-encoding-from-a-meta-element
func extractEncFromMeta(s: string): Charset =
  ## Extracts the charset named in a `content` attribute value such as
  ## "text/html; charset=utf-8". Returns CHARSET_UNKNOWN when no usable
  ## `charset=` parameter is found.
  const needle = "charset"
  var i = 0
  while true: # Loop:
    # Find the next ASCII case-insensitive occurrence of "charset". Every
    # start position is tried, so overlapping prefixes ("ccharset") work.
    var found = false
    while i + needle.len <= s.len:
      var k = 0
      while k < needle.len and
          s[i + k] in {needle[k], needle[k].toUpperAscii()}:
        inc k
      if k == needle.len:
        found = true
        break
      inc i
    if not found:
      return CHARSET_UNKNOWN
    i += needle.len
    while i < s.len and s[i] in AsciiWhitespace: inc i
    if i < s.len and s[i] == '=':
      inc i
      break
    # Not followed by '=': resume the search from the current position.
  while i < s.len and s[i] in AsciiWhitespace: inc i
  if i >= s.len: return CHARSET_UNKNOWN
  if s[i] in {'"', '\''}:
    # Quoted value: it must be terminated by the same quote character.
    let quote = s[i]
    let start = i + 1
    var close = start
    while close < s.len and s[close] != quote: inc close
    if close >= s.len:
      return CHARSET_UNKNOWN # unmatched quote
    return getCharset(s.substr(start, close - 1))
  # Unquoted value: runs up to the first ASCII whitespace or semicolon.
  var stop = i
  while stop < s.len and s[stop] != ';' and s[stop] notin AsciiWhitespace:
    inc stop
  return getCharset(s.substr(i, stop - 1))

proc changeEncoding(parser: var HTML5Parser, cs: Charset) =
  ## Switches the parser to charset `cs` and marks the confidence as certain.
  ## If the charset actually changes, `needsreinterpret` is set. Nothing but
  ## the confidence changes when the parser is already using UTF-16.
  if parser.charset in {CHARSET_UTF_16_LE, CHARSET_UTF_16_BE}:
    parser.confidence = CONFIDENCE_CERTAIN
    return
  parser.confidence = CONFIDENCE_CERTAIN
  # x-user-defined is treated as windows-1252; map it before comparing so
  # that no reinterpretation is requested when windows-1252 is already used.
  let target =
    if cs == CHARSET_X_USER_DEFINED: CHARSET_WINDOWS_1252
    else: cs
  if cs == parser.charset or target == parser.charset:
    return
  parser.charset = target
  parser.needsreinterpret = true

proc parseErrorByTokenType(parser: var HTML5Parser, tokenType: TokenType) =
  ## Reports the "unexpected ..." parse error matching `tokenType`. Only
  ## START_TAG, END_TAG and EOF are supported; anything else fails an
  ## assertion.
  case tokenType
  of START_TAG:
    parser.parseError UNEXPECTED_START_TAG
  of END_TAG:
    parser.parseError UNEXPECTED_END_TAG
  of EOF:
    parser.parseError UNEXPECTED_EOF
  else:
    doAssert false

# https://html.spec.whatwg.org/multipage/parsing.html#adoption-agency-algorithm
proc adoptionAgencyAlgorithm[Handle](parser: var HTML5Parser[Handle],
    token: Token): bool =
  ## Runs the adoption agency algorithm for `token`.
  ## Returns true when the caller must instead handle the token as "any other
  ## end tag"; returns false when the token has been fully handled here.
  template parse_error(e: ParseError) =
    parser.parseError(e)
  # Fast path: the current node has the token's tag name and is not an active
  # formatting element, so it is simply popped.
  if parser.tagNameEquals(parser.currentNode, token):
    var fail = true
    for it in parser.activeFormatting:
      if it[0].isSome and it[0].get == parser.currentNode:
        fail = false
    if fail:
      pop_current_node
      return false
  var outerCount = 0
  while true:
    if outerCount >= 8: return false
    inc outerCount
    # Formatting element: the last entry after the last marker whose tag name
    # matches the token. (The entry itself is compared with the token; the
    # previous code compared the current node instead.)
    var formatting: Handle
    var formattingIndex = -1
    for j in countdown(parser.activeFormatting.high, 0):
      let element = parser.activeFormatting[j][0]
      if element.isNone:
        break # marker
      if parser.tagNameEquals(element.get, token):
        formatting = element.get
        formattingIndex = j
        break
    if formattingIndex < 0:
      return true
    let stackIndex = parser.openElements.find(formatting)
    if stackIndex < 0:
      parse_error ELEMENT_NOT_IN_OPEN_ELEMENTS
      parser.activeFormatting.delete(formattingIndex)
      return false
    if not parser.hasElementInScope(formatting):
      parse_error ELEMENT_NOT_IN_SCOPE
      return false
    if formatting != parser.currentNode:
      parse_error ELEMENT_NOT_CURRENT_NODE
    # Furthest block: the special element closest to the formatting element
    # among those pushed after it, i.e. the first one found when walking from
    # the formatting element towards the current node.
    var furthestBlockIndex = -1
    for j in stackIndex + 1 .. parser.openElements.high:
      if parser.getTagType(parser.openElements[j]) in SpecialElements:
        furthestBlockIndex = j
        break
    if furthestBlockIndex == -1:
      while parser.popElement() != formatting: discard
      parser.activeFormatting.delete(formattingIndex)
      return false
    var furthestBlock = parser.openElements[furthestBlockIndex]
    let commonAncestor = parser.openElements[stackIndex - 1]
    var bookmark = formattingIndex
    var node = furthestBlock
    var aboveNode = parser.openElements[furthestBlockIndex - 1]
    var lastNode = furthestBlock
    var innerCount = 0
    while true:
      inc innerCount
      node = aboveNode
      if node == formatting: break
      let nodeStackIndex = parser.openElements.find(node)
      var nodeFormattingIndex = -1
      for k in countdown(parser.activeFormatting.high, 0):
        let it = parser.activeFormatting[k][0]
        if it.isSome and it.get == node:
          nodeFormattingIndex = k
          break
      if innerCount > 3 and nodeFormattingIndex >= 0:
        parser.activeFormatting.delete(nodeFormattingIndex)
        # An earlier entry got deleted, so indices after it shift down by one.
        if nodeFormattingIndex < bookmark:
          dec bookmark
        if nodeFormattingIndex < formattingIndex:
          dec formattingIndex
        # The node is no longer in the list; fall through to the removal below
        # instead of indexing the list with a stale position.
        nodeFormattingIndex = -1
      if nodeFormattingIndex < 0:
        aboveNode = parser.openElements[nodeStackIndex - 1]
        parser.openElements.delete(nodeStackIndex)
        if nodeStackIndex < furthestBlockIndex:
          dec furthestBlockIndex
          furthestBlock = parser.openElements[furthestBlockIndex]
        continue
      # Replace the node with a fresh element created from its token, both in
      # the active formatting list and on the stack of open elements.
      let tok = parser.activeFormatting[nodeFormattingIndex][1]
      let element = parser.createElement(tok, Namespace.HTML, commonAncestor)
      parser.activeFormatting[nodeFormattingIndex] = (some(element), tok)
      parser.openElements[nodeStackIndex] = element
      aboveNode = parser.openElements[nodeStackIndex - 1]
      node = element
      if lastNode == furthestBlock:
        bookmark = nodeFormattingIndex + 1
      parser.append(node, lastNode)
      lastNode = node
    let location = parser.appropriatePlaceForInsert(commonAncestor)
    parser.insertBefore(location.inside, lastNode, location.before)
    let formattingToken = parser.activeFormatting[formattingIndex][1]
    let element = parser.createElement(formattingToken, Namespace.HTML,
      furthestBlock)
    # Move all children of the furthest block into the new element. They are
    # detached back to front, then re-attached front to back so that their
    # original order is preserved.
    var tomove: seq[Handle]
    var c = furthestBlock.childList.high
    while c >= 0:
      let child = furthestBlock.childList[c]
      tomove.add(child)
      parser.remove(child)
      dec c
    for k in countdown(tomove.high, 0):
      parser.append(element, tomove[k])
    parser.append(furthestBlock, element)
    # Remove the formatting element first, then insert the new element at the
    # bookmark; inserting first would shift the formatting element whenever
    # bookmark <= formattingIndex and the wrong entry would get deleted.
    parser.activeFormatting.delete(formattingIndex)
    if bookmark > formattingIndex:
      dec bookmark
    parser.activeFormatting.insert((some(element), formattingToken), bookmark)
    # Likewise for the stack: after the removal the furthest block sits at
    # furthestBlockIndex - 1, so this places the new element right below it.
    parser.openElements.delete(stackIndex)
    parser.openElements.insert(element, furthestBlockIndex)

proc closeP(parser: var HTML5Parser) =
  ## Closes a p element: generates implied end tags except for p, reports a
  ## parse error if the current node is then not a p, and pops up to and
  ## including the nearest p.
  parser.generateImpliedEndTags(TAG_P)
  if parser.getTagType(parser.currentNode) != TAG_P:
    parser.parseError(MISMATCHED_TAGS)
  parser.popElementsIncl(TAG_P)

# Following is an implementation of the state (?) machine defined in
# https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inhtml
# It uses the ad-hoc pattern matching macro `match' to apply the following
# transformations:
# * First, pairs of patterns and actions are stored in tuples (and `discard'
#   statements...)
# * These pairs are then assigned to token types, later mapped to legs of the
#   first case statement.
# * Another case statement is constructed where needed, e.g. for switching on
#   characters/tags/etc.
# * Finally, the whole thing is wrapped in a named block, to implement a
#   pseudo-goto by breaking out only when the else statement needn't be
#   executed.
#
# For example, the following code:
#
#   match token:
#     TokenType.COMMENT => (block: echo "comment")
#     ("<p>", "<a>", "</div>") => (block: echo "p, a or closing div")
#     ("<div>", "</p>") => (block: anything_else)
#     (TokenType.START_TAG, TokenType.END_TAG) => (block: assert false, "invalid")
#     other => (block: echo "anything else")
#
# (effectively) generates this:
#
#   block inside_not_else:
#     case token.t
#     of TokenType.COMMENT:
#       echo "comment"
#       break inside_not_else
#     of TokenType.START_TAG:
#       case token.tagtype
#       of {TAG_P, TAG_A}:
#         echo "p, a or closing div"
#         break inside_not_else
#       of TAG_DIV: discard
#       else:
#         assert false
#         break inside_not_else
#     of TokenType.END_TAG:
#       case token.tagtype
#       of TAG_DIV:
#         echo "p, a or closing div"
#         break inside_not_else
#       of TAG_P: discard
#       else:
#         assert false
#         break inside_not_else
#     else: discard
#     echo "anything else"
#
# This duplicates any code that applies for several token types, except for the
# else branch.
-macro match(token: Token, body: typed): untyped = - type OfBranchStore = object - ofBranches: seq[(seq[NimNode], NimNode)] - defaultBranch: NimNode - painted: bool - - # Stores 'of' branches - var ofBranches: array[TokenType, OfBranchStore] - # Stores 'else', 'elif' branches - var defaultBranch: NimNode - - const tokenTypes = (func(): Table[string, TokenType] = - for tt in TokenType: - result[$tt] = tt)() - - for disc in body: - let tup = disc[0] # access actual tuple - let pattern = `tup`[0] - let lambda = `tup`[1] - var action = lambda.findChild(it.kind notin {nnkSym, nnkEmpty, nnkFormalParams}) - if pattern.kind != nnkDiscardStmt and not (action.len == 2 and action[1].kind == nnkDiscardStmt and action[1][0] == newStrLitNode("anything_else")): - action = quote do: - `action` - #eprint token #debug - break inside_not_else - - var patterns = @[pattern] - while patterns.len > 0: - let pattern = patterns.pop() - case pattern.kind - of nnkSym: # simple symbols; we assume these are the enums - ofBranches[tokenTypes[pattern.strVal]].defaultBranch = action - ofBranches[tokenTypes[pattern.strVal]].painted = true - of nnkCharLit: - ofBranches[CHARACTER_ASCII].ofBranches.add((@[pattern], action)) - ofBranches[CHARACTER_ASCII].painted = true - of nnkCurly: - case pattern[0].kind - of nnkCharLit: - ofBranches[CHARACTER_ASCII].ofBranches.add((@[pattern], action)) - ofBranches[CHARACTER_ASCII].painted = true - else: error "Unsupported curly of kind " & $pattern[0].kind - of nnkStrLit: - var tempTokenizer = newTokenizer(pattern.strVal) - for token in tempTokenizer.tokenize: - let tt = int(token.tagtype) - case token.t - of START_TAG, END_TAG: - var found = false - for i in 0..ofBranches[token.t].ofBranches.high: - if ofBranches[token.t].ofBranches[i][1] == action: - found = true - ofBranches[token.t].ofBranches[i][0].add((quote do: TagType(`tt`))) - ofBranches[token.t].painted = true - break - if not found: - ofBranches[token.t].ofBranches.add((@[(quote do: TagType(`tt`))], 
action)) - ofBranches[token.t].painted = true - else: - error pattern.strVal & ": Unsupported token " & $token & - " of kind " & $token.t - break - of nnkDiscardStmt: - defaultBranch = action - of nnkTupleConstr: - for child in pattern: - patterns.add(child) - else: - error pattern.strVal & ": Unsupported pattern of kind " & $pattern.kind - - func tokenBranchOn(tok: TokenType): NimNode = - case tok - of START_TAG, END_TAG: - return quote do: token.tagtype - of CHARACTER: - return quote do: token.r - of CHARACTER_ASCII: - return quote do: token.c - else: - error "Unsupported branching of token " & $tok - - template add_to_case(branch: typed) = - if branch[0].len == 1: - tokenCase.add(newNimNode(nnkOfBranch).add(branch[0][0]).add(branch[1])) - else: - var curly = newNimNode(nnkCurly) - for node in branch[0]: - curly.add(node) - tokenCase.add(newNimNode(nnkOfBranch).add(curly).add(branch[1])) - - # Build case statements - var mainCase = newNimNode(nnkCaseStmt).add(quote do: `token`.t) - for tt in TokenType: - let ofBranch = newNimNode(nnkOfBranch).add(quote do: TokenType(`tt`)) - let tokenCase = newNimNode(nnkCaseStmt) - if ofBranches[tt].defaultBranch != nil: - if ofBranches[tt].ofBranches.len > 0: - tokenCase.add(tokenBranchOn(tt)) - for branch in ofBranches[tt].ofBranches: - add_to_case branch - tokenCase.add(newNimNode(nnkElse).add(ofBranches[tt].defaultBranch)) - ofBranch.add(tokenCase) - mainCase.add(ofBranch) - else: - ofBranch.add(ofBranches[tt].defaultBranch) - mainCase.add(ofBranch) - else: - if ofBranches[tt].ofBranches.len > 0: - tokenCase.add(tokenBranchOn(tt)) - for branch in ofBranches[tt].ofBranches: - add_to_case branch - ofBranch.add(tokenCase) - tokenCase.add(newNimNode(nnkElse).add(quote do: discard)) - mainCase.add(ofBranch) - else: - discard - - for t in TokenType: - if not ofBranches[t].painted: - mainCase.add(newNimNode(nnkElse).add(quote do: discard)) - break - - var stmts = newStmtList().add(mainCase) - for stmt in defaultBranch: - 
stmts.add(stmt) - result = newBlockStmt(ident("inside_not_else"), stmts) - -proc processInHTMLContent[Handle](parser: var HTML5Parser[Handle], - token: Token, insertionMode: InsertionMode) = - template pop_all_nodes = - while parser.openElements.len > 1: pop_current_node - - template anything_else = discard "anything_else" - - macro `=>`(v: typed, body: untyped): untyped = - quote do: - discard (`v`, proc() = `body`) - - template other = discard - - template reprocess(tok: Token) = - parser.processInHTMLContent(tok, parser.insertionMode) - - template parse_error(e: ParseError) = - parser.parseError(e) - - template parse_error_if_mismatch(tagtype: TagType) = - if parser.hasParseError(): - if parser.getTagType(parser.currentNode) != TAG_DD: - parse_error MISMATCHED_TAGS - - template parse_error_if_mismatch(tagtypes: set[TagType]) = - if parser.hasParseError(): - if parser.getTagType(parser.currentNode) notin tagtypes: - parse_error MISMATCHED_TAGS - - case insertionMode - of INITIAL: - match token: - AsciiWhitespace => (block: discard) - TokenType.COMMENT => (block: - parser.insertComment(token, last_child_of(parser.document)) - ) - TokenType.DOCTYPE => (block: - if token.name.isNone or - token.name.get != "html" or token.pubid.isSome or - (token.sysid.isSome and token.sysid.get != "about:legacy-compat"): - parse_error INVALID_DOCTYPE - let doctype = parser.createDocumentType(token.name.get(""), - token.pubid.get(""), token.sysid.get("")) - parser.append(parser.document, doctype) - if not parser.opts.isIframeSrcdoc: - if quirksConditions(token): - parser.setQuirksMode(QUIRKS) - elif limitedQuirksConditions(token): - parser.setQuirksMode(LIMITED_QUIRKS) - parser.insertionMode = BEFORE_HTML - ) - other => (block: - if not parser.opts.isIframeSrcdoc: - parse_error UNEXPECTED_INITIAL_TOKEN - parser.setQuirksMode(QUIRKS) - parser.insertionMode = BEFORE_HTML - reprocess token - ) - - of BEFORE_HTML: - match token: - TokenType.DOCTYPE => (block: parse_error 
UNEXPECTED_DOCTYPE) - TokenType.COMMENT => (block: - parser.insertComment(token, last_child_of(parser.document)) - ) - AsciiWhitespace => (block: discard) - "<html>" => (block: - let element = parser.createElement(token, Namespace.HTML, - parser.document) - parser.append(parser.document, element) - parser.pushElement(element) - parser.insertionMode = BEFORE_HEAD - ) - ("</head>", "</body>", "</html>", "</br>") => (block: anything_else) - TokenType.END_TAG => (block: parse_error UNEXPECTED_END_TAG) - other => (block: - let element = parser.createElement(TAG_HTML, Namespace.HTML) - parser.append(parser.document, element) - parser.pushElement(element) - parser.insertionMode = BEFORE_HEAD - reprocess token - ) - - of BEFORE_HEAD: - match token: - AsciiWhitespace => (block: discard) - TokenType.COMMENT => (block: parser.insertComment(token)) - TokenType.DOCTYPE => (block: parse_error UNEXPECTED_DOCTYPE) - "<html>" => (block: parser.processInHTMLContent(token, IN_BODY)) - "<head>" => (block: - parser.head = some(parser.insertHTMLElement(token)) - parser.insertionMode = IN_HEAD - ) - ("</head>", "</body>", "</html>", "</br>") => (block: anything_else) - TokenType.END_TAG => (block: parse_error UNEXPECTED_END_TAG) - other => (block: - let head = Token(t: START_TAG, tagtype: TAG_HEAD) - parser.head = some(parser.insertHTMLElement(head)) - parser.insertionMode = IN_HEAD - reprocess token - ) - - of IN_HEAD: - match token: - AsciiWhitespace => (block: discard) - TokenType.COMMENT => (block: parser.insertComment(token)) - TokenType.DOCTYPE => (block: parse_error UNEXPECTED_DOCTYPE) - "<html>" => (block: parser.processInHTMLContent(token, IN_BODY)) - ("<base>", "<basefont>", "<bgsound>", "<link>") => (block: - discard parser.insertHTMLElement(token) - pop_current_node - ) - "<meta>" => (block: - discard parser.insertHTMLElement(token) - pop_current_node - if parser.confidence == CONFIDENCE_TENTATIVE: - let cs = getCharset(token.attrs.getOrDefault("charset", "")) - if cs != 
CHARSET_UNKNOWN: - parser.changeEncoding(cs) - elif "http-equiv" in token.attrs: - if token.attrs["http-equiv"].equalsIgnoreCase("Content-Type") and - "content" in token.attrs: - let cs = extractEncFromMeta(token.attrs["content"]) - if cs != CHARSET_UNKNOWN: - parser.changeEncoding(cs) - ) - "<title>" => (block: parser.genericRCDATAElementParsingAlgorithm(token)) - "<noscript>" => (block: - if not parser.opts.scripting: - discard parser.insertHTMLElement(token) - parser.insertionMode = IN_HEAD_NOSCRIPT - else: - parser.genericRawtextElementParsingAlgorithm(token) - ) - ("<noframes>", "<style>") => (block: parser.genericRawtextElementParsingAlgorithm(token)) - "<script>" => (block: - let location = parser.appropriatePlaceForInsert() - let element = parser.createElement(token, Namespace.HTML, location.inside) - #TODO document.write (?) - parser.insert(location, element) - parser.pushElement(element) - parser.tokenizer.state = SCRIPT_DATA - parser.oldInsertionMode = parser.insertionMode - parser.insertionMode = TEXT - ) - "</head>" => (block: - pop_current_node - parser.insertionMode = AFTER_HEAD - ) - ("</body>", "</html>", "</br>") => (block: anything_else) - "<template>" => (block: - discard parser.insertHTMLElement(token) - parser.activeFormatting.add((none(Handle), nil)) - parser.framesetok = false - parser.insertionMode = IN_TEMPLATE - parser.templateModes.add(IN_TEMPLATE) - ) - "</template>" => (block: - if not parser.hasElement(TAG_TEMPLATE): - parse_error ELEMENT_NOT_IN_OPEN_ELEMENTS - else: - parser.generateImpliedEndTagsThoroughly() - if parser.getTagType(parser.currentNode) != TAG_TEMPLATE: - parse_error MISMATCHED_TAGS - parser.popElementsIncl(TAG_TEMPLATE) - parser.clearActiveFormattingTillMarker() - discard parser.templateModes.pop() - parser.resetInsertionMode() - ) - ("<head>", TokenType.END_TAG) => (block: parse_error UNEXPECTED_END_TAG) - other => (block: - pop_current_node - parser.insertionMode = AFTER_HEAD - reprocess token - ) - - of 
IN_HEAD_NOSCRIPT: - match token: - TokenType.DOCTYPE => (block: parse_error UNEXPECTED_DOCTYPE) - "<html>" => (block: parser.processInHTMLContent(token, IN_BODY)) - "</noscript>" => (block: - pop_current_node - parser.insertionMode = IN_HEAD - ) - (AsciiWhitespace, - TokenType.COMMENT, - "<basefont>", "<bgsound>", "<link>", "<meta>", "<noframes>", - "<style>") => (block: - parser.processInHTMLContent(token, IN_HEAD)) - "</br>" => (block: anything_else) - ("<head>", "<noscript>") => (block: parse_error UNEXPECTED_START_TAG) - TokenType.END_TAG => (block: parse_error UNEXPECTED_END_TAG) - other => (block: - pop_current_node - parser.insertionMode = IN_HEAD - reprocess token - ) - - of AFTER_HEAD: - match token: - AsciiWhitespace => (block: parser.insertCharacter(token.c)) - TokenType.COMMENT => (block: parser.insertComment(token)) - TokenType.DOCTYPE => (block: parse_error UNEXPECTED_DOCTYPE) - "<html>" => (block: parser.processInHTMLContent(token, IN_BODY)) - "<body>" => (block: - discard parser.insertHTMLElement(token) - parser.framesetok = false - parser.insertionMode = IN_BODY - ) - "<frameset>" => (block: - discard parser.insertHTMLElement(token) - parser.insertionMode = IN_FRAMESET - ) - ("<base>", "<basefont>", "<bgsound>", "<link>", "<meta>", "<noframes>", - "<script>", "<style>", "<template>", "<title>") => (block: - parse_error UNEXPECTED_START_TAG - parser.pushElement(parser.head.get) - parser.processInHTMLContent(token, IN_HEAD) - for i in countdown(parser.openElements.high, 0): - if parser.openElements[i] == parser.head.get: - parser.openElements.delete(i) - ) - "</template>" => (block: parser.processInHTMLContent(token, IN_HEAD)) - ("</body>", "</html>", "</br>") => (block: anything_else) - ("<head>") => (block: parse_error UNEXPECTED_START_TAG) - (TokenType.END_TAG) => (block: parse_error UNEXPECTED_END_TAG) - other => (block: - discard parser.insertHTMLElement(Token(t: START_TAG, tagtype: TAG_BODY)) - parser.insertionMode = IN_BODY - reprocess token - 
) - - of IN_BODY: - template any_other_start_tag() = - parser.reconstructActiveFormatting() - discard parser.insertHTMLElement(token) - - template any_other_end_tag() = - for i in countdown(parser.openElements.high, 0): - let node = parser.openElements[i] - if parser.tagNameEquals(node, token): - parser.generateImpliedEndTags(token.tagtype) - if node != parser.currentNode: - parse_error ELEMENT_NOT_CURRENT_NODE - while parser.popElement() != node: - discard - break - elif parser.getTagType(node) in SpecialElements: - parse_error UNEXPECTED_SPECIAL_ELEMENT - return - - template parse_error_if_body_has_disallowed_open_elements = - if parser.hasParseError(): - const Disallowed = AllTagTypes - { - TAG_DD, TAG_DT, TAG_LI, TAG_OPTGROUP, TAG_OPTION, TAG_P, TAG_RB, - TAG_RP, TAG_RT, TAG_RTC, TAG_TBODY, TAG_TD, TAG_TFOOT, TAG_TH, - TAG_THEAD, TAG_TR, TAG_BODY, TAG_HTML - } - if parser.hasElement(Disallowed): - parse_error MISMATCHED_TAGS - - match token: - '\0' => (block: parse_error UNEXPECTED_NULL) - AsciiWhitespace => (block: - parser.reconstructActiveFormatting() - parser.insertCharacter(token.c) - ) - TokenType.CHARACTER_ASCII => (block: - parser.reconstructActiveFormatting() - parser.insertCharacter(token.c) - parser.framesetOk = false - ) - TokenType.CHARACTER => (block: - parser.reconstructActiveFormatting() - parser.insertCharacter(token.r) - parser.framesetOk = false - ) - TokenType.COMMENT => (block: parser.insertComment(token)) - TokenType.DOCTYPE => (block: parse_error UNEXPECTED_DOCTYPE) - "<html>" => (block: - parse_error UNEXPECTED_START_TAG - if parser.hasElement(TAG_TEMPLATE): - discard - else: - parser.addAttrsIfMissing(parser.openElements[0], token.attrs) - ) - ("<base>", "<basefont>", "<bgsound>", "<link>", "<meta>", "<noframes>", - "<script>", "<style>", "<template>", "<title>", - "</template>") => (block: parser.processInHTMLContent(token, IN_HEAD)) - "<body>" => (block: - parse_error UNEXPECTED_START_TAG - if parser.openElements.len == 1 or - 
parser.getTagType(parser.openElements[1]) != TAG_BODY or - parser.hasElement(TAG_TEMPLATE): - discard - else: - parser.framesetOk = false - parser.addAttrsIfMissing(parser.openElements[1], token.attrs) - ) - "<frameset>" => (block: - parse_error UNEXPECTED_START_TAG - if parser.openElements.len == 1 or - parser.getTagType(parser.openElements[1]) != TAG_BODY or - not parser.framesetOk: - discard - else: - parser.remove(parser.openElements[1]) - pop_all_nodes - ) - TokenType.EOF => (block: - if parser.templateModes.len > 0: - parser.processInHTMLContent(token, IN_TEMPLATE) - else: - parse_error_if_body_has_disallowed_open_elements - # stop - ) - "</body>" => (block: - if not parser.hasElementInScope(TAG_BODY): - parse_error UNEXPECTED_END_TAG - else: - parse_error_if_body_has_disallowed_open_elements - parser.insertionMode = AFTER_BODY - ) - "</html>" => (block: - if not parser.hasElementInScope(TAG_BODY): - parse_error UNEXPECTED_END_TAG - else: - parse_error_if_body_has_disallowed_open_elements - parser.insertionMode = AFTER_BODY - reprocess token - ) - ("<address>", "<article>", "<aside>", "<blockquote>", "<center>", - "<details>", "<dialog>", "<dir>", "<div>", "<dl>", "<fieldset>", - "<figcaption>", "<figure>", "<footer>", "<header>", "<hgroup>", "<main>", - "<menu>", "<nav>", "<ol>", "<p>", "<search>", "<section>", "<summary>", - "<ul>") => (block: - if parser.hasElementInButtonScope(TAG_P): - parser.closeP() - discard parser.insertHTMLElement(token) - ) - ("<h1>", "<h2>", "<h3>", "<h4>", "<h5>", "<h6>") => (block: - if parser.hasElementInButtonScope(TAG_P): - parser.closeP() - if parser.getTagType(parser.currentNode) in HTagTypes: - parse_error NESTED_TAGS - pop_current_node - discard parser.insertHTMLElement(token) - ) - ("<pre>", "<listing>") => (block: - if parser.hasElementInButtonScope(TAG_P): - parser.closeP() - discard parser.insertHTMLElement(token) - parser.ignoreLF = true - parser.framesetOk = false - ) - "<form>" => (block: - let hasTemplate = 
parser.hasElement(TAG_TEMPLATE) - if parser.form.isSome and not hasTemplate: - parse_error NESTED_TAGS - else: - if parser.hasElementInButtonScope(TAG_P): - parser.closeP() - let element = parser.insertHTMLElement(token) - if not hasTemplate: - parser.form = some(element) - ) - "<li>" => (block: - parser.framesetOk = false - for i in countdown(parser.openElements.high, 0): - let node = parser.openElements[i] - let tagType = parser.getTagType(node) - case tagType - of TAG_LI: - parser.generateImpliedEndTags(TAG_LI) - parse_error_if_mismatch TAG_LI - parser.popElementsIncl(TAG_LI) - break - of SpecialElements - {TAG_ADDRESS, TAG_DIV, TAG_P, TAG_LI}: - break - else: discard - if parser.hasElementInButtonScope(TAG_P): - parser.closeP() - discard parser.insertHTMLElement(token) - ) - ("<dd>", "<dt>") => (block: - parser.framesetOk = false - for i in countdown(parser.openElements.high, 0): - let node = parser.openElements[i] - let tagType = parser.getTagType(node) - case tagType - of TAG_DD: - parser.generateImpliedEndTags(TAG_DD) - parse_error_if_mismatch TAG_DD - parser.popElementsIncl(TAG_DD) - break - of TAG_DT: - parser.generateImpliedEndTags(TAG_DT) - parse_error_if_mismatch TAG_DT - parser.popElementsIncl(TAG_DT) - break - of SpecialElements - {TAG_ADDRESS, TAG_DIV, TAG_P, TAG_DD, TAG_DT}: - break - else: discard - if parser.hasElementInButtonScope(TAG_P): - parser.closeP() - discard parser.insertHTMLElement(token) - ) - "<plaintext>" => (block: - if parser.hasElementInButtonScope(TAG_P): - parser.closeP() - discard parser.insertHTMLElement(token) - parser.tokenizer.state = PLAINTEXT - ) - "<button>" => (block: - if parser.hasElementInScope(TAG_BUTTON): - parse_error NESTED_TAGS - parser.generateImpliedEndTags() - parser.popElementsIncl(TAG_BUTTON) - parser.reconstructActiveFormatting() - discard parser.insertHTMLElement(token) - parser.framesetOk = false - ) - ("</address>", "</article>", "</aside>", "</blockquote>", "</button>", - "</center>", "</details>", 
"</dialog>", "</dir>", "</div>", "</dl>", - "</fieldset>", "</figcaption>", "</figure>", "</footer>", "</header>", - "</hgroup>", "</listing>", "</main>", "</menu>", "</nav>", "</ol>", - "</pre>", "</search>", "</section>", "</summary>", "</ul>") => (block: - if not parser.hasElementInScope(token.tagtype): - parse_error ELEMENT_NOT_IN_SCOPE - else: - parser.generateImpliedEndTags() - parse_error_if_mismatch token.tagtype - parser.popElementsIncl(token.tagtype) - ) - "</form>" => (block: - if not parser.hasElement(TAG_TEMPLATE): - let form = parser.form - parser.form = none(Handle) - if form.isNone or - not parser.hasElementInScope(parser.getTagType(form.get)): - parse_error ELEMENT_NOT_IN_SCOPE - return - let node = form.get - parser.generateImpliedEndTags() - if parser.currentNode != node: - parse_error ELEMENT_NOT_CURRENT_NODE - parser.openElements.delete(parser.openElements.find(node)) - else: - if not parser.hasElementInScope(TAG_FORM): - parse_error ELEMENT_NOT_IN_SCOPE - else: - parser.generateImpliedEndTags() - parse_error_if_mismatch TAG_FORM - parser.popElementsIncl(TAG_FORM) - ) - "</p>" => (block: - if not parser.hasElementInButtonScope(TAG_P): - parse_error ELEMENT_NOT_IN_SCOPE - discard parser.insertHTMLElement(Token(t: START_TAG, tagtype: TAG_P)) - parser.closeP() - ) - "</li>" => (block: - if not parser.hasElementInListItemScope(TAG_LI): - parse_error ELEMENT_NOT_IN_SCOPE - else: - parser.generateImpliedEndTags(TAG_LI) - parse_error_if_mismatch TAG_LI - parser.popElementsIncl(TAG_LI) - ) - ("</dd>", "</dt>") => (block: - if not parser.hasElementInScope(token.tagtype): - parse_error ELEMENT_NOT_IN_SCOPE - else: - parser.generateImpliedEndTags(token.tagtype) - parse_error_if_mismatch token.tagtype - parser.popElementsIncl(token.tagtype) - ) - ("</h1>", "</h2>", "</h3>", "</h4>", "</h5>", "</h6>") => (block: - if not parser.hasElementInScope(HTagTypes): - parse_error ELEMENT_NOT_IN_SCOPE - else: - parser.generateImpliedEndTags() - 
parse_error_if_mismatch token.tagtype - parser.popElementsIncl(HTagTypes) - ) - "</sarcasm>" => (block: - #*deep breath* - anything_else - ) - "<a>" => (block: - var anchor: Option[Handle] - for i in countdown(parser.activeFormatting.high, 0): - let format = parser.activeFormatting[i] - if format[0].isNone: - break - if parser.getTagType(format[0].get) == TAG_A: - anchor = format[0] - break - if anchor.isSome: - parse_error NESTED_TAGS - if parser.adoptionAgencyAlgorithm(token): - any_other_end_tag - return - for i in 0..parser.activeFormatting.high: - if parser.activeFormatting[i][0].isSome and - parser.activeFormatting[i][0].get == anchor.get: - parser.activeFormatting.delete(i) - break - for i in 0..parser.openElements.high: - if parser.openElements[i] == anchor.get: - parser.openElements.delete(i) - break - parser.reconstructActiveFormatting() - let element = parser.insertHTMLElement(token) - parser.pushOntoActiveFormatting(element, token) - ) - ("<b>", "<big>", "<code>", "<em>", "<font>", "<i>", "<s>", "<small>", - "<strike>", "<strong>", "<tt>", "<u>") => (block: - parser.reconstructActiveFormatting() - let element = parser.insertHTMLElement(token) - parser.pushOntoActiveFormatting(element, token) - ) - "<nobr>" => (block: - parser.reconstructActiveFormatting() - if parser.hasElementInScope(TAG_NOBR): - parse_error NESTED_TAGS - if parser.adoptionAgencyAlgorithm(token): - any_other_end_tag - return - parser.reconstructActiveFormatting() - let element = parser.insertHTMLElement(token) - parser.pushOntoActiveFormatting(element, token) - ) - ("</a>", "</b>", "</big>", "</code>", "</em>", "</font>", "</i>", - "</nobr>", "</s>", "</small>", "</strike>", "</strong>", "</tt>", - "</u>") => (block: - if parser.adoptionAgencyAlgorithm(token): - any_other_end_tag - return - ) - ("<applet>", "<marquee>", "<object>") => (block: - parser.reconstructActiveFormatting() - discard parser.insertHTMLElement(token) - parser.activeFormatting.add((none(Handle), nil)) - 
parser.framesetOk = false - ) - ("</applet>", "</marquee>", "</object>") => (block: - if not parser.hasElementInScope(token.tagtype): - parse_error ELEMENT_NOT_IN_SCOPE - else: - parser.generateImpliedEndTags() - parse_error_if_mismatch token.tagtype - while parser.getTagType(parser.popElement()) != token.tagtype: discard - parser.clearActiveFormattingTillMarker() - ) - "<table>" => (block: - if parser.quirksMode != QUIRKS: - if parser.hasElementInButtonScope(TAG_P): - parser.closeP() - discard parser.insertHTMLElement(token) - parser.framesetOk = false - parser.insertionMode = IN_TABLE - ) - "</br>" => (block: - parse_error UNEXPECTED_END_TAG - reprocess Token(t: START_TAG, tagtype: TAG_BR) - ) - ("<area>", "<br>", "<embed>", "<img>", "<keygen>", "<wbr>") => (block: - parser.reconstructActiveFormatting() - discard parser.insertHTMLElement(token) - pop_current_node - parser.framesetOk = false - ) - "<input>" => (block: - parser.reconstructActiveFormatting() - discard parser.insertHTMLElement(token) - pop_current_node - if not token.attrs.getOrDefault("type").equalsIgnoreCase("hidden"): - parser.framesetOk = false - ) - ("<param>", "<source>", "<track>") => (block: - discard parser.insertHTMLElement(token) - pop_current_node - ) - "<hr>" => (block: - if parser.hasElementInButtonScope(TAG_P): - parser.closeP() - discard parser.insertHTMLElement(token) - pop_current_node - parser.framesetOk = false - ) - "<image>" => (block: - #TODO ew - let token = Token(t: START_TAG, tagtype: TAG_IMG, tagname: "img", selfclosing: token.selfclosing, attrs: token.attrs) - reprocess token - ) - "<textarea>" => (block: - discard parser.insertHTMLElement(token) - parser.ignoreLF = true - parser.tokenizer.state = RCDATA - parser.oldInsertionMode = parser.insertionMode - parser.framesetOk = false - parser.insertionMode = TEXT - ) - "<xmp>" => (block: - if parser.hasElementInButtonScope(TAG_P): - parser.closeP() - parser.reconstructActiveFormatting() - parser.framesetOk = false - 
parser.genericRawtextElementParsingAlgorithm(token) - ) - "<iframe>" => (block: - parser.framesetOk = false - parser.genericRawtextElementParsingAlgorithm(token) - ) - "<noembed>" => (block: - parser.genericRawtextElementParsingAlgorithm(token) - ) - "<noscript>" => (block: - if parser.opts.scripting: - parser.genericRawtextElementParsingAlgorithm(token) - else: - any_other_start_tag - ) - "<select>" => (block: - parser.reconstructActiveFormatting() - discard parser.insertHTMLElement(token) - parser.framesetOk = false - if parser.insertionMode in {IN_TABLE, IN_CAPTION, IN_TABLE_BODY, IN_CELL}: - parser.insertionMode = IN_SELECT_IN_TABLE - else: - parser.insertionMode = IN_SELECT - ) - ("<optgroup>", "<option>") => (block: - if parser.getTagType(parser.currentNode) == TAG_OPTION: - pop_current_node - parser.reconstructActiveFormatting() - discard parser.insertHTMLElement(token) - ) - ("<rb>", "<rtc>") => (block: - if parser.hasElementInScope(TAG_RUBY): - parser.generateImpliedEndTags() - parse_error_if_mismatch TAG_RUBY - discard parser.insertHTMLElement(token) - ) - ("<rp>", "<rt>") => (block: - if parser.hasElementInScope(TAG_RUBY): - parser.generateImpliedEndTags(TAG_RTC) - parse_error_if_mismatch {TAG_RUBY, TAG_RTC} - discard parser.insertHTMLElement(token) - ) - #NOTE <math> (not implemented) - #TODO <svg> (SVG) - ("<caption>", "<col>", "<colgroup>", "<frame>", "<head>", "<tbody>", - "<td>", "<tfoot>", "<th>", "<thead>", "<tr>") => (block: - parse_error UNEXPECTED_START_TAG - ) - TokenType.START_TAG => (block: any_other_start_tag) - TokenType.END_TAG => (block: any_other_end_tag) - - of TEXT: - match token: - TokenType.CHARACTER_ASCII => (block: - assert token.c != '\0' - parser.insertCharacter(token.c) - ) - TokenType.CHARACTER => (block: - parser.insertCharacter(token.r) - ) - TokenType.EOF => (block: - parse_error UNEXPECTED_EOF - if parser.getTagType(parser.currentNode) == TAG_SCRIPT: - parser.setScriptAlreadyStarted(parser.currentNode) - pop_current_node - 
parser.insertionMode = parser.oldInsertionMode - reprocess token - ) - "</script>" => (block: - #TODO microtask (?) - pop_current_node - parser.insertionMode = parser.oldInsertionMode - ) - TokenType.END_TAG => (block: - pop_current_node - parser.insertionMode = parser.oldInsertionMode - ) - - of IN_TABLE: - template clear_the_stack_back_to_a_table_context() = - while parser.getTagType(parser.currentNode) notin {TAG_TABLE, TAG_TEMPLATE, TAG_HTML}: - pop_current_node - - match token: - (TokenType.CHARACTER_ASCII, TokenType.CHARACTER) => (block: - const CanHaveText = { - TAG_TABLE, TAG_TBODY, TAG_TFOOT, TAG_THEAD, TAG_TR - } - if parser.getTagType(parser.currentNode) in CanHaveText: - parser.pendingTableChars = "" - parser.pendingTableCharsWhitespace = true - parser.oldInsertionMode = parser.insertionMode - parser.insertionMode = IN_TABLE_TEXT - reprocess token - else: # anything else - parse_error INVALID_TEXT_PARENT - parser.fosterParenting = true - parser.processInHTMLContent(token, IN_BODY) - parser.fosterParenting = false - ) - TokenType.COMMENT => (block: parser.insertComment(token)) - TokenType.DOCTYPE => (block: parse_error UNEXPECTED_DOCTYPE) - "<caption>" => (block: - clear_the_stack_back_to_a_table_context - parser.activeFormatting.add((none(Handle), nil)) - discard parser.insertHTMLElement(token) - parser.insertionMode = IN_CAPTION - ) - "<colgroup>" => (block: - clear_the_stack_back_to_a_table_context - discard parser.insertHTMLElement(Token(t: START_TAG, tagtype: TAG_COLGROUP)) - parser.insertionMode = IN_COLUMN_GROUP - ) - ("<tbody>", "<tfoot>", "<thead>") => (block: - clear_the_stack_back_to_a_table_context - discard parser.insertHTMLElement(token) - parser.insertionMode = IN_TABLE_BODY - ) - ("<td>", "<th>", "<tr>") => (block: - clear_the_stack_back_to_a_table_context - discard parser.insertHTMLElement(Token(t: START_TAG, tagtype: TAG_TBODY)) - parser.insertionMode = IN_TABLE_BODY - reprocess token - ) - "<table>" => (block: - parse_error NESTED_TAGS 
- if not parser.hasElementInScope(TAG_TABLE): - discard - else: - while parser.getTagType(parser.popElement()) != TAG_TABLE: discard - parser.resetInsertionMode() - reprocess token - ) - "</table>" => (block: - if not parser.hasElementInScope(TAG_TABLE): - parse_error ELEMENT_NOT_IN_SCOPE - else: - while parser.getTagType(parser.popElement()) != TAG_TABLE: discard - parser.resetInsertionMode() - ) - ("</body>", "</caption>", "</col>", "</colgroup>", "</html>", "</tbody>", - "</td>", "</tfoot>", "</th>", "</thead>", "</tr>") => (block: - parse_error UNEXPECTED_END_TAG - ) - ("<style>", "<script>", "<template>", "</template>") => (block: - parser.processInHTMLContent(token, IN_HEAD) - ) - "<input>" => (block: - parse_error UNEXPECTED_START_TAG - if not token.attrs.getOrDefault("type").equalsIgnoreCase("hidden"): - # anything else - parser.fosterParenting = true - parser.processInHTMLContent(token, IN_BODY) - parser.fosterParenting = false - else: - discard parser.insertHTMLElement(token) - pop_current_node - ) - "<form>" => (block: - parse_error UNEXPECTED_START_TAG - if parser.form.isSome or parser.hasElement(TAG_TEMPLATE): - discard - else: - parser.form = some(parser.insertHTMLElement(token)) - pop_current_node - ) - TokenType.EOF => (block: - parser.processInHTMLContent(token, IN_BODY) - ) - other => (block: - parse_error UNEXPECTED_START_TAG - parser.fosterParenting = true - parser.processInHTMLContent(token, IN_BODY) - parser.fosterParenting = false - ) - - of IN_TABLE_TEXT: - match token: - '\0' => (block: parse_error UNEXPECTED_NULL) - TokenType.CHARACTER_ASCII => (block: - if token.c notin AsciiWhitespace: - parser.pendingTableCharsWhitespace = false - parser.pendingTableChars &= token.c - ) - TokenType.CHARACTER => (block: - parser.pendingTableChars &= $token.r - parser.pendingTableCharsWhitespace = false - ) - other => (block: - if not parser.pendingTableCharsWhitespace: - # I *think* this is effectively the same thing the specification - # wants... 
- parse_error NON_SPACE_TABLE_TEXT - parser.fosterParenting = true - parser.reconstructActiveFormatting() - parser.insertCharacter(parser.pendingTableChars) - parser.framesetOk = false - parser.fosterParenting = false - else: - parser.insertCharacter(parser.pendingTableChars) - parser.insertionMode = parser.oldInsertionMode - reprocess token - ) - - of IN_CAPTION: - match token: - "</caption>" => (block: - if not parser.hasElementInTableScope(TAG_CAPTION): - parse_error ELEMENT_NOT_IN_SCOPE - else: - parser.generateImpliedEndTags() - parse_error_if_mismatch TAG_CAPTION - parser.popElementsIncl(TAG_CAPTION) - parser.clearActiveFormattingTillMarker() - parser.insertionMode = IN_TABLE - ) - ("<caption>", "<col>", "<colgroup>", "<tbody>", "<td>", "<tfoot>", - "<th>", "<thead>", "<tr>", "</table>") => (block: - if not parser.hasElementInTableScope(TAG_CAPTION): - parse_error ELEMENT_NOT_IN_SCOPE - else: - parser.generateImpliedEndTags() - parse_error_if_mismatch TAG_CAPTION - parser.clearActiveFormattingTillMarker() - parser.insertionMode = IN_TABLE - reprocess token - ) - ("</body>", "</col>", "</colgroup>", "</html>", "</tbody>", "</td>", - "</tfoot>", "</th>", "</thead>", "</tr>") => (block: - parse_error UNEXPECTED_END_TAG - ) - other => (block: parser.processInHTMLContent(token, IN_BODY)) - - of IN_COLUMN_GROUP: - match token: - AsciiWhitespace => (block: parser.insertCharacter(token.c)) - TokenType.COMMENT => (block: parser.insertComment(token)) - TokenType.DOCTYPE => (block: parse_error UNEXPECTED_DOCTYPE) - "<html>" => (block: parser.processInHTMLContent(token, IN_BODY)) - "<col>" => (block: - discard parser.insertHTMLElement(token) - pop_current_node - ) - "</colgroup>" => (block: - if parser.getTagType(parser.currentNode) != TAG_COLGROUP: - parse_error MISMATCHED_TAGS - else: - pop_current_node - parser.insertionMode = IN_TABLE - ) - "</col>" => (block: parse_error UNEXPECTED_END_TAG) - ("<template>", "</template>") => (block: - 
parser.processInHTMLContent(token, IN_HEAD) - ) - TokenType.EOF => (block: parser.processInHTMLContent(token, IN_BODY)) - other => (block: - if parser.getTagType(parser.currentNode) != TAG_COLGROUP: - parse_error MISMATCHED_TAGS - else: - pop_current_node - parser.insertionMode = IN_TABLE - reprocess token - ) - - of IN_TABLE_BODY: - template clear_the_stack_back_to_a_table_body_context() = - while parser.getTagType(parser.currentNode) notin {TAG_TBODY, TAG_TFOOT, TAG_THEAD, TAG_TEMPLATE, TAG_HTML}: - pop_current_node - - match token: - "<tr>" => (block: - clear_the_stack_back_to_a_table_body_context - discard parser.insertHTMLElement(token) - parser.insertionMode = IN_ROW - ) - ("<th>", "<td>") => (block: - parse_error UNEXPECTED_START_TAG - clear_the_stack_back_to_a_table_body_context - discard parser.insertHTMLElement(Token(t: START_TAG, tagtype: TAG_TR)) - parser.insertionMode = IN_ROW - reprocess token - ) - ("</tbody>", "</tfoot>", "</thead>") => (block: - if not parser.hasElementInTableScope(token.tagtype): - parse_error ELEMENT_NOT_IN_SCOPE - else: - clear_the_stack_back_to_a_table_body_context - pop_current_node - parser.insertionMode = IN_TABLE - ) - ("<caption>", "<col>", "<colgroup>", "<tbody>", "<tfoot>", "<thead>", - "</table>") => (block: - if not parser.hasElementInTableScope({TAG_TBODY, TAG_THEAD, TAG_TFOOT}): - parse_error ELEMENT_NOT_IN_SCOPE - else: - clear_the_stack_back_to_a_table_body_context - pop_current_node - parser.insertionMode = IN_TABLE - reprocess token - ) - ("</body>", "</caption>", "</col>", "</colgroup>", "</html>", "</td>", - "</th>", "</tr>") => (block: - parse_error ELEMENT_NOT_IN_SCOPE - ) - other => (block: parser.processInHTMLContent(token, IN_TABLE)) - - of IN_ROW: - template clear_the_stack_back_to_a_table_row_context() = - while parser.getTagType(parser.currentNode) notin {TAG_TR, TAG_TEMPLATE, TAG_HTML}: - pop_current_node - - match token: - ("<th>", "<td>") => (block: - clear_the_stack_back_to_a_table_row_context - 
discard parser.insertHTMLElement(token) - parser.insertionMode = IN_CELL - parser.activeFormatting.add((none(Handle), nil)) - ) - "</tr>" => (block: - if not parser.hasElementInTableScope(TAG_TR): - parse_error ELEMENT_NOT_IN_SCOPE - else: - clear_the_stack_back_to_a_table_row_context - pop_current_node - parser.insertionMode = IN_TABLE_BODY - ) - ("<caption>", "<col>", "<colgroup>", "<tbody>", "<tfoot>", "<thead>", - "<tr>", "</table>") => (block: - if not parser.hasElementInTableScope(TAG_TR): - parse_error ELEMENT_NOT_IN_SCOPE - else: - clear_the_stack_back_to_a_table_row_context - pop_current_node - parser.insertionMode = IN_TABLE_BODY - reprocess token - ) - ("</tbody>", "</tfoot>", "</thead>") => (block: - if not parser.hasElementInTableScope(token.tagtype): - parse_error ELEMENT_NOT_IN_SCOPE - elif not parser.hasElementInTableScope(TAG_TR): - discard - else: - clear_the_stack_back_to_a_table_row_context - pop_current_node - parser.insertionMode = IN_BODY - reprocess token - ) - ("</body>", "</caption>", "</col>", "</colgroup>", "</html>", "</td>", - "</th>") => (block: parse_error UNEXPECTED_END_TAG) - other => (block: parser.processInHTMLContent(token, IN_TABLE)) - - of IN_CELL: - template close_cell() = - parser.generateImpliedEndTags() - parse_error_if_mismatch {TAG_TD, TAG_TH} - parser.popElementsIncl({TAG_TD, TAG_TH}) - parser.clearActiveFormattingTillMarker() - parser.insertionMode = IN_ROW - - match token: - ("</td>", "</th>") => (block: - if not parser.hasElementInTableScope(token.tagtype): - parse_error ELEMENT_NOT_IN_SCOPE - else: - parser.generateImpliedEndTags() - parse_error_if_mismatch token.tagtype - parser.popElementsIncl(token.tagtype) - parser.clearActiveFormattingTillMarker() - parser.insertionMode = IN_ROW - ) - ("<caption>", "<col>", "<colgroup>", "<tbody>", "<td>", "<tfoot>", - "<th>", "<thead>", "<tr>") => (block: - if not parser.hasElementInTableScope({TAG_TD, TAG_TH}): - parse_error ELEMENT_NOT_IN_SCOPE - else: - close_cell - 
reprocess token - ) - ("</body>", "</caption>", "</col>", "</colgroup>", "</html>") => (block: - parse_error UNEXPECTED_END_TAG - ) - ("</table>", "</tbody>", "</tfoot>", "</thead>", "</tr>") => (block: - if not parser.hasElementInTableScope(token.tagtype): - parse_error ELEMENT_NOT_IN_SCOPE - else: - close_cell - reprocess token - ) - other => (block: parser.processInHTMLContent(token, IN_BODY)) - - of IN_SELECT: - match token: - '\0' => (block: parse_error UNEXPECTED_NULL) - TokenType.CHARACTER_ASCII => (block: parser.insertCharacter(token.c)) - TokenType.CHARACTER => (block: parser.insertCharacter(token.r)) - TokenType.COMMENT => (block: parser.insertComment(token)) - TokenType.DOCTYPE => (block: parse_error UNEXPECTED_DOCTYPE) - "<html>" => (block: parser.processInHTMLContent(token, IN_BODY)) - "<option>" => (block: - if parser.getTagType(parser.currentNode) == TAG_OPTION: - pop_current_node - discard parser.insertHTMLElement(token) - ) - "<optgroup>" => (block: - if parser.getTagType(parser.currentNode) == TAG_OPTION: - pop_current_node - if parser.getTagType(parser.currentNode) == TAG_OPTGROUP: - pop_current_node - discard parser.insertHTMLElement(token) - ) - "</optgroup>" => (block: - if parser.getTagType(parser.currentNode) == TAG_OPTION: - if parser.openElements.len > 1 and parser.getTagType(parser.openElements[^2]) == TAG_OPTGROUP: - pop_current_node - if parser.getTagType(parser.currentNode) == TAG_OPTGROUP: - pop_current_node - else: - parse_error MISMATCHED_TAGS - ) - "</option>" => (block: - if parser.getTagType(parser.currentNode) == TAG_OPTION: - pop_current_node - else: - parse_error MISMATCHED_TAGS - ) - "</select>" => (block: - if not parser.hasElementInSelectScope(TAG_SELECT): - parse_error ELEMENT_NOT_IN_SCOPE - else: - while parser.getTagType(parser.popElement()) != TAG_SELECT: discard - parser.resetInsertionMode() - ) - "<select>" => (block: - parse_error NESTED_TAGS - if parser.hasElementInSelectScope(TAG_SELECT): - while 
parser.getTagType(parser.popElement()) != TAG_SELECT: discard - parser.resetInsertionMode() - ) - ("<input>", "<keygen>", "<textarea>") => (block: - parse_error UNEXPECTED_START_TAG - if not parser.hasElementInSelectScope(TAG_SELECT): - discard - else: - while parser.getTagType(parser.popElement()) != TAG_SELECT: discard - parser.resetInsertionMode() - reprocess token - ) - ("<script>", "<template>", "</template>") => (block: parser.processInHTMLContent(token, IN_HEAD)) - TokenType.EOF => (block: parser.processInHTMLContent(token, IN_BODY)) - TokenType.START_TAG => (block: parse_error UNEXPECTED_START_TAG) - TokenType.END_TAG => (block: parse_error UNEXPECTED_END_TAG) - - of IN_SELECT_IN_TABLE: - match token: - ("<caption>", "<table>", "<tbody>", "<tfoot>", "<thead>", "<tr>", "<td>", - "<th>") => (block: - parse_error UNEXPECTED_START_TAG - while parser.getTagType(parser.popElement()) != TAG_SELECT: discard - parser.resetInsertionMode() - reprocess token - ) - ("</caption>", "</table>", "</tbody>", "</tfoot>", "</thead>", "</tr>", - "</td>", "</th>") => (block: - parse_error UNEXPECTED_END_TAG - if not parser.hasElementInTableScope(token.tagtype): - discard - else: - parser.popElementsIncl(TAG_SELECT) - parser.resetInsertionMode() - reprocess token - ) - other => (block: parser.processInHTMLContent(token, IN_SELECT)) - - of IN_TEMPLATE: - match token: - (TokenType.CHARACTER_ASCII, TokenType.CHARACTER, TokenType.DOCTYPE) => (block: - parser.processInHTMLContent(token, IN_BODY) - ) - ("<base>", "<basefont>", "<bgsound>", "<link>", "<meta>", "<noframes>", - "<script>", "<style>", "<template>", "<title>", "</template>") => (block: - parser.processInHTMLContent(token, IN_HEAD) - ) - ("<caption>", "<colgroup>", "<tbody>", "<tfoot>", "<thead>") => (block: - discard parser.templateModes.pop() - parser.templateModes.add(IN_TABLE) - parser.insertionMode = IN_TABLE - reprocess token - ) - "<col>" => (block: - discard parser.templateModes.pop() - 
parser.templateModes.add(IN_COLUMN_GROUP) - parser.insertionMode = IN_COLUMN_GROUP - reprocess token - ) - "<tr>" => (block: - discard parser.templateModes.pop() - parser.templateModes.add(IN_TABLE_BODY) - parser.insertionMode = IN_TABLE_BODY - reprocess token - ) - ("<td>", "<th>") => (block: - discard parser.templateModes.pop() - parser.templateModes.add(IN_ROW) - parser.insertionMode = IN_ROW - reprocess token - ) - TokenType.START_TAG => (block: - discard parser.templateModes.pop() - parser.templateModes.add(IN_BODY) - parser.insertionMode = IN_BODY - reprocess token - ) - TokenType.END_TAG => (block: parse_error UNEXPECTED_END_TAG) - TokenType.EOF => (block: - if not parser.hasElement(TAG_TEMPLATE): - discard # stop - else: - parse_error UNEXPECTED_EOF - parser.popElementsIncl(TAG_TEMPLATE) - parser.clearActiveFormattingTillMarker() - discard parser.templateModes.pop() - parser.resetInsertionMode() - reprocess token - ) - - of AFTER_BODY: - match token: - AsciiWhitespace => (block: parser.processInHTMLContent(token, IN_BODY)) - TokenType.COMMENT => (block: parser.insertComment(token, last_child_of(parser.openElements[0]))) - TokenType.DOCTYPE => (block: parse_error UNEXPECTED_DOCTYPE) - "<html>" => (block: parser.processInHTMLContent(token, IN_BODY)) - "</html>" => (block: - if parser.fragment: - parse_error UNEXPECTED_END_TAG - else: - parser.insertionMode = AFTER_AFTER_BODY - ) - TokenType.EOF => (block: discard) # stop - other => (block: - parse_error UNEXPECTED_AFTER_BODY_TOKEN - parser.insertionMode = IN_BODY - reprocess token - ) - - of IN_FRAMESET: - match token: - AsciiWhitespace => (block: parser.insertCharacter(token.c)) - TokenType.COMMENT => (block: parser.insertComment(token)) - TokenType.DOCTYPE => (block: parse_error UNEXPECTED_DOCTYPE) - "<html>" => (block: parser.processInHTMLContent(token, IN_BODY)) - "<frameset>" => (block: - if parser.getTagType(parser.currentNode) == TAG_HTML: - parse_error UNEXPECTED_START_TAG - else: - pop_current_node - 
if not parser.fragment and - parser.getTagType(parser.currentNode) != TAG_FRAMESET: - parser.insertionMode = AFTER_FRAMESET - ) - "<frame>" => (block: - discard parser.insertHTMLElement(token) - pop_current_node - ) - "<noframes>" => (block: parser.processInHTMLContent(token, IN_HEAD)) - TokenType.EOF => (block: - if parser.getTagType(parser.currentNode) != TAG_HTML: - parse_error UNEXPECTED_EOF - # stop - ) - other => (block: parser.parseErrorByTokenType(token.t)) - - of AFTER_FRAMESET: - match token: - AsciiWhitespace => (block: parser.insertCharacter(token.c)) - TokenType.COMMENT => (block: parser.insertComment(token)) - TokenType.DOCTYPE => (block: parse_error UNEXPECTED_DOCTYPE) - "<html>" => (block: parser.processInHTMLContent(token, IN_BODY)) - "</html>" => (block: parser.insertionMode = AFTER_AFTER_FRAMESET) - "<noframes>" => (block: parser.processInHTMLContent(token, IN_HEAD)) - TokenType.EOF => (block: discard) # stop - other => (block: parser.parseErrorByTokenType(token.t)) - - of AFTER_AFTER_BODY: - match token: - TokenType.COMMENT => (block: - parser.insertComment(token, last_child_of(parser.document)) - ) - (TokenType.DOCTYPE, AsciiWhitespace, "<html>") => (block: - parser.processInHTMLContent(token, IN_BODY) - ) - TokenType.EOF => (block: discard) # stop - other => (block: - parser.parseErrorByTokenType(token.t) - parser.insertionMode = IN_BODY - reprocess token - ) - - of AFTER_AFTER_FRAMESET: - match token: - TokenType.COMMENT => (block: - parser.insertComment(token, last_child_of(parser.document)) - ) - (TokenType.DOCTYPE, AsciiWhitespace, "<html>") => (block: - parser.processInHTMLContent(token, IN_BODY) - ) - TokenType.EOF => (block: discard) # stop - "<noframes>" => (block: parser.processInHTMLContent(token, IN_HEAD)) - other => (block: parser.parseErrorByTokenType(token.t)) - -const CaseTable = { - "altglyph": "altGlyph", - "altglyphdef": "altGlyphDef", - "altglyphitem": "altGlyphItem", - "animatecolor": "animateColor", - "animatemotion": 
"animateMotion", - "animatetransform": "animateTransform", - "clippath": "clipPath", - "feblend": "feBlend", - "fecolormatrix": "feColorMatrix", - "fecomponenttransfer": "feComponentTransfer", - "fecomposite": "feComposite", - "feconvolvematrix": "feConvolveMatrix", - "fediffuselighting": "feDiffuseLighting", - "fedisplacementmap": "feDisplacementMap", - "fedistantlight": "feDistantLight", - "fedropshadow": "feDropShadow", - "feflood": "feFlood", - "fefunca": "feFuncA", - "fefuncb": "feFuncB", - "fefuncg": "feFuncG", - "fefuncr": "feFuncR", - "fegaussianblur": "feGaussianBlur", - "feimage": "feImage", - "femerge": "feMerge", - "femergenode": "feMergeNode", - "femorphology": "feMorphology", - "feoffset": "feOffset", - "fepointlight": "fePointLight", - "fespecularlighting": "feSpecularLighting", - "fespotlight": "feSpotLight", - "fetile": "feTile", - "feturbulence": "feTurbulence", - "foreignobject": "foreignObject", - "glyphref": "glyphRef", - "lineargradient": "linearGradient", - "radialgradient": "radialGradient", - "textpath": "textPath", -}.toTable() - -proc processInForeignContent(parser: var HTML5Parser, token: Token) = - macro `=>`(v: typed, body: untyped): untyped = - quote do: - discard (`v`, proc() = `body`) - - template script_end_tag() = - pop_current_node - #TODO document.write (?) 
- #TODO SVG - - template parse_error(e: ParseError) = - parser.parseError(e) - - template any_other_end_tag() = - if parser.getLocalName(parser.currentNode) != token.tagname: - parse_error UNEXPECTED_END_TAG - for i in countdown(parser.openElements.high, 1): - let node = parser.openElements[i] - if parser.getLocalName(parser.currentNode) == token.tagname: - while parser.popElement() != node: - discard - break - if parser.getNamespace(node) == Namespace.HTML: - break - parser.processInHTMLContent(token, parser.insertionMode) - - match token: - '\0' => (block: - parse_error UNEXPECTED_NULL - parser.insertCharacter(Rune(0xFFFD)) - ) - AsciiWhitespace => (block: parser.insertCharacter(token.c)) - TokenType.CHARACTER_ASCII => (block: parser.insertCharacter(token.c)) - TokenType.CHARACTER => (block: parser.insertCharacter(token.r)) - TokenType.DOCTYPE => (block: parse_error UNEXPECTED_DOCTYPE) - ("<b>", "<big>", "<blockquote>", "<body>", "<br>", "<center>", "<code>", - "<dd>", "<div>", "<dl>", "<dt>", "<em>", "<embed>", "<h1>", "<h2>", - "<h3>", "<h4>", "<h5>", "<h6>", "<head>", "<hr>", "<i>", "<img>", "<li>", - "<listing>", "<menu>", "<meta>", "<nobr>", "<ol>", "<p>", "<pre>", - "<ruby>", "<s>", "<small>", "<span>", "<strong>", "<strike>", "<sub>", - "<sup>", "<table>", "<tt>", "<u>", "<ul>", "<var>") => (block: - parse_error UNEXPECTED_START_TAG - #NOTE MathML not implemented - while not parser.isHTMLIntegrationPoint(parser.currentNode) and - parser.getNamespace(parser.currentNode) != Namespace.HTML: - pop_current_node - parser.processInHTMLContent(token, parser.insertionMode) - ) - TokenType.START_TAG => (block: - #NOTE MathML not implemented - let namespace = parser.getNamespace(parser.adjustedCurrentNode) - if namespace == Namespace.SVG: - if token.tagname in CaseTable: - token.tagname = CaseTable[token.tagname] - adjustSVGAttributes(token) - #TODO adjust foreign attributes - discard parser.insertForeignElement(token, namespace) - if token.selfclosing and namespace 
== Namespace.SVG: - script_end_tag - else: - pop_current_node - ) - "</script>" => (block: - let namespace = parser.getNamespace(parser.currentNode) - let localName = parser.getLocalName(parser.currentNode) - if namespace == Namespace.SVG and localName == "script": #TODO SVG - script_end_tag - else: - any_other_end_tag - ) - TokenType.END_TAG => (block: any_other_end_tag) - -proc constructTree[Handle](parser: var HTML5Parser[Handle]) = - for token in parser.tokenizer.tokenize: - if parser.ignoreLF: - parser.ignoreLF = false - if token.t == CHARACTER_ASCII and token.c == '\n': - continue - let isTokenHTML = token.t in {START_TAG, CHARACTER, CHARACTER_ASCII} - if parser.openElements.len == 0 or - parser.getNamespace(parser.adjustedCurrentNode) == Namespace.HTML or - parser.isHTMLIntegrationPoint(parser.adjustedCurrentNode) and - isTokenHTML or - token.t == EOF: - #NOTE MathML not implemented - parser.processInHTMLContent(token, parser.insertionMode) - else: - parser.processInForeignContent(token) - if parser.needsreinterpret: - break - -proc finishParsing(parser: var HTML5Parser) = - while parser.openElements.len > 0: - pop_current_node - if parser.dombuilder.finish != nil: - parser.dombuilder.finish(parser.dombuilder) - -proc bomSniff(inputStream: Stream): Charset = - # bom sniff - const u8bom = char(0xEF) & char(0xBB) & char(0xBF) - const bebom = char(0xFE) & char(0xFF) - const lebom = char(0xFF) & char(0xFE) - var bom = inputStream.readStr(2) - if bom == bebom: - return CHARSET_UTF_16_BE - elif bom == lebom: - return CHARSET_UTF_16_LE - else: - bom &= inputStream.readChar() - if bom == u8bom: - return CHARSET_UTF_8 - else: - inputStream.setPosition(0) - -# Any of these pointers being nil would later result in a crash. 
-proc checkCallbacks(dombuilder: DOMBuilder) = - doAssert dombuilder.getParentNode != nil - doAssert dombuilder.getLocalName != nil - doAssert dombuilder.createElement != nil - doAssert dombuilder.createComment != nil - doAssert dombuilder.createDocumentType != nil - doAssert dombuilder.insertBefore != nil - doAssert dombuilder.insertText != nil - doAssert dombuilder.remove != nil - -proc parseHTML*[Handle](inputStream: Stream, dombuilder: DOMBuilder[Handle], - opts: HTML5ParserOpts[Handle]) = - ## Parse an HTML document, using the DOMBuilder object `dombuilder`, and - ## parser options `opts`. - dombuilder.checkCallbacks() - var charsetStack: seq[Charset] - for i in countdown(opts.charsets.high, 0): - charsetStack.add(opts.charsets[i]) - var canReinterpret = opts.canReinterpret - var confidence: CharsetConfidence - if canReinterpret: - let scs = inputStream.bomSniff() - if scs != CHARSET_UNKNOWN: - charsetStack.add(scs) - confidence = CONFIDENCE_CERTAIN - canReinterpret = false - if charsetStack.len == 0: - charsetStack.add(DefaultCharset) # UTF-8 - while true: - let charset = charsetStack.pop() - var parser = HTML5Parser[Handle]( - dombuilder: dombuilder, - confidence: confidence, - charset: charset, - opts: opts - ) - confidence = CONFIDENCE_TENTATIVE # used in the next iteration - if not canReinterpret: - parser.confidence = CONFIDENCE_CERTAIN - let em = if charsetStack.len == 0 or not canReinterpret: - DECODER_ERROR_MODE_REPLACEMENT - else: - DECODER_ERROR_MODE_FATAL - let decoder = newDecoderStream(inputStream, parser.charset, errormode = em) - proc x(e: ParseError) = - parser.parseError(e) - let onParseError = if parser.hasParseError(): - x - else: - nil - parser.tokenizer = newTokenizer(decoder, onParseError) - parser.constructTree() - if parser.needsreinterpret and canReinterpret: - inputStream.setPosition(0) - charsetStack.add(parser.charset) - canReinterpret = false - continue - if decoder.failed and canReinterpret: - inputStream.setPosition(0) - 
continue - parser.finishParsing() - break diff --git a/src/html/htmltokenizer.nim b/src/html/htmltokenizer.nim deleted file mode 100644 index f487f31f..00000000 --- a/src/html/htmltokenizer.nim +++ /dev/null @@ -1,1564 +0,0 @@ -import options -import strformat -import strutils -import macros -import tables -import unicode - -import html/entity -import html/parseerror -import html/tags -import utils/opt -import utils/radixtree -import utils/twtstr - -import chakasu/decoderstream - -# Tokenizer -type - Tokenizer* = object - state*: TokenizerState - rstate: TokenizerState - tmp: string - code: int - tok: Token - laststart: Token - attrn: string - attrv: string - attr: bool - hasnonhtml*: bool - onParseError: proc(e: ParseError) - - decoder: DecoderStream - sbuf: seq[Rune] - sbuf_i: int - eof_i: int - - TokenType* = enum - DOCTYPE, START_TAG, END_TAG, COMMENT, CHARACTER, CHARACTER_ASCII, EOF - - TokenizerState* = enum - DATA, CHARACTER_REFERENCE, TAG_OPEN, RCDATA, RCDATA_LESS_THAN_SIGN, - RAWTEXT, RAWTEXT_LESS_THAN_SIGN, SCRIPT_DATA, SCRIPT_DATA_LESS_THAN_SIGN, - PLAINTEXT, MARKUP_DECLARATION_OPEN, END_TAG_OPEN, BOGUS_COMMENT, TAG_NAME, - BEFORE_ATTRIBUTE_NAME, RCDATA_END_TAG_OPEN, RCDATA_END_TAG_NAME, - RAWTEXT_END_TAG_OPEN, RAWTEXT_END_TAG_NAME, SELF_CLOSING_START_TAG, - SCRIPT_DATA_END_TAG_OPEN, SCRIPT_DATA_ESCAPE_START, - SCRIPT_DATA_END_TAG_NAME, SCRIPT_DATA_ESCAPE_START_DASH, - SCRIPT_DATA_ESCAPED_DASH_DASH, SCRIPT_DATA_ESCAPED, - SCRIPT_DATA_ESCAPED_DASH, SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN, - SCRIPT_DATA_ESCAPED_END_TAG_OPEN, SCRIPT_DATA_DOUBLE_ESCAPE_START, - SCRIPT_DATA_ESCAPED_END_TAG_NAME, SCRIPT_DATA_DOUBLE_ESCAPED, - SCRIPT_DATA_DOUBLE_ESCAPED_DASH, SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN, - SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH, SCRIPT_DATA_DOUBLE_ESCAPE_END, - AFTER_ATTRIBUTE_NAME, ATTRIBUTE_NAME, BEFORE_ATTRIBUTE_VALUE, - ATTRIBUTE_VALUE_DOUBLE_QUOTED, ATTRIBUTE_VALUE_SINGLE_QUOTED, - ATTRIBUTE_VALUE_UNQUOTED, AFTER_ATTRIBUTE_VALUE_QUOTED, 
COMMENT_START, - CDATA_SECTION, COMMENT_START_DASH, COMMENT, COMMENT_END, - COMMENT_LESS_THAN_SIGN, COMMENT_END_DASH, COMMENT_LESS_THAN_SIGN_BANG, - COMMENT_LESS_THAN_SIGN_BANG_DASH, COMMENT_LESS_THAN_SIGN_BANG_DASH_DASH, - COMMENT_END_BANG, DOCTYPE, BEFORE_DOCTYPE_NAME, DOCTYPE_NAME, - AFTER_DOCTYPE_NAME, AFTER_DOCTYPE_PUBLIC_KEYWORD, - AFTER_DOCTYPE_SYSTEM_KEYWORD, BOGUS_DOCTYPE, - BEFORE_DOCTYPE_PUBLIC_IDENTIFIER, DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED, - DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED, AFTER_DOCTYPE_PUBLIC_IDENTIFIER, - BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS, - DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED, - DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED, BEFORE_DOCTYPE_SYSTEM_IDENTIFIER, - AFTER_DOCTYPE_SYSTEM_IDENTIFIER, CDATA_SECTION_BRACKET, CDATA_SECTION_END, - NAMED_CHARACTER_REFERENCE, NUMERIC_CHARACTER_REFERENCE, - AMBIGUOUS_AMPERSAND_STATE, HEXADECIMAL_CHARACTER_REFERENCE_START, - DECIMAL_CHARACTER_REFERENCE_START, HEXADECIMAL_CHARACTER_REFERENCE, - DECIMAL_CHARACTER_REFERENCE, NUMERIC_CHARACTER_REFERENCE_END - - Token* = ref object - case t*: TokenType - of DOCTYPE: - name*: Option[string] - pubid*: Option[string] - sysid*: Option[string] - quirks*: bool - of START_TAG, END_TAG: - tagname*: string - tagtype*: TagType - selfclosing*: bool - attrs*: Table[string, string] - of CHARACTER: - r*: Rune - of CHARACTER_ASCII: - c*: char - of COMMENT: - data*: string - of EOF: discard - -func `$`*(tok: Token): string = - case tok.t - of DOCTYPE: fmt"{tok.t} {tok.name} {tok.pubid} {tok.sysid} {tok.quirks}" - of START_TAG, END_TAG: fmt"{tok.t} {tok.tagname} {tok.selfclosing} {tok.attrs}" - of CHARACTER: fmt"{tok.t} {tok.r}" - of CHARACTER_ASCII: fmt"{tok.t} {tok.c}" - of COMMENT: fmt"{tok.t} {tok.data}" - of EOF: fmt"{tok.t}" - -const bufLen = 1024 # * 4096 bytes -const copyBufLen = 16 # * 64 bytes - -proc readn(t: var Tokenizer) = - let l = t.sbuf.len - t.sbuf.setLen(bufLen) - let n = t.decoder.readData(addr t.sbuf[l], (bufLen - l) * sizeof(Rune)) - 
t.sbuf.setLen(l + n div sizeof(Rune)) - if t.decoder.atEnd: - t.eof_i = t.sbuf.len - -proc newTokenizer*(s: DecoderStream, onParseError: proc(e: ParseError)): Tokenizer = - var t = Tokenizer( - decoder: s, - sbuf: newSeqOfCap[Rune](bufLen), - eof_i: -1, - sbuf_i: 0, - onParseError: onParseError - ) - t.readn() - return t - -proc newTokenizer*(s: string): Tokenizer = - let rs = s.toRunes() - var t = Tokenizer( - sbuf: rs, - eof_i: rs.len, - sbuf_i: 0 - ) - return t - -func atEof(t: Tokenizer): bool = - t.eof_i != -1 and t.sbuf_i >= t.eof_i - -proc checkBufLen(t: var Tokenizer) = - if t.sbuf_i >= min(bufLen - copyBufLen, t.sbuf.len): - for i in t.sbuf_i ..< t.sbuf.len: - t.sbuf[i - t.sbuf_i] = t.sbuf[i] - t.sbuf.setLen(t.sbuf.len - t.sbuf_i) - t.sbuf_i = 0 - if t.sbuf.len < bufLen: - t.readn() - -proc consume(t: var Tokenizer): Rune = - t.checkBufLen() - ## Normalize newlines (\r\n -> \n, single \r -> \n) - if t.sbuf[t.sbuf_i] == Rune('\r'): - inc t.sbuf_i - t.checkBufLen() - if t.atEof or t.sbuf[t.sbuf_i] != Rune('\n'): - # \r - result = Rune('\n') - return - # else, \r\n so just return the \n - result = t.sbuf[t.sbuf_i] - inc t.sbuf_i - -proc reconsume(t: var Tokenizer) = - dec t.sbuf_i - -iterator tokenize*(tokenizer: var Tokenizer): Token = - var tokqueue: seq[Token] - var running = true - - template emit(tok: Token) = - if tok.t == START_TAG: - tokenizer.laststart = tok - if tok.t in {START_TAG, END_TAG}: - tok.tagtype = tagType(tok.tagname) - tokqueue.add(tok) - template emit(tok: TokenType) = emit Token(t: tok) - template emit(rn: Rune) = emit Token(t: CHARACTER, r: rn) - template emit(ch: char) = emit Token(t: CHARACTER_ASCII, c: ch) - template emit_eof = - emit EOF - running = false - template emit_tok = - if tokenizer.attr: - tokenizer.tok.attrs[tokenizer.attrn] = tokenizer.attrv - emit tokenizer.tok - template emit_current = - if is_eof: - emit_eof - elif c in Ascii: - emit c - else: - emit r - template emit_replacement = emit Rune(0xFFFD) - template 
switch_state(s: TokenizerState) = - tokenizer.state = s - template switch_state_return(s: TokenizerState) = - tokenizer.rstate = tokenizer.state - tokenizer.state = s - template reconsume_in(s: TokenizerState) = - tokenizer.reconsume() - switch_state s - template parse_error(error: untyped) = - if tokenizer.onParseError != nil: - tokenizer.onParseError(error) - template is_appropriate_end_tag_token(): bool = - tokenizer.laststart != nil and tokenizer.laststart.tagname == tokenizer.tok.tagname - template start_new_attribute = - if tokenizer.attr: - tokenizer.tok.attrs[tokenizer.attrn] = tokenizer.attrv - tokenizer.attrn = "" - tokenizer.attrv = "" - tokenizer.attr = true - template leave_attribute_name_state = - if tokenizer.attrn in tokenizer.tok.attrs: - tokenizer.attr = false - template append_to_current_attr_value(c: typed) = - if tokenizer.attr: - tokenizer.attrv &= c - template peek_str(s: string): bool = - # WARNING: will break on strings with copyBufLen + 4 bytes - # WARNING: only works with ascii - assert s.len < copyBufLen - 4 and s.len > 0 - if tokenizer.eof_i != -1 and tokenizer.sbuf_i + s.len >= tokenizer.eof_i: - false - else: - var b = true - for i in 0 ..< s.len: - let c = tokenizer.sbuf[tokenizer.sbuf_i + i] - if not c.isAscii() or cast[char](c) != s[i]: - b = false - break - b - - template peek_str_nocase(s: string): bool = - # WARNING: will break on strings with copyBufLen + 4 bytes - # WARNING: only works with UPPER CASE ascii - assert s.len < copyBufLen - 4 and s.len > 0 - if tokenizer.eof_i != -1 and tokenizer.sbuf_i + s.len >= tokenizer.eof_i: - false - else: - var b = true - for i in 0 ..< s.len: - let c = tokenizer.sbuf[tokenizer.sbuf_i + i] - if not c.isAscii() or cast[char](c).toUpperAscii() != s[i]: - b = false - break - b - template peek_char(): char = - let r = tokenizer.sbuf[tokenizer.sbuf_i] - if r.isAscii(): - cast[char](r) - else: - char(128) - template consume_and_discard(n: int) = #TODO optimize - var i = 0 - while i < n: - 
discard tokenizer.consume() - inc i - template consumed_as_an_attribute(): bool = - tokenizer.rstate in {ATTRIBUTE_VALUE_DOUBLE_QUOTED, ATTRIBUTE_VALUE_SINGLE_QUOTED, ATTRIBUTE_VALUE_UNQUOTED} - template emit_tmp() = - var i = 0 - while i < tokenizer.tmp.len: - if tokenizer.tmp[i] in Ascii: - emit tokenizer.tmp[i] - inc i - else: - var r: Rune - fastRuneAt(tokenizer.tmp, i, r) - emit r - template flush_code_points_consumed_as_a_character_reference() = - if consumed_as_an_attribute: - append_to_current_attr_value tokenizer.tmp - else: - emit_tmp - template new_token(t: Token) = - if tokenizer.attr: - tokenizer.attr = false - tokenizer.tok = t - - # Fake EOF as an actual character. Also replace anything_else with the else - # branch. - macro stateMachine(states: varargs[untyped]): untyped = - var maincase = newNimNode(nnkCaseStmt).add(quote do: tokenizer.state) - for state in states: - if state.kind == nnkOfBranch: - var mainstmtlist: NimNode - var mainstmtlist_i = -1 - for i in 0 ..< state.len: - if state[i].kind == nnkStmtList: - mainstmtlist = state[i] - mainstmtlist_i = i - break - if mainstmtlist[0].kind == nnkIdent and mainstmtlist[0].strVal == "ignore_eof": - maincase.add(state) - continue - - var hasanythingelse = false - if mainstmtlist[0].kind == nnkIdent and mainstmtlist[0].strVal == "has_anything_else": - hasanythingelse = true - - let childcase = findChild(mainstmtlist, it.kind == nnkCaseStmt) - var haseof = false - var eofstmts: NimNode - var elsestmts: NimNode - - for i in countdown(childcase.len-1, 0): - let childof = childcase[i] - if childof.kind == nnkOfBranch: - for j in countdown(childof.len-1, 0): - if childof[j].kind == nnkIdent and childof[j].strVal == "eof": - haseof = true - eofstmts = childof.findChild(it.kind == nnkStmtList) - if childof.findChild(it.kind == nnkIdent and it.strVal != "eof") != nil: - childof.del(j) - else: - childcase.del(i) - elif childof.kind == nnkElse: - elsestmts = childof.findChild(it.kind == nnkStmtList) - - if not 
haseof: - eofstmts = elsestmts - if hasanythingelse: - let fake_anything_else = quote do: - template anything_else = - `elsestmts` - mainstmtlist.insert(0, fake_anything_else) - let eofstmtlist = quote do: - if is_eof: - `eofstmts` - else: - `mainstmtlist` - state[mainstmtlist_i] = eofstmtlist - maincase.add(state) - result = newNimNode(nnkStmtList) - result.add(maincase) - - template ignore_eof = discard # does nothing - template has_anything_else = discard # does nothing - - const null = char(0) - - while running: - #eprint tokenizer.state #debug - let is_eof = tokenizer.atEof # set eof here, otherwise we would exit at the last character - let r = if not is_eof: - tokenizer.consume() - else: - # avoid consuming eof... - Rune(null) - let c = if r.isAscii(): cast[char](r) else: char(128) - stateMachine: # => case tokenizer.state - of DATA: - case c - of '&': switch_state_return CHARACTER_REFERENCE - of '<': switch_state TAG_OPEN - of null: - parse_error UNEXPECTED_NULL_CHARACTER - emit_current - of eof: emit_eof - else: emit_current - - of RCDATA: - case c - of '&': switch_state_return CHARACTER_REFERENCE - of '<': switch_state RCDATA_LESS_THAN_SIGN - of null: parse_error UNEXPECTED_NULL_CHARACTER - of eof: emit_eof - else: emit_current - - of RAWTEXT: - case c - of '<': switch_state RAWTEXT_LESS_THAN_SIGN - of null: - parse_error UNEXPECTED_NULL_CHARACTER - emit_replacement - of eof: emit_eof - else: emit_current - - of SCRIPT_DATA: - case c - of '<': switch_state SCRIPT_DATA_LESS_THAN_SIGN - of null: - parse_error UNEXPECTED_NULL_CHARACTER - emit_replacement - of eof: emit_eof - else: emit_current - - of PLAINTEXT: - case c - of null: - parse_error UNEXPECTED_NULL_CHARACTER - emit_replacement - of eof: emit_eof - else: emit_current - - of TAG_OPEN: - case c - of '!': switch_state MARKUP_DECLARATION_OPEN - of '/': switch_state END_TAG_OPEN - of AsciiAlpha: - new_token Token(t: START_TAG) - reconsume_in TAG_NAME - of '?': - parse_error 
UNEXPECTED_QUESTION_MARK_INSTEAD_OF_TAG_NAME - new_token Token(t: COMMENT) - reconsume_in BOGUS_COMMENT - of eof: - parse_error EOF_BEFORE_TAG_NAME - emit '<' - emit_eof - else: - parse_error INVALID_FIRST_CHARACTER_OF_TAG_NAME - emit '<' - reconsume_in DATA - - of END_TAG_OPEN: - case c - of AsciiAlpha: - new_token Token(t: END_TAG) - reconsume_in TAG_NAME - of '>': - parse_error MISSING_END_TAG_NAME - switch_state DATA - of eof: - parse_error EOF_BEFORE_TAG_NAME - emit '<' - emit '/' - emit_eof - else: - parse_error INVALID_FIRST_CHARACTER_OF_TAG_NAME - new_token Token(t: COMMENT) - reconsume_in BOGUS_COMMENT - - of TAG_NAME: - case c - of AsciiWhitespace: switch_state BEFORE_ATTRIBUTE_NAME - of '/': switch_state SELF_CLOSING_START_TAG - of '>': - switch_state DATA - emit_tok - of AsciiUpperAlpha: tokenizer.tok.tagname &= c.tolower() - of null: - parse_error UNEXPECTED_NULL_CHARACTER - tokenizer.tok.tagname &= Rune(0xFFFD) - of eof: - parse_error EOF_IN_TAG - emit_eof - else: tokenizer.tok.tagname &= r - - of RCDATA_LESS_THAN_SIGN: - case c - of '/': - tokenizer.tmp = "" - switch_state RCDATA_END_TAG_OPEN - else: - emit '<' - reconsume_in RCDATA - - of RCDATA_END_TAG_OPEN: - case c - of AsciiAlpha: - new_token Token(t: END_TAG) - reconsume_in RCDATA_END_TAG_NAME - else: - emit '<' - emit '/' - reconsume_in RCDATA - - of RCDATA_END_TAG_NAME: - has_anything_else - case c - of AsciiWhitespace: - if is_appropriate_end_tag_token: - switch_state BEFORE_ATTRIBUTE_NAME - else: - anything_else - of '/': - if is_appropriate_end_tag_token: - switch_state SELF_CLOSING_START_TAG - else: - anything_else - of '>': - if is_appropriate_end_tag_token: - switch_state DATA - emit_tok - else: - anything_else - of AsciiAlpha: # note: merged upper & lower - tokenizer.tok.tagname &= c.tolower() - tokenizer.tmp &= c - else: - new_token nil #TODO - emit '<' - emit '/' - emit_tmp - reconsume_in RCDATA - - of RAWTEXT_LESS_THAN_SIGN: - case c - of '/': - tokenizer.tmp = "" - switch_state 
RAWTEXT_END_TAG_OPEN - else: - emit '<' - reconsume_in RAWTEXT - - of RAWTEXT_END_TAG_OPEN: - case c - of AsciiAlpha: - new_token Token(t: END_TAG) - reconsume_in RAWTEXT_END_TAG_NAME - else: - emit '<' - emit '/' - reconsume_in RAWTEXT - - of RAWTEXT_END_TAG_NAME: - has_anything_else - case c - of AsciiWhitespace: - if is_appropriate_end_tag_token: - switch_state BEFORE_ATTRIBUTE_NAME - else: - anything_else - of '/': - if is_appropriate_end_tag_token: - switch_state SELF_CLOSING_START_TAG - else: - anything_else - of '>': - if is_appropriate_end_tag_token: - switch_state DATA - emit_tok - else: - anything_else - of AsciiAlpha: # note: merged upper & lower - tokenizer.tok.tagname &= c.tolower() - tokenizer.tmp &= c - else: - new_token nil #TODO - emit '<' - emit '/' - for r in tokenizer.tmp.runes: - emit r - reconsume_in RAWTEXT - - of SCRIPT_DATA_LESS_THAN_SIGN: - case c - of '/': - tokenizer.tmp = "" - switch_state SCRIPT_DATA_END_TAG_OPEN - of '!': - switch_state SCRIPT_DATA_ESCAPE_START - emit '<' - emit '!' 
- else: - emit '<' - reconsume_in SCRIPT_DATA - - of SCRIPT_DATA_END_TAG_OPEN: - case c - of AsciiAlpha: - new_token Token(t: END_TAG) - reconsume_in SCRIPT_DATA_END_TAG_NAME - else: - emit '<' - emit '/' - reconsume_in SCRIPT_DATA - - of SCRIPT_DATA_END_TAG_NAME: - has_anything_else - case c - of AsciiWhitespace: - if is_appropriate_end_tag_token: - switch_state BEFORE_ATTRIBUTE_NAME - else: - anything_else - of '/': - if is_appropriate_end_tag_token: - switch_state SELF_CLOSING_START_TAG - else: - anything_else - of '>': - if is_appropriate_end_tag_token: - switch_state DATA - emit_tok - else: - anything_else - of AsciiAlpha: # note: merged upper & lower - tokenizer.tok.tagname &= c.tolower() - tokenizer.tmp &= c - else: - emit '<' - emit '/' - emit_tmp - reconsume_in SCRIPT_DATA - - of SCRIPT_DATA_ESCAPE_START: - case c - of '-': - switch_state SCRIPT_DATA_ESCAPE_START_DASH - emit '-' - else: - reconsume_in SCRIPT_DATA - - of SCRIPT_DATA_ESCAPE_START_DASH: - case c - of '-': - switch_state SCRIPT_DATA_ESCAPED_DASH_DASH - emit '-' - else: - reconsume_in SCRIPT_DATA - - of SCRIPT_DATA_ESCAPED: - case c - of '-': - switch_state SCRIPT_DATA_ESCAPED_DASH - emit '-' - of '<': - switch_state SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN - of null: - parse_error UNEXPECTED_NULL_CHARACTER - emit_replacement - of eof: - parse_error EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT - emit_eof - else: - emit_current - - of SCRIPT_DATA_ESCAPED_DASH: - case c - of '-': - switch_state SCRIPT_DATA_ESCAPED_DASH_DASH - emit '-' - of '<': - switch_state SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN - of null: - parse_error UNEXPECTED_NULL_CHARACTER - switch_state SCRIPT_DATA_ESCAPED - of eof: - parse_error EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT - emit_eof - else: - switch_state SCRIPT_DATA_ESCAPED - emit_current - - of SCRIPT_DATA_ESCAPED_DASH_DASH: - case c - of '-': - emit '-' - of '<': - switch_state SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN - of '>': - switch_state SCRIPT_DATA - emit '>' - of null: - parse_error 
UNEXPECTED_NULL_CHARACTER - switch_state SCRIPT_DATA_ESCAPED - of eof: - parse_error EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT - emit_eof - else: - switch_state SCRIPT_DATA_ESCAPED - emit_current - - of SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN: - case c - of '/': - tokenizer.tmp = "" - switch_state SCRIPT_DATA_ESCAPED_END_TAG_OPEN - of AsciiAlpha: - tokenizer.tmp = "" - emit '<' - reconsume_in SCRIPT_DATA_DOUBLE_ESCAPE_START - else: - emit '<' - reconsume_in SCRIPT_DATA_ESCAPED - - of SCRIPT_DATA_ESCAPED_END_TAG_OPEN: - case c - of AsciiAlpha: - new_token Token(t: START_TAG) - reconsume_in SCRIPT_DATA_ESCAPED_END_TAG_NAME - else: - emit '<' - emit '/' - reconsume_in SCRIPT_DATA_ESCAPED - - of SCRIPT_DATA_ESCAPED_END_TAG_NAME: - has_anything_else - case c - of AsciiWhitespace: - if is_appropriate_end_tag_token: - switch_state BEFORE_ATTRIBUTE_NAME - else: - anything_else - of '/': - if is_appropriate_end_tag_token: - switch_state SELF_CLOSING_START_TAG - else: - anything_else - of '>': - if is_appropriate_end_tag_token: - switch_state DATA - else: - anything_else - of AsciiAlpha: - tokenizer.tok.tagname &= c.tolower() - tokenizer.tmp &= c - else: - emit '<' - emit '/' - emit_tmp - reconsume_in SCRIPT_DATA_ESCAPED - - of SCRIPT_DATA_DOUBLE_ESCAPE_START: - case c - of AsciiWhitespace, '/', '>': - if tokenizer.tmp == "script": - switch_state SCRIPT_DATA_DOUBLE_ESCAPED - else: - switch_state SCRIPT_DATA_ESCAPED - emit_current - of AsciiAlpha: # note: merged upper & lower - tokenizer.tmp &= c.tolower() - emit_current - else: reconsume_in SCRIPT_DATA_ESCAPED - - of SCRIPT_DATA_DOUBLE_ESCAPED: - case c - of '-': - switch_state SCRIPT_DATA_DOUBLE_ESCAPED_DASH - emit '-' - of '<': - switch_state SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN - emit '<' - of null: - parse_error UNEXPECTED_NULL_CHARACTER - emit_replacement - of eof: - parse_error EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT - emit_eof - else: emit_current - - of SCRIPT_DATA_DOUBLE_ESCAPED_DASH: - case c - of '-': - switch_state 
SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH - emit '-' - of '<': - switch_state SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN - emit '<' - of null: - parse_error UNEXPECTED_NULL_CHARACTER - switch_state SCRIPT_DATA_DOUBLE_ESCAPED - emit_replacement - of eof: - parse_error EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT - emit_eof - else: - switch_state SCRIPT_DATA_DOUBLE_ESCAPED - emit_current - - of SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH: - case c - of '-': emit '-' - of '<': - switch_state SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN - emit '<' - of '>': - switch_state SCRIPT_DATA - emit '>' - of null: - parse_error UNEXPECTED_NULL_CHARACTER - switch_state SCRIPT_DATA_DOUBLE_ESCAPED - emit_replacement - of eof: - parse_error EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT - emit_eof - else: switch_state SCRIPT_DATA_DOUBLE_ESCAPED - - of SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN: - case c - of '/': - tokenizer.tmp = "" - switch_state SCRIPT_DATA_DOUBLE_ESCAPE_END - emit '/' - else: reconsume_in SCRIPT_DATA_DOUBLE_ESCAPED - - of SCRIPT_DATA_DOUBLE_ESCAPE_END: - case c - of AsciiWhitespace, '/', '>': - if tokenizer.tmp == "script": - switch_state SCRIPT_DATA_ESCAPED - else: - switch_state SCRIPT_DATA_DOUBLE_ESCAPED - emit_current - of AsciiAlpha: # note: merged upper & lower - tokenizer.tmp &= c.tolower() - emit_current - else: - reconsume_in SCRIPT_DATA_DOUBLE_ESCAPED - - of BEFORE_ATTRIBUTE_NAME: - case c - of AsciiWhitespace: discard - of '/', '>', eof: reconsume_in AFTER_ATTRIBUTE_NAME - of '=': - parse_error UNEXPECTED_EQUALS_SIGN_BEFORE_ATTRIBUTE_NAME - start_new_attribute - switch_state ATTRIBUTE_NAME - else: - start_new_attribute - reconsume_in ATTRIBUTE_NAME - - of ATTRIBUTE_NAME: - has_anything_else - case c - of AsciiWhitespace, '/', '>', eof: - leave_attribute_name_state - reconsume_in AFTER_ATTRIBUTE_NAME - of '=': - leave_attribute_name_state - switch_state BEFORE_ATTRIBUTE_VALUE - of AsciiUpperAlpha: - tokenizer.attrn &= c.tolower() - of null: - parse_error UNEXPECTED_NULL_CHARACTER - 
tokenizer.attrn &= Rune(0xFFFD) - of '"', '\'', '<': - parse_error UNEXPECTED_CHARACTER_IN_ATTRIBUTE_NAME - anything_else - else: - tokenizer.attrn &= r - - of AFTER_ATTRIBUTE_NAME: - case c - of AsciiWhitespace: discard - of '/': switch_state SELF_CLOSING_START_TAG - of '=': switch_state BEFORE_ATTRIBUTE_VALUE - of '>': - switch_state DATA - emit_tok - of eof: - parse_error EOF_IN_TAG - emit_eof - else: - start_new_attribute - reconsume_in ATTRIBUTE_NAME - - of BEFORE_ATTRIBUTE_VALUE: - case c - of AsciiWhitespace: discard - of '"': switch_state ATTRIBUTE_VALUE_DOUBLE_QUOTED - of '\'': switch_state ATTRIBUTE_VALUE_SINGLE_QUOTED - of '>': - parse_error MISSING_ATTRIBUTE_VALUE - switch_state DATA - emit '>' - else: reconsume_in ATTRIBUTE_VALUE_UNQUOTED - - of ATTRIBUTE_VALUE_DOUBLE_QUOTED: - case c - of '"': switch_state AFTER_ATTRIBUTE_VALUE_QUOTED - of '&': switch_state_return CHARACTER_REFERENCE - of null: - parse_error UNEXPECTED_NULL_CHARACTER - append_to_current_attr_value Rune(0xFFFD) - of eof: - parse_error EOF_IN_TAG - emit_eof - else: append_to_current_attr_value r - - of ATTRIBUTE_VALUE_SINGLE_QUOTED: - case c - of '\'': switch_state AFTER_ATTRIBUTE_VALUE_QUOTED - of '&': switch_state_return CHARACTER_REFERENCE - of null: - parse_error UNEXPECTED_NULL_CHARACTER - append_to_current_attr_value Rune(0xFFFD) - of eof: - parse_error EOF_IN_TAG - emit_eof - else: append_to_current_attr_value r - - of ATTRIBUTE_VALUE_UNQUOTED: - case c - of AsciiWhitespace: switch_state BEFORE_ATTRIBUTE_NAME - of '&': switch_state_return CHARACTER_REFERENCE - of '>': - switch_state DATA - emit_tok - of null: - parse_error UNEXPECTED_NULL_CHARACTER - append_to_current_attr_value Rune(0xFFFD) - of '"', '\'', '<', '=', '`': - parse_error UNEXPECTED_CHARACTER_IN_UNQUOTED_ATTRIBUTE_VALUE - append_to_current_attr_value c - of eof: - parse_error EOF_IN_TAG - emit_eof - else: append_to_current_attr_value r - - of AFTER_ATTRIBUTE_VALUE_QUOTED: - case c - of AsciiWhitespace: - 
switch_state BEFORE_ATTRIBUTE_NAME - of '/': - switch_state SELF_CLOSING_START_TAG - of '>': - switch_state DATA - emit_tok - of eof: - parse_error EOF_IN_TAG - emit_eof - else: - parse_error MISSING_WHITESPACE_BETWEEN_ATTRIBUTES - reconsume_in BEFORE_ATTRIBUTE_NAME - - of SELF_CLOSING_START_TAG: - case c - of '>': - tokenizer.tok.selfclosing = true - switch_state DATA - emit_tok - of eof: - parse_error EOF_IN_TAG - emit_eof - else: - parse_error UNEXPECTED_SOLIDUS_IN_TAG - reconsume_in BEFORE_ATTRIBUTE_NAME - - of BOGUS_COMMENT: - assert tokenizer.tok.t == COMMENT - case c - of '>': - switch_state DATA - emit_tok - of eof: - emit_tok - emit_eof - of null: parse_error UNEXPECTED_NULL_CHARACTER - else: tokenizer.tok.data &= r - - of MARKUP_DECLARATION_OPEN: # note: rewritten to fit case model as we consume a char anyway - has_anything_else - case c - of '-': - if peek_char == '-': - new_token Token(t: COMMENT) - tokenizer.state = COMMENT_START - consume_and_discard 1 - else: anything_else - of 'D', 'd': - if peek_str_nocase("OCTYPE"): - consume_and_discard "OCTYPE".len - switch_state DOCTYPE - else: anything_else - of '[': - if peek_str("CDATA["): - consume_and_discard "CDATA[".len - if tokenizer.hasnonhtml: - switch_state CDATA_SECTION - else: - parse_error CDATA_IN_HTML_CONTENT - new_token Token(t: COMMENT, data: "[CDATA[") - switch_state BOGUS_COMMENT - else: anything_else - else: - parse_error INCORRECTLY_OPENED_COMMENT - new_token Token(t: COMMENT) - reconsume_in BOGUS_COMMENT - - of COMMENT_START: - case c - of '-': switch_state COMMENT_START_DASH - of '>': - parse_error ABRUPT_CLOSING_OF_EMPTY_COMMENT - switch_state DATA - emit_tok - else: reconsume_in COMMENT - - of COMMENT_START_DASH: - case c - of '-': switch_state COMMENT_END - of '>': - parse_error ABRUPT_CLOSING_OF_EMPTY_COMMENT - switch_state DATA - emit_tok - of eof: - parse_error EOF_IN_COMMENT - emit_tok - emit_eof - else: - tokenizer.tok.data &= '-' - reconsume_in COMMENT - - of COMMENT: - case c - 
of '<': - tokenizer.tok.data &= c - switch_state COMMENT_LESS_THAN_SIGN - of '-': switch_state COMMENT_END_DASH - of null: - parse_error UNEXPECTED_NULL_CHARACTER - tokenizer.tok.data &= Rune(0xFFFD) - of eof: - parse_error EOF_IN_COMMENT - emit_tok - emit_eof - else: tokenizer.tok.data &= r - - of COMMENT_LESS_THAN_SIGN: - case c - of '!': - tokenizer.tok.data &= c - switch_state COMMENT_LESS_THAN_SIGN_BANG - of '<': tokenizer.tok.data &= c - else: reconsume_in COMMENT - - of COMMENT_LESS_THAN_SIGN_BANG: - case c - of '-': switch_state COMMENT_LESS_THAN_SIGN_BANG_DASH - else: reconsume_in COMMENT - - of COMMENT_LESS_THAN_SIGN_BANG_DASH: - case c - of '-': switch_state COMMENT_LESS_THAN_SIGN_BANG_DASH_DASH - else: reconsume_in COMMENT_END_DASH - - of COMMENT_LESS_THAN_SIGN_BANG_DASH_DASH: - case c - of '>', eof: reconsume_in COMMENT_END - else: - parse_error NESTED_COMMENT - reconsume_in COMMENT_END - - of COMMENT_END_DASH: - case c - of '-': switch_state COMMENT_END - of eof: - parse_error EOF_IN_COMMENT - emit_tok - emit_eof - else: - tokenizer.tok.data &= '-' - reconsume_in COMMENT - - of COMMENT_END: - case c - of '>': switch_state DATA - of '!': switch_state COMMENT_END_BANG - of '-': tokenizer.tok.data &= '-' - of eof: - parse_error EOF_IN_COMMENT - emit_tok - emit_eof - else: - tokenizer.tok.data &= "--" - reconsume_in COMMENT - - of COMMENT_END_BANG: - case c - of '-': - tokenizer.tok.data &= "--!" - switch_state COMMENT_END_DASH - of '>': - parse_error INCORRECTLY_CLOSED_COMMENT - switch_state DATA - emit_tok - of eof: - parse_error EOF_IN_COMMENT - emit_tok - emit_eof - else: - tokenizer.tok.data &= "--!" 
- reconsume_in COMMENT - - of DOCTYPE: - case c - of AsciiWhitespace: switch_state BEFORE_DOCTYPE_NAME - of '>': reconsume_in BEFORE_DOCTYPE_NAME - of eof: - parse_error EOF_IN_DOCTYPE - new_token Token(t: DOCTYPE, quirks: true) - emit_tok - emit_eof - else: - parse_error MISSING_WHITESPACE_BEFORE_DOCTYPE_NAME - reconsume_in BEFORE_DOCTYPE_NAME - - of BEFORE_DOCTYPE_NAME: - case c - of AsciiWhitespace: discard - of AsciiUpperAlpha: - new_token Token(t: DOCTYPE, name: some($c.tolower())) - switch_state DOCTYPE_NAME - of null: - parse_error UNEXPECTED_NULL_CHARACTER - new_token Token(t: DOCTYPE, name: some($Rune(0xFFFD))) - of '>': - parse_error MISSING_DOCTYPE_NAME - new_token Token(t: DOCTYPE, quirks: true) - switch_state DATA - emit_tok - of eof: - parse_error EOF_IN_DOCTYPE - new_token Token(t: DOCTYPE, quirks: true) - emit_tok - emit_eof - else: - new_token Token(t: DOCTYPE, name: some($r)) - switch_state DOCTYPE_NAME - - of DOCTYPE_NAME: - case c - of AsciiWhitespace: switch_state AFTER_DOCTYPE_NAME - of '>': - switch_state DATA - emit_tok - of AsciiUpperAlpha: - tokenizer.tok.name.get &= c.tolower() - of null: - parse_error UNEXPECTED_NULL_CHARACTER - tokenizer.tok.name.get &= Rune(0xFFFD) - of eof: - parse_error EOF_IN_DOCTYPE - tokenizer.tok.quirks = true - emit_tok - emit_eof - else: - tokenizer.tok.name.get &= r - - of AFTER_DOCTYPE_NAME: # note: rewritten to fit case model as we consume a char anyway - has_anything_else - case c - of AsciiWhitespace: discard - of '>': - switch_state DATA - emit_tok - of eof: - parse_error EOF_IN_DOCTYPE - tokenizer.tok.quirks = true - emit_tok - emit_eof - of 'p', 'P': - if peek_str("UBLIC"): - consume_and_discard "UBLIC".len - switch_state AFTER_DOCTYPE_PUBLIC_KEYWORD - else: - anything_else - of 's', 'S': - if peek_str("YSTEM"): - consume_and_discard "YSTEM".len - switch_state AFTER_DOCTYPE_SYSTEM_KEYWORD - else: - anything_else - else: - parse_error INVALID_CHARACTER_SEQUENCE_AFTER_DOCTYPE_NAME - tokenizer.tok.quirks = 
true - reconsume_in BOGUS_DOCTYPE - - of AFTER_DOCTYPE_PUBLIC_KEYWORD: - case c - of AsciiWhitespace: switch_state BEFORE_DOCTYPE_PUBLIC_IDENTIFIER - of '"': - parse_error MISSING_WHITESPACE_AFTER_DOCTYPE_PUBLIC_KEYWORD - tokenizer.tok.pubid = some("") - switch_state DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED - of '>': - parse_error MISSING_DOCTYPE_PUBLIC_IDENTIFIER - tokenizer.tok.quirks = true - switch_state DATA - emit_tok - of eof: - parse_error EOF_IN_DOCTYPE - tokenizer.tok.quirks = true - emit_tok - emit_eof - else: - parse_error MISSING_QUOTE_BEFORE_DOCTYPE_PUBLIC_IDENTIFIER - tokenizer.tok.quirks = true - reconsume_in BOGUS_DOCTYPE - - of BEFORE_DOCTYPE_PUBLIC_IDENTIFIER: - case c - of AsciiWhitespace: discard - of '"': - tokenizer.tok.pubid = some("") - switch_state DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED - of '\'': - tokenizer.tok.pubid = some("") - switch_state DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED - of '>': - parse_error MISSING_DOCTYPE_PUBLIC_IDENTIFIER - tokenizer.tok.quirks = true - switch_state DATA - emit_tok - of eof: - parse_error EOF_IN_DOCTYPE - tokenizer.tok.quirks = true - emit_tok - emit_eof - else: - parse_error MISSING_QUOTE_BEFORE_DOCTYPE_PUBLIC_IDENTIFIER - tokenizer.tok.quirks = true - reconsume_in BOGUS_DOCTYPE - - of DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED: - case c - of '"': switch_state AFTER_DOCTYPE_PUBLIC_IDENTIFIER - of null: - parse_error UNEXPECTED_NULL_CHARACTER - tokenizer.tok.pubid.get &= Rune(0xFFFD) - of '>': - parse_error ABRUPT_DOCTYPE_PUBLIC_IDENTIFIER - tokenizer.tok.quirks = true - switch_state DATA - emit_tok - of eof: - parse_error EOF_IN_DOCTYPE - tokenizer.tok.quirks = true - emit_tok - emit_eof - else: - tokenizer.tok.pubid.get &= r - - of DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED: - case c - of '\'': switch_state AFTER_DOCTYPE_PUBLIC_IDENTIFIER - of null: - parse_error UNEXPECTED_NULL_CHARACTER - tokenizer.tok.pubid.get &= Rune(0xFFFD) - of '>': - parse_error ABRUPT_DOCTYPE_PUBLIC_IDENTIFIER - tokenizer.tok.quirks 
= true - switch_state DATA - emit_tok - of eof: - parse_error EOF_IN_DOCTYPE - tokenizer.tok.quirks = true - emit_tok - emit_eof - else: - tokenizer.tok.pubid.get &= r - - of AFTER_DOCTYPE_PUBLIC_IDENTIFIER: - case c - of AsciiWhitespace: switch_state BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS - of '>': - switch_state DATA - emit_tok - of '"': - parse_error MISSING_WHITESPACE_BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS - tokenizer.tok.sysid = some("") - switch_state DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED - of '\'': - parse_error MISSING_WHITESPACE_BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS - tokenizer.tok.sysid = some("") - switch_state DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED - of eof: - parse_error EOF_IN_DOCTYPE - tokenizer.tok.quirks = true - emit_tok - emit_eof - else: - parse_error MISSING_QUOTE_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER - tokenizer.tok.quirks = true - reconsume_in BOGUS_DOCTYPE - - of BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS: - case c - of AsciiWhitespace: discard - of '>': - switch_state DATA - emit_tok - of '"': - tokenizer.tok.sysid = some("") - switch_state DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED - of '\'': - tokenizer.tok.sysid = some("") - switch_state DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED - of eof: - parse_error EOF_IN_DOCTYPE - tokenizer.tok.quirks = true - emit_tok - emit_eof - else: - parse_error MISSING_QUOTE_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER - tokenizer.tok.quirks = true - reconsume_in BOGUS_DOCTYPE - - of AFTER_DOCTYPE_SYSTEM_KEYWORD: - case c - of AsciiWhitespace: switch_state BEFORE_DOCTYPE_SYSTEM_IDENTIFIER - of '"': - parse_error MISSING_WHITESPACE_AFTER_DOCTYPE_SYSTEM_KEYWORD - tokenizer.tok.sysid = some("") - switch_state DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED - of '\'': - parse_error MISSING_WHITESPACE_AFTER_DOCTYPE_SYSTEM_KEYWORD - tokenizer.tok.sysid = some("") - switch_state DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED - of '>': - parse_error MISSING_DOCTYPE_SYSTEM_IDENTIFIER - tokenizer.tok.quirks = true - 
switch_state DATA - emit_tok - of eof: - parse_error EOF_IN_DOCTYPE - tokenizer.tok.quirks = true - emit_tok - emit_eof - else: - parse_error MISSING_QUOTE_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER - tokenizer.tok.quirks = true - reconsume_in BOGUS_DOCTYPE - - of BEFORE_DOCTYPE_SYSTEM_IDENTIFIER: - case c - of AsciiWhitespace: discard - of '"': - tokenizer.tok.pubid = some("") - switch_state DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED - of '\'': - tokenizer.tok.pubid = some("") - switch_state DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED - of '>': - parse_error MISSING_DOCTYPE_SYSTEM_IDENTIFIER - tokenizer.tok.quirks = true - switch_state DATA - emit_tok - of eof: - parse_error EOF_IN_DOCTYPE - tokenizer.tok.quirks = true - emit_tok - emit_eof - else: - parse_error MISSING_QUOTE_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER - tokenizer.tok.quirks = true - reconsume_in BOGUS_DOCTYPE - - of DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED: - case c - of '"': switch_state AFTER_DOCTYPE_SYSTEM_IDENTIFIER - of null: - parse_error UNEXPECTED_NULL_CHARACTER - tokenizer.tok.sysid.get &= Rune(0xFFFD) - of '>': - parse_error ABRUPT_DOCTYPE_SYSTEM_IDENTIFIER - tokenizer.tok.quirks = true - switch_state DATA - emit_tok - of eof: - parse_error EOF_IN_DOCTYPE - tokenizer.tok.quirks = true - emit_tok - emit_eof - else: - tokenizer.tok.sysid.get &= r - - of DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED: - case c - of '\'': switch_state AFTER_DOCTYPE_SYSTEM_IDENTIFIER - of null: - parse_error UNEXPECTED_NULL_CHARACTER - tokenizer.tok.sysid.get &= Rune(0xFFFD) - of '>': - parse_error ABRUPT_DOCTYPE_SYSTEM_IDENTIFIER - tokenizer.tok.quirks = true - switch_state DATA - emit_tok - of eof: - parse_error EOF_IN_DOCTYPE - tokenizer.tok.quirks = true - emit_tok - emit_eof - else: - tokenizer.tok.sysid.get &= r - - of AFTER_DOCTYPE_SYSTEM_IDENTIFIER: - case c - of AsciiWhitespace: discard - of '>': - switch_state DATA - emit_tok - of eof: - parse_error EOF_IN_DOCTYPE - tokenizer.tok.quirks = true - emit_tok - emit_eof - else: - 
parse_error UNEXPECTED_CHARACTER_AFTER_DOCTYPE_SYSTEM_IDENTIFIER - reconsume_in BOGUS_DOCTYPE - - of BOGUS_DOCTYPE: - case c - of '>': - switch_state DATA - emit_tok - of null: parse_error UNEXPECTED_NULL_CHARACTER - of eof: - emit_tok - emit_eof - else: discard - - of CDATA_SECTION: - case c - of ']': switch_state CDATA_SECTION_BRACKET - of eof: - parse_error EOF_IN_CDATA - emit_eof - else: - emit_current - - of CDATA_SECTION_BRACKET: - case c - of ']': switch_state CDATA_SECTION_END - of '>': switch_state DATA - else: - emit ']' - reconsume_in CDATA_SECTION - - of CDATA_SECTION_END: - case c - of ']': emit ']' - of '>': switch_state DATA - else: - emit ']' - emit ']' - reconsume_in CDATA_SECTION - - of CHARACTER_REFERENCE: - tokenizer.tmp = "&" - case c - of AsciiAlpha: reconsume_in NAMED_CHARACTER_REFERENCE - of '#': - tokenizer.tmp &= '#' - switch_state NUMERIC_CHARACTER_REFERENCE - else: - flush_code_points_consumed_as_a_character_reference - reconsume_in tokenizer.rstate - - of NAMED_CHARACTER_REFERENCE: - ignore_eof # we check for eof ourselves - tokenizer.reconsume() - when nimvm: - eprint "Cannot evaluate character references at compile time" - else: - var tokenizerp = addr tokenizer - var lasti = 0 - let value = entityMap.find(proc(s: var string): bool = - if tokenizerp[].atEof: - return false - let rs = $tokenizerp[].consume() - lasti = tokenizerp[].tmp.len - tokenizerp[].tmp &= rs - s &= rs - return true - ) - tokenizer.reconsume() - tokenizer.tmp.setLen(lasti) - if value.isOk: - if consumed_as_an_attribute and tokenizer.tmp[^1] != ';' and peek_char in {'='} + AsciiAlpha: - flush_code_points_consumed_as_a_character_reference - switch_state tokenizer.rstate - else: - if tokenizer.tmp[^1] != ';': - parse_error MISSING_SEMICOLON_AFTER_CHARACTER_REFERENCE - tokenizer.tmp = value.get - flush_code_points_consumed_as_a_character_reference - switch_state tokenizer.rstate - else: - flush_code_points_consumed_as_a_character_reference - switch_state 
AMBIGUOUS_AMPERSAND_STATE - - of AMBIGUOUS_AMPERSAND_STATE: - case c - of AsciiAlpha: - if consumed_as_an_attribute: - append_to_current_attr_value c - else: - emit_current - of ';': - parse_error UNKNOWN_NAMED_CHARACTER_REFERENCE - reconsume_in tokenizer.rstate - else: reconsume_in tokenizer.rstate - - of NUMERIC_CHARACTER_REFERENCE: - tokenizer.code = 0 - case c - of 'x', 'X': - tokenizer.tmp &= c - switch_state HEXADECIMAL_CHARACTER_REFERENCE_START - else: reconsume_in DECIMAL_CHARACTER_REFERENCE_START - - of HEXADECIMAL_CHARACTER_REFERENCE_START: - case c - of AsciiHexDigit: reconsume_in HEXADECIMAL_CHARACTER_REFERENCE - else: - parse_error ABSENCE_OF_DIGITS_IN_NUMERIC_CHARACTER_REFERENCE - flush_code_points_consumed_as_a_character_reference - reconsume_in tokenizer.rstate - - of DECIMAL_CHARACTER_REFERENCE_START: - case c - of AsciiDigit: reconsume_in DECIMAL_CHARACTER_REFERENCE - else: - parse_error ABSENCE_OF_DIGITS_IN_NUMERIC_CHARACTER_REFERENCE - flush_code_points_consumed_as_a_character_reference - reconsume_in tokenizer.rstate - - of HEXADECIMAL_CHARACTER_REFERENCE: - case c - of AsciiHexDigit: # note: merged digit, upper hex, lower hex - tokenizer.code *= 0x10 - tokenizer.code += hexValue(c) - of ';': switch_state NUMERIC_CHARACTER_REFERENCE_END - else: - parse_error MISSING_SEMICOLON_AFTER_CHARACTER_REFERENCE - reconsume_in NUMERIC_CHARACTER_REFERENCE_END - - of DECIMAL_CHARACTER_REFERENCE: - case c - of AsciiDigit: - tokenizer.code *= 10 - tokenizer.code += decValue(c) - of ';': switch_state NUMERIC_CHARACTER_REFERENCE_END - else: - parse_error MISSING_SEMICOLON_AFTER_CHARACTER_REFERENCE - reconsume_in NUMERIC_CHARACTER_REFERENCE_END - - of NUMERIC_CHARACTER_REFERENCE_END: - ignore_eof # we reconsume anyway - case tokenizer.code - of 0x00: - parse_error NULL_CHARACTER_REFERENCE - tokenizer.code = 0xFFFD - elif tokenizer.code > 0x10FFFF: - parse_error CHARACTER_REFERENCE_OUTSIDE_UNICODE_RANGE - tokenizer.code = 0xFFFD - elif 
Rune(tokenizer.code).isSurrogate(): - parse_error SURROGATE_CHARACTER_REFERENCE - tokenizer.code = 0xFFFD - elif Rune(tokenizer.code).isNonCharacter(): - parse_error NONCHARACTER_CHARACTER_REFERENCE - # do nothing - elif tokenizer.code in 0..255 and char(tokenizer.code) in ((Controls - AsciiWhitespace) + {chr(0x0D)}): - const ControlMapTable = [ - (0x80, 0x20AC), (0x82, 0x201A), (0x83, 0x0192), (0x84, 0x201E), - (0x85, 0x2026), (0x86, 0x2020), (0x87, 0x2021), (0x88, 0x02C6), - (0x89, 0x2030), (0x8A, 0x0160), (0x8B, 0x2039), (0x8C, 0x0152), - (0x8E, 0x017D), (0x91, 0x2018), (0x92, 0x2019), (0x93, 0x201C), - (0x94, 0x201D), (0x95, 0x2022), (0x96, 0x2013), (0x97, 0x2014), - (0x98, 0x02DC), (0x99, 0x2122), (0x9A, 0x0161), (0x9B, 0x203A), - (0x9C, 0x0153), (0x9E, 0x017E), (0x9F, 0x0178), - ].toTable() - if ControlMapTable.hasKey(tokenizer.code): - tokenizer.code = ControlMapTable[tokenizer.code] - tokenizer.tmp = $Rune(tokenizer.code) - flush_code_points_consumed_as_a_character_reference #TODO optimize so we flush directly - reconsume_in tokenizer.rstate # we unnecessarily consumed once so reconsume - - for tok in tokqueue: - yield tok - tokqueue.setLen(0) diff --git a/src/html/parseerror.nim b/src/html/parseerror.nim deleted file mode 100644 index d99b2fed..00000000 --- a/src/html/parseerror.nim +++ /dev/null @@ -1,70 +0,0 @@ -type ParseError* = enum - #TODO write a description for all error codes - ABRUPT_CLOSING_OF_EMPTY_COMMENT - ABRUPT_DOCTYPE_PUBLIC_IDENTIFIER - ABRUPT_DOCTYPE_SYSTEM_IDENTIFIER - ABSENCE_OF_DIGITS_IN_NUMERIC_CHARACTER_REFERENCE - CDATA_IN_HTML_CONTENT - CHARACTER_REFERENCE_OUTSIDE_UNICODE_RANGE - CONTROL_CHARACTER_IN_INPUT_STREAM - CONTROL_CHARACTER_REFERENCE - END_TAG_WITH_ATTRIBUTES - DUPLICATE_ATTRIBUTE - END_TAG_WITH_TRAILING_SOLIDUS - EOF_BEFORE_TAG_NAME - EOF_IN_CDATA - EOF_IN_COMMENT - EOF_IN_DOCTYPE - EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT - EOF_IN_TAG - INCORRECTLY_CLOSED_COMMENT - INCORRECTLY_OPENED_COMMENT - 
INVALID_CHARACTER_SEQUENCE_AFTER_DOCTYPE_NAME - INVALID_FIRST_CHARACTER_OF_TAG_NAME - MISSING_ATTRIBUTE_VALUE - MISSING_DOCTYPE_NAME - MISSING_DOCTYPE_PUBLIC_IDENTIFIER - MISSING_DOCTYPE_SYSTEM_IDENTIFIER - MISSING_END_TAG_NAME - MISSING_QUOTE_BEFORE_DOCTYPE_PUBLIC_IDENTIFIER - MISSING_QUOTE_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER - MISSING_SEMICOLON_AFTER_CHARACTER_REFERENCE - MISSING_WHITESPACE_AFTER_DOCTYPE_PUBLIC_KEYWORD - MISSING_WHITESPACE_AFTER_DOCTYPE_SYSTEM_KEYWORD - MISSING_WHITESPACE_BEFORE_DOCTYPE_NAME - MISSING_WHITESPACE_BETWEEN_ATTRIBUTES - MISSING_WHITESPACE_BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS - NESTED_COMMENT - NONCHARACTER_CHARACTER_REFERENCE - NONCHARACTER_IN_INPUT_STREAM - NON_VOID_HTML_ELEMENT_START_TAG_WITH_TRAILING_SOLIDUS - NULL_CHARACTER_REFERENCE - SURROGATE_CHARACTER_REFERENCE - SURROGATE_IN_INPUT_STREAM - UNEXPECTED_CHARACTER_AFTER_DOCTYPE_SYSTEM_IDENTIFIER - UNEXPECTED_CHARACTER_IN_ATTRIBUTE_NAME - UNEXPECTED_CHARACTER_IN_UNQUOTED_ATTRIBUTE_VALUE - UNEXPECTED_EQUALS_SIGN_BEFORE_ATTRIBUTE_NAME - UNEXPECTED_NULL_CHARACTER - UNEXPECTED_QUESTION_MARK_INSTEAD_OF_TAG_NAME - UNEXPECTED_SOLIDUS_IN_TAG - UNKNOWN_NAMED_CHARACTER_REFERENCE - LAST_SPECIFIED_ERROR # never returned - # From here on, error code names have not been specified by the standard. - MISMATCHED_TAGS = "Mismatched start and end tags" - INVALID_DOCTYPE = "Unrecognized document type" - UNEXPECTED_DOCTYPE = "Unexpected document type" - UNEXPECTED_INITIAL_TOKEN = "Unexpected token in initial state" - UNEXPECTED_START_TAG = "Unexpected start tag" - UNEXPECTED_END_TAG = "Unexpected end tag" - ELEMENT_NOT_IN_OPEN_ELEMENTS = "Element has not been added to open elements" - ELEMENT_NOT_IN_SCOPE = "Element not in appropriate scope" - ELEMENT_NOT_CURRENT_NODE = "Element is not current node" - #TODO merge with UNEXPECTED_NULL_CHARACTER? 
- UNEXPECTED_NULL = "Unexpected null character" - NESTED_TAGS = "Non-nestable nested tags" - UNEXPECTED_SPECIAL_ELEMENT = "Unexpected special element on open elements" - UNEXPECTED_EOF = "Unexpected end of file" - INVALID_TEXT_PARENT = "Invalid parent element for text node" - NON_SPACE_TABLE_TEXT = "Non-space table text" - UNEXPECTED_AFTER_BODY_TOKEN = "Unexpected token after body" diff --git a/src/html/tags.nim b/src/html/tags.nim deleted file mode 100644 index 4f7bba53..00000000 --- a/src/html/tags.nim +++ /dev/null @@ -1,186 +0,0 @@ -import tables -import strutils - -type - NodeType* = enum - ELEMENT_NODE = 1, - ATTRIBUTE_NODE = 2, - TEXT_NODE = 3, - CDATA_SECTION_NODE = 4, - ENTITY_REFERENCE_NODE = 5, - ENTITY_NODE = 6 - PROCESSING_INSTRUCTION_NODE = 7, - COMMENT_NODE = 8, - DOCUMENT_NODE = 9, - DOCUMENT_TYPE_NODE = 10, - DOCUMENT_FRAGMENT_NODE = 11, - NOTATION_NODE = 12 - - InputType* = enum - INPUT_UNKNOWN, INPUT_BUTTON, INPUT_CHECKBOX, INPUT_COLOR, INPUT_DATE, - INPUT_DATETIME_LOCAL, INPUT_EMAIL, INPUT_FILE, INPUT_HIDDEN, INPUT_IMAGE, - INPUT_MONTH, INPUT_NUMBER, INPUT_PASSWORD, INPUT_RADIO, INPUT_RANGE, - INPUT_RESET, INPUT_SEARCH, INPUT_SUBMIT, INPUT_TEL, INPUT_TEXT, INPUT_TIME, - INPUT_URL, INPUT_WEEK - - ButtonType* = enum - BUTTON_SUBMIT, BUTTON_RESET, BUTTON_BUTTON - - TagType* = enum - TAG_UNKNOWN, TAG_APPLET, TAG_BIG, TAG_HTML, TAG_BASE, TAG_BASEFONT, - TAG_BGSOUND, TAG_HEAD, TAG_LINK, TAG_LISTING, TAG_META, TAG_STYLE, - TAG_TITLE, TAG_BODY, TAG_ADDRESS, TAG_ARTICLE, TAG_ASIDE, TAG_FOOTER, - TAG_HEADER, TAG_H1, TAG_H2, TAG_H3, TAG_H4, TAG_H5, TAG_H6, TAG_HGROUP, - TAG_MAIN, TAG_NAV, TAG_SEARCH, TAG_SECTION, TAG_BLOCKQUOTE, TAG_DD, - TAG_DIV, TAG_DL, TAG_DT, TAG_FIGCAPTION, TAG_FIGURE, TAG_HR, TAG_LI, - TAG_OL, TAG_P, TAG_PRE, TAG_UL, TAG_A, TAG_ABBR, TAG_B, TAG_BDI, TAG_BDO, - TAG_BR, TAG_NOBR, TAG_CITE, TAG_CODE, TAG_DATA, TAG_DFN, TAG_EM, TAG_EMBED, - TAG_I, TAG_KBD, TAG_MARK, TAG_MARQUEE, TAG_Q, TAG_RB, TAG_RP, TAG_RT, - TAG_RTC, TAG_RUBY, TAG_S, 
TAG_SAMP, TAG_SMALL, TAG_SPAN, TAG_STRONG, - TAG_SUB, TAG_SUP, TAG_TIME, TAG_U, TAG_VAR, TAG_WBR, TAG_AREA, - TAG_AUDIO, TAG_IMG, TAG_IMAGE, TAG_MAP, TAG_TRACK, TAG_VIDEO, TAG_IFRAME, - TAG_OBJECT, TAG_PARAM, TAG_PICTURE, TAG_PORTAL, TAG_SOURCE, TAG_CANVAS, - TAG_NOSCRIPT, TAG_NOEMBED, TAG_PLAINTEXT, TAG_XMP, TAG_SCRIPT, TAG_DEL, - TAG_INS, TAG_CAPTION, TAG_COL, TAG_COLGROUP, TAG_TABLE, TAG_TBODY, - TAG_TD, TAG_TFOOT, TAG_TH, TAG_THEAD, TAG_TR, TAG_BUTTON, TAG_DATALIST, - TAG_FIELDSET, TAG_FORM, TAG_INPUT, TAG_KEYGEN, TAG_LABEL, TAG_LEGEND, - TAG_METER, TAG_OPTGROUP, TAG_OPTION, TAG_OUTPUT, TAG_PROGRESS, TAG_SELECT, - TAG_TEXTAREA, TAG_DETAILS, TAG_DIALOG, TAG_MENU, TAG_SUMMARY, TAG_BLINK, - TAG_CENTER, TAG_CONTENT, TAG_DIR, TAG_FONT, TAG_FRAME, TAG_NOFRAMES, - TAG_FRAMESET, TAG_STRIKE, TAG_TT, TAG_TEMPLATE, TAG_SARCASM - - QuirksMode* = enum - NO_QUIRKS, QUIRKS, LIMITED_QUIRKS - - Namespace* = enum - NO_NAMESPACE = "", - HTML = "http://www.w3.org/1999/xhtml", - MATHML = "http://www.w3.org/1998/Math/MathML", - SVG = "http://www.w3.org/2000/svg", - XLINK = "http://www.w3.org/1999/xlink", - XML = "http://www.w3.org/XML/1998/namespace", - XMLNS = "http://www.w3.org/2000/xmlns/" - -func getTagTypeMap(): Table[string, TagType] = - for i in TagType: - let enumname = $TagType(i) - let tagname = enumname.split('_')[1..^1].join("_").tolower() - result[tagname] = TagType(i) - -func getInputTypeMap(): Table[string, InputType] = - for i in InputType: - let enumname = $InputType(i) - let tagname = enumname.split('_')[1..^1].join("_").tolower() - result[tagname] = InputType(i) - -const tagTypeMap = getTagTypeMap() -const inputTypeMap = getInputTypeMap() - - -func tagType*(s: string): TagType = - if tagTypeMap.hasKey(s): - return tagTypeMap[s] - else: - return TAG_UNKNOWN - -func inputType*(s: string): InputType = - if inputTypeMap.hasKey(s): - return inputTypeMap[s] - else: - return INPUT_UNKNOWN - -const tagNameMap = (func(): Table[TagType, string] = - for k, v in tagTypeMap: - 
result[v] = k -)() - -const AllTagTypes* = (func(): set[TagType] = - for tag in TagType: - result.incl(tag) -)() - -func tagName*(t: TagType): string = - return tagNameMap[t] - -const SelfClosingTagTypes* = { - TAG_LI, TAG_P -} - -const VoidTagTypes* = { - TAG_AREA, TAG_BASE, TAG_BR, TAG_COL, TAG_FRAME, TAG_HR, TAG_IMG, TAG_INPUT, - TAG_SOURCE, TAG_TRACK, TAG_LINK, TAG_META, TAG_PARAM, TAG_WBR, TAG_HR -} - -const PClosingTagTypes* = { - TAG_ADDRESS, TAG_ARTICLE, TAG_ASIDE, TAG_BLOCKQUOTE, TAG_DETAILS, TAG_DIV, - TAG_DL, TAG_FIELDSET, TAG_FIGCAPTION, TAG_FIGURE, TAG_FOOTER, TAG_FORM, - TAG_H1, TAG_H2, TAG_H3, TAG_H4, TAG_H5, TAG_H6, TAG_HEADER, TAG_HGROUP, - TAG_HR, TAG_MAIN, TAG_MENU, TAG_NAV, TAG_OL, TAG_P, TAG_PRE, TAG_SECTION, - TAG_TABLE, TAG_UL -} - -const HTagTypes* = { - TAG_H1, TAG_H2, TAG_H3, TAG_H4, TAG_H5, TAG_H6 -} - -const HeadTagTypes* = { - TAG_BASE, TAG_LINK, TAG_META, TAG_TITLE, TAG_NOSCRIPT, TAG_SCRIPT, TAG_NOFRAMES, TAG_STYLE, TAG_HEAD -} - -# 4.10.2 Categories -const FormAssociatedElements* = { - TAG_BUTTON, TAG_FIELDSET, TAG_INPUT, TAG_OBJECT, TAG_OUTPUT, TAG_SELECT, TAG_TEXTAREA, TAG_IMG -} - -#TODO support all the other ones -const SupportedFormAssociatedElements* = { - TAG_BUTTON, TAG_INPUT, TAG_SELECT, TAG_TEXTAREA -} - -const ListedElements* = { - TAG_FIELDSET, TAG_INPUT, TAG_OBJECT, TAG_OUTPUT, TAG_SELECT, TAG_TEXTAREA -} - -const SubmittableElements* = { - TAG_BUTTON, TAG_INPUT, TAG_SELECT, TAG_TEXTAREA -} - -const ResettableElements* = { - TAG_INPUT, TAG_OUTPUT, TAG_SELECT, TAG_TEXTAREA -} - -const AutocapitalizeInheritingElements* = { - TAG_BUTTON, TAG_FIELDSET, TAG_INPUT, TAG_OUTPUT, TAG_SELECT, TAG_TEXTAREA -} - -const LabelableElements* = { - # input only if type not hidden - TAG_BUTTON, TAG_INPUT, TAG_METER, TAG_OUTPUT, TAG_PROGRESS, TAG_SELECT, TAG_TEXTAREA -} - -const CharacterDataNodes* = { - TEXT_NODE, CDATA_SECTION_NODE, PROCESSING_INSTRUCTION_NODE, COMMENT_NODE -} - -const InputTypeWithSize* = { - INPUT_SEARCH, INPUT_TEXT, 
INPUT_EMAIL, INPUT_PASSWORD, INPUT_URL, INPUT_TEL -} - -#https://html.spec.whatwg.org/multipage/parsing.html#the-stack-of-open-elements -#NOTE MathML not implemented -#TODO SVG foreignObject, SVG desc, SVG title -const SpecialElements* = { - TAG_ADDRESS, TAG_APPLET, TAG_AREA, TAG_ARTICLE, TAG_ASIDE, TAG_BASE, - TAG_BASEFONT, TAG_BGSOUND, TAG_BLOCKQUOTE, TAG_BODY, TAG_BR, TAG_BUTTON, - TAG_CAPTION, TAG_CENTER, TAG_COL, TAG_COLGROUP, TAG_DD, TAG_DETAILS, TAG_DIR, - TAG_DIV, TAG_DL, TAG_DT, TAG_EMBED, TAG_FIELDSET, TAG_FIGCAPTION, TAG_FIGURE, - TAG_FOOTER, TAG_FORM, TAG_FRAME, TAG_FRAMESET, TAG_H1, TAG_H2, TAG_H3, TAG_H4, - TAG_H5, TAG_H6, TAG_HEAD, TAG_HEADER, TAG_HGROUP, TAG_HR, TAG_HTML, - TAG_IFRAME, TAG_IMG, TAG_INPUT, TAG_KEYGEN, TAG_LI, TAG_LINK, TAG_LISTING, - TAG_MAIN, TAG_MARQUEE, TAG_MENU, TAG_META, TAG_NAV, TAG_NOEMBED, TAG_NOFRAMES, - TAG_NOSCRIPT, TAG_OBJECT, TAG_OL, TAG_P, TAG_PARAM, TAG_PLAINTEXT, TAG_PRE, - TAG_SCRIPT, TAG_SEARCH, TAG_SECTION, TAG_SELECT, TAG_SOURCE, TAG_STYLE, - TAG_SUMMARY, TAG_TABLE, TAG_TBODY, TAG_TD, TAG_TEMPLATE, TAG_TEXTAREA, - TAG_TFOOT, TAG_TH, TAG_THEAD, TAG_TITLE, TAG_TR, TAG_TRACK, TAG_UL, TAG_WBR, - TAG_XMP -} diff --git a/src/render/renderdocument.nim b/src/render/renderdocument.nim index b693d501..ab82b80e 100644 --- a/src/render/renderdocument.nim +++ b/src/render/renderdocument.nim @@ -7,7 +7,6 @@ import css/sheet import css/stylednode import css/values import html/dom -import html/tags import io/window import layout/box import layout/engine @@ -15,6 +14,8 @@ import layout/layoutunit import types/color import utils/twtstr +import chame/tags + func formatFromWord(computed: ComputedFormat): Format = result.fgcolor = computed.color.cellColor() if computed.bgcolor.a != 0: diff --git a/src/version.nim b/src/version.nim index 407b974a..2be08e9e 100644 --- a/src/version.nim +++ b/src/version.nim @@ -24,6 +24,8 @@ macro checkVersion(xs: static string, major, minor, patch: int) = "Please run `make submodule` to update.") 
tryImport chakasu/version, "chakasu" +tryImport chame/version, "chame" static: checkVersion("chakasu", 0, 1, 2) + checkVersion("chame", 0, 9, 3) diff --git a/src/xhr/formdata.nim b/src/xhr/formdata.nim index ad7a0529..e69b9bf7 100644 --- a/src/xhr/formdata.nim +++ b/src/xhr/formdata.nim @@ -1,11 +1,12 @@ import html/dom -import html/tags import js/exception import js/javascript import types/blob import types/formdata import utils/twtstr +import chame/tags + proc constructEntryList*(form: HTMLFormElement, submitter: Element = nil, encoding: string = ""): Option[seq[FormDataEntry]] |