diff options
Diffstat (limited to 'src')
-rw-r--r-- | src/html/chadombuilder.nim | 1 | ||||
-rw-r--r-- | src/html/htmlparser.nim | 414 | ||||
-rw-r--r-- | src/html/htmltokenizer.nim | 233 | ||||
-rw-r--r-- | src/html/parseerror.nim | 70 |
4 files changed, 437 insertions, 281 deletions
diff --git a/src/html/chadombuilder.nim b/src/html/chadombuilder.nim index 2ddd067b..eb83af23 100644 --- a/src/html/chadombuilder.nim +++ b/src/html/chadombuilder.nim @@ -152,7 +152,6 @@ proc newChaDOMBuilder(url: URL, window: Window): ChaDOMBuilder = return ChaDOMBuilder( document: document, finish: finish, - parseError: parseError, setQuirksMode: setQuirksMode, setCharacterSet: setCharacterset, elementPopped: elementPopped, diff --git a/src/html/htmlparser.nim b/src/html/htmlparser.nim index 41bcf66b..e72f897e 100644 --- a/src/html/htmlparser.nim +++ b/src/html/htmlparser.nim @@ -8,11 +8,13 @@ import unicode import data/charset import encoding/decoderstream -import html/tags import html/htmltokenizer +import html/parseerror +import html/tags import utils/twtstr -export macros, unicode +# Generics break without exporting macros. Maybe a compiler bug? +export macros # Heavily inspired by html5ever's TreeSink design. type @@ -109,8 +111,11 @@ type ## Parsing has finished. DOMBuilderParseError*[Handle] = - proc(builder: DOMBuilder[Handle], message: string) {.nimcall.} - ## Parse error. + proc(builder: DOMBuilder[Handle], message: ParseError) {.nimcall.} + ## Parse error. `message` is an error code either specified by the + ## standard (in this case, message < LAST_SPECIFIED_ERROR) or named + ## arbitrarily. (At the time of writing, only tokenizer errors have + ## specified error codes.) DOMBuilderSetQuirksMode*[Handle] = proc(builder: DOMBuilder[Handle], quirksMode: QuirksMode) {.nimcall.} @@ -279,6 +284,10 @@ proc finish[Handle](parser: HTML5Parser[Handle]) = if parser.dombuilder.finish != nil: parser.dombuilder.finish(parser.dombuilder) +proc parseError(parser: HTML5Parser, e: ParseError) = + if parser.dombuilder.parseError != nil: + parser.dombuilder.parseError(parser.dombuilder, e) + proc setQuirksMode[Handle](parser: var HTML5Parser[Handle], mode: QuirksMode) = parser.quirksMode = mode if parser.dombuilder.setQuirksMode != nil: @@ -371,6 +380,9 @@ func isSVGIntegrationPoint[Handle](parser: HTML5Parser, return false # Parser +func hasParseError(parser: HTML5Parser): bool = + return parser.dombuilder.parseError != nil + func tagNameEquals[Handle](parser: HTML5Parser, handle: Handle, token: Token): bool = let tagType = parser.getTagType(handle) @@ -441,8 +453,6 @@ func adjustedCurrentNode[Handle](parser: HTML5Parser[Handle]): Handle = else: parser.currentNode -template parse_error() = discard - func lastElementOfTag[Handle](parser: HTML5Parser[Handle], tagType: TagType): tuple[element: Handle, pos: int] = for i in countdown(parser.openElements.high, 0): @@ -488,6 +498,12 @@ func hasElement[Handle](parser: HTML5Parser[Handle], tag: TagType): bool = return true return false +func hasElement[Handle](parser: HTML5Parser[Handle], tags: set[TagType]): bool = + for element in parser.openElements: + if parser.getTagType(element) in tags: + return true + return false + func hasElementInSpecificScope[Handle](parser: HTML5Parser[Handle], target: Handle, list: set[TagType]): bool = for i in countdown(parser.openElements.high, 0): @@ -980,8 +996,21 @@ proc changeEncoding(parser: var HTML5Parser, cs: Charset) = parser.charset = cs parser.needsreinterpret = true +proc parseErrorByTokenType(parser: var HTML5Parser, tokenType: TokenType) = + case tokenType + of START_TAG: + parser.parseError UNEXPECTED_START_TAG + of END_TAG: + parser.parseError UNEXPECTED_END_TAG + of EOF: + parser.parseError UNEXPECTED_EOF + else: + doAssert false + proc adoptionAgencyAlgorithm[Handle](parser: var HTML5Parser[Handle], token: Token): bool = + template parse_error(e: ParseError) = + parser.parseError(e) if parser.tagNameEquals(parser.currentNode, token): var fail = true for it in parser.activeFormatting: @@ -1009,13 +1038,14 @@ proc adoptionAgencyAlgorithm[Handle](parser: var HTML5Parser[Handle], return true let stackIndex = parser.openElements.find(formatting) if stackIndex < 0: - parse_error + parse_error ELEMENT_NOT_IN_OPEN_ELEMENTS parser.activeFormatting.delete(formattingIndex) return false if not parser.hasElementInScope(formatting): - parse_error + parse_error ELEMENT_NOT_IN_SCOPE return false - if formatting != parser.currentNode: parse_error + if formatting != parser.currentNode: + parse_error ELEMENT_NOT_CURRENT_NODE var furthestBlock: Handle = nil var furthestBlockIndex: int for j in countdown(parser.openElements.high, 0): @@ -1087,7 +1117,7 @@ proc adoptionAgencyAlgorithm[Handle](parser: var HTML5Parser[Handle], proc closeP(parser: var HTML5Parser) = parser.generateImpliedEndTags(TAG_P) if parser.getTagType(parser.currentNode) != TAG_P: - parse_error + parser.parseError(MISMATCHED_TAGS) while parser.getTagType(parser.popElement()) != TAG_P: discard @@ -1275,22 +1305,43 @@ proc processInHTMLContent[Handle](parser: var HTML5Parser[Handle], token: Token, insertionMode: InsertionMode) = template pop_all_nodes = while parser.openElements.len > 1: pop_current_node + template anything_else = discard "anything_else" + macro `=>`(v: typed, body: untyped): untyped = quote do: discard (`v`, proc() = `body`) + template _ = discard + template reprocess(tok: Token) = parser.processInHTMLContent(tok, parser.insertionMode) + template parse_error(e: ParseError) = + parser.parseError(e) + + template parse_error_if_mismatch(tagtype: TagType) = + if parser.hasParseError(): + if parser.getTagType(parser.currentNode) != TAG_DD: + parse_error MISMATCHED_TAGS + + template parse_error_if_mismatch(tagtypes: set[TagType]) = + if parser.hasParseError(): + if parser.getTagType(parser.currentNode) notin tagtypes: + parse_error MISMATCHED_TAGS + case insertionMode of INITIAL: match token: AsciiWhitespace => (block: discard) - TokenType.COMMENT => (block: parser.insertComment(token, last_child_of(parser.document))) + TokenType.COMMENT => (block: + parser.insertComment(token, last_child_of(parser.document)) + ) TokenType.DOCTYPE => (block: - if token.name.isnone or token.name.get != "html" or token.pubid.issome or (token.sysid.issome and token.sysid.get != "about:legacy-compat"): - parse_error + if token.name.isNone or + token.name.get != "html" or token.pubid.isSome or + (token.sysid.isSome and token.sysid.get != "about:legacy-compat"): + parse_error INVALID_DOCTYPE let doctype = parser.createDocumentType(token.name.get(""), token.pubid.get(""), token.sysid.get("")) parser.append(parser.document, doctype) @@ -1303,7 +1354,7 @@ proc processInHTMLContent[Handle](parser: var HTML5Parser[Handle], ) _ => (block: if not parser.opts.isIframeSrcdoc: - parse_error + parse_error UNEXPECTED_INITIAL_TOKEN parser.setQuirksMode(QUIRKS) parser.insertionMode = BEFORE_HTML reprocess token @@ -1311,17 +1362,20 @@ proc processInHTMLContent[Handle](parser: var HTML5Parser[Handle], of BEFORE_HTML: match token: - TokenType.DOCTYPE => (block: parse_error) - TokenType.COMMENT => (block: parser.insertComment(token, last_child_of(parser.document))) + TokenType.DOCTYPE => (block: parse_error UNEXPECTED_DOCTYPE) + TokenType.COMMENT => (block: + parser.insertComment(token, last_child_of(parser.document)) + ) AsciiWhitespace => (block: discard) "<html>" => (block: - let element = parser.createElement(token, Namespace.HTML, parser.document) + let element = parser.createElement(token, Namespace.HTML, + parser.document) parser.append(parser.document, element) parser.pushElement(element) parser.insertionMode = BEFORE_HEAD ) ("</head>", "</body>", "</html>", "</br>") => (block: anything_else) - TokenType.END_TAG => (block: parse_error) + TokenType.END_TAG => (block: parse_error UNEXPECTED_END_TAG) _ => (block: let element = parser.createElement(TAG_HTML, Namespace.HTML) parser.append(parser.document, element) @@ -1334,14 +1388,14 @@ proc processInHTMLContent[Handle](parser: var HTML5Parser[Handle], match token: AsciiWhitespace => (block: discard) TokenType.COMMENT => (block: parser.insertComment(token)) - TokenType.DOCTYPE => (block: parse_error) + TokenType.DOCTYPE => (block: parse_error UNEXPECTED_DOCTYPE) "<html>" => (block: parser.processInHTMLContent(token, IN_BODY)) "<head>" => (block: parser.head = some(parser.insertHTMLElement(token)) parser.insertionMode = IN_HEAD ) ("</head>", "</body>", "</html>", "</br>") => (block: anything_else) - TokenType.END_TAG => (block: parse_error) + TokenType.END_TAG => (block: parse_error UNEXPECTED_END_TAG) _ => (block: let token = Token(t: START_TAG, tagtype: TAG_HEAD) parser.head = some(parser.insertHTMLElement(token)) @@ -1353,7 +1407,7 @@ proc processInHTMLContent[Handle](parser: var HTML5Parser[Handle], match token: AsciiWhitespace => (block: discard) TokenType.COMMENT => (block: parser.insertComment(token)) - TokenType.DOCTYPE => (block: parse_error) + TokenType.DOCTYPE => (block: parse_error UNEXPECTED_DOCTYPE) "<html>" => (block: parser.processInHTMLContent(token, IN_BODY)) ("<base>", "<basefont>", "<bgsound>", "<link>") => (block: discard parser.insertHTMLElement(token) @@ -1406,17 +1460,17 @@ proc processInHTMLContent[Handle](parser: var HTML5Parser[Handle], ) "</template>" => (block: if not parser.hasElement(TAG_TEMPLATE): - parse_error + parse_error ELEMENT_NOT_IN_OPEN_ELEMENTS else: parser.generateImpliedEndTagsThoroughly() if parser.getTagType(parser.currentNode) != TAG_TEMPLATE: - parse_error + parse_error MISMATCHED_TAGS parser.popElementsIncl(TAG_TEMPLATE) parser.clearActiveFormattingTillMarker() discard parser.templateModes.pop() parser.resetInsertionMode() ) - ("<head>", TokenType.END_TAG) => (block: parse_error) + ("<head>", TokenType.END_TAG) => (block: parse_error UNEXPECTED_END_TAG) _ => (block: pop_current_node parser.insertionMode = AFTER_HEAD @@ -1425,19 +1479,20 @@ proc processInHTMLContent[Handle](parser: var HTML5Parser[Handle], of IN_HEAD_NOSCRIPT: match token: - TokenType.DOCTYPE => (block: parse_error) + TokenType.DOCTYPE => (block: parse_error UNEXPECTED_DOCTYPE) "<html>" => (block: parser.processInHTMLContent(token, IN_BODY)) "</noscript>" => (block: pop_current_node parser.insertionMode = IN_HEAD ) (AsciiWhitespace, - TokenType.COMMENT, - "<basefont>", "<bgsound>", "<link>", "<meta>", "<noframes>", "<style>") => (block: + TokenType.COMMENT, + "<basefont>", "<bgsound>", "<link>", "<meta>", "<noframes>", + "<style>") => (block: parser.processInHTMLContent(token, IN_HEAD)) "</br>" => (block: anything_else) - ("<head>", "<noscript>") => (block: parse_error) - TokenType.END_TAG => (block: parse_error) + ("<head>", "<noscript>") => (block: parse_error UNEXPECTED_START_TAG) + TokenType.END_TAG => (block: parse_error UNEXPECTED_END_TAG) _ => (block: pop_current_node parser.insertionMode = IN_HEAD @@ -1448,7 +1503,7 @@ proc processInHTMLContent[Handle](parser: var HTML5Parser[Handle], match token: AsciiWhitespace => (block: parser.insertCharacter(token.c)) TokenType.COMMENT => (block: parser.insertComment(token)) - TokenType.DOCTYPE => (block: parse_error) + TokenType.DOCTYPE => (block: parse_error UNEXPECTED_DOCTYPE) "<html>" => (block: parser.processInHTMLContent(token, IN_BODY)) "<body>" => (block: discard parser.insertHTMLElement(token) @@ -1461,7 +1516,7 @@ proc processInHTMLContent[Handle](parser: var HTML5Parser[Handle], ) ("<base>", "<basefont>", "<bgsound>", "<link>", "<meta>", "<noframes>", "<script>", "<style>", "<template>", "<title>") => (block: - parse_error + parse_error UNEXPECTED_START_TAG parser.pushElement(parser.head.get) parser.processInHTMLContent(token, IN_HEAD) for i in countdown(parser.openElements.high, 0): @@ -1470,7 +1525,8 @@ proc processInHTMLContent[Handle](parser: var HTML5Parser[Handle], ) "</template>" => (block: parser.processInHTMLContent(token, IN_HEAD)) ("</body>", "</html>", "</br>") => (block: anything_else) - ("<head>", TokenType.END_TAG) => (block: parse_error) + ("<head>") => (block: parse_error UNEXPECTED_START_TAG) + (TokenType.END_TAG) => (block: parse_error UNEXPECTED_END_TAG) _ => (block: discard parser.insertHTMLElement(Token(t: START_TAG, tagtype: TAG_BODY)) parser.insertionMode = IN_BODY @@ -1487,15 +1543,27 @@ proc processInHTMLContent[Handle](parser: var HTML5Parser[Handle], let node = parser.openElements[i] if parser.tagNameEquals(node, token): parser.generateImpliedEndTags(token.tagtype) - if node != parser.currentNode: parse_error - while parser.popElement() != node: discard + if node != parser.currentNode: + parse_error ELEMENT_NOT_CURRENT_NODE + while parser.popElement() != node: + discard break elif parser.getTagType(node) in SpecialElements: - parse_error + parse_error UNEXPECTED_SPECIAL_ELEMENT return + template parse_error_if_body_has_disallowed_open_elements = + if parser.hasParseError(): + const Disallowed = AllTagTypes - { + TAG_DD, TAG_DT, TAG_LI, TAG_OPTGROUP, TAG_OPTION, TAG_P, TAG_RB, + TAG_RP, TAG_RT, TAG_RTC, TAG_TBODY, TAG_TD, TAG_TFOOT, TAG_TH, + TAG_THEAD, TAG_TR, TAG_BODY, TAG_HTML + } + if parser.hasElement(Disallowed): + parse_error MISMATCHED_TAGS + match token: - '\0' => (block: parse_error) + '\0' => (block: parse_error UNEXPECTED_NULL) AsciiWhitespace => (block: parser.reconstructActiveFormatting() parser.insertCharacter(token.c) @@ -1511,18 +1579,19 @@ proc processInHTMLContent[Handle](parser: var HTML5Parser[Handle], parser.framesetOk = false ) TokenType.COMMENT => (block: parser.insertComment(token)) - TokenType.DOCTYPE => (block: parse_error) + TokenType.DOCTYPE => (block: parse_error UNEXPECTED_DOCTYPE) "<html>" => (block: - parse_error + parse_error UNEXPECTED_START_TAG if parser.hasElement(TAG_TEMPLATE): discard else: parser.addAttrsIfMissing(parser.openElements[0], token.attrs) ) - ("<base>", "<basefont>", "<bgsound>", "<link>", "<meta>", "<noframes>", "<script>", "<style>", "<template>", "<title>", - "</template>") => (block: parser.processInHTMLContent(token, IN_HEAD)) + ("<base>", "<basefont>", "<bgsound>", "<link>", "<meta>", "<noframes>", + "<script>", "<style>", "<template>", "<title>", + "</template>") => (block: parser.processInHTMLContent(token, IN_HEAD)) "<body>" => (block: - parse_error + parse_error UNEXPECTED_START_TAG if parser.openElements.len == 1 or parser.getTagType(parser.openElements[1]) != TAG_BODY or parser.hasElement(TAG_TEMPLATE): @@ -1532,7 +1601,7 @@ proc processInHTMLContent[Handle](parser: var HTML5Parser[Handle], parser.addAttrsIfMissing(parser.openElements[1], token.attrs) ) "<frameset>" => (block: - parse_error + parse_error UNEXPECTED_START_TAG if parser.openElements.len == 1 or parser.getTagType(parser.openElements[1]) != TAG_BODY or not parser.framesetOk: @@ -1545,21 +1614,21 @@ proc processInHTMLContent[Handle](parser: var HTML5Parser[Handle], if parser.templateModes.len > 0: parser.processInHTMLContent(token, IN_TEMPLATE) else: - #NOTE parse error omitted - discard # stop + parse_error_if_body_has_disallowed_open_elements + # stop ) "</body>" => (block: if not parser.hasElementInScope(TAG_BODY): - parse_error + parse_error UNEXPECTED_END_TAG else: - #NOTE parse error omitted + parse_error_if_body_has_disallowed_open_elements parser.insertionMode = AFTER_BODY ) "</html>" => (block: if not parser.hasElementInScope(TAG_BODY): - parse_error + parse_error UNEXPECTED_END_TAG else: - #NOTE parse error omitted + parse_error_if_body_has_disallowed_open_elements parser.insertionMode = AFTER_BODY reprocess token ) @@ -1575,7 +1644,7 @@ proc processInHTMLContent[Handle](parser: var HTML5Parser[Handle], if parser.hasElementInButtonScope(TAG_P): parser.closeP() if parser.getTagType(parser.currentNode) in HTagTypes: - parse_error + parse_error NESTED_TAGS pop_current_node discard parser.insertHTMLElement(token) ) @@ -1589,7 +1658,7 @@ proc processInHTMLContent[Handle](parser: var HTML5Parser[Handle], "<form>" => (block: let hasTemplate = parser.hasElement(TAG_TEMPLATE) if parser.form.isSome and not hasTemplate: - parse_error + parse_error NESTED_TAGS else: if parser.hasElementInButtonScope(TAG_P): parser.closeP() @@ -1605,8 +1674,7 @@ proc processInHTMLContent[Handle](parser: var HTML5Parser[Handle], case tagType of TAG_LI: parser.generateImpliedEndTags(TAG_LI) - if parser.getTagType(parser.currentNode) != TAG_LI: - parse_error + parse_error_if_mismatch TAG_LI parser.popElementsIncl(TAG_LI) break of SpecialElements - {TAG_ADDRESS, TAG_DIV, TAG_P, TAG_LI}: @@ -1624,14 +1692,12 @@ proc processInHTMLContent[Handle](parser: var HTML5Parser[Handle], case tagType of TAG_DD: parser.generateImpliedEndTags(TAG_DD) - if parser.getTagType(parser.currentNode) != TAG_DD: - parse_error + parse_error_if_mismatch TAG_DD parser.popElementsIncl(TAG_DD) break of TAG_DT: parser.generateImpliedEndTags(TAG_DT) - if parser.getTagType(parser.currentNode) != TAG_DT: - parse_error + parse_error_if_mismatch TAG_DT parser.popElementsIncl(TAG_DT) break of SpecialElements - {TAG_ADDRESS, TAG_DIV, TAG_P, TAG_DD, TAG_DT}: @@ -1649,7 +1715,7 @@ proc processInHTMLContent[Handle](parser: var HTML5Parser[Handle], ) "<button>" => (block: if parser.hasElementInScope(TAG_BUTTON): - parse_error + parse_error NESTED_TAGS parser.generateImpliedEndTags() parser.popElementsIncl(TAG_BUTTON) parser.reconstructActiveFormatting() @@ -1662,11 +1728,10 @@ proc processInHTMLContent[Handle](parser: var HTML5Parser[Handle], "</hgroup>", "</listing>", "</main>", "</menu>", "</nav>", "</ol>", "</pre>", "</section>", "</summary>", "</ul>") => (block: if not parser.hasElementInScope(token.tagtype): - parse_error + parse_error ELEMENT_NOT_IN_SCOPE else: parser.generateImpliedEndTags() - if parser.getTagType(parser.currentNode) != token.tagtype: - parse_error + parse_error_if_mismatch token.tagtype parser.popElementsIncl(token.tagtype) ) "</form>" => (block: @@ -1675,53 +1740,49 @@ proc processInHTMLContent[Handle](parser: var HTML5Parser[Handle], parser.form = none(Handle) if form.isNone or not parser.hasElementInScope(parser.getTagType(form.get)): - parse_error + parse_error ELEMENT_NOT_IN_SCOPE return let node = form.get parser.generateImpliedEndTags() if parser.currentNode != node: - parse_error + parse_error ELEMENT_NOT_CURRENT_NODE parser.openElements.delete(parser.openElements.find(node)) else: if not parser.hasElementInScope(TAG_FORM): - parse_error - return - parser.generateImpliedEndTags() - if parser.getTagType(parser.currentNode) != TAG_FORM: - parse_error - parser.popElementsIncl(TAG_FORM) + parse_error ELEMENT_NOT_IN_SCOPE + else: + parser.generateImpliedEndTags() + parse_error_if_mismatch TAG_FORM + parser.popElementsIncl(TAG_FORM) ) "</p>" => (block: if not parser.hasElementInButtonScope(TAG_P): - parse_error + parse_error ELEMENT_NOT_IN_SCOPE discard parser.insertHTMLElement(Token(t: START_TAG, tagtype: TAG_P)) parser.closeP() ) "</li>" => (block: if not parser.hasElementInListItemScope(TAG_LI): - parse_error + parse_error ELEMENT_NOT_IN_SCOPE else: parser.generateImpliedEndTags(TAG_LI) - if parser.getTagType(parser.currentNode) != TAG_LI: - parse_error + parse_error_if_mismatch TAG_LI parser.popElementsIncl(TAG_LI) ) ("</dd>", "</dt>") => (block: if not parser.hasElementInScope(token.tagtype): - parse_error + parse_error ELEMENT_NOT_IN_SCOPE else: parser.generateImpliedEndTags(token.tagtype) - if parser.getTagType(parser.currentNode) != token.tagtype: - parse_error + parse_error_if_mismatch token.tagtype parser.popElementsIncl(token.tagtype) ) ("</h1>", "</h2>", "</h3>", "</h4>", "</h5>", "</h6>") => (block: if not parser.hasElementInScope(HTagTypes): - parse_error + parse_error ELEMENT_NOT_IN_SCOPE else: parser.generateImpliedEndTags() - if parser.getTagType(parser.currentNode) != token.tagtype: - parse_error + parse_error_if_mismatch token.tagtype parser.popElementsIncl(HTagTypes) ) "</sarcasm>" => (block: @@ -1729,25 +1790,25 @@ proc processInHTMLContent[Handle](parser: var HTML5Parser[Handle], anything_else ) "<a>" => (block: - var anchor: Handle = nil + var anchor: Option[Handle] for i in countdown(parser.activeFormatting.high, 0): let format = parser.activeFormatting[i] if format[0] == nil: break if parser.getTagType(format[0]) == TAG_A: - anchor = format[0] + anchor = some(format[0]) break - if anchor != nil: - parse_error + if anchor.isSome: + parse_error NESTED_TAGS if parser.adoptionAgencyAlgorithm(token): any_other_end_tag return for i in 0..parser.activeFormatting.high: - if parser.activeFormatting[i][0] == anchor: + if parser.activeFormatting[i][0] == anchor.get: parser.activeFormatting.delete(i) break for i in 0..parser.openElements.high: - if parser.openElements[i] == anchor: + if parser.openElements[i] == anchor.get: parser.openElements.delete(i) break parser.reconstructActiveFormatting() @@ -1763,7 +1824,7 @@ proc processInHTMLContent[Handle](parser: var HTML5Parser[Handle], "<nobr>" => (block: parser.reconstructActiveFormatting() if parser.hasElementInScope(TAG_NOBR): - parse_error + parse_error NESTED_TAGS if parser.adoptionAgencyAlgorithm(token): any_other_end_tag return @@ -1786,11 +1847,10 @@ proc processInHTMLContent[Handle](parser: var HTML5Parser[Handle], ) ("</applet>", "</marquee>", "</object>") => (block: if not parser.hasElementInScope(token.tagtype): - parse_error + parse_error ELEMENT_NOT_IN_SCOPE else: parser.generateImpliedEndTags() - if parser.getTagType(parser.currentNode) != token.tagtype: - parse_error + parse_error_if_mismatch token.tagtype while parser.getTagType(parser.popElement()) != token.tagtype: discard parser.clearActiveFormattingTillMarker() ) @@ -1803,7 +1863,7 @@ proc processInHTMLContent[Handle](parser: var HTML5Parser[Handle], parser.insertionMode = IN_TABLE ) "</br>" => (block: - parse_error + parse_error UNEXPECTED_END_TAG reprocess Token(t: START_TAG, tagtype: TAG_BR) ) ("<area>", "<br>", "<embed>", "<img>", "<keygen>", "<wbr>") => (block: @@ -1881,19 +1941,21 @@ proc processInHTMLContent[Handle](parser: var HTML5Parser[Handle], ("<rb>", "<rtc>") => (block: if parser.hasElementInScope(TAG_RUBY): parser.generateImpliedEndTags() - if parser.getTagType(parser.currentNode) != TAG_RUBY: parse_error + parse_error_if_mismatch TAG_RUBY discard parser.insertHTMLElement(token) ) ("<rp>", "<rt>") => (block: if parser.hasElementInScope(TAG_RUBY): parser.generateImpliedEndTags(TAG_RTC) - if parser.getTagType(parser.currentNode) notin {TAG_RUBY, TAG_RTC}: parse_error + parse_error_if_mismatch {TAG_RUBY, TAG_RTC} discard parser.insertHTMLElement(token) ) #NOTE <math> (not implemented) #TODO <svg> (SVG) ("<caption>", "<col>", "<colgroup>", "<frame>", "<head>", "<tbody>", - "<td>", "<tfoot>", "<th>", "<thead>", "<tr>") => (block: parse_error) + "<td>", "<tfoot>", "<th>", "<thead>", "<tr>") => (block: + parse_error UNEXPECTED_START_TAG + ) TokenType.START_TAG => (block: any_other_start_tag) TokenType.END_TAG => (block: any_other_end_tag) @@ -1907,7 +1969,7 @@ proc processInHTMLContent[Handle](parser: var HTML5Parser[Handle], parser.insertCharacter(token.r) ) TokenType.EOF => (block: - parse_error + parse_error UNEXPECTED_EOF if parser.getTagType(parser.currentNode) == TAG_SCRIPT: parser.setScriptAlreadyStarted(parser.currentNode) pop_current_node @@ -1931,20 +1993,23 @@ proc processInHTMLContent[Handle](parser: var HTML5Parser[Handle], match token: (TokenType.CHARACTER_ASCII, TokenType.CHARACTER) => (block: - if parser.getTagType(parser.currentNode) in {TAG_TABLE, TAG_TBODY, TAG_TFOOT, TAG_THEAD, TAG_TR}: + const CanHaveText = { + TAG_TABLE, TAG_TBODY, TAG_TFOOT, TAG_THEAD, TAG_TR + } + if parser.getTagType(parser.currentNode) in CanHaveText: parser.pendingTableChars = "" parser.pendingTableCharsWhitespace = true parser.oldInsertionMode = parser.insertionMode parser.insertionMode = IN_TABLE_TEXT reprocess token else: # anything else - parse_error + parse_error INVALID_TEXT_PARENT parser.fosterParenting = true parser.processInHTMLContent(token, IN_BODY) parser.fosterParenting = false ) TokenType.COMMENT => (block: parser.insertComment(token)) - TokenType.DOCTYPE => (block: parse_error) + TokenType.DOCTYPE => (block: parse_error UNEXPECTED_DOCTYPE) "<caption>" => (block: clear_the_stack_back_to_a_table_context parser.activeFormatting.add((nil, nil)) @@ -1968,7 +2033,7 @@ proc processInHTMLContent[Handle](parser: var HTML5Parser[Handle], reprocess token ) "<table>" => (block: - parse_error + parse_error NESTED_TAGS if not parser.hasElementInScope(TAG_TABLE): discard else: @@ -1978,32 +2043,31 @@ proc processInHTMLContent[Handle](parser: var HTML5Parser[Handle], ) "</table>" => (block: if not parser.hasElementInScope(TAG_TABLE): - parse_error + parse_error ELEMENT_NOT_IN_SCOPE else: while parser.getTagType(parser.popElement()) != TAG_TABLE: discard parser.resetInsertionMode() ) ("</body>", "</caption>", "</col>", "</colgroup>", "</html>", "</tbody>", "</td>", "</tfoot>", "</th>", "</thead>", "</tr>") => (block: - parse_error + parse_error UNEXPECTED_END_TAG ) ("<style>", "<script>", "<template>", "</template>") => (block: parser.processInHTMLContent(token, IN_HEAD) ) "<input>" => (block: + parse_error UNEXPECTED_START_TAG if not token.attrs.getOrDefault("type").equalsIgnoreCase("hidden"): # anything else - parse_error parser.fosterParenting = true parser.processInHTMLContent(token, IN_BODY) parser.fosterParenting = false else: - parse_error discard parser.insertHTMLElement(token) pop_current_node ) "<form>" => (block: - parse_error + parse_error UNEXPECTED_START_TAG if parser.form.isSome or parser.hasElement(TAG_TEMPLATE): discard else: @@ -2014,7 +2078,7 @@ proc processInHTMLContent[Handle](parser: var HTML5Parser[Handle], parser.processInHTMLContent(token, IN_BODY) ) _ => (block: - parse_error + parse_error UNEXPECTED_START_TAG parser.fosterParenting = true parser.processInHTMLContent(token, IN_BODY) parser.fosterParenting = false @@ -2022,20 +2086,21 @@ proc processInHTMLContent[Handle](parser: var HTML5Parser[Handle], of IN_TABLE_TEXT: match token: - '\0' => (block: parse_error) + '\0' => (block: parse_error UNEXPECTED_NULL) TokenType.CHARACTER_ASCII => (block: if token.c notin AsciiWhitespace: parser.pendingTableCharsWhitespace = false parser.pendingTableChars &= token.c ) TokenType.CHARACTER => (block: - parser.pendingTableChars &= token.r + parser.pendingTableChars &= $token.r parser.pendingTableCharsWhitespace = false ) _ => (block: if not parser.pendingTableCharsWhitespace: - # I *think* this is effectively the same thing the specification wants... - parse_error + # I *think* this is effectively the same thing the specification + # wants... + parse_error NON_SPACE_TABLE_TEXT parser.fosterParenting = true parser.reconstructActiveFormatting() parser.insertCharacter(parser.pendingTableChars) @@ -2051,34 +2116,36 @@ proc processInHTMLContent[Handle](parser: var HTML5Parser[Handle], match token: "</caption>" => (block: if not parser.hasElementInTableScope(TAG_CAPTION): - parse_error + parse_error ELEMENT_NOT_IN_SCOPE else: parser.generateImpliedEndTags() - if parser.getTagType(parser.currentNode) != TAG_CAPTION: parse_error - while parser.getTagType(parser.popElement()) != TAG_CAPTION: discard + parse_error_if_mismatch TAG_CAPTION + parser.popElementsIncl(TAG_CAPTION) parser.clearActiveFormattingTillMarker() parser.insertionMode = IN_TABLE ) ("<caption>", "<col>", "<colgroup>", "<tbody>", "<td>", "<tfoot>", "<th>", "<thead>", "<tr>", "</table>") => (block: if not parser.hasElementInTableScope(TAG_CAPTION): - parse_error + parse_error ELEMENT_NOT_IN_SCOPE else: parser.generateImpliedEndTags() - if parser.getTagType(parser.currentNode) != TAG_CAPTION: parse_error + parse_error_if_mismatch TAG_CAPTION parser.clearActiveFormattingTillMarker() parser.insertionMode = IN_TABLE reprocess token ) ("</body>", "</col>", "</colgroup>", "</html>", "</tbody>", "</td>", - "</tfoot>", "</th>", "</thead>", "</tr>") => (block: parse_error) + "</tfoot>", "</th>", "</thead>", "</tr>") => (block: + parse_error UNEXPECTED_END_TAG + ) _ => (block: parser.processInHTMLContent(token, IN_BODY)) of IN_COLUMN_GROUP: match token: AsciiWhitespace => (block: parser.insertCharacter(token.c)) TokenType.COMMENT => (block: parser.insertComment(token)) - TokenType.DOCTYPE => (block: parse_error) + TokenType.DOCTYPE => (block: parse_error UNEXPECTED_DOCTYPE) "<html>" => (block: parser.processInHTMLContent(token, IN_BODY)) "<col>" => (block: discard parser.insertHTMLElement(token) @@ -2086,19 +2153,19 @@ proc processInHTMLContent[Handle](parser: var HTML5Parser[Handle], ) "</colgroup>" => (block: if parser.getTagType(parser.currentNode) != TAG_COLGROUP: - parse_error + parse_error MISMATCHED_TAGS else: pop_current_node parser.insertionMode = IN_TABLE ) - "</col>" => (block: parse_error) + "</col>" => (block: parse_error UNEXPECTED_END_TAG) ("<template>", "</template>") => (block: parser.processInHTMLContent(token, IN_HEAD) ) TokenType.EOF => (block: parser.processInHTMLContent(token, IN_BODY)) _ => (block: if parser.getTagType(parser.currentNode) != TAG_COLGROUP: - parse_error + parse_error MISMATCHED_TAGS else: pop_current_node parser.insertionMode = IN_TABLE @@ -2117,7 +2184,7 @@ proc processInHTMLContent[Handle](parser: var HTML5Parser[Handle], parser.insertionMode = IN_ROW ) ("<th>", "<td>") => (block: - parse_error + parse_error UNEXPECTED_START_TAG clear_the_stack_back_to_a_table_body_context discard parser.insertHTMLElement(Token(t: START_TAG, tagtype: TAG_TR)) parser.insertionMode = IN_ROW @@ -2125,7 +2192,7 @@ proc processInHTMLContent[Handle](parser: var HTML5Parser[Handle], ) ("</tbody>", "</tfoot>", "</thead>") => (block: if not parser.hasElementInTableScope(token.tagtype): - parse_error + parse_error ELEMENT_NOT_IN_SCOPE else: clear_the_stack_back_to_a_table_body_context pop_current_node @@ -2134,7 +2201,7 @@ proc processInHTMLContent[Handle](parser: var HTML5Parser[Handle], ("<caption>", "<col>", "<colgroup>", "<tbody>", "<tfoot>", "<thead>", "</table>") => (block: if not parser.hasElementInTableScope({TAG_TBODY, TAG_THEAD, TAG_TFOOT}): - parse_error + parse_error ELEMENT_NOT_IN_SCOPE else: clear_the_stack_back_to_a_table_body_context pop_current_node @@ -2143,7 +2210,7 @@ proc processInHTMLContent[Handle](parser: var HTML5Parser[Handle], ) ("</body>", "</caption>", "</col>", "</colgroup>", "</html>", "</td>", "</th>", "</tr>") => (block: - parse_error + parse_error ELEMENT_NOT_IN_SCOPE ) _ => (block: parser.processInHTMLContent(token, IN_TABLE)) @@ -2161,7 +2228,7 @@ proc processInHTMLContent[Handle](parser: var HTML5Parser[Handle], ) "</tr>" => (block: if not parser.hasElementInTableScope(TAG_TR): - parse_error + parse_error ELEMENT_NOT_IN_SCOPE else: clear_the_stack_back_to_a_table_row_context pop_current_node @@ -2170,7 +2237,7 @@ proc processInHTMLContent[Handle](parser: var HTML5Parser[Handle], ("<caption>", "<col>", "<colgroup>", "<tbody>", "<tfoot>", "<thead>", "<tr>", "</table>") => (block: if not parser.hasElementInTableScope(TAG_TR): - parse_error + parse_error ELEMENT_NOT_IN_SCOPE else: clear_the_stack_back_to_a_table_row_context pop_current_node @@ -2179,7 +2246,7 @@ proc processInHTMLContent[Handle](parser: var HTML5Parser[Handle], ) ("</tbody>", "</tfoot>", "</thead>") => (block: if not parser.hasElementInTableScope(token.tagtype): - parse_error + parse_error ELEMENT_NOT_IN_SCOPE elif not parser.hasElementInTableScope(TAG_TR): discard else: @@ -2189,42 +2256,42 @@ proc processInHTMLContent[Handle](parser: var HTML5Parser[Handle], reprocess token ) ("</body>", "</caption>", "</col>", "</colgroup>", "</html>", "</td>", - "</th>") => (block: parse_error) + "</th>") => (block: parse_error UNEXPECTED_END_TAG) _ => (block: parser.processInHTMLContent(token, IN_TABLE)) of IN_CELL: template close_cell() = parser.generateImpliedEndTags() - if parser.getTagType(parser.currentNode) notin {TAG_TD, TAG_TH}: parse_error - while parser.getTagType(parser.popElement()) notin {TAG_TD, TAG_TH}: discard + parse_error_if_mismatch {TAG_TD, TAG_TH} + parser.popElementsIncl({TAG_TD, TAG_TH}) parser.clearActiveFormattingTillMarker() parser.insertionMode = IN_ROW match token: ("</td>", "</th>") => (block: if not parser.hasElementInTableScope(token.tagtype): - parse_error + parse_error ELEMENT_NOT_IN_SCOPE else: parser.generateImpliedEndTags() - if parser.getTagType(parser.currentNode) != token.tagtype: - parse_error - while parser.getTagType(parser.popElement()) != token.tagtype: discard + parse_error_if_mismatch token.tagtype + parser.popElementsIncl(token.tagtype) parser.clearActiveFormattingTillMarker() parser.insertionMode = IN_ROW ) ("<caption>", "<col>", "<colgroup>", "<tbody>", "<td>", "<tfoot>", "<thead>", "<tr>") => (block: if not parser.hasElementInTableScope({TAG_TD, TAG_TH}): - parse_error + parse_error ELEMENT_NOT_IN_SCOPE else: close_cell reprocess token ) - ("</body>", "</caption>", "</col>", "</colgroup>", - "</html>") => (block: parse_error) + ("</body>", "</caption>", "</col>", "</colgroup>", "</html>") => (block: + parse_error UNEXPECTED_END_TAG + ) ("</table>", "</tbody>", "</tfoot>", "</thead>", "</tr>") => (block: if not parser.hasElementInTableScope(token.tagtype): - parse_error + parse_error ELEMENT_NOT_IN_SCOPE else: close_cell reprocess token @@ -2233,10 +2300,11 @@ proc processInHTMLContent[Handle](parser: var HTML5Parser[Handle], of IN_SELECT: match token: - '\0' => (block: parse_error) + '\0' => (block: parse_error UNEXPECTED_NULL) TokenType.CHARACTER_ASCII => (block: parser.insertCharacter(token.c)) TokenType.CHARACTER => (block: parser.insertCharacter(token.r)) - TokenType.DOCTYPE => (block: parse_error) + TokenType.COMMENT => (block: parser.insertComment(token)) + TokenType.DOCTYPE => (block: parse_error UNEXPECTED_DOCTYPE) "<html>" => (block: parser.processInHTMLContent(token, IN_BODY)) "<option>" => (block: if parser.getTagType(parser.currentNode) == TAG_OPTION: @@ -2257,29 +2325,29 @@ proc processInHTMLContent[Handle](parser: var HTML5Parser[Handle], if parser.getTagType(parser.currentNode) == TAG_OPTGROUP: pop_current_node else: - parse_error + parse_error MISMATCHED_TAGS ) "</option>" => (block: if parser.getTagType(parser.currentNode) == TAG_OPTION: pop_current_node else: - parse_error + parse_error MISMATCHED_TAGS ) "</select>" => (block: if not parser.hasElementInSelectScope(TAG_SELECT): - parse_error + parse_error ELEMENT_NOT_IN_SCOPE else: while parser.getTagType(parser.popElement()) != TAG_SELECT: discard parser.resetInsertionMode() ) "<select>" => (block: - parse_error + parse_error NESTED_TAGS if parser.hasElementInSelectScope(TAG_SELECT): while parser.getTagType(parser.popElement()) != TAG_SELECT: discard parser.resetInsertionMode() ) ("<input>", "<keygen>", "<textarea>") => (block: - parse_error + parse_error UNEXPECTED_START_TAG if not parser.hasElementInSelectScope(TAG_SELECT): discard else: @@ -2289,24 +2357,25 @@ proc processInHTMLContent[Handle](parser: var HTML5Parser[Handle], ) ("<script>", "<template>", "</template>") => (block: parser.processInHTMLContent(token, IN_HEAD)) TokenType.EOF => (block: parser.processInHTMLContent(token, IN_BODY)) - _ => (block: parse_error) + TokenType.START_TAG => (block: parse_error UNEXPECTED_START_TAG) + TokenType.END_TAG => (block: parse_error UNEXPECTED_END_TAG) of IN_SELECT_IN_TABLE: match token: ("<caption>", "<table>", "<tbody>", "<tfoot>", "<thead>", "<tr>", "<td>", "<th>") => (block: - parse_error + parse_error UNEXPECTED_START_TAG while parser.getTagType(parser.popElement()) != TAG_SELECT: discard parser.resetInsertionMode() reprocess token ) ("</caption>", "</table>", "</tbody>", "</tfoot>", "</thead>", "</tr>", "</td>", "</th>") => (block: - parse_error + parse_error UNEXPECTED_END_TAG if not parser.hasElementInTableScope(token.tagtype): discard else: - while parser.getTagType(parser.popElement()) != TAG_SELECT: discard + parser.popElementsIncl(TAG_SELECT) parser.resetInsertionMode() reprocess token ) @@ -2351,12 +2420,12 @@ proc processInHTMLContent[Handle](parser: var HTML5Parser[Handle], parser.insertionMode = IN_BODY reprocess token ) - TokenType.END_TAG => (block: parse_error) + TokenType.END_TAG => (block: parse_error UNEXPECTED_END_TAG) TokenType.EOF => (block: if not parser.hasElement(TAG_TEMPLATE): discard # stop else: - parse_error + parse_error UNEXPECTED_EOF parser.popElementsIncl(TAG_TEMPLATE) parser.clearActiveFormattingTillMarker() discard parser.templateModes.pop() @@ -2368,17 +2437,17 @@ proc processInHTMLContent[Handle](parser: var HTML5Parser[Handle], match token: AsciiWhitespace => (block: parser.processInHTMLContent(token, IN_BODY)) TokenType.COMMENT => (block: parser.insertComment(token, last_child_of(parser.openElements[0]))) - TokenType.DOCTYPE => (block: parse_error) + TokenType.DOCTYPE => (block: parse_error UNEXPECTED_DOCTYPE) "<html>" => (block: parser.processInHTMLContent(token, IN_BODY)) "</html>" => (block: if parser.fragment: - parse_error + parse_error UNEXPECTED_END_TAG else: parser.insertionMode = AFTER_AFTER_BODY ) TokenType.EOF => (block: discard) # stop _ => (block: - parse_error + parse_error UNEXPECTED_AFTER_BODY_TOKEN parser.insertionMode = IN_BODY reprocess token ) @@ -2387,11 +2456,11 @@ proc processInHTMLContent[Handle](parser: var HTML5Parser[Handle], match token: AsciiWhitespace => (block: parser.insertCharacter(token.c)) TokenType.COMMENT => (block: parser.insertComment(token)) - TokenType.DOCTYPE => (block: parse_error) + TokenType.DOCTYPE => (block: parse_error UNEXPECTED_DOCTYPE) "<html>" => (block: parser.processInHTMLContent(token, IN_BODY)) "<frameset>" => (block: if parser.getTagType(parser.currentNode) == TAG_HTML: - parse_error + parse_error UNEXPECTED_START_TAG else: pop_current_node if not parser.fragment and @@ -2405,21 +2474,21 @@ proc processInHTMLContent[Handle](parser: var HTML5Parser[Handle], "<noframes>" => (block: parser.processInHTMLContent(token, IN_HEAD)) TokenType.EOF => (block: if parser.getTagType(parser.currentNode) != TAG_HTML: - parse_error + parse_error UNEXPECTED_EOF # stop ) - _ => (block: parse_error) + _ => (block: parser.parseErrorByTokenType(token.t)) of AFTER_FRAMESET: match token: AsciiWhitespace => (block: parser.insertCharacter(token.c)) TokenType.COMMENT => (block: parser.insertComment(token)) - TokenType.DOCTYPE => (block: parse_error) + TokenType.DOCTYPE => (block: parse_error UNEXPECTED_DOCTYPE) "<html>" => (block: parser.processInHTMLContent(token, IN_BODY)) "</html>" => (block: parser.insertionMode = AFTER_AFTER_FRAMESET) "<noframes>" => (block: parser.processInHTMLContent(token, IN_HEAD)) TokenType.EOF => (block: discard) # stop - _ => (block: parse_error) + _ => (block: parser.parseErrorByTokenType(token.t)) of AFTER_AFTER_BODY: match token: @@ -2431,18 +2500,22 @@ proc processInHTMLContent[Handle](parser: var HTML5Parser[Handle], ) TokenType.EOF => (block: discard) # stop _ => (block: - parse_error + parser.parseErrorByTokenType(token.t) parser.insertionMode = IN_BODY reprocess token ) of AFTER_AFTER_FRAMESET: match token: - TokenType.COMMENT => (block: parser.insertComment(token, last_child_of(parser.document))) - (TokenType.DOCTYPE, AsciiWhitespace, "<html>") => (block: parser.processInHTMLContent(token, IN_BODY)) + TokenType.COMMENT => (block: + parser.insertComment(token, last_child_of(parser.document)) + ) + (TokenType.DOCTYPE, AsciiWhitespace, "<html>") => (block: + parser.processInHTMLContent(token, IN_BODY) + ) TokenType.EOF => (block: discard) # stop "<noframes>" => (block: parser.processInHTMLContent(token, IN_HEAD)) - _ => (block: parse_error) + _ => (block: parser.parseErrorByTokenType(token.t)) const CaseTable = { "altglyph": "altGlyph", @@ -2488,14 +2561,18 @@ proc processInForeignContent(parser: var HTML5Parser, token: Token) = macro `=>`(v: typed, body: untyped): untyped = quote do: discard (`v`, proc() = `body`) + template script_end_tag() = pop_current_node #TODO document.write (?) #TODO SVG + template parse_error(e: ParseError) = + parser.parseError(e) + template any_other_end_tag() = if parser.getLocalName(parser.currentNode) != token.tagname: - parse_error + parse_error UNEXPECTED_END_TAG for i in countdown(parser.openElements.high, 1): let node = parser.openElements[i] if parser.getLocalName(parser.currentNode) == token.tagname: @@ -2508,20 +2585,20 @@ proc processInForeignContent(parser: var HTML5Parser, token: Token) = match token: '\0' => (block: - parse_error + parse_error UNEXPECTED_NULL parser.insertCharacter(Rune(0xFFFD)) ) AsciiWhitespace => (block: parser.insertCharacter(token.c)) TokenType.CHARACTER_ASCII => (block: parser.insertCharacter(token.c)) TokenType.CHARACTER => (block: parser.insertCharacter(token.r)) - TokenType.DOCTYPE => (block: parse_error) + TokenType.DOCTYPE => (block: parse_error UNEXPECTED_DOCTYPE) ("<b>", "<big>", "<blockquote>", "<body>", "<br>", "<center>", "<code>", "<dd>", "<div>", "<dl>", "<dt>", "<em>", "<embed>", "<h1>", "<h2>", "<h3>", "<h4>", "<h5>", "<h6>", "<head>", "<hr>", "<i>", "<img>", "<li>", "<listing>", "<menu>", "<meta>", "<nobr>", "<ol>", "<p>", "<pre>", "<ruby>", "<s>", "<small>", "<span>", "<strong>", "<strike>", "<sub>", "<sup>", "<table>", "<tt>", "<u>", "<ul>", "<var>") => (block: - parse_error + parse_error UNEXPECTED_START_TAG #NOTE MathML not implemented while not parser.isHTMLIntegrationPoint(parser.currentNode) and parser.getNamespace(parser.currentNode) != Namespace.HTML: @@ -2627,7 +2704,12 @@ proc parseHTML*[Handle](inputStream: Stream, dombuilder: DOMBuilder[Handle], else: DECODER_ERROR_MODE_FATAL let decoder = newDecoderStream(inputStream, parser.charset, errormode = em) - parser.tokenizer = newTokenizer(decoder) + let onParseError = if parser.hasParseError(): + proc(e: ParseError) = + parser.parseError(e) + else: + nil + parser.tokenizer = newTokenizer(decoder, onParseError) parser.constructTree() if parser.needsreinterpret and canReinterpret: inputStream.setPosition(0) diff --git a/src/html/htmltokenizer.nim b/src/html/htmltokenizer.nim index c95db720..d38d427c 100644 --- a/src/html/htmltokenizer.nim +++ b/src/html/htmltokenizer.nim @@ -5,9 +5,10 @@ import macros import tables import unicode +import encoding/decoderstream import html/entity +import html/parseerror import html/tags -import encoding/decoderstream import utils/opt import utils/radixtree import utils/twtstr @@ -25,6 +26,7 @@ type attrv: string attr: bool hasnonhtml*: bool + onParseError: proc(e: ParseError) decoder: DecoderStream sbuf: seq[Rune] @@ -108,12 +110,13 @@ proc readn(t: var Tokenizer) = if t.decoder.atEnd: t.eof_i = t.sbuf.len -proc newTokenizer*(s: DecoderStream): Tokenizer = +proc newTokenizer*(s: DecoderStream, onParseError: proc(e: ParseError)): Tokenizer = var t = Tokenizer( decoder: s, sbuf: newSeqOfCap[Rune](bufLen), eof_i: -1, - sbuf_i: 0 + sbuf_i: 0, + onParseError: onParseError ) t.readn() return t @@ -192,7 +195,9 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = template reconsume_in(s: TokenizerState) = tokenizer.reconsume() switch_state s - template parse_error(error: untyped) = discard # does nothing for now... TODO? + template parse_error(error: untyped) = + if tokenizer.onParseError != nil: + tokenizer.onParseError(error) template is_appropriate_end_tag_token(): bool = tokenizer.laststart != nil and tokenizer.laststart.tagname == tokenizer.tok.tagname template start_new_attribute = @@ -346,7 +351,7 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = of '&': switch_state_return CHARACTER_REFERENCE of '<': switch_state TAG_OPEN of null: - parse_error unexpected_null_character + parse_error UNEXPECTED_NULL_CHARACTER emit_current of eof: emit_eof else: emit_current @@ -355,7 +360,7 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = case c of '&': switch_state_return CHARACTER_REFERENCE of '<': switch_state RCDATA_LESS_THAN_SIGN - of null: parse_error unexpected_null_character + of null: parse_error UNEXPECTED_NULL_CHARACTER of eof: emit_eof else: emit_current @@ -363,7 +368,7 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = case c of '<': switch_state RAWTEXT_LESS_THAN_SIGN of null: - parse_error unexpected_null_character + parse_error UNEXPECTED_NULL_CHARACTER emit_replacement of eof: emit_eof else: emit_current @@ -372,7 +377,7 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = case c of '<': switch_state SCRIPT_DATA_LESS_THAN_SIGN of null: - parse_error unexpected_null_character + parse_error UNEXPECTED_NULL_CHARACTER emit_replacement of eof: emit_eof else: emit_current @@ -380,7 +385,7 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = of PLAINTEXT: case c of null: - parse_error unexpected_null_character + parse_error UNEXPECTED_NULL_CHARACTER emit_replacement of eof: emit_eof else: emit_current @@ -393,15 +398,15 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = new_token Token(t: START_TAG) reconsume_in TAG_NAME of '?': - parse_error unexpected_question_mark_instead_of_tag_name + parse_error UNEXPECTED_QUESTION_MARK_INSTEAD_OF_TAG_NAME new_token Token(t: COMMENT) reconsume_in BOGUS_COMMENT of eof: - parse_error eof_before_tag_name + parse_error EOF_BEFORE_TAG_NAME emit '<' emit_eof else: - parse_error invalid_first_character_of_tag_name + parse_error INVALID_FIRST_CHARACTER_OF_TAG_NAME emit '<' reconsume_in DATA @@ -411,15 +416,15 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = new_token Token(t: END_TAG) reconsume_in TAG_NAME of '>': - parse_error missing_end_tag_name + parse_error MISSING_END_TAG_NAME switch_state DATA of eof: - parse_error eof_before_tag_name + parse_error EOF_BEFORE_TAG_NAME emit '<' emit '/' emit_eof else: - parse_error invalid_first_character_of_tag_name + parse_error INVALID_FIRST_CHARACTER_OF_TAG_NAME new_token Token(t: COMMENT) reconsume_in BOGUS_COMMENT @@ -432,10 +437,10 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = emit_tok of AsciiUpperAlpha: tokenizer.tok.tagname &= c.tolower() of null: - parse_error unexpected_null_character + parse_error UNEXPECTED_NULL_CHARACTER tokenizer.tok.tagname &= Rune(0xFFFD) of eof: - parse_error eof_in_tag + parse_error EOF_IN_TAG emit_eof else: tokenizer.tok.tagname &= r @@ -611,10 +616,10 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = of '<': switch_state SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN of null: - parse_error unexpected_null_character + parse_error UNEXPECTED_NULL_CHARACTER emit_replacement of eof: - parse_error eof_in_script_html_comment_like_text + parse_error EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT emit_eof else: emit_current @@ -627,10 +632,10 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = of '<': switch_state SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN of null: - parse_error unexpected_null_character + parse_error UNEXPECTED_NULL_CHARACTER switch_state SCRIPT_DATA_ESCAPED of eof: - parse_error eof_in_script_html_comment_like_text + parse_error EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT emit_eof else: switch_state SCRIPT_DATA_ESCAPED @@ -646,10 +651,10 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = switch_state SCRIPT_DATA emit '>' of null: - parse_error unexpected_null_character + parse_error UNEXPECTED_NULL_CHARACTER switch_state SCRIPT_DATA_ESCAPED of eof: - parse_error eof_in_script_html_comment_like_text + parse_error EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT emit_eof else: switch_state SCRIPT_DATA_ESCAPED @@ -727,10 +732,10 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = switch_state SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN emit '<' of null: - parse_error unexpected_null_character + parse_error UNEXPECTED_NULL_CHARACTER emit_replacement of eof: - parse_error eof_in_script_html_comment_like_text + parse_error EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT emit_eof else: emit_current @@ -743,11 +748,11 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = switch_state SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN emit '<' of null: - parse_error unexpected_null_character + parse_error UNEXPECTED_NULL_CHARACTER switch_state SCRIPT_DATA_DOUBLE_ESCAPED emit_replacement of eof: - parse_error eof_in_script_html_comment_like_text + parse_error EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT emit_eof else: switch_state SCRIPT_DATA_DOUBLE_ESCAPED @@ -763,11 +768,11 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = switch_state SCRIPT_DATA emit '>' of null: - parse_error unexpected_null_character + parse_error UNEXPECTED_NULL_CHARACTER switch_state SCRIPT_DATA_DOUBLE_ESCAPED emit_replacement of eof: - parse_error eof_in_script_html_comment_like_text + parse_error EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT emit_eof else: switch_state SCRIPT_DATA_DOUBLE_ESCAPED @@ -798,7 +803,7 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = of AsciiWhitespace: discard of '/', '>', eof: reconsume_in AFTER_ATTRIBUTE_NAME of '=': - parse_error unexpected_equals_sign_before_attribute_name + parse_error UNEXPECTED_EQUALS_SIGN_BEFORE_ATTRIBUTE_NAME start_new_attribute switch_state ATTRIBUTE_NAME else: @@ -817,10 +822,10 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = of AsciiUpperAlpha: tokenizer.attrn &= c.tolower() of null: - parse_error unexpected_null_character + parse_error UNEXPECTED_NULL_CHARACTER tokenizer.attrn &= Rune(0xFFFD) of '"', '\'', '<': - parse_error unexpected_character_in_attribute_name + parse_error UNEXPECTED_CHARACTER_IN_ATTRIBUTE_NAME anything_else else: tokenizer.attrn &= r @@ -834,7 +839,7 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = switch_state DATA emit_tok of eof: - parse_error eof_in_tag + parse_error EOF_IN_TAG emit_eof else: start_new_attribute @@ -846,7 +851,7 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = of '"': switch_state ATTRIBUTE_VALUE_DOUBLE_QUOTED of '\'': switch_state ATTRIBUTE_VALUE_SINGLE_QUOTED of '>': - parse_error missing_attribute_value + parse_error MISSING_ATTRIBUTE_VALUE switch_state DATA emit '>' else: reconsume_in ATTRIBUTE_VALUE_UNQUOTED @@ -856,10 +861,10 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = of '"': switch_state AFTER_ATTRIBUTE_VALUE_QUOTED of '&': switch_state_return CHARACTER_REFERENCE of null: - parse_error unexpected_null_character + parse_error UNEXPECTED_NULL_CHARACTER append_to_current_attr_value Rune(0xFFFD) of eof: - parse_error eof_in_tag + parse_error EOF_IN_TAG emit_eof else: append_to_current_attr_value r @@ -868,10 +873,10 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = of '\'': switch_state AFTER_ATTRIBUTE_VALUE_QUOTED of '&': switch_state_return CHARACTER_REFERENCE of null: - parse_error unexpected_null_character + parse_error UNEXPECTED_NULL_CHARACTER append_to_current_attr_value Rune(0xFFFD) of eof: - parse_error eof_in_tag + parse_error EOF_IN_TAG emit_eof else: append_to_current_attr_value r @@ -883,13 +888,13 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = switch_state DATA emit_tok of null: - parse_error unexpected_null_character + parse_error UNEXPECTED_NULL_CHARACTER append_to_current_attr_value Rune(0xFFFD) of '"', '\'', '<', '=', '`': - parse_error unexpected_character_in_unquoted_attribute_value + parse_error UNEXPECTED_CHARACTER_IN_UNQUOTED_ATTRIBUTE_VALUE append_to_current_attr_value c of eof: - parse_error eof_in_tag + parse_error EOF_IN_TAG emit_eof else: append_to_current_attr_value r @@ -903,10 +908,10 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = switch_state DATA emit_tok of eof: - parse_error eof_in_tag + parse_error EOF_IN_TAG emit_eof else: - parse_error missing_whitespace_between_attributes + parse_error MISSING_WHITESPACE_BETWEEN_ATTRIBUTES reconsume_in BEFORE_ATTRIBUTE_NAME of SELF_CLOSING_START_TAG: @@ -916,10 +921,10 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = switch_state DATA emit_tok of eof: - parse_error eof_in_tag + parse_error EOF_IN_TAG emit_eof else: - parse_error unexpected_solidus_in_tag + parse_error UNEXPECTED_SOLIDUS_IN_TAG reconsume_in BEFORE_ATTRIBUTE_NAME of BOGUS_COMMENT: @@ -931,7 +936,7 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = of eof: emit_tok emit_eof - of null: parse_error unexpected_null_character + of null: parse_error UNEXPECTED_NULL_CHARACTER else: tokenizer.tok.data &= r of MARKUP_DECLARATION_OPEN: # note: rewritten to fit case model as we consume a char anyway @@ -954,12 +959,12 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = if tokenizer.hasnonhtml: switch_state CDATA_SECTION else: - parse_error cdata_in_html_content + parse_error CDATA_IN_HTML_CONTENT new_token Token(t: COMMENT, data: "[CDATA[") switch_state BOGUS_COMMENT else: anything_else else: - parse_error incorrectly_opened_comment + parse_error INCORRECTLY_OPENED_COMMENT new_token Token(t: COMMENT) reconsume_in BOGUS_COMMENT @@ -967,7 +972,7 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = case c of '-': switch_state COMMENT_START_DASH of '>': - parse_error abrupt_closing_of_empty_comment + parse_error ABRUPT_CLOSING_OF_EMPTY_COMMENT switch_state DATA emit_tok else: reconsume_in COMMENT @@ -976,11 +981,11 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = case c of '-': switch_state COMMENT_END of '>': - parse_error abrupt_closing_of_empty_comment + parse_error ABRUPT_CLOSING_OF_EMPTY_COMMENT switch_state DATA emit_tok of eof: - parse_error eof_in_comment + parse_error EOF_IN_COMMENT emit_tok emit_eof else: @@ -994,10 +999,10 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = switch_state COMMENT_LESS_THAN_SIGN of '-': switch_state COMMENT_END_DASH of null: - parse_error unexpected_null_character + parse_error UNEXPECTED_NULL_CHARACTER tokenizer.tok.data &= Rune(0xFFFD) of eof: - parse_error eof_in_comment + parse_error EOF_IN_COMMENT emit_tok emit_eof else: tokenizer.tok.data &= r @@ -1024,14 +1029,14 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = case c of '>', eof: reconsume_in COMMENT_END else: - parse_error nested_comment + parse_error NESTED_COMMENT reconsume_in COMMENT_END of COMMENT_END_DASH: case c of '-': switch_state COMMENT_END of eof: - parse_error eof_in_comment + parse_error EOF_IN_COMMENT emit_tok emit_eof else: @@ -1044,7 +1049,7 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = of '!': switch_state COMMENT_END_BANG of '-': tokenizer.tok.data &= '-' of eof: - parse_error eof_in_comment + parse_error EOF_IN_COMMENT emit_tok emit_eof else: @@ -1057,11 +1062,11 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = tokenizer.tok.data &= "--!" switch_state COMMENT_END_DASH of '>': - parse_error incorrectly_closed_comment + parse_error INCORRECTLY_CLOSED_COMMENT switch_state DATA emit_tok of eof: - parse_error eof_in_comment + parse_error EOF_IN_COMMENT emit_tok emit_eof else: @@ -1073,12 +1078,12 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = of AsciiWhitespace: switch_state BEFORE_DOCTYPE_NAME of '>': reconsume_in BEFORE_DOCTYPE_NAME of eof: - parse_error eof_in_doctype + parse_error EOF_IN_DOCTYPE new_token Token(t: DOCTYPE, quirks: true) emit_tok emit_eof else: - parse_error missing_whitespace_before_doctype_name + parse_error MISSING_WHITESPACE_BEFORE_DOCTYPE_NAME reconsume_in BEFORE_DOCTYPE_NAME of BEFORE_DOCTYPE_NAME: @@ -1088,15 +1093,15 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = new_token Token(t: DOCTYPE, name: some($c.tolower())) switch_state DOCTYPE_NAME of null: - parse_error unexpected_null_character + parse_error UNEXPECTED_NULL_CHARACTER new_token Token(t: DOCTYPE, name: some($Rune(0xFFFD))) of '>': - parse_error missing_doctype_name + parse_error MISSING_DOCTYPE_NAME new_token Token(t: DOCTYPE, quirks: true) switch_state DATA emit_tok of eof: - parse_error eof_in_doctype + parse_error EOF_IN_DOCTYPE new_token Token(t: DOCTYPE, quirks: true) emit_tok emit_eof @@ -1113,10 +1118,10 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = of AsciiUpperAlpha: tokenizer.tok.name.get &= c.tolower() of null: - parse_error unexpected_null_character + parse_error UNEXPECTED_NULL_CHARACTER tokenizer.tok.name.get &= Rune(0xFFFD) of eof: - parse_error eof_in_doctype + parse_error EOF_IN_DOCTYPE tokenizer.tok.quirks = true emit_tok emit_eof @@ -1131,7 +1136,7 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = switch_state DATA emit_tok of eof: - parse_error eof_in_doctype + parse_error EOF_IN_DOCTYPE tokenizer.tok.quirks = true emit_tok emit_eof @@ -1148,7 +1153,7 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = else: anything_else else: - parse_error invalid_character_sequence_after_doctype_name + parse_error INVALID_CHARACTER_SEQUENCE_AFTER_DOCTYPE_NAME tokenizer.tok.quirks = true reconsume_in BOGUS_DOCTYPE @@ -1156,21 +1161,21 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = case c of AsciiWhitespace: switch_state BEFORE_DOCTYPE_PUBLIC_IDENTIFIER of '"': - parse_error missing_whitespace_after_doctype_public_keyword + parse_error MISSING_WHITESPACE_AFTER_DOCTYPE_PUBLIC_KEYWORD tokenizer.tok.pubid = some("") switch_state DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED of '>': - parse_error missing_doctype_public_identifier + parse_error MISSING_DOCTYPE_PUBLIC_IDENTIFIER tokenizer.tok.quirks = true switch_state DATA emit_tok of eof: - parse_error eof_in_doctype + parse_error EOF_IN_DOCTYPE tokenizer.tok.quirks = true emit_tok emit_eof else: - parse_error missing_quote_before_doctype_public_identifier + parse_error MISSING_QUOTE_BEFORE_DOCTYPE_PUBLIC_IDENTIFIER tokenizer.tok.quirks = true reconsume_in BOGUS_DOCTYPE @@ -1184,17 +1189,17 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = tokenizer.tok.pubid = some("") switch_state DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED of '>': - parse_error missing_doctype_public_identifier + parse_error MISSING_DOCTYPE_PUBLIC_IDENTIFIER tokenizer.tok.quirks = true switch_state DATA emit_tok of eof: - parse_error eof_in_doctype + parse_error EOF_IN_DOCTYPE tokenizer.tok.quirks = true emit_tok emit_eof else: - parse_error missing_quote_before_doctype_public_identifier + parse_error MISSING_QUOTE_BEFORE_DOCTYPE_PUBLIC_IDENTIFIER tokenizer.tok.quirks = true reconsume_in BOGUS_DOCTYPE @@ -1202,15 +1207,15 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = case c of '"': switch_state AFTER_DOCTYPE_PUBLIC_IDENTIFIER of null: - parse_error unexpected_null_character + parse_error UNEXPECTED_NULL_CHARACTER tokenizer.tok.pubid.get &= Rune(0xFFFD) of '>': - parse_error abrupt_doctype_public_identifier + parse_error ABRUPT_DOCTYPE_PUBLIC_IDENTIFIER tokenizer.tok.quirks = true switch_state DATA emit_tok of eof: - parse_error eof_in_doctype + parse_error EOF_IN_DOCTYPE tokenizer.tok.quirks = true emit_tok emit_eof @@ -1221,15 +1226,15 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = case c of '\'': switch_state AFTER_DOCTYPE_PUBLIC_IDENTIFIER of null: - parse_error unexpected_null_character + parse_error UNEXPECTED_NULL_CHARACTER tokenizer.tok.pubid.get &= Rune(0xFFFD) of '>': - parse_error abrupt_doctype_public_identifier + parse_error ABRUPT_DOCTYPE_PUBLIC_IDENTIFIER tokenizer.tok.quirks = true switch_state DATA emit_tok of eof: - parse_error eof_in_doctype + parse_error EOF_IN_DOCTYPE tokenizer.tok.quirks = true emit_tok emit_eof @@ -1243,20 +1248,20 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = switch_state DATA emit_tok of '"': - parse_error missing_whitespace_between_doctype_public_and_system_identifiers + parse_error MISSING_WHITESPACE_BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS tokenizer.tok.sysid = some("") switch_state DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED of '\'': - parse_error missing_whitespace_between_doctype_public_and_system_identifiers + parse_error MISSING_WHITESPACE_BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS tokenizer.tok.sysid = some("") switch_state DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED of eof: - parse_error eof_in_doctype + parse_error EOF_IN_DOCTYPE tokenizer.tok.quirks = true emit_tok emit_eof else: - parse_error missing_quote_before_doctype_system_identifier + parse_error MISSING_QUOTE_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER tokenizer.tok.quirks = true reconsume_in BOGUS_DOCTYPE @@ -1273,12 +1278,12 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = tokenizer.tok.sysid = some("") switch_state DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED of eof: - parse_error eof_in_doctype + parse_error EOF_IN_DOCTYPE tokenizer.tok.quirks = true emit_tok emit_eof else: - parse_error missing_quote_before_doctype_system_identifier + parse_error MISSING_QUOTE_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER tokenizer.tok.quirks = true reconsume_in BOGUS_DOCTYPE @@ -1286,25 +1291,25 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = case c of AsciiWhitespace: switch_state BEFORE_DOCTYPE_SYSTEM_IDENTIFIER of '"': - parse_error missing_whitespace_after_doctype_system_keyword + parse_error MISSING_WHITESPACE_AFTER_DOCTYPE_SYSTEM_KEYWORD tokenizer.tok.sysid = some("") switch_state DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED of '\'': - parse_error missing_whitespace_after_doctype_system_keyword + parse_error MISSING_WHITESPACE_AFTER_DOCTYPE_SYSTEM_KEYWORD tokenizer.tok.sysid = some("") switch_state DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED of '>': - parse_error missing_doctype_system_identifier + parse_error MISSING_DOCTYPE_SYSTEM_IDENTIFIER tokenizer.tok.quirks = true switch_state DATA emit_tok of eof: - parse_error eof_in_doctype + parse_error EOF_IN_DOCTYPE tokenizer.tok.quirks = true emit_tok emit_eof else: - parse_error missing_quote_before_doctype_system_identifier + parse_error MISSING_QUOTE_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER tokenizer.tok.quirks = true reconsume_in BOGUS_DOCTYPE @@ -1318,17 +1323,17 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = tokenizer.tok.pubid = some("") switch_state DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED of '>': - parse_error missing_doctype_system_identifier + parse_error MISSING_DOCTYPE_SYSTEM_IDENTIFIER tokenizer.tok.quirks = true switch_state DATA emit_tok of eof: - parse_error eof_in_doctype + parse_error EOF_IN_DOCTYPE tokenizer.tok.quirks = true emit_tok emit_eof else: - parse_error missing_quote_before_doctype_system_identifier + parse_error MISSING_QUOTE_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER tokenizer.tok.quirks = true reconsume_in BOGUS_DOCTYPE @@ -1336,15 +1341,15 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = case c of '"': switch_state AFTER_DOCTYPE_SYSTEM_IDENTIFIER of null: - parse_error unexpected_null_character + parse_error UNEXPECTED_NULL_CHARACTER tokenizer.tok.sysid.get &= Rune(0xFFFD) of '>': - parse_error abrupt_doctype_system_identifier + parse_error ABRUPT_DOCTYPE_SYSTEM_IDENTIFIER tokenizer.tok.quirks = true switch_state DATA emit_tok of eof: - parse_error eof_in_doctype + parse_error EOF_IN_DOCTYPE tokenizer.tok.quirks = true emit_tok emit_eof @@ -1355,15 +1360,15 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = case c of '\'': switch_state AFTER_DOCTYPE_SYSTEM_IDENTIFIER of null: - parse_error unexpected_null_character + parse_error UNEXPECTED_NULL_CHARACTER tokenizer.tok.sysid.get &= Rune(0xFFFD) of '>': - parse_error abrupt_doctype_system_identifier + parse_error ABRUPT_DOCTYPE_SYSTEM_IDENTIFIER tokenizer.tok.quirks = true switch_state DATA emit_tok of eof: - parse_error eof_in_doctype + parse_error EOF_IN_DOCTYPE tokenizer.tok.quirks = true emit_tok emit_eof @@ -1377,12 +1382,12 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = switch_state DATA emit_tok of eof: - parse_error eof_in_doctype + parse_error EOF_IN_DOCTYPE tokenizer.tok.quirks = true emit_tok emit_eof else: - parse_error unexpected_character_after_doctype_system_identifier + parse_error UNEXPECTED_CHARACTER_AFTER_DOCTYPE_SYSTEM_IDENTIFIER reconsume_in BOGUS_DOCTYPE of BOGUS_DOCTYPE: @@ -1390,7 +1395,7 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = of '>': switch_state DATA emit_tok - of null: parse_error unexpected_null_character + of null: parse_error UNEXPECTED_NULL_CHARACTER of eof: emit_tok emit_eof @@ -1400,7 +1405,7 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = case c of ']': switch_state CDATA_SECTION_BRACKET of eof: - parse_error eof_in_cdata + parse_error EOF_IN_CDATA emit_eof else: emit_current @@ -1458,7 +1463,7 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = switch_state tokenizer.rstate else: if tokenizer.tmp[^1] != ';': - parse_error missing_semicolon_after_character_reference_parse_error + parse_error MISSING_SEMICOLON_AFTER_CHARACTER_REFERENCE tokenizer.tmp = value.get flush_code_points_consumed_as_a_character_reference switch_state tokenizer.rstate @@ -1474,7 +1479,7 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = else: emit_current of ';': - parse_error unknown_named_character_reference + parse_error UNKNOWN_NAMED_CHARACTER_REFERENCE reconsume_in tokenizer.rstate else: reconsume_in tokenizer.rstate @@ -1490,7 +1495,7 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = case c of AsciiHexDigit: reconsume_in HEXADECIMAL_CHARACTER_REFERENCE else: - parse_error absence_of_digits_in_numeric_character_reference + parse_error ABSENCE_OF_DIGITS_IN_NUMERIC_CHARACTER_REFERENCE flush_code_points_consumed_as_a_character_reference reconsume_in tokenizer.rstate @@ -1498,7 +1503,7 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = case c of AsciiDigit: reconsume_in DECIMAL_CHARACTER_REFERENCE else: - parse_error absence_of_digits_in_numeric_character_reference + parse_error ABSENCE_OF_DIGITS_IN_NUMERIC_CHARACTER_REFERENCE flush_code_points_consumed_as_a_character_reference reconsume_in tokenizer.rstate @@ -1509,7 +1514,7 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = tokenizer.code += hexValue(c) of ';': switch_state NUMERIC_CHARACTER_REFERENCE_END else: - parse_error missing_semicolon_after_character_reference + parse_error MISSING_SEMICOLON_AFTER_CHARACTER_REFERENCE reconsume_in NUMERIC_CHARACTER_REFERENCE_END of DECIMAL_CHARACTER_REFERENCE: @@ -1519,23 +1524,23 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = tokenizer.code += decValue(c) of ';': switch_state NUMERIC_CHARACTER_REFERENCE_END else: - parse_error missing_semicolon_after_character_reference + parse_error MISSING_SEMICOLON_AFTER_CHARACTER_REFERENCE reconsume_in NUMERIC_CHARACTER_REFERENCE_END of NUMERIC_CHARACTER_REFERENCE_END: ignore_eof # we reconsume anyway case tokenizer.code of 0x00: - parse_error null_character_reference + parse_error NULL_CHARACTER_REFERENCE tokenizer.code = 0xFFFD elif tokenizer.code > 0x10FFFF: - parse_error character_reference_outside_unicode_range + parse_error CHARACTER_REFERENCE_OUTSIDE_UNICODE_RANGE tokenizer.code = 0xFFFD elif Rune(tokenizer.code).isSurrogate(): - parse_error surrogate_character_reference + parse_error SURROGATE_CHARACTER_REFERENCE tokenizer.code = 0xFFFD elif Rune(tokenizer.code).isNonCharacter(): - parse_error noncharacter_character_reference + parse_error NONCHARACTER_CHARACTER_REFERENCE # do nothing elif tokenizer.code in 0..255 and char(tokenizer.code) in ((Controls - AsciiWhitespace) + {chr(0x0D)}): const ControlMapTable = [ diff --git a/src/html/parseerror.nim b/src/html/parseerror.nim new file mode 100644 index 00000000..d99b2fed --- /dev/null +++ b/src/html/parseerror.nim @@ -0,0 +1,70 @@ +type ParseError* = enum + #TODO write a description for all error codes + ABRUPT_CLOSING_OF_EMPTY_COMMENT + ABRUPT_DOCTYPE_PUBLIC_IDENTIFIER + ABRUPT_DOCTYPE_SYSTEM_IDENTIFIER + ABSENCE_OF_DIGITS_IN_NUMERIC_CHARACTER_REFERENCE + CDATA_IN_HTML_CONTENT + CHARACTER_REFERENCE_OUTSIDE_UNICODE_RANGE + CONTROL_CHARACTER_IN_INPUT_STREAM + CONTROL_CHARACTER_REFERENCE + END_TAG_WITH_ATTRIBUTES + DUPLICATE_ATTRIBUTE + END_TAG_WITH_TRAILING_SOLIDUS + EOF_BEFORE_TAG_NAME + EOF_IN_CDATA + EOF_IN_COMMENT + EOF_IN_DOCTYPE + EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT + EOF_IN_TAG + INCORRECTLY_CLOSED_COMMENT + INCORRECTLY_OPENED_COMMENT + INVALID_CHARACTER_SEQUENCE_AFTER_DOCTYPE_NAME + INVALID_FIRST_CHARACTER_OF_TAG_NAME + MISSING_ATTRIBUTE_VALUE + MISSING_DOCTYPE_NAME + MISSING_DOCTYPE_PUBLIC_IDENTIFIER + MISSING_DOCTYPE_SYSTEM_IDENTIFIER + MISSING_END_TAG_NAME + MISSING_QUOTE_BEFORE_DOCTYPE_PUBLIC_IDENTIFIER + MISSING_QUOTE_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER + MISSING_SEMICOLON_AFTER_CHARACTER_REFERENCE + MISSING_WHITESPACE_AFTER_DOCTYPE_PUBLIC_KEYWORD + MISSING_WHITESPACE_AFTER_DOCTYPE_SYSTEM_KEYWORD + MISSING_WHITESPACE_BEFORE_DOCTYPE_NAME + MISSING_WHITESPACE_BETWEEN_ATTRIBUTES + MISSING_WHITESPACE_BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS + NESTED_COMMENT + NONCHARACTER_CHARACTER_REFERENCE + NONCHARACTER_IN_INPUT_STREAM + NON_VOID_HTML_ELEMENT_START_TAG_WITH_TRAILING_SOLIDUS + NULL_CHARACTER_REFERENCE + SURROGATE_CHARACTER_REFERENCE + SURROGATE_IN_INPUT_STREAM + UNEXPECTED_CHARACTER_AFTER_DOCTYPE_SYSTEM_IDENTIFIER + UNEXPECTED_CHARACTER_IN_ATTRIBUTE_NAME + UNEXPECTED_CHARACTER_IN_UNQUOTED_ATTRIBUTE_VALUE + UNEXPECTED_EQUALS_SIGN_BEFORE_ATTRIBUTE_NAME + UNEXPECTED_NULL_CHARACTER + UNEXPECTED_QUESTION_MARK_INSTEAD_OF_TAG_NAME + UNEXPECTED_SOLIDUS_IN_TAG + UNKNOWN_NAMED_CHARACTER_REFERENCE + LAST_SPECIFIED_ERROR # never returned + # From here on, error code names have not been specified by the standard. + MISMATCHED_TAGS = "Mismatched start and end tags" + INVALID_DOCTYPE = "Unrecognized document type" + UNEXPECTED_DOCTYPE = "Unexpected document type" + UNEXPECTED_INITIAL_TOKEN = "Unexpected token in initial state" + UNEXPECTED_START_TAG = "Unexpected start tag" + UNEXPECTED_END_TAG = "Unexpected end tag" + ELEMENT_NOT_IN_OPEN_ELEMENTS = "Element has not been added to open elements" + ELEMENT_NOT_IN_SCOPE = "Element not in appropriate scope" + ELEMENT_NOT_CURRENT_NODE = "Element is not current node" + #TODO merge with UNEXPECTED_NULL_CHARACTER? + UNEXPECTED_NULL = "Unexpected null character" + NESTED_TAGS = "Non-nestable nested tags" + UNEXPECTED_SPECIAL_ELEMENT = "Unexpected special element on open elements" + UNEXPECTED_EOF = "Unexpected end of file" + INVALID_TEXT_PARENT = "Invalid parent element for text node" + NON_SPACE_TABLE_TEXT = "Non-space table text" + UNEXPECTED_AFTER_BODY_TOKEN = "Unexpected token after body" |