diff options
author | bptato <nincsnevem662@gmail.com> | 2024-05-05 18:36:05 +0200 |
---|---|---|
committer | bptato <nincsnevem662@gmail.com> | 2024-05-05 18:36:05 +0200 |
commit | f5cccfe0655f7266e88cc537f9572926ebdef570 (patch) | |
tree | 9ee6905b9fc6dc96d7d486ec7801f35fb94b9553 | |
parent | 464ae10e8c49c3600254a0188d10e59418a21d8d (diff) | |
download | chawan-f5cccfe0655f7266e88cc537f9572926ebdef570.tar.gz |
Remove parse error callback
It never worked properly, it's under-specified, and I don't need it.
-rw-r--r-- | chame/htmlparser.nim | 349 | ||||
-rw-r--r-- | chame/htmlparseriface.nim | 9 | ||||
-rw-r--r-- | chame/htmltokenizer.nim | 326 | ||||
-rw-r--r-- | chame/parseerror.nim | 71 |
4 files changed, 120 insertions, 635 deletions
diff --git a/chame/htmlparser.nim b/chame/htmlparser.nim index 24060a04..a522b8a7 100644 --- a/chame/htmlparser.nim +++ b/chame/htmlparser.nim @@ -5,7 +5,6 @@ import std/tables import dombuilder import htmltokenizer -import parseerror import tags import tokstate @@ -15,7 +14,6 @@ export macros # Export these so that htmlparseriface works seamlessly. export dombuilder export options -export parseerror export tags # Export tokstate too; it is needed for fragment parsing. @@ -135,11 +133,6 @@ proc atomToTagType*[Handle, Atom](parser: HTML5Parser[Handle, Atom], mixin atomToTagTypeImpl return parser.dombuilder.atomToTagTypeImpl(atom) -proc parseError(parser: HTML5Parser, e: ParseError) = - mixin parseErrorImpl - when compiles(parser.dombuilder.parseErrorImpl(e)): - parser.dombuilder.parseErrorImpl(e) - proc setQuirksMode[Handle, Atom](parser: var HTML5Parser[Handle, Atom], mode: QuirksMode) = mixin setQuirksModeImpl @@ -237,10 +230,6 @@ proc associateWithForm[Handle, Atom](parser: HTML5Parser[Handle, Atom], parser.dombuilder.associateWithFormImpl(element, form, intendedParent) # Parser -func hasParseError(parser: HTML5Parser): bool = - mixin parseErrorImpl - return compiles(parser.dombuilder.parseErrorImpl(default(ParseError))) - func fragment(parser: HTML5Parser): bool = return parser.ctx.isSome @@ -919,21 +908,6 @@ func extractEncFromMeta(s: string): string = return s2 return s.until(';', ' ', i) -proc parseErrorByTokenType(parser: var HTML5Parser, tokenType: TokenType) = - case tokenType - of START_TAG: - parser.parseError UNEXPECTED_START_TAG - of END_TAG: - parser.parseError UNEXPECTED_END_TAG - of EOF: - parser.parseError UNEXPECTED_EOF - of CHARACTER, CHARACTER_WHITESPACE: - parser.parseError UNEXPECTED_CHARACTER - of CHARACTER_NULL: - parser.parseError UNEXPECTED_NULL - of DOCTYPE, TokenType.COMMENT: - doAssert false - # Find a node in the list of active formatting elements, or return -1. func findLastActiveFormatting[Handle, Atom](parser: var HTML5Parser[Handle, Atom], node: Handle): int = @@ -1020,8 +994,6 @@ proc findLastActiveFormatting[Handle, Atom](parser: var HTML5Parser[Handle, Atom # If true is returned, call "any other end tag". proc adoptionAgencyAlgorithm[Handle, Atom](parser: var HTML5Parser[Handle, Atom], token: Token): bool = - template parse_error(e: ParseError) = - parser.parseError(e) if parser.currentToken.tagname == token.tagname and parser.findLastActiveFormatting(parser.currentNode) == -1: pop_current_node @@ -1034,15 +1006,10 @@ proc adoptionAgencyAlgorithm[Handle, Atom](parser: var HTML5Parser[Handle, Atom] let formatting = parser.activeFormatting[formattingIndex][0].get let stackIndex = parser.findOpenElement(formatting) if stackIndex < 0: - parse_error ELEMENT_NOT_IN_OPEN_ELEMENTS parser.activeFormatting.delete(formattingIndex) return false if not parser.hasElementInScope(formatting): - parse_error ELEMENT_NOT_IN_SCOPE return false - if formatting != parser.currentNode: - parse_error ELEMENT_NOT_CURRENT_NODE - # do not return var furthestBlockIndex = parser.findFurthestBlockAfter(stackIndex) if furthestBlockIndex == -1: parser.popElementsIncl(formatting) @@ -1104,8 +1071,6 @@ proc adoptionAgencyAlgorithm[Handle, Atom](parser: var HTML5Parser[Handle, Atom] proc closeP(parser: var HTML5Parser, sure = false) = if sure or parser.hasElementInButtonScope(TAG_P): parser.generateImpliedEndTags(TAG_P) - if parser.getTagType(parser.currentNode) != TAG_P: - parser.parseError(MISMATCHED_TAGS) parser.popElementsIncl(TAG_P) proc newStartTagToken[Handle, Atom](parser: HTML5Parser[Handle, Atom], @@ -1346,19 +1311,6 @@ proc processInHTMLContent[Handle, Atom](parser: var HTML5Parser[Handle, Atom], template reprocess(mode: InsertionMode): ParseResult = parser.processInHTMLContent(token, mode) - template parse_error(e: ParseError) = - parser.parseError(e) - - template parse_error_if_mismatch(tagtype: TagType) = - if parser.hasParseError(): - if parser.getTagType(parser.currentNode) != tagtype: - parse_error MISMATCHED_TAGS - - template parse_error_if_mismatch(tagtypes: set[TagType]) = - if parser.hasParseError(): - if parser.getTagType(parser.currentNode) notin tagtypes: - parse_error MISMATCHED_TAGS - case insertionMode of INITIAL: match token: @@ -1367,9 +1319,6 @@ proc processInHTMLContent[Handle, Atom](parser: var HTML5Parser[Handle, Atom], parser.insertComment(token, last_child_of(parser.getDocument())) ) TokenType.DOCTYPE => (block: - if token.name.get("") != "html" or token.pubid.isSome or - token.sysid.isSome and token.sysid.get != "about:legacy-compat": - parse_error INVALID_DOCTYPE let doctype = parser.createDocumentType(token.name.get(""), token.pubid.get(""), token.sysid.get("")) parser.append(parser.getDocument(), doctype) @@ -1381,8 +1330,6 @@ proc processInHTMLContent[Handle, Atom](parser: var HTML5Parser[Handle, Atom], parser.insertionMode = BEFORE_HTML ) other => (block: - if not parser.opts.isIframeSrcdoc: - parse_error UNEXPECTED_INITIAL_TOKEN parser.setQuirksMode(QUIRKS) parser.insertionMode = BEFORE_HTML reprocess token @@ -1390,11 +1337,11 @@ proc processInHTMLContent[Handle, Atom](parser: var HTML5Parser[Handle, Atom], of BEFORE_HTML: match token: - TokenType.DOCTYPE => (block: parse_error UNEXPECTED_DOCTYPE) + (TokenType.DOCTYPE, TokenType.END_TAG, + TokenType.CHARACTER_WHITESPACE) => (block: discard) TokenType.COMMENT => (block: parser.insertComment(token, last_child_of(parser.getDocument())) ) - TokenType.CHARACTER_WHITESPACE => (block: discard) "<html>" => (block: let intendedParent = parser.getDocument() let element = parser.createHTMLElementForToken(token, intendedParent) @@ -1402,7 +1349,6 @@ proc processInHTMLContent[Handle, Atom](parser: var HTML5Parser[Handle, Atom], parser.pushElement(element, token) parser.insertionMode = BEFORE_HEAD ) - TokenType.END_TAG => (block: parse_error UNEXPECTED_END_TAG) ("</head>", "</body>", "</html>", "</br>", other) => (block: let element = parser.createHTMLElement() parser.append(parser.getDocument(), element) @@ -1414,15 +1360,14 @@ proc processInHTMLContent[Handle, Atom](parser: var HTML5Parser[Handle, Atom], of BEFORE_HEAD: match token: - TokenType.CHARACTER_WHITESPACE => (block: discard) + (TokenType.CHARACTER_WHITESPACE, TokenType.DOCTYPE, + TokenType.END_TAG) => (block: discard) TokenType.COMMENT => (block: parser.insertComment(token)) - TokenType.DOCTYPE => (block: parse_error UNEXPECTED_DOCTYPE) "<html>" => (block: return reprocess IN_BODY) "<head>" => (block: parser.head = some((parser.insertHTMLElement(token), token)) parser.insertionMode = IN_HEAD ) - TokenType.END_TAG => (block: parse_error UNEXPECTED_END_TAG) ("</head>", "</body>", "</html>", "</br>", other) => (block: let head = parser.newStartTagToken(TAG_HEAD) parser.head = some((parser.insertHTMLElement(head), head)) @@ -1436,7 +1381,7 @@ proc processInHTMLContent[Handle, Atom](parser: var HTML5Parser[Handle, Atom], parser.insertCharacter(token.s) ) TokenType.COMMENT => (block: parser.insertComment(token)) - TokenType.DOCTYPE => (block: parse_error UNEXPECTED_DOCTYPE) + TokenType.DOCTYPE => (block: discard) "<html>" => (block: return reprocess IN_BODY) ("<base>", "<basefont>", "<bgsound>", "<link>") => (block: discard parser.insertHTMLElement(token) @@ -1496,18 +1441,14 @@ proc processInHTMLContent[Handle, Atom](parser: var HTML5Parser[Handle, Atom], parser.templateModes.add(IN_TEMPLATE) ) "</template>" => (block: - if not parser.hasElement(TAG_TEMPLATE): - parse_error ELEMENT_NOT_IN_OPEN_ELEMENTS - else: + if parser.hasElement(TAG_TEMPLATE): parser.generateImpliedEndTagsThoroughly() - if parser.getTagType(parser.currentNode) != TAG_TEMPLATE: - parse_error MISMATCHED_TAGS parser.popElementsIncl(TAG_TEMPLATE) parser.clearActiveFormattingTillMarker() discard parser.templateModes.pop() parser.resetInsertionMode() ) - ("<head>", TokenType.END_TAG) => (block: parse_error UNEXPECTED_END_TAG) + ("<head>", TokenType.END_TAG) => (block: discard) ("</body>", "</html>", "</br>", other) => (block: pop_current_node parser.insertionMode = AFTER_HEAD @@ -1516,7 +1457,9 @@ proc processInHTMLContent[Handle, Atom](parser: var HTML5Parser[Handle, Atom], of IN_HEAD_NOSCRIPT: match token: - TokenType.DOCTYPE => (block: parse_error UNEXPECTED_DOCTYPE) + (TokenType.DOCTYPE, "<head>", "<noscript>", TokenType.END_TAG) => (block: + discard + ) "<html>" => (block: return reprocess IN_BODY) "</noscript>" => (block: pop_current_node @@ -1527,8 +1470,6 @@ proc processInHTMLContent[Handle, Atom](parser: var HTML5Parser[Handle, Atom], "<style>") => (block: return reprocess IN_HEAD ) - ("<head>", "<noscript>") => (block: parse_error UNEXPECTED_START_TAG) - TokenType.END_TAG => (block: parse_error UNEXPECTED_END_TAG) ("</br>", other) => (block: pop_current_node parser.insertionMode = IN_HEAD @@ -1541,7 +1482,7 @@ proc processInHTMLContent[Handle, Atom](parser: var HTML5Parser[Handle, Atom], parser.insertCharacter(token.s) ) TokenType.COMMENT => (block: parser.insertComment(token)) - TokenType.DOCTYPE => (block: parse_error UNEXPECTED_DOCTYPE) + (TokenType.DOCTYPE, "<head>", TokenType.END_TAG) => (block: discard) "<html>" => (block: return reprocess IN_BODY) "<body>" => (block: discard parser.insertHTMLElement(token) @@ -1554,7 +1495,6 @@ proc processInHTMLContent[Handle, Atom](parser: var HTML5Parser[Handle, Atom], ) ("<base>", "<basefont>", "<bgsound>", "<link>", "<meta>", "<noframes>", "<script>", "<style>", "<template>", "<title>") => (block: - parse_error UNEXPECTED_START_TAG let (head, headTok) = parser.head.get parser.pushElement(head, headTok) result = reprocess IN_HEAD @@ -1565,11 +1505,7 @@ proc processInHTMLContent[Handle, Atom](parser: var HTML5Parser[Handle, Atom], if j != -1: parser.openElements.delete(j) ) - "</template>" => (block: - return reprocess IN_HEAD - ) - ("<head>") => (block: parse_error UNEXPECTED_START_TAG) - (TokenType.END_TAG) => (block: parse_error UNEXPECTED_END_TAG) + "</template>" => (block: return reprocess IN_HEAD) ("</body>", "</html>", "</br>", other) => (block: discard parser.insertHTMLElement(parser.newStartTagToken(TAG_BODY)) parser.insertionMode = IN_BODY @@ -1586,42 +1522,25 @@ proc processInHTMLContent[Handle, Atom](parser: var HTML5Parser[Handle, Atom], let (node, itToken) = parser.openElements[i] if itToken.tagname == token.tagname: parser.generateImpliedEndTags(tokenTagType) - if node != parser.currentNode: - parse_error ELEMENT_NOT_CURRENT_NODE parser.popElementsIncl(node) break elif parser.isSpecialElement(node): - parse_error UNEXPECTED_SPECIAL_ELEMENT return - template parse_error_if_body_has_disallowed_open_elements = - if parser.hasParseError(): - const Allowed = { - TAG_DD, TAG_DT, TAG_LI, TAG_OPTGROUP, TAG_OPTION, TAG_P, TAG_RB, - TAG_RP, TAG_RT, TAG_RTC, TAG_TBODY, TAG_TD, TAG_TFOOT, TAG_TH, - TAG_THEAD, TAG_TR, TAG_BODY, TAG_HTML - } - if parser.hasElement(AllTagTypes - Allowed): - parse_error MISMATCHED_TAGS - match token: TokenType.CHARACTER_WHITESPACE => (block: parser.reconstructActiveFormatting() parser.insertCharacter(token.s) ) - TokenType.CHARACTER_NULL => (block: parse_error UNEXPECTED_NULL) + (TokenType.CHARACTER_NULL, TokenType.DOCTYPE) => (block: discard) TokenType.CHARACTER => (block: parser.reconstructActiveFormatting() parser.insertCharacter(token.s) parser.framesetOk = false ) TokenType.COMMENT => (block: parser.insertComment(token)) - TokenType.DOCTYPE => (block: parse_error UNEXPECTED_DOCTYPE) "<html>" => (block: - parse_error UNEXPECTED_START_TAG - if parser.hasElement(TAG_TEMPLATE): - discard - else: + if not parser.hasElement(TAG_TEMPLATE): parser.addAttrsIfMissing(parser.openElements[0].element, token.attrs) ) ("<base>", "<basefont>", "<bgsound>", "<link>", "<meta>", "<noframes>", @@ -1630,7 +1549,6 @@ proc processInHTMLContent[Handle, Atom](parser: var HTML5Parser[Handle, Atom], return reprocess IN_HEAD ) "<body>" => (block: - parse_error UNEXPECTED_START_TAG if parser.openElements.len == 1 or parser.getTagType(parser.openElements[1].element) != TAG_BODY or parser.hasElement(TAG_TEMPLATE): @@ -1640,7 +1558,6 @@ proc processInHTMLContent[Handle, Atom](parser: var HTML5Parser[Handle, Atom], parser.addAttrsIfMissing(parser.openElements[1].element, token.attrs) ) "<frameset>" => (block: - parse_error UNEXPECTED_START_TAG if parser.openElements.len == 1 or parser.getTagType(parser.openElements[1].element) != TAG_BODY or not parser.framesetOk: @@ -1655,22 +1572,14 @@ proc processInHTMLContent[Handle, Atom](parser: var HTML5Parser[Handle, Atom], TokenType.EOF => (block: if parser.templateModes.len > 0: return reprocess IN_TEMPLATE - else: - parse_error_if_body_has_disallowed_open_elements - # stop + # stop ) "</body>" => (block: - if not parser.hasElementInScope(TAG_BODY): - parse_error UNEXPECTED_END_TAG - else: - parse_error_if_body_has_disallowed_open_elements + if parser.hasElementInScope(TAG_BODY): parser.insertionMode = AFTER_BODY ) "</html>" => (block: - if not parser.hasElementInScope(TAG_BODY): - parse_error UNEXPECTED_END_TAG - else: - parse_error_if_body_has_disallowed_open_elements + if parser.hasElementInScope(TAG_BODY): parser.insertionMode = AFTER_BODY reprocess token ) @@ -1685,7 +1594,6 @@ proc processInHTMLContent[Handle, Atom](parser: var HTML5Parser[Handle, Atom], ("<h1>", "<h2>", "<h3>", "<h4>", "<h5>", "<h6>") => (block: parser.closeP() if parser.getTagType(parser.currentNode) in HTagTypes: - parse_error NESTED_TAGS pop_current_node discard parser.insertHTMLElement(token) ) @@ -1697,9 +1605,7 @@ proc processInHTMLContent[Handle, Atom](parser: var HTML5Parser[Handle, Atom], ) "<form>" => (block: let hasTemplate = parser.hasElement(TAG_TEMPLATE) - if parser.form.isSome and not hasTemplate: - parse_error NESTED_TAGS - else: + if parser.form.isNone or hasTemplate: parser.closeP() let element = parser.insertHTMLElement(token) if not hasTemplate: @@ -1713,7 +1619,6 @@ proc processInHTMLContent[Handle, Atom](parser: var HTML5Parser[Handle, Atom], case tagType of TAG_LI: parser.generateImpliedEndTags(TAG_LI) - parse_error_if_mismatch TAG_LI parser.popElementsIncl(TAG_LI) break of TAG_ADDRESS, TAG_DIV, TAG_P: @@ -1732,12 +1637,10 @@ proc processInHTMLContent[Handle, Atom](parser: var HTML5Parser[Handle, Atom], case tagType of TAG_DD: parser.generateImpliedEndTags(TAG_DD) - parse_error_if_mismatch TAG_DD parser.popElementsIncl(TAG_DD) break of TAG_DT: parser.generateImpliedEndTags(TAG_DT) - parse_error_if_mismatch TAG_DT parser.popElementsIncl(TAG_DT) break of TAG_ADDRESS, TAG_DIV, TAG_P: @@ -1755,7 +1658,6 @@ proc processInHTMLContent[Handle, Atom](parser: var HTML5Parser[Handle, Atom], ) "<button>" => (block: if parser.hasElementInScope(TAG_BUTTON): - parse_error NESTED_TAGS parser.generateImpliedEndTags() parser.popElementsIncl(TAG_BUTTON) parser.reconstructActiveFormatting() @@ -1767,11 +1669,8 @@ proc processInHTMLContent[Handle, Atom](parser: var HTML5Parser[Handle, Atom], "</fieldset>", "</figcaption>", "</figure>", "</footer>", "</header>", "</hgroup>", "</listing>", "</main>", "</menu>", "</nav>", "</ol>", "</pre>", "</search>", "</section>", "</summary>", "</ul>") => (block: - if not parser.hasElementInScope(token.tagname): - parse_error ELEMENT_NOT_IN_SCOPE - else: + if parser.hasElementInScope(token.tagname): parser.generateImpliedEndTags() - parse_error_if_mismatch tokenTagType parser.popElementsIncl(tokenTagType) ) "</form>" => (block: @@ -1779,57 +1678,40 @@ proc processInHTMLContent[Handle, Atom](parser: var HTML5Parser[Handle, Atom], let form = parser.form parser.form = none(Handle) if form.isNone or not parser.hasElementInScope(form.get): - parse_error ELEMENT_NOT_IN_SCOPE return let node = form.get parser.generateImpliedEndTags() - if parser.currentNode != node: - parse_error ELEMENT_NOT_CURRENT_NODE let i = parser.findOpenElement(node) parser.openElements.delete(i) else: - if not parser.hasElementInScope(TAG_FORM): - parse_error ELEMENT_NOT_IN_SCOPE - else: + if parser.hasElementInScope(TAG_FORM): parser.generateImpliedEndTags() - parse_error_if_mismatch TAG_FORM parser.popElementsIncl(TAG_FORM) ) "</p>" => (block: if not parser.hasElementInButtonScope(TAG_P): - parse_error ELEMENT_NOT_IN_SCOPE discard parser.insertHTMLElement(parser.newStartTagToken(TAG_P)) parser.closeP(sure = true) ) "</li>" => (block: - if not parser.hasElementInListItemScope(TAG_LI): - parse_error ELEMENT_NOT_IN_SCOPE - else: + if parser.hasElementInListItemScope(TAG_LI): parser.generateImpliedEndTags(TAG_LI) - parse_error_if_mismatch TAG_LI parser.popElementsIncl(TAG_LI) ) ("</dd>", "</dt>") => (block: - if not parser.hasElementInScope(tokenTagType): - parse_error ELEMENT_NOT_IN_SCOPE - else: + if parser.hasElementInScope(tokenTagType): parser.generateImpliedEndTags(tokenTagType) - parse_error_if_mismatch tokenTagType parser.popElementsIncl(tokenTagType) ) ("</h1>", "</h2>", "</h3>", "</h4>", "</h5>", "</h6>") => (block: - if not parser.hasElementInScope(HTagTypes): - parse_error ELEMENT_NOT_IN_SCOPE - else: + if parser.hasElementInScope(HTagTypes): parser.generateImpliedEndTags() - parse_error_if_mismatch tokenTagType parser.popElementsIncl(HTagTypes) ) "<a>" => (block: let i = parser.findLastActiveFormattingAfterMarker(TAG_A) if i != -1: let anchor = parser.activeFormatting[i][0].get - parse_error NESTED_TAGS if parser.adoptionAgencyAlgorithm(token): any_other_end_tag tokenTagType return @@ -1852,7 +1734,6 @@ proc processInHTMLContent[Handle, Atom](parser: var HTML5Parser[Handle, Atom], "<nobr>" => (block: parser.reconstructActiveFormatting() if parser.hasElementInScope(TAG_NOBR): - parse_error NESTED_TAGS if parser.adoptionAgencyAlgorithm(token): any_other_end_tag tokenTagType return @@ -1874,11 +1755,8 @@ proc processInHTMLContent[Handle, Atom](parser: var HTML5Parser[Handle, Atom], parser.framesetOk = false ) ("</applet>", "</marquee>", "</object>") => (block: - if not parser.hasElementInScope(tokenTagType): - parse_error ELEMENT_NOT_IN_SCOPE - else: + if parser.hasElementInScope(tokenTagType): parser.generateImpliedEndTags() - parse_error_if_mismatch tokenTagType parser.popElementsIncl(tokenTagType) parser.clearActiveFormattingTillMarker() ) @@ -1889,10 +1767,7 @@ proc processInHTMLContent[Handle, Atom](parser: var HTML5Parser[Handle, Atom], parser.framesetOk = false parser.insertionMode = IN_TABLE ) - "</br>" => (block: - parse_error UNEXPECTED_END_TAG - reprocess parser.newStartTagToken(TAG_BR) - ) + "</br>" => (block: reprocess parser.newStartTagToken(TAG_BR)) ("<area>", "<br>", "<embed>", "<img>", "<keygen>", "<wbr>") => (block: parser.reconstructActiveFormatting() discard parser.insertHTMLElement(token) @@ -1971,13 +1846,11 @@ proc processInHTMLContent[Handle, Atom](parser: var HTML5Parser[Handle, Atom], ("<rb>", "<rtc>") => (block: if parser.hasElementInScope(TAG_RUBY): parser.generateImpliedEndTags() - parse_error_if_mismatch TAG_RUBY discard parser.insertHTMLElement(token) ) ("<rp>", "<rt>") => (block: if parser.hasElementInScope(TAG_RUBY): parser.generateImpliedEndTags(TAG_RTC) - parse_error_if_mismatch {TAG_RUBY, TAG_RTC} discard parser.insertHTMLElement(token) ) "<math>" => (block: @@ -2000,7 +1873,7 @@ proc processInHTMLContent[Handle, Atom](parser: var HTML5Parser[Handle, Atom], ) ("<caption>", "<col>", "<colgroup>", "<frame>", "<head>", "<tbody>", "<td>", "<tfoot>", "<th>", "<thead>", "<tr>") => (block: - parse_error UNEXPECTED_START_TAG + discard ) TokenType.START_TAG => (block: any_other_start_tag) TokenType.END_TAG => (block: @@ -2019,7 +1892,6 @@ proc processInHTMLContent[Handle, Atom](parser: var HTML5Parser[Handle, Atom], parser.insertCharacter(token.s) ) TokenType.EOF => (block: - parse_error UNEXPECTED_EOF if parser.getTagType(parser.currentNode) == TAG_SCRIPT: parser.setScriptAlreadyStarted(parser.currentNode) pop_current_node @@ -2053,13 +1925,12 @@ proc processInHTMLContent[Handle, Atom](parser: var HTML5Parser[Handle, Atom], parser.insertionMode = IN_TABLE_TEXT reprocess token else: # anything else - parse_error INVALID_TEXT_PARENT parser.fosterParenting = true result = reprocess IN_BODY parser.fosterParenting = false ) TokenType.COMMENT => (block: parser.insertComment(token)) - TokenType.DOCTYPE => (block: parse_error UNEXPECTED_DOCTYPE) + TokenType.DOCTYPE => (block: discard) "<caption>" => (block: clear_the_stack_back_to_a_table_context parser.activeFormatting.add((none(Handle), nil)) @@ -2091,30 +1962,22 @@ proc processInHTMLContent[Handle, Atom](parser: var HTML5Parser[Handle, Atom], reprocess token ) "<table>" => (block: - parse_error NESTED_TAGS - if not parser.hasElementInTableScope(TAG_TABLE): - discard - else: + if parser.hasElementInTableScope(TAG_TABLE): parser.popElementsIncl(TAG_TABLE) parser.resetInsertionMode() reprocess token ) "</table>" => (block: - if not parser.hasElementInTableScope(TAG_TABLE): - parse_error ELEMENT_NOT_IN_SCOPE - else: + if parser.hasElementInTableScope(TAG_TABLE): parser.popElementsIncl(TAG_TABLE) parser.resetInsertionMode() ) ("</body>", "</caption>", "</col>", "</colgroup>", "</html>", "</tbody>", - "</td>", "</tfoot>", "</th>", "</thead>", "</tr>") => (block: - parse_error UNEXPECTED_END_TAG - ) + "</td>", "</tfoot>", "</th>", "</thead>", "</tr>") => (block: discard) ("<style>", "<script>", "<template>", "</template>") => (block: return reprocess IN_HEAD ) "<input>" => (block: - parse_error UNEXPECTED_START_TAG token.attrs.withValue(parser.tagTypeToAtom(TAG_TYP), p): if not p[].equalsIgnoreCase("hidden"): # anything else @@ -2131,16 +1994,12 @@ proc processInHTMLContent[Handle, Atom](parser: var HTML5Parser[Handle, Atom], parser.fosterParenting = false ) "<form>" => (block: - parse_error UNEXPECTED_START_TAG - if parser.form.isSome or parser.hasElement(TAG_TEMPLATE): - discard - else: + if parser.form.isNone and not parser.hasElement(TAG_TEMPLATE): parser.form = some(parser.insertHTMLElement(token)) pop_current_node ) TokenType.EOF => (block: return reprocess IN_BODY) other => (block: - parse_error UNEXPECTED_START_TAG parser.fosterParenting = true result = reprocess IN_BODY parser.fosterParenting = false @@ -2148,7 +2007,7 @@ proc processInHTMLContent[Handle, Atom](parser: var HTML5Parser[Handle, Atom], of IN_TABLE_TEXT: match token: - TokenType.CHARACTER_NULL => (block: parse_error UNEXPECTED_NULL) + TokenType.CHARACTER_NULL => (block: discard) TokenType.CHARACTER_WHITESPACE => (block: parser.pendingTableChars &= token.s ) @@ -2160,7 +2019,6 @@ proc processInHTMLContent[Handle, Atom](parser: var HTML5Parser[Handle, Atom], if not parser.pendingTableCharsWhitespace: # I *think* this is effectively the same thing the specification # wants... - parse_error NON_SPACE_TABLE_TEXT parser.fosterParenting = true parser.reconstructActiveFormatting() parser.insertCharacter(parser.pendingTableChars) @@ -2175,31 +2033,23 @@ proc processInHTMLContent[Handle, Atom](parser: var HTML5Parser[Handle, Atom], of IN_CAPTION: match token: "</caption>" => (block: - if not parser.hasElementInTableScope(TAG_CAPTION): - parse_error ELEMENT_NOT_IN_SCOPE - else: + if parser.hasElementInTableScope(TAG_CAPTION): parser.generateImpliedEndTags() - parse_error_if_mismatch TAG_CAPTION parser.popElementsIncl(TAG_CAPTION) parser.clearActiveFormattingTillMarker() parser.insertionMode = IN_TABLE ) ("<caption>", "<col>", "<colgroup>", "<tbody>", "<td>", "<tfoot>", "<th>", "<thead>", "<tr>", "</table>") => (block: - if not parser.hasElementInTableScope(TAG_CAPTION): - parse_error ELEMENT_NOT_IN_SCOPE - else: + if parser.hasElementInTableScope(TAG_CAPTION): parser.generateImpliedEndTags() - parse_error_if_mismatch TAG_CAPTION parser.popElementsIncl(TAG_CAPTION) parser.clearActiveFormattingTillMarker() parser.insertionMode = IN_TABLE reprocess token ) ("</body>", "</col>", "</colgroup>", "</html>", "</tbody>", "</td>", - "</tfoot>", "</th>", "</thead>", "</tr>") => (block: - parse_error UNEXPECTED_END_TAG - ) + "</tfoot>", "</th>", "</thead>", "</tr>") => (block: discard) other => (block: return reprocess IN_BODY) of IN_COLUMN_GROUP: @@ -2208,26 +2058,21 @@ proc processInHTMLContent[Handle, Atom](parser: var HTML5Parser[Handle, Atom], parser.insertCharacter(token.s) ) TokenType.COMMENT => (block: parser.insertComment(token)) - TokenType.DOCTYPE => (block: parse_error UNEXPECTED_DOCTYPE) + (TokenType.DOCTYPE, "</col>") => (block: discard) "<html>" => (block: return reprocess IN_BODY) "<col>" => (block: discard parser.insertHTMLElement(token) pop_current_node ) "</colgroup>" => (block: - if parser.getTagType(parser.currentNode) != TAG_COLGROUP: - parse_error MISMATCHED_TAGS - else: + if parser.getTagType(parser.currentNode) == TAG_COLGROUP: pop_current_node parser.insertionMode = IN_TABLE ) - "</col>" => (block: parse_error UNEXPECTED_END_TAG) ("<template>", "</template>") => (block: return reprocess IN_HEAD) TokenType.EOF => (block: return reprocess IN_BODY) other => (block: - if parser.getTagType(parser.currentNode) != TAG_COLGROUP: - parse_error MISMATCHED_TAGS - else: + if parser.getTagType(parser.currentNode) == TAG_COLGROUP: pop_current_node parser.insertionMode = IN_TABLE reprocess token @@ -2246,34 +2091,27 @@ proc processInHTMLContent[Handle, Atom](parser: var HTML5Parser[Handle, Atom], parser.insertionMode = IN_ROW ) ("<th>", "<td>") => (block: - parse_error UNEXPECTED_START_TAG clear_the_stack_back_to_a_table_body_context discard parser.insertHTMLElement(parser.newStartTagToken(TAG_TR)) parser.insertionMode = IN_ROW reprocess token ) ("</tbody>", "</tfoot>", "</thead>") => (block: - if not parser.hasElementInTableScope(tokenTagType): - parse_error ELEMENT_NOT_IN_SCOPE - else: + if parser.hasElementInTableScope(tokenTagType): clear_the_stack_back_to_a_table_body_context pop_current_node parser.insertionMode = IN_TABLE ) ("<caption>", "<col>", "<colgroup>", "<tbody>", "<tfoot>", "<thead>", "</table>") => (block: - if not parser.hasElementInTableScope({TAG_TBODY, TAG_THEAD, TAG_TFOOT}): - parse_error ELEMENT_NOT_IN_SCOPE - else: + if parser.hasElementInTableScope({TAG_TBODY, TAG_THEAD, TAG_TFOOT}): clear_the_stack_back_to_a_table_body_context pop_current_node parser.insertionMode = IN_TABLE reprocess token ) ("</body>", "</caption>", "</col>", "</colgroup>", "</html>", "</td>", - "</th>", "</tr>") => (block: - parse_error ELEMENT_NOT_IN_SCOPE - ) + "</th>", "</tr>") => (block: discard) other => (block: return reprocess IN_TABLE) of IN_ROW: @@ -2289,72 +2127,56 @@ proc processInHTMLContent[Handle, Atom](parser: var HTML5Parser[Handle, Atom], parser.activeFormatting.add((none(Handle), nil)) ) "</tr>" => (block: - if not parser.hasElementInTableScope(TAG_TR): - parse_error ELEMENT_NOT_IN_SCOPE - else: + if parser.hasElementInTableScope(TAG_TR): clear_the_stack_back_to_a_table_row_context pop_current_node parser.insertionMode = IN_TABLE_BODY ) ("<caption>", "<col>", "<colgroup>", "<tbody>", "<tfoot>", "<thead>", "<tr>", "</table>") => (block: - if not parser.hasElementInTableScope(TAG_TR): - parse_error ELEMENT_NOT_IN_SCOPE - else: + if parser.hasElementInTableScope(TAG_TR): clear_the_stack_back_to_a_table_row_context pop_current_node parser.insertionMode = IN_TABLE_BODY reprocess token ) ("</tbody>", "</tfoot>", "</thead>") => (block: - if not parser.hasElementInTableScope(tokenTagType): - parse_error ELEMENT_NOT_IN_SCOPE - elif not parser.hasElementInTableScope(TAG_TR): - discard - else: + if parser.hasElementInTableScope({tokenTagType, TAG_TR}): clear_the_stack_back_to_a_table_row_context pop_current_node parser.insertionMode = IN_TABLE_BODY reprocess token ) ("</body>", "</caption>", "</col>", "</colgroup>", "</html>", "</td>", - "</th>") => (block: parse_error UNEXPECTED_END_TAG) + "</th>") => (block: discard) other => (block: return reprocess IN_TABLE) of IN_CELL: template close_cell() = parser.generateImpliedEndTags() - parse_error_if_mismatch {TAG_TD, TAG_TH} parser.popElementsIncl({TAG_TD, TAG_TH}) parser.clearActiveFormattingTillMarker() parser.insertionMode = IN_ROW match token: ("</td>", "</th>") => (block: - if not parser.hasElementInTableScope(tokenTagType): - parse_error ELEMENT_NOT_IN_SCOPE - else: + if parser.hasElementInTableScope(tokenTagType): parser.generateImpliedEndTags() - parse_error_if_mismatch tokenTagType parser.popElementsIncl(tokenTagType) parser.clearActiveFormattingTillMarker() parser.insertionMode = IN_ROW ) ("<caption>", "<col>", "<colgroup>", "<tbody>", "<td>", "<tfoot>", "<th>", "<thead>", "<tr>") => (block: - if not parser.hasElementInTableScope({TAG_TD, TAG_TH}): - parse_error ELEMENT_NOT_IN_SCOPE - else: + if parser.hasElementInTableScope({TAG_TD, TAG_TH}): close_cell reprocess token ) ("</body>", "</caption>", "</col>", "</colgroup>", "</html>") => (block: - parse_error UNEXPECTED_END_TAG + discard ) ("</table>", "</tbody>", "</tfoot>", "</thead>", "</tr>") => (block: - if not parser.hasElementInTableScope(tokenTagType): - parse_error ELEMENT_NOT_IN_SCOPE - else: + if parser.hasElementInTableScope(tokenTagType): close_cell reprocess token ) @@ -2362,15 +2184,10 @@ proc processInHTMLContent[Handle, Atom](parser: var HTML5Parser[Handle, Atom], of IN_SELECT: match token: - TokenType.CHARACTER_NULL => (block: parse_error UNEXPECTED_NULL) - TokenType.CHARACTER => (block: - parser.insertCharacter(token.s) - ) - TokenType.CHARACTER_WHITESPACE => (block: + (TokenType.CHARACTER, TokenType.CHARACTER_WHITESPACE) => (block: parser.insertCharacter(token.s) ) TokenType.COMMENT => (block: parser.insertComment(token)) - TokenType.DOCTYPE => (block: parse_error UNEXPECTED_DOCTYPE) "<html>" => (block: return reprocess IN_BODY) "<option>" => (block: if parser.getTagType(parser.currentNode) == TAG_OPTION: @@ -2400,33 +2217,23 @@ proc processInHTMLContent[Handle, Atom](parser: var HTML5Parser[Handle, Atom], pop_current_node if parser.getTagType(parser.currentNode) == TAG_OPTGROUP: pop_current_node - else: - parse_error MISMATCHED_TAGS ) "</option>" => (block: if parser.getTagType(parser.currentNode) == TAG_OPTION: pop_current_node - else: - parse_error MISMATCHED_TAGS ) "</select>" => (block: - if not parser.hasElementInSelectScope(TAG_SELECT): - parse_error ELEMENT_NOT_IN_SCOPE - else: + if parser.hasElementInSelectScope(TAG_SELECT): parser.popElementsIncl(TAG_SELECT) parser.resetInsertionMode() ) "<select>" => (block: - parse_error NESTED_TAGS if parser.hasElementInSelectScope(TAG_SELECT): parser.popElementsIncl(TAG_SELECT) parser.resetInsertionMode() ) ("<input>", "<keygen>", "<textarea>") => (block: - parse_error UNEXPECTED_START_TAG - if not parser.hasElementInSelectScope(TAG_SELECT): - discard - else: + if parser.hasElementInSelectScope(TAG_SELECT): parser.popElementsIncl(TAG_SELECT) parser.resetInsertionMode() reprocess token @@ -2435,24 +2242,19 @@ proc processInHTMLContent[Handle, Atom](parser: var HTML5Parser[Handle, Atom], return reprocess IN_HEAD ) TokenType.EOF => (block: return reprocess IN_BODY) - TokenType.START_TAG => (block: parse_error UNEXPECTED_START_TAG) - TokenType.END_TAG => (block: parse_error UNEXPECTED_END_TAG) + other => (block: discard) of IN_SELECT_IN_TABLE: match token: ("<caption>", "<table>", "<tbody>", "<tfoot>", "<thead>", "<tr>", "<td>", "<th>") => (block: - parse_error UNEXPECTED_START_TAG parser.popElementsIncl(TAG_SELECT) parser.resetInsertionMode() reprocess token ) ("</caption>", "</table>", "</tbody>", "</tfoot>", "</thead>", "</tr>", "</td>", "</th>") => (block: - parse_error UNEXPECTED_END_TAG - if not parser.hasElementInTableScope(tokenTagType): - discard - else: + if parser.hasElementInTableScope(tokenTagType): parser.popElementsIncl(TAG_SELECT) parser.resetInsertionMode() reprocess token @@ -2499,12 +2301,11 @@ proc processInHTMLContent[Handle, Atom](parser: var HTML5Parser[Handle, Atom], parser.insertionMode = IN_BODY reprocess token ) - TokenType.END_TAG => (block: parse_error UNEXPECTED_END_TAG) + TokenType.END_TAG => (block: discard) TokenType.EOF => (block: if not parser.hasElement(TAG_TEMPLATE): discard # stop else: - parse_error UNEXPECTED_EOF parser.popElementsIncl(TAG_TEMPLATE) parser.clearActiveFormattingTillMarker() discard parser.templateModes.pop() @@ -2520,17 +2321,14 @@ proc processInHTMLContent[Handle, Atom](parser: var HTML5Parser[Handle, Atom], TokenType.COMMENT => (block: parser.insertComment(token, last_child_of(parser.openElements[0])) ) - TokenType.DOCTYPE => (block: parse_error UNEXPECTED_DOCTYPE) + TokenType.DOCTYPE => (block: discard) "<html>" => (block: return reprocess IN_BODY) "</html>" => (block: - if parser.fragment: - parse_error UNEXPECTED_END_TAG - else: + if not parser.fragment: parser.insertionMode = AFTER_AFTER_BODY ) TokenType.EOF => (block: discard) # stop other => (block: - parse_error UNEXPECTED_AFTER_BODY_TOKEN parser.insertionMode = IN_BODY reprocess token ) @@ -2541,13 +2339,11 @@ proc processInHTMLContent[Handle, Atom](parser: var HTML5Parser[Handle, Atom], parser.insertCharacter(token.s) ) TokenType.COMMENT => (block: parser.insertComment(token)) - TokenType.DOCTYPE => (block: parse_error UNEXPECTED_DOCTYPE) + TokenType.DOCTYPE => (block: discard) "<html>" => (block: return reprocess IN_BODY) "<frameset>" => (block: discard parser.insertHTMLElement(token)) "</frameset>" => (block: - if parser.getTagType(parser.currentNode) == TAG_HTML: - parse_error UNEXPECTED_START_TAG - else: + if parser.getTagType(parser.currentNode) != TAG_HTML: pop_current_node if not parser.fragment and parser.getTagType(parser.currentNode) != TAG_FRAMESET: @@ -2559,11 +2355,9 @@ proc processInHTMLContent[Handle, Atom](parser: var HTML5Parser[Handle, Atom], ) "<noframes>" => (block: return reprocess IN_HEAD) TokenType.EOF => (block: - if parser.getTagType(parser.currentNode) != TAG_HTML: - parse_error UNEXPECTED_EOF # stop ) - other => (block: parser.parseErrorByTokenType(token.t)) + other => (block: discard) of AFTER_FRAMESET: match token: @@ -2571,12 +2365,10 @@ proc processInHTMLContent[Handle, Atom](parser: var HTML5Parser[Handle, Atom], parser.insertCharacter(token.s) ) TokenType.COMMENT => (block: parser.insertComment(token)) - TokenType.DOCTYPE => (block: parse_error UNEXPECTED_DOCTYPE) "<html>" => (block: return reprocess IN_BODY) "</html>" => (block: parser.insertionMode = AFTER_AFTER_FRAMESET) "<noframes>" => (block: return reprocess IN_HEAD) - TokenType.EOF => (block: discard) # stop - other => (block: parser.parseErrorByTokenType(token.t)) + other => (block: discard) of AFTER_AFTER_BODY: match token: @@ -2588,7 +2380,6 @@ proc processInHTMLContent[Handle, Atom](parser: var HTML5Parser[Handle, Atom], ) TokenType.EOF => (block: discard) # stop other => (block: - parser.parseErrorByTokenType(token.t) parser.insertionMode = IN_BODY reprocess token ) @@ -2601,9 +2392,8 @@ proc processInHTMLContent[Handle, Atom](parser: var HTML5Parser[Handle, Atom], (TokenType.DOCTYPE, TokenType.CHARACTER_WHITESPACE, "<html>") => (block: return reprocess IN_BODY ) - TokenType.EOF => (block: discard) # stop "<noframes>" => (block: return reprocess IN_HEAD) - other => (block: parser.parseErrorByTokenType(token.t)) + other => (block: discard) return PRES_CONTINUE proc processInForeignContent[Handle, Atom]( @@ -2613,9 +2403,6 @@ proc processInForeignContent[Handle, Atom]( #TODO document.write (?) #TODO SVG - template parse_error(e: ParseError) = - parser.parseError(e) - template any_other_start_tag() = let namespace = parser.getNamespace(parser.adjustedCurrentNode) var tagname = token.tagname @@ -2635,10 +2422,6 @@ proc processInForeignContent[Handle, Atom]( pop_current_node template any_other_end_tag() = - if parser.currentToken.tagname != token.tagname: - # Compare the start tag token, since it is guaranteed to be lower case. - # (The local name might have been adjusted to a non-lower-case string.) - parse_error UNEXPECTED_END_TAG for i in countdown(parser.openElements.high, 0): # loop if i == 0: # fragment case assert parser.fragment @@ -2654,17 +2437,14 @@ proc processInForeignContent[Handle, Atom]( break match token: - TokenType.CHARACTER_NULL => (block: - parse_error UNEXPECTED_NULL - parser.insertCharacter("\uFFFD") - ) + TokenType.CHARACTER_NULL => (block: parser.insertCharacter("\uFFFD")) TokenType.CHARACTER_WHITESPACE => (block: parser.insertCharacter(token.s)) TokenType.CHARACTER => (block: parser.insertCharacter(token.s) parser.framesetOk = false ) TokenType.COMMENT => (block: parser.insertComment(token)) - TokenType.DOCTYPE => (block: parse_error UNEXPECTED_DOCTYPE) + TokenType.DOCTYPE => (block: discard) ("<b>", "<big>", "<blockquote>", "<body>", "<br>", "<center>", "<code>", "<dd>", "<div>", "<dl>", "<dt>", "<em>", "<embed>", "<h1>", "<h2>", "<h3>", "<h4>", "<h5>", "<h6>", "<head>", "<hr>", "<i>", "<img>", "<li>", @@ -2681,7 +2461,6 @@ proc processInForeignContent[Handle, Atom]( atSize notin token.attrs: any_other_start_tag return - parse_error UNEXPECTED_START_TAG #TODO this makes no sense while not parser.isMathMLIntegrationPoint(parser.currentNode) and not parser.isHTMLIntegrationPoint(parser.currentNodeToken) and parser.getNamespace(parser.currentNode) != Namespace.HTML: diff --git a/chame/htmlparseriface.nim b/chame/htmlparseriface.nim index e5d33c05..5be4c536 100644 --- a/chame/htmlparseriface.nim +++ b/chame/htmlparseriface.nim @@ -41,15 +41,6 @@ when defined(nimdocdummy): ## Following procedures are optional hooks; implementations of this interface ## can choose to leave them out without getting compilation errors. ## - ## ```nim - ## proc parseErrorImpl(builder: DOMBuilderBase, e: ParseError) - ## ``` - ## - ## Parse error. `message` is an error code either specified by the - ## standard (in this case, `e` < `LAST_SPECIFIED_ERROR`) or named - ## arbitrarily. (At the time of writing, only tokenizer errors have - ## specified error codes.) - ## ## ## ```nim ## proc setQuirksModeImpl(builder: DOMBuilderBase, quirksMode: QuirksMode) diff --git a/chame/htmltokenizer.nim b/chame/htmltokenizer.nim index 30fe464e..ef4d12db 100644 --- a/chame/htmltokenizer.nim +++ b/chame/htmltokenizer.nim @@ -8,7 +8,6 @@ import std/unicode import dombuilder import entity_gen -import parseerror import tokstate export tokstate @@ -129,14 +128,6 @@ proc flushChars[Handle, Atom](tokenizer: var Tokenizer[Handle, Atom]) = tokenizer.isws = false tokenizer.charbuf.setLen(0) -when not defined(parseErrorImpl): - proc parseErrorImpl(builder: DOMBuilderBase, e: ParseError) = - discard - -proc parseError(tokenizer: Tokenizer, e: ParseError) = - mixin parseErrorImpl - tokenizer.dombuilder.parseErrorImpl(e) - const AttributeStates = { ATTRIBUTE_VALUE_DOUBLE_QUOTED, ATTRIBUTE_VALUE_SINGLE_QUOTED, ATTRIBUTE_VALUE_UNQUOTED @@ -224,25 +215,14 @@ proc findCharRef(tokenizer: var Tokenizer, c: char, return (i, ci, entry) proc numericCharacterReferenceEndState(tokenizer: var Tokenizer) = - template parse_error(error: untyped) = - tokenizer.parseError(error) var c = tokenizer.code - if c == 0x00: - parse_error NULL_CHARACTER_REFERENCE - c = 0xFFFD - elif c > 0x10FFFF: - parse_error CHARACTER_REFERENCE_OUTSIDE_UNICODE_RANGE - c = 0xFFFD - elif c in 0xD800u32..0xDFFFu32: # surrogates - parse_error SURROGATE_CHARACTER_REFERENCE + if c == 0x00 or c > 0x10FFFF or c in 0xD800u32..0xDFFFu32: c = 0xFFFD elif c in 0xFDD0u32..0xFDEFu32 or (c and 0xFFFF) in 0xFFFEu32..0xFFFFu32: - parse_error NONCHARACTER_CHARACTER_REFERENCE - # do nothing + discard # noncharacter, do nothing elif c < 0x80 and char(c) in (Controls - AsciiWhitespace) + {char(0x0D)}: - parse_error CONTROL_CHARACTER_REFERENCE + discard # control, do nothing elif c in 0x80u32 .. 0x9Fu32: - parse_error CONTROL_CHARACTER_REFERENCE const ControlMapTable = [ 0x80_00_20ACu32, 0x82_00_201Au32, 0x83_00_0192u32, 0x84_00_201Eu32, 0x85_00_2026u32, 0x86_00_2020u32, 0x87_00_2021u32, 0x88_00_02C6u32, @@ -321,16 +301,9 @@ proc tokenizeEOF[Handle, Atom](tokenizer: var Tokenizer[Handle, Atom]): bool = template emit(tok: Token) = tokenizer.flushChars() tokenizer.tokqueue.add(tok) - template emit(tok: TokenType) = emit Token[Atom](t: tok) template reconsume_in(s: TokenizerState) = tokenizer.state = s return true - template parse_error(error: untyped) = - tokenizer.parseError(error) - template emit_eof = - tokenizer.flushChars() - template emit_tok = - emit tokenizer.tok template emit(ch: char) = tokenizer.emit(ch) template emit(s: static string) = @@ -343,87 +316,26 @@ proc tokenizeEOF[Handle, Atom](tokenizer: var Tokenizer[Handle, Atom]): bool = tokenizer.tokqueue.setLen(0) case tokenizer.state - of DATA, RCDATA, RAWTEXT, SCRIPT_DATA, PLAINTEXT, SCRIPT_DATA_ESCAPE_START, - SCRIPT_DATA_ESCAPE_START_DASH: - emit_eof - of TAG_OPEN: - parse_error EOF_BEFORE_TAG_NAME - emit '<' - emit_eof - of END_TAG_OPEN: - parse_error EOF_BEFORE_TAG_NAME - emit "</" - emit_eof - of TAG_NAME: - parse_error EOF_IN_TAG - emit_eof - of RCDATA_LESS_THAN_SIGN, RAWTEXT_LESS_THAN_SIGN, - SCRIPT_DATA_LESS_THAN_SIGN: + of TAG_OPEN, RCDATA_LESS_THAN_SIGN, RAWTEXT_LESS_THAN_SIGN, + SCRIPT_DATA_LESS_THAN_SIGN, SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN: emit '<' - # note: was reconsume (rcdata/rawtext/script data) - emit_eof - of RCDATA_END_TAG_OPEN, RAWTEXT_END_TAG_OPEN, SCRIPT_DATA_END_TAG_OPEN: + of END_TAG_OPEN, RCDATA_END_TAG_OPEN, RAWTEXT_END_TAG_OPEN, + SCRIPT_DATA_END_TAG_OPEN, SCRIPT_DATA_ESCAPED_END_TAG_OPEN: emit "</" - # note: was reconsume (rcdata/rawtext/script data) - emit_eof - of RCDATA_END_TAG_NAME, RAWTEXT_END_TAG_NAME, SCRIPT_DATA_END_TAG_NAME: + of RCDATA_END_TAG_NAME, RAWTEXT_END_TAG_NAME, SCRIPT_DATA_END_TAG_NAME, + SCRIPT_DATA_ESCAPED_END_TAG_NAME: emit "</" tokenizer.emitTmp() - # note: was reconsume (rcdata/rawtext/script data) - emit_eof - of SCRIPT_DATA_ESCAPED, SCRIPT_DATA_ESCAPED_DASH, - SCRIPT_DATA_ESCAPED_DASH_DASH, SCRIPT_DATA_DOUBLE_ESCAPE_START: - parse_error EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT - emit_eof - of SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN: - emit '<' - # note: was reconsume (script data escaped) - parse_error EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT - emit_eof - of SCRIPT_DATA_ESCAPED_END_TAG_OPEN: - emit "</" - # note: was reconsume (script data escaped) - parse_error EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT - emit_eof - of SCRIPT_DATA_ESCAPED_END_TAG_NAME: - emit "</" - tokenizer.emitTmp() - # note: was reconsume (script data escaped) - parse_error EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT - emit_eof - of SCRIPT_DATA_DOUBLE_ESCAPED, SCRIPT_DATA_DOUBLE_ESCAPED_DASH, - SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH, - SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN, - SCRIPT_DATA_DOUBLE_ESCAPE_END: - parse_error EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT - emit_eof - of AFTER_ATTRIBUTE_NAME, BEFORE_ATTRIBUTE_NAME, ATTRIBUTE_NAME: - parse_error EOF_IN_TAG - emit_eof - of ATTRIBUTE_VALUE_DOUBLE_QUOTED, ATTRIBUTE_VALUE_SINGLE_QUOTED, - ATTRIBUTE_VALUE_UNQUOTED, AFTER_ATTRIBUTE_VALUE_QUOTED, - SELF_CLOSING_START_TAG, BEFORE_ATTRIBUTE_VALUE: - parse_error EOF_IN_TAG - emit_eof - of BOGUS_COMMENT, BOGUS_DOCTYPE: - emit_tok - emit_eof + of BOGUS_COMMENT, BOGUS_DOCTYPE, COMMENT_END_DASH, + COMMENT_END, COMMENT_END_BANG, COMMENT_LESS_THAN_SIGN_BANG_DASH, + COMMENT_LESS_THAN_SIGN_BANG_DASH_DASH, COMMENT_START_DASH, COMMENT, + COMMENT_START, COMMENT_LESS_THAN_SIGN, COMMENT_LESS_THAN_SIGN_BANG: + emit tokenizer.tok of MARKUP_DECLARATION_OPEN: - parse_error INCORRECTLY_OPENED_COMMENT # note: was reconsume (bogus comment) emit Token[Atom](t: COMMENT) - emit_eof - of COMMENT_END_DASH, COMMENT_END, COMMENT_END_BANG, - COMMENT_LESS_THAN_SIGN_BANG_DASH, COMMENT_LESS_THAN_SIGN_BANG_DASH_DASH, - COMMENT_START_DASH, COMMENT, COMMENT_START, COMMENT_LESS_THAN_SIGN, - COMMENT_LESS_THAN_SIGN_BANG: - parse_error EOF_IN_COMMENT - emit_tok - emit_eof of DOCTYPE, BEFORE_DOCTYPE_NAME: - parse_error EOF_IN_DOCTYPE emit Token[Atom](t: DOCTYPE, quirks: true) - emit_eof of DOCTYPE_NAME, AFTER_DOCTYPE_NAME, AFTER_DOCTYPE_PUBLIC_KEYWORD, BEFORE_DOCTYPE_PUBLIC_IDENTIFIER, DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED, @@ -434,23 +346,14 @@ proc tokenizeEOF[Handle, Atom](tokenizer: var Tokenizer[Handle, Atom]): bool = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED, DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED, AFTER_DOCTYPE_SYSTEM_IDENTIFIER: - parse_error EOF_IN_DOCTYPE tokenizer.tok.quirks = true - emit_tok - emit_eof - of CDATA_SECTION: - parse_error EOF_IN_CDATA - emit_eof + emit tokenizer.tok of CDATA_SECTION_BRACKET: emit ']' # note: was reconsume (CDATA section) - parse_error EOF_IN_CDATA - emit_eof of CDATA_SECTION_END: emit "]]" # note: was reconsume (CDATA section) - parse_error EOF_IN_CDATA - emit_eof of CHARACTER_REFERENCE: tokenizer.tmp = "&" tokenizer.flushCodePointsConsumedAsCharRef() @@ -464,19 +367,15 @@ proc tokenizeEOF[Handle, Atom](tokenizer: var Tokenizer[Handle, Atom]): bool = reconsume_in tokenizer.rstate of HEXADECIMAL_CHARACTER_REFERENCE_START, DECIMAL_CHARACTER_REFERENCE_START, NUMERIC_CHARACTER_REFERENCE: - parse_error ABSENCE_OF_DIGITS_IN_NUMERIC_CHARACTER_REFERENCE tokenizer.flushCodePointsConsumedAsCharRef() reconsume_in tokenizer.rstate - of HEXADECIMAL_CHARACTER_REFERENCE, DECIMAL_CHARACTER_REFERENCE: - parse_error MISSING_SEMICOLON_AFTER_CHARACTER_REFERENCE - # note: was reconsume (numeric character reference end) - tokenizer.numericCharacterReferenceEndState() - # we unnecessarily consumed once so reconsume - reconsume_in tokenizer.rstate - of NUMERIC_CHARACTER_REFERENCE_END: + of HEXADECIMAL_CHARACTER_REFERENCE, DECIMAL_CHARACTER_REFERENCE, + NUMERIC_CHARACTER_REFERENCE_END: tokenizer.numericCharacterReferenceEndState() # we unnecessarily consumed once so reconsume reconsume_in tokenizer.rstate + else: discard + tokenizer.flushChars() false type TokenizeResult* = enum @@ -513,8 +412,6 @@ proc tokenize*[Handle, Atom](tokenizer: var Tokenizer[Handle, Atom], template switch_state_return(s: TokenizerState) = tokenizer.rstate = tokenizer.state tokenizer.state = s - template parse_error(error: untyped) = - tokenizer.parseError(error) template is_appropriate_end_tag_token(): bool = tokenizer.laststart != nil and tokenizer.laststart.tagname == tokenizer.tok.tagname @@ -552,41 +449,31 @@ proc tokenize*[Handle, Atom](tokenizer: var Tokenizer[Handle, Atom], case c of '&': switch_state_return CHARACTER_REFERENCE of '<': switch_state TAG_OPEN - of '\0': - parse_error UNEXPECTED_NULL_CHARACTER - emit_null + of '\0': emit_null else: emit c of RCDATA: case c of '&': switch_state_return CHARACTER_REFERENCE of '<': switch_state RCDATA_LESS_THAN_SIGN - of '\0': - parse_error UNEXPECTED_NULL_CHARACTER - emit_replacement + of '\0': emit_replacement else: emit c of RAWTEXT: case c of '<': switch_state RAWTEXT_LESS_THAN_SIGN - of '\0': - parse_error UNEXPECTED_NULL_CHARACTER - emit_replacement + of '\0': emit_replacement else: emit c of SCRIPT_DATA: case c of '<': switch_state SCRIPT_DATA_LESS_THAN_SIGN - of '\0': - parse_error UNEXPECTED_NULL_CHARACTER - emit_replacement + of '\0': emit_replacement else: emit c of PLAINTEXT: case c - of '\0': - parse_error UNEXPECTED_NULL_CHARACTER - emit_replacement + of '\0': emit_replacement else: emit c of TAG_OPEN: @@ -599,12 +486,10 @@ proc tokenize*[Handle, Atom](tokenizer: var Tokenizer[Handle, Atom], # note: was reconsume switch_state TAG_NAME of '?': - parse_error UNEXPECTED_QUESTION_MARK_INSTEAD_OF_TAG_NAME new_token Token[Atom](t: COMMENT, data: "?") # note: was reconsume switch_state BOGUS_COMMENT else: - parse_error INVALID_FIRST_CHARACTER_OF_TAG_NAME emit '<' reconsume_in DATA @@ -615,11 +500,8 @@ proc tokenize*[Handle, Atom](tokenizer: var Tokenizer[Handle, Atom], tokenizer.tagNameBuf = $c.toLowerAscii() # note: was reconsume switch_state TAG_NAME - of '>': - parse_error MISSING_END_TAG_NAME - switch_state DATA + of '>': switch_state DATA else: - parse_error INVALID_FIRST_CHARACTER_OF_TAG_NAME new_token Token[Atom](t: COMMENT) reconsume_in BOGUS_COMMENT @@ -636,9 +518,7 @@ proc tokenize*[Handle, Atom](tokenizer: var Tokenizer[Handle, Atom], tokenizer.flushTagName() emit_tok of AsciiUpperAlpha: tokenizer.tagNameBuf &= c.toLowerAscii() - of '\0': - parse_error UNEXPECTED_NULL_CHARACTER - tokenizer.tagNameBuf &= "\uFFFD" + of '\0': tokenizer.tagNameBuf &= "\uFFFD" else: tokenizer.tagNameBuf &= c of RCDATA_LESS_THAN_SIGN: @@ -821,13 +701,9 @@ proc tokenize*[Handle, Atom](tokenizer: var Tokenizer[Handle, Atom], of '-': switch_state SCRIPT_DATA_ESCAPED_DASH emit '-' - of '<': - switch_state SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN - of '\0': - parse_error UNEXPECTED_NULL_CHARACTER - emit_replacement - else: - emit c + of '<': switch_state SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN + of '\0': emit_replacement + else: emit c of SCRIPT_DATA_ESCAPED_DASH: case c @@ -837,7 +713,6 @@ proc tokenize*[Handle, Atom](tokenizer: var Tokenizer[Handle, Atom], of '<': switch_state SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN of '\0': - parse_error UNEXPECTED_NULL_CHARACTER switch_state SCRIPT_DATA_ESCAPED emit_replacement else: @@ -854,7 +729,6 @@ proc tokenize*[Handle, Atom](tokenizer: var Tokenizer[Handle, Atom], switch_state SCRIPT_DATA emit '>' of '\0': - parse_error UNEXPECTED_NULL_CHARACTER switch_state SCRIPT_DATA_ESCAPED emit_replacement else: @@ -940,9 +814,7 @@ proc tokenize*[Handle, Atom](tokenizer: var Tokenizer[Handle, Atom], of '<': switch_state SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN emit '<' - of '\0': - parse_error UNEXPECTED_NULL_CHARACTER - emit_replacement + of '\0': emit_replacement else: emit c of SCRIPT_DATA_DOUBLE_ESCAPED_DASH: @@ -954,7 +826,6 @@ proc tokenize*[Handle, Atom](tokenizer: var Tokenizer[Handle, Atom], switch_state SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN emit '<' of '\0': - parse_error UNEXPECTED_NULL_CHARACTER switch_state SCRIPT_DATA_DOUBLE_ESCAPED emit_replacement else: @@ -971,7 +842,6 @@ proc tokenize*[Handle, Atom](tokenizer: var Tokenizer[Handle, Atom], switch_state SCRIPT_DATA emit '>' of '\0': - parse_error UNEXPECTED_NULL_CHARACTER switch_state SCRIPT_DATA_DOUBLE_ESCAPED emit_replacement else: @@ -1005,7 +875,6 @@ proc tokenize*[Handle, Atom](tokenizer: var Tokenizer[Handle, Atom], of AsciiWhitespace: discard of '/', '>': reconsume_in AFTER_ATTRIBUTE_NAME of '=': - parse_error UNEXPECTED_EQUALS_SIGN_BEFORE_ATTRIBUTE_NAME start_new_attribute tokenizer.tmp &= c switch_state ATTRIBUTE_NAME @@ -1014,8 +883,6 @@ proc tokenize*[Handle, Atom](tokenizer: var Tokenizer[Handle, Atom], reconsume_in ATTRIBUTE_NAME of ATTRIBUTE_NAME: - template anything_else = - tokenizer.tmp &= c case c of AsciiWhitespace, '/', '>': leave_attribute_name_state @@ -1026,13 +893,9 @@ proc tokenize*[Handle, Atom](tokenizer: var Tokenizer[Handle, Atom], of AsciiUpperAlpha: tokenizer.tmp &= c.toLowerAscii() of '\0': - parse_error UNEXPECTED_NULL_CHARACTER tokenizer.tmp &= "\uFFFD" - of '"', '\'', '<': - parse_error UNEXPECTED_CHARACTER_IN_ATTRIBUTE_NAME - anything_else else: - anything_else + tokenizer.tmp &= c of AFTER_ATTRIBUTE_NAME: case c @@ -1053,7 +916,6 @@ proc tokenize*[Handle, Atom](tokenizer: var Tokenizer[Handle, Atom], of '"': switch_state ATTRIBUTE_VALUE_DOUBLE_QUOTED of '\'': switch_state ATTRIBUTE_VALUE_SINGLE_QUOTED of '>': - parse_error MISSING_ATTRIBUTE_VALUE switch_state DATA prepare_attrs_if_start emit_tok @@ -1063,18 +925,14 @@ proc tokenize*[Handle, Atom](tokenizer: var Tokenizer[Handle, Atom], case c of '"': switch_state AFTER_ATTRIBUTE_VALUE_QUOTED of '&': switch_state_return CHARACTER_REFERENCE - of '\0': - parse_error UNEXPECTED_NULL_CHARACTER - tokenizer.appendToCurrentAttrValue("\uFFFD") + of '\0': tokenizer.appendToCurrentAttrValue("\uFFFD") else: tokenizer.appendToCurrentAttrValue(c) of ATTRIBUTE_VALUE_SINGLE_QUOTED: case c of '\'': switch_state AFTER_ATTRIBUTE_VALUE_QUOTED of '&': switch_state_return CHARACTER_REFERENCE - of '\0': - parse_error UNEXPECTED_NULL_CHARACTER - tokenizer.appendToCurrentAttrValue("\uFFFD") + of '\0': tokenizer.appendToCurrentAttrValue("\uFFFD") else: tokenizer.appendToCurrentAttrValue(c) of ATTRIBUTE_VALUE_UNQUOTED: @@ -1085,12 +943,7 @@ proc tokenize*[Handle, Atom](tokenizer: var Tokenizer[Handle, Atom], switch_state DATA prepare_attrs_if_start emit_tok - of '\0': - parse_error UNEXPECTED_NULL_CHARACTER - tokenizer.appendToCurrentAttrValue("\uFFFD") - of '"', '\'', '<', '=', '`': - parse_error UNEXPECTED_CHARACTER_IN_UNQUOTED_ATTRIBUTE_VALUE - tokenizer.appendToCurrentAttrValue(c) + of '\0': tokenizer.appendToCurrentAttrValue("\uFFFD") else: tokenizer.appendToCurrentAttrValue(c) of AFTER_ATTRIBUTE_VALUE_QUOTED: @@ -1103,9 +956,7 @@ proc tokenize*[Handle, Atom](tokenizer: var Tokenizer[Handle, Atom], switch_state DATA prepare_attrs_if_start emit_tok - else: - parse_error MISSING_WHITESPACE_BETWEEN_ATTRIBUTES - reconsume_in BEFORE_ATTRIBUTE_NAME + else: reconsume_in BEFORE_ATTRIBUTE_NAME of SELF_CLOSING_START_TAG: case c @@ -1114,9 +965,7 @@ proc tokenize*[Handle, Atom](tokenizer: var Tokenizer[Handle, Atom], switch_state DATA prepare_attrs_if_start emit_tok - else: - parse_error UNEXPECTED_SOLIDUS_IN_TAG - reconsume_in BEFORE_ATTRIBUTE_NAME + else: reconsume_in BEFORE_ATTRIBUTE_NAME of BOGUS_COMMENT: assert tokenizer.tok.t == COMMENT @@ -1124,14 +973,11 @@ proc tokenize*[Handle, Atom](tokenizer: var Tokenizer[Handle, Atom], of '>': switch_state DATA emit_tok - of '\0': - parse_error UNEXPECTED_NULL_CHARACTER - tokenizer.tok.data &= "\uFFFD" + of '\0': tokenizer.tok.data &= "\uFFFD" else: tokenizer.tok.data &= c of MARKUP_DECLARATION_OPEN: # note: rewritten to fit case model as we consume a char anyway template anything_else = - parse_error INCORRECTLY_OPENED_COMMENT new_token Token[Atom](t: COMMENT) switch_state BOGUS_COMMENT case c @@ -1153,7 +999,6 @@ proc tokenize*[Handle, Atom](tokenizer: var Tokenizer[Handle, Atom], if tokenizer.hasnonhtml: switch_state CDATA_SECTION else: - parse_error CDATA_IN_HTML_CONTENT new_token Token[Atom](t: COMMENT, data: "[CDATA[") switch_state BOGUS_COMMENT of esrRetry: break @@ -1167,7 +1012,6 @@ proc tokenize*[Handle, Atom](tokenizer: var Tokenizer[Handle, Atom], case c of '-': switch_state COMMENT_START_DASH of '>': - parse_error ABRUPT_CLOSING_OF_EMPTY_COMMENT switch_state DATA emit_tok else: reconsume_in COMMENT @@ -1176,7 +1020,6 @@ proc tokenize*[Handle, Atom](tokenizer: var Tokenizer[Handle, Atom], case c of '-': switch_state COMMENT_END of '>': - parse_error ABRUPT_CLOSING_OF_EMPTY_COMMENT switch_state DATA emit_tok else: @@ -1189,9 +1032,7 @@ proc tokenize*[Handle, Atom](tokenizer: var Tokenizer[Handle, Atom], tokenizer.tok.data &= c switch_state COMMENT_LESS_THAN_SIGN of '-': switch_state COMMENT_END_DASH - of '\0': - parse_error UNEXPECTED_NULL_CHARACTER - tokenizer.tok.data &= "\uFFFD" + of '\0': tokenizer.tok.data &= "\uFFFD" else: tokenizer.tok.data &= c of COMMENT_LESS_THAN_SIGN: @@ -1218,9 +1059,7 @@ proc tokenize*[Handle, Atom](tokenizer: var Tokenizer[Handle, Atom], # note: was reconsume (comment end) switch_state DATA emit_tok - else: - parse_error NESTED_COMMENT - reconsume_in COMMENT_END + else: reconsume_in COMMENT_END of COMMENT_END_DASH: case c @@ -1246,7 +1085,6 @@ proc tokenize*[Handle, Atom](tokenizer: var Tokenizer[Handle, Atom], tokenizer.tok.data &= "--!" switch_state COMMENT_END_DASH of '>': - parse_error INCORRECTLY_CLOSED_COMMENT switch_state DATA emit_tok else: @@ -1257,9 +1095,7 @@ proc tokenize*[Handle, Atom](tokenizer: var Tokenizer[Handle, Atom], case c of AsciiWhitespace: switch_state BEFORE_DOCTYPE_NAME of '>': reconsume_in BEFORE_DOCTYPE_NAME - else: - parse_error MISSING_WHITESPACE_BEFORE_DOCTYPE_NAME - reconsume_in BEFORE_DOCTYPE_NAME + else: reconsume_in BEFORE_DOCTYPE_NAME of BEFORE_DOCTYPE_NAME: case c @@ -1268,11 +1104,9 @@ proc tokenize*[Handle, Atom](tokenizer: var Tokenizer[Handle, Atom], new_token Token[Atom](t: DOCTYPE, name: some($c.toLowerAscii())) switch_state DOCTYPE_NAME of '\0': - parse_error UNEXPECTED_NULL_CHARACTER new_token Token[Atom](t: DOCTYPE, name: some($"\uFFFD")) switch_state DOCTYPE_NAME of '>': - parse_error MISSING_DOCTYPE_NAME new_token Token[Atom](t: DOCTYPE, quirks: true) switch_state DATA emit_tok @@ -1286,17 +1120,12 @@ proc tokenize*[Handle, Atom](tokenizer: var Tokenizer[Handle, Atom], of '>': switch_state DATA emit_tok - of AsciiUpperAlpha: - tokenizer.tok.name.get &= c.toLowerAscii() - of '\0': - parse_error UNEXPECTED_NULL_CHARACTER - tokenizer.tok.name.get &= "\uFFFD" - else: - tokenizer.tok.name.get &= c + of AsciiUpperAlpha: tokenizer.tok.name.get &= c.toLowerAscii() + of '\0': tokenizer.tok.name.get &= "\uFFFD" + else: tokenizer.tok.name.get &= c of AFTER_DOCTYPE_NAME: # note: rewritten to fit case model as we consume a char anyway template anything_else = - parse_error INVALID_CHARACTER_SEQUENCE_AFTER_DOCTYPE_NAME tokenizer.tok.quirks = true switch_state BOGUS_DOCTYPE case c @@ -1323,20 +1152,16 @@ proc tokenize*[Handle, Atom](tokenizer: var Tokenizer[Handle, Atom], case c of AsciiWhitespace: switch_state BEFORE_DOCTYPE_PUBLIC_IDENTIFIER of '"': - parse_error MISSING_WHITESPACE_AFTER_DOCTYPE_PUBLIC_KEYWORD tokenizer.tok.pubid = some("") switch_state DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED of '\'': - parse_error MISSING_WHITESPACE_AFTER_DOCTYPE_PUBLIC_KEYWORD tokenizer.tok.pubid = some("") switch_state DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED of '>': - parse_error MISSING_DOCTYPE_PUBLIC_IDENTIFIER tokenizer.tok.quirks = true switch_state DATA emit_tok else: - parse_error MISSING_QUOTE_BEFORE_DOCTYPE_PUBLIC_IDENTIFIER tokenizer.tok.quirks = true reconsume_in BOGUS_DOCTYPE @@ -1350,59 +1175,47 @@ proc tokenize*[Handle, Atom](tokenizer: var Tokenizer[Handle, Atom], tokenizer.tok.pubid = some("") switch_state DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED of '>': - parse_error MISSING_DOCTYPE_PUBLIC_IDENTIFIER tokenizer.tok.quirks = true switch_state DATA emit_tok else: - parse_error MISSING_QUOTE_BEFORE_DOCTYPE_PUBLIC_IDENTIFIER tokenizer.tok.quirks = true reconsume_in BOGUS_DOCTYPE of DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED: case c of '"': switch_state AFTER_DOCTYPE_PUBLIC_IDENTIFIER - of '\0': - parse_error UNEXPECTED_NULL_CHARACTER - tokenizer.tok.pubid.get &= "\uFFFD" + of '\0': tokenizer.tok.pubid.get &= "\uFFFD" of '>': - parse_error ABRUPT_DOCTYPE_PUBLIC_IDENTIFIER tokenizer.tok.quirks = true switch_state DATA emit_tok - else: - tokenizer.tok.pubid.get &= c + else: tokenizer.tok.pubid.get &= c of DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED: case c of '\'': switch_state AFTER_DOCTYPE_PUBLIC_IDENTIFIER - of '\0': - parse_error UNEXPECTED_NULL_CHARACTER - tokenizer.tok.pubid.get &= "\uFFFD" + of '\0': tokenizer.tok.pubid.get &= "\uFFFD" of '>': - parse_error ABRUPT_DOCTYPE_PUBLIC_IDENTIFIER tokenizer.tok.quirks = true switch_state DATA emit_tok - else: - tokenizer.tok.pubid.get &= c + else: tokenizer.tok.pubid.get &= c of AFTER_DOCTYPE_PUBLIC_IDENTIFIER: case c - of AsciiWhitespace: switch_state BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS + of AsciiWhitespace: + switch_state BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS of '>': switch_state DATA emit_tok of '"': - parse_error MISSING_WHITESPACE_BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS tokenizer.tok.sysid = some("") switch_state DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED of '\'': - parse_error MISSING_WHITESPACE_BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS tokenizer.tok.sysid = some("") switch_state DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED else: - parse_error MISSING_QUOTE_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER tokenizer.tok.quirks = true reconsume_in BOGUS_DOCTYPE @@ -1419,7 +1232,6 @@ proc tokenize*[Handle, Atom](tokenizer: var Tokenizer[Handle, Atom], tokenizer.tok.sysid = some("") switch_state DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED else: - parse_error MISSING_QUOTE_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER tokenizer.tok.quirks = true reconsume_in BOGUS_DOCTYPE @@ -1427,20 +1239,16 @@ proc tokenize*[Handle, Atom](tokenizer: var Tokenizer[Handle, Atom], case c of AsciiWhitespace: switch_state BEFORE_DOCTYPE_SYSTEM_IDENTIFIER of '"': - parse_error MISSING_WHITESPACE_AFTER_DOCTYPE_SYSTEM_KEYWORD tokenizer.tok.sysid = some("") switch_state DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED of '\'': - parse_error MISSING_WHITESPACE_AFTER_DOCTYPE_SYSTEM_KEYWORD tokenizer.tok.sysid = some("") switch_state DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED of '>': - parse_error MISSING_DOCTYPE_SYSTEM_IDENTIFIER tokenizer.tok.quirks = true switch_state DATA emit_tok else: - parse_error MISSING_QUOTE_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER tokenizer.tok.quirks = true reconsume_in BOGUS_DOCTYPE @@ -1454,37 +1262,28 @@ proc tokenize*[Handle, Atom](tokenizer: var Tokenizer[Handle, Atom], tokenizer.tok.sysid = some("") switch_state DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED of '>': - parse_error MISSING_DOCTYPE_SYSTEM_IDENTIFIER tokenizer.tok.quirks = true switch_state DATA emit_tok else: - parse_error MISSING_QUOTE_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER tokenizer.tok.quirks = true reconsume_in BOGUS_DOCTYPE of DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED: case c of '"': switch_state AFTER_DOCTYPE_SYSTEM_IDENTIFIER - of '\0': - parse_error UNEXPECTED_NULL_CHARACTER - tokenizer.tok.sysid.get &= "\uFFFD" + of '\0': tokenizer.tok.sysid.get &= "\uFFFD" of '>': - parse_error ABRUPT_DOCTYPE_SYSTEM_IDENTIFIER tokenizer.tok.quirks = true switch_state DATA emit_tok - else: - tokenizer.tok.sysid.get &= c + else: tokenizer.tok.sysid.get &= c of DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED: case c of '\'': switch_state AFTER_DOCTYPE_SYSTEM_IDENTIFIER - of '\0': - parse_error UNEXPECTED_NULL_CHARACTER - tokenizer.tok.sysid.get &= "\uFFFD" + of '\0': tokenizer.tok.sysid.get &= "\uFFFD" of '>': - parse_error ABRUPT_DOCTYPE_SYSTEM_IDENTIFIER tokenizer.tok.quirks = true switch_state DATA emit_tok @@ -1497,16 +1296,13 @@ proc tokenize*[Handle, Atom](tokenizer: var Tokenizer[Handle, Atom], of '>': switch_state DATA emit_tok - else: - parse_error UNEXPECTED_CHARACTER_AFTER_DOCTYPE_SYSTEM_IDENTIFIER - reconsume_in BOGUS_DOCTYPE + else: reconsume_in BOGUS_DOCTYPE of BOGUS_DOCTYPE: case c of '>': switch_state DATA emit_tok - of '\0': parse_error UNEXPECTED_NULL_CHARACTER else: discard of CDATA_SECTION: @@ -1576,8 +1372,6 @@ proc tokenize*[Handle, Atom](tokenizer: var Tokenizer[Handle, Atom], else: if n != -1: tokenizer.reconsume(cast[char](n)) - if tokenizer.tmp[^1] != ';': - parse_error MISSING_SEMICOLON_AFTER_CHARACTER_REFERENCE tokenizer.tmp = "" var ci = ci + 1 while (let c = entry[ci]; c != '\0'): @@ -1596,9 +1390,7 @@ proc tokenize*[Handle, Atom](tokenizer: var Tokenizer[Handle, Atom], tokenizer.appendToCurrentAttrValue(c) else: emit c - of ';': - parse_error UNKNOWN_NAMED_CHARACTER_REFERENCE - reconsume_in tokenizer.rstate + of ';': reconsume_in tokenizer.rstate else: reconsume_in tokenizer.rstate of NUMERIC_CHARACTER_REFERENCE: @@ -1624,7 +1416,6 @@ proc tokenize*[Handle, Atom](tokenizer: var Tokenizer[Handle, Atom], # note: was reconsume switch_state HEXADECIMAL_CHARACTER_REFERENCE else: - parse_error ABSENCE_OF_DIGITS_IN_NUMERIC_CHARACTER_REFERENCE tokenizer.flushCodePointsConsumedAsCharRef() reconsume_in tokenizer.rstate @@ -1635,7 +1426,6 @@ proc tokenize*[Handle, Atom](tokenizer: var Tokenizer[Handle, Atom], # note: was reconsume switch_state DECIMAL_CHARACTER_REFERENCE else: - parse_error ABSENCE_OF_DIGITS_IN_NUMERIC_CHARACTER_REFERENCE tokenizer.flushCodePointsConsumedAsCharRef() reconsume_in tokenizer.rstate @@ -1654,9 +1444,7 @@ proc tokenize*[Handle, Atom](tokenizer: var Tokenizer[Handle, Atom], tokenizer.code *= 0x10 tokenizer.code += uint32(c) - uint32('A') + 10 of ';': switch_state NUMERIC_CHARACTER_REFERENCE_END - else: - parse_error MISSING_SEMICOLON_AFTER_CHARACTER_REFERENCE - reconsume_in NUMERIC_CHARACTER_REFERENCE_END + else: reconsume_in NUMERIC_CHARACTER_REFERENCE_END of DECIMAL_CHARACTER_REFERENCE: case c @@ -1665,9 +1453,7 @@ proc tokenize*[Handle, Atom](tokenizer: var Tokenizer[Handle, Atom], tokenizer.code *= 10 tokenizer.code += uint32(c) - uint32('0') of ';': switch_state NUMERIC_CHARACTER_REFERENCE_END - else: - parse_error MISSING_SEMICOLON_AFTER_CHARACTER_REFERENCE - reconsume_in NUMERIC_CHARACTER_REFERENCE_END + else: reconsume_in NUMERIC_CHARACTER_REFERENCE_END of NUMERIC_CHARACTER_REFERENCE_END: tokenizer.numericCharacterReferenceEndState() diff --git a/chame/parseerror.nim b/chame/parseerror.nim deleted file mode 100644 index 8e85ead1..00000000 --- a/chame/parseerror.nim +++ /dev/null @@ -1,71 +0,0 @@ -type ParseError* = enum - #TODO write a description for all error codes - ABRUPT_CLOSING_OF_EMPTY_COMMENT - ABRUPT_DOCTYPE_PUBLIC_IDENTIFIER - ABRUPT_DOCTYPE_SYSTEM_IDENTIFIER - ABSENCE_OF_DIGITS_IN_NUMERIC_CHARACTER_REFERENCE - CDATA_IN_HTML_CONTENT - CHARACTER_REFERENCE_OUTSIDE_UNICODE_RANGE - CONTROL_CHARACTER_IN_INPUT_STREAM - CONTROL_CHARACTER_REFERENCE - END_TAG_WITH_ATTRIBUTES - DUPLICATE_ATTRIBUTE - END_TAG_WITH_TRAILING_SOLIDUS - EOF_BEFORE_TAG_NAME - EOF_IN_CDATA - EOF_IN_COMMENT - EOF_IN_DOCTYPE - EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT - EOF_IN_TAG - INCORRECTLY_CLOSED_COMMENT - INCORRECTLY_OPENED_COMMENT - INVALID_CHARACTER_SEQUENCE_AFTER_DOCTYPE_NAME - INVALID_FIRST_CHARACTER_OF_TAG_NAME - MISSING_ATTRIBUTE_VALUE - MISSING_DOCTYPE_NAME - MISSING_DOCTYPE_PUBLIC_IDENTIFIER - MISSING_DOCTYPE_SYSTEM_IDENTIFIER - MISSING_END_TAG_NAME - MISSING_QUOTE_BEFORE_DOCTYPE_PUBLIC_IDENTIFIER - MISSING_QUOTE_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER - MISSING_SEMICOLON_AFTER_CHARACTER_REFERENCE - MISSING_WHITESPACE_AFTER_DOCTYPE_PUBLIC_KEYWORD - MISSING_WHITESPACE_AFTER_DOCTYPE_SYSTEM_KEYWORD - MISSING_WHITESPACE_BEFORE_DOCTYPE_NAME - MISSING_WHITESPACE_BETWEEN_ATTRIBUTES - MISSING_WHITESPACE_BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS - NESTED_COMMENT - NONCHARACTER_CHARACTER_REFERENCE - NONCHARACTER_IN_INPUT_STREAM - NON_VOID_HTML_ELEMENT_START_TAG_WITH_TRAILING_SOLIDUS - NULL_CHARACTER_REFERENCE - SURROGATE_CHARACTER_REFERENCE - SURROGATE_IN_INPUT_STREAM - UNEXPECTED_CHARACTER_AFTER_DOCTYPE_SYSTEM_IDENTIFIER - UNEXPECTED_CHARACTER_IN_ATTRIBUTE_NAME - UNEXPECTED_CHARACTER_IN_UNQUOTED_ATTRIBUTE_VALUE - UNEXPECTED_EQUALS_SIGN_BEFORE_ATTRIBUTE_NAME - UNEXPECTED_NULL_CHARACTER - UNEXPECTED_QUESTION_MARK_INSTEAD_OF_TAG_NAME - UNEXPECTED_SOLIDUS_IN_TAG - UNKNOWN_NAMED_CHARACTER_REFERENCE - LAST_SPECIFIED_ERROR # never returned - # From here on, error code names have not been specified by the standard. - MISMATCHED_TAGS = "Mismatched start and end tags" - INVALID_DOCTYPE = "Unrecognized document type" - UNEXPECTED_DOCTYPE = "Unexpected document type" - UNEXPECTED_INITIAL_TOKEN = "Unexpected token in initial state" - UNEXPECTED_START_TAG = "Unexpected start tag" - UNEXPECTED_END_TAG = "Unexpected end tag" - ELEMENT_NOT_IN_OPEN_ELEMENTS = "Element has not been added to open elements" - ELEMENT_NOT_IN_SCOPE = "Element not in appropriate scope" - ELEMENT_NOT_CURRENT_NODE = "Element is not current node" - #TODO merge with UNEXPECTED_NULL_CHARACTER? - UNEXPECTED_NULL = "Unexpected null character" - NESTED_TAGS = "Non-nestable nested tags" - UNEXPECTED_SPECIAL_ELEMENT = "Unexpected special element on open elements" - UNEXPECTED_EOF = "Unexpected end of file" - INVALID_TEXT_PARENT = "Invalid parent element for text node" - NON_SPACE_TABLE_TEXT = "Non-space table text" - UNEXPECTED_AFTER_BODY_TOKEN = "Unexpected token after body" - UNEXPECTED_CHARACTER = "Unexpected character token" |