diff options
author | bptato <nincsnevem662@gmail.com> | 2023-07-15 20:05:15 +0200 |
---|---|---|
committer | bptato <nincsnevem662@gmail.com> | 2023-07-15 20:06:07 +0200 |
commit | 9aa9ecd2d2c5048ac3057aeee28b1e42ce4be0d8 (patch) | |
tree | 8f0ca8b8625f633b1e75f9f929f15b4b4322038f /src/html/htmltokenizer.nim | |
parent | 96fceba76a218799d49652befab8b99289b07133 (diff) | |
download | chawan-9aa9ecd2d2c5048ac3057aeee28b1e42ce4be0d8.tar.gz |
htmlparser: add parse error handling
Diffstat (limited to 'src/html/htmltokenizer.nim')
-rw-r--r-- | src/html/htmltokenizer.nim | 233 |
1 file changed, 119 insertions, 114 deletions
diff --git a/src/html/htmltokenizer.nim b/src/html/htmltokenizer.nim index c95db720..d38d427c 100644 --- a/src/html/htmltokenizer.nim +++ b/src/html/htmltokenizer.nim @@ -5,9 +5,10 @@ import macros import tables import unicode +import encoding/decoderstream import html/entity +import html/parseerror import html/tags -import encoding/decoderstream import utils/opt import utils/radixtree import utils/twtstr @@ -25,6 +26,7 @@ type attrv: string attr: bool hasnonhtml*: bool + onParseError: proc(e: ParseError) decoder: DecoderStream sbuf: seq[Rune] @@ -108,12 +110,13 @@ proc readn(t: var Tokenizer) = if t.decoder.atEnd: t.eof_i = t.sbuf.len -proc newTokenizer*(s: DecoderStream): Tokenizer = +proc newTokenizer*(s: DecoderStream, onParseError: proc(e: ParseError)): Tokenizer = var t = Tokenizer( decoder: s, sbuf: newSeqOfCap[Rune](bufLen), eof_i: -1, - sbuf_i: 0 + sbuf_i: 0, + onParseError: onParseError ) t.readn() return t @@ -192,7 +195,9 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = template reconsume_in(s: TokenizerState) = tokenizer.reconsume() switch_state s - template parse_error(error: untyped) = discard # does nothing for now... TODO? 
+ template parse_error(error: untyped) = + if tokenizer.onParseError != nil: + tokenizer.onParseError(error) template is_appropriate_end_tag_token(): bool = tokenizer.laststart != nil and tokenizer.laststart.tagname == tokenizer.tok.tagname template start_new_attribute = @@ -346,7 +351,7 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = of '&': switch_state_return CHARACTER_REFERENCE of '<': switch_state TAG_OPEN of null: - parse_error unexpected_null_character + parse_error UNEXPECTED_NULL_CHARACTER emit_current of eof: emit_eof else: emit_current @@ -355,7 +360,7 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = case c of '&': switch_state_return CHARACTER_REFERENCE of '<': switch_state RCDATA_LESS_THAN_SIGN - of null: parse_error unexpected_null_character + of null: parse_error UNEXPECTED_NULL_CHARACTER of eof: emit_eof else: emit_current @@ -363,7 +368,7 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = case c of '<': switch_state RAWTEXT_LESS_THAN_SIGN of null: - parse_error unexpected_null_character + parse_error UNEXPECTED_NULL_CHARACTER emit_replacement of eof: emit_eof else: emit_current @@ -372,7 +377,7 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = case c of '<': switch_state SCRIPT_DATA_LESS_THAN_SIGN of null: - parse_error unexpected_null_character + parse_error UNEXPECTED_NULL_CHARACTER emit_replacement of eof: emit_eof else: emit_current @@ -380,7 +385,7 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = of PLAINTEXT: case c of null: - parse_error unexpected_null_character + parse_error UNEXPECTED_NULL_CHARACTER emit_replacement of eof: emit_eof else: emit_current @@ -393,15 +398,15 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = new_token Token(t: START_TAG) reconsume_in TAG_NAME of '?': - parse_error unexpected_question_mark_instead_of_tag_name + parse_error UNEXPECTED_QUESTION_MARK_INSTEAD_OF_TAG_NAME new_token Token(t: COMMENT) reconsume_in BOGUS_COMMENT of eof: - parse_error eof_before_tag_name + 
parse_error EOF_BEFORE_TAG_NAME emit '<' emit_eof else: - parse_error invalid_first_character_of_tag_name + parse_error INVALID_FIRST_CHARACTER_OF_TAG_NAME emit '<' reconsume_in DATA @@ -411,15 +416,15 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = new_token Token(t: END_TAG) reconsume_in TAG_NAME of '>': - parse_error missing_end_tag_name + parse_error MISSING_END_TAG_NAME switch_state DATA of eof: - parse_error eof_before_tag_name + parse_error EOF_BEFORE_TAG_NAME emit '<' emit '/' emit_eof else: - parse_error invalid_first_character_of_tag_name + parse_error INVALID_FIRST_CHARACTER_OF_TAG_NAME new_token Token(t: COMMENT) reconsume_in BOGUS_COMMENT @@ -432,10 +437,10 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = emit_tok of AsciiUpperAlpha: tokenizer.tok.tagname &= c.tolower() of null: - parse_error unexpected_null_character + parse_error UNEXPECTED_NULL_CHARACTER tokenizer.tok.tagname &= Rune(0xFFFD) of eof: - parse_error eof_in_tag + parse_error EOF_IN_TAG emit_eof else: tokenizer.tok.tagname &= r @@ -611,10 +616,10 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = of '<': switch_state SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN of null: - parse_error unexpected_null_character + parse_error UNEXPECTED_NULL_CHARACTER emit_replacement of eof: - parse_error eof_in_script_html_comment_like_text + parse_error EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT emit_eof else: emit_current @@ -627,10 +632,10 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = of '<': switch_state SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN of null: - parse_error unexpected_null_character + parse_error UNEXPECTED_NULL_CHARACTER switch_state SCRIPT_DATA_ESCAPED of eof: - parse_error eof_in_script_html_comment_like_text + parse_error EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT emit_eof else: switch_state SCRIPT_DATA_ESCAPED @@ -646,10 +651,10 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = switch_state SCRIPT_DATA emit '>' of null: - parse_error unexpected_null_character + parse_error 
UNEXPECTED_NULL_CHARACTER switch_state SCRIPT_DATA_ESCAPED of eof: - parse_error eof_in_script_html_comment_like_text + parse_error EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT emit_eof else: switch_state SCRIPT_DATA_ESCAPED @@ -727,10 +732,10 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = switch_state SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN emit '<' of null: - parse_error unexpected_null_character + parse_error UNEXPECTED_NULL_CHARACTER emit_replacement of eof: - parse_error eof_in_script_html_comment_like_text + parse_error EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT emit_eof else: emit_current @@ -743,11 +748,11 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = switch_state SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN emit '<' of null: - parse_error unexpected_null_character + parse_error UNEXPECTED_NULL_CHARACTER switch_state SCRIPT_DATA_DOUBLE_ESCAPED emit_replacement of eof: - parse_error eof_in_script_html_comment_like_text + parse_error EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT emit_eof else: switch_state SCRIPT_DATA_DOUBLE_ESCAPED @@ -763,11 +768,11 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = switch_state SCRIPT_DATA emit '>' of null: - parse_error unexpected_null_character + parse_error UNEXPECTED_NULL_CHARACTER switch_state SCRIPT_DATA_DOUBLE_ESCAPED emit_replacement of eof: - parse_error eof_in_script_html_comment_like_text + parse_error EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT emit_eof else: switch_state SCRIPT_DATA_DOUBLE_ESCAPED @@ -798,7 +803,7 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = of AsciiWhitespace: discard of '/', '>', eof: reconsume_in AFTER_ATTRIBUTE_NAME of '=': - parse_error unexpected_equals_sign_before_attribute_name + parse_error UNEXPECTED_EQUALS_SIGN_BEFORE_ATTRIBUTE_NAME start_new_attribute switch_state ATTRIBUTE_NAME else: @@ -817,10 +822,10 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = of AsciiUpperAlpha: tokenizer.attrn &= c.tolower() of null: - parse_error unexpected_null_character + parse_error 
UNEXPECTED_NULL_CHARACTER tokenizer.attrn &= Rune(0xFFFD) of '"', '\'', '<': - parse_error unexpected_character_in_attribute_name + parse_error UNEXPECTED_CHARACTER_IN_ATTRIBUTE_NAME anything_else else: tokenizer.attrn &= r @@ -834,7 +839,7 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = switch_state DATA emit_tok of eof: - parse_error eof_in_tag + parse_error EOF_IN_TAG emit_eof else: start_new_attribute @@ -846,7 +851,7 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = of '"': switch_state ATTRIBUTE_VALUE_DOUBLE_QUOTED of '\'': switch_state ATTRIBUTE_VALUE_SINGLE_QUOTED of '>': - parse_error missing_attribute_value + parse_error MISSING_ATTRIBUTE_VALUE switch_state DATA emit '>' else: reconsume_in ATTRIBUTE_VALUE_UNQUOTED @@ -856,10 +861,10 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = of '"': switch_state AFTER_ATTRIBUTE_VALUE_QUOTED of '&': switch_state_return CHARACTER_REFERENCE of null: - parse_error unexpected_null_character + parse_error UNEXPECTED_NULL_CHARACTER append_to_current_attr_value Rune(0xFFFD) of eof: - parse_error eof_in_tag + parse_error EOF_IN_TAG emit_eof else: append_to_current_attr_value r @@ -868,10 +873,10 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = of '\'': switch_state AFTER_ATTRIBUTE_VALUE_QUOTED of '&': switch_state_return CHARACTER_REFERENCE of null: - parse_error unexpected_null_character + parse_error UNEXPECTED_NULL_CHARACTER append_to_current_attr_value Rune(0xFFFD) of eof: - parse_error eof_in_tag + parse_error EOF_IN_TAG emit_eof else: append_to_current_attr_value r @@ -883,13 +888,13 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = switch_state DATA emit_tok of null: - parse_error unexpected_null_character + parse_error UNEXPECTED_NULL_CHARACTER append_to_current_attr_value Rune(0xFFFD) of '"', '\'', '<', '=', '`': - parse_error unexpected_character_in_unquoted_attribute_value + parse_error UNEXPECTED_CHARACTER_IN_UNQUOTED_ATTRIBUTE_VALUE append_to_current_attr_value c of eof: - 
parse_error eof_in_tag + parse_error EOF_IN_TAG emit_eof else: append_to_current_attr_value r @@ -903,10 +908,10 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = switch_state DATA emit_tok of eof: - parse_error eof_in_tag + parse_error EOF_IN_TAG emit_eof else: - parse_error missing_whitespace_between_attributes + parse_error MISSING_WHITESPACE_BETWEEN_ATTRIBUTES reconsume_in BEFORE_ATTRIBUTE_NAME of SELF_CLOSING_START_TAG: @@ -916,10 +921,10 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = switch_state DATA emit_tok of eof: - parse_error eof_in_tag + parse_error EOF_IN_TAG emit_eof else: - parse_error unexpected_solidus_in_tag + parse_error UNEXPECTED_SOLIDUS_IN_TAG reconsume_in BEFORE_ATTRIBUTE_NAME of BOGUS_COMMENT: @@ -931,7 +936,7 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = of eof: emit_tok emit_eof - of null: parse_error unexpected_null_character + of null: parse_error UNEXPECTED_NULL_CHARACTER else: tokenizer.tok.data &= r of MARKUP_DECLARATION_OPEN: # note: rewritten to fit case model as we consume a char anyway @@ -954,12 +959,12 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = if tokenizer.hasnonhtml: switch_state CDATA_SECTION else: - parse_error cdata_in_html_content + parse_error CDATA_IN_HTML_CONTENT new_token Token(t: COMMENT, data: "[CDATA[") switch_state BOGUS_COMMENT else: anything_else else: - parse_error incorrectly_opened_comment + parse_error INCORRECTLY_OPENED_COMMENT new_token Token(t: COMMENT) reconsume_in BOGUS_COMMENT @@ -967,7 +972,7 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = case c of '-': switch_state COMMENT_START_DASH of '>': - parse_error abrupt_closing_of_empty_comment + parse_error ABRUPT_CLOSING_OF_EMPTY_COMMENT switch_state DATA emit_tok else: reconsume_in COMMENT @@ -976,11 +981,11 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = case c of '-': switch_state COMMENT_END of '>': - parse_error abrupt_closing_of_empty_comment + parse_error ABRUPT_CLOSING_OF_EMPTY_COMMENT 
switch_state DATA emit_tok of eof: - parse_error eof_in_comment + parse_error EOF_IN_COMMENT emit_tok emit_eof else: @@ -994,10 +999,10 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = switch_state COMMENT_LESS_THAN_SIGN of '-': switch_state COMMENT_END_DASH of null: - parse_error unexpected_null_character + parse_error UNEXPECTED_NULL_CHARACTER tokenizer.tok.data &= Rune(0xFFFD) of eof: - parse_error eof_in_comment + parse_error EOF_IN_COMMENT emit_tok emit_eof else: tokenizer.tok.data &= r @@ -1024,14 +1029,14 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = case c of '>', eof: reconsume_in COMMENT_END else: - parse_error nested_comment + parse_error NESTED_COMMENT reconsume_in COMMENT_END of COMMENT_END_DASH: case c of '-': switch_state COMMENT_END of eof: - parse_error eof_in_comment + parse_error EOF_IN_COMMENT emit_tok emit_eof else: @@ -1044,7 +1049,7 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = of '!': switch_state COMMENT_END_BANG of '-': tokenizer.tok.data &= '-' of eof: - parse_error eof_in_comment + parse_error EOF_IN_COMMENT emit_tok emit_eof else: @@ -1057,11 +1062,11 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = tokenizer.tok.data &= "--!" 
switch_state COMMENT_END_DASH of '>': - parse_error incorrectly_closed_comment + parse_error INCORRECTLY_CLOSED_COMMENT switch_state DATA emit_tok of eof: - parse_error eof_in_comment + parse_error EOF_IN_COMMENT emit_tok emit_eof else: @@ -1073,12 +1078,12 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = of AsciiWhitespace: switch_state BEFORE_DOCTYPE_NAME of '>': reconsume_in BEFORE_DOCTYPE_NAME of eof: - parse_error eof_in_doctype + parse_error EOF_IN_DOCTYPE new_token Token(t: DOCTYPE, quirks: true) emit_tok emit_eof else: - parse_error missing_whitespace_before_doctype_name + parse_error MISSING_WHITESPACE_BEFORE_DOCTYPE_NAME reconsume_in BEFORE_DOCTYPE_NAME of BEFORE_DOCTYPE_NAME: @@ -1088,15 +1093,15 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = new_token Token(t: DOCTYPE, name: some($c.tolower())) switch_state DOCTYPE_NAME of null: - parse_error unexpected_null_character + parse_error UNEXPECTED_NULL_CHARACTER new_token Token(t: DOCTYPE, name: some($Rune(0xFFFD))) of '>': - parse_error missing_doctype_name + parse_error MISSING_DOCTYPE_NAME new_token Token(t: DOCTYPE, quirks: true) switch_state DATA emit_tok of eof: - parse_error eof_in_doctype + parse_error EOF_IN_DOCTYPE new_token Token(t: DOCTYPE, quirks: true) emit_tok emit_eof @@ -1113,10 +1118,10 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = of AsciiUpperAlpha: tokenizer.tok.name.get &= c.tolower() of null: - parse_error unexpected_null_character + parse_error UNEXPECTED_NULL_CHARACTER tokenizer.tok.name.get &= Rune(0xFFFD) of eof: - parse_error eof_in_doctype + parse_error EOF_IN_DOCTYPE tokenizer.tok.quirks = true emit_tok emit_eof @@ -1131,7 +1136,7 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = switch_state DATA emit_tok of eof: - parse_error eof_in_doctype + parse_error EOF_IN_DOCTYPE tokenizer.tok.quirks = true emit_tok emit_eof @@ -1148,7 +1153,7 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = else: anything_else else: - parse_error 
invalid_character_sequence_after_doctype_name + parse_error INVALID_CHARACTER_SEQUENCE_AFTER_DOCTYPE_NAME tokenizer.tok.quirks = true reconsume_in BOGUS_DOCTYPE @@ -1156,21 +1161,21 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = case c of AsciiWhitespace: switch_state BEFORE_DOCTYPE_PUBLIC_IDENTIFIER of '"': - parse_error missing_whitespace_after_doctype_public_keyword + parse_error MISSING_WHITESPACE_AFTER_DOCTYPE_PUBLIC_KEYWORD tokenizer.tok.pubid = some("") switch_state DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED of '>': - parse_error missing_doctype_public_identifier + parse_error MISSING_DOCTYPE_PUBLIC_IDENTIFIER tokenizer.tok.quirks = true switch_state DATA emit_tok of eof: - parse_error eof_in_doctype + parse_error EOF_IN_DOCTYPE tokenizer.tok.quirks = true emit_tok emit_eof else: - parse_error missing_quote_before_doctype_public_identifier + parse_error MISSING_QUOTE_BEFORE_DOCTYPE_PUBLIC_IDENTIFIER tokenizer.tok.quirks = true reconsume_in BOGUS_DOCTYPE @@ -1184,17 +1189,17 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = tokenizer.tok.pubid = some("") switch_state DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED of '>': - parse_error missing_doctype_public_identifier + parse_error MISSING_DOCTYPE_PUBLIC_IDENTIFIER tokenizer.tok.quirks = true switch_state DATA emit_tok of eof: - parse_error eof_in_doctype + parse_error EOF_IN_DOCTYPE tokenizer.tok.quirks = true emit_tok emit_eof else: - parse_error missing_quote_before_doctype_public_identifier + parse_error MISSING_QUOTE_BEFORE_DOCTYPE_PUBLIC_IDENTIFIER tokenizer.tok.quirks = true reconsume_in BOGUS_DOCTYPE @@ -1202,15 +1207,15 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = case c of '"': switch_state AFTER_DOCTYPE_PUBLIC_IDENTIFIER of null: - parse_error unexpected_null_character + parse_error UNEXPECTED_NULL_CHARACTER tokenizer.tok.pubid.get &= Rune(0xFFFD) of '>': - parse_error abrupt_doctype_public_identifier + parse_error ABRUPT_DOCTYPE_PUBLIC_IDENTIFIER tokenizer.tok.quirks = true 
switch_state DATA emit_tok of eof: - parse_error eof_in_doctype + parse_error EOF_IN_DOCTYPE tokenizer.tok.quirks = true emit_tok emit_eof @@ -1221,15 +1226,15 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = case c of '\'': switch_state AFTER_DOCTYPE_PUBLIC_IDENTIFIER of null: - parse_error unexpected_null_character + parse_error UNEXPECTED_NULL_CHARACTER tokenizer.tok.pubid.get &= Rune(0xFFFD) of '>': - parse_error abrupt_doctype_public_identifier + parse_error ABRUPT_DOCTYPE_PUBLIC_IDENTIFIER tokenizer.tok.quirks = true switch_state DATA emit_tok of eof: - parse_error eof_in_doctype + parse_error EOF_IN_DOCTYPE tokenizer.tok.quirks = true emit_tok emit_eof @@ -1243,20 +1248,20 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = switch_state DATA emit_tok of '"': - parse_error missing_whitespace_between_doctype_public_and_system_identifiers + parse_error MISSING_WHITESPACE_BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS tokenizer.tok.sysid = some("") switch_state DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED of '\'': - parse_error missing_whitespace_between_doctype_public_and_system_identifiers + parse_error MISSING_WHITESPACE_BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS tokenizer.tok.sysid = some("") switch_state DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED of eof: - parse_error eof_in_doctype + parse_error EOF_IN_DOCTYPE tokenizer.tok.quirks = true emit_tok emit_eof else: - parse_error missing_quote_before_doctype_system_identifier + parse_error MISSING_QUOTE_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER tokenizer.tok.quirks = true reconsume_in BOGUS_DOCTYPE @@ -1273,12 +1278,12 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = tokenizer.tok.sysid = some("") switch_state DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED of eof: - parse_error eof_in_doctype + parse_error EOF_IN_DOCTYPE tokenizer.tok.quirks = true emit_tok emit_eof else: - parse_error missing_quote_before_doctype_system_identifier + parse_error MISSING_QUOTE_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER tokenizer.tok.quirks 
= true reconsume_in BOGUS_DOCTYPE @@ -1286,25 +1291,25 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = case c of AsciiWhitespace: switch_state BEFORE_DOCTYPE_SYSTEM_IDENTIFIER of '"': - parse_error missing_whitespace_after_doctype_system_keyword + parse_error MISSING_WHITESPACE_AFTER_DOCTYPE_SYSTEM_KEYWORD tokenizer.tok.sysid = some("") switch_state DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED of '\'': - parse_error missing_whitespace_after_doctype_system_keyword + parse_error MISSING_WHITESPACE_AFTER_DOCTYPE_SYSTEM_KEYWORD tokenizer.tok.sysid = some("") switch_state DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED of '>': - parse_error missing_doctype_system_identifier + parse_error MISSING_DOCTYPE_SYSTEM_IDENTIFIER tokenizer.tok.quirks = true switch_state DATA emit_tok of eof: - parse_error eof_in_doctype + parse_error EOF_IN_DOCTYPE tokenizer.tok.quirks = true emit_tok emit_eof else: - parse_error missing_quote_before_doctype_system_identifier + parse_error MISSING_QUOTE_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER tokenizer.tok.quirks = true reconsume_in BOGUS_DOCTYPE @@ -1318,17 +1323,17 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = tokenizer.tok.pubid = some("") switch_state DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED of '>': - parse_error missing_doctype_system_identifier + parse_error MISSING_DOCTYPE_SYSTEM_IDENTIFIER tokenizer.tok.quirks = true switch_state DATA emit_tok of eof: - parse_error eof_in_doctype + parse_error EOF_IN_DOCTYPE tokenizer.tok.quirks = true emit_tok emit_eof else: - parse_error missing_quote_before_doctype_system_identifier + parse_error MISSING_QUOTE_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER tokenizer.tok.quirks = true reconsume_in BOGUS_DOCTYPE @@ -1336,15 +1341,15 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = case c of '"': switch_state AFTER_DOCTYPE_SYSTEM_IDENTIFIER of null: - parse_error unexpected_null_character + parse_error UNEXPECTED_NULL_CHARACTER tokenizer.tok.sysid.get &= Rune(0xFFFD) of '>': - parse_error 
abrupt_doctype_system_identifier + parse_error ABRUPT_DOCTYPE_SYSTEM_IDENTIFIER tokenizer.tok.quirks = true switch_state DATA emit_tok of eof: - parse_error eof_in_doctype + parse_error EOF_IN_DOCTYPE tokenizer.tok.quirks = true emit_tok emit_eof @@ -1355,15 +1360,15 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = case c of '\'': switch_state AFTER_DOCTYPE_SYSTEM_IDENTIFIER of null: - parse_error unexpected_null_character + parse_error UNEXPECTED_NULL_CHARACTER tokenizer.tok.sysid.get &= Rune(0xFFFD) of '>': - parse_error abrupt_doctype_system_identifier + parse_error ABRUPT_DOCTYPE_SYSTEM_IDENTIFIER tokenizer.tok.quirks = true switch_state DATA emit_tok of eof: - parse_error eof_in_doctype + parse_error EOF_IN_DOCTYPE tokenizer.tok.quirks = true emit_tok emit_eof @@ -1377,12 +1382,12 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = switch_state DATA emit_tok of eof: - parse_error eof_in_doctype + parse_error EOF_IN_DOCTYPE tokenizer.tok.quirks = true emit_tok emit_eof else: - parse_error unexpected_character_after_doctype_system_identifier + parse_error UNEXPECTED_CHARACTER_AFTER_DOCTYPE_SYSTEM_IDENTIFIER reconsume_in BOGUS_DOCTYPE of BOGUS_DOCTYPE: @@ -1390,7 +1395,7 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = of '>': switch_state DATA emit_tok - of null: parse_error unexpected_null_character + of null: parse_error UNEXPECTED_NULL_CHARACTER of eof: emit_tok emit_eof @@ -1400,7 +1405,7 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = case c of ']': switch_state CDATA_SECTION_BRACKET of eof: - parse_error eof_in_cdata + parse_error EOF_IN_CDATA emit_eof else: emit_current @@ -1458,7 +1463,7 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = switch_state tokenizer.rstate else: if tokenizer.tmp[^1] != ';': - parse_error missing_semicolon_after_character_reference_parse_error + parse_error MISSING_SEMICOLON_AFTER_CHARACTER_REFERENCE tokenizer.tmp = value.get flush_code_points_consumed_as_a_character_reference switch_state 
tokenizer.rstate @@ -1474,7 +1479,7 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = else: emit_current of ';': - parse_error unknown_named_character_reference + parse_error UNKNOWN_NAMED_CHARACTER_REFERENCE reconsume_in tokenizer.rstate else: reconsume_in tokenizer.rstate @@ -1490,7 +1495,7 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = case c of AsciiHexDigit: reconsume_in HEXADECIMAL_CHARACTER_REFERENCE else: - parse_error absence_of_digits_in_numeric_character_reference + parse_error ABSENCE_OF_DIGITS_IN_NUMERIC_CHARACTER_REFERENCE flush_code_points_consumed_as_a_character_reference reconsume_in tokenizer.rstate @@ -1498,7 +1503,7 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = case c of AsciiDigit: reconsume_in DECIMAL_CHARACTER_REFERENCE else: - parse_error absence_of_digits_in_numeric_character_reference + parse_error ABSENCE_OF_DIGITS_IN_NUMERIC_CHARACTER_REFERENCE flush_code_points_consumed_as_a_character_reference reconsume_in tokenizer.rstate @@ -1509,7 +1514,7 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = tokenizer.code += hexValue(c) of ';': switch_state NUMERIC_CHARACTER_REFERENCE_END else: - parse_error missing_semicolon_after_character_reference + parse_error MISSING_SEMICOLON_AFTER_CHARACTER_REFERENCE reconsume_in NUMERIC_CHARACTER_REFERENCE_END of DECIMAL_CHARACTER_REFERENCE: @@ -1519,23 +1524,23 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = tokenizer.code += decValue(c) of ';': switch_state NUMERIC_CHARACTER_REFERENCE_END else: - parse_error missing_semicolon_after_character_reference + parse_error MISSING_SEMICOLON_AFTER_CHARACTER_REFERENCE reconsume_in NUMERIC_CHARACTER_REFERENCE_END of NUMERIC_CHARACTER_REFERENCE_END: ignore_eof # we reconsume anyway case tokenizer.code of 0x00: - parse_error null_character_reference + parse_error NULL_CHARACTER_REFERENCE tokenizer.code = 0xFFFD elif tokenizer.code > 0x10FFFF: - parse_error character_reference_outside_unicode_range + parse_error 
CHARACTER_REFERENCE_OUTSIDE_UNICODE_RANGE tokenizer.code = 0xFFFD elif Rune(tokenizer.code).isSurrogate(): - parse_error surrogate_character_reference + parse_error SURROGATE_CHARACTER_REFERENCE tokenizer.code = 0xFFFD elif Rune(tokenizer.code).isNonCharacter(): - parse_error noncharacter_character_reference + parse_error NONCHARACTER_CHARACTER_REFERENCE # do nothing elif tokenizer.code in 0..255 and char(tokenizer.code) in ((Controls - AsciiWhitespace) + {chr(0x0D)}): const ControlMapTable = [ |