author    | bptato <nincsnevem662@gmail.com> | 2022-12-10 19:05:38 +0100
committer | bptato <nincsnevem662@gmail.com> | 2022-12-10 19:05:38 +0100
commit    | 1e858c874804444bc4b95b6e89eb96a0deb8473c (patch)
tree      | 3151b498e19c6d6eed3d90827483eb270314f3da /src/html
parent    | d963385cd9fd77f0a950c5b92be7774bbf76d661 (diff)
download  | chawan-1e858c874804444bc4b95b6e89eb96a0deb8473c.tar.gz
Add support for the encoding standard, fix parseLegacyColor
Also, fix a bug in the
Diffstat (limited to 'src/html')
-rw-r--r-- | src/html/htmlparser.nim    | 112
-rw-r--r-- | src/html/htmltokenizer.nim | 241
2 files changed, 233 insertions(+), 120 deletions(-)
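
The headline change in the diff below is encoding support in parseHTML5: when the caller does not pass a charset, the parser BOM-sniffs the input, falls back to a caller-supplied default, and wraps the stream in a DecoderStream. As a reference point, here is a minimal standalone sketch of that three-way BOM check over raw bytes; detectBom and the string charset labels are illustrative only, not chawan's API (the real code works on a Stream and the Charset enum).

```nim
# Minimal BOM sniff, mirroring the order of checks in the patched
# parseHTML5 (UTF-16 BE, UTF-16 LE, UTF-8). Hypothetical helper for
# illustration: returns a charset label and the number of BOM bytes
# to skip, or ("", 0) when no BOM is present.
proc detectBom(data: string): (string, int) =
  if data.len >= 2:
    if data[0] == '\xFE' and data[1] == '\xFF':
      return ("UTF-16BE", 2)
    if data[0] == '\xFF' and data[1] == '\xFE':
      return ("UTF-16LE", 2)
  if data.len >= 3 and data[0] == '\xEF' and data[1] == '\xBB' and data[2] == '\xBF':
    return ("UTF-8", 3)
  return ("", 0)

when isMainModule:
  doAssert detectBom("\xEF\xBB\xBFhello") == ("UTF-8", 3)
  doAssert detectBom("\xFF\xFEh\x00i\x00") == ("UTF-16LE", 2)
  doAssert detectBom("plain ascii") == ("", 0)
```

The patch additionally prepends any non-BOM bytes it has already read back onto the DecoderStream; the sketch sidesteps that by taking the whole prefix as a string.
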
diff --git a/src/html/htmlparser.nim b/src/html/htmlparser.nim index 65da5c4e..ce8ce2bb 100644 --- a/src/html/htmlparser.nim +++ b/src/html/htmlparser.nim @@ -3,17 +3,23 @@ import options import sequtils import streams import strformat +import strutils import tables import unicode import css/sheet +import data/charset import html/dom import html/tags import html/htmltokenizer import js/javascript +import strings/decoderstream import utils/twtstr type + CharsetConfidence = enum + CONFIDENCE_TENTATIVE, CONFIDENCE_CERTAIN, CONFIDENCE_IRRELEVANT + DOMParser = ref object # JS interface OpenElements = seq[Element] @@ -22,6 +28,9 @@ type case fragment: bool of true: ctx: Element else: discard + needsreinterpret: bool + charset: Charset + confidence: CharsetConfidence openElements: OpenElements insertionMode: InsertionMode oldInsertionMode: InsertionMode @@ -548,6 +557,54 @@ template pop_current_node = discard parser.popElement() func isHTMLIntegrationPoint(node: Element): bool = return false #TODO SVG (NOTE MathML not implemented) +func extractEncFromMeta(s: string): Charset = + var i = 0 + while true: # Loop: + var j = 0 + while i < s.len: + template check(c: static char) = + if s[i] in {c, c.toUpperAscii()}: inc j + else: j = 0 + case j + of 0: check 'c' + of 1: check 'h' + of 2: check 'a' + of 3: check 'r' + of 4: check 's' + of 5: check 'e' + of 6: check 't' + of 7: + inc i + break + else: discard + inc i + if j < 7: return CHARSET_UNKNOWN + j = 0 + while i < s.len and s[i] in AsciiWhitespace: inc i + if i >= s.len or s[i] != '=': continue + while i < s.len and s[i] in AsciiWhitespace: inc i + break + if i >= s.len: return CHARSET_UNKNOWN + if s[i] in {'"', '\''}: + let s2 = s.substr(i).until(s[i]) + if s2.len == 0 or s2[^1] != s[i]: + return CHARSET_UNKNOWN + return getCharset(s2) + return getCharset(s.substr(i - 1).until({';', ' '})) + +proc changeEncoding(parser: var HTML5Parser, cs: Charset) = + if parser.charset in {CHARSET_UTF_16_LE, CHARSET_UTF_16_BE}: + parser.confidence = CONFIDENCE_CERTAIN + return + parser.confidence = CONFIDENCE_CERTAIN + if cs == parser.charset: + return + if cs == CHARSET_X_USER_DEFINED: + parser.charset = CHARSET_WINDOWS_1252 + else: + parser.charset = cs + parser.needsreinterpret = true + # Following is an implementation of the state (?) machine defined in # https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inhtml # It uses the ad-hoc pattern matching macro `match' to apply the following @@ -562,7 +619,7 @@ func isHTMLIntegrationPoint(node: Element): bool = # pseudo-goto by breaking out only when the else statement needn't be # executed. # -# e.g. 
the following code: +# For example, the following code: # # match token: # TokenType.COMMENT => (block: echo "comment") @@ -644,7 +701,7 @@ macro match(token: Token, body: typed): untyped = ofBranches[CHARACTER_ASCII].painted = true else: error fmt"Unsupported curly of kind {pattern[0].kind}" of nnkStrLit: - var tempTokenizer = newTokenizer(newStringStream(pattern.strVal)) + var tempTokenizer = newTokenizer(pattern.strVal) for token in tempTokenizer.tokenize: let tt = int(token.tagtype) case token.t @@ -811,9 +868,16 @@ proc processInHTMLContent(parser: var HTML5Parser, token: Token, insertionMode = pop_current_node ) "<meta>" => (block: - discard parser.insertHTMLElement(token) + let element = parser.insertHTMLElement(token) pop_current_node - #TODO encodings + if parser.confidence == CONFIDENCE_TENTATIVE: + let cs = getCharset(element.attr("charset")) + if cs != CHARSET_UNKNOWN: + parser.changeEncoding(cs) + elif element.attr("http-equiv").equalsIgnoreCase("Content-Type"): + let cs = extractEncFromMeta(element.attr("content")) + if cs != CHARSET_UNKNOWN: + parser.changeEncoding(cs) ) "<title>" => (block: parser.genericRCDATAElementParsingAlgorithm(token)) "<noscript>" => (block: @@ -2092,17 +2156,48 @@ proc constructTree(parser: var HTML5Parser): Document = parser.processInHTMLContent(token) else: parser.processInForeignContent(token) + if parser.needsreinterpret: + return nil #TODO document.write (?) #TODO etc etc... return parser.document -proc parseHTML5*(inputStream: Stream): Document = +proc parseHTML5*(inputStream: Stream, cs = none(Charset), fallbackcs = CHARSET_UTF_8): (Document, Charset) = var parser: HTML5Parser + var bom: string + if cs.isSome: + parser.charset = cs.get + parser.confidence = CONFIDENCE_CERTAIN + else: + # bom sniff + const u8bom = char(0xEF) & char(0xBB) & char(0xBF) + const bebom = char(0xFE) & char(0xFF) + const lebom = char(0xFF) & char(0xFE) + bom = inputStream.readStr(2) + if bom == bebom: + parser.charset = CHARSET_UTF_16_BE + parser.confidence = CONFIDENCE_CERTAIN + bom = "" + elif bom == lebom: + parser.charset = CHARSET_UTF_16_LE + parser.confidence = CONFIDENCE_CERTAIN + bom = "" + else: + bom &= inputStream.readChar() + if bom == u8bom: + parser.charset = CHARSET_UTF_8 + parser.confidence = CONFIDENCE_CERTAIN + bom = "" + else: + parser.charset = fallbackcs + let decoder = newDecoderStream(inputStream, parser.charset) + for c in bom: + decoder.prepend(cast[uint32](c)) parser.document = newDocument() - parser.tokenizer = inputStream.newTokenizer() - return parser.constructTree() + parser.tokenizer = newTokenizer(decoder) + return (parser.constructTree(), parser.charset) proc newDOMParser*(): DOMParser {.jsctor.} = new(result) @@ -2110,7 +2205,8 @@ proc newDOMParser*(): DOMParser {.jsctor.} = proc parseFromString*(parser: DOMParser, str: string, t: string): Document {.jserr, jsfunc.} = case t of "text/html": - return parseHTML5(newStringStream(str)) + let (res, _) = parseHTML5(newStringStream(str)) + return res of "text/xml", "application/xml", "application/xhtml+xml", "image/svg+xml": JS_THROW JS_InternalError, "XML parsing is not supported yet" else: diff --git a/src/html/htmltokenizer.nim b/src/html/htmltokenizer.nim index c8f96144..08779c24 100644 --- a/src/html/htmltokenizer.nim +++ b/src/html/htmltokenizer.nim @@ -1,5 +1,4 @@ import options -import streams import strformat import strutils import macros @@ -8,6 +7,7 @@ import unicode import html/entity import html/tags +import strings/decoderstream import utils/radixtree import utils/twtstr @@ 
-16,7 +16,6 @@ type Tokenizer* = object state*: TokenizerState rstate: TokenizerState - curr: Rune tmp: string code: int tok: Token @@ -25,10 +24,9 @@ type attrv: string attr: bool - istream: Stream - sbuf: string + decoder: DecoderStream + sbuf: seq[Rune] sbuf_i: int - sbuf_ip: int eof_i: int TokenType* = enum @@ -97,65 +95,67 @@ func `$`*(tok: Token): string = of COMMENT: fmt"{tok.t} {tok.data}" of EOF: fmt"{tok.t}" -const bufSize = 4096 -const copyBufSize = 16 -proc newTokenizer*(s: Stream): Tokenizer = - result.sbuf = newString(bufSize) - result.istream = s - result.eof_i = -1 - if result.istream.atEnd: - result.eof_i = 0 - else: - let n = s.readDataStr(result.sbuf, 0..bufSize-1) - if n != bufSize: - result.eof_i = n +const bufLen = 1024 # * 4096 bytes +const copyBufLen = 16 # * 64 bytes + +proc readn(t: var Tokenizer) = + let l = t.sbuf.len + t.sbuf.setLen(bufLen) + let n = t.decoder.readData(addr t.sbuf[l], (bufLen - l) * sizeof(Rune)) + t.sbuf.setLen(l + n div sizeof(Rune)) + if t.decoder.atEnd: + t.eof_i = t.sbuf.len + +proc newTokenizer*(s: DecoderStream): Tokenizer = + var t = Tokenizer( + decoder: s, + sbuf: newSeqOfCap[Rune](bufLen), + eof_i: -1, + sbuf_i: 0 + ) + t.readn() + return t + +proc newTokenizer*(s: string): Tokenizer = + let rs = s.toRunes() + var t = Tokenizer( + sbuf: rs, + eof_i: rs.len, + sbuf_i: 0 + ) + return t func atEof(t: Tokenizer): bool = t.eof_i != -1 and t.sbuf_i >= t.eof_i -proc consume(t: var Tokenizer): char {.inline.} = - if t.eof_i == -1 and t.sbuf_i >= bufSize-copyBufSize: - # Workaround to swap buffer without breaking fastRuneAt. - var sbuf2 = newString(copyBufSize) - var i = 0 - while t.sbuf_i + i < bufSize: - sbuf2[i] = t.sbuf[t.sbuf_i + i] - inc i - let n = t.istream.readDataStr(t.sbuf, i..bufSize-1) - if n != bufSize - i: - t.eof_i = i + n +proc consume(t: var Tokenizer): Rune = + if t.sbuf_i >= min(bufLen - copyBufLen, t.sbuf.len): + for i in t.sbuf_i ..< t.sbuf.len: + t.sbuf[i - t.sbuf_i] = t.sbuf[i] + t.sbuf.setLen(t.sbuf.len - t.sbuf_i) t.sbuf_i = 0 - - var j = 0 - while j < i: - t.sbuf[j] = sbuf2[j] - inc j - - assert t.eof_i == -1 or t.sbuf_i < t.eof_i # not consuming eof... 
- t.sbuf_ip = t.sbuf_i # save previous pointer for potential reconsume - - # Normalize newlines (\r\n -> \n, single \r -> \n) - if t.sbuf[t.sbuf_i] == '\r': + if t.sbuf.len < bufLen: + t.readn() + ## Normalize newlines (\r\n -> \n, single \r -> \n) + if t.sbuf[t.sbuf_i] == Rune('\r'): inc t.sbuf_i - if t.sbuf[t.sbuf_i] != '\n': + if t.sbuf[t.sbuf_i] != Rune('\n'): # \r - result = '\n' - t.curr = Rune('\n') + result = Rune('\n') return # else, \r\n so just return the \n - result = t.sbuf[t.sbuf_i] - fastRuneAt(t.sbuf, t.sbuf_i, t.curr) + inc t.sbuf_i proc reconsume(t: var Tokenizer) = - t.sbuf_i = t.sbuf_ip + dec t.sbuf_i iterator tokenize*(tokenizer: var Tokenizer): Token = template emit(tok: Token) = if tok.t == START_TAG: tokenizer.laststart = tok if tok.t in {START_TAG, END_TAG}: - tok.tagtype = tagType(tok.tagName) + tok.tagtype = tagType(tok.tagname) yield tok template emit(tok: TokenType) = emit Token(t: tok) template emit(rn: Rune) = emit Token(t: CHARACTER, r: rn) @@ -173,7 +173,7 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = elif c in Ascii: emit c else: - emit tokenizer.curr + emit r template emit_replacement = emit Rune(0xFFFD) template switch_state(s: TokenizerState) = tokenizer.state = s @@ -199,23 +199,40 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = if tokenizer.attr: tokenizer.attrv &= c template peek_str(s: string): bool = - # WARNING: will break on strings with copyBufSize + 4 bytes - assert s.len < copyBufSize - 4 and s.len > 0 - if tokenizer.sbuf_i + s.len > tokenizer.eof_i: + # WARNING: will break on strings with copyBufLen + 4 bytes + # WARNING: only works with ascii + assert s.len < copyBufLen - 4 and s.len > 0 + if tokenizer.eof_i != -1 and tokenizer.sbuf_i + s.len >= tokenizer.eof_i: false else: - let slice = tokenizer.sbuf[tokenizer.sbuf_i..tokenizer.sbuf_i+s.high] - s == slice + var b = true + for i in 0 ..< s.len: + let c = tokenizer.sbuf[tokenizer.sbuf_i + i] + if not c.isAscii() or cast[char](c) != s[i]: + b = false + break + b + template peek_str_nocase(s: string): bool = - # WARNING: will break on strings with copyBufSize + 4 bytes + # WARNING: will break on strings with copyBufLen + 4 bytes # WARNING: only works with UPPER CASE ascii - assert s.len < copyBufSize - 4 and s.len > 0 - if tokenizer.sbuf_i + s.len > tokenizer.eof_i: + assert s.len < copyBufLen - 4 and s.len > 0 + if tokenizer.eof_i != -1 and tokenizer.sbuf_i + s.len >= tokenizer.eof_i: false else: - let slice = tokenizer.sbuf[tokenizer.sbuf_i..tokenizer.sbuf_i+s.high] - s == slice.toUpperAscii() - template peek_char(): char = tokenizer.sbuf[tokenizer.sbuf_i] + var b = true + for i in 0 ..< s.len: + let c = tokenizer.sbuf[tokenizer.sbuf_i + i] + if not c.isAscii() or cast[char](c).toUpperAscii() != s[i]: + b = false + break + b + template peek_char(): char = + let r = tokenizer.sbuf[tokenizer.sbuf_i] + if r.isAscii(): + cast[char](r) + else: + char(128) template has_adjusted_current_node(): bool = false #TODO implement this template consume_and_discard(n: int) = #TODO optimize var i = 0 @@ -298,17 +315,17 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = template has_anything_else = discard # does nothing const null = char(0) - const whitespace = {'\t', '\n', '\f', ' '} while true: {.computedGoto.} #eprint tokenizer.state #debug let is_eof = tokenizer.atEof # set eof here, otherwise we would exit at the last character - let c = if not is_eof: + let r = if not is_eof: tokenizer.consume() else: # avoid consuming eof... 
- null + Rune(null) + let c = if r.isAscii(): cast[char](r) else: char(128) stateMachine: # => case tokenizer.state of DATA: case c @@ -394,19 +411,19 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = of TAG_NAME: case c - of whitespace: switch_state BEFORE_ATTRIBUTE_NAME + of AsciiWhitespace: switch_state BEFORE_ATTRIBUTE_NAME of '/': switch_state SELF_CLOSING_START_TAG of '>': switch_state DATA emit_tok - of AsciiUpperAlpha: tokenizer.tok.tagname &= char(tokenizer.curr).tolower() + of AsciiUpperAlpha: tokenizer.tok.tagname &= c.tolower() of null: parse_error unexpected_null_character tokenizer.tok.tagname &= Rune(0xFFFD) of eof: parse_error eof_in_tag emit_eof - else: tokenizer.tok.tagname &= tokenizer.curr + else: tokenizer.tok.tagname &= r of RCDATA_LESS_THAN_SIGN: case c @@ -430,7 +447,7 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = of RCDATA_END_TAG_NAME: has_anything_else case c - of whitespace: + of AsciiWhitespace: if is_appropriate_end_tag_token: switch_state BEFORE_ATTRIBUTE_NAME else: @@ -447,8 +464,8 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = else: anything_else of AsciiAlpha: # note: merged upper & lower - tokenizer.tok.tagname &= char(tokenizer.curr).tolower() - tokenizer.tmp &= tokenizer.curr + tokenizer.tok.tagname &= c.tolower() + tokenizer.tmp &= c else: new_token nil #TODO emit '<' @@ -478,7 +495,7 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = of RAWTEXT_END_TAG_NAME: has_anything_else case c - of whitespace: + of AsciiWhitespace: if is_appropriate_end_tag_token: switch_state BEFORE_ATTRIBUTE_NAME else: @@ -495,8 +512,8 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = else: anything_else of AsciiAlpha: # note: merged upper & lower - tokenizer.tok.tagname &= char(tokenizer.curr).tolower() - tokenizer.tmp &= tokenizer.curr + tokenizer.tok.tagname &= c.tolower() + tokenizer.tmp &= c else: new_token nil #TODO emit '<' @@ -531,7 +548,7 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = of SCRIPT_DATA_END_TAG_NAME: has_anything_else case c - of whitespace: + of AsciiWhitespace: if is_appropriate_end_tag_token: switch_state BEFORE_ATTRIBUTE_NAME else: @@ -548,8 +565,8 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = else: anything_else of AsciiAlpha: # note: merged upper & lower - tokenizer.tok.tagname &= char(tokenizer.curr).tolower() - tokenizer.tmp &= tokenizer.curr + tokenizer.tok.tagname &= c.tolower() + tokenizer.tmp &= c else: emit '<' emit '/' @@ -650,7 +667,7 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = of SCRIPT_DATA_ESCAPED_END_TAG_NAME: has_anything_else case c - of whitespace: + of AsciiWhitespace: if is_appropriate_end_tag_token: switch_state BEFORE_ATTRIBUTE_NAME else: @@ -666,8 +683,8 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = else: anything_else of AsciiAlpha: - tokenizer.tok.tagname &= char(tokenizer.curr).tolower() - tokenizer.tmp &= tokenizer.curr + tokenizer.tok.tagname &= c.tolower() + tokenizer.tmp &= c else: emit '<' emit '/' @@ -676,7 +693,7 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = of SCRIPT_DATA_DOUBLE_ESCAPE_START: case c - of whitespace, '/', '>': + of AsciiWhitespace, '/', '>': if tokenizer.tmp == "script": switch_state SCRIPT_DATA_DOUBLE_ESCAPED else: @@ -750,7 +767,7 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = of SCRIPT_DATA_DOUBLE_ESCAPE_END: case c - of whitespace, '/', '>': + of AsciiWhitespace, '/', '>': if tokenizer.tmp == "script": switch_state SCRIPT_DATA_ESCAPED else: @@ -764,7 +781,7 @@ iterator tokenize*(tokenizer: var 
Tokenizer): Token = of BEFORE_ATTRIBUTE_NAME: case c - of whitespace: discard + of AsciiWhitespace: discard of '/', '>', eof: reconsume_in AFTER_ATTRIBUTE_NAME of '=': parse_error unexpected_equals_sign_before_attribute_name @@ -777,7 +794,7 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = of ATTRIBUTE_NAME: has_anything_else case c - of whitespace, '/', '>', eof: + of AsciiWhitespace, '/', '>', eof: leave_attribute_name_state reconsume_in AFTER_ATTRIBUTE_NAME of '=': @@ -792,11 +809,11 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = parse_error unexpected_character_in_attribute_name anything_else else: - tokenizer.attrn &= tokenizer.curr + tokenizer.attrn &= r of AFTER_ATTRIBUTE_NAME: case c - of whitespace: discard + of AsciiWhitespace: discard of '/': switch_state SELF_CLOSING_START_TAG of '=': switch_state BEFORE_ATTRIBUTE_VALUE of '>': @@ -811,7 +828,7 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = of BEFORE_ATTRIBUTE_VALUE: case c - of whitespace: discard + of AsciiWhitespace: discard of '"': switch_state ATTRIBUTE_VALUE_DOUBLE_QUOTED of '\'': switch_state ATTRIBUTE_VALUE_SINGLE_QUOTED of '>': @@ -830,7 +847,7 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = of eof: parse_error eof_in_tag emit_eof - else: append_to_current_attr_value tokenizer.curr + else: append_to_current_attr_value r of ATTRIBUTE_VALUE_SINGLE_QUOTED: case c @@ -842,11 +859,11 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = of eof: parse_error eof_in_tag emit_eof - else: append_to_current_attr_value tokenizer.curr + else: append_to_current_attr_value r of ATTRIBUTE_VALUE_UNQUOTED: case c - of whitespace: switch_state BEFORE_ATTRIBUTE_NAME + of AsciiWhitespace: switch_state BEFORE_ATTRIBUTE_NAME of '&': switch_state_return CHARACTER_REFERENCE of '>': switch_state DATA @@ -860,11 +877,11 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = of eof: parse_error eof_in_tag emit_eof - else: append_to_current_attr_value tokenizer.curr + else: append_to_current_attr_value r of AFTER_ATTRIBUTE_VALUE_QUOTED: case c - of whitespace: + of AsciiWhitespace: switch_state BEFORE_ATTRIBUTE_NAME of '/': switch_state SELF_CLOSING_START_TAG @@ -874,7 +891,7 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = of eof: parse_error eof_in_tag emit_eof - else: append_to_current_attr_value tokenizer.curr + else: append_to_current_attr_value r of SELF_CLOSING_START_TAG: case c @@ -899,7 +916,7 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = emit_tok emit_eof of null: parse_error unexpected_null_character - else: tokenizer.tok.data &= tokenizer.curr + else: tokenizer.tok.data &= r of MARKUP_DECLARATION_OPEN: # note: rewritten to fit case model as we consume a char anyway has_anything_else @@ -967,7 +984,7 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = parse_error eof_in_comment emit_tok emit_eof - else: tokenizer.tok.data &= tokenizer.curr + else: tokenizer.tok.data &= r of COMMENT_LESS_THAN_SIGN: case c @@ -1037,7 +1054,7 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = of DOCTYPE: case c - of whitespace: switch_state BEFORE_DOCTYPE_NAME + of AsciiWhitespace: switch_state BEFORE_DOCTYPE_NAME of '>': reconsume_in BEFORE_DOCTYPE_NAME of eof: parse_error eof_in_doctype @@ -1050,7 +1067,7 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = of BEFORE_DOCTYPE_NAME: case c - of whitespace: discard + of AsciiWhitespace: discard of AsciiUpperAlpha: new_token Token(t: DOCTYPE, name: some($c.tolower())) switch_state DOCTYPE_NAME @@ -1068,12 +1085,12 @@ iterator 
tokenize*(tokenizer: var Tokenizer): Token = emit_tok emit_eof else: - new_token Token(t: DOCTYPE, name: some($tokenizer.curr)) + new_token Token(t: DOCTYPE, name: some($r)) switch_state DOCTYPE_NAME of DOCTYPE_NAME: case c - of whitespace: switch_state AFTER_DOCTYPE_NAME + of AsciiWhitespace: switch_state AFTER_DOCTYPE_NAME of '>': switch_state DATA emit_tok @@ -1088,12 +1105,12 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = emit_tok emit_eof else: - tokenizer.tok.name.get &= tokenizer.curr + tokenizer.tok.name.get &= r of AFTER_DOCTYPE_NAME: # note: rewritten to fit case model as we consume a char anyway has_anything_else case c - of whitespace: discard + of AsciiWhitespace: discard of '>': switch_state DATA emit_tok @@ -1121,7 +1138,7 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = of AFTER_DOCTYPE_PUBLIC_KEYWORD: case c - of whitespace: switch_state BEFORE_DOCTYPE_PUBLIC_IDENTIFIER + of AsciiWhitespace: switch_state BEFORE_DOCTYPE_PUBLIC_IDENTIFIER of '"': parse_error missing_whitespace_after_doctype_public_keyword tokenizer.tok.pubid = some("") @@ -1143,7 +1160,7 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = of BEFORE_DOCTYPE_PUBLIC_IDENTIFIER: case c - of whitespace: discard + of AsciiWhitespace: discard of '"': tokenizer.tok.pubid = some("") switch_state DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED @@ -1182,7 +1199,7 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = emit_tok emit_eof else: - tokenizer.tok.pubid.get &= tokenizer.curr + tokenizer.tok.pubid.get &= r of DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED: case c @@ -1201,11 +1218,11 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = emit_tok emit_eof else: - tokenizer.tok.pubid.get &= tokenizer.curr + tokenizer.tok.pubid.get &= r of AFTER_DOCTYPE_PUBLIC_IDENTIFIER: case c - of whitespace: switch_state BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS + of AsciiWhitespace: switch_state BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS of '>': switch_state DATA emit_tok @@ -1229,7 +1246,7 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = of BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS: case c - of whitespace: discard + of AsciiWhitespace: discard of '>': switch_state DATA emit_tok @@ -1251,7 +1268,7 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = of AFTER_DOCTYPE_SYSTEM_KEYWORD: case c - of whitespace: switch_state BEFORE_DOCTYPE_SYSTEM_IDENTIFIER + of AsciiWhitespace: switch_state BEFORE_DOCTYPE_SYSTEM_IDENTIFIER of '"': parse_error missing_whitespace_after_doctype_system_keyword tokenizer.tok.sysid = some("") @@ -1277,7 +1294,7 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = of BEFORE_DOCTYPE_SYSTEM_IDENTIFIER: case c - of whitespace: discard + of AsciiWhitespace: discard of '"': tokenizer.tok.pubid = some("") switch_state DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED @@ -1316,7 +1333,7 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = emit_tok emit_eof else: - tokenizer.tok.sysid.get &= tokenizer.curr + tokenizer.tok.sysid.get &= r of DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED: case c @@ -1335,11 +1352,11 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = emit_tok emit_eof else: - tokenizer.tok.sysid.get &= tokenizer.curr + tokenizer.tok.sysid.get &= r of AFTER_DOCTYPE_SYSTEM_IDENTIFIER: case c - of whitespace: discard + of AsciiWhitespace: discard of '>': switch_state DATA emit_tok @@ -1403,7 +1420,7 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = of NAMED_CHARACTER_REFERENCE: ignore_eof # we check for eof ourselves tokenizer.reconsume() - when nimVm: + when 
nimvm: eprint "Cannot evaluate character references at compile time" else: var buf = "" @@ -1412,8 +1429,8 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = #TODO interfacing with RadixNode is suffering # plus this doesn't look very efficient either while not tokenizer.atEof: - let c = tokenizer.consume() - buf &= c + let r = tokenizer.consume() + buf &= r if not node.hasPrefix(buf): tokenizer.reconsume() break @@ -1423,7 +1440,7 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = buf = "" if node.value.issome: value = node.value - tokenizer.tmp &= tokenizer.curr + tokenizer.tmp &= r if value.issome: if consumed_as_an_attribute and tokenizer.tmp[^1] != ';' and peek_char in {'='} + AsciiAlpha: flush_code_points_consumed_as_a_character_reference |
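
A few of the techniques in the patch are worth pulling out as standalone sketches. First, extractEncFromMeta in htmlparser.nim follows the spec's "extract a character encoding from a meta element" scan: find an ASCII-case-insensitive "charset", skip whitespace, require '=', then read a quoted or unquoted label. A simplified version in plain Nim, returning the raw label as a string instead of a Charset (findMetaCharset is an illustrative name, and unlike the patch it does not retry when the first "charset" is not followed by '='):

```nim
import std/strutils

# Rough sketch of scanning a <meta content="..."> value for "charset=...".
# Returns the raw encoding label, or "" if none is found.
proc findMetaCharset(content: string): string =
  let lower = content.toLowerAscii()
  var i = lower.find("charset")
  if i == -1: return ""
  i += "charset".len
  while i < content.len and content[i] in Whitespace: inc i
  if i >= content.len or content[i] != '=': return ""
  inc i
  while i < content.len and content[i] in Whitespace: inc i
  if i >= content.len: return ""
  if content[i] in {'"', '\''}:
    # quoted value: take everything up to the matching quote
    let quote = content[i]
    inc i
    let close = content.find(quote, i)
    if close == -1: return ""
    return content[i ..< close]
  # unquoted value: terminated by ';' or space
  var j = i
  while j < content.len and content[j] notin {';', ' '}: inc j
  return content[i ..< j]

when isMainModule:
  doAssert findMetaCharset("text/html; charset=UTF-8") == "UTF-8"
  doAssert findMetaCharset("text/html; CHARSET='iso-8859-2'") == "iso-8859-2"
  doAssert findMetaCharset("text/html") == ""
```

In the parser this label is then resolved with getCharset and, if it differs from the current tentative charset, fed to changeEncoding, which marks the document for reinterpretation.
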
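Second, the tokenizer now pulls decoded Runes from a DecoderStream into a seq buffer, compacts the buffer before refilling, and normalizes CR and CRLF to LF inside consume(), while reconsume() becomes a plain decrement. The newline handling in isolation, with the buffer preloaded instead of refilled from a stream (RuneReader and its fields are illustrative names, not chawan's):

```nim
import std/unicode

# Illustrative cut-down of the patched tokenizer's consume()/reconsume():
# a cursor over a seq[Rune] that folds "\r\n" and bare "\r" into "\n",
# so the state machine only ever sees LF.
type RuneReader = object
  buf: seq[Rune]
  i: int

proc consume(r: var RuneReader): Rune =
  if r.buf[r.i] == Rune('\r'):
    inc r.i
    if r.i >= r.buf.len or r.buf[r.i] != Rune('\n'):
      return Rune('\n')  # bare \r -> \n; the next rune stays unconsumed
    # \r\n: fall through and return the \n itself
  result = r.buf[r.i]
  inc r.i

proc reconsume(r: var RuneReader) =
  dec r.i

when isMainModule:
  var r = RuneReader(buf: "a\r\nb\rc".toRunes())
  var s = ""
  while r.i < r.buf.len:
    s.add($r.consume())
  doAssert s == "a\nb\nc"
```
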
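Finally, peek_str and peek_str_nocase now compare an ASCII needle rune-by-rune against the buffer rather than slicing a byte string, and as the comments in the diff warn, they only work for ASCII. The same idea as a free-standing proc (asciiPrefixInsensitive is an illustrative name; the needle is assumed to be upper case, like the DOCTYPE and CDATA keywords the tokenizer matches):

```nim
import std/[strutils, unicode]

# Case-insensitive check that the runes starting at `start` spell out the
# upper-case ASCII string `s`. Non-ASCII runes never match.
proc asciiPrefixInsensitive(buf: seq[Rune], start: int, s: string): bool =
  if start + s.len > buf.len:
    return false
  for i in 0 ..< s.len:
    let r = buf[start + i]
    if int32(r) > 127 or toUpperAscii(char(int32(r))) != s[i]:
      return false
  return true

when isMainModule:
  let buf = "<!doctype html>".toRunes()
  doAssert asciiPrefixInsensitive(buf, 2, "DOCTYPE")
  doAssert not asciiPrefixInsensitive(buf, 2, "CDATA[")
```
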