about summary refs log blame commit diff stats
path: root/src/html/htmltokenizer.nim
blob: a1776820c64db025104fd73457f0423334b64f72 (plain) (tree)
1
2
3
4
5
6
7
8
9
              







                  
                             
                







                          






                    
                     
 

                          
               



































































                                                                                 




























                                                                         



                                       
                                    



                                                      
                

                           


                                      

                                                     
                

                                                 
          
                         

                                      
                           
              

                                  
              





                                                     
                                        
















                                                                 
            
























                                                                                       



                                                                             

           







                                                    
                                             
                                                              
                                               

                                                                             

           












                                                                   









                                                                                                                
                                   





































































                                                                                              




                                                                                              
                          


                              

                                                         




















































































                                                                
                                                            



                                                 
                                                              





                                             
                                      






















                                        
                         















                                                 

                                            




























                                         
                         















                                                 

                                            

































                                             
                         















                                                 

                                            



































































































                                                        
                         














                                             

                                            







                                        
                                   








































































                                                              
                                   












                                                 
                                 











                                                                
                                        













                                                          
                            


                            
                                 



                                                 
                








                                   
                                 

















                                                         
                                          










                                                        
                                          


                                
                                                            
                                                     


                         








                                                                    
                                          


                                    
                         








                                           


                                                         























                                                    
                                   

















                                                                                               
                                  















































                                                        
                                   




































































                                                                
                                                          











                                                          
                                 
















                                                              
                                                   



                                 
                                                         













                                              
                                   



                                                                                          
                                 


























                                                                 
                                                                       




















                                                                   
                                 





































                                                                  
                                    

















                                                           
                                    


                                       
                                                                                    






















                                                                                    
                                 




















                                                                  
                                                                       
























                                                                   
                                 





































                                                                  
                                    

















                                                           
                                    


                                       
                                 






























































                                                                        
                 

                                                                     













                                                              



































































































                                                                                                             
import options
import strformat
import strutils
import macros
import tables
import unicode

import html/entity
import html/tags
import encoding/decoderstream
import utils/opt
import utils/radixtree
import utils/twtstr

# Tokenizer
type
  # Mutable state carried by the tokenizer between calls/iterations.
  Tokenizer* = object
    # State the `tokenize` state machine dispatches on.
    state*: TokenizerState
    # "Return state": saved by switch_state_return before entering a
    # sub-state, and inspected to tell whether a character reference was
    # consumed as part of an attribute value.
    rstate: TokenizerState
    # Temporary buffer; reset to "" by several states and flushed by emit_tmp.
    tmp: string
    # NOTE(review): not referenced in the visible part of this file;
    # presumably the numeric character reference code - confirm.
    code: int
    # Token currently being built (replaced by new_token).
    tok: Token
    # Last START_TAG token that was emitted; compared against the current
    # end tag's name by is_appropriate_end_tag_token.
    laststart: Token
    # Name and value of the attribute currently being accumulated.
    attrn: string
    attrv: string
    # True while attrn/attrv hold a pending attribute that should be stored
    # into tok.attrs; cleared when the name duplicates an existing attribute.
    attr: bool
    # NOTE(review): not referenced in the visible part of this file.
    hasnonhtml*: bool

    # Input source that readn pulls runes from (left unset by the
    # string-based constructor).
    decoder: DecoderStream
    # Buffered input runes.
    sbuf: seq[Rune]
    # Index into sbuf of the next rune to consume.
    sbuf_i: int
    # Index into sbuf at which the input ends, or -1 while the end of input
    # has not been reached yet.
    eof_i: int

  # Kinds of tokens yielded by `tokenize`.
  TokenType* = enum
    DOCTYPE, START_TAG, END_TAG, COMMENT, CHARACTER, CHARACTER_ASCII, EOF

  # States of the tokenizer state machine. The names appear to mirror the
  # tokenization states of the HTML Standard (TODO confirm against the spec).
  TokenizerState* = enum
    DATA, CHARACTER_REFERENCE, TAG_OPEN, RCDATA, RCDATA_LESS_THAN_SIGN,
    RAWTEXT, RAWTEXT_LESS_THAN_SIGN, SCRIPT_DATA, SCRIPT_DATA_LESS_THAN_SIGN,
    PLAINTEXT, MARKUP_DECLARATION_OPEN, END_TAG_OPEN, BOGUS_COMMENT, TAG_NAME,
    BEFORE_ATTRIBUTE_NAME, RCDATA_END_TAG_OPEN, RCDATA_END_TAG_NAME,
    RAWTEXT_END_TAG_OPEN, RAWTEXT_END_TAG_NAME, SELF_CLOSING_START_TAG,
    SCRIPT_DATA_END_TAG_OPEN, SCRIPT_DATA_ESCAPE_START,
    SCRIPT_DATA_END_TAG_NAME, SCRIPT_DATA_ESCAPE_START_DASH,
    SCRIPT_DATA_ESCAPED_DASH_DASH, SCRIPT_DATA_ESCAPED,
    SCRIPT_DATA_ESCAPED_DASH, SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN,
    SCRIPT_DATA_ESCAPED_END_TAG_OPEN, SCRIPT_DATA_DOUBLE_ESCAPE_START,
    SCRIPT_DATA_ESCAPED_END_TAG_NAME, SCRIPT_DATA_DOUBLE_ESCAPED,
    SCRIPT_DATA_DOUBLE_ESCAPED_DASH, SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN,
    SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH, SCRIPT_DATA_DOUBLE_ESCAPE_END,
    AFTER_ATTRIBUTE_NAME, ATTRIBUTE_NAME, BEFORE_ATTRIBUTE_VALUE,
    ATTRIBUTE_VALUE_DOUBLE_QUOTED, ATTRIBUTE_VALUE_SINGLE_QUOTED,
    ATTRIBUTE_VALUE_UNQUOTED, AFTER_ATTRIBUTE_VALUE_QUOTED, COMMENT_START,
    CDATA_SECTION, COMMENT_START_DASH, COMMENT, COMMENT_END,
    COMMENT_LESS_THAN_SIGN, COMMENT_END_DASH, COMMENT_LESS_THAN_SIGN_BANG,
    COMMENT_LESS_THAN_SIGN_BANG_DASH, COMMENT_LESS_THAN_SIGN_BANG_DASH_DASH,
    COMMENT_END_BANG, DOCTYPE, BEFORE_DOCTYPE_NAME, DOCTYPE_NAME,
    AFTER_DOCTYPE_NAME, AFTER_DOCTYPE_PUBLIC_KEYWORD,
    AFTER_DOCTYPE_SYSTEM_KEYWORD, BOGUS_DOCTYPE,
    BEFORE_DOCTYPE_PUBLIC_IDENTIFIER, DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED,
    DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED, AFTER_DOCTYPE_PUBLIC_IDENTIFIER,
    BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS,
    DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED,
    DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED, BEFORE_DOCTYPE_SYSTEM_IDENTIFIER,
    AFTER_DOCTYPE_SYSTEM_IDENTIFIER, CDATA_SECTION_BRACKET, CDATA_SECTION_END,
    NAMED_CHARACTER_REFERENCE, NUMERIC_CHARACTER_REFERENCE,
    AMBIGUOUS_AMPERSAND_STATE, HEXADECIMAL_CHARACTER_REFERENCE_START,
    DECIMAL_CHARACTER_REFERENCE_START, HEXADECIMAL_CHARACTER_REFERENCE,
    DECIMAL_CHARACTER_REFERENCE, NUMERIC_CHARACTER_REFERENCE_END

  # A token produced by the tokenizer; the payload fields depend on `t`.
  Token* = ref object
    case t*: TokenType
    of DOCTYPE:
      # Each identifier is optional (none when absent from the source).
      name*: Option[string]
      pubid*: Option[string]
      sysid*: Option[string]
      quirks*: bool
    of START_TAG, END_TAG:
      tagname*: string
      # Filled in from tagname (via tagType) at the moment the token is emitted.
      tagtype*: TagType
      selfclosing*: bool
      # Attribute name -> value.
      attrs*: Table[string, string]
    of CHARACTER:
      # A single character token carried as a Rune.
      r*: Rune
    of CHARACTER_ASCII:
      # A single character token carried as a plain char.
      c*: char
    of COMMENT:
      data*: string
    of EOF: discard

func `$`*(tok: Token): string =
  ## Renders a token for debugging: the token kind followed by each payload
  ## field of that kind, all separated by single spaces.
  let kind = $tok.t
  case tok.t
  of EOF:
    result = kind
  of COMMENT:
    result = [kind, tok.data].join(" ")
  of CHARACTER_ASCII:
    result = [kind, $tok.c].join(" ")
  of CHARACTER:
    result = [kind, $tok.r].join(" ")
  of START_TAG, END_TAG:
    result = [kind, tok.tagname, $tok.selfclosing, $tok.attrs].join(" ")
  of DOCTYPE:
    result = [kind, $tok.name, $tok.pubid, $tok.sysid, $tok.quirks].join(" ")

# Nominal size of the tokenizer's rune buffer, in runes. At 4 bytes per Rune
# this amounts to 4096 bytes.
const bufLen = 1024 # * 4096 bytes
# Look-ahead margin, in runes (64 bytes): checkBufLen compacts/refills the
# buffer once the read cursor reaches bufLen - copyBufLen, and the peek_str
# helpers assert that the string they look ahead for is shorter than
# copyBufLen - 4.
const copyBufLen = 16 # * 64 bytes

proc readn(t: var Tokenizer) =
  ## Tops up `t.sbuf` from `t.decoder`: the buffer is grown to `bufLen` runes,
  ## the decoder writes into the free tail, and the buffer is then trimmed to
  ## the runes actually received. If the decoder reports that it is at its
  ## end, `t.eof_i` is set to the resulting buffer length.
  let used = t.sbuf.len
  let freeBytes = (bufLen - used) * sizeof(Rune)
  # Make room first so that the address of the first free slot is valid.
  t.sbuf.setLen(bufLen)
  let gotBytes = t.decoder.readData(addr t.sbuf[used], freeBytes)
  t.sbuf.setLen(used + gotBytes div sizeof(Rune))
  if t.decoder.atEnd:
    t.eof_i = t.sbuf.len

proc newTokenizer*(s: DecoderStream): Tokenizer =
  ## Creates a tokenizer that reads its input from the decoder stream `s`.
  ## The rune buffer is pre-filled with a first `readn` call; the end of input
  ## is marked as unknown (-1) until the decoder reports it.
  result = Tokenizer(
    decoder: s,
    sbuf: newSeqOfCap[Rune](bufLen),
    sbuf_i: 0,
    eof_i: -1
  )
  result.readn()

proc newTokenizer*(s: string): Tokenizer =
  ## Creates a tokenizer over the in-memory string `s`. The whole string is
  ## converted to runes up front, so the end of input is known immediately
  ## and no decoder is attached.
  result = Tokenizer(sbuf: s.toRunes(), sbuf_i: 0)
  result.eof_i = result.sbuf.len

func atEof(t: Tokenizer): bool =
  ## True once the end of input is known (`eof_i` is not -1) and the read
  ## cursor has reached or passed it.
  if t.eof_i == -1:
    return false
  result = t.sbuf_i >= t.eof_i

proc checkBufLen(t: var Tokenizer) =
  ## Compacts the rune buffer and refills it when the read cursor gets close
  ## to the end of the buffer (within `copyBufLen` runes of `bufLen`) or has
  ## run past the buffered data.
  ##
  ## Compared to a plain "drop everything before the cursor" compaction this
  ## takes care of three things:
  ## * One already-consumed rune is kept in front of the cursor, so that a
  ##   `reconsume` (which just decrements `sbuf_i`) issued right after a
  ##   compaction cannot move the cursor to -1.
  ## * `eof_i` is an index into `sbuf`, so it is shifted together with the
  ##   data. Otherwise a tokenizer whose `eof_i` was fixed at construction
  ##   (the string-based constructor) would keep a stale end-of-input index.
  ## * No further read is attempted once the end of input is known. This
  ##   avoids calling `readn` on a tokenizer that has no decoder attached.
  if t.sbuf_i >= min(bufLen - copyBufLen, t.sbuf.len):
    # Number of leading runes to drop; keeps exactly one rune of look-behind.
    let shift = t.sbuf_i - 1
    if shift > 0:
      for i in shift ..< t.sbuf.len:
        t.sbuf[i - shift] = t.sbuf[i]
      t.sbuf.setLen(t.sbuf.len - shift)
      t.sbuf_i -= shift
      if t.eof_i != -1:
        t.eof_i -= shift
    # Only pull more data while the end of input has not been seen yet.
    if t.eof_i == -1 and t.sbuf.len < bufLen:
      t.readn()

proc consume(t: var Tokenizer): Rune =
  ## Returns the next input rune and advances the read cursor past it.
  ## Newlines are normalized on the fly: "\r\n" and a lone "\r" are both
  ## returned as a single "\n".
  t.checkBufLen()
  let cur = t.sbuf[t.sbuf_i]
  inc t.sbuf_i
  if cur != Rune('\r'):
    # Ordinary rune: hand it out unchanged.
    return cur
  # Carriage return: make sure the following rune (if any) is buffered, then
  # swallow it too when it is the line feed of a "\r\n" pair.
  t.checkBufLen()
  if not t.atEof and t.sbuf[t.sbuf_i] == Rune('\n'):
    inc t.sbuf_i
  return Rune('\n')

proc reconsume(t: var Tokenizer) =
  ## Steps the read cursor back by one rune so that the rune returned by the
  ## last `consume` is read again on the next call.
  t.sbuf_i -= 1

iterator tokenize*(tokenizer: var Tokenizer): Token =
  template emit(tok: Token) =
    if tok.t == START_TAG:
      tokenizer.laststart = tok
    if tok.t in {START_TAG, END_TAG}:
      tok.tagtype = tagType(tok.tagname)
    yield tok
  template emit(tok: TokenType) = emit Token(t: tok)
  template emit(rn: Rune) = emit Token(t: CHARACTER, r: rn)
  template emit(ch: char) = emit Token(t: CHARACTER_ASCII, c: ch)
  template emit_eof =
    emit EOF
    break
  template emit_tok =
    if tokenizer.attr:
      tokenizer.tok.attrs[tokenizer.attrn] = tokenizer.attrv
    emit tokenizer.tok
  template emit_current =
    if is_eof:
      emit_eof
    elif c in Ascii:
      emit c
    else:
      emit r
  template emit_replacement = emit Rune(0xFFFD)
  template switch_state(s: TokenizerState) =
    tokenizer.state = s
  template switch_state_return(s: TokenizerState) =
    tokenizer.rstate = tokenizer.state
    tokenizer.state = s
  template reconsume_in(s: TokenizerState) =
    tokenizer.reconsume()
    switch_state s
  template parse_error(error: untyped) = discard # does nothing for now... TODO?
  template is_appropriate_end_tag_token(): bool =
    tokenizer.laststart != nil and tokenizer.laststart.tagname == tokenizer.tok.tagname
  template start_new_attribute =
    if tokenizer.attr:
      tokenizer.tok.attrs[tokenizer.attrn] = tokenizer.attrv
    tokenizer.attrn = ""
    tokenizer.attrv = ""
    tokenizer.attr = true
  template leave_attribute_name_state =
    if tokenizer.attrn in tokenizer.tok.attrs:
      tokenizer.attr = false
  template append_to_current_attr_value(c: typed) =
    if tokenizer.attr:
      tokenizer.attrv &= c
  template peek_str(s: string): bool =
    # WARNING: will break on strings with copyBufLen + 4 bytes
    # WARNING: only works with ascii
    assert s.len < copyBufLen - 4 and s.len > 0
    if tokenizer.eof_i != -1 and tokenizer.sbuf_i + s.len >= tokenizer.eof_i:
      false
    else:
      var b = true
      for i in 0 ..< s.len:
        let c = tokenizer.sbuf[tokenizer.sbuf_i + i]
        if not c.isAscii() or cast[char](c) != s[i]:
          b = false
          break
      b

  template peek_str_nocase(s: string): bool =
    # WARNING: will break on strings with copyBufLen + 4 bytes
    # WARNING: only works with UPPER CASE ascii
    assert s.len < copyBufLen - 4 and s.len > 0
    if tokenizer.eof_i != -1 and tokenizer.sbuf_i + s.len >= tokenizer.eof_i:
      false
    else:
      var b = true
      for i in 0 ..< s.len:
        let c = tokenizer.sbuf[tokenizer.sbuf_i + i]
        if not c.isAscii() or cast[char](c).toUpperAscii() != s[i]:
          b = false
          break
      b
  template peek_char(): char =
    let r = tokenizer.sbuf[tokenizer.sbuf_i]
    if r.isAscii():
      cast[char](r)
    else:
      char(128)
  template consume_and_discard(n: int) = #TODO optimize
    var i = 0
    while i < n:
      discard tokenizer.consume()
      inc i
  template consumed_as_an_attribute(): bool =
    tokenizer.rstate in {ATTRIBUTE_VALUE_DOUBLE_QUOTED, ATTRIBUTE_VALUE_SINGLE_QUOTED, ATTRIBUTE_VALUE_UNQUOTED}
  template emit_tmp() =
    var i = 0
    while i < tokenizer.tmp.len:
      if tokenizer.tmp[i] in Ascii:
        emit tokenizer.tmp[i]
        inc i
      else:
        var r: Rune
        fastRuneAt(tokenizer.tmp, i, r)
        emit r
  template flush_code_points_consumed_as_a_character_reference() =
    if consumed_as_an_attribute:
      append_to_current_attr_value tokenizer.tmp
    else:
      emit_tmp
  template new_token(t: Token) =
    if tokenizer.attr:
      tokenizer.attr = false
    tokenizer.tok = t

  # Fake EOF as an actual character. Also replace anything_else with the else
  # branch.
  macro stateMachine(states: varargs[untyped]): untyped =
    var maincase = newNimNode(nnkCaseStmt).add(quote do: tokenizer.state)
    for state in states:
      if state.kind == nnkOfBranch:
        let mainstmtlist = findChild(state, it.kind == nnkStmtList)
        if mainstmtlist[0].kind == nnkIdent and mainstmtlist[0].strVal == "ignore_eof":
          maincase.add(state)
          continue

        var hasanythingelse = false
        if mainstmtlist[0].kind == nnkIdent and mainstmtlist[0].strVal == "has_anything_else":
          hasanythingelse = true

        let childcase = findChild(mainstmtlist, it.kind == nnkCaseStmt)
        var haseof = false
        var eofstmts: NimNode
        var elsestmts: NimNode

        for i in countdown(childcase.len-1, 0):
          let childof = childcase[i]
          if childof.kind == nnkOfBranch:
            for j in countdown(childof.len-1, 0):
              if childof[j].kind == nnkIdent and childof[j].strVal == "eof":
                haseof = true
                eofstmts = childof.findChild(it.kind == nnkStmtList)
                if childof.findChild(it.kind == nnkIdent and it.strVal != "eof") != nil:
                  childof.del(j)
                else:
                  childcase.del(i)
          elif childof.kind == nnkElse:
            elsestmts = childof.findChild(it.kind == nnkStmtList)

        if not haseof:
          eofstmts = elsestmts
        let fake_eof = quote do:
          if is_eof:
            `eofstmts`
            continue
        mainstmtlist.insert(0, fake_eof)
        if hasanythingelse:
          let fake_anything_else = quote do:
            template anything_else =
              `elsestmts`
          mainstmtlist.insert(0, fake_anything_else)
      maincase.add(state)
    result = newNimNode(nnkStmtList)
    result.add(maincase)

  template ignore_eof = discard # does nothing
  template has_anything_else = discard # does nothing

  const null = char(0)

  while true:
    {.computedGoto.}
    #eprint tokenizer.state #debug
    let is_eof = tokenizer.atEof # set eof here, otherwise we would exit at the last character
    let r = if not is_eof:
      tokenizer.consume()
    else:
      # avoid consuming eof...
      Rune(null)
    let c = if r.isAscii(): cast[char](r) else: char(128)
    stateMachine: # => case tokenizer.state
    of DATA:
      case c
      of '&': switch_state_return CHARACTER_REFERENCE
      of '<': switch_state TAG_OPEN
      of null:
        parse_error unexpected_null_character
        emit_current
      of eof: emit_eof
      else: emit_current

    of RCDATA:
      case c
      of '&': switch_state_return CHARACTER_REFERENCE
      of '<': switch_state RCDATA_LESS_THAN_SIGN
      of null: parse_error unexpected_null_character
      of eof: emit_eof
      else: emit_current

    of RAWTEXT:
      case c
      of '<': switch_state RAWTEXT_LESS_THAN_SIGN
      of null:
        parse_error unexpected_null_character
        emit_replacement
      of eof: emit_eof
      else: emit_current

    of SCRIPT_DATA:
      case c
      of '<': switch_state SCRIPT_DATA_LESS_THAN_SIGN
      of null:
        parse_error unexpected_null_character
        emit_replacement
      of eof: emit_eof
      else: emit_current

    of PLAINTEXT:
      case c
      of null:
        parse_error unexpected_null_character
        emit_replacement
      of eof: emit_eof
      else: emit_current

    of TAG_OPEN:
      case c
      of '!': switch_state MARKUP_DECLARATION_OPEN
      of '/': switch_state END_TAG_OPEN
      of AsciiAlpha:
        new_token Token(t: START_TAG)
        reconsume_in TAG_NAME
      of '?':
        parse_error unexpected_question_mark_instead_of_tag_name
        new_token Token(t: COMMENT)
        reconsume_in BOGUS_COMMENT
      of eof:
        parse_error eof_before_tag_name
        emit '<'
        emit_eof
      else:
        parse_error invalid_first_character_of_tag_name
        emit '<'
        reconsume_in DATA

    of END_TAG_OPEN:
      case c
      of AsciiAlpha:
        new_token Token(t: END_TAG)
        reconsume_in TAG_NAME
      of '>':
        parse_error missing_end_tag_name
        switch_state DATA
      of eof:
        parse_error eof_before_tag_name
        emit '<'
        emit '/'
        emit_eof
      else:
        parse_error invalid_first_character_of_tag_name
        new_token Token(t: COMMENT)
        reconsume_in BOGUS_COMMENT

    of TAG_NAME:
      case c
      of AsciiWhitespace: switch_state BEFORE_ATTRIBUTE_NAME
      of '/': switch_state SELF_CLOSING_START_TAG
      of '>':
        switch_state DATA
        emit_tok
      of AsciiUpperAlpha: tokenizer.tok.tagname &= c.tolower()
      of null:
        parse_error unexpected_null_character
        tokenizer.tok.tagname &= Rune(0xFFFD)
      of eof:
        parse_error eof_in_tag
        emit_eof
      else: tokenizer.tok.tagname &= r

    of RCDATA_LESS_THAN_SIGN:
      case c
      of '/':
        tokenizer.tmp = ""
        switch_state RCDATA_END_TAG_OPEN
      else:
        emit '<'
        reconsume_in RCDATA

    of RCDATA_END_TAG_OPEN:
      case c
      of AsciiAlpha:
        new_token Token(t: END_TAG)
        reconsume_in RCDATA_END_TAG_NAME
      else:
        emit '<'
        emit '/'
        reconsume_in RCDATA

    of RCDATA_END_TAG_NAME:
      has_anything_else
      case c
      of AsciiWhitespace:
        if is_appropriate_end_tag_token:
          switch_state BEFORE_ATTRIBUTE_NAME
        else:
          anything_else
      of '/':
        if is_appropriate_end_tag_token:
          switch_state SELF_CLOSING_START_TAG
        else:
          anything_else
      of '>':
        if is_appropriate_end_tag_token:
          switch_state DATA
          emit_tok
        else:
          anything_else
      of AsciiAlpha: # note: merged upper & lower
        tokenizer.tok.tagname &= c.tolower()
        tokenizer.tmp &= c
      else:
        new_token nil #TODO
        emit '<'
        emit '/'
        emit_tmp
        reconsume_in RCDATA

    of RAWTEXT_LESS_THAN_SIGN:
      case c
      of '/':
        tokenizer.tmp = ""
        switch_state RAWTEXT_END_TAG_OPEN
      else:
        emit '<'
        reconsume_in RAWTEXT

    of RAWTEXT_END_TAG_OPEN:
      case c
      of AsciiAlpha:
        new_token Token(t: END_TAG)
        reconsume_in RAWTEXT_END_TAG_NAME
      else:
        emit '<'
        emit '/'
        reconsume_in RAWTEXT

    of RAWTEXT_END_TAG_NAME:
      has_anything_else
      case c
      of AsciiWhitespace:
        if is_appropriate_end_tag_token:
          switch_state BEFORE_ATTRIBUTE_NAME
        else:
          anything_else
      of '/':
        if is_appropriate_end_tag_token:
          switch_state SELF_CLOSING_START_TAG
        else:
          anything_else
      of '>':
        if is_appropriate_end_tag_token:
          switch_state DATA
          emit_tok
        else:
          anything_else
      of AsciiAlpha: # note: merged upper & lower
        tokenizer.tok.tagname &= c.tolower()
        tokenizer.tmp &= c
      else:
        new_token nil #TODO
        emit '<'
        emit '/'
        for r in tokenizer.tmp.runes:
          emit r
        reconsume_in RAWTEXT

    of SCRIPT_DATA_LESS_THAN_SIGN:
      case c
      of '/':
        tokenizer.tmp = ""
        switch_state SCRIPT_DATA_END_TAG_OPEN
      of '!':
        switch_state SCRIPT_DATA_ESCAPE_START
        emit '<'
        emit '!'
      else:
        emit '<'
        reconsume_in SCRIPT_DATA

    of SCRIPT_DATA_END_TAG_OPEN:
      case c
      of AsciiAlpha:
        new_token Token(t: END_TAG)
        reconsume_in SCRIPT_DATA_END_TAG_NAME
      else:
        emit '<'
        emit '/'
        reconsume_in SCRIPT_DATA

    of SCRIPT_DATA_END_TAG_NAME:
      has_anything_else
      case c
      of AsciiWhitespace:
        if is_appropriate_end_tag_token:
          switch_state BEFORE_ATTRIBUTE_NAME
        else:
          anything_else
      of '/':
        if is_appropriate_end_tag_token:
          switch_state SELF_CLOSING_START_TAG
        else:
          anything_else
      of '>':
        if is_appropriate_end_tag_token:
          switch_state DATA
          emit_tok
        else:
          anything_else
      of AsciiAlpha: # note: merged upper & lower
        tokenizer.tok.tagname &= c.tolower()
        tokenizer.tmp &= c
      else:
        emit '<'
        emit '/'
        emit_tmp
        reconsume_in SCRIPT_DATA

    of SCRIPT_DATA_ESCAPE_START:
      case c
      of '-':
        switch_state SCRIPT_DATA_ESCAPE_START_DASH
        emit '-'
      else:
        reconsume_in SCRIPT_DATA

    of SCRIPT_DATA_ESCAPE_START_DASH:
      case c
      of '-':
        switch_state SCRIPT_DATA_ESCAPED_DASH_DASH
        emit '-'
      else:
        reconsume_in SCRIPT_DATA

    of SCRIPT_DATA_ESCAPED:
      case c
      of '-':
        switch_state SCRIPT_DATA_ESCAPED_DASH
        emit '-'
      of '<':
        switch_state SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN
      of null:
        parse_error unexpected_null_character
        emit_replacement
      of eof:
        parse_error eof_in_script_html_comment_like_text
        emit_eof
      else:
        emit_current

    of SCRIPT_DATA_ESCAPED_DASH:
      case c
      of '-':
        switch_state SCRIPT_DATA_ESCAPED_DASH_DASH
        emit '-'
      of '<':
        switch_state SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN
      of null:
        parse_error unexpected_null_character
        switch_state SCRIPT_DATA_ESCAPED
      of eof:
        parse_error eof_in_script_html_comment_like_text
        emit_eof
      else:
        switch_state SCRIPT_DATA_ESCAPED
        emit_current

    of SCRIPT_DATA_ESCAPED_DASH_DASH:
      case c
      of '-':
        emit '-'
      of '<':
        switch_state SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN
      of '>':
        switch_state SCRIPT_DATA
        emit '>'
      of null:
        parse_error unexpected_null_character
        switch_state SCRIPT_DATA_ESCAPED
      of eof:
        parse_error eof_in_script_html_comment_like_text
        emit_eof
      else:
        switch_state SCRIPT_DATA_ESCAPED
        emit_current

    of SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN:
      case c
      of '/':
        tokenizer.tmp = ""
        switch_state SCRIPT_DATA_ESCAPED_END_TAG_OPEN
      of AsciiAlpha:
        tokenizer.tmp = ""
        emit '<'
        reconsume_in SCRIPT_DATA_DOUBLE_ESCAPE_START
      else:
        emit '<'
        reconsume_in SCRIPT_DATA_ESCAPED

    of SCRIPT_DATA_ESCAPED_END_TAG_OPEN:
      case c
      of AsciiAlpha:
        new_token Token(t: START_TAG)
        reconsume_in SCRIPT_DATA_ESCAPED_END_TAG_NAME
      else:
        emit '<'
        emit '/'
        reconsume_in SCRIPT_DATA_ESCAPED

    of SCRIPT_DATA_ESCAPED_END_TAG_NAME:
      has_anything_else
      case c
      of AsciiWhitespace:
        if is_appropriate_end_tag_token:
          switch_state BEFORE_ATTRIBUTE_NAME
        else:
          anything_else
      of '/':
        if is_appropriate_end_tag_token:
          switch_state SELF_CLOSING_START_TAG
        else:
          anything_else
      of '>':
        if is_appropriate_end_tag_token:
          switch_state DATA
        else:
          anything_else
      of AsciiAlpha:
        tokenizer.tok.tagname &= c.tolower()
        tokenizer.tmp &= c
      else:
        emit '<'
        emit '/'
        emit_tmp
        reconsume_in SCRIPT_DATA_ESCAPED

    of SCRIPT_DATA_DOUBLE_ESCAPE_START:
      case c
      of AsciiWhitespace, '/', '>':
        if tokenizer.tmp == "script":
          switch_state SCRIPT_DATA_DOUBLE_ESCAPED
        else:
          switch_state SCRIPT_DATA_ESCAPED
          emit_current
      of AsciiAlpha: # note: merged upper & lower
        tokenizer.tmp &= c.tolower()
        emit_current
      else: reconsume_in SCRIPT_DATA_ESCAPED

    of SCRIPT_DATA_DOUBLE_ESCAPED:
      case c
      of '-':
        switch_state SCRIPT_DATA_DOUBLE_ESCAPED_DASH
        emit '-'
      of '<':
        switch_state SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN
        emit '<'
      of null:
        parse_error unexpected_null_character
        emit_replacement
      of eof:
        parse_error eof_in_script_html_comment_like_text
        emit_eof
      else: emit_current

    of SCRIPT_DATA_DOUBLE_ESCAPED_DASH:
      case c
      of '-':
        switch_state SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH
        emit '-'
      of '<':
        switch_state SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN
        emit '<'
      of null:
        parse_error unexpected_null_character
        switch_state SCRIPT_DATA_DOUBLE_ESCAPED
        emit_replacement
      of eof:
        parse_error eof_in_script_html_comment_like_text
        emit_eof
      else:
        switch_state SCRIPT_DATA_DOUBLE_ESCAPED
        emit_current

    of SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH:
      case c
      of '-': emit '-'
      of '<':
        switch_state SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN
        emit '<'
      of '>':
        switch_state SCRIPT_DATA
        emit '>'
      of null:
        parse_error unexpected_null_character
        switch_state SCRIPT_DATA_DOUBLE_ESCAPED
        emit_replacement
      of eof:
        parse_error eof_in_script_html_comment_like_text
        emit_eof
      else: switch_state SCRIPT_DATA_DOUBLE_ESCAPED

    of SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN:
      case c
      of '/':
        tokenizer.tmp = ""
        switch_state SCRIPT_DATA_DOUBLE_ESCAPE_END
        emit '/'
      else: reconsume_in SCRIPT_DATA_DOUBLE_ESCAPED

    of SCRIPT_DATA_DOUBLE_ESCAPE_END:
      case c
      of AsciiWhitespace, '/', '>':
        if tokenizer.tmp == "script":
          switch_state SCRIPT_DATA_ESCAPED
        else:
          switch_state SCRIPT_DATA_DOUBLE_ESCAPED
          emit_current
      of AsciiAlpha: # note: merged upper & lower
        tokenizer.tmp &= c.tolower()
        emit_current
      else:
        reconsume_in SCRIPT_DATA_DOUBLE_ESCAPED

    of BEFORE_ATTRIBUTE_NAME:
      case c
      of AsciiWhitespace: discard
      of '/', '>', eof: reconsume_in AFTER_ATTRIBUTE_NAME
      of '=':
        parse_error unexpected_equals_sign_before_attribute_name
        start_new_attribute
        switch_state ATTRIBUTE_NAME
      else:
        start_new_attribute
        reconsume_in ATTRIBUTE_NAME

    of ATTRIBUTE_NAME:
      has_anything_else
      case c
      of AsciiWhitespace, '/', '>', eof:
        leave_attribute_name_state
        reconsume_in AFTER_ATTRIBUTE_NAME
      of '=':
        leave_attribute_name_state
        switch_state BEFORE_ATTRIBUTE_VALUE
      of AsciiUpperAlpha:
        tokenizer.attrn &= c.tolower()
      of null:
        parse_error unexpected_null_character
        tokenizer.attrn &= Rune(0xFFFD)
      of '"', '\'', '<':
        parse_error unexpected_character_in_attribute_name
        anything_else
      else:
        tokenizer.attrn &= r

    of AFTER_ATTRIBUTE_NAME:
      # An attribute name has just ended: decide whether a value, another
      # attribute, or the end of the tag comes next.
      case c
      of AsciiWhitespace: discard
      of '/': switch_state SELF_CLOSING_START_TAG
      of '=': switch_state BEFORE_ATTRIBUTE_VALUE
      of '>':
        switch_state DATA
        emit_tok
      of eof:
        parse_error eof_in_tag
        emit_eof
      else:
        # Any other character begins a new attribute; ATTRIBUTE_NAME re-reads
        # it so that it becomes part of the new name.
        start_new_attribute
        reconsume_in ATTRIBUTE_NAME

    of BEFORE_ATTRIBUTE_VALUE:
      # Seen after "name=": skips whitespace and selects the quoting mode of
      # the attribute value.
      case c
      of AsciiWhitespace: discard
      of '"': switch_state ATTRIBUTE_VALUE_DOUBLE_QUOTED
      of '\'': switch_state ATTRIBUTE_VALUE_SINGLE_QUOTED
      of '>':
        # "name=>" closes the tag with an empty value. The tag token itself
        # must be emitted here (as every other '>' branch in the tag states
        # does), not a literal '>' character token.
        parse_error missing_attribute_value
        switch_state DATA
        emit_tok
      else: reconsume_in ATTRIBUTE_VALUE_UNQUOTED

    of ATTRIBUTE_VALUE_DOUBLE_QUOTED:
      # Reads an attribute value delimited by double quotes.
      case c
      of '"': switch_state AFTER_ATTRIBUTE_VALUE_QUOTED
      # '&' starts a character reference.
      # NOTE(review): switch_state_return presumably records this state as
      # tokenizer.rstate so the reference states can come back - confirm.
      of '&': switch_state_return CHARACTER_REFERENCE
      of null:
        # NUL is replaced by U+FFFD REPLACEMENT CHARACTER.
        parse_error unexpected_null_character
        append_to_current_attr_value Rune(0xFFFD)
      of eof:
        parse_error eof_in_tag
        emit_eof
      else: append_to_current_attr_value r

    of ATTRIBUTE_VALUE_SINGLE_QUOTED:
      # Reads an attribute value delimited by single quotes; mirrors
      # ATTRIBUTE_VALUE_DOUBLE_QUOTED except for the closing delimiter.
      case c
      of '\'': switch_state AFTER_ATTRIBUTE_VALUE_QUOTED
      of '&': switch_state_return CHARACTER_REFERENCE
      of null:
        # NUL is replaced by U+FFFD REPLACEMENT CHARACTER.
        parse_error unexpected_null_character
        append_to_current_attr_value Rune(0xFFFD)
      of eof:
        parse_error eof_in_tag
        emit_eof
      else: append_to_current_attr_value r

    of ATTRIBUTE_VALUE_UNQUOTED:
      # Reads an attribute value written without quotes; whitespace or '>'
      # terminates it.
      case c
      of AsciiWhitespace: switch_state BEFORE_ATTRIBUTE_NAME
      of '&': switch_state_return CHARACTER_REFERENCE
      of '>':
        switch_state DATA
        emit_tok
      of null:
        # NUL is replaced by U+FFFD REPLACEMENT CHARACTER.
        parse_error unexpected_null_character
        append_to_current_attr_value Rune(0xFFFD)
      of '"', '\'', '<', '=', '`':
        # These are reported as errors but still become part of the value.
        parse_error unexpected_character_in_unquoted_attribute_value
        append_to_current_attr_value c
      of eof:
        parse_error eof_in_tag
        emit_eof
      else: append_to_current_attr_value r

    of AFTER_ATTRIBUTE_VALUE_QUOTED:
      # Directly after the closing quote of an attribute value.
      case c
      of AsciiWhitespace:
        switch_state BEFORE_ATTRIBUTE_NAME
      of '/':
        switch_state SELF_CLOSING_START_TAG
      of '>':
        switch_state DATA
        emit_tok
      of eof:
        parse_error eof_in_tag
        emit_eof
      else:
        # Something like a="b"c: report the missing separator and treat the
        # character as the beginning of the next attribute.
        parse_error missing_whitespace_between_attributes
        reconsume_in BEFORE_ATTRIBUTE_NAME

    of SELF_CLOSING_START_TAG:
      # Entered after a '/' inside a tag; only an immediately following '>'
      # makes the tag self-closing.
      case c
      of '>':
        tokenizer.tok.selfclosing = true
        switch_state DATA
        emit_tok
      of eof:
        parse_error eof_in_tag
        emit_eof
      else:
        # A stray '/' in the middle of a tag: report it and carry on with
        # attribute parsing.
        parse_error unexpected_solidus_in_tag
        reconsume_in BEFORE_ATTRIBUTE_NAME

    of BOGUS_COMMENT:
      # Swallows malformed markup into the current COMMENT token until the
      # next '>' or the end of input.
      assert tokenizer.tok.t == COMMENT
      case c
      of '>':
        switch_state DATA
        emit_tok
      of eof:
        emit_tok
        emit_eof
      of null:
        # NUL is reported and replaced by U+FFFD in the comment data (HTML
        # Standard, "bogus comment state"), matching how the COMMENT state
        # handles it. Previously the character was silently dropped.
        parse_error unexpected_null_character
        tokenizer.tok.data &= Rune(0xFFFD)
      else: tokenizer.tok.data &= r

    of MARKUP_DECLARATION_OPEN: # note: rewritten to fit case model as we consume a char anyway
      # Entered after "<!". One character (c) has already been consumed, so
      # each keyword is matched as c plus a lookahead on the remaining input:
      # "--" opens a comment, "DOCTYPE" (any case) a doctype, "[CDATA[" (exact
      # case) a CDATA section. Anything else becomes a bogus comment.
      has_anything_else
      case c
      of '-':
        if peek_char == '-':
          new_token Token(t: COMMENT)
          # NOTE(review): the state is assigned directly here instead of via
          # switch_state as elsewhere - confirm that this is intentional.
          tokenizer.state = COMMENT_START
          consume_and_discard 1
        else: anything_else
      of 'D', 'd':
        if peek_str_nocase("OCTYPE"):
          consume_and_discard "OCTYPE".len
          switch_state DOCTYPE
        else: anything_else
      of '[':
        if peek_str("CDATA["):
          consume_and_discard "CDATA[".len
          if tokenizer.hasnonhtml:
            switch_state CDATA_SECTION
          else:
            # Without the hasnonhtml flag, the section is turned into a bogus
            # comment whose data starts with "[CDATA[".
            parse_error cdata_in_html_content
            new_token Token(t: COMMENT, data: "[CDATA[")
            switch_state BOGUS_COMMENT
        else: anything_else
      else:
        # c was consumed by this state, so it is handed back for
        # BOGUS_COMMENT to process.
        parse_error incorrectly_opened_comment
        new_token Token(t: COMMENT)
        reconsume_in BOGUS_COMMENT

    of COMMENT_START:
      # First character after "<!--".
      case c
      of '-': switch_state COMMENT_START_DASH
      of '>':
        # "<!-->" is an (erroneously closed) empty comment.
        parse_error abrupt_closing_of_empty_comment
        switch_state DATA
        emit_tok
      else: reconsume_in COMMENT

    of COMMENT_START_DASH:
      # Seen "<!--" followed by one more '-'.
      case c
      of '-': switch_state COMMENT_END
      of '>':
        # "<!--->" is an (erroneously closed) empty comment.
        parse_error abrupt_closing_of_empty_comment
        switch_state DATA
        emit_tok
      of eof:
        parse_error eof_in_comment
        emit_tok
        emit_eof
      else:
        # The pending '-' turned out to be ordinary comment text.
        tokenizer.tok.data &= '-'
        reconsume_in COMMENT

    of COMMENT:
      # Body of a comment: characters are appended to the token's data while
      # watching for '<' (possible nested "<!--") and '-' (possible end).
      case c
      of '<':
        tokenizer.tok.data &= c
        switch_state COMMENT_LESS_THAN_SIGN
      of '-': switch_state COMMENT_END_DASH
      of null:
        # NUL is replaced by U+FFFD REPLACEMENT CHARACTER.
        parse_error unexpected_null_character
        tokenizer.tok.data &= Rune(0xFFFD)
      of eof:
        parse_error eof_in_comment
        emit_tok
        emit_eof
      else: tokenizer.tok.data &= r

    of COMMENT_LESS_THAN_SIGN:
      # After a '<' inside a comment (already appended to the data): follows
      # a possible "<!--" sequence so nested comments can be reported.
      case c
      of '!':
        tokenizer.tok.data &= c
        switch_state COMMENT_LESS_THAN_SIGN_BANG
      of '<': tokenizer.tok.data &= c
      else: reconsume_in COMMENT

    of COMMENT_LESS_THAN_SIGN_BANG:
      # Seen "<!" inside a comment (both characters already appended).
      case c
      of '-': switch_state COMMENT_LESS_THAN_SIGN_BANG_DASH
      else: reconsume_in COMMENT

    of COMMENT_LESS_THAN_SIGN_BANG_DASH:
      # Seen "<!-" inside a comment. The '-' has not been appended yet, so
      # the fallback goes through COMMENT_END_DASH, which accounts for it.
      case c
      of '-': switch_state COMMENT_LESS_THAN_SIGN_BANG_DASH_DASH
      else: reconsume_in COMMENT_END_DASH

    of COMMENT_LESS_THAN_SIGN_BANG_DASH_DASH:
      # Seen "<!--" inside a comment. Unless the comment ends right here,
      # this is reported as a nested comment; either way COMMENT_END takes
      # over with the current character.
      case c
      of '>', eof: reconsume_in COMMENT_END
      else:
        parse_error nested_comment
        reconsume_in COMMENT_END

    of COMMENT_END_DASH:
      # One '-' has been seen inside a comment; a second one may start "-->".
      case c
      of '-': switch_state COMMENT_END
      of eof:
        parse_error eof_in_comment
        emit_tok
        emit_eof
      else:
        # The single '-' was ordinary comment text after all.
        tokenizer.tok.data &= '-'
        reconsume_in COMMENT

    of COMMENT_END:
      # "--" has been seen inside a comment; '>' completes the "-->"
      # terminator.
      case c
      of '>':
        # The finished comment token has to be emitted before returning to
        # DATA (HTML Standard, "comment end state"); without emit_tok every
        # regularly closed comment would be lost.
        switch_state DATA
        emit_tok
      of '!': switch_state COMMENT_END_BANG
      # A further '-' keeps the pending "--" and adds one '-' to the data.
      of '-': tokenizer.tok.data &= '-'
      of eof:
        parse_error eof_in_comment
        emit_tok
        emit_eof
      else:
        # The "--" was ordinary comment text after all.
        tokenizer.tok.data &= "--"
        reconsume_in COMMENT

    of COMMENT_END_BANG:
      # "--!" has been seen inside a comment.
      case c
      of '-':
        # "--!" becomes comment text; the new '-' may start another "-->".
        tokenizer.tok.data &= "--!"
        switch_state COMMENT_END_DASH
      of '>':
        # "--!>" closes the comment, but is reported as an error.
        parse_error incorrectly_closed_comment
        switch_state DATA
        emit_tok
      of eof:
        parse_error eof_in_comment
        emit_tok
        emit_eof
      else:
        tokenizer.tok.data &= "--!"
        reconsume_in COMMENT

    of DOCTYPE:
      # Directly after the "DOCTYPE" keyword.
      case c
      of AsciiWhitespace: switch_state BEFORE_DOCTYPE_NAME
      of '>': reconsume_in BEFORE_DOCTYPE_NAME
      of eof:
        # No token exists yet, so one is created with the quirks flag set
        # just to be emitted.
        parse_error eof_in_doctype
        new_token Token(t: DOCTYPE, quirks: true)
        emit_tok
        emit_eof
      else:
        parse_error missing_whitespace_before_doctype_name
        reconsume_in BEFORE_DOCTYPE_NAME

    of BEFORE_DOCTYPE_NAME:
      # Skips whitespace before the doctype name and creates the DOCTYPE
      # token on the first significant character.
      case c
      of AsciiWhitespace: discard
      of AsciiUpperAlpha:
        # Doctype names are stored lowercased.
        new_token Token(t: DOCTYPE, name: some($c.tolower()))
        switch_state DOCTYPE_NAME
      of null:
        # NUL becomes U+FFFD as the first character of the name. The state
        # must advance to DOCTYPE_NAME like the other name-starting branches;
        # otherwise the next character would create a fresh token and discard
        # this one.
        parse_error unexpected_null_character
        new_token Token(t: DOCTYPE, name: some($Rune(0xFFFD)))
        switch_state DOCTYPE_NAME
      of '>':
        # "<!DOCTYPE>": no name at all, so the token is flagged as quirks.
        parse_error missing_doctype_name
        new_token Token(t: DOCTYPE, quirks: true)
        switch_state DATA
        emit_tok
      of eof:
        parse_error eof_in_doctype
        new_token Token(t: DOCTYPE, quirks: true)
        emit_tok
        emit_eof
      else:
        new_token Token(t: DOCTYPE, name: some($r))
        switch_state DOCTYPE_NAME

    of DOCTYPE_NAME:
      # Appends to the doctype name (tokenizer.tok.name) until whitespace or
      # '>' is reached.
      case c
      of AsciiWhitespace: switch_state AFTER_DOCTYPE_NAME
      of '>':
        switch_state DATA
        emit_tok
      of AsciiUpperAlpha:
        # Doctype names are stored lowercased.
        tokenizer.tok.name.get &= c.tolower()
      of null:
        # NUL is replaced by U+FFFD REPLACEMENT CHARACTER.
        parse_error unexpected_null_character
        tokenizer.tok.name.get &= Rune(0xFFFD)
      of eof:
        parse_error eof_in_doctype
        tokenizer.tok.quirks = true
        emit_tok
        emit_eof
      else:
        tokenizer.tok.name.get &= r

    of AFTER_DOCTYPE_NAME: # note: rewritten to fit case model as we consume a char anyway
      # After the doctype name: looks for the PUBLIC or SYSTEM keyword. One
      # character (c) has already been consumed, so the keyword is matched as
      # c plus a lookahead on the remaining five letters.
      has_anything_else
      case c
      of AsciiWhitespace: discard
      of '>':
        switch_state DATA
        emit_tok
      of eof:
        parse_error eof_in_doctype
        tokenizer.tok.quirks = true
        emit_tok
        emit_eof
      of 'p', 'P':
        # The keyword is ASCII case-insensitive (HTML Standard, "after
        # DOCTYPE name state"), so the rest is matched with peek_str_nocase,
        # as is done for "DOCTYPE" in MARKUP_DECLARATION_OPEN. A
        # case-sensitive match wrongly rejected e.g. "public".
        if peek_str_nocase("UBLIC"):
          consume_and_discard "UBLIC".len
          switch_state AFTER_DOCTYPE_PUBLIC_KEYWORD
        else:
          anything_else
      of 's', 'S':
        # Same case-insensitive matching for "SYSTEM".
        if peek_str_nocase("YSTEM"):
          consume_and_discard "YSTEM".len
          switch_state AFTER_DOCTYPE_SYSTEM_KEYWORD
        else:
          anything_else
      else:
        parse_error invalid_character_sequence_after_doctype_name
        tokenizer.tok.quirks = true
        reconsume_in BOGUS_DOCTYPE

    of AFTER_DOCTYPE_PUBLIC_KEYWORD:
      # Directly after the PUBLIC keyword of a doctype. A quote appearing
      # without separating whitespace is an error, but still starts the
      # public identifier.
      case c
      of AsciiWhitespace: switch_state BEFORE_DOCTYPE_PUBLIC_IDENTIFIER
      of '"':
        parse_error missing_whitespace_after_doctype_public_keyword
        tokenizer.tok.pubid = some("")
        switch_state DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED
      of '\'':
        # Single-quoted counterpart of the branch above; it was missing, so
        # PUBLIC'...' fell through to the bogus-doctype path. This mirrors
        # the two quote branches of AFTER_DOCTYPE_SYSTEM_KEYWORD.
        parse_error missing_whitespace_after_doctype_public_keyword
        tokenizer.tok.pubid = some("")
        switch_state DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED
      of '>':
        parse_error missing_doctype_public_identifier
        tokenizer.tok.quirks = true
        switch_state DATA
        emit_tok
      of eof:
        parse_error eof_in_doctype
        tokenizer.tok.quirks = true
        emit_tok
        emit_eof
      else:
        parse_error missing_quote_before_doctype_public_identifier
        tokenizer.tok.quirks = true
        reconsume_in BOGUS_DOCTYPE

    of BEFORE_DOCTYPE_PUBLIC_IDENTIFIER:
      # Whitespace after the PUBLIC keyword has been seen; waits for the
      # opening quote of the public identifier.
      case c
      of AsciiWhitespace: discard
      of '"':
        # Initialise pubid to an empty string so the identifier states can
        # append to it.
        tokenizer.tok.pubid = some("")
        switch_state DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED
      of '\'':
        tokenizer.tok.pubid = some("")
        switch_state DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED
      of '>':
        parse_error missing_doctype_public_identifier
        tokenizer.tok.quirks = true
        switch_state DATA
        emit_tok
      of eof:
        parse_error eof_in_doctype
        tokenizer.tok.quirks = true
        emit_tok
        emit_eof
      else:
        parse_error missing_quote_before_doctype_public_identifier
        tokenizer.tok.quirks = true
        reconsume_in BOGUS_DOCTYPE

    of DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED:
      # Reads the double-quoted public identifier into tokenizer.tok.pubid.
      case c
      of '"': switch_state AFTER_DOCTYPE_PUBLIC_IDENTIFIER
      of null:
        # NUL is replaced by U+FFFD REPLACEMENT CHARACTER.
        parse_error unexpected_null_character
        tokenizer.tok.pubid.get &= Rune(0xFFFD)
      of '>':
        # '>' before the closing quote ends the doctype early, with the
        # quirks flag set.
        parse_error abrupt_doctype_public_identifier
        tokenizer.tok.quirks = true
        switch_state DATA
        emit_tok
      of eof:
        parse_error eof_in_doctype
        tokenizer.tok.quirks = true
        emit_tok
        emit_eof
      else:
        tokenizer.tok.pubid.get &= r

    of DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED:
      # Reads the single-quoted public identifier into tokenizer.tok.pubid;
      # mirrors the double-quoted state except for the closing delimiter.
      case c
      of '\'': switch_state AFTER_DOCTYPE_PUBLIC_IDENTIFIER
      of null:
        # NUL is replaced by U+FFFD REPLACEMENT CHARACTER.
        parse_error unexpected_null_character
        tokenizer.tok.pubid.get &= Rune(0xFFFD)
      of '>':
        # '>' before the closing quote ends the doctype early, with the
        # quirks flag set.
        parse_error abrupt_doctype_public_identifier
        tokenizer.tok.quirks = true
        switch_state DATA
        emit_tok
      of eof:
        parse_error eof_in_doctype
        tokenizer.tok.quirks = true
        emit_tok
        emit_eof
      else:
        tokenizer.tok.pubid.get &= r

    of AFTER_DOCTYPE_PUBLIC_IDENTIFIER:
      # Directly after the closing quote of the public identifier; a system
      # identifier may follow.
      case c
      of AsciiWhitespace: switch_state BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS
      of '>':
        switch_state DATA
        emit_tok
      of '"':
        # A system identifier starting without separating whitespace is an
        # error, but is still accepted.
        parse_error missing_whitespace_between_doctype_public_and_system_identifiers
        tokenizer.tok.sysid = some("")
        switch_state DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED
      of '\'':
        parse_error missing_whitespace_between_doctype_public_and_system_identifiers
        tokenizer.tok.sysid = some("")
        switch_state DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED
      of eof:
        parse_error eof_in_doctype
        tokenizer.tok.quirks = true
        emit_tok
        emit_eof
      else:
        parse_error missing_quote_before_doctype_system_identifier
        tokenizer.tok.quirks = true
        reconsume_in BOGUS_DOCTYPE

    of BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS:
      # Whitespace after the public identifier has been seen; an optional
      # system identifier may start here.
      case c
      of AsciiWhitespace: discard
      of '>':
        switch_state DATA
        emit_tok
      of '"':
        # Initialise sysid to an empty string so the identifier states can
        # append to it.
        tokenizer.tok.sysid = some("")
        switch_state DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED
      of '\'':
        tokenizer.tok.sysid = some("")
        switch_state DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED
      of eof:
        parse_error eof_in_doctype
        tokenizer.tok.quirks = true
        emit_tok
        emit_eof
      else:
        parse_error missing_quote_before_doctype_system_identifier
        tokenizer.tok.quirks = true
        reconsume_in BOGUS_DOCTYPE

    of AFTER_DOCTYPE_SYSTEM_KEYWORD:
      # Directly after the SYSTEM keyword of a doctype. A quote appearing
      # without separating whitespace is an error, but still starts the
      # system identifier.
      case c
      of AsciiWhitespace: switch_state BEFORE_DOCTYPE_SYSTEM_IDENTIFIER
      of '"':
        parse_error missing_whitespace_after_doctype_system_keyword
        tokenizer.tok.sysid = some("")
        switch_state DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED
      of '\'':
        parse_error missing_whitespace_after_doctype_system_keyword
        tokenizer.tok.sysid = some("")
        switch_state DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED
      of '>':
        parse_error missing_doctype_system_identifier
        tokenizer.tok.quirks = true
        switch_state DATA
        emit_tok
      of eof:
        parse_error eof_in_doctype
        tokenizer.tok.quirks = true
        emit_tok
        emit_eof
      else:
        parse_error missing_quote_before_doctype_system_identifier
        tokenizer.tok.quirks = true
        reconsume_in BOGUS_DOCTYPE

    of BEFORE_DOCTYPE_SYSTEM_IDENTIFIER:
      # Whitespace after the SYSTEM keyword has been seen; waits for the
      # opening quote of the system identifier.
      case c
      of AsciiWhitespace: discard
      of '"':
        # It is the *system* identifier that starts here. Initialising pubid
        # instead (as before) left sysid unset, so the sysid.get calls in the
        # DOCTYPE_SYSTEM_IDENTIFIER_* states would fail, and it also
        # overwrote any public identifier already parsed.
        tokenizer.tok.sysid = some("")
        switch_state DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED
      of '\'':
        tokenizer.tok.sysid = some("")
        switch_state DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED
      of '>':
        parse_error missing_doctype_system_identifier
        tokenizer.tok.quirks = true
        switch_state DATA
        emit_tok
      of eof:
        parse_error eof_in_doctype
        tokenizer.tok.quirks = true
        emit_tok
        emit_eof
      else:
        parse_error missing_quote_before_doctype_system_identifier
        tokenizer.tok.quirks = true
        reconsume_in BOGUS_DOCTYPE

    of DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED:
      # Reads the double-quoted system identifier into tokenizer.tok.sysid.
      case c
      of '"': switch_state AFTER_DOCTYPE_SYSTEM_IDENTIFIER
      of null:
        # NUL is replaced by U+FFFD REPLACEMENT CHARACTER.
        parse_error unexpected_null_character
        tokenizer.tok.sysid.get &= Rune(0xFFFD)
      of '>':
        # '>' before the closing quote ends the doctype early, with the
        # quirks flag set.
        parse_error abrupt_doctype_system_identifier
        tokenizer.tok.quirks = true
        switch_state DATA
        emit_tok
      of eof:
        parse_error eof_in_doctype
        tokenizer.tok.quirks = true
        emit_tok
        emit_eof
      else:
        tokenizer.tok.sysid.get &= r

    of DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED:
      # Reads the single-quoted system identifier into tokenizer.tok.sysid;
      # mirrors the double-quoted state except for the closing delimiter.
      case c
      of '\'': switch_state AFTER_DOCTYPE_SYSTEM_IDENTIFIER
      of null:
        # NUL is replaced by U+FFFD REPLACEMENT CHARACTER.
        parse_error unexpected_null_character
        tokenizer.tok.sysid.get &= Rune(0xFFFD)
      of '>':
        # '>' before the closing quote ends the doctype early, with the
        # quirks flag set.
        parse_error abrupt_doctype_system_identifier
        tokenizer.tok.quirks = true
        switch_state DATA
        emit_tok
      of eof:
        parse_error eof_in_doctype
        tokenizer.tok.quirks = true
        emit_tok
        emit_eof
      else:
        tokenizer.tok.sysid.get &= r

    of AFTER_DOCTYPE_SYSTEM_IDENTIFIER:
      # After the closing quote of the system identifier; only whitespace
      # and '>' are expected.
      case c
      of AsciiWhitespace: discard
      of '>':
        switch_state DATA
        emit_tok
      of eof:
        parse_error eof_in_doctype
        tokenizer.tok.quirks = true
        emit_tok
        emit_eof
      else:
        # Unlike the other bogus-doctype transitions, this one does not set
        # the quirks flag.
        parse_error unexpected_character_after_doctype_system_identifier
        reconsume_in BOGUS_DOCTYPE

    of BOGUS_DOCTYPE:
      # Skips the remainder of a malformed doctype up to the next '>' or the
      # end of input; the skipped characters are not stored anywhere.
      case c
      of '>':
        switch_state DATA
        emit_tok
      # NUL is reported but otherwise ignored, like every other character.
      of null: parse_error unexpected_null_character
      of eof:
        emit_tok
        emit_eof
      else: discard

    of CDATA_SECTION:
      # Inside "<![CDATA[ ... ]]>": every character is emitted as-is, while
      # ']' is held back as the possible start of the "]]>" terminator.
      case c
      of ']': switch_state CDATA_SECTION_BRACKET
      of eof:
        parse_error eof_in_cdata
        emit_eof
      else:
        emit_current

    of CDATA_SECTION_BRACKET:
      # One ']' has been seen inside a CDATA section. Only "]]>" terminates
      # the section, so after a single ']' a '>' is ordinary content. The
      # former '>' branch ended the section on "]>" and dropped both
      # characters (HTML Standard, "CDATA section bracket state").
      case c
      of ']': switch_state CDATA_SECTION_END
      else:
        # Emit the held-back ']' and let CDATA_SECTION handle the current
        # character.
        emit ']'
        reconsume_in CDATA_SECTION

    of CDATA_SECTION_END:
      # "]]" has been seen inside a CDATA section.
      case c
      # A third ']' emits one bracket and keeps "]]" pending.
      of ']': emit ']'
      # "]]>" ends the section; the terminator itself is not emitted.
      of '>': switch_state DATA
      else:
        # Not a terminator: emit both held-back brackets and let
        # CDATA_SECTION handle the current character.
        emit ']'
        emit ']'
        reconsume_in CDATA_SECTION

    of CHARACTER_REFERENCE:
      # Entered after '&'. tokenizer.tmp collects the raw text of the
      # reference so it can be flushed verbatim if no reference is recognised.
      tokenizer.tmp = "&"
      case c
      of AsciiAlpha: reconsume_in NAMED_CHARACTER_REFERENCE
      of '#':
        tokenizer.tmp &= '#'
        switch_state NUMERIC_CHARACTER_REFERENCE
      else:
        # Not a reference: output the lone "&" and let the return state
        # (tokenizer.rstate) handle the current character.
        flush_code_points_consumed_as_a_character_reference
        reconsume_in tokenizer.rstate

    of NAMED_CHARACTER_REFERENCE:
      # Matches the longest named character reference (e.g. "&amp;") by
      # feeding input characters to entityMap.find one at a time.
      ignore_eof # we check for eof ourselves
      # Hand back the character consumed by the main loop; the callback below
      # consumes the input itself.
      tokenizer.reconsume()
      when nimvm:
        eprint "Cannot evaluate character references at compile time"
      else:
        # The closure reaches the tokenizer through a pointer.
        var tokenizerp = addr tokenizer
        # Length of tokenizer.tmp before the most recently consumed character
        # was appended.
        var lasti = 0
        let value = entityMap.find(proc(s: var string): bool =
          if tokenizerp[].atEof:
            return false
          let rs = $tokenizerp[].consume()
          lasti = tokenizerp[].tmp.len
          tokenizerp[].tmp &= rs
          s &= rs
          return true
        )
        # Undo the last consumed character, both in the input and in tmp.
        # NOTE(review): this assumes the search always reads one character
        # past the match; verify the case where the callback stops at EOF
        # without consuming anything.
        tokenizer.reconsume()
        tokenizer.tmp.setLen(lasti)
        if value.isOk:
          # Legacy rule for attribute values: a match without ';' that is
          # followed by '=' or an ASCII alphanumeric is left as literal text.
          # The HTML Standard says alphanumeric, so digits are included here
          # in addition to letters.
          if consumed_as_an_attribute and tokenizer.tmp[^1] != ';' and peek_char in {'='} + AsciiAlpha + AsciiDigit:
            flush_code_points_consumed_as_a_character_reference
            switch_state tokenizer.rstate
          else:
            if tokenizer.tmp[^1] != ';':
              # Same error identifier as the numeric reference states use.
              parse_error missing_semicolon_after_character_reference
            # Replace the raw text with the decoded value before flushing.
            tokenizer.tmp = value.get
            flush_code_points_consumed_as_a_character_reference
            switch_state tokenizer.rstate
        else:
          flush_code_points_consumed_as_a_character_reference
          switch_state AMBIGUOUS_AMPERSAND_STATE

    of AMBIGUOUS_AMPERSAND_STATE:
      # Entered after '&' followed by text that matched no named reference:
      # letters are passed through to wherever the reference appeared
      # (attribute value or character output).
      case c
      of AsciiAlpha:
        if consumed_as_an_attribute:
          append_to_current_attr_value c
        else:
          emit_current
      of ';':
        # Something like "&foo;" with an unknown name: report it, then let
        # the return state handle the ';'.
        parse_error unknown_named_character_reference
        reconsume_in tokenizer.rstate
      else: reconsume_in tokenizer.rstate

    of NUMERIC_CHARACTER_REFERENCE:
      # Entered after "&#": resets the accumulated code point and selects
      # hexadecimal ("&#x" / "&#X") or decimal parsing.
      tokenizer.code = 0
      case c
      of 'x', 'X':
        tokenizer.tmp &= c
        switch_state HEXADECIMAL_CHARACTER_REFERENCE_START
      else: reconsume_in DECIMAL_CHARACTER_REFERENCE_START

    of HEXADECIMAL_CHARACTER_REFERENCE_START:
      # Requires at least one hex digit after "&#x".
      case c
      of AsciiHexDigit: reconsume_in HEXADECIMAL_CHARACTER_REFERENCE
      else:
        # No digits: output the raw text collected so far and resume in the
        # return state with the current character.
        parse_error absence_of_digits_in_numeric_character_reference
        flush_code_points_consumed_as_a_character_reference
        reconsume_in tokenizer.rstate

    of DECIMAL_CHARACTER_REFERENCE_START:
      # Requires at least one decimal digit after "&#".
      case c
      of AsciiDigit: reconsume_in DECIMAL_CHARACTER_REFERENCE
      else:
        # No digits: output the raw text collected so far and resume in the
        # return state with the current character.
        parse_error absence_of_digits_in_numeric_character_reference
        flush_code_points_consumed_as_a_character_reference
        reconsume_in tokenizer.rstate

    of HEXADECIMAL_CHARACTER_REFERENCE:
      # Accumulates the hexadecimal digits of "&#x...;" into tokenizer.code.
      case c
      of AsciiHexDigit: # note: merged digit, upper hex, lower hex
        # Stop accumulating once the value is past the Unicode range: any
        # larger value is rejected by NUMERIC_CHARACTER_REFERENCE_END anyway,
        # and an arbitrarily long digit run from untrusted input would
        # otherwise overflow the integer.
        if tokenizer.code <= 0x10FFFF:
          tokenizer.code *= 0x10
          tokenizer.code += hexValue(c)
      of ';': switch_state NUMERIC_CHARACTER_REFERENCE_END
      else:
        parse_error missing_semicolon_after_character_reference
        reconsume_in NUMERIC_CHARACTER_REFERENCE_END

    of DECIMAL_CHARACTER_REFERENCE:
      # Accumulates the decimal digits of "&#...;" into tokenizer.code.
      case c
      of AsciiDigit:
        # Stop accumulating once the value is past the Unicode range: any
        # larger value is rejected by NUMERIC_CHARACTER_REFERENCE_END anyway,
        # and an arbitrarily long digit run from untrusted input would
        # otherwise overflow the integer.
        if tokenizer.code <= 0x10FFFF:
          tokenizer.code *= 10
          tokenizer.code += decValue(c)
      of ';': switch_state NUMERIC_CHARACTER_REFERENCE_END
      else:
        parse_error missing_semicolon_after_character_reference
        reconsume_in NUMERIC_CHARACTER_REFERENCE_END

    of NUMERIC_CHARACTER_REFERENCE_END:
      # Validates the code point accumulated in tokenizer.code, substitutes
      # it where required, and outputs the resulting character.
      ignore_eof # we reconsume anyway
      case tokenizer.code
      of 0x00:
        parse_error null_character_reference
        tokenizer.code = 0xFFFD
      elif tokenizer.code > 0x10FFFF:
        parse_error character_reference_outside_unicode_range
        tokenizer.code = 0xFFFD
      elif Rune(tokenizer.code).isSurrogate():
        parse_error surrogate_character_reference
        tokenizer.code = 0xFFFD
      elif Rune(tokenizer.code).isNonCharacter():
        parse_error noncharacter_character_reference
        # do nothing
      elif tokenizer.code in 0..255 and char(tokenizer.code) in ((Controls - AsciiWhitespace) + {chr(0x0D)}):
        # NOTE(review): the HTML Standard also reports a
        # control-character-reference parse error in this case; none is
        # raised here - confirm whether that is intentional.
        # Remaps 0x80..0x9F references to the code points listed below (the
        # HTML Standard's replacement table for numeric references).
        # NOTE(review): this only takes effect if Controls contains
        # 0x80..0x9F - verify.
        const ControlMapTable = [
          (0x80, 0x20AC), (0x82, 0x201A), (0x83, 0x0192), (0x84, 0x201E),
          (0x85, 0x2026), (0x86, 0x2020), (0x87, 0x2021), (0x88, 0x02C6),
          (0x89, 0x2030), (0x8A, 0x0160), (0x8B, 0x2039), (0x8C, 0x0152),
          (0x8E, 0x017D), (0x91, 0x2018), (0x92, 0x2019), (0x93, 0x201C),
          (0x94, 0x201D), (0x95, 0x2022), (0x96, 0x2013), (0x97, 0x2014),
          (0x98, 0x02DC), (0x99, 0x2122), (0x9A, 0x0161), (0x9B, 0x203A),
          (0x9C, 0x0153), (0x9E, 0x017E), (0x9F, 0x0178),
        ].toTable()
        if ControlMapTable.hasKey(tokenizer.code):
          tokenizer.code = ControlMapTable[tokenizer.code]
      # Replace the raw reference text with the resolved character, then
      # output it.
      tokenizer.tmp = $Rune(tokenizer.code)
      flush_code_points_consumed_as_a_character_reference #TODO optimize so we flush directly
      reconsume_in tokenizer.rstate # we unnecessarily consumed once so reconsume