path: root/compiler/lexer.nim



#
#
#           The Nimrod Compiler
#        (c) Copyright 2011 Andreas Rumpf
#
#    See the file "copying.txt", included in this
#    distribution, for details about the copyright.
#

# This scanner is handwritten for efficiency. I used an elegant buffering
# scheme which I have not seen anywhere else:
# We guarantee that a whole line is in the buffer. Thus only when scanning
# the \n or \r character do we have to check whether we need to read in the
# next chunk. (\n or \r already need special handling for incrementing the
# line counter; choosing both \n and \r allows the scanner to properly read
# Unix, DOS or Macintosh text files, even when it is not the native format.)

import 
  hashes, options, msgs, strutils, platform, idents, lexbase, llstream, 
  wordrecg

# Lexer-wide limits and character classes:
# - MaxLineLength: HandleCRLF emits hintLineTooLong for longer lines.
# - numChars: decimal digits plus ASCII letters.
# - SymChars: characters allowed inside an identifier; the underscore is
#   not part of the set and is handled separately by getSymbol.
# - SymStartChars: characters that may start an identifier.
# - OpChars: characters that getOperator accumulates into an operator token.
const 
  MaxLineLength* = 80         # lines longer than this lead to a hint
  numChars*: TCharSet = {'0'..'9', 'a'..'z', 'A'..'Z'}
  SymChars*: TCharSet = {'a'..'z', 'A'..'Z', '0'..'9', '\x80'..'\xFF'}
  SymStartChars*: TCharSet = {'a'..'z', 'A'..'Z', '\x80'..'\xFF'}
  OpChars*: TCharSet = {'+', '-', '*', '/', '\\', '<', '>', '!', '?', '^', '.', 
    '|', '=', '%', '&', '$', '@', '~', ':', '\x80'..'\xFF'}

# All token kinds produced by the lexer. The declaration order is relied upon
# by the code in this module:
# - the keywords form the contiguous range succ(tkSymbol)..pred(tkIntLit),
#   and getSymbol converts an identifier id to a keyword kind by adding
#   ord(tkSymbol);
# - endOperator maps predefined operator ids onto the kinds starting at
#   tkColon;
# - rawGetTok turns tkRStrLit/tkTripleStrLit into tkGStrLit/tkGTripleStrLit
#   by adding 2 to the kind.
# TokTypeToStr must list one string per kind in exactly this order.
type 
  TTokType* = enum 
    tkInvalid, tkEof,         # order is important here!
    tkSymbol, # keywords:
    tkAddr, tkAnd, tkAs, tkAsm, tkAtomic, 
    tkBind, tkBlock, tkBreak, tkCase, tkCast, 
    tkConst, tkContinue, tkConverter, tkDiscard, tkDistinct, tkDiv, tkElif, 
    tkElse, tkEnd, tkEnum, tkExcept, tkFinally, tkFor, tkFrom, tkGeneric, tkIf, 
    tkImport, tkIn, tkInclude, tkIs, tkIsnot, tkIterator,
    tkLambda, tkLet,
    tkMacro, tkMethod, tkMod, tkNil, tkNot, tkNotin, tkObject, tkOf, tkOr, 
    tkOut, tkProc, tkPtr, tkRaise, tkRef, tkReturn, tkShl, tkShr, tkTemplate, 
    tkTry, tkTuple, tkType, tkVar, tkWhen, tkWhile, tkWith, tkWithout, tkXor,
    tkYield, # end of keywords
    tkIntLit, tkInt8Lit, tkInt16Lit, tkInt32Lit, tkInt64Lit, tkFloatLit, 
    tkFloat32Lit, tkFloat64Lit, tkStrLit, tkRStrLit, tkTripleStrLit, 
    tkGStrLit, tkGTripleStrLit, tkCharLit, tkParLe, tkParRi, tkBracketLe, 
    tkBracketRi, tkCurlyLe, tkCurlyRi, 
    tkBracketDotLe, tkBracketDotRi, # [. and  .]
    tkCurlyDotLe, tkCurlyDotRi, # {.  and  .}
    tkParDotLe, tkParDotRi,   # (. and .)
    tkComma, tkSemiColon, tkColon, tkColonColon, tkEquals, tkDot, tkDotDot, 
    tkOpr, tkComment, tkAccent, tkInd, tkSad, 
    tkDed, # pseudo token types used by the source renderers:
    tkSpaces, tkInfixOpr, tkPrefixOpr, tkPostfixOpr
  TTokTypes* = set[TTokType]

# tokKeywordLow/tokKeywordHigh delimit the keyword sub-range of TTokType.
# TokTypeToStr gives a printable name for every token kind (the keyword or
# punctuation text where one exists, otherwise the enum name or a bracketed
# description); its entries must stay in the same order as TTokType.
const 
  tokKeywordLow* = succ(tkSymbol)
  tokKeywordHigh* = pred(tkIntLit)
  TokTypeToStr*: array[TTokType, string] = ["tkInvalid", "[EOF]", 
    "tkSymbol",
    "addr", "and", "as", "asm", "atomic", 
    "bind", "block", "break", "case", "cast", 
    "const", "continue", "converter", "discard", "distinct", "div", "elif", 
    "else", "end", "enum", "except", "finally", "for", "from", "generic", "if", 
    "import", "in", "include", "is", "isnot", "iterator",
    "lambda", "let", 
    "macro", "method", "mod", "nil", "not", "notin", "object", "of", "or", 
    "out", "proc", "ptr", "raise", "ref", "return", "shl", "shr", "template", 
    "try", "tuple", "type", "var", "when", "while", "with", "without", "xor",
    "yield",
    "tkIntLit", "tkInt8Lit", "tkInt16Lit", "tkInt32Lit", "tkInt64Lit", 
    "tkFloatLit", "tkFloat32Lit", "tkFloat64Lit", "tkStrLit", "tkRStrLit", 
    "tkTripleStrLit", "tkGStrLit", "tkGTripleStrLit", "tkCharLit", "(", 
    ")", "[", "]", "{", "}", "[.", ".]", "{.", ".}", "(.", ".)", ",", ";", 
    ":", "::",
    "=", ".", "..", "tkOpr", "tkComment", "`", "[new indentation]", 
    "[same indentation]", "[dedentation]", "tkSpaces", "tkInfixOpr", 
    "tkPrefixOpr", "tkPostfixOpr"]

# Data types of the lexer: the numeric base of a literal, the token record
# filled by rawGetTok, and the lexer state that extends lexbase's TBaseLexer.
type 
  TNumericalBase* = enum 
    base10,                   # base10 is listed as the first element,
                              # so that it is the correct default value
    base2, base8, base16
  TToken* = object            # a Nimrod token
    tokType*: TTokType        # the type of the token
    indent*: int              # the indentation; set by handleIndentation for
                              # the tkInd, tkSad and tkDed tokens
    ident*: PIdent            # the parsed identifier
    iNumber*: BiggestInt      # the parsed integer literal
    fNumber*: BiggestFloat    # the parsed floating point literal
    base*: TNumericalBase     # the numerical base; only valid for int
                              # or float literals
    literal*: string          # the parsed (string) literal; and
                              # documentation comments are here too
  
  TLexer* = object of TBaseLexer
    filename*: string
    indentStack*: seq[int]    # the indentation stack
    dedent*: int              # counter for DED token generation
    indentAhead*: int         # if >= 0 an indentation has already been read
                              # (-1 means none); this is needed because
                              # scanning comments needs so much look-ahead
  

# closeLexer adds the final line number of every closed lexer to this counter
var gLinesCompiled*: int  # all lines that have been compiled

# Forward declarations of the exported lexer interface; the implementations
# follow below.
proc pushInd*(L: var TLexer, indent: int)
  # pushes a new, deeper indentation level onto the indentation stack

proc popInd*(L: var TLexer)
  # removes the top entry of the indentation stack
proc isKeyword*(kind: TTokType): bool
  # true if `kind` lies in tokKeywordLow..tokKeywordHigh
proc openLexer*(lex: var TLexer, filename: string, inputstream: PLLStream)
  # prepares `lex` for reading `inputstream`
proc rawGetTok*(L: var TLexer, tok: var TToken)
  # reads in the next token into tok and skips it
proc getColumn*(L: TLexer): int
  # column of the current buffer position
proc getLineInfo*(L: TLexer): TLineInfo
  # file/line/column information for the current buffer position
proc closeLexer*(lex: var TLexer)
  # updates gLinesCompiled and closes the underlying base lexer
proc PrintTok*(tok: TToken)
  # writes the token kind and its text to stdout
proc tokToStr*(tok: TToken): string
  # textual representation of a token

proc lexMessage*(L: TLexer, msg: TMsgKind, arg = "")
  # reports `msg` at the current buffer position

proc isKeyword(kind: TTokType): bool = 
  ## Tells whether `kind` is one of the keyword token kinds, i.e. lies in
  ## the inclusive range `tokKeywordLow..tokKeywordHigh`.
  result = kind in {tokKeywordLow..tokKeywordHigh}

proc isNimrodIdentifier*(s: string): bool =
  ## Returns true if `s` is spelled like an identifier: it starts with a
  ## character from `SymStartChars`, continues with characters from
  ## `SymChars`, and every underscore is directly followed by a character
  ## from `SymChars` (so no doubled or trailing underscores).
  ## The empty string is not an identifier.
  if s.len == 0 or s[0] notin SymStartChars: return false
  var i = 1
  while i < s.len:
    if s[i] == '_': 
      inc(i)
      # a trailing underscore is invalid; reject it without indexing past
      # the end of `s`
      if i >= s.len: return false
    if s[i] notin SymChars: return false
    inc(i)
  result = true

proc pushInd(L: var TLexer, indent: int) = 
  ## Grows the indentation stack by one slot and stores `indent` in it.
  ## `indent` has to be greater than the previous top of the stack;
  ## otherwise an internal error is reported (the slot is still added).
  var top = high(L.indentStack)
  setlen(L.indentStack, top + 2)
  if indent <= L.indentStack[top]: 
    InternalError("pushInd")
  else: 
    L.indentstack[top + 1] = indent
  
proc popInd(L: var TLexer) = 
  ## Drops the top entry of the indentation stack.
  setlen(L.indentStack, len(L.indentStack) - 1)

proc findIdent(L: TLexer, indent: int): bool = 
  ## Returns true if `indent` occurs anywhere in the indentation stack.
  ## The stack is searched from the top downwards.
  var i = len(L.indentStack) - 1
  while i >= 0: 
    if L.indentStack[i] == indent: return true
    dec(i)

proc tokToStr*(tok: TToken): string = 
  ## Returns the text of `tok`: the number for numeric literals, the stored
  ## literal for invalid tokens, string/char literals and comments, the
  ## fixed `tokTypeToStr` entry for punctuation/EOF/indentation tokens, and
  ## the identifier text for everything else. A token of the last group
  ## without an identifier is reported as an internal error.
  case tok.tokType
  of tkInvalid, tkStrLit..tkCharLit, tkComment: 
    result = tok.literal
  of tkIntLit..tkInt64Lit: 
    result = $tok.iNumber
  of tkFloatLit..tkFloat64Lit: 
    result = $tok.fNumber
  of tkEof, tkParLe..tkColon, tkAccent, tkInd, tkSad, tkDed: 
    result = tokTypeToStr[tok.tokType]
  else: 
    if tok.ident == nil: 
      InternalError("tokToStr")
      result = ""
    else: 
      result = tok.ident.s
  
proc prettyTok*(tok: TToken): string =
  ## Like `tokToStr`, except that keywords are rendered as
  ## "keyword <name>".
  if not IsKeyword(tok.tokType): return tokToStr(tok)
  result = "keyword " & tok.ident.s
  
proc PrintTok*(tok: TToken) = 
  ## Debugging aid: prints the token kind's name, a space and the token
  ## text as one line on stdout.
  stdout.write(TokTypeToStr[tok.tokType])
  stdout.write(" ")
  stdout.writeln(tokToStr(tok))

# placeholder identifier assigned to tokens by initToken and fillToken; it is
# initialized at the end of this module
var dummyIdent: PIdent

proc initToken*(L: var TToken) = 
  ## Resets every field of the token `L`: kind `tkInvalid`, zero numbers
  ## and indentation, base 10, a fresh empty literal and the dummy
  ## identifier.
  L.TokType = tkInvalid
  L.ident = dummyIdent
  L.literal = ""
  L.base = base10
  L.Indent = 0
  L.iNumber = 0
  L.fNumber = 0.0

proc fillToken(L: var TToken) = 
  ## Resets the token `L` like `initToken`, but empties the existing
  ## literal string with `setLen` instead of assigning a new one.
  L.TokType = tkInvalid
  L.ident = dummyIdent
  setLen(L.literal, 0)
  L.base = base10
  L.Indent = 0
  L.iNumber = 0
  L.fNumber = 0.0
  
proc openLexer(lex: var TLexer, filename: string, inputstream: PLLStream) = 
  ## Opens the base lexer on `inputstream` and sets up the lexer state:
  ## the file name, an indentation stack holding only level 0, no pending
  ## indentation (-1), and a line number advanced by the stream's
  ## `lineOffset`.
  openBaseLexer(lex, inputstream)
  lex.filename = filename
  lex.indentAhead = -1
  lex.indentStack = @[0]
  lex.Linenumber = lex.Linenumber + inputstream.lineOffset

proc closeLexer(lex: var TLexer) = 
  ## Adds the lexer's current line number to `gLinesCompiled` and closes
  ## the base lexer.
  gLinesCompiled = gLinesCompiled + lex.LineNumber
  closeBaseLexer(lex)

proc getColumn(L: TLexer): int = 
  ## Column number of the current buffer position.
  return getColNumber(L, L.bufPos)

proc getLineInfo(L: TLexer): TLineInfo = 
  ## Builds the line information (file name, line number, column) for the
  ## current buffer position.
  var col = getColNumber(L, L.bufpos)
  result = newLineInfo(L.filename, L.linenumber, col)

proc lexMessage(L: TLexer, msg: TMsgKind, arg = "") = 
  ## Reports the message `msg` with argument `arg` at the lexer's current
  ## position.
  var info = getLineInfo(L)
  msgs.Message(info, msg, arg)

proc lexMessagePos(L: var TLexer, msg: TMsgKind, pos: int, arg = "") = 
  ## Reports the message `msg` at buffer position `pos` of the current
  ## line; the column is `pos` relative to the start of the line.
  msgs.Message(newLineInfo(L.filename, L.linenumber, pos - L.lineStart), 
               msg, arg)

proc matchUnderscoreChars(L: var TLexer, tok: var TToken, chars: TCharSet) = 
  ## Appends to `tok.literal` the characters from `chars` found at the
  ## current position, together with single underscores between them, and
  ## advances the lexer past them. An underscore that is not followed by a
  ## character from `chars` is reported as an invalid token and ends the
  ## scan in front of it.
  var pos = L.bufpos              # use registers for pos, buf
  var buf = L.buf
  while buf[pos] in chars: 
    add(tok.literal, buf[pos])
    Inc(pos)
    if buf[pos] == '_': 
      if buf[pos+1] notin chars: 
        lexMessage(L, errInvalidToken, "_")
        break
      add(tok.literal, '_')
      Inc(pos)
  L.bufPos = pos

proc matchTwoChars(L: TLexer, first: Char, second: TCharSet): bool = 
  ## True if the character at the current position equals `first` and the
  ## one after it is a member of `second`.
  if L.buf[L.bufpos] != first: return false
  result = L.buf[L.bufpos + 1] in second

proc isFloatLiteral(s: string): bool = 
  ## True if the literal text `s` contains a decimal point or an exponent
  ## marker (`e`/`E`).
  for c in s: 
    if c == '.' or c == 'e' or c == 'E': return true
  result = false

proc GetNumber(L: var TLexer): TToken = 
  ## Scans the numeric literal at the current position and returns it as a
  ## token. The work is done in two passes over the same text:
  ##
  ## 1. The literal text (digits, letters, underscores, an optional
  ##    fraction with exponent) is collected into `result.literal` and an
  ##    optional type suffix (`'f32`, `'f64`, `'i8`, `'i16`, `'i32`,
  ##    `'i64`) determines the token kind. `endpos` remembers where the
  ##    literal ends.
  ## 2. The value is computed: literals with a base prefix (`0x`, `0b`,
  ##    `0o`, `0c`) are evaluated digit by digit, everything else goes
  ##    through `parseFloat`/`ParseBiggestInt`.
  ##
  ## Malformed literals are reported with `errInvalidNumber` or
  ## `errInvalidToken`, values that do not fit with `errNumberOutOfRange`.
  ## On return the lexer is positioned at `endpos`.
  var 
    pos, endpos: int
    xi: biggestInt
  # get the base:
  result.tokType = tkIntLit   # int literal until we know better
  result.literal = ""
  result.base = base10        # BUGFIX
  pos = L.bufpos     # make sure the literal is correct for error messages:
  # letters are accepted here so that base prefixes and hex digits end up in
  # the literal text as well
  matchUnderscoreChars(L, result, {'A'..'Z', 'a'..'z', '0'..'9'})
  if (L.buf[L.bufpos] == '.') and (L.buf[L.bufpos + 1] in {'0'..'9'}): 
    add(result.literal, '.')
    inc(L.bufpos) 
    #matchUnderscoreChars(L, result, ['A'..'Z', 'a'..'z', '0'..'9'])
    matchUnderscoreChars(L, result, {'0'..'9'})
    if L.buf[L.bufpos] in {'e', 'E'}: 
      # the exponent marker is always stored as a lower case 'e'
      add(result.literal, 'e')
      inc(L.bufpos)
      if L.buf[L.bufpos] in {'+', '-'}: 
        add(result.literal, L.buf[L.bufpos])
        inc(L.bufpos)
      matchUnderscoreChars(L, result, {'0'..'9'})
  endpos = L.bufpos
  if L.buf[endpos] == '\'':
    # a type suffix follows; `endpos` is moved past it while the lexer
    # position goes back to the start of the literal for the error messages
    #matchUnderscoreChars(L, result, ['''', 'f', 'F', 'i', 'I', '0'..'9']);
    inc(endpos)
    L.bufpos = pos            # restore position
    case L.buf[endpos]
    of 'f', 'F': 
      inc(endpos)
      if (L.buf[endpos] == '6') and (L.buf[endpos + 1] == '4'): 
        result.tokType = tkFloat64Lit
        inc(endpos, 2)
      elif (L.buf[endpos] == '3') and (L.buf[endpos + 1] == '2'): 
        result.tokType = tkFloat32Lit
        inc(endpos, 2)
      else: 
        lexMessage(L, errInvalidNumber, result.literal & "'f" & L.buf[endpos])
    of 'i', 'I': 
      inc(endpos)
      if (L.buf[endpos] == '6') and (L.buf[endpos + 1] == '4'): 
        result.tokType = tkInt64Lit
        inc(endpos, 2)
      elif (L.buf[endpos] == '3') and (L.buf[endpos + 1] == '2'): 
        result.tokType = tkInt32Lit
        inc(endpos, 2)
      elif (L.buf[endpos] == '1') and (L.buf[endpos + 1] == '6'): 
        result.tokType = tkInt16Lit
        inc(endpos, 2)
      elif (L.buf[endpos] == '8'): 
        result.tokType = tkInt8Lit
        inc(endpos)
      else: 
        lexMessage(L, errInvalidNumber, result.literal & "'i" & L.buf[endpos])
    else: lexMessage(L, errInvalidNumber, result.literal & "'" & L.buf[endpos])
  else:
    L.bufpos = pos            # restore position
  # second pass: compute the value; `pos` is still at the start of the literal
  try: 
    if (L.buf[pos] == '0') and
        (L.buf[pos + 1] in {'x', 'X', 'b', 'B', 'o', 'O', 'c', 'C'}): 
      inc(pos, 2)
      xi = 0                  # the literal has a base prefix
      case L.buf[pos - 1]     # dispatch on the base prefix character:
      of 'b', 'B': 
        result.base = base2
        while true: 
          case L.buf[pos]
          of 'A'..'Z', 'a'..'z', '2'..'9', '.': 
            lexMessage(L, errInvalidNumber, result.literal)
            inc(pos)
          of '_': 
            if L.buf[pos+1] notin {'0'..'1'}: 
              lexMessage(L, errInvalidToken, "_")
              break
            inc(pos)
          of '0', '1': 
            xi = `shl`(xi, 1) or (ord(L.buf[pos]) - ord('0'))
            inc(pos)
          else: break 
      of 'o', 'c', 'C': 
        result.base = base8
        while true: 
          case L.buf[pos]
          of 'A'..'Z', 'a'..'z', '8'..'9', '.': 
            lexMessage(L, errInvalidNumber, result.literal)
            inc(pos)
          of '_': 
            if L.buf[pos+1] notin {'0'..'7'}:
              lexMessage(L, errInvalidToken, "_")
              break
            inc(pos)
          of '0'..'7': 
            xi = `shl`(xi, 3) or (ord(L.buf[pos]) - ord('0'))
            inc(pos)
          else: break 
      of 'O': 
        # an upper case 'O' prefix is rejected
        lexMessage(L, errInvalidNumber, result.literal)
      of 'x', 'X': 
        result.base = base16
        while true: 
          case L.buf[pos]
          of 'G'..'Z', 'g'..'z': 
            lexMessage(L, errInvalidNumber, result.literal)
            inc(pos)
          of '_': 
            if L.buf[pos+1] notin {'0'..'9', 'a'..'f', 'A'..'F'}: 
              lexMessage(L, errInvalidToken, "_")
              break
            inc(pos)
          of '0'..'9': 
            xi = `shl`(xi, 4) or (ord(L.buf[pos]) - ord('0'))
            inc(pos)
          of 'a'..'f': 
            xi = `shl`(xi, 4) or (ord(L.buf[pos]) - ord('a') + 10)
            inc(pos)
          of 'A'..'F': 
            xi = `shl`(xi, 4) or (ord(L.buf[pos]) - ord('A') + 10)
            inc(pos)
          else: break 
      else: InternalError(getLineInfo(L), "getNumber")
      # store the accumulated bits according to the kind selected by the
      # type suffix
      case result.tokType
      of tkIntLit, tkInt64Lit: result.iNumber = xi
      of tkInt8Lit: result.iNumber = biggestInt(int8(toU8(int(xi))))
      of tkInt16Lit: result.iNumber = biggestInt(toU16(int(xi)))
      of tkInt32Lit: result.iNumber = biggestInt(toU32(xi))
      of tkFloat32Lit: 
        # the bits of `xi` are reinterpreted as a float, not converted
        result.fNumber = (cast[PFloat32](addr(xi)))[] 
        # note: this code is endian neutral!
        # XXX: Test this on big endian machine!
      of tkFloat64Lit: result.fNumber = (cast[PFloat64](addr(xi)))[] 
      else: InternalError(getLineInfo(L), "getNumber")
    elif isFloatLiteral(result.literal) or (result.tokType == tkFloat32Lit) or
        (result.tokType == tkFloat64Lit): 
      result.fnumber = parseFloat(result.literal)
      if result.tokType == tkIntLit: result.tokType = tkFloatLit
    else: 
      result.iNumber = ParseBiggestInt(result.literal)
      # a literal without suffix that does not fit into 32 bits becomes an
      # int64 literal; with an explicit smaller suffix it is an error
      if (result.iNumber < low(int32)) or (result.iNumber > high(int32)): 
        if result.tokType == tkIntLit: 
          result.tokType = tkInt64Lit
        elif result.tokType != tkInt64Lit: 
          lexMessage(L, errInvalidNumber, result.literal)
  except EInvalidValue: lexMessage(L, errInvalidNumber, result.literal)
  except EOverflow: lexMessage(L, errNumberOutOfRange, result.literal)
  except EOutOfRange: lexMessage(L, errNumberOutOfRange, result.literal)
  # continue lexing after the literal and its suffix
  L.bufpos = endpos

proc handleHexChar(L: var TLexer, xi: var int) = 
  ## If the current character is a hexadecimal digit, shifts `xi` left by
  ## four bits, ors the digit's value into it and advances the lexer by
  ## one character. Any other character leaves `xi` and the position
  ## untouched.
  var c = L.buf[L.bufpos]
  var digit: int
  case c
  of '0'..'9': digit = ord(c) - ord('0')
  of 'a'..'f': digit = ord(c) - ord('a') + 10
  of 'A'..'F': digit = ord(c) - ord('A') + 10
  else: return
  xi = (xi shl 4) or digit
  inc(L.bufpos)

proc handleDecChars(L: var TLexer, xi: var int) = 
  ## Consumes the run of decimal digits at the current position and
  ## accumulates their value into `xi` (`xi * 10 + digit` per character).
  while true: 
    var c = L.buf[L.bufpos]
    if c notin {'0'..'9'}: break
    xi = (xi * 10) + (ord(c) - ord('0'))
    inc(L.bufpos)

proc getEscapedChar(L: var TLexer, tok: var TToken) = 
  ## Translates the escape sequence at the current position (the lexer
  ## points at the backslash) and appends the resulting character(s) to
  ## `tok.literal`. Escape letters are accepted in either case. An unknown
  ## escape, or a decimal escape above 255, is reported with
  ## `errInvalidCharacterConstant`; for an unknown escape only the backslash
  ## is consumed.
  inc(L.bufpos)               # skip '\'
  case L.buf[L.bufpos]
  of 'n', 'N': 
    # `tnl` is not allowed inside a character literal
    if tok.toktype == tkCharLit: lexMessage(L, errNnotAllowedInCharacter)
    add(tok.literal, tnl)
    Inc(L.bufpos)
  of 'r', 'R', 'c', 'C': 
    add(tok.literal, CR)
    Inc(L.bufpos)
  of 'l', 'L': 
    add(tok.literal, LF)
    Inc(L.bufpos)
  of 'f', 'F': 
    add(tok.literal, FF)
    inc(L.bufpos)
  of 'e', 'E': 
    add(tok.literal, ESC)
    Inc(L.bufpos)
  of 'a', 'A': 
    add(tok.literal, BEL)
    Inc(L.bufpos)
  of 'b', 'B': 
    add(tok.literal, BACKSPACE)
    Inc(L.bufpos)
  of 'v', 'V': 
    add(tok.literal, VT)
    Inc(L.bufpos)
  of 't', 'T': 
    add(tok.literal, Tabulator)
    Inc(L.bufpos)
  of '\'', '\"': 
    add(tok.literal, L.buf[L.bufpos])
    Inc(L.bufpos)
  of '\\': 
    add(tok.literal, '\\')
    Inc(L.bufpos)
  of 'x', 'X': 
    # up to two hexadecimal digits; handleHexChar ignores non-hex characters
    inc(L.bufpos)
    var xi = 0
    handleHexChar(L, xi)
    handleHexChar(L, xi)
    add(tok.literal, Chr(xi))
  of '0'..'9': 
    # the digits are read as a decimal number; a leading zero followed by
    # another digit triggers the octal escape warning
    if matchTwoChars(L, '0', {'0'..'9'}): 
      lexMessage(L, warnOctalEscape)
    var xi = 0
    handleDecChars(L, xi)
    if (xi <= 255): add(tok.literal, Chr(xi))
    else: lexMessage(L, errInvalidCharacterConstant)
  else: lexMessage(L, errInvalidCharacterConstant)
  
proc HandleCRLF(L: var TLexer, pos: int): int = 
  ## Handles a line end at buffer position `pos`. For CR or LF a
  ## `hintLineTooLong` is emitted when the column exceeds `MaxLineLength`,
  ## and the position returned by lexbase's `HandleCR`/`HandleLF` is the
  ## result. For any other character `pos` is returned unchanged.
  var c = L.buf[pos]
  if (c != CR) and (c != LF): return pos
  if getColNumber(L, pos) > MaxLineLength: 
    lexMessagePos(L, hintLineTooLong, pos)
  if c == CR: result = lexbase.HandleCR(L, pos)
  else: result = lexbase.HandleLF(L, pos)
  
proc getString(L: var TLexer, tok: var TToken, rawMode: bool) = 
  ## Scans a string literal; the lexer points at the opening quote. The
  ## contents are appended to `tok.literal` and `tok.tokType` is set.
  ##
  ## * `"""..."""` gives `tkTripleStrLit`: no escape processing, a newline
  ##   directly after the opening quotes is skipped, line ends are stored
  ##   as `tnl`. Reaching the end of the file first is reported with
  ##   `errClosingTripleQuoteExpected` at the line where the literal began.
  ## * Otherwise the literal is `tkRStrLit` (if `rawMode`) or `tkStrLit`.
  ##   In raw mode `""` stands for one quote and backslashes are taken
  ##   literally; otherwise escapes go through `getEscapedChar`. A line end
  ##   or end of file before the closing quote is reported with
  ##   `errClosingQuoteExpected`.
  ##
  ## On return the lexer is positioned after the consumed text.
  var pos = L.bufPos + 1          # skip "
  var buf = L.buf                 # put `buf` in a register
  var line = L.linenumber         # save linenumber for better error message
  if buf[pos] == '\"' and buf[pos+1] == '\"': 
    tok.tokType = tkTripleStrLit # long string literal:
    inc(pos, 2)               # skip ""
    # skip leading newline:
    pos = HandleCRLF(L, pos)
    buf = L.buf
    while true: 
      case buf[pos]
      of '\"': 
        # only the last three quotes of a longer run close the literal
        if buf[pos+1] == '\"' and buf[pos+2] == '\"' and
            buf[pos+3] != '\"': 
          L.bufpos = pos + 3 # skip the three """
          break 
        add(tok.literal, '\"')
        Inc(pos)
      of CR, LF: 
        pos = HandleCRLF(L, pos)
        buf = L.buf             # the buffer may have been refilled
        add(tok.literal, tnl)
      of lexbase.EndOfFile: 
        # report the error at the line where the literal started
        var line2 = L.linenumber
        L.LineNumber = line
        lexMessagePos(L, errClosingTripleQuoteExpected, L.lineStart)
        L.LineNumber = line2
        # BUGFIX: publish the scan position; otherwise the lexer would
        # continue from a stale position after the error
        L.bufpos = pos
        break 
      else: 
        add(tok.literal, buf[pos])
        Inc(pos)
  else: 
    # ordinary string literal
    if rawMode: tok.tokType = tkRStrLit
    else: tok.tokType = tkStrLit
    while true: 
      var c = buf[pos]
      if c == '\"': 
        if rawMode and buf[pos+1] == '\"':
          # "" inside a raw string denotes a single quote character
          inc(pos, 2)
          add(tok.literal, '"')
        else:
          inc(pos) # skip '"'
          break
      elif c in {CR, LF, lexbase.EndOfFile}: 
        lexMessage(L, errClosingQuoteExpected)
        break 
      elif (c == '\\') and not rawMode: 
        # getEscapedChar works on L.bufpos, so synchronize around the call
        L.bufPos = pos
        getEscapedChar(L, tok)
        pos = L.bufPos
      else: 
        add(tok.literal, c)
        Inc(pos)
    L.bufpos = pos

proc getCharacter(L: var TLexer, tok: var TToken) = 
  ## Scans a character literal; the lexer points at the opening quote.
  ## A backslash starts an escape sequence handled by `getEscapedChar`; a
  ## control character or an immediately following quote is reported with
  ## `errInvalidCharacterConstant`; any other character becomes the
  ## literal. A missing closing quote is reported with
  ## `errMissingFinalQuote`; one more character is skipped either way.
  Inc(L.bufpos)               # skip '
  var c = L.buf[L.bufpos]
  if c == '\\': 
    getEscapedChar(L, tok)
  elif (c < ' ') or (c == '\''): 
    lexMessage(L, errInvalidCharacterConstant)
  else: 
    tok.literal = $c
    Inc(L.bufpos)
  if L.buf[L.bufpos] != '\'': lexMessage(L, errMissingFinalQuote)
  inc(L.bufpos)               # skip '
  
proc getSymbol(L: var TLexer, tok: var TToken) = 
  ## Scans an identifier or keyword at the current position. The hash is
  ## computed over the lower-cased characters with underscores left out.
  ## An underscore that is not followed by a symbol character is reported
  ## as an invalid token and ends the identifier in front of it. The token
  ## becomes a keyword kind if the identifier's id falls into the keyword
  ## range, `tkSymbol` otherwise.
  var hash: THash = 0
  var pos = L.bufpos
  var buf = L.buf
  while true: 
    var c = buf[pos]
    if c in {'A'..'Z'}: 
      c = chr(ord(c) + (ord('a') - ord('A'))) # toLower()
      hash = hash !& ord(c)
    elif c == '_': 
      if buf[pos+1] notin SymChars: 
        lexMessage(L, errInvalidToken, "_")
        break
    elif c in {'a'..'z', '0'..'9', '\x80'..'\xFF'}: 
      hash = hash !& ord(c)
    else: 
      break 
    Inc(pos)
  hash = !$hash
  tok.ident = getIdent(addr(L.buf[L.bufpos]), pos - L.bufpos, hash)
  L.bufpos = pos
  var id = tok.ident.id
  if (id >= ord(tokKeywordLow) - ord(tkSymbol)) and
      (id <= ord(tokKeywordHigh) - ord(tkSymbol)): 
    tok.tokType = TTokType(id + ord(tkSymbol))
  else: 
    tok.tokType = tkSymbol
  
proc endOperator(L: var TLexer, tok: var TToken, pos: int,
                 hash: THash) {.inline.} = 
  ## Finishes an operator token that spans from the current position up
  ## to, but not including, `pos`; `hash` is the unfinished hash of its
  ## characters. Identifiers whose id lies in `oprLow..oprHigh` are mapped
  ## to the token kinds starting at `tkColon`, all others become `tkOpr`.
  ## The lexer is advanced to `pos`.
  tok.ident = getIdent(addr(L.buf[L.bufpos]), pos - L.bufpos, !$hash)
  var id = tok.ident.id
  if (id >= oprLow) and (id <= oprHigh): 
    tok.tokType = TTokType(id - oprLow + ord(tkColon))
  else: 
    tok.tokType = tkOpr
  L.bufpos = pos
  
proc getOperator(L: var TLexer, tok: var TToken) = 
  ## Scans the longest run of `OpChars` at the current position, hashing
  ## the characters on the way, and lets `endOperator` build the token.
  var pos = L.bufpos
  var buf = L.buf
  var h: THash = 0
  while buf[pos] in OpChars: 
    h = h !& Ord(buf[pos])
    Inc(pos)
  endOperator(L, tok, pos, h)

proc handleIndentation(L: var TLexer, tok: var TToken, indent: int) = 
  ## Turns the indentation `indent` of a new line into an indentation
  ## token by comparing it with the top of the indentation stack:
  ## deeper gives `tkInd`, equal gives `tkSad`, shallower gives `tkDed`.
  ## For a dedent the stack is searched downwards for `indent` and
  ## `L.dedent` is raised by the number of levels skipped minus one.
  ## If `indent` is not on the stack, `errInvalidIndentation` is reported
  ## and the token becomes `tkSad`.
  tok.indent = indent
  var top = high(L.indentStack)
  if indent == L.indentStack[top]: 
    tok.tokType = tkSad
  elif indent > L.indentStack[top]: 
    tok.tokType = tkInd
  else: 
    # check we have the indentation somewhere in the stack:
    var i = top
    while (i >= 0) and (indent != L.indentStack[i]): 
      dec(i)
      inc(L.dedent)
    dec(L.dedent)
    if i >= 0: 
      tok.tokType = tkDed
    else: 
      tok.tokType = tkSad     # for the parser it is better as SAD
      lexMessage(L, errInvalidIndentation)

proc scanComment(L: var TLexer, tok: var TToken) = 
  ## Scans a comment starting at the current position into `tok.literal`
  ## and sets the kind to `tkComment`. The comment continues on the next
  ## line if that line, after leading spaces only, has a `#` in the same
  ## column as the first one; the lines are joined with "\n". When the
  ## comment ends in front of a line that starts with a visible character,
  ## that line's indentation is stored in `L.indentAhead` and `L.dedent`
  ## is incremented, so that the next `rawGetTok` call produces the
  ## indentation token.
  var pos = L.bufpos
  var buf = L.buf 
  var startCol = getColNumber(L, pos)
  tok.tokType = tkComment
  while true: 
    # copy the rest of the line
    while buf[pos] notin {CR, LF, lexbase.EndOfFile}: 
      add(tok.literal, buf[pos])
      inc(pos)
    pos = handleCRLF(L, pos)
    buf = L.buf
    # measure the indentation of the following line
    var indent = 0
    while buf[pos] == ' ': 
      inc(pos)
      inc(indent)
    if (buf[pos] != '#') or (startCol != indent): 
      if buf[pos] > ' ': 
        L.indentAhead = indent
        inc(L.dedent)
      break 
    add(tok.literal, "\n")
  L.bufpos = pos

proc skip(L: var TLexer, tok: var TToken) = 
  ## Skips spaces and line ends in front of the next token. Tabulators
  ## are reported with `errTabulatorsAreNotAllowed` and skipped as well.
  ## After a line end the leading spaces of the new line are counted; if
  ## the line starts with a visible character, `handleIndentation` turns
  ## the count into an indentation token in `tok` and skipping stops.
  var pos = L.bufpos
  var buf = L.buf
  while true: 
    var c = buf[pos]
    if c == ' ': 
      Inc(pos)
    elif c == Tabulator: 
      lexMessagePos(L, errTabulatorsAreNotAllowed, pos)
      inc(pos)                # BUGFIX
    elif c in {CR, LF}: 
      pos = HandleCRLF(L, pos)
      buf = L.buf
      var indent = 0
      while buf[pos] == ' ': 
        Inc(pos)
        Inc(indent)
      if buf[pos] > ' ': 
        handleIndentation(L, tok, indent)
        break 
    else: 
      break                   # EndOfFile also leaves the loop
  L.bufpos = pos

proc rawGetTok(L: var TLexer, tok: var TToken) = 
  ## Reads the next token into `tok` and advances the lexer past it.
  ##
  ## Pending indentation tokens come first: while `L.dedent > 0` each call
  ## returns either the token for the indentation recorded in
  ## `L.indentAhead` (left behind by `scanComment`) or a `tkDed`.
  ## Otherwise whitespace is skipped, which may itself yield an indentation
  ## token, and the token is chosen by its first character.
  fillToken(tok)
  if L.dedent > 0:
    dec(L.dedent)
    if L.indentAhead >= 0: 
      handleIndentation(L, tok, L.indentAhead)
      L.indentAhead = - 1
    else:
      tok.tokType = tkDed
    return
  skip(L, tok)
  # skip produced an indentation token (tkInd, tkSad or tkDed); return that:
  if tok.toktype != tkInvalid: return
  var c = L.buf[L.bufpos]
  # 'r', 'R' and 'l' can start an identifier too, but need the special
  # treatment in the case statement below
  if c in SymStartChars - {'r', 'R', 'l'}: 
    getSymbol(L, tok)
  elif c in {'0'..'9'}: 
    tok = getNumber(L)
  else: 
    case c
    of '#': 
      scanComment(L, tok)
    of '*':
      # '*:' is unfortunately a special case, because it is two tokens in 
      # 'var v*: int'.
      if L.buf[L.bufpos+1] == ':' and L.buf[L.bufpos+2] notin OpChars:
        var h = 0 !& ord('*')
        endOperator(L, tok, L.bufpos+1, h)
      else:
        getOperator(L, tok)
    of ',':
      tok.toktype = tkComma
      Inc(L.bufpos)
    of 'l': 
      # if we parsed exactly one character and its a small L (l), this
      # is treated as a warning because it may be confused with the number 1
      if not (L.buf[L.bufpos + 1] in (SymChars + {'_'})): 
        lexMessage(L, warnSmallLshouldNotBeUsed)
      getSymbol(L, tok)
    of 'r', 'R': 
      # r"..." is a raw string literal; otherwise this is an identifier
      if L.buf[L.bufPos + 1] == '\"': 
        Inc(L.bufPos)
        getString(L, tok, true)
      else: 
        getSymbol(L, tok)
    of '(': 
      # "(." is a single token unless the dot belongs to a ".."
      Inc(L.bufpos)
      if (L.buf[L.bufPos] == '.') and (L.buf[L.bufPos + 1] != '.'): 
        tok.toktype = tkParDotLe
        Inc(L.bufpos)
      else: 
        tok.toktype = tkParLe
    of ')': 
      tok.toktype = tkParRi
      Inc(L.bufpos)
    of '[': 
      Inc(L.bufpos)
      if (L.buf[L.bufPos] == '.') and (L.buf[L.bufPos + 1] != '.'): 
        tok.toktype = tkBracketDotLe
        Inc(L.bufpos)
      else: 
        tok.toktype = tkBracketLe
    of ']': 
      tok.toktype = tkBracketRi
      Inc(L.bufpos)
    of '.': 
      # ".]", ".}" and ".)" are closing tokens; any other dot starts an
      # operator
      if L.buf[L.bufPos + 1] == ']': 
        tok.tokType = tkBracketDotRi
        Inc(L.bufpos, 2)
      elif L.buf[L.bufPos + 1] == '}': 
        tok.tokType = tkCurlyDotRi
        Inc(L.bufpos, 2)
      elif L.buf[L.bufPos + 1] == ')': 
        tok.tokType = tkParDotRi
        Inc(L.bufpos, 2)
      else: 
        getOperator(L, tok)
    of '{': 
      Inc(L.bufpos)
      if (L.buf[L.bufPos] == '.') and (L.buf[L.bufPos+1] != '.'): 
        tok.toktype = tkCurlyDotLe
        Inc(L.bufpos)
      else: 
        tok.toktype = tkCurlyLe
    of '}': 
      tok.toktype = tkCurlyRi
      Inc(L.bufpos)
    of ';': 
      tok.toktype = tkSemiColon
      Inc(L.bufpos)
    of '`': 
      tok.tokType = tkAccent
      Inc(L.bufpos)
    of '\"': 
      # check for extended raw string literal:
      # (a quote that directly follows a symbol character)
      var rawMode = L.bufpos > 0 and L.buf[L.bufpos-1] in SymChars
      getString(L, tok, rawMode)
      if rawMode:
        # tkRStrLit -> tkGStrLit
        # tkTripleStrLit -> tkGTripleStrLit
        inc(tok.tokType, 2)
    of '\'':
      # the kind has to be set before the call, because getEscapedChar
      # checks for tkCharLit
      tok.tokType = tkCharLit
      getCharacter(L, tok)
      tok.tokType = tkCharLit
    else:
      if c in OpChars: 
        getOperator(L, tok)
      elif c == lexbase.EndOfFile:
        tok.toktype = tkEof
      else:
        # unknown character: report it and skip it as a tkInvalid token
        tok.literal = c & ""
        tok.tokType = tkInvalid
        lexMessage(L, errInvalidToken, c & " (\\" & $(ord(c)) & ')')
        Inc(L.bufpos)
  
# module initialization: the identifier for the empty string serves as the
# placeholder that initToken and fillToken assign to tokens
dummyIdent = getIdent("")
#
#
#           The Nimrod Compiler
#        (c) Copyright 2011 Andreas Rumpf
#
#    See the file "copying.txt", included in this
#    distribution, for details about the copyright.
#

# This scanner is handwritten for efficiency. I used an elegant buffering
# scheme which I have not seen anywhere else:
# We guarantee that a whole line is in the buffer. Thus only when scanning
# the \n or \r character do we have to check whether we need to read in the
# next chunk. (\n or \r already need special handling for incrementing the
# line counter; choosing both \n and \r allows the scanner to properly read
# Unix, DOS or Macintosh text files, even when it is not the native format.)

import 
  hashes, options, msgs, strutils, platform, idents, lexbase, llstream, 
  wordrecg

# Lexer-wide limits and character classes:
# - MaxLineLength: the maximum line length before a message is emitted.
# - numChars: decimal digits plus ASCII letters.
# - SymChars: characters allowed inside an identifier (no underscore).
# - SymStartChars: characters that may start an identifier.
# - OpChars: characters that operators are made of.
const 
  MaxLineLength* = 80         # lines longer than this lead to a warning
  numChars*: TCharSet = {'0'..'9', 'a'..'z', 'A'..'Z'}
  SymChars*: TCharSet = {'a'..'z', 'A'..'Z', '0'..'9', '\x80'..'\xFF'}
  SymStartChars*: TCharSet = {'a'..'z', 'A'..'Z', '\x80'..'\xFF'}
  OpChars*: TCharSet = {'+', '-', '*', '/', '\\', '<', '>', '!', '?', '^', '.', 
    '|', '=', '%', '&', '$', '@', '~', ':', '\x80'..'\xFF'}

# All token kinds produced by the lexer. The declaration order matters: the
# keywords form the contiguous range succ(tkSymbol)..pred(tkIntLit), and
# TokTypeToStr must list one string per kind in exactly this order.
type 
  TTokType* = enum 
    tkInvalid, tkEof,         # order is important here!
    tkSymbol, # keywords:
    tkAddr, tkAnd, tkAs, tkAsm, tkAtomic, 
    tkBind, tkBlock, tkBreak, tkCase, tkCast, 
    tkConst, tkContinue, tkConverter, tkDiscard, tkDistinct, tkDiv, tkElif, 
    tkElse, tkEnd, tkEnum, tkExcept, tkFinally, tkFor, tkFrom, tkGeneric, tkIf, 
    tkImport, tkIn, tkInclude, tkIs, tkIsnot, tkIterator,
    tkLambda, tkLet,
    tkMacro, tkMethod, tkMod, tkNil, tkNot, tkNotin, tkObject, tkOf, tkOr, 
    tkOut, tkProc, tkPtr, tkRaise, tkRef, tkReturn, tkShl, tkShr, tkTemplate, 
    tkTry, tkTuple, tkType, tkVar, tkWhen, tkWhile, tkWith, tkWithout, tkXor,
    tkYield, # end of keywords
    tkIntLit, tkInt8Lit, tkInt16Lit, tkInt32Lit, tkInt64Lit, tkFloatLit, 
    tkFloat32Lit, tkFloat64Lit, tkStrLit, tkRStrLit, tkTripleStrLit, 
    tkGStrLit, tkGTripleStrLit, tkCharLit, tkParLe, tkParRi, tkBracketLe, 
    tkBracketRi, tkCurlyLe, tkCurlyRi, 
    tkBracketDotLe, tkBracketDotRi, # [. and  .]
    tkCurlyDotLe, tkCurlyDotRi, # {.  and  .}
    tkParDotLe, tkParDotRi,   # (. and .)
    tkComma, tkSemiColon, tkColon, tkColonColon, tkEquals, tkDot, tkDotDot, 
    tkOpr, tkComment, tkAccent, tkInd, tkSad, 
    tkDed, # pseudo token types used by the source renderers:
    tkSpaces, tkInfixOpr, tkPrefixOpr, tkPostfixOpr
  TTokTypes* = set[TTokType]

# tokKeywordLow/tokKeywordHigh delimit the keyword sub-range of TTokType.
# TokTypeToStr gives a printable name for every token kind (the keyword or
# punctuation text where one exists, otherwise the enum name or a bracketed
# description); its entries must stay in the same order as TTokType.
const 
  tokKeywordLow* = succ(tkSymbol)
  tokKeywordHigh* = pred(tkIntLit)
  TokTypeToStr*: array[TTokType, string] = ["tkInvalid", "[EOF]", 
    "tkSymbol",
    "addr", "and", "as", "asm", "atomic", 
    "bind", "block", "break", "case", "cast", 
    "const", "continue", "converter", "discard", "distinct", "div", "elif", 
    "else", "end", "enum", "except", "finally", "for", "from", "generic", "if", 
    "import", "in", "include", "is", "isnot", "iterator",
    "lambda", "let", 
    "macro", "method", "mod", "nil", "not", "notin", "object", "of", "or", 
    "out", "proc", "ptr", "raise", "ref", "return", "shl", "shr", "template", 
    "try", "tuple", "type", "var", "when", "while", "with", "without", "xor",
    "yield",
    "tkIntLit", "tkInt8Lit", "tkInt16Lit", "tkInt32Lit", "tkInt64Lit", 
    "tkFloatLit", "tkFloat32Lit", "tkFloat64Lit", "tkStrLit", "tkRStrLit", 
    "tkTripleStrLit", "tkGStrLit", "tkGTripleStrLit", "tkCharLit", "(", 
    ")", "[", "]", "{", "}", "[.", ".]", "{.", ".}", "(.", ".)", ",", ";", 
    ":", "::",
    "=", ".", "..", "tkOpr", "tkComment", "`", "[new indentation]", 
    "[same indentation]", "[dedentation]", "tkSpaces", "tkInfixOpr", 
    "tkPrefixOpr", "tkPostfixOpr"]

# Data types of the lexer: the numeric base of a literal, the token record,
# and the lexer state that extends lexbase's TBaseLexer.
type 
  TNumericalBase* = enum 
    base10,                   # base10 is listed as the first element,
                              # so that it is the correct default value
    base2, base8, base16
  TToken* = object            # a Nimrod token
    tokType*: TTokType        # the type of the token
    indent*: int              # the indentation; only meaningful for the
                              # indentation tokens tkInd, tkSad and tkDed
    ident*: PIdent            # the parsed identifier
    iNumber*: BiggestInt      # the parsed integer literal
    fNumber*: BiggestFloat    # the parsed floating point literal
    base*: TNumericalBase     # the numerical base; only valid for int
                              # or float literals
    literal*: string          # the parsed (string) literal; and
                              # documentation comments are here too
  
  TLexer* = object of TBaseLexer
    filename*: string
    indentStack*: seq[int]    # the indentation stack
    dedent*: int              # counter for DED token generation
    indentAhead*: int         # if >= 0 an indentation has already been read
                              # (openLexer sets -1); this is needed because
                              # scanning comments needs so much look-ahead
  

# closeLexer adds the final line number of every closed lexer to this counter
var gLinesCompiled*: int  # all lines that have been compiled

# Forward declarations of the exported lexer interface; the implementations
# follow below.
proc pushInd*(L: var TLexer, indent: int)
  # pushes a new, deeper indentation level onto the indentation stack

proc popInd*(L: var TLexer)
  # removes the top entry of the indentation stack
proc isKeyword*(kind: TTokType): bool
  # true if `kind` lies in tokKeywordLow..tokKeywordHigh
proc openLexer*(lex: var TLexer, filename: string, inputstream: PLLStream)
  # prepares `lex` for reading `inputstream`
proc rawGetTok*(L: var TLexer, tok: var TToken)
  # reads in the next token into tok and skips it
proc getColumn*(L: TLexer): int
  # column of the current buffer position
proc getLineInfo*(L: TLexer): TLineInfo
  # file/line/column information for the current buffer position
proc closeLexer*(lex: var TLexer)
  # updates gLinesCompiled and closes the underlying base lexer
proc PrintTok*(tok: TToken)
  # writes the token kind and its text to stdout
proc tokToStr*(tok: TToken): string
  # textual representation of a token

proc lexMessage*(L: TLexer, msg: TMsgKind, arg = "")
  # reports `msg` at the current buffer position

proc isKeyword(kind: TTokType): bool = 
  ## Tells whether `kind` is one of the keyword token kinds, i.e. lies in
  ## the inclusive range `tokKeywordLow..tokKeywordHigh`.
  result = kind in {tokKeywordLow..tokKeywordHigh}

proc isNimrodIdentifier*(s: string): bool =
  ## Returns true if `s` is spelled like an identifier: it starts with a
  ## character from `SymStartChars`, continues with characters from
  ## `SymChars`, and every underscore is directly followed by a character
  ## from `SymChars` (so no doubled or trailing underscores).
  ## The empty string is not an identifier.
  if s.len == 0 or s[0] notin SymStartChars: return false
  var i = 1
  while i < s.len:
    if s[i] == '_': 
      inc(i)
      # a trailing underscore is invalid; reject it without indexing past
      # the end of `s`
      if i >= s.len: return false
    if s[i] notin SymChars: return false
    inc(i)
  result = true

proc pushInd(L: var TLexer, indent: int) = 
  ## Grows the indentation stack by one slot and stores `indent` in it.
  ## `indent` has to be greater than the previous top of the stack;
  ## otherwise an internal error is reported (the slot is still added).
  var top = high(L.indentStack)
  setlen(L.indentStack, top + 2)
  if indent <= L.indentStack[top]: 
    InternalError("pushInd")
  else: 
    L.indentstack[top + 1] = indent
  
proc popInd(L: var TLexer) = 
  ## Drops the top entry of the indentation stack.
  setlen(L.indentStack, len(L.indentStack) - 1)

proc findIdent(L: TLexer, indent: int): bool = 
  ## Returns true if `indent` occurs anywhere in the indentation stack.
  ## The stack is searched from the top downwards.
  var i = len(L.indentStack) - 1
  while i >= 0: 
    if L.indentStack[i] == indent: return true
    dec(i)

proc tokToStr*(tok: TToken): string = 
  ## Returns the text of `tok`: the number for numeric literals, the stored
  ## literal for invalid tokens, string/char literals and comments, the
  ## fixed `tokTypeToStr` entry for punctuation/EOF/indentation tokens, and
  ## the identifier text for everything else. A token of the last group
  ## without an identifier is reported as an internal error.
  case tok.tokType
  of tkInvalid, tkStrLit..tkCharLit, tkComment: 
    result = tok.literal
  of tkIntLit..tkInt64Lit: 
    result = $tok.iNumber
  of tkFloatLit..tkFloat64Lit: 
    result = $tok.fNumber
  of tkEof, tkParLe..tkColon, tkAccent, tkInd, tkSad, tkDed: 
    result = tokTypeToStr[tok.tokType]
  else: 
    if tok.ident == nil: 
      InternalError("tokToStr")
      result = ""
    else: 
      result = tok.ident.s
  
proc prettyTok*(tok: TToken): string =
  ## Like `tokToStr`, except that keywords are rendered as
  ## "keyword <name>".
  if not IsKeyword(tok.tokType): return tokToStr(tok)
  result = "keyword " & tok.ident.s
  
proc PrintTok*(tok: TToken) = 
  ## Debugging aid: prints the token kind's name, a space and the token
  ## text as one line on stdout.
  stdout.write(TokTypeToStr[tok.tokType])
  stdout.write(" ")
  stdout.writeln(tokToStr(tok))

# Placeholder identifier stored in every token by `initToken` and
# `fillToken`; it is assigned `getIdent("")` further down in this module.
var dummyIdent: PIdent

proc initToken*(L: var TToken) = 
  # Puts a token into its initial state: kind `tkInvalid`, zeroed numbers
  # and indentation, base 10, a fresh empty literal and the placeholder
  # identifier.
  L.TokType = tkInvalid
  L.ident = dummyIdent
  L.literal = ""
  L.base = base10
  L.iNumber = 0
  L.fNumber = 0.0
  L.Indent = 0

proc fillToken(L: var TToken) = 
  # Resets a token for reuse. Same field values as `initToken` sets, except
  # that the existing literal string is truncated to length 0 instead of
  # being replaced by a new string.
  L.TokType = tkInvalid
  L.ident = dummyIdent
  setLen(L.literal, 0)
  L.base = base10
  L.iNumber = 0
  L.fNumber = 0.0
  L.Indent = 0
  
proc openLexer(lex: var TLexer, filename: string, inputstream: PLLStream) = 
  # Opens the underlying base lexer on `inputstream` and initialises the
  # lexer state: the file name, an indentation stack holding only column 0,
  # no indentation pending (`indentAhead` is -1) and a line number advanced
  # by the stream's `lineOffset`.
  openBaseLexer(lex, inputstream)
  lex.filename = filename
  lex.indentAhead = - 1
  lex.indentStack = @[0]
  inc(lex.Linenumber, inputstream.lineOffset) 

proc closeLexer(lex: var TLexer) = 
  # Adds the lexer's line number to the global `gLinesCompiled` counter and
  # then closes the underlying base lexer.
  gLinesCompiled = gLinesCompiled + lex.LineNumber
  closeBaseLexer(lex)

proc getColumn(L: TLexer): int = 
  # Column number of the current buffer position.
  return getColNumber(L, L.bufPos)

proc getLineInfo(L: TLexer): TLineInfo = 
  # Line information (file name, line number, column) for the current
  # buffer position.
  var col = getColumn(L)
  result = newLineInfo(L.filename, L.linenumber, col)

proc lexMessage(L: TLexer, msg: TMsgKind, arg = "") = 
  # Emits the message `msg` with argument `arg`, located at the lexer's
  # current position.
  var info = getLineInfo(L)
  msgs.Message(info, msg, arg)

proc lexMessagePos(L: var TLexer, msg: TMsgKind, pos: int, arg = "") = 
  # Emits the message `msg` for the buffer position `pos` on the current
  # line; the column is `pos` relative to `L.lineStart`.
  msgs.Message(newLineInfo(L.filename, L.linenumber, pos - L.lineStart),
               msg, arg)

proc matchUnderscoreChars(L: var TLexer, tok: var TToken, chars: TCharSet) = 
  # Appends to `tok.literal` the characters from `chars` found at the
  # current position, together with single underscores that separate them.
  # An underscore that is not followed by a character from `chars` is
  # reported as an invalid token and ends the scan in front of it.
  # `L.bufPos` is left at the first character that was not consumed.
  var buf = L.buf
  var cur = L.bufpos
  while buf[cur] in chars: 
    add(tok.literal, buf[cur])
    inc(cur)
    if buf[cur] == '_': 
      if buf[cur + 1] notin chars: 
        lexMessage(L, errInvalidToken, "_")
        break
      add(tok.literal, '_')
      inc(cur)
  L.bufPos = cur

proc matchTwoChars(L: TLexer, first: Char, second: TCharSet): bool = 
  # True iff the current character equals `first` and the one after it is
  # contained in `second`.
  if L.buf[L.bufpos] != first: return false
  result = L.buf[L.bufpos + 1] in second

proc isFloatLiteral(s: string): bool = 
  # True iff `s` contains a '.', an 'e' or an 'E' anywhere.
  for ch in items(s): 
    if (ch == '.') or (ch == 'e') or (ch == 'E'): return true
  result = false

proc GetNumber(L: var TLexer): TToken = 
  # Scans a numeric literal that starts at `L.bufpos` and returns its token.
  # Pass 1 collects the literal's text into `result.literal` and inspects an
  # optional type suffix ('f32, 'f64, 'i8, 'i16, 'i32, 'i64) that selects the
  # token type. Pass 2 computes the value: literals with a 0x/0b/0o/0c
  # prefix are converted digit by digit, all others are parsed from the
  # collected text. Malformed literals are reported via `lexMessage`.
  # On return `L.bufpos` is positioned behind the literal and its suffix.
  var 
    pos, endpos: int
    xi: biggestInt
  # get the base:
  result.tokType = tkIntLit   # int literal until we know better
  result.literal = ""
  result.base = base10        # BUGFIX
  pos = L.bufpos     # make sure the literal is correct for error messages:
  matchUnderscoreChars(L, result, {'A'..'Z', 'a'..'z', '0'..'9'})
  # a '.' only belongs to the number if a digit follows it directly:
  if (L.buf[L.bufpos] == '.') and (L.buf[L.bufpos + 1] in {'0'..'9'}): 
    add(result.literal, '.')
    inc(L.bufpos) 
    #matchUnderscoreChars(L, result, ['A'..'Z', 'a'..'z', '0'..'9'])
    matchUnderscoreChars(L, result, {'0'..'9'})
    if L.buf[L.bufpos] in {'e', 'E'}: 
      # the exponent marker is always stored as a lower case 'e'
      add(result.literal, 'e')
      inc(L.bufpos)
      if L.buf[L.bufpos] in {'+', '-'}: 
        add(result.literal, L.buf[L.bufpos])
        inc(L.bufpos)
      matchUnderscoreChars(L, result, {'0'..'9'})
  # `endpos` is where scanning continues afterwards; it is moved behind a
  # type suffix below:
  endpos = L.bufpos
  if L.buf[endpos] == '\'':
    #matchUnderscoreChars(L, result, ['''', 'f', 'F', 'i', 'I', '0'..'9']);
    inc(endpos)
    L.bufpos = pos            # restore position
    case L.buf[endpos]
    of 'f', 'F': 
      inc(endpos)
      if (L.buf[endpos] == '6') and (L.buf[endpos + 1] == '4'): 
        result.tokType = tkFloat64Lit
        inc(endpos, 2)
      elif (L.buf[endpos] == '3') and (L.buf[endpos + 1] == '2'): 
        result.tokType = tkFloat32Lit
        inc(endpos, 2)
      else: 
        lexMessage(L, errInvalidNumber, result.literal & "'f" & L.buf[endpos])
    of 'i', 'I': 
      inc(endpos)
      if (L.buf[endpos] == '6') and (L.buf[endpos + 1] == '4'): 
        result.tokType = tkInt64Lit
        inc(endpos, 2)
      elif (L.buf[endpos] == '3') and (L.buf[endpos + 1] == '2'): 
        result.tokType = tkInt32Lit
        inc(endpos, 2)
      elif (L.buf[endpos] == '1') and (L.buf[endpos + 1] == '6'): 
        result.tokType = tkInt16Lit
        inc(endpos, 2)
      elif (L.buf[endpos] == '8'): 
        result.tokType = tkInt8Lit
        inc(endpos)
      else: 
        lexMessage(L, errInvalidNumber, result.literal & "'i" & L.buf[endpos])
    else: lexMessage(L, errInvalidNumber, result.literal & "'" & L.buf[endpos])
  else:
    L.bufpos = pos            # restore position
  # pass 2: compute the value; conversion failures raised in here are turned
  # into lexer messages by the `except` branches at the end
  try: 
    if (L.buf[pos] == '0') and
        (L.buf[pos + 1] in {'x', 'X', 'b', 'B', 'o', 'O', 'c', 'C'}): 
      inc(pos, 2)
      xi = 0                  # it may be a base prefix
      case L.buf[pos - 1]     # dispatch on the base prefix character:
      of 'b', 'B': 
        result.base = base2
        while true: 
          case L.buf[pos]
          of 'A'..'Z', 'a'..'z', '2'..'9', '.': 
            lexMessage(L, errInvalidNumber, result.literal)
            inc(pos)
          of '_': 
            if L.buf[pos+1] notin {'0'..'1'}: 
              lexMessage(L, errInvalidToken, "_")
              break
            inc(pos)
          of '0', '1': 
            xi = `shl`(xi, 1) or (ord(L.buf[pos]) - ord('0'))
            inc(pos)
          else: break 
      of 'o', 'c', 'C': 
        result.base = base8
        while true: 
          case L.buf[pos]
          of 'A'..'Z', 'a'..'z', '8'..'9', '.': 
            lexMessage(L, errInvalidNumber, result.literal)
            inc(pos)
          of '_': 
            if L.buf[pos+1] notin {'0'..'7'}:
              lexMessage(L, errInvalidToken, "_")
              break
            inc(pos)
          of '0'..'7': 
            xi = `shl`(xi, 3) or (ord(L.buf[pos]) - ord('0'))
            inc(pos)
          else: break 
      of 'O': 
        # an upper case 'O' prefix is rejected as an invalid number
        lexMessage(L, errInvalidNumber, result.literal)
      of 'x', 'X': 
        result.base = base16
        while true: 
          case L.buf[pos]
          of 'G'..'Z', 'g'..'z': 
            lexMessage(L, errInvalidNumber, result.literal)
            inc(pos)
          of '_': 
            if L.buf[pos+1] notin {'0'..'9', 'a'..'f', 'A'..'F'}: 
              lexMessage(L, errInvalidToken, "_")
              break
            inc(pos)
          of '0'..'9': 
            xi = `shl`(xi, 4) or (ord(L.buf[pos]) - ord('0'))
            inc(pos)
          of 'a'..'f': 
            xi = `shl`(xi, 4) or (ord(L.buf[pos]) - ord('a') + 10)
            inc(pos)
          of 'A'..'F': 
            xi = `shl`(xi, 4) or (ord(L.buf[pos]) - ord('A') + 10)
            inc(pos)
          else: break 
      else: InternalError(getLineInfo(L), "getNumber")
      # store the accumulated bits according to the type chosen by the suffix:
      case result.tokType
      of tkIntLit, tkInt64Lit: result.iNumber = xi
      of tkInt8Lit: result.iNumber = biggestInt(int8(toU8(int(xi))))
      of tkInt16Lit: result.iNumber = biggestInt(toU16(int(xi)))
      of tkInt32Lit: result.iNumber = biggestInt(toU32(xi))
      of tkFloat32Lit: 
        result.fNumber = (cast[PFloat32](addr(xi)))[] 
        # note: this code is endian neutral!
        # XXX: Test this on big endian machine!
      of tkFloat64Lit: result.fNumber = (cast[PFloat64](addr(xi)))[] 
      else: InternalError(getLineInfo(L), "getNumber")
    elif isFloatLiteral(result.literal) or (result.tokType == tkFloat32Lit) or
        (result.tokType == tkFloat64Lit): 
      result.fnumber = parseFloat(result.literal)
      # a literal without suffix becomes a plain float literal:
      if result.tokType == tkIntLit: result.tokType = tkFloatLit
    else: 
      result.iNumber = ParseBiggestInt(result.literal)
      # a value outside the int32 range turns an unsuffixed literal into an
      # int64 literal; with any other suffix than 'i64 it is an error:
      if (result.iNumber < low(int32)) or (result.iNumber > high(int32)): 
        if result.tokType == tkIntLit: 
          result.tokType = tkInt64Lit
        elif result.tokType != tkInt64Lit: 
          lexMessage(L, errInvalidNumber, result.literal)
  except EInvalidValue: lexMessage(L, errInvalidNumber, result.literal)
  except EOverflow: lexMessage(L, errNumberOutOfRange, result.literal)
  except EOutOfRange: lexMessage(L, errNumberOutOfRange, result.literal)
  # continue scanning behind the literal (and its type suffix):
  L.bufpos = endpos

proc handleHexChar(L: var TLexer, xi: var int) = 
  # If the current character is a hexadecimal digit, shifts `xi` left by
  # four bits, ors the digit's value into it and advances the position.
  # Any other character leaves both `xi` and the position untouched.
  var c = L.buf[L.bufpos]
  var digit = - 1
  if c in {'0'..'9'}: 
    digit = ord(c) - ord('0')
  elif c in {'a'..'f'}: 
    digit = ord(c) - ord('a') + 10
  elif c in {'A'..'F'}: 
    digit = ord(c) - ord('A') + 10
  if digit >= 0: 
    xi = (xi shl 4) or digit
    inc(L.bufpos)

proc handleDecChars(L: var TLexer, xi: var int) = 
  # Consumes the run of decimal digits at the current position and
  # accumulates its value into `xi`.
  # The accumulation saturates: once `xi` exceeds 255 (the largest value a
  # character can hold) the remaining digits are still consumed but no
  # longer added, so an arbitrarily long digit run cannot overflow `xi`.
  # For values up to 255 the result is exact; a larger input always yields
  # some value above 255, which is all the caller in this module tests for.
  while L.buf[L.bufpos] in {'0'..'9'}: 
    if xi <= 255: 
      xi = (xi * 10) + (ord(L.buf[L.bufpos]) - ord('0'))
    inc(L.bufpos)

proc getEscapedChar(L: var TLexer, tok: var TToken) = 
  # Translates the escape sequence whose backslash is at the current
  # position and appends the resulting character(s) to `tok.literal`.
  # Letter escapes are accepted in either case. `\x` takes up to two
  # hexadecimal digits, a digit starts a decimal character code that must
  # not exceed 255. Anything else is reported as an invalid character
  # constant.
  inc(L.bufpos)               # skip '\'
  var c = L.buf[L.bufpos]
  case c
  of '\'', '\"', '\\': 
    # these escapes stand for themselves
    add(tok.literal, c)
    inc(L.bufpos)
  of 'n', 'N': 
    if tok.toktype == tkCharLit: lexMessage(L, errNnotAllowedInCharacter)
    add(tok.literal, tnl)
    inc(L.bufpos)
  of 't', 'T': 
    add(tok.literal, Tabulator)
    inc(L.bufpos)
  of 'r', 'R', 'c', 'C': 
    add(tok.literal, CR)
    inc(L.bufpos)
  of 'l', 'L': 
    add(tok.literal, LF)
    inc(L.bufpos)
  of 'a', 'A': 
    add(tok.literal, BEL)
    inc(L.bufpos)
  of 'b', 'B': 
    add(tok.literal, BACKSPACE)
    inc(L.bufpos)
  of 'e', 'E': 
    add(tok.literal, ESC)
    inc(L.bufpos)
  of 'f', 'F': 
    add(tok.literal, FF)
    inc(L.bufpos)
  of 'v', 'V': 
    add(tok.literal, VT)
    inc(L.bufpos)
  of 'x', 'X': 
    inc(L.bufpos)
    var code = 0
    handleHexChar(L, code)
    handleHexChar(L, code)
    add(tok.literal, Chr(code))
  of '0'..'9': 
    # a leading zero followed by another digit looks like an octal escape
    if (c == '0') and (L.buf[L.bufpos + 1] in {'0'..'9'}): 
      lexMessage(L, warnOctalEscape)
    var code = 0
    handleDecChars(L, code)
    if code > 255: lexMessage(L, errInvalidCharacterConstant)
    else: add(tok.literal, Chr(code))
  else: lexMessage(L, errInvalidCharacterConstant)
  
proc HandleCRLF(L: var TLexer, pos: int): int = 
  # Processes a line break at buffer position `pos` and returns the position
  # at which scanning continues. If `pos` is not at a CR or LF, `pos` is
  # returned unchanged. A hint is emitted when the line that ends here is
  # longer than `MaxLineLength`.
  var c = L.buf[pos]
  if (c != CR) and (c != LF): return pos
  if getColNumber(L, pos) > MaxLineLength: 
    lexMessagePos(L, hintLineTooLong, pos)
  if c == CR: 
    result = lexbase.HandleCR(L, pos)
  else: 
    result = lexbase.HandleLF(L, pos)
  
proc getString(L: var TLexer, tok: var TToken, rawMode: bool) = 
  # Scans a string literal whose opening quote is at `L.bufPos` and appends
  # its contents to `tok.literal`.
  # Three quotes in a row start a long literal (tkTripleStrLit) that may
  # span several lines. Any other literal has to be closed on the same line.
  # With `rawMode` a backslash is taken verbatim and two quotes in a row
  # stand for one quote character; without it escape sequences are
  # translated by `getEscapedChar`.
  var pos = L.bufPos + 1          # skip "
  var buf = L.buf                 # put `buf` in a register
  var line = L.linenumber         # save linenumber for better error message
  if buf[pos] == '\"' and buf[pos+1] == '\"': 
    tok.tokType = tkTripleStrLit # long string literal:
    inc(pos, 2)               # skip ""
    # skip leading newline:
    pos = HandleCRLF(L, pos)
    buf = L.buf
    while true: 
      case buf[pos]
      of '\"': 
        # the literal ends at three quotes that are not followed by a fourth
        if buf[pos+1] == '\"' and buf[pos+2] == '\"' and
            buf[pos+3] != '\"': 
          L.bufpos = pos + 3 # skip the three """
          break 
        add(tok.literal, '\"')
        Inc(pos)
      of CR, LF: 
        pos = HandleCRLF(L, pos)
        buf = L.buf               # reload `buf` after handling the line break
        add(tok.literal, tnl)
      of lexbase.EndOfFile: 
        # report the error with the line number saved at the start of the
        # literal, then switch back to the current line number
        var line2 = L.linenumber
        L.LineNumber = line
        lexMessagePos(L, errClosingTripleQuoteExpected, L.lineStart)
        L.LineNumber = line2
        break 
      else: 
        add(tok.literal, buf[pos])
        Inc(pos)
  else: 
    # ordinary string literal
    if rawMode: tok.tokType = tkRStrLit
    else: tok.tokType = tkStrLit
    while true: 
      var c = buf[pos]
      if c == '\"': 
        if rawMode and buf[pos+1] == '\"':
          # "" inside a raw literal denotes a single quote character
          inc(pos, 2)
          add(tok.literal, '"')
        else:
          inc(pos) # skip '"'
          break
      elif c in {CR, LF, lexbase.EndOfFile}: 
        lexMessage(L, errClosingQuoteExpected)
        break 
      elif (c == '\\') and not rawMode: 
        # getEscapedChar works on L.bufPos, so synchronise it with `pos`
        L.bufPos = pos
        getEscapedChar(L, tok)
        pos = L.bufPos
      else: 
        add(tok.literal, c)
        Inc(pos)
    L.bufpos = pos

proc getCharacter(L: var TLexer, tok: var TToken) = 
  # Scans a character literal whose opening quote is at the current
  # position and stores the character in `tok.literal`. Control characters
  # below ' ' and an empty literal are invalid; a missing closing quote is
  # reported as well.
  inc(L.bufpos)               # skip '
  var c = L.buf[L.bufpos]
  if c == '\\': 
    getEscapedChar(L, tok)
  elif (c < ' ') or (c == '\''): 
    lexMessage(L, errInvalidCharacterConstant)
  else: 
    tok.literal = $c
    inc(L.bufpos)
  if L.buf[L.bufpos] != '\'': lexMessage(L, errMissingFinalQuote)
  inc(L.bufpos)               # skip '
  
proc getSymbol(L: var TLexer, tok: var TToken) = 
  # Scans an identifier or keyword starting at the current position.
  # The hash is computed over the lower-cased characters with underscores
  # left out. An underscore that is not followed by a character from
  # `SymChars` is reported and ends the symbol in front of it.
  # The token becomes a keyword token if the identifier's id lies in the
  # keyword range, otherwise a tkSymbol.
  var hash: THash = 0
  var cur = L.bufpos
  var buf = L.buf
  while true: 
    var ch = buf[cur]
    if ch in {'A'..'Z'}: 
      ch = chr(ord(ch) - ord('A') + ord('a'))   # toLower()
      hash = hash !& ord(ch)
    elif ch in {'a'..'z', '0'..'9', '\x80'..'\xFF'}: 
      hash = hash !& ord(ch)
    elif ch == '_': 
      if buf[cur + 1] notin SymChars: 
        lexMessage(L, errInvalidToken, "_")
        break
    else: 
      break 
    inc(cur)
  hash = !$hash
  tok.ident = getIdent(addr(L.buf[L.bufpos]), cur - L.bufpos, hash)
  L.bufpos = cur
  var id = tok.ident.id
  if (id >= ord(tokKeywordLow) - ord(tkSymbol)) and
      (id <= ord(tokKeywordHigh) - ord(tkSymbol)): 
    tok.tokType = TTokType(id + ord(tkSymbol))
  else: 
    tok.tokType = tkSymbol
  
proc endOperator(L: var TLexer, tok: var TToken, pos: int,
                 hash: THash) {.inline.} = 
  # Finishes an operator token that spans from `L.bufpos` up to (excluding)
  # `pos`. `hash` is the unfinished hash of the operator's characters.
  # Operators whose identifier id lies within oprLow..oprHigh are mapped to
  # their own token kinds counted from tkColon; all others become tkOpr.
  var finished = !$hash
  tok.ident = getIdent(addr(L.buf[L.bufpos]), pos - L.bufpos, finished)
  var id = tok.ident.id
  if (id >= oprLow) and (id <= oprHigh): 
    tok.tokType = TTokType(id - oprLow + ord(tkColon))
  else: 
    tok.tokType = tkOpr
  L.bufpos = pos
  
proc getOperator(L: var TLexer, tok: var TToken) = 
  # Scans the longest run of characters from `OpChars` at the current
  # position, hashing them on the way, and turns it into an operator token.
  var buf = L.buf
  var stop = L.bufpos
  var h: THash = 0
  while buf[stop] in OpChars: 
    h = h !& Ord(buf[stop])
    inc(stop)
  endOperator(L, tok, stop, h)

proc handleIndentation(L: var TLexer, tok: var TToken, indent: int) = 
  # Classifies the indentation `indent` of a new line against the
  # indentation stack and stores the outcome in `tok`:
  # deeper than the top of the stack gives tkInd, equal gives tkSad and
  # shallower gives tkDed. For a dedent the stack is searched downwards for
  # `indent`; `L.dedent` grows by the number of levels skipped minus one.
  # An indentation that is not on the stack at all is reported as an error
  # and delivered as tkSad.
  tok.indent = indent
  var level = high(L.indentStack)
  var current = L.indentStack[level]
  if indent == current: 
    tok.tokType = tkSad
  elif indent > current: 
    tok.tokType = tkInd
  else: 
    # check we have the indentation somewhere in the stack:
    while true: 
      if (level < 0) or (indent == L.indentStack[level]): break
      dec(level)
      inc(L.dedent)
    dec(L.dedent)
    if level >= 0: 
      tok.tokType = tkDed
    else: 
      tok.tokType = tkSad     # for the parser it is better as SAD
      lexMessage(L, errInvalidIndentation)

proc scanComment(L: var TLexer, tok: var TToken) = 
  # Scans a comment into `tok.literal`. The comment is continued by every
  # following line that, after nothing but spaces, has a '#' in the same
  # column as the first one; the lines are joined with "\n".
  # If the line that ends the comment starts with a visible character, its
  # indentation is remembered in `L.indentAhead` and `L.dedent` is
  # incremented.
  tok.tokType = tkComment
  var pos = L.bufpos
  var buf = L.buf 
  var col = getColNumber(L, pos)
  while true: 
    # take over the rest of the current line
    while buf[pos] notin {CR, LF, lexbase.EndOfFile}: 
      add(tok.literal, buf[pos])
      inc(pos)
    pos = handleCRLF(L, pos)
    buf = L.buf
    # measure the indentation of the next line
    var indent = 0
    while buf[pos] == ' ': 
      inc(indent)
      inc(pos)
    if (buf[pos] != '#') or (col != indent): 
      if buf[pos] > ' ': 
        L.indentAhead = indent
        inc(L.dedent)
      break 
    add(tok.literal, "\n")
  L.bufpos = pos

proc skip(L: var TLexer, tok: var TToken) = 
  # Skips spaces and line breaks in front of the next token. Tabulators are
  # reported as an error and skipped too. When a line break is followed by
  # a line whose first non-space character is greater than ' ', that line's
  # indentation is passed to `handleIndentation`, which turns `tok` into an
  # indentation token.
  var pos = L.bufpos
  var buf = L.buf
  while true: 
    var c = buf[pos]
    if c == ' ': 
      inc(pos)
    elif c == Tabulator: 
      lexMessagePos(L, errTabulatorsAreNotAllowed, pos)
      inc(pos)                # BUGFIX
    elif c in {CR, LF}: 
      pos = HandleCRLF(L, pos)
      buf = L.buf
      var indent = 0
      while buf[pos] == ' ': 
        inc(pos)
        inc(indent)
      if buf[pos] > ' ': 
        handleIndentation(L, tok, indent)
        break 
    else: 
      break                   # EndOfFile also leaves the loop
  L.bufpos = pos

proc rawGetTok(L: var TLexer, tok: var TToken) = 
  # Reads the next token into `tok` and advances the lexer behind it.
  # Pending indentation tokens recorded in `L.dedent` and `L.indentAhead`
  # are delivered first. Otherwise leading whitespace is skipped and the
  # token is chosen by looking at the current character.
  fillToken(tok)
  if L.dedent > 0:
    # deliver one pending indentation token without reading any input
    dec(L.dedent)
    if L.indentAhead >= 0: 
      handleIndentation(L, tok, L.indentAhead)
      L.indentAhead = - 1
    else:
      tok.tokType = tkDed
    return
  skip(L, tok)
  # `skip` has produced an indentation token, return that:
  if tok.toktype != tkInvalid: return
  var c = L.buf[L.bufpos]
  # 'r', 'R' and 'l' can start a symbol too but need the special handling
  # further below:
  if c in SymStartChars - {'r', 'R', 'l'}: 
    getSymbol(L, tok)
  elif c in {'0'..'9'}: 
    tok = getNumber(L)
  else: 
    case c
    of '#': 
      scanComment(L, tok)
    of '*':
      # '*:' is unfortunately a special case, because it is two tokens in 
      # 'var v*: int'.
      if L.buf[L.bufpos+1] == ':' and L.buf[L.bufpos+2] notin OpChars:
        var h = 0 !& ord('*')
        endOperator(L, tok, L.bufpos+1, h)
      else:
        getOperator(L, tok)
    of ',':
      tok.toktype = tkComma
      Inc(L.bufpos)
    of 'l': 
      # if we parsed exactly one character and its a small L (l), this
      # is treated as a warning because it may be confused with the number 1
      if not (L.buf[L.bufpos + 1] in (SymChars + {'_'})): 
        lexMessage(L, warnSmallLshouldNotBeUsed)
      getSymbol(L, tok)
    of 'r', 'R': 
      # an 'r' or 'R' directly in front of a quote starts a raw string literal
      if L.buf[L.bufPos + 1] == '\"': 
        Inc(L.bufPos)
        getString(L, tok, true)
      else: 
        getSymbol(L, tok)
    of '(': 
      Inc(L.bufpos)
      # "(." is a token of its own unless a second '.' follows
      if (L.buf[L.bufPos] == '.') and (L.buf[L.bufPos + 1] != '.'): 
        tok.toktype = tkParDotLe
        Inc(L.bufpos)
      else: 
        tok.toktype = tkParLe
    of ')': 
      tok.toktype = tkParRi
      Inc(L.bufpos)
    of '[': 
      Inc(L.bufpos)
      if (L.buf[L.bufPos] == '.') and (L.buf[L.bufPos + 1] != '.'): 
        tok.toktype = tkBracketDotLe
        Inc(L.bufpos)
      else: 
        tok.toktype = tkBracketLe
    of ']': 
      tok.toktype = tkBracketRi
      Inc(L.bufpos)
    of '.': 
      # ".]", ".}" and ".)" are closing tokens; any other '.' starts an
      # operator
      if L.buf[L.bufPos + 1] == ']': 
        tok.tokType = tkBracketDotRi
        Inc(L.bufpos, 2)
      elif L.buf[L.bufPos + 1] == '}': 
        tok.tokType = tkCurlyDotRi
        Inc(L.bufpos, 2)
      elif L.buf[L.bufPos + 1] == ')': 
        tok.tokType = tkParDotRi
        Inc(L.bufpos, 2)
      else: 
        getOperator(L, tok)
    of '{': 
      Inc(L.bufpos)
      if (L.buf[L.bufPos] == '.') and (L.buf[L.bufPos+1] != '.'): 
        tok.toktype = tkCurlyDotLe
        Inc(L.bufpos)
      else: 
        tok.toktype = tkCurlyLe
    of '}': 
      tok.toktype = tkCurlyRi
      Inc(L.bufpos)
    of ';': 
      tok.toktype = tkSemiColon
      Inc(L.bufpos)
    of '`': 
      tok.tokType = tkAccent
      Inc(L.bufpos)
    of '\"': 
      # check for extended raw string literal:
      var rawMode = L.bufpos > 0 and L.buf[L.bufpos-1] in SymChars
      getString(L, tok, rawMode)
      if rawMode:
        # tkRStrLit -> tkGStrLit
        # tkTripleStrLit -> tkGTripleStrLit
        inc(tok.tokType, 2)
    of '\'':
      # the kind is set before scanning because getEscapedChar tests for
      # tkCharLit
      tok.tokType = tkCharLit
      getCharacter(L, tok)
      tok.tokType = tkCharLit
    else:
      if c in OpChars: 
        getOperator(L, tok)
      elif c == lexbase.EndOfFile:
        tok.toktype = tkEof
      else:
        # unknown character: report it together with its ordinal value and
        # skip it
        tok.literal = c & ""
        tok.tokType = tkInvalid
        lexMessage(L, errInvalidToken, c & " (\\" & $(ord(c)) & ')')
        Inc(L.bufpos)
  
# bind the placeholder identifier to the identifier of the empty string:
dummyIdent = getIdent("")