#
#
# The Nimrod Compiler
# (c) Copyright 2011 Andreas Rumpf
#
# See the file "copying.txt", included in this
# distribution, for details about the copyright.
#
# This scanner is handwritten for efficiency. I used an elegant buffering
# scheme which I have not seen anywhere else:
# We guarantee that a whole line is in the buffer. Thus only when scanning
# the \n or \r character do we have to check whether we need to read in the
# next chunk. (\n or \r already need special handling for incrementing the
# line counter; choosing both \n and \r allows the scanner to properly read
# Unix, DOS or Macintosh text files, even when it is not the native format.)
import
hashes, options, msgs, strutils, platform, idents, lexbase, llstream,
wordrecg
const
  # Column limit checked by HandleCRLF at every line end; longer lines are
  # reported with hintLineTooLong.
  MaxLineLength* = 80         # lines longer than this lead to a warning
  # ASCII digits and letters.
  numChars*: TCharSet = {'0'..'9', 'a'..'z', 'A'..'Z'}
  # Characters accepted inside a symbol after its first character
  # (underscores are handled separately, see isNimrodIdentifier/getSymbol).
  SymChars*: TCharSet = {'a'..'z', 'A'..'Z', '0'..'9', '\x80'..'\xFF'}
  # Characters that may start a symbol: like SymChars but without digits.
  SymStartChars*: TCharSet = {'a'..'z', 'A'..'Z', '\x80'..'\xFF'}
  # Characters that getOperator consumes as part of one operator token.
  OpChars*: TCharSet = {'+', '-', '*', '/', '\\', '<', '>', '!', '?', '^', '.',
    '|', '=', '%', '&', '$', '@', '~', ':', '\x80'..'\xFF'}
type
  # All token kinds produced by the lexer. The declaration order is relied on
  # by code in this file:
  # * tkSymbol is immediately followed by the keywords, so getSymbol can
  #   compute a keyword's kind as TTokType(ident.id + ord(tkSymbol));
  # * the string literal kinds are ordered so that rawGetTok can turn
  #   tkRStrLit into tkGStrLit and tkTripleStrLit into tkGTripleStrLit by
  #   adding 2;
  # * endOperator maps predefined operator identifiers onto the kinds that
  #   start at tkColon.
  # TokTypeToStr is indexed by this enum and must list one string per member
  # in the same order.
  TTokType* = enum
    tkInvalid, tkEof,         # order is important here!
    tkSymbol,                 # keywords:
    tkAddr, tkAnd, tkAs, tkAsm, tkAtomic,
    tkBind, tkBlock, tkBreak, tkCase, tkCast,
    tkConst, tkContinue, tkConverter, tkDiscard, tkDistinct, tkDiv, tkElif,
    tkElse, tkEnd, tkEnum, tkExcept, tkFinally, tkFor, tkFrom, tkGeneric, tkIf,
    tkImport, tkIn, tkInclude, tkIs, tkIsnot, tkIterator,
    tkLambda, tkLet,
    tkMacro, tkMethod, tkMod, tkNil, tkNot, tkNotin, tkObject, tkOf, tkOr,
    tkOut, tkProc, tkPtr, tkRaise, tkRef, tkReturn, tkShl, tkShr, tkTemplate,
    tkTry, tkTuple, tkType, tkVar, tkWhen, tkWhile, tkWith, tkWithout, tkXor,
    tkYield,                  # end of keywords
    tkIntLit, tkInt8Lit, tkInt16Lit, tkInt32Lit, tkInt64Lit, tkFloatLit,
    tkFloat32Lit, tkFloat64Lit, tkStrLit, tkRStrLit, tkTripleStrLit,
    tkGStrLit, tkGTripleStrLit, tkCharLit, tkParLe, tkParRi, tkBracketLe,
    tkBracketRi, tkCurlyLe, tkCurlyRi,
    tkBracketDotLe, tkBracketDotRi, # [. and .]
    tkCurlyDotLe, tkCurlyDotRi, # {. and .}
    tkParDotLe, tkParDotRi,   # (. and .)
    tkComma, tkSemiColon, tkColon, tkColonColon, tkEquals, tkDot, tkDotDot,
    tkOpr, tkComment, tkAccent, tkInd, tkSad,
    tkDed,                    # pseudo token types used by the source renderers:
    tkSpaces, tkInfixOpr, tkPrefixOpr, tkPostfixOpr
  TTokTypes* = set[TTokType]  # a set of token kinds
const
  # First and last keyword member of TTokType; isKeyword and getSymbol test
  # against this inclusive range.
  tokKeywordLow* = succ(tkSymbol)
  tokKeywordHigh* = pred(tkIntLit)
  # Display string for every token kind, indexed by TTokType. Keywords and
  # punctuation map to their source spelling; kinds without a fixed spelling
  # map to their enum name or a bracketed description.
  TokTypeToStr*: array[TTokType, string] = ["tkInvalid", "[EOF]",
    "tkSymbol",
    "addr", "and", "as", "asm", "atomic",
    "bind", "block", "break", "case", "cast",
    "const", "continue", "converter", "discard", "distinct", "div", "elif",
    "else", "end", "enum", "except", "finally", "for", "from", "generic", "if",
    "import", "in", "include", "is", "isnot", "iterator",
    "lambda", "let",
    "macro", "method", "mod", "nil", "not", "notin", "object", "of", "or",
    "out", "proc", "ptr", "raise", "ref", "return", "shl", "shr", "template",
    "try", "tuple", "type", "var", "when", "while", "with", "without", "xor",
    "yield",
    "tkIntLit", "tkInt8Lit", "tkInt16Lit", "tkInt32Lit", "tkInt64Lit",
    "tkFloatLit", "tkFloat32Lit", "tkFloat64Lit", "tkStrLit", "tkRStrLit",
    "tkTripleStrLit", "tkGStrLit", "tkGTripleStrLit", "tkCharLit", "(",
    ")", "[", "]", "{", "}", "[.", ".]", "{.", ".}", "(.", ".)", ",", ";",
    ":", "::",
    "=", ".", "..", "tkOpr", "tkComment", "`", "[new indentation]",
    "[same indentation]", "[dedentation]", "tkSpaces", "tkInfixOpr",
    "tkPrefixOpr", "tkPostfixOpr"]
type
  TNumericalBase* = enum
    base10,                   # base10 is listed as the first element,
                              # so that it is the correct default value
    base2, base8, base16
  TToken* = object            # a Nimrod token
    tokType*: TTokType        # the type of the token
    indent*: int              # the indentation; only valid if tokType is
                              # tkInd, tkSad or tkDed (see handleIndentation)
    ident*: PIdent            # the parsed identifier
    iNumber*: BiggestInt      # the parsed integer literal
    fNumber*: BiggestFloat    # the parsed floating point literal
    base*: TNumericalBase     # the numerical base; only valid for int
                              # or float literals
    literal*: string          # the parsed (string) literal; and
                              # documentation comments are here too
  TLexer* = object of TBaseLexer
    filename*: string         # name used when building line information
    indentStack*: seq[int]    # the indentation stack
    dedent*: int              # counter for DED token generation
    indentAhead*: int         # if >= 0 an indentation has already been read
                              # (-1 means none is pending);
                              # this is needed because scanning comments
                              # needs so much look-ahead
var gLinesCompiled*: int      # all lines that have been compiled; closeLexer
                              # adds the closed lexer's line number to it

# Forward declarations; the implementations follow further down in this file.
proc pushInd*(L: var TLexer, indent: int)
proc popInd*(L: var TLexer)
proc isKeyword*(kind: TTokType): bool
proc openLexer*(lex: var TLexer, filename: string, inputstream: PLLStream)
proc rawGetTok*(L: var TLexer, tok: var TToken)
  # reads in the next token into tok and skips it
proc getColumn*(L: TLexer): int
proc getLineInfo*(L: TLexer): TLineInfo
proc closeLexer*(lex: var TLexer)
proc PrintTok*(tok: TToken)
proc tokToStr*(tok: TToken): string
proc lexMessage*(L: TLexer, msg: TMsgKind, arg = "")
proc isKeyword(kind: TTokType): bool =
  ## Returns true iff `kind` lies in the keyword section of `TTokType`, that
  ## is between `tokKeywordLow` and `tokKeywordHigh` inclusive.
  result = not ((kind < tokKeywordLow) or (kind > tokKeywordHigh))
proc isNimrodIdentifier*(s: string): bool =
  ## Returns true iff `s` is spelled like an identifier: it starts with a
  ## character from `SymStartChars`, continues with characters from
  ## `SymChars`, and every underscore is directly followed by a `SymChars`
  ## character (so no leading, trailing or doubled underscores).
  ##
  ## The empty string is not an identifier. All index accesses are guarded,
  ## so the proc never reads at or beyond `s.len`.
  if s.len == 0 or s[0] notin SymStartChars: return false
  var i = 1
  while i < s.len:
    if s[i] == '_':
      inc(i)
      # a trailing underscore has nothing to attach to
      if i >= s.len: return false
    # also rejects a second underscore directly after the first one
    if s[i] notin SymChars: return false
    inc(i)
  result = true
proc pushInd(L: var TLexer, indent: int) =
  ## Grows the indentation stack by one slot and stores `indent` in it.
  ## `indent` has to be greater than the previous top of the stack;
  ## otherwise an internal error is reported and the new slot is left as
  ## `setLen` created it.
  var slot = len(L.indentStack)
  setLen(L.indentStack, slot + 1)
  if indent <= L.indentStack[slot - 1]:
    InternalError("pushInd")
  else:
    L.indentStack[slot] = indent
proc popInd(L: var TLexer) =
  ## Drops the topmost entry of the indentation stack.
  setLen(L.indentStack, len(L.indentStack) - 1)
proc findIdent(L: TLexer, indent: int): bool =
  ## Returns true iff `indent` occurs anywhere on the indentation stack.
  for level in items(L.indentStack):
    if level == indent: return true
  result = false
proc tokToStr*(tok: TToken): string =
  ## Renders the token as text: numbers via `$`, string/char literals,
  ## comments and invalid tokens via their `literal`, fixed punctuation and
  ## layout tokens via `TokTypeToStr`, and everything else via the spelling
  ## of its identifier. A token of the last group without an identifier is
  ## an internal error and yields "".
  var kind = tok.tokType
  case kind
  of tkIntLit..tkInt64Lit:
    result = $tok.iNumber
  of tkFloatLit..tkFloat64Lit:
    result = $tok.fNumber
  of tkInvalid, tkStrLit..tkCharLit, tkComment:
    result = tok.literal
  of tkParLe..tkColon, tkEof, tkInd, tkSad, tkDed, tkAccent:
    result = TokTypeToStr[kind]
  else:
    if tok.ident == nil:
      InternalError("tokToStr")
      result = ""
    else:
      result = tok.ident.s
proc prettyTok*(tok: TToken): string =
  ## Like `tokToStr`, except that keywords are rendered as "keyword "
  ## followed by their spelling.
  if not isKeyword(tok.tokType): return tokToStr(tok)
  result = "keyword " & tok.ident.s
proc PrintTok*(tok: TToken) =
  ## Debug helper: writes the token kind's display string, a space and the
  ## token text to stdout, followed by a line break.
  write(stdout, TokTypeToStr[tok.tokType] & " ")
  writeln(stdout, tokToStr(tok))
# Identifier stored in every token reset by initToken/fillToken; it is
# assigned once by the module-level statement at the end of this file.
var dummyIdent: PIdent
proc initToken*(L: var TToken) =
  ## Puts a token into its pristine state: kind `tkInvalid`, zero numbers and
  ## indentation, base 10, a new empty literal and `dummyIdent` as identifier.
  L.tokType = tkInvalid
  L.ident = dummyIdent
  L.literal = ""
  L.base = base10
  L.indent = 0
  L.iNumber = 0
  L.fNumber = 0.0
proc fillToken(L: var TToken) =
  ## Resets a token before the next scan. Same field values as `initToken`,
  ## but the existing literal is truncated with `setLen` instead of being
  ## replaced by a new empty string.
  L.tokType = tkInvalid
  L.ident = dummyIdent
  setLen(L.literal, 0)
  L.base = base10
  L.indent = 0
  L.iNumber = 0
  L.fNumber = 0.0
proc openLexer(lex: var TLexer, filename: string, inputstream: PLLStream) =
  ## Prepares `lex` for scanning `inputstream`. The base lexer is opened
  ## first; then the lexer-specific state is set up: `filename` is recorded,
  ## the indentation stack starts with the single level 0, no indentation is
  ## pending (`indentAhead = -1`) and the line counter is advanced by the
  ## stream's `lineOffset`.
  openBaseLexer(lex, inputstream)
  lex.filename = filename
  lex.indentAhead = - 1
  lex.indentStack = @[0]
  inc(lex.Linenumber, inputstream.lineOffset)
proc closeLexer(lex: var TLexer) =
  ## Adds the lexer's current line number to the global `gLinesCompiled`
  ## statistic and then closes the base lexer.
  gLinesCompiled = gLinesCompiled + lex.LineNumber
  closeBaseLexer(lex)
proc getColumn(L: TLexer): int =
  ## Column number of the current buffer position, as computed by
  ## `getColNumber`.
  return getColNumber(L, L.bufPos)
proc getLineInfo(L: TLexer): TLineInfo =
  ## Builds line information (file name, line number, column) for the
  ## current buffer position.
  var column = getColNumber(L, L.bufpos)
  result = newLineInfo(L.filename, L.linenumber, column)
proc lexMessage(L: TLexer, msg: TMsgKind, arg = "") =
  ## Reports message `msg` with argument `arg`, located at the lexer's
  ## current buffer position.
  var here = getLineInfo(L)
  msgs.Message(here, msg, arg)
proc lexMessagePos(L: var TLexer, msg: TMsgKind, pos: int, arg = "") =
  ## Reports message `msg` with argument `arg` for buffer position `pos` on
  ## the current line; the column is `pos` relative to `L.lineStart`.
  msgs.Message(newLineInfo(L.filename, L.linenumber, pos - L.lineStart),
               msg, arg)
proc matchUnderscoreChars(L: var TLexer, tok: var TToken, chars: TCharSet) =
  ## Appends to `tok.literal` the run of characters from `chars` that starts
  ## at `L.bufpos`. After each such character one underscore is accepted,
  ## and copied too, provided another character from `chars` follows it;
  ## otherwise the invalid `_` is reported and scanning stops in front of
  ## it. `L.bufpos` is moved behind the consumed text.
  var cur = L.bufpos          # work on local copies of position and buffer
  var chrs = L.buf
  while chrs[cur] in chars:
    add(tok.literal, chrs[cur])
    inc(cur)
    if chrs[cur] == '_':
      if chrs[cur+1] notin chars:
        lexMessage(L, errInvalidToken, "_")
        break
      add(tok.literal, '_')
      inc(cur)
  L.bufPos = cur
proc matchTwoChars(L: TLexer, first: Char, second: TCharSet): bool =
  ## True iff the character at the current position equals `first` and the
  ## one after it is a member of `second`. Nothing is consumed.
  if L.buf[L.bufpos] != first: return false
  result = L.buf[L.bufpos + 1] in second
proc isFloatLiteral(s: string): bool =
  ## True iff the literal text contains a '.', 'e' or 'E', i.e. a fraction
  ## or an exponent marker.
  result = false
  for c in items(s):
    if c == '.' or c == 'e' or c == 'E':
      result = true
      break
proc GetNumber(L: var TLexer): TToken =
  ## Scans the numeric literal that starts at `L.bufpos` and returns it as a
  ## token.
  ##
  ## Pass 1 copies the literal text (letters and digits with underscores in
  ## between, an optional fraction and an optional exponent) into
  ## `result.literal`. An optional suffix introduced by `'` (`'i8`, `'i16`,
  ## `'i32`, `'i64`, `'f32`, `'f64`) then selects the token type.
  ## Pass 2 computes the value: literals starting with `0x`/`0X`,
  ## `0b`/`0B`, `0o`, `0c`/`0C` are decoded digit by digit from the buffer,
  ## all others are converted with `parseFloat` or `ParseBiggestInt`.
  ##
  ## Problems are reported through `lexMessage`. On return `L.bufpos` is
  ## `endpos`, the position behind the literal text plus any suffix
  ## characters that were consumed.
  var
    pos, endpos: int
    xi: biggestInt
  # get the base:
  result.tokType = tkIntLit   # int literal until we know better
  result.literal = ""
  result.base = base10        # BUGFIX
  pos = L.bufpos              # make sure the literal is correct for error messages:
  matchUnderscoreChars(L, result, {'A'..'Z', 'a'..'z', '0'..'9'})
  # a '.' only belongs to the number if a digit follows it
  if (L.buf[L.bufpos] == '.') and (L.buf[L.bufpos + 1] in {'0'..'9'}):
    add(result.literal, '.')
    inc(L.bufpos)
    #matchUnderscoreChars(L, result, ['A'..'Z', 'a'..'z', '0'..'9'])
    matchUnderscoreChars(L, result, {'0'..'9'})
    if L.buf[L.bufpos] in {'e', 'E'}:
      add(result.literal, 'e') # the exponent marker is stored in lower case
      inc(L.bufpos)
      if L.buf[L.bufpos] in {'+', '-'}:
        add(result.literal, L.buf[L.bufpos])
        inc(L.bufpos)
      matchUnderscoreChars(L, result, {'0'..'9'})
  endpos = L.bufpos
  if L.buf[endpos] == '\'':
    # type suffix; endpos walks over it while L.bufpos goes back to the start
    #matchUnderscoreChars(L, result, ['''', 'f', 'F', 'i', 'I', '0'..'9']);
    inc(endpos)
    L.bufpos = pos            # restore position
    case L.buf[endpos]
    of 'f', 'F':
      inc(endpos)
      if (L.buf[endpos] == '6') and (L.buf[endpos + 1] == '4'):
        result.tokType = tkFloat64Lit
        inc(endpos, 2)
      elif (L.buf[endpos] == '3') and (L.buf[endpos + 1] == '2'):
        result.tokType = tkFloat32Lit
        inc(endpos, 2)
      else:
        lexMessage(L, errInvalidNumber, result.literal & "'f" & L.buf[endpos])
    of 'i', 'I':
      inc(endpos)
      if (L.buf[endpos] == '6') and (L.buf[endpos + 1] == '4'):
        result.tokType = tkInt64Lit
        inc(endpos, 2)
      elif (L.buf[endpos] == '3') and (L.buf[endpos + 1] == '2'):
        result.tokType = tkInt32Lit
        inc(endpos, 2)
      elif (L.buf[endpos] == '1') and (L.buf[endpos + 1] == '6'):
        result.tokType = tkInt16Lit
        inc(endpos, 2)
      elif (L.buf[endpos] == '8'):
        result.tokType = tkInt8Lit
        inc(endpos)
      else:
        lexMessage(L, errInvalidNumber, result.literal & "'i" & L.buf[endpos])
    else: lexMessage(L, errInvalidNumber, result.literal & "'" & L.buf[endpos])
  else:
    L.bufpos = pos            # restore position
  # Pass 2: compute the value. `pos` still marks the start of the literal.
  try:
    if (L.buf[pos] == '0') and
        (L.buf[pos + 1] in {'x', 'X', 'b', 'B', 'o', 'O', 'c', 'C'}):
      inc(pos, 2)
      xi = 0                  # it may be a base prefix
      case L.buf[pos - 1]     # dispatch on the base prefix letter:
      of 'b', 'B':
        result.base = base2
        while true:
          case L.buf[pos]
          of 'A'..'Z', 'a'..'z', '2'..'9', '.':
            # not a binary digit: report it and keep scanning
            lexMessage(L, errInvalidNumber, result.literal)
            inc(pos)
          of '_':
            if L.buf[pos+1] notin {'0'..'1'}:
              lexMessage(L, errInvalidToken, "_")
              break
            inc(pos)
          of '0', '1':
            xi = `shl`(xi, 1) or (ord(L.buf[pos]) - ord('0'))
            inc(pos)
          else: break
      of 'o', 'c', 'C':
        result.base = base8
        while true:
          case L.buf[pos]
          of 'A'..'Z', 'a'..'z', '8'..'9', '.':
            # not an octal digit: report it and keep scanning
            lexMessage(L, errInvalidNumber, result.literal)
            inc(pos)
          of '_':
            if L.buf[pos+1] notin {'0'..'7'}:
              lexMessage(L, errInvalidToken, "_")
              break
            inc(pos)
          of '0'..'7':
            xi = `shl`(xi, 3) or (ord(L.buf[pos]) - ord('0'))
            inc(pos)
          else: break
      of 'O':
        # a capital 'O' prefix is rejected; xi stays 0
        lexMessage(L, errInvalidNumber, result.literal)
      of 'x', 'X':
        result.base = base16
        while true:
          case L.buf[pos]
          of 'G'..'Z', 'g'..'z':
            # not a hexadecimal digit: report it and keep scanning
            lexMessage(L, errInvalidNumber, result.literal)
            inc(pos)
          of '_':
            if L.buf[pos+1] notin {'0'..'9', 'a'..'f', 'A'..'F'}:
              lexMessage(L, errInvalidToken, "_")
              break
            inc(pos)
          of '0'..'9':
            xi = `shl`(xi, 4) or (ord(L.buf[pos]) - ord('0'))
            inc(pos)
          of 'a'..'f':
            xi = `shl`(xi, 4) or (ord(L.buf[pos]) - ord('a') + 10)
            inc(pos)
          of 'A'..'F':
            xi = `shl`(xi, 4) or (ord(L.buf[pos]) - ord('A') + 10)
            inc(pos)
          else: break
      else: InternalError(getLineInfo(L), "getNumber")
      # store the accumulated bits according to the type chosen by the suffix
      case result.tokType
      of tkIntLit, tkInt64Lit: result.iNumber = xi
      of tkInt8Lit: result.iNumber = biggestInt(int8(toU8(int(xi))))
      of tkInt16Lit: result.iNumber = biggestInt(toU16(int(xi)))
      of tkInt32Lit: result.iNumber = biggestInt(toU32(xi))
      of tkFloat32Lit:
        # the bits of xi are reinterpreted as a float, not converted
        result.fNumber = (cast[PFloat32](addr(xi)))[]
        # note: this code is endian neutral!
        # XXX: Test this on big endian machine!
      of tkFloat64Lit: result.fNumber = (cast[PFloat64](addr(xi)))[]
      else: InternalError(getLineInfo(L), "getNumber")
    elif isFloatLiteral(result.literal) or (result.tokType == tkFloat32Lit) or
        (result.tokType == tkFloat64Lit):
      result.fnumber = parseFloat(result.literal)
      if result.tokType == tkIntLit: result.tokType = tkFloatLit
    else:
      result.iNumber = ParseBiggestInt(result.literal)
      # values outside the int32 range: an unsuffixed literal becomes an
      # int64 literal, a literal with a smaller integer suffix is an error
      if (result.iNumber < low(int32)) or (result.iNumber > high(int32)):
        if result.tokType == tkIntLit:
          result.tokType = tkInt64Lit
        elif result.tokType != tkInt64Lit:
          lexMessage(L, errInvalidNumber, result.literal)
  except EInvalidValue: lexMessage(L, errInvalidNumber, result.literal)
  except EOverflow: lexMessage(L, errNumberOutOfRange, result.literal)
  except EOutOfRange: lexMessage(L, errNumberOutOfRange, result.literal)
  L.bufpos = endpos
proc handleHexChar(L: var TLexer, xi: var int) =
  ## If the current character is a hexadecimal digit, shifts `xi` left by
  ## four bits, ors the digit's value into it and advances `L.bufpos` by
  ## one. Any other character leaves both `xi` and the position untouched.
  var c = L.buf[L.bufpos]
  var digit = -1              # -1 means: not a hexadecimal digit
  if c in {'0'..'9'}: digit = ord(c) - ord('0')
  elif c in {'a'..'f'}: digit = ord(c) - ord('a') + 10
  elif c in {'A'..'F'}: digit = ord(c) - ord('A') + 10
  if digit >= 0:
    xi = (xi shl 4) or digit
    inc(L.bufpos)
proc handleDecChars(L: var TLexer, xi: var int) =
  ## Consumes the run of decimal digits at the current position and
  ## accumulates its value into `xi` (`xi * 10 + digit` per character).
  while true:
    var c = L.buf[L.bufpos]
    if c notin {'0'..'9'}: break
    xi = (xi * 10) + (ord(c) - ord('0'))
    inc(L.bufpos)
proc getEscapedChar(L: var TLexer, tok: var TToken) =
  ## Decodes the escape sequence whose backslash is at `L.bufpos` and
  ## appends the resulting character(s) to `tok.literal`.
  ##
  ## Letter escapes are accepted in either case. `\n` appends `tnl` and is
  ## reported inside a character literal. `\x` reads up to two hexadecimal
  ## digits. A decimal escape must not exceed 255, and one that starts with
  ## `0` followed by another digit triggers `warnOctalEscape`. Anything else
  ## is reported as an invalid character constant.
  inc(L.bufpos)               # step over the backslash
  var esc = L.buf[L.bufpos]
  case esc
  of 'n', 'N':
    if tok.toktype == tkCharLit: lexMessage(L, errNnotAllowedInCharacter)
    tok.literal.add(tnl)
    inc(L.bufpos)
  of 'r', 'R', 'c', 'C':
    tok.literal.add(CR)
    inc(L.bufpos)
  of 'l', 'L':
    tok.literal.add(LF)
    inc(L.bufpos)
  of 'f', 'F':
    tok.literal.add(FF)
    inc(L.bufpos)
  of 'e', 'E':
    tok.literal.add(ESC)
    inc(L.bufpos)
  of 'a', 'A':
    tok.literal.add(BEL)
    inc(L.bufpos)
  of 'b', 'B':
    tok.literal.add(BACKSPACE)
    inc(L.bufpos)
  of 'v', 'V':
    tok.literal.add(VT)
    inc(L.bufpos)
  of 't', 'T':
    tok.literal.add(Tabulator)
    inc(L.bufpos)
  of '\'', '\"':
    # an escaped quote stands for itself
    tok.literal.add(esc)
    inc(L.bufpos)
  of '\\':
    tok.literal.add('\\')
    inc(L.bufpos)
  of 'x', 'X':
    inc(L.bufpos)
    var code = 0
    handleHexChar(L, code)
    handleHexChar(L, code)
    tok.literal.add(Chr(code))
  of '0'..'9':
    if matchTwoChars(L, '0', {'0'..'9'}):
      lexMessage(L, warnOctalEscape)
    var code = 0
    handleDecChars(L, code)
    if code > 255: lexMessage(L, errInvalidCharacterConstant)
    else: tok.literal.add(Chr(code))
  else: lexMessage(L, errInvalidCharacterConstant)
proc HandleCRLF(L: var TLexer, pos: int): int =
  ## Processes a line end at buffer position `pos` and returns the position
  ## to continue from. For CR or LF the line length is checked first (lines
  ## ending beyond column `MaxLineLength` get `hintLineTooLong`), then the
  ## matching base lexer handler is called. For any other character `pos`
  ## is returned unchanged.
  var c = L.buf[pos]          # read before the handlers touch the buffer
  if (c == CR) or (c == LF):
    if getColNumber(L, pos) > MaxLineLength:
      lexMessagePos(L, hintLineTooLong, pos)
    if c == CR: result = lexbase.HandleCR(L, pos)
    else: result = lexbase.HandleLF(L, pos)
  else:
    result = pos
proc getString(L: var TLexer, tok: var TToken, rawMode: bool) =
  ## Scans the string literal whose opening `"` is at `L.bufpos` and appends
  ## its contents to `tok.literal`.
  ##
  ## * `"""` starts a long literal (`tkTripleStrLit`). A line end directly
  ##   after the opening quotes is skipped, line ends inside are stored as
  ##   `tnl`, and no escape sequences are decoded. The literal ends at the
  ##   last three quotes of a run of quotes; earlier quotes in the run are
  ##   part of the contents.
  ## * Otherwise the token is `tkRStrLit` if `rawMode` is set (`""` stands
  ##   for one quote, backslashes are taken literally) or `tkStrLit`
  ##   (escapes are decoded by `getEscapedChar`). Such a literal has to be
  ##   closed on the same line.
  ##
  ## On return `L.bufpos` is behind the closing quote(s). For an
  ## unterminated literal it is at the line end or end-of-file marker that
  ## stopped the scan, so the lexer always makes progress.
  var pos = L.bufPos + 1      # skip "
  var buf = L.buf             # local copy of the buffer reference
  var line = L.linenumber     # save linenumber for better error message
  if buf[pos] == '\"' and buf[pos+1] == '\"':
    tok.tokType = tkTripleStrLit # long string literal:
    inc(pos, 2)               # skip ""
    # skip leading newline:
    pos = HandleCRLF(L, pos)
    buf = L.buf               # HandleCRLF may have refilled the buffer
    while true:
      case buf[pos]
      of '\"':
        if buf[pos+1] == '\"' and buf[pos+2] == '\"' and
            buf[pos+3] != '\"':
          L.bufpos = pos + 3  # skip the three """
          break
        add(tok.literal, '\"')
        inc(pos)
      of CR, LF:
        pos = HandleCRLF(L, pos)
        buf = L.buf
        add(tok.literal, tnl)
      of lexbase.EndOfFile:
        # report the error at the line where the literal started
        var line2 = L.linenumber
        L.LineNumber = line
        lexMessagePos(L, errClosingTripleQuoteExpected, L.lineStart)
        L.LineNumber = line2
        # Record how far we got. Without this L.bufpos would still hold the
        # offset of the opening quote, and the next token would be scanned
        # from there again.
        L.bufpos = pos
        break
      else:
        add(tok.literal, buf[pos])
        inc(pos)
  else:
    # ordinary string literal
    if rawMode: tok.tokType = tkRStrLit
    else: tok.tokType = tkStrLit
    while true:
      var c = buf[pos]
      if c == '\"':
        if rawMode and buf[pos+1] == '\"':
          inc(pos, 2)
          add(tok.literal, '"')
        else:
          inc(pos)            # skip '"'
          break
      elif c in {CR, LF, lexbase.EndOfFile}:
        lexMessage(L, errClosingQuoteExpected)
        break
      elif (c == '\\') and not rawMode:
        # getEscapedChar works on L.bufpos, so sync the position both ways
        L.bufPos = pos
        getEscapedChar(L, tok)
        pos = L.bufPos
      else:
        add(tok.literal, c)
        inc(pos)
    L.bufpos = pos
proc getCharacter(L: var TLexer, tok: var TToken) =
  ## Scans a character literal whose opening `'` is at `L.bufpos`. The
  ## character, or the decoded escape sequence, ends up in `tok.literal`.
  ## Control characters and an empty literal (`''`) are reported as invalid,
  ## and so is a missing closing quote.
  inc(L.bufpos)               # step over the opening '
  var c = L.buf[L.bufpos]
  if c == '\\':
    getEscapedChar(L, tok)
  elif (c < ' ') or (c == '\''):
    lexMessage(L, errInvalidCharacterConstant)
  else:
    tok.literal = $c
    inc(L.bufpos)
  if L.buf[L.bufpos] != '\'': lexMessage(L, errMissingFinalQuote)
  inc(L.bufpos)               # step over the closing '
proc getSymbol(L: var TLexer, tok: var TToken) =
  ## Scans an identifier or keyword at `L.bufpos`.
  ##
  ## The hash is computed over the characters with upper case letters
  ## folded to lower case and underscores left out. An underscore that is
  ## not followed by a `SymChars` character is reported and ends the symbol
  ## in front of it. The identifier is looked up with `getIdent`; if its id
  ## falls into the keyword range the token gets the matching keyword kind,
  ## otherwise `tkSymbol`.
  var hash: THash = 0
  var stop = L.bufpos
  var chars = L.buf
  while true:
    var c = chars[stop]
    case c
    of 'a'..'z', '0'..'9', '\x80'..'\xFF':
      hash = hash !& ord(c)
    of 'A'..'Z':
      # hash the lower case form of the letter
      hash = hash !& (ord(c) + (ord('a') - ord('A')))
    of '_':
      if chars[stop+1] notin SymChars:
        lexMessage(L, errInvalidToken, "_")
        break
    else: break
    inc(stop)
  tok.ident = getIdent(addr(L.buf[L.bufpos]), stop - L.bufpos, !$hash)
  L.bufpos = stop
  # identifier ids in the keyword range map directly onto keyword token kinds
  var kind = tok.ident.id + ord(tkSymbol)
  if (kind >= ord(tokKeywordLow)) and (kind <= ord(tokKeywordHigh)):
    tok.tokType = TTokType(kind)
  else:
    tok.tokType = tkSymbol
proc endOperator(L: var TLexer, tok: var TToken, pos: int,
                 hash: THash) {.inline.} =
  ## Finishes an operator token spanning `L.bufpos ..< pos`. `hash` is the
  ## running hash of its characters. The identifier is looked up; ids
  ## within `oprLow..oprHigh` map onto the token kinds starting at
  ## `tkColon`, every other operator becomes `tkOpr`. `L.bufpos` is set to
  ## `pos`.
  tok.ident = getIdent(addr(L.buf[L.bufpos]), pos - L.bufpos, !$hash)
  var id = tok.ident.id
  if (id >= oprLow) and (id <= oprHigh):
    tok.tokType = TTokType(id - oprLow + ord(tkColon))
  else:
    tok.tokType = tkOpr
  L.bufpos = pos
proc getOperator(L: var TLexer, tok: var TToken) =
  ## Scans the longest run of `OpChars` at `L.bufpos`, hashing it on the
  ## way, and lets `endOperator` turn it into a token.
  var stop = L.bufpos
  var chars = L.buf
  var hash: THash = 0
  while chars[stop] in OpChars:
    hash = hash !& Ord(chars[stop])
    inc(stop)
  endOperator(L, tok, stop, hash)
proc handleIndentation(L: var TLexer, tok: var TToken, indent: int) =
  ## Turns the indentation `indent` of a new line into a layout token by
  ## comparing it with the indentation stack: deeper than the top gives
  ## `tkInd`, equal gives `tkSad`, shallower gives `tkDed`.
  ##
  ## For a dedent the stack is searched downwards for `indent`, and
  ## `L.dedent` is raised by the number of levels passed minus one (the
  ## token produced here accounts for one of them). If `indent` is not on
  ## the stack at all, the token becomes `tkSad` and an invalid indentation
  ## is reported.
  tok.indent = indent
  var level = high(L.indentStack)
  var top = L.indentStack[level]
  if indent == top:
    tok.tokType = tkSad
  elif indent > top:
    tok.tokType = tkInd
  else:
    # check we have the indentation somewhere in the stack:
    while (level >= 0) and (L.indentStack[level] != indent):
      dec(level)
      inc(L.dedent)
    dec(L.dedent)
    if level >= 0:
      tok.tokType = tkDed
    else:
      tok.tokType = tkSad     # for the parser it is better as SAD
      lexMessage(L, errInvalidIndentation)
proc scanComment(L: var TLexer, tok: var TToken) =
  ## Scans a comment starting at `L.bufpos` into `tok.literal` as one
  ## `tkComment` token. The comment continues on the next line if that line
  ## consists of spaces followed by a `#` in the same column as the first
  ## one; the lines are then joined with "\n".
  ##
  ## Finding the end needs a look at the following line's indentation. If
  ## that line holds a visible character, its indentation is remembered in
  ## `L.indentAhead` and `L.dedent` is incremented so that `rawGetTok`
  ## handles it on the next call.
  var cur = L.bufpos
  var chars = L.buf
  tok.tokType = tkComment
  var startCol = getColNumber(L, cur)
  while true:
    # copy the rest of the current line
    while chars[cur] notin {CR, LF, lexbase.EndOfFile}:
      add(tok.literal, chars[cur])
      inc(cur)
    cur = HandleCRLF(L, cur)
    chars = L.buf
    # measure the indentation of the following line
    var spaces = 0
    while chars[cur] == ' ':
      inc(cur)
      inc(spaces)
    if (chars[cur] != '#') or (startCol != spaces):
      if chars[cur] > ' ':
        L.indentAhead = spaces
        inc(L.dedent)
      break
    add(tok.literal, "\n")
  L.bufpos = cur
proc skip(L: var TLexer, tok: var TToken) =
  ## Skips spaces and line ends in front of the next token. Tabulators are
  ## reported as errors and skipped too. After a line end the leading
  ## spaces of the new line are counted; if a visible character follows,
  ## `handleIndentation` turns the count into a layout token in `tok` and
  ## skipping stops. Blank lines are skipped entirely.
  var cur = L.bufpos
  var chars = L.buf
  while true:
    var c = chars[cur]
    if c == ' ':
      inc(cur)
    elif c == Tabulator:
      lexMessagePos(L, errTabulatorsAreNotAllowed, cur)
      inc(cur)
    elif (c == CR) or (c == LF):
      cur = HandleCRLF(L, cur)
      chars = L.buf
      var spaces = 0
      while chars[cur] == ' ':
        inc(cur)
        inc(spaces)
      if chars[cur] > ' ':
        handleIndentation(L, tok, spaces)
        break
    else:
      break                   # EndOfFile also leaves the loop
  L.bufpos = cur
proc rawGetTok(L: var TLexer, tok: var TToken) =
  ## Reads the next token into `tok` and advances the lexer behind it.
  ##
  ## Pending layout tokens come first: while `L.dedent` is positive each
  ## call consumes one of them, producing either the token for the
  ## indentation recorded in `L.indentAhead` or a plain `tkDed`. Otherwise
  ## whitespace is skipped (which may itself yield a layout token) and the
  ## token is chosen by its first character.
  fillToken(tok)
  if L.dedent > 0:
    dec(L.dedent)
    if L.indentAhead >= 0:
      # indentation that scanComment already read ahead
      handleIndentation(L, tok, L.indentAhead)
      L.indentAhead = - 1
    else:
      tok.tokType = tkDed
    return
  skip(L, tok)
  # skip produced an indentation token (tkInd, tkSad or tkDed), return that:
  if tok.toktype != tkInvalid: return
  var c = L.buf[L.bufpos]
  # 'r', 'R' and 'l' can start a symbol too but need the extra checks below
  if c in SymStartChars - {'r', 'R', 'l'}:
    getSymbol(L, tok)
  elif c in {'0'..'9'}:
    tok = getNumber(L)
  else:
    case c
    of '#':
      scanComment(L, tok)
    of '*':
      # '*:' is unfortunately a special case, because it is two tokens in
      # 'var v*: int'.
      if L.buf[L.bufpos+1] == ':' and L.buf[L.bufpos+2] notin OpChars:
        var h = 0 !& ord('*')
        endOperator(L, tok, L.bufpos+1, h)
      else:
        getOperator(L, tok)
    of ',':
      tok.toktype = tkComma
      Inc(L.bufpos)
    of 'l':
      # if we parsed exactly one character and its a small L (l), this
      # is treated as a warning because it may be confused with the number 1
      if not (L.buf[L.bufpos + 1] in (SymChars + {'_'})):
        lexMessage(L, warnSmallLshouldNotBeUsed)
      getSymbol(L, tok)
    of 'r', 'R':
      # r"..." is a raw string literal; otherwise an ordinary symbol
      if L.buf[L.bufPos + 1] == '\"':
        Inc(L.bufPos)
        getString(L, tok, true)
      else:
        getSymbol(L, tok)
    of '(':
      Inc(L.bufpos)
      # "(." is one token unless a second '.' follows
      if (L.buf[L.bufPos] == '.') and (L.buf[L.bufPos + 1] != '.'):
        tok.toktype = tkParDotLe
        Inc(L.bufpos)
      else:
        tok.toktype = tkParLe
    of ')':
      tok.toktype = tkParRi
      Inc(L.bufpos)
    of '[':
      Inc(L.bufpos)
      # "[." is one token unless a second '.' follows
      if (L.buf[L.bufPos] == '.') and (L.buf[L.bufPos + 1] != '.'):
        tok.toktype = tkBracketDotLe
        Inc(L.bufpos)
      else:
        tok.toktype = tkBracketLe
    of ']':
      tok.toktype = tkBracketRi
      Inc(L.bufpos)
    of '.':
      # ".]", ".}" and ".)" are closing tokens; any other '.' is an operator
      if L.buf[L.bufPos + 1] == ']':
        tok.tokType = tkBracketDotRi
        Inc(L.bufpos, 2)
      elif L.buf[L.bufPos + 1] == '}':
        tok.tokType = tkCurlyDotRi
        Inc(L.bufpos, 2)
      elif L.buf[L.bufPos + 1] == ')':
        tok.tokType = tkParDotRi
        Inc(L.bufpos, 2)
      else:
        getOperator(L, tok)
    of '{':
      Inc(L.bufpos)
      # "{." is one token unless a second '.' follows
      if (L.buf[L.bufPos] == '.') and (L.buf[L.bufPos+1] != '.'):
        tok.toktype = tkCurlyDotLe
        Inc(L.bufpos)
      else:
        tok.toktype = tkCurlyLe
    of '}':
      tok.toktype = tkCurlyRi
      Inc(L.bufpos)
    of ';':
      tok.toktype = tkSemiColon
      Inc(L.bufpos)
    of '`':
      tok.tokType = tkAccent
      Inc(L.bufpos)
    of '\"':
      # check for extended raw string literal:
      # (a quote directly behind a symbol character, as in ident"...")
      var rawMode = L.bufpos > 0 and L.buf[L.bufpos-1] in SymChars
      getString(L, tok, rawMode)
      if rawMode:
        # tkRStrLit -> tkGStrLit
        # tkTripleStrLit -> tkGTripleStrLit
        inc(tok.tokType, 2)
    of '\'':
      # set before the call because getEscapedChar inspects tok.toktype
      tok.tokType = tkCharLit
      getCharacter(L, tok)
      tok.tokType = tkCharLit
    else:
      if c in OpChars:
        getOperator(L, tok)
      elif c == lexbase.EndOfFile:
        tok.toktype = tkEof
      else:
        # unknown character: report it with its ordinal value and skip it
        tok.literal = c & ""
        tok.tokType = tkInvalid
        lexMessage(L, errInvalidToken, c & " (\\" & $(ord(c)) & ')')
        Inc(L.bufpos)
# Module initialisation: tokens reset by initToken/fillToken carry the
# identifier for the empty string.
dummyIdent = getIdent("")