diff options
Diffstat (limited to 'rod/scanner.nim')
-rwxr-xr-x | rod/scanner.nim | 789 |
1 file changed, 789 insertions, 0 deletions
diff --git a/rod/scanner.nim b/rod/scanner.nim new file mode 100755 index 000000000..c7fcbb062 --- /dev/null +++ b/rod/scanner.nim @@ -0,0 +1,789 @@ +# +# +# The Nimrod Compiler +# (c) Copyright 2009 Andreas Rumpf +# +# See the file "copying.txt", included in this +# distribution, for details about the copyright. +# + +# This scanner is handwritten for efficiency. I used an elegant buffering +# scheme which I have not seen anywhere else: +# We guarantee that a whole line is in the buffer. Thus only when scanning +# the \n or \r character we have to check wether we need to read in the next +# chunk. (\n or \r already need special handling for incrementing the line +# counter; choosing both \n and \r allows the scanner to properly read Unix, +# DOS or Macintosh text files, even when it is not the native format. + +import + nhashes, options, msgs, strutils, platform, idents, lexbase, llstream, + wordrecg + +const + MaxLineLength* = 80 # lines longer than this lead to a warning + numChars*: TCharSet = {'0'..'9', 'a'..'z', 'A'..'Z'} + SymChars*: TCharSet = {'a'..'z', 'A'..'Z', '0'..'9', '\x80'..'\xFF'} + SymStartChars*: TCharSet = {'a'..'z', 'A'..'Z', '\x80'..'\xFF'} + OpChars*: TCharSet = {'+', '-', '*', '/', '\\', '<', '>', '!', '?', '^', '.', + '|', '=', '%', '&', '$', '@', '~', '\x80'..'\xFF'} + +type + TTokType* = enum + tkInvalid, tkEof, # order is important here! 
+ tkSymbol, # keywords: + #[[[cog + #from string import split, capitalize + #keywords = split(open("data/keywords.txt").read()) + #idents = "" + #strings = "" + #i = 1 + #for k in keywords: + # idents = idents + "tk" + capitalize(k) + ", " + # strings = strings + "'" + k + "', " + # if i % 4 == 0: + # idents = idents + "\n" + # strings = strings + "\n" + # i = i + 1 + #cog.out(idents) + #]]] + tkAddr, tkAnd, tkAs, tkAsm, tkBind, tkBlock, tkBreak, tkCase, tkCast, + tkConst, tkContinue, tkConverter, tkDiscard, tkDistinct, tkDiv, tkElif, + tkElse, tkEnd, tkEnum, tkExcept, tkFinally, tkFor, tkFrom, tkGeneric, tkIf, + tkImplies, tkImport, tkIn, tkInclude, tkIs, tkIsnot, tkIterator, tkLambda, + tkMacro, tkMethod, tkMod, tkNil, tkNot, tkNotin, tkObject, tkOf, tkOr, + tkOut, tkProc, tkPtr, tkRaise, tkRef, tkReturn, tkShl, tkShr, tkTemplate, + tkTry, tkTuple, tkType, tkVar, tkWhen, tkWhile, tkWith, tkWithout, tkXor, tkYield, #[[[end]]] + tkIntLit, tkInt8Lit, tkInt16Lit, tkInt32Lit, tkInt64Lit, tkFloatLit, + tkFloat32Lit, tkFloat64Lit, tkStrLit, tkRStrLit, tkTripleStrLit, + tkCallRStrLit, tkCallTripleStrLit, tkCharLit, tkParLe, tkParRi, tkBracketLe, + tkBracketRi, tkCurlyLe, tkCurlyRi, tkBracketDotLe, tkBracketDotRi, # [. and .] + tkCurlyDotLe, tkCurlyDotRi, # {. and .} + tkParDotLe, tkParDotRi, # (. and .) 
+ tkComma, tkSemiColon, tkColon, tkEquals, tkDot, tkDotDot, tkHat, tkOpr, + tkComment, tkAccent, tkInd, tkSad, tkDed, # pseudo token types used by the source renderers: + tkSpaces, tkInfixOpr, tkPrefixOpr, tkPostfixOpr + TTokTypes* = set[TTokType] + +const + tokKeywordLow* = succ(tkSymbol) + tokKeywordHigh* = pred(tkIntLit) + tokOperators*: TTokTypes = {tkOpr, tkSymbol, tkBracketLe, tkBracketRi, tkIn, + tkIs, tkIsNot, tkEquals, tkDot, tkHat, tkNot, tkAnd, tkOr, tkXor, tkShl, + tkShr, tkDiv, tkMod, tkNotIn} + TokTypeToStr*: array[TTokType, string] = ["tkInvalid", "[EOF]", "tkSymbol", #[[[cog + #cog.out(strings) + #]]] + "addr", "and", "as", "asm", "bind", "block", "break", "case", "cast", + "const", "continue", "converter", "discard", "distinct", "div", "elif", + "else", "end", "enum", "except", "finally", "for", "from", "generic", "if", + "implies", "import", "in", "include", "is", "isnot", "iterator", "lambda", + "macro", "method", "mod", "nil", "not", "notin", "object", "of", "or", + "out", "proc", "ptr", "raise", "ref", "return", "shl", "shr", "template", + "try", "tuple", "type", "var", "when", "while", "with", "without", "xor", "yield", #[[[end]]] + "tkIntLit", "tkInt8Lit", "tkInt16Lit", "tkInt32Lit", "tkInt64Lit", + "tkFloatLit", "tkFloat32Lit", "tkFloat64Lit", "tkStrLit", "tkRStrLit", + "tkTripleStrLit", "tkCallRStrLit", "tkCallTripleStrLit", "tkCharLit", "(", + ")", "[", "]", "{", "}", "[.", ".]", "{.", ".}", "(.", ".)", ",", ";", ":", + "=", ".", "..", "^", "tkOpr", "tkComment", "`", "[new indentation]", + "[same indentation]", "[dedentation]", "tkSpaces", "tkInfixOpr", + "tkPrefixOpr", "tkPostfixOpr"] + +type + TNumericalBase* = enum + base10, # base10 is listed as the first element, + # so that it is the correct default value + base2, base8, base16 + PToken* = ref TToken + TToken* = object # a Nimrod token + tokType*: TTokType # the type of the token + indent*: int # the indentation; only valid if tokType = tkIndent + ident*: PIdent # the parsed 
identifier + iNumber*: BiggestInt # the parsed integer literal + fNumber*: BiggestFloat # the parsed floating point literal + base*: TNumericalBase # the numerical base; only valid for int + # or float literals + literal*: string # the parsed (string) literal; and + # documentation comments are here too + next*: PToken # next token; can be used for arbitrary look-ahead + + PLexer* = ref TLexer + TLexer* = object of TBaseLexer + filename*: string + indentStack*: seq[int] # the indentation stack + dedent*: int # counter for DED token generation + indentAhead*: int # if > 0 an indendation has already been read + # this is needed because scanning comments + # needs so much look-ahead + + +var gLinesCompiled*: int + +proc pushInd*(L: var TLexer, indent: int) + # all lines that have been compiled +proc popInd*(L: var TLexer) +proc isKeyword*(kind: TTokType): bool +proc openLexer*(lex: var TLexer, filename: string, inputstream: PLLStream) +proc rawGetTok*(L: var TLexer, tok: var TToken) + # reads in the next token into tok and skips it +proc getColumn*(L: TLexer): int +proc getLineInfo*(L: TLexer): TLineInfo +proc closeLexer*(lex: var TLexer) +proc PrintTok*(tok: PToken) +proc tokToStr*(tok: PToken): string + # auxiliary functions: +proc lexMessage*(L: TLexer, msg: TMsgKind, arg: string = "") + # the Pascal scanner uses this too: +proc fillToken*(L: var TToken) +# implementation + +proc isKeyword(kind: TTokType): bool = + result = (kind >= tokKeywordLow) and (kind <= tokKeywordHigh) + +proc pushInd(L: var TLexer, indent: int) = + var length: int + length = len(L.indentStack) + setlen(L.indentStack, length + 1) + if (indent > L.indentStack[length - 1]): + L.indentstack[length] = indent + else: + InternalError("pushInd") #writeln('push indent ', indent); + +proc popInd(L: var TLexer) = + var length: int + length = len(L.indentStack) + setlen(L.indentStack, length - 1) + +proc findIdent(L: TLexer, indent: int): bool = + for i in countdown(len(L.indentStack) - 1, 0): + if 
L.indentStack[i] == indent: + return true + result = false + +proc tokToStr(tok: PToken): string = + case tok.tokType + of tkIntLit..tkInt64Lit: + result = $(tok.iNumber) + of tkFloatLit..tkFloat64Lit: + result = $(tok.fNumber) + of tkInvalid, tkStrLit..tkCharLit, tkComment: + result = tok.literal + of tkParLe..tkColon, tkEof, tkInd, tkSad, tkDed, tkAccent: + result = tokTypeToStr[tok.tokType] + else: + if (tok.ident != nil): + result = tok.ident.s + else: + InternalError("tokToStr") + result = "" + +proc PrintTok(tok: PToken) = + write(stdout, TokTypeToStr[tok.tokType]) + write(stdout, " ") + writeln(stdout, tokToStr(tok)) + +var dummyIdent: PIdent + +proc fillToken(L: var TToken) = + L.TokType = tkInvalid + L.iNumber = 0 + L.Indent = 0 + L.literal = "" + L.fNumber = 0.0 + L.base = base10 + L.ident = dummyIdent # this prevents many bugs! + +proc openLexer(lex: var TLexer, filename: string, inputstream: PLLStream) = + openBaseLexer(lex, inputstream) + lex.indentStack = @ [0] + lex.filename = filename + lex.indentAhead = - 1 + +proc closeLexer(lex: var TLexer) = + inc(gLinesCompiled, lex.LineNumber) + closeBaseLexer(lex) + +proc getColumn(L: TLexer): int = + result = getColNumber(L, L.bufPos) + +proc getLineInfo(L: TLexer): TLineInfo = + result = newLineInfo(L.filename, L.linenumber, getColNumber(L, L.bufpos)) + +proc lexMessage(L: TLexer, msg: TMsgKind, arg: string = "") = + msgs.liMessage(getLineInfo(L), msg, arg) + +proc lexMessagePos(L: var TLexer, msg: TMsgKind, pos: int, arg: string = "") = + var info: TLineInfo + info = newLineInfo(L.filename, L.linenumber, pos - L.lineStart) + msgs.liMessage(info, msg, arg) + +proc matchUnderscoreChars(L: var TLexer, tok: var TToken, chars: TCharSet) = + # matches ([chars]_)* + var + pos: int + buf: cstring + pos = L.bufpos # use registers for pos, buf + buf = L.buf + while true: + if buf[pos] in chars: + add(tok.literal, buf[pos]) + Inc(pos) + else: + break + if buf[pos] == '_': + add(tok.literal, '_') + Inc(pos) + L.bufPos 
= pos + +proc matchTwoChars(L: TLexer, first: Char, second: TCharSet): bool = + result = (L.buf[L.bufpos] == first) and (L.buf[L.bufpos + 1] in Second) + +proc isFloatLiteral(s: string): bool = + for i in countup(0, len(s) + 0 - 1): + if s[i] in {'.', 'e', 'E'}: + return true + result = false + +proc GetNumber(L: var TLexer): TToken = + var + pos, endpos: int + xi: biggestInt + # get the base: + result.tokType = tkIntLit # int literal until we know better + result.literal = "" + result.base = base10 # BUGFIX + pos = L.bufpos # make sure the literal is correct for error messages: + matchUnderscoreChars(L, result, {'A'..'Z', 'a'..'z', '0'..'9'}) + if (L.buf[L.bufpos] == '.') and (L.buf[L.bufpos + 1] in {'0'..'9'}): + add(result.literal, '.') + inc(L.bufpos) #matchUnderscoreChars(L, result, ['A'..'Z', 'a'..'z', '0'..'9']) + matchUnderscoreChars(L, result, {'0'..'9'}) + if L.buf[L.bufpos] in {'e', 'E'}: + add(result.literal, 'e') + inc(L.bufpos) + if L.buf[L.bufpos] in {'+', '-'}: + add(result.literal, L.buf[L.bufpos]) + inc(L.bufpos) + matchUnderscoreChars(L, result, {'0'..'9'}) + endpos = L.bufpos + if L.buf[endpos] == '\'': + #matchUnderscoreChars(L, result, ['''', 'f', 'F', 'i', 'I', '0'..'9']); + inc(endpos) + L.bufpos = pos # restore position + case L.buf[endpos] + of 'f', 'F': + inc(endpos) + if (L.buf[endpos] == '6') and (L.buf[endpos + 1] == '4'): + result.tokType = tkFloat64Lit + inc(endpos, 2) + elif (L.buf[endpos] == '3') and (L.buf[endpos + 1] == '2'): + result.tokType = tkFloat32Lit + inc(endpos, 2) + else: + lexMessage(L, errInvalidNumber, result.literal) + of 'i', 'I': + inc(endpos) + if (L.buf[endpos] == '6') and (L.buf[endpos + 1] == '4'): + result.tokType = tkInt64Lit + inc(endpos, 2) + elif (L.buf[endpos] == '3') and (L.buf[endpos + 1] == '2'): + result.tokType = tkInt32Lit + inc(endpos, 2) + elif (L.buf[endpos] == '1') and (L.buf[endpos + 1] == '6'): + result.tokType = tkInt16Lit + inc(endpos, 2) + elif (L.buf[endpos] == '8'): + result.tokType = 
tkInt8Lit + inc(endpos) + else: + lexMessage(L, errInvalidNumber, result.literal) + else: lexMessage(L, errInvalidNumber, result.literal) + else: + L.bufpos = pos # restore position + try: + if (L.buf[pos] == '0') and + (L.buf[pos + 1] in {'x', 'X', 'b', 'B', 'o', 'O', 'c', 'C'}): + inc(pos, 2) + xi = 0 # it may be a base prefix + case L.buf[pos - 1] # now look at the optional type suffix: + of 'b', 'B': + result.base = base2 + while true: + case L.buf[pos] + of 'A'..'Z', 'a'..'z', '2'..'9', '.': + lexMessage(L, errInvalidNumber, result.literal) + inc(pos) + of '_': + inc(pos) + of '0', '1': + xi = `shl`(xi, 1) or (ord(L.buf[pos]) - ord('0')) + inc(pos) + else: break + of 'o', 'c', 'C': + result.base = base8 + while true: + case L.buf[pos] + of 'A'..'Z', 'a'..'z', '8'..'9', '.': + lexMessage(L, errInvalidNumber, result.literal) + inc(pos) + of '_': + inc(pos) + of '0'..'7': + xi = `shl`(xi, 3) or (ord(L.buf[pos]) - ord('0')) + inc(pos) + else: break + of 'O': + lexMessage(L, errInvalidNumber, result.literal) + of 'x', 'X': + result.base = base16 + while true: + case L.buf[pos] + of 'G'..'Z', 'g'..'z', '.': + lexMessage(L, errInvalidNumber, result.literal) + inc(pos) + of '_': + inc(pos) + of '0'..'9': + xi = `shl`(xi, 4) or (ord(L.buf[pos]) - ord('0')) + inc(pos) + of 'a'..'f': + xi = `shl`(xi, 4) or (ord(L.buf[pos]) - ord('a') + 10) + inc(pos) + of 'A'..'F': + xi = `shl`(xi, 4) or (ord(L.buf[pos]) - ord('A') + 10) + inc(pos) + else: break + else: InternalError(getLineInfo(L), "getNumber") + case result.tokType + of tkIntLit, tkInt64Lit: + result.iNumber = xi + of tkInt8Lit: + result.iNumber = biggestInt(int8(toU8(int(xi)))) + of tkInt16Lit: + result.iNumber = biggestInt(toU16(int(xi))) + of tkInt32Lit: + result.iNumber = biggestInt(toU32(xi)) + of tkFloat32Lit: + result.fNumber = (cast[PFloat32](addr(xi)))^ # note: this code is endian neutral! + # XXX: Test this on big endian machine! 
+ of tkFloat64Lit: + result.fNumber = (cast[PFloat64](addr(xi)))^ + else: InternalError(getLineInfo(L), "getNumber") + elif isFloatLiteral(result.literal) or (result.tokType == tkFloat32Lit) or + (result.tokType == tkFloat64Lit): + result.fnumber = parseFloat(result.literal) + if result.tokType == tkIntLit: result.tokType = tkFloatLit + else: + result.iNumber = ParseBiggestInt(result.literal) + if (result.iNumber < low(int32)) or (result.iNumber > high(int32)): + if result.tokType == tkIntLit: + result.tokType = tkInt64Lit + elif result.tokType != tkInt64Lit: + lexMessage(L, errInvalidNumber, result.literal) + except EInvalidValue: + lexMessage(L, errInvalidNumber, result.literal) + except EOverflow: + lexMessage(L, errNumberOutOfRange, result.literal) + except EOutOfRange: + lexMessage(L, errNumberOutOfRange, result.literal) + L.bufpos = endpos + +proc handleHexChar(L: var TLexer, xi: var int) = + case L.buf[L.bufpos] + of '0'..'9': + xi = (xi shl 4) or (ord(L.buf[L.bufpos]) - ord('0')) + inc(L.bufpos) + of 'a'..'f': + xi = (xi shl 4) or (ord(L.buf[L.bufpos]) - ord('a') + 10) + inc(L.bufpos) + of 'A'..'F': + xi = (xi shl 4) or (ord(L.buf[L.bufpos]) - ord('A') + 10) + inc(L.bufpos) + else: + nil + +proc handleDecChars(L: var TLexer, xi: var int) = + while L.buf[L.bufpos] in {'0'..'9'}: + xi = (xi * 10) + (ord(L.buf[L.bufpos]) - ord('0')) + inc(L.bufpos) + +proc getEscapedChar(L: var TLexer, tok: var TToken) = + var xi: int + inc(L.bufpos) # skip '\' + case L.buf[L.bufpos] + of 'n', 'N': + if tok.toktype == tkCharLit: lexMessage(L, errNnotAllowedInCharacter) + tok.literal = tok.literal & tnl + Inc(L.bufpos) + of 'r', 'R', 'c', 'C': + add(tok.literal, CR) + Inc(L.bufpos) + of 'l', 'L': + add(tok.literal, LF) + Inc(L.bufpos) + of 'f', 'F': + add(tok.literal, FF) + inc(L.bufpos) + of 'e', 'E': + add(tok.literal, ESC) + Inc(L.bufpos) + of 'a', 'A': + add(tok.literal, BEL) + Inc(L.bufpos) + of 'b', 'B': + add(tok.literal, BACKSPACE) + Inc(L.bufpos) + of 'v', 'V': + 
add(tok.literal, VT) + Inc(L.bufpos) + of 't', 'T': + add(tok.literal, Tabulator) + Inc(L.bufpos) + of '\'', '\"': + add(tok.literal, L.buf[L.bufpos]) + Inc(L.bufpos) + of '\\': + add(tok.literal, '\\') + Inc(L.bufpos) + of 'x', 'X': + inc(L.bufpos) + xi = 0 + handleHexChar(L, xi) + handleHexChar(L, xi) + add(tok.literal, Chr(xi)) + of '0'..'9': + if matchTwoChars(L, '0', {'0'..'9'}): + lexMessage(L, warnOctalEscape) + xi = 0 + handleDecChars(L, xi) + if (xi <= 255): add(tok.literal, Chr(xi)) + else: lexMessage(L, errInvalidCharacterConstant) + else: lexMessage(L, errInvalidCharacterConstant) + +proc HandleCRLF(L: var TLexer, pos: int): int = + case L.buf[pos] + of CR: + if getColNumber(L, pos) > MaxLineLength: + lexMessagePos(L, hintLineTooLong, pos) + result = lexbase.HandleCR(L, pos) + of LF: + if getColNumber(L, pos) > MaxLineLength: + lexMessagePos(L, hintLineTooLong, pos) + result = lexbase.HandleLF(L, pos) + else: result = pos + +proc getString(L: var TLexer, tok: var TToken, rawMode: bool) = + var + line, line2, pos: int + c: Char + buf: cstring + pos = L.bufPos + 1 # skip " + buf = L.buf # put `buf` in a register + line = L.linenumber # save linenumber for better error message + if (buf[pos] == '\"') and (buf[pos + 1] == '\"'): + tok.tokType = tkTripleStrLit # long string literal: + inc(pos, 2) # skip "" + # skip leading newline: + pos = HandleCRLF(L, pos) + buf = L.buf + while true: + case buf[pos] + of '\"': + if (buf[pos + 1] == '\"') and (buf[pos + 2] == '\"'): break + add(tok.literal, '\"') + Inc(pos) + of CR, LF: + pos = HandleCRLF(L, pos) + buf = L.buf + tok.literal = tok.literal & tnl + of lexbase.EndOfFile: + line2 = L.linenumber + L.LineNumber = line + lexMessagePos(L, errClosingTripleQuoteExpected, L.lineStart) + L.LineNumber = line2 + break + else: + add(tok.literal, buf[pos]) + Inc(pos) + L.bufpos = pos + + 3 # skip the three """ + else: + # ordinary string literal + if rawMode: tok.tokType = tkRStrLit + else: tok.tokType = tkStrLit + while 
true: + c = buf[pos] + if c == '\"': + inc(pos) # skip '"' + break + if c in {CR, LF, lexbase.EndOfFile}: + lexMessage(L, errClosingQuoteExpected) + break + if (c == '\\') and not rawMode: + L.bufPos = pos + getEscapedChar(L, tok) + pos = L.bufPos + else: + add(tok.literal, c) + Inc(pos) + L.bufpos = pos + +proc getCharacter(L: var TLexer, tok: var TToken) = + var c: Char + Inc(L.bufpos) # skip ' + c = L.buf[L.bufpos] + case c + of '\0'..Pred(' '), '\'': lexMessage(L, errInvalidCharacterConstant) + of '\\': getEscapedChar(L, tok) + else: + tok.literal = c & "" + Inc(L.bufpos) + if L.buf[L.bufpos] != '\'': lexMessage(L, errMissingFinalQuote) + inc(L.bufpos) # skip ' + +proc getSymbol(L: var TLexer, tok: var TToken) = + var + pos: int + c: Char + buf: cstring + h: THash # hashing algorithm inlined + h = 0 + pos = L.bufpos + buf = L.buf + while true: + c = buf[pos] + case c + of 'a'..'z', '0'..'9', '\x80'..'\xFF': + h = h +% Ord(c) + h = h +% h shl 10 + h = h xor (h shr 6) + of 'A'..'Z': + c = chr(ord(c) + (ord('a') - ord('A'))) # toLower() + h = h +% Ord(c) + h = h +% h shl 10 + h = h xor (h shr 6) + of '_': + nil + else: break + Inc(pos) + h = h +% h shl 3 + h = h xor (h shr 11) + h = h +% h shl 15 + tok.ident = getIdent(addr(L.buf[L.bufpos]), pos - L.bufpos, h) + L.bufpos = pos + if (tok.ident.id < ord(tokKeywordLow) - ord(tkSymbol)) or + (tok.ident.id > ord(tokKeywordHigh) - ord(tkSymbol)): + tok.tokType = tkSymbol + else: + tok.tokType = TTokType(tok.ident.id + ord(tkSymbol)) + if buf[pos] == '\"': + getString(L, tok, true) + if tok.tokType == tkRStrLit: tok.tokType = tkCallRStrLit + else: tok.tokType = tkCallTripleStrLit + +proc getOperator(L: var TLexer, tok: var TToken) = + var + pos: int + c: Char + buf: cstring + h: THash # hashing algorithm inlined + pos = L.bufpos + buf = L.buf + h = 0 + while true: + c = buf[pos] + if c in OpChars: + h = h +% Ord(c) + h = h +% h shl 10 + h = h xor (h shr 6) + else: + break + Inc(pos) + h = h +% h shl 3 + h = h xor (h shr 
11) + h = h +% h shl 15 + tok.ident = getIdent(addr(L.buf[L.bufpos]), pos - L.bufpos, h) + if (tok.ident.id < oprLow) or (tok.ident.id > oprHigh): tok.tokType = tkOpr + else: tok.tokType = TTokType(tok.ident.id - oprLow + ord(tkColon)) + L.bufpos = pos + +proc handleIndentation(L: var TLexer, tok: var TToken, indent: int) = + var i: int + tok.indent = indent + i = high(L.indentStack) + if indent > L.indentStack[i]: + tok.tokType = tkInd + elif indent == L.indentStack[i]: + tok.tokType = tkSad + else: + # check we have the indentation somewhere in the stack: + while (i >= 0) and (indent != L.indentStack[i]): + dec(i) + inc(L.dedent) + dec(L.dedent) + tok.tokType = tkDed + if i < 0: + tok.tokType = tkSad # for the parser it is better as SAD + lexMessage(L, errInvalidIndentation) + +proc scanComment(L: var TLexer, tok: var TToken) = + var + buf: cstring + pos, col: int + indent: int + pos = L.bufpos + buf = L.buf # a comment ends if the next line does not start with the # on the same + # column after only whitespace + tok.tokType = tkComment + col = getColNumber(L, pos) + while true: + while not (buf[pos] in {CR, LF, lexbase.EndOfFile}): + add(tok.literal, buf[pos]) + inc(pos) + pos = handleCRLF(L, pos) + buf = L.buf + indent = 0 + while buf[pos] == ' ': + inc(pos) + inc(indent) + if (buf[pos] == '#') and (col == indent): + tok.literal = tok.literal & "\n" + else: + if buf[pos] > ' ': + L.indentAhead = indent + inc(L.dedent) + break + L.bufpos = pos + +proc skip(L: var TLexer, tok: var TToken) = + var + buf: cstring + indent, pos: int + pos = L.bufpos + buf = L.buf + while true: + case buf[pos] + of ' ': + Inc(pos) + of Tabulator: + lexMessagePos(L, errTabulatorsAreNotAllowed, pos) + inc(pos) # BUGFIX + of CR, LF: + pos = HandleCRLF(L, pos) + buf = L.buf + indent = 0 + while buf[pos] == ' ': + Inc(pos) + Inc(indent) + if (buf[pos] > ' '): + handleIndentation(L, tok, indent) + break + else: + break # EndOfFile also leaves the loop + L.bufpos = pos + +proc rawGetTok(L: 
var TLexer, tok: var TToken) = + var c: Char + fillToken(tok) + if L.dedent > 0: + dec(L.dedent) + if L.indentAhead >= 0: + handleIndentation(L, tok, L.indentAhead) + L.indentAhead = - 1 + else: + tok.tokType = tkDed + return + skip(L, tok) # skip + # got an documentation comment or tkIndent, return that: + if tok.toktype != tkInvalid: return + c = L.buf[L.bufpos] + if c in SymStartChars - {'r', 'R', 'l'}: + getSymbol(L, tok) + elif c in {'0'..'9'}: + tok = getNumber(L) + else: + case c + of '#': + scanComment(L, tok) + of ':': + tok.tokType = tkColon + inc(L.bufpos) + of ',': + tok.toktype = tkComma + Inc(L.bufpos) + of 'l': + # if we parsed exactly one character and its a small L (l), this + # is treated as a warning because it may be confused with the number 1 + if not (L.buf[L.bufpos + 1] in (SymChars + {'_'})): + lexMessage(L, warnSmallLshouldNotBeUsed) + getSymbol(L, tok) + of 'r', 'R': + if L.buf[L.bufPos + 1] == '\"': + Inc(L.bufPos) + getString(L, tok, true) + else: + getSymbol(L, tok) + of '(': + Inc(L.bufpos) + if (L.buf[L.bufPos] == '.') and (L.buf[L.bufPos + 1] != '.'): + tok.toktype = tkParDotLe + Inc(L.bufpos) + else: + tok.toktype = tkParLe + of ')': + tok.toktype = tkParRi + Inc(L.bufpos) + of '[': + Inc(L.bufpos) + if (L.buf[L.bufPos] == '.') and (L.buf[L.bufPos + 1] != '.'): + tok.toktype = tkBracketDotLe + Inc(L.bufpos) + else: + tok.toktype = tkBracketLe + of ']': + tok.toktype = tkBracketRi + Inc(L.bufpos) + of '.': + if L.buf[L.bufPos + 1] == ']': + tok.tokType = tkBracketDotRi + Inc(L.bufpos, 2) + elif L.buf[L.bufPos + 1] == '}': + tok.tokType = tkCurlyDotRi + Inc(L.bufpos, 2) + elif L.buf[L.bufPos + 1] == ')': + tok.tokType = tkParDotRi + Inc(L.bufpos, 2) + else: + getOperator(L, tok) + of '{': + Inc(L.bufpos) + if (L.buf[L.bufPos] == '.') and (L.buf[L.bufPos + 1] != '.'): + tok.toktype = tkCurlyDotLe + Inc(L.bufpos) + else: + tok.toktype = tkCurlyLe + of '}': + tok.toktype = tkCurlyRi + Inc(L.bufpos) + of ';': + tok.toktype = tkSemiColon + 
Inc(L.bufpos) + of '`': + tok.tokType = tkAccent + Inc(L.bufpos) + of '\"': + getString(L, tok, false) + of '\'': + getCharacter(L, tok) + tok.tokType = tkCharLit + of lexbase.EndOfFile: + tok.toktype = tkEof + else: + if c in OpChars: + getOperator(L, tok) + else: + tok.literal = c & "" + tok.tokType = tkInvalid + lexMessage(L, errInvalidToken, c & " (\\" & $(ord(c)) & ')') + Inc(L.bufpos) + +dummyIdent = getIdent("") \ No newline at end of file |