diff options
Diffstat (limited to 'compiler/lexer.nim')
-rw-r--r-- | compiler/lexer.nim | 1377 |
1 files changed, 1377 insertions, 0 deletions
diff --git a/compiler/lexer.nim b/compiler/lexer.nim new file mode 100644 index 000000000..ad5dd560c --- /dev/null +++ b/compiler/lexer.nim @@ -0,0 +1,1377 @@ +# +# +# The Nim Compiler +# (c) Copyright 2015 Andreas Rumpf +# +# See the file "copying.txt", included in this +# distribution, for details about the copyright. +# + +# This lexer is handwritten for efficiency. I used an elegant buffering +# scheme which I have not seen anywhere else: +# We guarantee that a whole line is in the buffer. Thus only when scanning +# the \n or \r character we have to check whether we need to read in the next +# chunk. (\n or \r already need special handling for incrementing the line +# counter; choosing both \n and \r allows the lexer to properly read Unix, +# DOS or Macintosh text files, even when it is not the native format. + +import + options, msgs, platform, idents, nimlexbase, llstream, + wordrecg, lineinfos, pathutils + +import std/[hashes, parseutils, strutils] + +when defined(nimPreviewSlimSystem): + import std/[assertions, formatfloat] + +const + numChars*: set[char] = {'0'..'9', 'a'..'z', 'A'..'Z'} + SymChars*: set[char] = {'a'..'z', 'A'..'Z', '0'..'9', '\x80'..'\xFF'} + SymStartChars*: set[char] = {'a'..'z', 'A'..'Z', '\x80'..'\xFF'} + OpChars*: set[char] = {'+', '-', '*', '/', '\\', '<', '>', '!', '?', '^', '.', + '|', '=', '%', '&', '$', '@', '~', ':'} + UnaryMinusWhitelist = {' ', '\t', '\n', '\r', ',', ';', '(', '[', '{'} + +# don't forget to update the 'highlite' module if these charsets should change + +type + TokType* = enum + tkInvalid = "tkInvalid", tkEof = "[EOF]", # order is important here! + tkSymbol = "tkSymbol", # keywords: + tkAddr = "addr", tkAnd = "and", tkAs = "as", tkAsm = "asm", + tkBind = "bind", tkBlock = "block", tkBreak = "break", tkCase = "case", tkCast = "cast", + tkConcept = "concept", tkConst = "const", tkContinue = "continue", tkConverter = "converter", + tkDefer = "defer", tkDiscard = "discard", tkDistinct = "distinct", tkDiv = "div", tkDo = "do", + tkElif = "elif", tkElse = "else", tkEnd = "end", tkEnum = "enum", tkExcept = "except", tkExport = "export", + tkFinally = "finally", tkFor = "for", tkFrom = "from", tkFunc = "func", + tkIf = "if", tkImport = "import", tkIn = "in", tkInclude = "include", tkInterface = "interface", + tkIs = "is", tkIsnot = "isnot", tkIterator = "iterator", + tkLet = "let", + tkMacro = "macro", tkMethod = "method", tkMixin = "mixin", tkMod = "mod", tkNil = "nil", tkNot = "not", tkNotin = "notin", + tkObject = "object", tkOf = "of", tkOr = "or", tkOut = "out", + tkProc = "proc", tkPtr = "ptr", tkRaise = "raise", tkRef = "ref", tkReturn = "return", + tkShl = "shl", tkShr = "shr", tkStatic = "static", + tkTemplate = "template", + tkTry = "try", tkTuple = "tuple", tkType = "type", tkUsing = "using", + tkVar = "var", tkWhen = "when", tkWhile = "while", tkXor = "xor", + tkYield = "yield", # end of keywords + + tkIntLit = "tkIntLit", tkInt8Lit = "tkInt8Lit", tkInt16Lit = "tkInt16Lit", + tkInt32Lit = "tkInt32Lit", tkInt64Lit = "tkInt64Lit", + tkUIntLit = "tkUIntLit", tkUInt8Lit = "tkUInt8Lit", tkUInt16Lit = "tkUInt16Lit", + tkUInt32Lit = "tkUInt32Lit", tkUInt64Lit = "tkUInt64Lit", + tkFloatLit = "tkFloatLit", tkFloat32Lit = "tkFloat32Lit", + tkFloat64Lit = "tkFloat64Lit", tkFloat128Lit = "tkFloat128Lit", + tkStrLit = "tkStrLit", tkRStrLit = "tkRStrLit", tkTripleStrLit = "tkTripleStrLit", + tkGStrLit = "tkGStrLit", tkGTripleStrLit = "tkGTripleStrLit", tkCharLit = "tkCharLit", + tkCustomLit = "tkCustomLit", + + tkParLe = "(", tkParRi = ")", tkBracketLe = "[", + tkBracketRi = "]", tkCurlyLe = "{", tkCurlyRi = "}", + tkBracketDotLe = "[.", tkBracketDotRi = ".]", + tkCurlyDotLe = "{.", tkCurlyDotRi = ".}", + tkParDotLe = "(.", tkParDotRi = ".)", + tkComma = ",", tkSemiColon = ";", + tkColon = ":", tkColonColon = "::", tkEquals = "=", + tkDot = ".", tkDotDot = "..", tkBracketLeColon = "[:", + tkOpr, tkComment, tkAccent = "`", + # these are fake tokens used by renderer.nim + tkSpaces, tkInfixOpr, tkPrefixOpr, tkPostfixOpr, tkHideableStart, tkHideableEnd + + TokTypes* = set[TokType] + +const + weakTokens = {tkComma, tkSemiColon, tkColon, + tkParRi, tkParDotRi, tkBracketRi, tkBracketDotRi, + tkCurlyRi} # \ + # tokens that should not be considered for previousToken + tokKeywordLow* = succ(tkSymbol) + tokKeywordHigh* = pred(tkIntLit) + +type + NumericalBase* = enum + base10, # base10 is listed as the first element, + # so that it is the correct default value + base2, base8, base16 + + TokenSpacing* = enum + tsLeading, tsTrailing, tsEof + + Token* = object # a Nim token + tokType*: TokType # the type of the token + base*: NumericalBase # the numerical base; only valid for int + # or float literals + spacing*: set[TokenSpacing] # spaces around token + indent*: int # the indentation; != -1 if the token has been + # preceded with indentation + ident*: PIdent # the parsed identifier + iNumber*: BiggestInt # the parsed integer literal + fNumber*: BiggestFloat # the parsed floating point literal + literal*: string # the parsed (string) literal; and + # documentation comments are here too + line*, col*: int + when defined(nimpretty): + offsetA*, offsetB*: int # used for pretty printing so that literals + # like 0b01 or r"\L" are unaffected + commentOffsetA*, commentOffsetB*: int + + ErrorHandler* = proc (conf: ConfigRef; info: TLineInfo; msg: TMsgKind; arg: string) + Lexer* = object of TBaseLexer + fileIdx*: FileIndex + indentAhead*: int # if > 0 an indentation has already been read + # this is needed because scanning comments + # needs so much look-ahead + currLineIndent*: int + errorHandler*: ErrorHandler + cache*: IdentCache + when defined(nimsuggest): + previousToken: TLineInfo + tokenEnd*: TLineInfo + previousTokenEnd*: TLineInfo + config*: ConfigRef + +proc getLineInfo*(L: Lexer, tok: Token): TLineInfo {.inline.} = + result = newLineInfo(L.fileIdx, tok.line, tok.col) + when defined(nimpretty): + result.offsetA = tok.offsetA + result.offsetB = tok.offsetB + result.commentOffsetA = tok.commentOffsetA + result.commentOffsetB = tok.commentOffsetB + +proc isKeyword*(kind: TokType): bool = + (kind >= tokKeywordLow) and (kind <= tokKeywordHigh) + +template ones(n): untyped = ((1 shl n)-1) # for utf-8 conversion + +proc isNimIdentifier*(s: string): bool = + let sLen = s.len + if sLen > 0 and s[0] in SymStartChars: + var i = 1 + while i < sLen: + if s[i] == '_': inc(i) + if i < sLen and s[i] notin SymChars: return false + inc(i) + result = true + else: + result = false + +proc `$`*(tok: Token): string = + case tok.tokType + of tkIntLit..tkInt64Lit: $tok.iNumber + of tkFloatLit..tkFloat64Lit: $tok.fNumber + of tkInvalid, tkStrLit..tkCharLit, tkComment: tok.literal + of tkParLe..tkColon, tkEof, tkAccent: $tok.tokType + else: + if tok.ident != nil: + tok.ident.s + else: + "" + +proc prettyTok*(tok: Token): string = + if isKeyword(tok.tokType): "keyword " & tok.ident.s + else: $tok + +proc printTok*(conf: ConfigRef; tok: Token) = + # xxx factor with toLocation + msgWriteln(conf, $tok.line & ":" & $tok.col & "\t" & $tok.tokType & " " & $tok) + +proc openLexer*(lex: var Lexer, fileIdx: FileIndex, inputstream: PLLStream; + cache: IdentCache; config: ConfigRef) = + openBaseLexer(lex, inputstream) + lex.fileIdx = fileIdx + lex.indentAhead = -1 + lex.currLineIndent = 0 + inc(lex.lineNumber, inputstream.lineOffset) + lex.cache = cache + when defined(nimsuggest): + lex.previousToken.fileIndex = fileIdx + lex.config = config + +proc openLexer*(lex: var Lexer, filename: AbsoluteFile, inputstream: PLLStream; + cache: IdentCache; config: ConfigRef) = + openLexer(lex, fileInfoIdx(config, filename), inputstream, cache, config) + +proc closeLexer*(lex: var Lexer) = + if lex.config != nil: + inc(lex.config.linesCompiled, lex.lineNumber) + closeBaseLexer(lex) + +proc getLineInfo(L: Lexer): TLineInfo = + result = newLineInfo(L.fileIdx, L.lineNumber, getColNumber(L, L.bufpos)) + +proc dispMessage(L: Lexer; info: TLineInfo; msg: TMsgKind; arg: string) = + if L.errorHandler.isNil: + msgs.message(L.config, info, msg, arg) + else: + L.errorHandler(L.config, info, msg, arg) + +proc lexMessage*(L: Lexer, msg: TMsgKind, arg = "") = + L.dispMessage(getLineInfo(L), msg, arg) + +proc lexMessageTok*(L: Lexer, msg: TMsgKind, tok: Token, arg = "") = + var info = newLineInfo(L.fileIdx, tok.line, tok.col) + L.dispMessage(info, msg, arg) + +proc lexMessagePos(L: var Lexer, msg: TMsgKind, pos: int, arg = "") = + var info = newLineInfo(L.fileIdx, L.lineNumber, pos - L.lineStart) + L.dispMessage(info, msg, arg) + +proc matchTwoChars(L: Lexer, first: char, second: set[char]): bool = + result = (L.buf[L.bufpos] == first) and (L.buf[L.bufpos + 1] in second) + +template tokenBegin(tok, pos) {.dirty.} = + when defined(nimsuggest): + var colA = getColNumber(L, pos) + when defined(nimpretty): + tok.offsetA = L.offsetBase + pos + +template tokenEnd(tok, pos) {.dirty.} = + when defined(nimsuggest): + let colB = getColNumber(L, pos)+1 + if L.fileIdx == L.config.m.trackPos.fileIndex and L.config.m.trackPos.col in colA..colB and + L.lineNumber == L.config.m.trackPos.line.int and L.config.ideCmd in {ideSug, ideCon}: + L.config.m.trackPos.col = colA.int16 + colA = 0 + when defined(nimpretty): + tok.offsetB = L.offsetBase + pos + +template tokenEndIgnore(tok, pos) = + when defined(nimsuggest): + let colB = getColNumber(L, pos) + if L.fileIdx == L.config.m.trackPos.fileIndex and L.config.m.trackPos.col in colA..colB and + L.lineNumber == L.config.m.trackPos.line.int and L.config.ideCmd in {ideSug, ideCon}: + L.config.m.trackPos.fileIndex = trackPosInvalidFileIdx + L.config.m.trackPos.line = 0'u16 + colA = 0 + when defined(nimpretty): + tok.offsetB = L.offsetBase + pos + +template tokenEndPrevious(tok, pos) = + when defined(nimsuggest): + # when we detect the cursor in whitespace, we attach the track position + # to the token that came before that, but only if we haven't detected + # the cursor in a string literal or comment: + let colB = getColNumber(L, pos) + if L.fileIdx == L.config.m.trackPos.fileIndex and L.config.m.trackPos.col in colA..colB and + L.lineNumber == L.config.m.trackPos.line.int and L.config.ideCmd in {ideSug, ideCon}: + L.config.m.trackPos = L.previousToken + L.config.m.trackPosAttached = true + colA = 0 + when defined(nimpretty): + tok.offsetB = L.offsetBase + pos + +template eatChar(L: var Lexer, t: var Token, replacementChar: char) = + t.literal.add(replacementChar) + inc(L.bufpos) + +template eatChar(L: var Lexer, t: var Token) = + t.literal.add(L.buf[L.bufpos]) + inc(L.bufpos) + +proc getNumber(L: var Lexer, result: var Token) = + proc matchUnderscoreChars(L: var Lexer, tok: var Token, chars: set[char]): Natural = + var pos = L.bufpos # use registers for pos, buf + result = 0 + while true: + if L.buf[pos] in chars: + tok.literal.add(L.buf[pos]) + inc(pos) + inc(result) + else: + break + if L.buf[pos] == '_': + if L.buf[pos+1] notin chars: + lexMessage(L, errGenerated, + "only single underscores may occur in a token and token may not " & + "end with an underscore: e.g. '1__1' and '1_' are invalid") + break + tok.literal.add('_') + inc(pos) + L.bufpos = pos + + proc matchChars(L: var Lexer, tok: var Token, chars: set[char]) = + var pos = L.bufpos # use registers for pos, buf + while L.buf[pos] in chars: + tok.literal.add(L.buf[pos]) + inc(pos) + L.bufpos = pos + + proc lexMessageLitNum(L: var Lexer, msg: string, startpos: int, msgKind = errGenerated) = + # Used to get slightly human friendlier err messages. + const literalishChars = {'A'..'Z', 'a'..'z', '0'..'9', '_', '.', '\''} + var msgPos = L.bufpos + var t = Token(literal: "") + L.bufpos = startpos # Use L.bufpos as pos because of matchChars + matchChars(L, t, literalishChars) + # We must verify +/- specifically so that we're not past the literal + if L.buf[L.bufpos] in {'+', '-'} and + L.buf[L.bufpos - 1] in {'e', 'E'}: + t.literal.add(L.buf[L.bufpos]) + inc(L.bufpos) + matchChars(L, t, literalishChars) + if L.buf[L.bufpos] in literalishChars: + t.literal.add(L.buf[L.bufpos]) + inc(L.bufpos) + matchChars(L, t, {'0'..'9'}) + L.bufpos = msgPos + lexMessage(L, msgKind, msg % t.literal) + + var + xi: BiggestInt + isBase10 = true + numDigits = 0 + const + # 'c', 'C' is deprecated + baseCodeChars = {'X', 'x', 'o', 'b', 'B', 'c', 'C'} + literalishChars = baseCodeChars + {'A'..'F', 'a'..'f', '0'..'9', '_', '\''} + floatTypes = {tkFloatLit, tkFloat32Lit, tkFloat64Lit, tkFloat128Lit} + result.tokType = tkIntLit # int literal until we know better + result.literal = "" + result.base = base10 + tokenBegin(result, L.bufpos) + + var isPositive = true + if L.buf[L.bufpos] == '-': + eatChar(L, result) + isPositive = false + + let startpos = L.bufpos + + template setNumber(field, value) = + field = (if isPositive: value else: -value) + + # First stage: find out base, make verifications, build token literal string + # {'c', 'C'} is added for deprecation reasons to provide a clear error message + if L.buf[L.bufpos] == '0' and L.buf[L.bufpos + 1] in baseCodeChars + {'c', 'C', 'O'}: + isBase10 = false + eatChar(L, result, '0') + case L.buf[L.bufpos] + of 'c', 'C': + lexMessageLitNum(L, + "$1 will soon be invalid for oct literals; Use '0o' " & + "for octals. 'c', 'C' prefix", + startpos, + warnDeprecated) + eatChar(L, result, 'c') + numDigits = matchUnderscoreChars(L, result, {'0'..'7'}) + of 'O': + lexMessageLitNum(L, "$1 is an invalid int literal; For octal literals " & + "use the '0o' prefix.", startpos) + of 'x', 'X': + eatChar(L, result, 'x') + numDigits = matchUnderscoreChars(L, result, {'0'..'9', 'a'..'f', 'A'..'F'}) + of 'o': + eatChar(L, result, 'o') + numDigits = matchUnderscoreChars(L, result, {'0'..'7'}) + of 'b', 'B': + eatChar(L, result, 'b') + numDigits = matchUnderscoreChars(L, result, {'0'..'1'}) + else: + internalError(L.config, getLineInfo(L), "getNumber") + if numDigits == 0: + lexMessageLitNum(L, "invalid number: '$1'", startpos) + else: + discard matchUnderscoreChars(L, result, {'0'..'9'}) + if (L.buf[L.bufpos] == '.') and (L.buf[L.bufpos + 1] in {'0'..'9'}): + result.tokType = tkFloatLit + eatChar(L, result, '.') + discard matchUnderscoreChars(L, result, {'0'..'9'}) + if L.buf[L.bufpos] in {'e', 'E'}: + result.tokType = tkFloatLit + eatChar(L, result) + if L.buf[L.bufpos] in {'+', '-'}: + eatChar(L, result) + discard matchUnderscoreChars(L, result, {'0'..'9'}) + let endpos = L.bufpos + + # Second stage, find out if there's a datatype suffix and handle it + var postPos = endpos + + if L.buf[postPos] in {'\'', 'f', 'F', 'd', 'D', 'i', 'I', 'u', 'U'}: + let errPos = postPos + var customLitPossible = false + if L.buf[postPos] == '\'': + inc(postPos) + customLitPossible = true + + if L.buf[postPos] in SymChars: + var suffix = newStringOfCap(10) + while true: + suffix.add L.buf[postPos] + inc postPos + if L.buf[postPos] notin SymChars+{'_'}: break + let suffixAsLower = suffix.toLowerAscii + case suffixAsLower + of "f", "f32": result.tokType = tkFloat32Lit + of "d", "f64": result.tokType = tkFloat64Lit + of "f128": result.tokType = tkFloat128Lit + of "i8": result.tokType = tkInt8Lit + of "i16": result.tokType = tkInt16Lit + of "i32": result.tokType = tkInt32Lit + of "i64": result.tokType = tkInt64Lit + of "u": result.tokType = tkUIntLit + of "u8": result.tokType = tkUInt8Lit + of "u16": result.tokType = tkUInt16Lit + of "u32": result.tokType = tkUInt32Lit + of "u64": result.tokType = tkUInt64Lit + elif customLitPossible: + # remember the position of the `'` so that the parser doesn't + # have to reparse the custom literal: + result.iNumber = len(result.literal) + result.literal.add '\'' + result.literal.add suffix + result.tokType = tkCustomLit + else: + lexMessageLitNum(L, "invalid number suffix: '$1'", errPos) + else: + lexMessageLitNum(L, "invalid number suffix: '$1'", errPos) + + # Is there still a literalish char awaiting? Then it's an error! + if L.buf[postPos] in literalishChars or + (L.buf[postPos] == '.' and L.buf[postPos + 1] in {'0'..'9'}): + lexMessageLitNum(L, "invalid number: '$1'", startpos) + + if result.tokType != tkCustomLit: + # Third stage, extract actual number + L.bufpos = startpos # restore position + var pos = startpos + try: + if (L.buf[pos] == '0') and (L.buf[pos + 1] in baseCodeChars): + inc(pos, 2) + xi = 0 # it is a base prefix + + case L.buf[pos - 1] + of 'b', 'B': + result.base = base2 + while pos < endpos: + if L.buf[pos] != '_': + xi = `shl`(xi, 1) or (ord(L.buf[pos]) - ord('0')) + inc(pos) + # 'c', 'C' is deprecated (a warning is issued elsewhere) + of 'o', 'c', 'C': + result.base = base8 + while pos < endpos: + if L.buf[pos] != '_': + xi = `shl`(xi, 3) or (ord(L.buf[pos]) - ord('0')) + inc(pos) + of 'x', 'X': + result.base = base16 + while pos < endpos: + case L.buf[pos] + of '_': + inc(pos) + of '0'..'9': + xi = `shl`(xi, 4) or (ord(L.buf[pos]) - ord('0')) + inc(pos) + of 'a'..'f': + xi = `shl`(xi, 4) or (ord(L.buf[pos]) - ord('a') + 10) + inc(pos) + of 'A'..'F': + xi = `shl`(xi, 4) or (ord(L.buf[pos]) - ord('A') + 10) + inc(pos) + else: + break + else: + internalError(L.config, getLineInfo(L), "getNumber") + + case result.tokType + of tkIntLit, tkInt64Lit: setNumber result.iNumber, xi + of tkInt8Lit: setNumber result.iNumber, ashr(xi shl 56, 56) + of tkInt16Lit: setNumber result.iNumber, ashr(xi shl 48, 48) + of tkInt32Lit: setNumber result.iNumber, ashr(xi shl 32, 32) + of tkUIntLit, tkUInt64Lit: setNumber result.iNumber, xi + of tkUInt8Lit: setNumber result.iNumber, xi and 0xff + of tkUInt16Lit: setNumber result.iNumber, xi and 0xffff + of tkUInt32Lit: setNumber result.iNumber, xi and 0xffffffff + of tkFloat32Lit: + setNumber result.fNumber, (cast[ptr float32](addr(xi)))[] + # note: this code is endian neutral! + # XXX: Test this on big endian machine! + of tkFloat64Lit, tkFloatLit: + setNumber result.fNumber, (cast[ptr float64](addr(xi)))[] + else: internalError(L.config, getLineInfo(L), "getNumber") + + # Bounds checks. Non decimal literals are allowed to overflow the range of + # the datatype as long as their pattern don't overflow _bitwise_, hence + # below checks of signed sizes against uint*.high is deliberate: + # (0x80'u8 = 128, 0x80'i8 = -128, etc == OK) + if result.tokType notin floatTypes: + let outOfRange = + case result.tokType + of tkUInt8Lit, tkUInt16Lit, tkUInt32Lit: result.iNumber != xi + of tkInt8Lit: (xi > BiggestInt(uint8.high)) + of tkInt16Lit: (xi > BiggestInt(uint16.high)) + of tkInt32Lit: (xi > BiggestInt(uint32.high)) + else: false + + if outOfRange: + #echo "out of range num: ", result.iNumber, " vs ", xi + lexMessageLitNum(L, "number out of range: '$1'", startpos) + + else: + case result.tokType + of floatTypes: + result.fNumber = parseFloat(result.literal) + of tkUInt64Lit, tkUIntLit: + var iNumber: uint64 = uint64(0) + var len: int = 0 + try: + len = parseBiggestUInt(result.literal, iNumber) + except ValueError: + raise newException(OverflowDefect, "number out of range: " & result.literal) + if len != result.literal.len: + raise newException(ValueError, "invalid integer: " & result.literal) + result.iNumber = cast[int64](iNumber) + else: + var iNumber: int64 = int64(0) + var len: int = 0 + try: + len = parseBiggestInt(result.literal, iNumber) + except ValueError: + raise newException(OverflowDefect, "number out of range: " & result.literal) + if len != result.literal.len: + raise newException(ValueError, "invalid integer: " & result.literal) + result.iNumber = iNumber + + # Explicit bounds checks. + let outOfRange = + case result.tokType + of tkInt8Lit: result.iNumber > int8.high or result.iNumber < int8.low + of tkUInt8Lit: result.iNumber > BiggestInt(uint8.high) or result.iNumber < 0 + of tkInt16Lit: result.iNumber > int16.high or result.iNumber < int16.low + of tkUInt16Lit: result.iNumber > BiggestInt(uint16.high) or result.iNumber < 0 + of tkInt32Lit: result.iNumber > int32.high or result.iNumber < int32.low + of tkUInt32Lit: result.iNumber > BiggestInt(uint32.high) or result.iNumber < 0 + else: false + + if outOfRange: + lexMessageLitNum(L, "number out of range: '$1'", startpos) + + # Promote int literal to int64? Not always necessary, but more consistent + if result.tokType == tkIntLit: + if result.iNumber > high(int32) or result.iNumber < low(int32): + result.tokType = tkInt64Lit + + except ValueError: + lexMessageLitNum(L, "invalid number: '$1'", startpos) + except OverflowDefect, RangeDefect: + lexMessageLitNum(L, "number out of range: '$1'", startpos) + tokenEnd(result, postPos-1) + L.bufpos = postPos + +proc handleHexChar(L: var Lexer, xi: var int; position: range[0..4]) = + template invalid() = + lexMessage(L, errGenerated, + "expected a hex digit, but found: " & L.buf[L.bufpos] & + "; maybe prepend with 0") + + case L.buf[L.bufpos] + of '0'..'9': + xi = (xi shl 4) or (ord(L.buf[L.bufpos]) - ord('0')) + inc(L.bufpos) + of 'a'..'f': + xi = (xi shl 4) or (ord(L.buf[L.bufpos]) - ord('a') + 10) + inc(L.bufpos) + of 'A'..'F': + xi = (xi shl 4) or (ord(L.buf[L.bufpos]) - ord('A') + 10) + inc(L.bufpos) + of '"', '\'': + if position <= 1: invalid() + # do not progress the bufpos here. + if position == 0: inc(L.bufpos) + else: + invalid() + # Need to progress for `nim check` + inc(L.bufpos) + +proc handleDecChars(L: var Lexer, xi: var int) = + while L.buf[L.bufpos] in {'0'..'9'}: + xi = (xi * 10) + (ord(L.buf[L.bufpos]) - ord('0')) + inc(L.bufpos) + +proc addUnicodeCodePoint(s: var string, i: int) = + let i = cast[uint](i) + # inlined toUTF-8 to avoid unicode and strutils dependencies. + let pos = s.len + if i <= 127: + s.setLen(pos+1) + s[pos+0] = chr(i) + elif i <= 0x07FF: + s.setLen(pos+2) + s[pos+0] = chr((i shr 6) or 0b110_00000) + s[pos+1] = chr((i and ones(6)) or 0b10_0000_00) + elif i <= 0xFFFF: + s.setLen(pos+3) + s[pos+0] = chr(i shr 12 or 0b1110_0000) + s[pos+1] = chr(i shr 6 and ones(6) or 0b10_0000_00) + s[pos+2] = chr(i and ones(6) or 0b10_0000_00) + elif i <= 0x001FFFFF: + s.setLen(pos+4) + s[pos+0] = chr(i shr 18 or 0b1111_0000) + s[pos+1] = chr(i shr 12 and ones(6) or 0b10_0000_00) + s[pos+2] = chr(i shr 6 and ones(6) or 0b10_0000_00) + s[pos+3] = chr(i and ones(6) or 0b10_0000_00) + elif i <= 0x03FFFFFF: + s.setLen(pos+5) + s[pos+0] = chr(i shr 24 or 0b111110_00) + s[pos+1] = chr(i shr 18 and ones(6) or 0b10_0000_00) + s[pos+2] = chr(i shr 12 and ones(6) or 0b10_0000_00) + s[pos+3] = chr(i shr 6 and ones(6) or 0b10_0000_00) + s[pos+4] = chr(i and ones(6) or 0b10_0000_00) + elif i <= 0x7FFFFFFF: + s.setLen(pos+6) + s[pos+0] = chr(i shr 30 or 0b1111110_0) + s[pos+1] = chr(i shr 24 and ones(6) or 0b10_0000_00) + s[pos+2] = chr(i shr 18 and ones(6) or 0b10_0000_00) + s[pos+3] = chr(i shr 12 and ones(6) or 0b10_0000_00) + s[pos+4] = chr(i shr 6 and ones(6) or 0b10_0000_00) + s[pos+5] = chr(i and ones(6) or 0b10_0000_00) + +proc getEscapedChar(L: var Lexer, tok: var Token) = + inc(L.bufpos) # skip '\' + case L.buf[L.bufpos] + of 'n', 'N': + tok.literal.add('\L') + inc(L.bufpos) + of 'p', 'P': + if tok.tokType == tkCharLit: + lexMessage(L, errGenerated, "\\p not allowed in character literal") + tok.literal.add(L.config.target.tnl) + inc(L.bufpos) + of 'r', 'R', 'c', 'C': + tok.literal.add(CR) + inc(L.bufpos) + of 'l', 'L': + tok.literal.add(LF) + inc(L.bufpos) + of 'f', 'F': + tok.literal.add(FF) + inc(L.bufpos) + of 'e', 'E': + tok.literal.add(ESC) + inc(L.bufpos) + of 'a', 'A': + tok.literal.add(BEL) + inc(L.bufpos) + of 'b', 'B': + tok.literal.add(BACKSPACE) + inc(L.bufpos) + of 'v', 'V': + tok.literal.add(VT) + inc(L.bufpos) + of 't', 'T': + tok.literal.add('\t') + inc(L.bufpos) + of '\'', '\"': + tok.literal.add(L.buf[L.bufpos]) + inc(L.bufpos) + of '\\': + tok.literal.add('\\') + inc(L.bufpos) + of 'x', 'X': + inc(L.bufpos) + var xi = 0 + handleHexChar(L, xi, 1) + handleHexChar(L, xi, 2) + tok.literal.add(chr(xi)) + of 'u', 'U': + if tok.tokType == tkCharLit: + lexMessage(L, errGenerated, "\\u not allowed in character literal") + inc(L.bufpos) + var xi = 0 + if L.buf[L.bufpos] == '{': + inc(L.bufpos) + var start = L.bufpos + while L.buf[L.bufpos] != '}': + handleHexChar(L, xi, 0) + if start == L.bufpos: + lexMessage(L, errGenerated, + "Unicode codepoint cannot be empty") + inc(L.bufpos) + if xi > 0x10FFFF: + let hex = ($L.buf)[start..L.bufpos-2] + lexMessage(L, errGenerated, + "Unicode codepoint must be lower than 0x10FFFF, but was: " & hex) + else: + handleHexChar(L, xi, 1) + handleHexChar(L, xi, 2) + handleHexChar(L, xi, 3) + handleHexChar(L, xi, 4) + addUnicodeCodePoint(tok.literal, xi) + of '0'..'9': + if matchTwoChars(L, '0', {'0'..'9'}): + lexMessage(L, warnOctalEscape) + var xi = 0 + handleDecChars(L, xi) + if (xi <= 255): tok.literal.add(chr(xi)) + else: lexMessage(L, errGenerated, "invalid character constant") + else: lexMessage(L, errGenerated, "invalid character constant") + +proc handleCRLF(L: var Lexer, pos: int): int = + template registerLine = + let col = L.getColNumber(pos) + + case L.buf[pos] + of CR: + registerLine() + result = nimlexbase.handleCR(L, pos) + of LF: + registerLine() + result = nimlexbase.handleLF(L, pos) + else: result = pos + +type + StringMode = enum + normal, + raw, + generalized + +proc getString(L: var Lexer, tok: var Token, mode: StringMode) = + var pos = L.bufpos + var line = L.lineNumber # save linenumber for better error message + tokenBegin(tok, pos - ord(mode == raw)) + inc pos # skip " + if L.buf[pos] == '\"' and L.buf[pos+1] == '\"': + tok.tokType = tkTripleStrLit # long string literal: + inc(pos, 2) # skip "" + # skip leading newline: + if L.buf[pos] in {' ', '\t'}: + var newpos = pos+1 + while L.buf[newpos] in {' ', '\t'}: inc newpos + if L.buf[newpos] in {CR, LF}: pos = newpos + pos = handleCRLF(L, pos) + while true: + case L.buf[pos] + of '\"': + if L.buf[pos+1] == '\"' and L.buf[pos+2] == '\"' and + L.buf[pos+3] != '\"': + tokenEndIgnore(tok, pos+2) + L.bufpos = pos + 3 # skip the three """ + break + tok.literal.add('\"') + inc(pos) + of CR, LF: + tokenEndIgnore(tok, pos) + pos = handleCRLF(L, pos) + tok.literal.add("\n") + of nimlexbase.EndOfFile: + tokenEndIgnore(tok, pos) + var line2 = L.lineNumber + L.lineNumber = line + lexMessagePos(L, errGenerated, L.lineStart, "closing \"\"\" expected, but end of file reached") + L.lineNumber = line2 + L.bufpos = pos + break + else: + tok.literal.add(L.buf[pos]) + inc(pos) + else: + # ordinary string literal + if mode != normal: tok.tokType = tkRStrLit + else: tok.tokType = tkStrLit + while true: + let c = L.buf[pos] + if c == '\"': + if mode != normal and L.buf[pos+1] == '\"': + inc(pos, 2) + tok.literal.add('"') + else: + tokenEndIgnore(tok, pos) + inc(pos) # skip '"' + break + elif c in {CR, LF, nimlexbase.EndOfFile}: + tokenEndIgnore(tok, pos) + lexMessage(L, errGenerated, "closing \" expected") + break + elif (c == '\\') and mode == normal: + L.bufpos = pos + getEscapedChar(L, tok) + pos = L.bufpos + else: + tok.literal.add(c) + inc(pos) + L.bufpos = pos + +proc getCharacter(L: var Lexer; tok: var Token) = + tokenBegin(tok, L.bufpos) + let startPos = L.bufpos + inc(L.bufpos) # skip ' + let c = L.buf[L.bufpos] + case c + of '\0'..pred(' '), '\'': + lexMessage(L, errGenerated, "invalid character literal") + tok.literal = $c + of '\\': getEscapedChar(L, tok) + else: + tok.literal = $c + inc(L.bufpos) + if L.buf[L.bufpos] == '\'': + tokenEndIgnore(tok, L.bufpos) + inc(L.bufpos) # skip ' + else: + if startPos > 0 and L.buf[startPos-1] == '`': + tok.literal = "'" + L.bufpos = startPos+1 + else: + lexMessage(L, errGenerated, "missing closing ' for character literal") + tokenEndIgnore(tok, L.bufpos) + +const + UnicodeOperatorStartChars = {'\226', '\194', '\195'} + # the allowed unicode characters ("∙ ∘ × ★ ⊗ ⊘ ⊙ ⊛ ⊠ ⊡ ∩ ∧ ⊓ ± ⊕ ⊖ ⊞ ⊟ ∪ ∨ ⊔") + # all start with one of these. + +type + UnicodeOprPred = enum + Mul, Add + +proc unicodeOprLen(buf: cstring; pos: int): (int8, UnicodeOprPred) = + template m(len): untyped = (int8(len), Mul) + template a(len): untyped = (int8(len), Add) + result = 0.m + case buf[pos] + of '\226': + if buf[pos+1] == '\136': + if buf[pos+2] == '\152': result = 3.m # ∘ + elif buf[pos+2] == '\153': result = 3.m # ∙ + elif buf[pos+2] == '\167': result = 3.m # ∧ + elif buf[pos+2] == '\168': result = 3.a # ∨ + elif buf[pos+2] == '\169': result = 3.m # ∩ + elif buf[pos+2] == '\170': result = 3.a # ∪ + elif buf[pos+1] == '\138': + if buf[pos+2] == '\147': result = 3.m # ⊓ + elif buf[pos+2] == '\148': result = 3.a # ⊔ + elif buf[pos+2] == '\149': result = 3.a # ⊕ + elif buf[pos+2] == '\150': result = 3.a # ⊖ + elif buf[pos+2] == '\151': result = 3.m # ⊗ + elif buf[pos+2] == '\152': result = 3.m # ⊘ + elif buf[pos+2] == '\153': result = 3.m # ⊙ + elif buf[pos+2] == '\155': result = 3.m # ⊛ + elif buf[pos+2] == '\158': result = 3.a # ⊞ + elif buf[pos+2] == '\159': result = 3.a # ⊟ + elif buf[pos+2] == '\160': result = 3.m # ⊠ + elif buf[pos+2] == '\161': result = 3.m # ⊡ + elif buf[pos+1] == '\152' and buf[pos+2] == '\133': result = 3.m # ★ + of '\194': + if buf[pos+1] == '\177': result = 2.a # ± + of '\195': + if buf[pos+1] == '\151': result = 2.m # × + else: + discard + +proc getSymbol(L: var Lexer, tok: var Token) = + var h: Hash = 0 + var pos = L.bufpos + tokenBegin(tok, pos) + var suspicious = false + while true: + var c = L.buf[pos] + case c + of 'a'..'z', '0'..'9': + h = h !& ord(c) + inc(pos) + of 'A'..'Z': + c = chr(ord(c) + (ord('a') - ord('A'))) # toLower() + h = h !& ord(c) + inc(pos) + suspicious = true + of '_': + if L.buf[pos+1] notin SymChars: + lexMessage(L, errGenerated, "invalid token: trailing underscore") + break + inc(pos) + suspicious = true + of '\x80'..'\xFF': + if c in UnicodeOperatorStartChars and unicodeOprLen(L.buf, pos)[0] != 0: + break + else: + h = h !& ord(c) + inc(pos) + else: break + tokenEnd(tok, pos-1) + h = !$h + tok.ident = L.cache.getIdent(cast[cstring](addr(L.buf[L.bufpos])), pos - L.bufpos, h) + if (tok.ident.id < ord(tokKeywordLow) - ord(tkSymbol)) or + (tok.ident.id > ord(tokKeywordHigh) - ord(tkSymbol)): + tok.tokType = tkSymbol + else: + tok.tokType = TokType(tok.ident.id + ord(tkSymbol)) + if suspicious and {optStyleHint, optStyleError} * L.config.globalOptions != {}: + lintReport(L.config, getLineInfo(L), tok.ident.s.normalize, tok.ident.s) + L.bufpos = pos + + +proc endOperator(L: var Lexer, tok: var Token, pos: int, + hash: Hash) {.inline.} = + var h = !$hash + tok.ident = L.cache.getIdent(cast[cstring](addr(L.buf[L.bufpos])), pos - L.bufpos, h) + if (tok.ident.id < oprLow) or (tok.ident.id > oprHigh): tok.tokType = tkOpr + else: tok.tokType = TokType(tok.ident.id - oprLow + ord(tkColon)) + L.bufpos = pos + +proc getOperator(L: var Lexer, tok: var Token) = + var pos = L.bufpos + tokenBegin(tok, pos) + var h: Hash = 0 + while true: + let c = L.buf[pos] + if c in OpChars: + h = h !& ord(c) + inc(pos) + elif c in UnicodeOperatorStartChars: + let oprLen = unicodeOprLen(L.buf, pos)[0] + if oprLen == 0: break + for i in 0..<oprLen: + h = h !& ord(L.buf[pos]) + inc pos + else: + break + endOperator(L, tok, pos, h) + tokenEnd(tok, pos-1) + # advance pos but don't store it in L.bufpos so the next token (which might + # be an operator too) gets the preceding spaces: + tok.spacing = tok.spacing - {tsTrailing, tsEof} + var trailing = false + while L.buf[pos] == ' ': + inc pos + trailing = true + if L.buf[pos] in {CR, LF, nimlexbase.EndOfFile}: + tok.spacing.incl(tsEof) + elif trailing: + tok.spacing.incl(tsTrailing) + +proc getPrecedence*(tok: Token): int = + ## Calculates the precedence of the given token. + const + MulPred = 9 + PlusPred = 8 + case tok.tokType + of tkOpr: + let relevantChar = tok.ident.s[0] + + # arrow like? + if tok.ident.s.len > 1 and tok.ident.s[^1] == '>' and + tok.ident.s[^2] in {'-', '~', '='}: return 0 + + template considerAsgn(value: untyped) = + result = if tok.ident.s[^1] == '=': 1 else: value + + case relevantChar + of '$', '^': considerAsgn(10) + of '*', '%', '/', '\\': considerAsgn(MulPred) + of '~': result = 8 + of '+', '-', '|': considerAsgn(PlusPred) + of '&': considerAsgn(7) + of '=', '<', '>', '!': result = 5 + of '.': considerAsgn(6) + of '?': result = 2 + of UnicodeOperatorStartChars: + if tok.ident.s[^1] == '=': + result = 1 + else: + let (len, pred) = unicodeOprLen(cstring(tok.ident.s), 0) + if len != 0: + result = if pred == Mul: MulPred else: PlusPred + else: + result = 2 + else: considerAsgn(2) + of tkDiv, tkMod, tkShl, tkShr: result = 9 + of tkDotDot: result = 6 + of tkIn, tkNotin, tkIs, tkIsnot, tkOf, tkAs, tkFrom: result = 5 + of tkAnd: result = 4 + of tkOr, tkXor, tkPtr, tkRef: result = 3 + else: return -10 + +proc skipMultiLineComment(L: var Lexer; tok: var Token; start: int; + isDoc: bool) = + var pos = start + var toStrip = 0 + tokenBegin(tok, pos) + # detect the amount of indentation: + if isDoc: + toStrip = getColNumber(L, pos) + while L.buf[pos] == ' ': + inc pos + inc toStrip + while L.buf[pos] in {CR, LF}: # skip blank lines + pos = handleCRLF(L, pos) + toStrip = 0 + while L.buf[pos] == ' ': + inc pos + inc toStrip + var nesting = 0 + while true: + case L.buf[pos] + of '#': + if isDoc: + if L.buf[pos+1] == '#' and L.buf[pos+2] == '[': + inc nesting + tok.literal.add '#' + elif L.buf[pos+1] == '[': + inc nesting + inc pos + of ']': + if isDoc: + if L.buf[pos+1] == '#' and L.buf[pos+2] == '#': + if nesting == 0: + tokenEndIgnore(tok, pos+2) + inc(pos, 3) + break + dec nesting + tok.literal.add ']' + elif L.buf[pos+1] == '#': + if nesting == 0: + tokenEndIgnore(tok, pos+1) + inc(pos, 2) + break + dec nesting + inc pos + of CR, LF: + tokenEndIgnore(tok, pos) + pos = handleCRLF(L, pos) + # strip leading whitespace: + when defined(nimpretty): tok.literal.add "\L" + if isDoc: + when not defined(nimpretty): tok.literal.add "\n" + var c = toStrip + while L.buf[pos] == ' ' and c > 0: + inc pos + dec c + of nimlexbase.EndOfFile: + tokenEndIgnore(tok, pos) + lexMessagePos(L, errGenerated, pos, "end of multiline comment expected") + break + else: + if isDoc or defined(nimpretty): tok.literal.add L.buf[pos] + inc(pos) + L.bufpos = pos + when defined(nimpretty): + tok.commentOffsetB = L.offsetBase + pos - 1 + +proc scanComment(L: var Lexer, tok: var Token) = + var pos = L.bufpos + tok.tokType = tkComment + assert L.buf[pos+1] == '#' + when defined(nimpretty): + tok.commentOffsetA = L.offsetBase + pos + + if L.buf[pos+2] == '[': + skipMultiLineComment(L, tok, pos+3, true) + return + tokenBegin(tok, pos) + inc(pos, 2) + + var toStrip = 0 + var stripInit = false + + while true: + if not stripInit: # find baseline indentation inside comment + while L.buf[pos] == ' ': + inc pos + inc toStrip + if L.buf[pos] in {CR, LF}: # don't set toStrip in blank comment lines + toStrip = 0 + else: # found first non-whitespace character + stripInit = true + while L.buf[pos] notin {CR, LF, nimlexbase.EndOfFile}: + tok.literal.add(L.buf[pos]) + inc(pos) + tokenEndIgnore(tok, pos) + pos = handleCRLF(L, pos) + var indent = 0 + while L.buf[pos] == ' ': + inc(pos) + inc(indent) + + if L.buf[pos] == '#' and L.buf[pos+1] == '#': + tok.literal.add "\n" + inc(pos, 2) + if stripInit: + var c = toStrip + while L.buf[pos] == ' ' and c > 0: + inc pos + dec c + else: + if L.buf[pos] > ' ': + L.indentAhead = indent + tokenEndIgnore(tok, pos) + break + L.bufpos = pos + when defined(nimpretty): + tok.commentOffsetB = L.offsetBase + pos - 1 + +proc skip(L: var Lexer, tok: var Token) = + var pos = L.bufpos + tokenBegin(tok, pos) + tok.spacing.excl(tsLeading) + when defined(nimpretty): + var hasComment = false + var commentIndent = L.currLineIndent + tok.commentOffsetA = L.offsetBase + pos + tok.commentOffsetB = tok.commentOffsetA + tok.line = -1 + while true: + case L.buf[pos] + of ' ': + inc(pos) + tok.spacing.incl(tsLeading) + of '\t': + lexMessagePos(L, errGenerated, pos, "tabs are not allowed, use spaces instead") + inc(pos) + of CR, LF: + tokenEndPrevious(tok, pos) + pos = handleCRLF(L, pos) + var indent = 0 + while true: + if L.buf[pos] == ' ': + inc(pos) + inc(indent) + elif L.buf[pos] == '#' and L.buf[pos+1] == '[': + when defined(nimpretty): + hasComment = true + if tok.line < 0: + tok.line = L.lineNumber + commentIndent = indent + skipMultiLineComment(L, tok, pos+2, false) + pos = L.bufpos + else: + break + tok.spacing.excl(tsLeading) + when defined(nimpretty): + if L.buf[pos] == '#' and tok.line < 0: commentIndent = indent + if L.buf[pos] > ' ' and (L.buf[pos] != '#' or L.buf[pos+1] == '#'): + tok.indent = indent + L.currLineIndent = indent + break + of '#': + # do not skip documentation comment: + if L.buf[pos+1] == '#': break + when defined(nimpretty): + hasComment = true + if tok.line < 0: + tok.line = L.lineNumber + + if L.buf[pos+1] == '[': + skipMultiLineComment(L, tok, pos+2, false) + pos = L.bufpos + else: + tokenBegin(tok, pos) + while L.buf[pos] notin {CR, LF, nimlexbase.EndOfFile}: + when defined(nimpretty): tok.literal.add L.buf[pos] + inc(pos) + tokenEndIgnore(tok, pos+1) + when defined(nimpretty): + tok.commentOffsetB = L.offsetBase + pos + 1 + else: + break # EndOfFile also leaves the loop + tokenEndPrevious(tok, pos-1) + L.bufpos = pos + when defined(nimpretty): + if hasComment: + tok.commentOffsetB = L.offsetBase + pos - 1 + tok.tokType = tkComment + tok.indent = commentIndent + +proc rawGetTok*(L: var Lexer, tok: var Token) = + template atTokenEnd() {.dirty.} = + when defined(nimsuggest): + L.previousTokenEnd.line = L.tokenEnd.line + L.previousTokenEnd.col = L.tokenEnd.col + L.tokenEnd.line = tok.line.uint16 + L.tokenEnd.col = getColNumber(L, L.bufpos).int16 + # we attach the cursor to the last *strong* token + if tok.tokType notin weakTokens: + L.previousToken.line = tok.line.uint16 + L.previousToken.col = tok.col.int16 + + reset(tok) + if L.indentAhead >= 0: + tok.indent = L.indentAhead + L.currLineIndent = L.indentAhead + L.indentAhead = -1 + else: + tok.indent = -1 + skip(L, tok) + when defined(nimpretty): + if tok.tokType == tkComment: + L.indentAhead = L.currLineIndent + return + let c = L.buf[L.bufpos] + tok.line = L.lineNumber + tok.col = getColNumber(L, L.bufpos) + if c in SymStartChars - {'r', 'R'} - UnicodeOperatorStartChars: + getSymbol(L, tok) + else: + case c + of UnicodeOperatorStartChars: + if unicodeOprLen(L.buf, L.bufpos)[0] != 0: + getOperator(L, tok) + else: + getSymbol(L, tok) + of '#': + scanComment(L, tok) + of '*': + # '*:' is unfortunately a special case, because it is two tokens in + # 'var v*: int'. + if L.buf[L.bufpos+1] == ':' and L.buf[L.bufpos+2] notin OpChars: + var h = 0 !& ord('*') + endOperator(L, tok, L.bufpos+1, h) + else: + getOperator(L, tok) + of ',': + tok.tokType = tkComma + inc(L.bufpos) + of 'r', 'R': + if L.buf[L.bufpos + 1] == '\"': + inc(L.bufpos) + getString(L, tok, raw) + else: + getSymbol(L, tok) + of '(': + inc(L.bufpos) + if L.buf[L.bufpos] == '.' and L.buf[L.bufpos+1] != '.': + tok.tokType = tkParDotLe + inc(L.bufpos) + else: + tok.tokType = tkParLe + when defined(nimsuggest): + if L.fileIdx == L.config.m.trackPos.fileIndex and tok.col < L.config.m.trackPos.col and + tok.line == L.config.m.trackPos.line.int and L.config.ideCmd == ideCon: + L.config.m.trackPos.col = tok.col.int16 + of ')': + tok.tokType = tkParRi + inc(L.bufpos) + of '[': + inc(L.bufpos) + if L.buf[L.bufpos] == '.' and L.buf[L.bufpos+1] != '.': + tok.tokType = tkBracketDotLe + inc(L.bufpos) + elif L.buf[L.bufpos] == ':': + tok.tokType = tkBracketLeColon + inc(L.bufpos) + else: + tok.tokType = tkBracketLe + of ']': + tok.tokType = tkBracketRi + inc(L.bufpos) + of '.': + when defined(nimsuggest): + if L.fileIdx == L.config.m.trackPos.fileIndex and tok.col+1 == L.config.m.trackPos.col and + tok.line == L.config.m.trackPos.line.int and L.config.ideCmd == ideSug: + tok.tokType = tkDot + L.config.m.trackPos.col = tok.col.int16 + inc(L.bufpos) + atTokenEnd() + return + if L.buf[L.bufpos+1] == ']': + tok.tokType = tkBracketDotRi + inc(L.bufpos, 2) + elif L.buf[L.bufpos+1] == '}': + tok.tokType = tkCurlyDotRi + inc(L.bufpos, 2) + elif L.buf[L.bufpos+1] == ')': + tok.tokType = tkParDotRi + inc(L.bufpos, 2) + else: + getOperator(L, tok) + of '{': + inc(L.bufpos) + if L.buf[L.bufpos] == '.' and L.buf[L.bufpos+1] != '.': + tok.tokType = tkCurlyDotLe + inc(L.bufpos) + else: + tok.tokType = tkCurlyLe + of '}': + tok.tokType = tkCurlyRi + inc(L.bufpos) + of ';': + tok.tokType = tkSemiColon + inc(L.bufpos) + of '`': + tok.tokType = tkAccent + inc(L.bufpos) + of '_': + inc(L.bufpos) + if L.buf[L.bufpos] notin SymChars+{'_'}: + tok.tokType = tkSymbol + tok.ident = L.cache.getIdent("_") + else: + tok.literal = $c + tok.tokType = tkInvalid + lexMessage(L, errGenerated, "invalid token: " & c & " (\\" & $(ord(c)) & ')') + of '\"': + # check for generalized raw string literal: + let mode = if L.bufpos > 0 and L.buf[L.bufpos-1] in SymChars: generalized else: normal + getString(L, tok, mode) + if mode == generalized: + # tkRStrLit -> tkGStrLit + # tkTripleStrLit -> tkGTripleStrLit + inc(tok.tokType, 2) + of '\'': + tok.tokType = tkCharLit + getCharacter(L, tok) + tok.tokType = tkCharLit + of '0'..'9': + getNumber(L, tok) + let c = L.buf[L.bufpos] + if c in SymChars+{'_'}: + if c in UnicodeOperatorStartChars and + unicodeOprLen(L.buf, L.bufpos)[0] != 0: + discard + else: + lexMessage(L, errGenerated, "invalid token: no whitespace between number and identifier") + of '-': + if L.buf[L.bufpos+1] in {'0'..'9'} and + (L.bufpos-1 == 0 or L.buf[L.bufpos-1] in UnaryMinusWhitelist): + # x)-23 # binary minus + # ,-23 # unary minus + # \n-78 # unary minus? Yes. + # =-3 # parsed as `=-` anyway + getNumber(L, tok) + let c = L.buf[L.bufpos] + if c in SymChars+{'_'}: + if c in UnicodeOperatorStartChars and + unicodeOprLen(L.buf, L.bufpos)[0] != 0: + discard + else: + lexMessage(L, errGenerated, "invalid token: no whitespace between number and identifier") + else: + getOperator(L, tok) + else: + if c in OpChars: + getOperator(L, tok) + elif c == nimlexbase.EndOfFile: + tok.tokType = tkEof + tok.indent = 0 + else: + tok.literal = $c + tok.tokType = tkInvalid + lexMessage(L, errGenerated, "invalid token: " & c & " (\\" & $(ord(c)) & ')') + inc(L.bufpos) + atTokenEnd() + +proc getIndentWidth*(fileIdx: FileIndex, inputstream: PLLStream; + cache: IdentCache; config: ConfigRef): int = + result = 0 + var lex: Lexer = default(Lexer) + var tok: Token = default(Token) + openLexer(lex, fileIdx, inputstream, cache, config) + var prevToken = tkEof + while tok.tokType != tkEof: + rawGetTok(lex, tok) + if tok.indent > 0 and prevToken in {tkColon, tkEquals, tkType, tkConst, tkLet, tkVar, tkUsing}: + result = tok.indent + if result > 0: break + prevToken = tok.tokType + closeLexer(lex) + +proc getPrecedence*(ident: PIdent): int = + ## assumes ident is binary operator already + let + tokType = + if ident.id in ord(tokKeywordLow) - ord(tkSymbol)..ord(tokKeywordHigh) - ord(tkSymbol): + TokType(ident.id + ord(tkSymbol)) + else: tkOpr + tok = Token(ident: ident, tokType: tokType) + + getPrecedence(tok) |