diff options
Diffstat (limited to 'compiler/lexer.nim')
-rw-r--r-- | compiler/lexer.nim | 1283 |
1 files changed, 713 insertions, 570 deletions
diff --git a/compiler/lexer.nim b/compiler/lexer.nim index c5afa6e97..ad5dd560c 100644 --- a/compiler/lexer.nim +++ b/compiler/lexer.nim @@ -7,64 +7,78 @@ # distribution, for details about the copyright. # -# This scanner is handwritten for efficiency. I used an elegant buffering +# This lexer is handwritten for efficiency. I used an elegant buffering # scheme which I have not seen anywhere else: # We guarantee that a whole line is in the buffer. Thus only when scanning -# the \n or \r character we have to check wether we need to read in the next +# the \n or \r character we have to check whether we need to read in the next # chunk. (\n or \r already need special handling for incrementing the line -# counter; choosing both \n and \r allows the scanner to properly read Unix, +# counter; choosing both \n and \r allows the lexer to properly read Unix, # DOS or Macintosh text files, even when it is not the native format. import - hashes, options, msgs, strutils, platform, idents, nimlexbase, llstream, - wordrecg, lineinfos + options, msgs, platform, idents, nimlexbase, llstream, + wordrecg, lineinfos, pathutils + +import std/[hashes, parseutils, strutils] + +when defined(nimPreviewSlimSystem): + import std/[assertions, formatfloat] const - MaxLineLength* = 80 # lines longer than this lead to a warning numChars*: set[char] = {'0'..'9', 'a'..'z', 'A'..'Z'} SymChars*: set[char] = {'a'..'z', 'A'..'Z', '0'..'9', '\x80'..'\xFF'} SymStartChars*: set[char] = {'a'..'z', 'A'..'Z', '\x80'..'\xFF'} OpChars*: set[char] = {'+', '-', '*', '/', '\\', '<', '>', '!', '?', '^', '.', '|', '=', '%', '&', '$', '@', '~', ':'} + UnaryMinusWhitelist = {' ', '\t', '\n', '\r', ',', ';', '(', '[', '{'} # don't forget to update the 'highlite' module if these charsets should change type - TTokType* = enum - tkInvalid, tkEof, # order is important here! - tkSymbol, # keywords: - tkAddr, tkAnd, tkAs, tkAsm, - tkBind, tkBlock, tkBreak, tkCase, tkCast, - tkConcept, tkConst, tkContinue, tkConverter, - tkDefer, tkDiscard, tkDistinct, tkDiv, tkDo, - tkElif, tkElse, tkEnd, tkEnum, tkExcept, tkExport, - tkFinally, tkFor, tkFrom, tkFunc, - tkIf, tkImport, tkIn, tkInclude, tkInterface, - tkIs, tkIsnot, tkIterator, - tkLet, - tkMacro, tkMethod, tkMixin, tkMod, tkNil, tkNot, tkNotin, - tkObject, tkOf, tkOr, tkOut, - tkProc, tkPtr, tkRaise, tkRef, tkReturn, - tkShl, tkShr, tkStatic, - tkTemplate, - tkTry, tkTuple, tkType, tkUsing, - tkVar, tkWhen, tkWhile, tkXor, - tkYield, # end of keywords - tkIntLit, tkInt8Lit, tkInt16Lit, tkInt32Lit, tkInt64Lit, - tkUIntLit, tkUInt8Lit, tkUInt16Lit, tkUInt32Lit, tkUInt64Lit, - tkFloatLit, tkFloat32Lit, tkFloat64Lit, tkFloat128Lit, - tkStrLit, tkRStrLit, tkTripleStrLit, - tkGStrLit, tkGTripleStrLit, tkCharLit, tkParLe, tkParRi, tkBracketLe, - tkBracketRi, tkCurlyLe, tkCurlyRi, - tkBracketDotLe, tkBracketDotRi, # [. and .] - tkCurlyDotLe, tkCurlyDotRi, # {. and .} - tkParDotLe, tkParDotRi, # (. and .) - tkComma, tkSemiColon, - tkColon, tkColonColon, tkEquals, tkDot, tkDotDot, tkBracketLeColon, - tkOpr, tkComment, tkAccent, - tkSpaces, tkInfixOpr, tkPrefixOpr, tkPostfixOpr - - TTokTypes* = set[TTokType] + TokType* = enum + tkInvalid = "tkInvalid", tkEof = "[EOF]", # order is important here! + tkSymbol = "tkSymbol", # keywords: + tkAddr = "addr", tkAnd = "and", tkAs = "as", tkAsm = "asm", + tkBind = "bind", tkBlock = "block", tkBreak = "break", tkCase = "case", tkCast = "cast", + tkConcept = "concept", tkConst = "const", tkContinue = "continue", tkConverter = "converter", + tkDefer = "defer", tkDiscard = "discard", tkDistinct = "distinct", tkDiv = "div", tkDo = "do", + tkElif = "elif", tkElse = "else", tkEnd = "end", tkEnum = "enum", tkExcept = "except", tkExport = "export", + tkFinally = "finally", tkFor = "for", tkFrom = "from", tkFunc = "func", + tkIf = "if", tkImport = "import", tkIn = "in", tkInclude = "include", tkInterface = "interface", + tkIs = "is", tkIsnot = "isnot", tkIterator = "iterator", + tkLet = "let", + tkMacro = "macro", tkMethod = "method", tkMixin = "mixin", tkMod = "mod", tkNil = "nil", tkNot = "not", tkNotin = "notin", + tkObject = "object", tkOf = "of", tkOr = "or", tkOut = "out", + tkProc = "proc", tkPtr = "ptr", tkRaise = "raise", tkRef = "ref", tkReturn = "return", + tkShl = "shl", tkShr = "shr", tkStatic = "static", + tkTemplate = "template", + tkTry = "try", tkTuple = "tuple", tkType = "type", tkUsing = "using", + tkVar = "var", tkWhen = "when", tkWhile = "while", tkXor = "xor", + tkYield = "yield", # end of keywords + + tkIntLit = "tkIntLit", tkInt8Lit = "tkInt8Lit", tkInt16Lit = "tkInt16Lit", + tkInt32Lit = "tkInt32Lit", tkInt64Lit = "tkInt64Lit", + tkUIntLit = "tkUIntLit", tkUInt8Lit = "tkUInt8Lit", tkUInt16Lit = "tkUInt16Lit", + tkUInt32Lit = "tkUInt32Lit", tkUInt64Lit = "tkUInt64Lit", + tkFloatLit = "tkFloatLit", tkFloat32Lit = "tkFloat32Lit", + tkFloat64Lit = "tkFloat64Lit", tkFloat128Lit = "tkFloat128Lit", + tkStrLit = "tkStrLit", tkRStrLit = "tkRStrLit", tkTripleStrLit = "tkTripleStrLit", + tkGStrLit = "tkGStrLit", tkGTripleStrLit = "tkGTripleStrLit", tkCharLit = "tkCharLit", + tkCustomLit = "tkCustomLit", + + tkParLe = "(", tkParRi = ")", tkBracketLe = "[", + tkBracketRi = "]", tkCurlyLe = "{", tkCurlyRi = "}", + tkBracketDotLe = "[.", tkBracketDotRi = ".]", + tkCurlyDotLe = "{.", tkCurlyDotRi = ".}", + tkParDotLe = "(.", tkParDotRi = ".)", + tkComma = ",", tkSemiColon = ";", + tkColon = ":", tkColonColon = "::", tkEquals = "=", + tkDot = ".", tkDotDot = "..", tkBracketLeColon = "[:", + tkOpr, tkComment, tkAccent = "`", + # these are fake tokens used by renderer.nim + tkSpaces, tkInfixOpr, tkPrefixOpr, tkPostfixOpr, tkHideableStart, tkHideableEnd + + TokTypes* = set[TokType] const weakTokens = {tkComma, tkSemiColon, tkColon, @@ -73,84 +87,50 @@ const # tokens that should not be considered for previousToken tokKeywordLow* = succ(tkSymbol) tokKeywordHigh* = pred(tkIntLit) - TokTypeToStr*: array[TTokType, string] = ["tkInvalid", "[EOF]", - "tkSymbol", - "addr", "and", "as", "asm", - "bind", "block", "break", "case", "cast", - "concept", "const", "continue", "converter", - "defer", "discard", "distinct", "div", "do", - "elif", "else", "end", "enum", "except", "export", - "finally", "for", "from", "func", "if", - "import", "in", "include", "interface", "is", "isnot", "iterator", - "let", - "macro", "method", "mixin", "mod", - "nil", "not", "notin", "object", "of", "or", - "out", "proc", "ptr", "raise", "ref", "return", - "shl", "shr", "static", - "template", - "try", "tuple", "type", "using", - "var", "when", "while", "xor", - "yield", - "tkIntLit", "tkInt8Lit", "tkInt16Lit", "tkInt32Lit", "tkInt64Lit", - "tkUIntLit", "tkUInt8Lit", "tkUInt16Lit", "tkUInt32Lit", "tkUInt64Lit", - "tkFloatLit", "tkFloat32Lit", "tkFloat64Lit", "tkFloat128Lit", - "tkStrLit", "tkRStrLit", - "tkTripleStrLit", "tkGStrLit", "tkGTripleStrLit", "tkCharLit", "(", - ")", "[", "]", "{", "}", "[.", ".]", "{.", ".}", "(.", ".)", - ",", ";", - ":", "::", "=", ".", "..", "[:", - "tkOpr", "tkComment", "`", - "tkSpaces", "tkInfixOpr", - "tkPrefixOpr", "tkPostfixOpr"] type - TNumericalBase* = enum + NumericalBase* = enum base10, # base10 is listed as the first element, # so that it is the correct default value base2, base8, base16 - CursorPosition* {.pure.} = enum ## XXX remove this again - None, InToken, BeforeToken, AfterToken - - TToken* = object # a Nim token - tokType*: TTokType # the type of the token - indent*: int # the indentation; != -1 if the token has been - # preceded with indentation - ident*: PIdent # the parsed identifier - iNumber*: BiggestInt # the parsed integer literal - fNumber*: BiggestFloat # the parsed floating point literal - base*: TNumericalBase # the numerical base; only valid for int - # or float literals - strongSpaceA*: int8 # leading spaces of an operator - strongSpaceB*: int8 # trailing spaces of an operator - literal*: string # the parsed (string) literal; and - # documentation comments are here too + TokenSpacing* = enum + tsLeading, tsTrailing, tsEof + + Token* = object # a Nim token + tokType*: TokType # the type of the token + base*: NumericalBase # the numerical base; only valid for int + # or float literals + spacing*: set[TokenSpacing] # spaces around token + indent*: int # the indentation; != -1 if the token has been + # preceded with indentation + ident*: PIdent # the parsed identifier + iNumber*: BiggestInt # the parsed integer literal + fNumber*: BiggestFloat # the parsed floating point literal + literal*: string # the parsed (string) literal; and + # documentation comments are here too line*, col*: int when defined(nimpretty): - offsetA*, offsetB*: int # used for pretty printing so that literals - # like 0b01 or r"\L" are unaffected + offsetA*, offsetB*: int # used for pretty printing so that literals + # like 0b01 or r"\L" are unaffected commentOffsetA*, commentOffsetB*: int - TErrorHandler* = proc (conf: ConfigRef; info: TLineInfo; msg: TMsgKind; arg: string) - TLexer* = object of TBaseLexer + ErrorHandler* = proc (conf: ConfigRef; info: TLineInfo; msg: TMsgKind; arg: string) + Lexer* = object of TBaseLexer fileIdx*: FileIndex - indentAhead*: int # if > 0 an indendation has already been read + indentAhead*: int # if > 0 an indentation has already been read # this is needed because scanning comments # needs so much look-ahead currLineIndent*: int - strongSpaces*, allowTabs*: bool - cursor*: CursorPosition - errorHandler*: TErrorHandler + errorHandler*: ErrorHandler cache*: IdentCache when defined(nimsuggest): previousToken: TLineInfo + tokenEnd*: TLineInfo + previousTokenEnd*: TLineInfo config*: ConfigRef -when defined(nimpretty): - var - gIndentationWidth*: int - -proc getLineInfo*(L: TLexer, tok: TToken): TLineInfo {.inline.} = +proc getLineInfo*(L: Lexer, tok: Token): TLineInfo {.inline.} = result = newLineInfo(L.fileIdx, tok.line, tok.col) when defined(nimpretty): result.offsetA = tok.offsetA @@ -158,8 +138,8 @@ proc getLineInfo*(L: TLexer, tok: TToken): TLineInfo {.inline.} = result.commentOffsetA = tok.commentOffsetA result.commentOffsetB = tok.commentOffsetB -proc isKeyword*(kind: TTokType): bool = - result = (kind >= tokKeywordLow) and (kind <= tokKeywordHigh) +proc isKeyword*(kind: TokType): bool = + (kind >= tokKeywordLow) and (kind <= tokKeywordHigh) template ones(n): untyped = ((1 shl n)-1) # for utf-8 conversion @@ -169,62 +149,37 @@ proc isNimIdentifier*(s: string): bool = var i = 1 while i < sLen: if s[i] == '_': inc(i) - if i < sLen and s[i] notin SymChars: return + if i < sLen and s[i] notin SymChars: return false inc(i) result = true + else: + result = false -proc tokToStr*(tok: TToken): string = +proc `$`*(tok: Token): string = case tok.tokType - of tkIntLit..tkInt64Lit: result = $tok.iNumber - of tkFloatLit..tkFloat64Lit: result = $tok.fNumber - of tkInvalid, tkStrLit..tkCharLit, tkComment: result = tok.literal - of tkParLe..tkColon, tkEof, tkAccent: - result = TokTypeToStr[tok.tokType] + of tkIntLit..tkInt64Lit: $tok.iNumber + of tkFloatLit..tkFloat64Lit: $tok.fNumber + of tkInvalid, tkStrLit..tkCharLit, tkComment: tok.literal + of tkParLe..tkColon, tkEof, tkAccent: $tok.tokType else: if tok.ident != nil: - result = tok.ident.s + tok.ident.s else: - result = "" - -proc prettyTok*(tok: TToken): string = - if isKeyword(tok.tokType): result = "keyword " & tok.ident.s - else: result = tokToStr(tok) - -proc printTok*(conf: ConfigRef; tok: TToken) = - msgWriteln(conf, $tok.line & ":" & $tok.col & "\t" & - TokTypeToStr[tok.tokType] & " " & tokToStr(tok)) - -proc initToken*(L: var TToken) = - L.tokType = tkInvalid - L.iNumber = 0 - L.indent = 0 - L.strongSpaceA = 0 - L.literal = "" - L.fNumber = 0.0 - L.base = base10 - L.ident = nil - when defined(nimpretty): - L.commentOffsetA = 0 - L.commentOffsetB = 0 - -proc fillToken(L: var TToken) = - L.tokType = tkInvalid - L.iNumber = 0 - L.indent = 0 - L.strongSpaceA = 0 - setLen(L.literal, 0) - L.fNumber = 0.0 - L.base = base10 - L.ident = nil - when defined(nimpretty): - L.commentOffsetA = 0 - L.commentOffsetB = 0 + "" + +proc prettyTok*(tok: Token): string = + if isKeyword(tok.tokType): "keyword " & tok.ident.s + else: $tok -proc openLexer*(lex: var TLexer, fileIdx: FileIndex, inputstream: PLLStream; +proc printTok*(conf: ConfigRef; tok: Token) = + # xxx factor with toLocation + msgWriteln(conf, $tok.line & ":" & $tok.col & "\t" & $tok.tokType & " " & $tok) + +proc openLexer*(lex: var Lexer, fileIdx: FileIndex, inputstream: PLLStream; cache: IdentCache; config: ConfigRef) = openBaseLexer(lex, inputstream) - lex.fileIdx = fileidx - lex.indentAhead = - 1 + lex.fileIdx = fileIdx + lex.indentAhead = -1 lex.currLineIndent = 0 inc(lex.lineNumber, inputstream.lineOffset) lex.cache = cache @@ -232,36 +187,36 @@ proc openLexer*(lex: var TLexer, fileIdx: FileIndex, inputstream: PLLStream; lex.previousToken.fileIndex = fileIdx lex.config = config -proc openLexer*(lex: var TLexer, filename: string, inputstream: PLLStream; +proc openLexer*(lex: var Lexer, filename: AbsoluteFile, inputstream: PLLStream; cache: IdentCache; config: ConfigRef) = openLexer(lex, fileInfoIdx(config, filename), inputstream, cache, config) -proc closeLexer*(lex: var TLexer) = +proc closeLexer*(lex: var Lexer) = if lex.config != nil: inc(lex.config.linesCompiled, lex.lineNumber) closeBaseLexer(lex) -proc getLineInfo(L: TLexer): TLineInfo = +proc getLineInfo(L: Lexer): TLineInfo = result = newLineInfo(L.fileIdx, L.lineNumber, getColNumber(L, L.bufpos)) -proc dispMessage(L: TLexer; info: TLineInfo; msg: TMsgKind; arg: string) = +proc dispMessage(L: Lexer; info: TLineInfo; msg: TMsgKind; arg: string) = if L.errorHandler.isNil: msgs.message(L.config, info, msg, arg) else: L.errorHandler(L.config, info, msg, arg) -proc lexMessage*(L: TLexer, msg: TMsgKind, arg = "") = +proc lexMessage*(L: Lexer, msg: TMsgKind, arg = "") = L.dispMessage(getLineInfo(L), msg, arg) -proc lexMessageTok*(L: TLexer, msg: TMsgKind, tok: TToken, arg = "") = +proc lexMessageTok*(L: Lexer, msg: TMsgKind, tok: Token, arg = "") = var info = newLineInfo(L.fileIdx, tok.line, tok.col) L.dispMessage(info, msg, arg) -proc lexMessagePos(L: var TLexer, msg: TMsgKind, pos: int, arg = "") = +proc lexMessagePos(L: var Lexer, msg: TMsgKind, pos: int, arg = "") = var info = newLineInfo(L.fileIdx, L.lineNumber, pos - L.lineStart) L.dispMessage(info, msg, arg) -proc matchTwoChars(L: TLexer, first: char, second: set[char]): bool = +proc matchTwoChars(L: Lexer, first: char, second: set[char]): bool = result = (L.buf[L.bufpos] == first) and (L.buf[L.bufpos + 1] in second) template tokenBegin(tok, pos) {.dirty.} = @@ -275,7 +230,6 @@ template tokenEnd(tok, pos) {.dirty.} = let colB = getColNumber(L, pos)+1 if L.fileIdx == L.config.m.trackPos.fileIndex and L.config.m.trackPos.col in colA..colB and L.lineNumber == L.config.m.trackPos.line.int and L.config.ideCmd in {ideSug, ideCon}: - L.cursor = CursorPosition.InToken L.config.m.trackPos.col = colA.int16 colA = 0 when defined(nimpretty): @@ -300,312 +254,317 @@ template tokenEndPrevious(tok, pos) = let colB = getColNumber(L, pos) if L.fileIdx == L.config.m.trackPos.fileIndex and L.config.m.trackPos.col in colA..colB and L.lineNumber == L.config.m.trackPos.line.int and L.config.ideCmd in {ideSug, ideCon}: - L.cursor = CursorPosition.BeforeToken L.config.m.trackPos = L.previousToken L.config.m.trackPosAttached = true colA = 0 when defined(nimpretty): tok.offsetB = L.offsetBase + pos -{.push overflowChecks: off.} -# We need to parse the largest uint literal without overflow checks -proc unsafeParseUInt(s: string, b: var BiggestInt, start = 0): int = - var i = start - if i < s.len and s[i] in {'0'..'9'}: - b = 0 - while i < s.len and s[i] in {'0'..'9'}: - b = b * 10 + (ord(s[i]) - ord('0')) - inc(i) - while i < s.len and s[i] == '_': inc(i) # underscores are allowed and ignored - result = i - start -{.pop.} # overflowChecks - - -template eatChar(L: var TLexer, t: var TToken, replacementChar: char) = - add(t.literal, replacementChar) +template eatChar(L: var Lexer, t: var Token, replacementChar: char) = + t.literal.add(replacementChar) inc(L.bufpos) -template eatChar(L: var TLexer, t: var TToken) = - add(t.literal, L.buf[L.bufpos]) +template eatChar(L: var Lexer, t: var Token) = + t.literal.add(L.buf[L.bufpos]) inc(L.bufpos) -proc getNumber(L: var TLexer, result: var TToken) = - proc matchUnderscoreChars(L: var TLexer, tok: var TToken, chars: set[char]) = +proc getNumber(L: var Lexer, result: var Token) = + proc matchUnderscoreChars(L: var Lexer, tok: var Token, chars: set[char]): Natural = var pos = L.bufpos # use registers for pos, buf - var buf = L.buf + result = 0 while true: - if buf[pos] in chars: - add(tok.literal, buf[pos]) + if L.buf[pos] in chars: + tok.literal.add(L.buf[pos]) inc(pos) + inc(result) else: break - if buf[pos] == '_': - if buf[pos+1] notin chars: + if L.buf[pos] == '_': + if L.buf[pos+1] notin chars: lexMessage(L, errGenerated, - "only single underscores may occur in a token: '__' is invalid") + "only single underscores may occur in a token and token may not " & + "end with an underscore: e.g. '1__1' and '1_' are invalid") break - add(tok.literal, '_') + tok.literal.add('_') inc(pos) L.bufpos = pos - proc matchChars(L: var TLexer, tok: var TToken, chars: set[char]) = + proc matchChars(L: var Lexer, tok: var Token, chars: set[char]) = var pos = L.bufpos # use registers for pos, buf - var buf = L.buf - while buf[pos] in chars: - add(tok.literal, buf[pos]) + while L.buf[pos] in chars: + tok.literal.add(L.buf[pos]) inc(pos) L.bufpos = pos - proc lexMessageLitNum(L: var TLexer, msg: string, startpos: int) = + proc lexMessageLitNum(L: var Lexer, msg: string, startpos: int, msgKind = errGenerated) = # Used to get slightly human friendlier err messages. - # Note: the erroneous 'O' char in the character set is intentional - const literalishChars = {'A'..'F', 'a'..'f', '0'..'9', 'X', 'x', 'o', 'O', - 'c', 'C', 'b', 'B', '_', '.', '\'', 'd', 'i', 'u'} + const literalishChars = {'A'..'Z', 'a'..'z', '0'..'9', '_', '.', '\''} var msgPos = L.bufpos - var t: TToken - t.literal = "" + var t = Token(literal: "") L.bufpos = startpos # Use L.bufpos as pos because of matchChars matchChars(L, t, literalishChars) # We must verify +/- specifically so that we're not past the literal if L.buf[L.bufpos] in {'+', '-'} and L.buf[L.bufpos - 1] in {'e', 'E'}: - add(t.literal, L.buf[L.bufpos]) + t.literal.add(L.buf[L.bufpos]) inc(L.bufpos) matchChars(L, t, literalishChars) - if L.buf[L.bufpos] in {'\'', 'f', 'F', 'd', 'D', 'i', 'I', 'u', 'U'}: + if L.buf[L.bufpos] in literalishChars: + t.literal.add(L.buf[L.bufpos]) inc(L.bufpos) - add(t.literal, L.buf[L.bufpos]) matchChars(L, t, {'0'..'9'}) L.bufpos = msgPos - lexMessage(L, errGenerated, msg % t.literal) + lexMessage(L, msgKind, msg % t.literal) var - startpos, endpos: int xi: BiggestInt isBase10 = true + numDigits = 0 const - baseCodeChars = {'X', 'x', 'o', 'c', 'C', 'b', 'B'} + # 'c', 'C' is deprecated + baseCodeChars = {'X', 'x', 'o', 'b', 'B', 'c', 'C'} literalishChars = baseCodeChars + {'A'..'F', 'a'..'f', '0'..'9', '_', '\''} floatTypes = {tkFloatLit, tkFloat32Lit, tkFloat64Lit, tkFloat128Lit} result.tokType = tkIntLit # int literal until we know better result.literal = "" result.base = base10 - startpos = L.bufpos - tokenBegin(result, startPos) + tokenBegin(result, L.bufpos) + + var isPositive = true + if L.buf[L.bufpos] == '-': + eatChar(L, result) + isPositive = false + + let startpos = L.bufpos + + template setNumber(field, value) = + field = (if isPositive: value else: -value) # First stage: find out base, make verifications, build token literal string - if L.buf[L.bufpos] == '0' and L.buf[L.bufpos + 1] in baseCodeChars + {'O'}: + # {'c', 'C'} is added for deprecation reasons to provide a clear error message + if L.buf[L.bufpos] == '0' and L.buf[L.bufpos + 1] in baseCodeChars + {'c', 'C', 'O'}: isBase10 = false eatChar(L, result, '0') case L.buf[L.bufpos] + of 'c', 'C': + lexMessageLitNum(L, + "$1 will soon be invalid for oct literals; Use '0o' " & + "for octals. 'c', 'C' prefix", + startpos, + warnDeprecated) + eatChar(L, result, 'c') + numDigits = matchUnderscoreChars(L, result, {'0'..'7'}) of 'O': - lexMessageLitNum(L, "$1 is not a valid number; did you mean octal? Then use one of '0o', '0c' or '0C'.", startpos) + lexMessageLitNum(L, "$1 is an invalid int literal; For octal literals " & + "use the '0o' prefix.", startpos) of 'x', 'X': eatChar(L, result, 'x') - matchUnderscoreChars(L, result, {'0'..'9', 'a'..'f', 'A'..'F'}) - of 'o', 'c', 'C': - eatChar(L, result, 'c') - matchUnderscoreChars(L, result, {'0'..'7'}) + numDigits = matchUnderscoreChars(L, result, {'0'..'9', 'a'..'f', 'A'..'F'}) + of 'o': + eatChar(L, result, 'o') + numDigits = matchUnderscoreChars(L, result, {'0'..'7'}) of 'b', 'B': eatChar(L, result, 'b') - matchUnderscoreChars(L, result, {'0'..'1'}) + numDigits = matchUnderscoreChars(L, result, {'0'..'1'}) else: internalError(L.config, getLineInfo(L), "getNumber") + if numDigits == 0: + lexMessageLitNum(L, "invalid number: '$1'", startpos) else: - matchUnderscoreChars(L, result, {'0'..'9'}) + discard matchUnderscoreChars(L, result, {'0'..'9'}) if (L.buf[L.bufpos] == '.') and (L.buf[L.bufpos + 1] in {'0'..'9'}): result.tokType = tkFloatLit eatChar(L, result, '.') - matchUnderscoreChars(L, result, {'0'..'9'}) + discard matchUnderscoreChars(L, result, {'0'..'9'}) if L.buf[L.bufpos] in {'e', 'E'}: result.tokType = tkFloatLit - eatChar(L, result, 'e') + eatChar(L, result) if L.buf[L.bufpos] in {'+', '-'}: eatChar(L, result) - matchUnderscoreChars(L, result, {'0'..'9'}) - endpos = L.bufpos + discard matchUnderscoreChars(L, result, {'0'..'9'}) + let endpos = L.bufpos # Second stage, find out if there's a datatype suffix and handle it var postPos = endpos + if L.buf[postPos] in {'\'', 'f', 'F', 'd', 'D', 'i', 'I', 'u', 'U'}: + let errPos = postPos + var customLitPossible = false if L.buf[postPos] == '\'': inc(postPos) + customLitPossible = true - case L.buf[postPos] - of 'f', 'F': - inc(postPos) - if (L.buf[postPos] == '3') and (L.buf[postPos + 1] == '2'): - result.tokType = tkFloat32Lit - inc(postPos, 2) - elif (L.buf[postPos] == '6') and (L.buf[postPos + 1] == '4'): - result.tokType = tkFloat64Lit - inc(postPos, 2) - elif (L.buf[postPos] == '1') and - (L.buf[postPos + 1] == '2') and - (L.buf[postPos + 2] == '8'): - result.tokType = tkFloat128Lit - inc(postPos, 3) - else: # "f" alone defaults to float32 - result.tokType = tkFloat32Lit - of 'd', 'D': # ad hoc convenience shortcut for f64 - inc(postPos) - result.tokType = tkFloat64Lit - of 'i', 'I': - inc(postPos) - if (L.buf[postPos] == '6') and (L.buf[postPos + 1] == '4'): - result.tokType = tkInt64Lit - inc(postPos, 2) - elif (L.buf[postPos] == '3') and (L.buf[postPos + 1] == '2'): - result.tokType = tkInt32Lit - inc(postPos, 2) - elif (L.buf[postPos] == '1') and (L.buf[postPos + 1] == '6'): - result.tokType = tkInt16Lit - inc(postPos, 2) - elif (L.buf[postPos] == '8'): - result.tokType = tkInt8Lit - inc(postPos) - else: - lexMessageLitNum(L, "invalid number: '$1'", startpos) - of 'u', 'U': - inc(postPos) - if (L.buf[postPos] == '6') and (L.buf[postPos + 1] == '4'): - result.tokType = tkUInt64Lit - inc(postPos, 2) - elif (L.buf[postPos] == '3') and (L.buf[postPos + 1] == '2'): - result.tokType = tkUInt32Lit - inc(postPos, 2) - elif (L.buf[postPos] == '1') and (L.buf[postPos + 1] == '6'): - result.tokType = tkUInt16Lit - inc(postPos, 2) - elif (L.buf[postPos] == '8'): - result.tokType = tkUInt8Lit - inc(postPos) + if L.buf[postPos] in SymChars: + var suffix = newStringOfCap(10) + while true: + suffix.add L.buf[postPos] + inc postPos + if L.buf[postPos] notin SymChars+{'_'}: break + let suffixAsLower = suffix.toLowerAscii + case suffixAsLower + of "f", "f32": result.tokType = tkFloat32Lit + of "d", "f64": result.tokType = tkFloat64Lit + of "f128": result.tokType = tkFloat128Lit + of "i8": result.tokType = tkInt8Lit + of "i16": result.tokType = tkInt16Lit + of "i32": result.tokType = tkInt32Lit + of "i64": result.tokType = tkInt64Lit + of "u": result.tokType = tkUIntLit + of "u8": result.tokType = tkUInt8Lit + of "u16": result.tokType = tkUInt16Lit + of "u32": result.tokType = tkUInt32Lit + of "u64": result.tokType = tkUInt64Lit + elif customLitPossible: + # remember the position of the `'` so that the parser doesn't + # have to reparse the custom literal: + result.iNumber = len(result.literal) + result.literal.add '\'' + result.literal.add suffix + result.tokType = tkCustomLit else: - result.tokType = tkUIntLit + lexMessageLitNum(L, "invalid number suffix: '$1'", errPos) else: - lexMessageLitNum(L, "invalid number: '$1'", startpos) + lexMessageLitNum(L, "invalid number suffix: '$1'", errPos) # Is there still a literalish char awaiting? Then it's an error! if L.buf[postPos] in literalishChars or (L.buf[postPos] == '.' and L.buf[postPos + 1] in {'0'..'9'}): lexMessageLitNum(L, "invalid number: '$1'", startpos) - # Third stage, extract actual number - L.bufpos = startpos # restore position - var pos: int = startpos - try: - if (L.buf[pos] == '0') and (L.buf[pos + 1] in baseCodeChars): - inc(pos, 2) - xi = 0 # it is a base prefix - - case L.buf[pos - 1] - of 'b', 'B': - result.base = base2 - while pos < endpos: - if L.buf[pos] != '_': - xi = `shl`(xi, 1) or (ord(L.buf[pos]) - ord('0')) - inc(pos) - of 'o', 'c', 'C': - result.base = base8 - while pos < endpos: - if L.buf[pos] != '_': - xi = `shl`(xi, 3) or (ord(L.buf[pos]) - ord('0')) - inc(pos) - of 'x', 'X': - result.base = base16 - while pos < endpos: - case L.buf[pos] - of '_': - inc(pos) - of '0'..'9': - xi = `shl`(xi, 4) or (ord(L.buf[pos]) - ord('0')) + if result.tokType != tkCustomLit: + # Third stage, extract actual number + L.bufpos = startpos # restore position + var pos = startpos + try: + if (L.buf[pos] == '0') and (L.buf[pos + 1] in baseCodeChars): + inc(pos, 2) + xi = 0 # it is a base prefix + + case L.buf[pos - 1] + of 'b', 'B': + result.base = base2 + while pos < endpos: + if L.buf[pos] != '_': + xi = `shl`(xi, 1) or (ord(L.buf[pos]) - ord('0')) inc(pos) - of 'a'..'f': - xi = `shl`(xi, 4) or (ord(L.buf[pos]) - ord('a') + 10) + # 'c', 'C' is deprecated (a warning is issued elsewhere) + of 'o', 'c', 'C': + result.base = base8 + while pos < endpos: + if L.buf[pos] != '_': + xi = `shl`(xi, 3) or (ord(L.buf[pos]) - ord('0')) inc(pos) - of 'A'..'F': - xi = `shl`(xi, 4) or (ord(L.buf[pos]) - ord('A') + 10) - inc(pos) - else: - break + of 'x', 'X': + result.base = base16 + while pos < endpos: + case L.buf[pos] + of '_': + inc(pos) + of '0'..'9': + xi = `shl`(xi, 4) or (ord(L.buf[pos]) - ord('0')) + inc(pos) + of 'a'..'f': + xi = `shl`(xi, 4) or (ord(L.buf[pos]) - ord('a') + 10) + inc(pos) + of 'A'..'F': + xi = `shl`(xi, 4) or (ord(L.buf[pos]) - ord('A') + 10) + inc(pos) + else: + break + else: + internalError(L.config, getLineInfo(L), "getNumber") + + case result.tokType + of tkIntLit, tkInt64Lit: setNumber result.iNumber, xi + of tkInt8Lit: setNumber result.iNumber, ashr(xi shl 56, 56) + of tkInt16Lit: setNumber result.iNumber, ashr(xi shl 48, 48) + of tkInt32Lit: setNumber result.iNumber, ashr(xi shl 32, 32) + of tkUIntLit, tkUInt64Lit: setNumber result.iNumber, xi + of tkUInt8Lit: setNumber result.iNumber, xi and 0xff + of tkUInt16Lit: setNumber result.iNumber, xi and 0xffff + of tkUInt32Lit: setNumber result.iNumber, xi and 0xffffffff + of tkFloat32Lit: + setNumber result.fNumber, (cast[ptr float32](addr(xi)))[] + # note: this code is endian neutral! + # XXX: Test this on big endian machine! + of tkFloat64Lit, tkFloatLit: + setNumber result.fNumber, (cast[ptr float64](addr(xi)))[] + else: internalError(L.config, getLineInfo(L), "getNumber") + + # Bounds checks. Non decimal literals are allowed to overflow the range of + # the datatype as long as their pattern don't overflow _bitwise_, hence + # below checks of signed sizes against uint*.high is deliberate: + # (0x80'u8 = 128, 0x80'i8 = -128, etc == OK) + if result.tokType notin floatTypes: + let outOfRange = + case result.tokType + of tkUInt8Lit, tkUInt16Lit, tkUInt32Lit: result.iNumber != xi + of tkInt8Lit: (xi > BiggestInt(uint8.high)) + of tkInt16Lit: (xi > BiggestInt(uint16.high)) + of tkInt32Lit: (xi > BiggestInt(uint32.high)) + else: false + + if outOfRange: + #echo "out of range num: ", result.iNumber, " vs ", xi + lexMessageLitNum(L, "number out of range: '$1'", startpos) + else: - internalError(L.config, getLineInfo(L), "getNumber") - - case result.tokType - of tkIntLit, tkInt64Lit: result.iNumber = xi - of tkInt8Lit: result.iNumber = BiggestInt(int8(toU8(int(xi)))) - of tkInt16Lit: result.iNumber = BiggestInt(int16(toU16(int(xi)))) - of tkInt32Lit: result.iNumber = BiggestInt(int32(toU32(int64(xi)))) - of tkUIntLit, tkUInt64Lit: result.iNumber = xi - of tkUInt8Lit: result.iNumber = BiggestInt(uint8(toU8(int(xi)))) - of tkUInt16Lit: result.iNumber = BiggestInt(uint16(toU16(int(xi)))) - of tkUInt32Lit: result.iNumber = BiggestInt(uint32(toU32(int64(xi)))) - of tkFloat32Lit: - result.fNumber = (cast[PFloat32](addr(xi)))[] - # note: this code is endian neutral! - # XXX: Test this on big endian machine! - of tkFloat64Lit, tkFloatLit: - result.fNumber = (cast[PFloat64](addr(xi)))[] - else: internalError(L.config, getLineInfo(L), "getNumber") - - # Bounds checks. Non decimal literals are allowed to overflow the range of - # the datatype as long as their pattern don't overflow _bitwise_, hence - # below checks of signed sizes against uint*.high is deliberate: - # (0x80'u8 = 128, 0x80'i8 = -128, etc == OK) - if result.tokType notin floatTypes: - let outOfRange = case result.tokType: - of tkUInt8Lit, tkUInt16Lit, tkUInt32Lit: result.iNumber != xi - of tkInt8Lit: (xi > BiggestInt(uint8.high)) - of tkInt16Lit: (xi > BiggestInt(uint16.high)) - of tkInt32Lit: (xi > BiggestInt(uint32.high)) - else: false + case result.tokType + of floatTypes: + result.fNumber = parseFloat(result.literal) + of tkUInt64Lit, tkUIntLit: + var iNumber: uint64 = uint64(0) + var len: int = 0 + try: + len = parseBiggestUInt(result.literal, iNumber) + except ValueError: + raise newException(OverflowDefect, "number out of range: " & result.literal) + if len != result.literal.len: + raise newException(ValueError, "invalid integer: " & result.literal) + result.iNumber = cast[int64](iNumber) + else: + var iNumber: int64 = int64(0) + var len: int = 0 + try: + len = parseBiggestInt(result.literal, iNumber) + except ValueError: + raise newException(OverflowDefect, "number out of range: " & result.literal) + if len != result.literal.len: + raise newException(ValueError, "invalid integer: " & result.literal) + result.iNumber = iNumber + + # Explicit bounds checks. + let outOfRange = + case result.tokType + of tkInt8Lit: result.iNumber > int8.high or result.iNumber < int8.low + of tkUInt8Lit: result.iNumber > BiggestInt(uint8.high) or result.iNumber < 0 + of tkInt16Lit: result.iNumber > int16.high or result.iNumber < int16.low + of tkUInt16Lit: result.iNumber > BiggestInt(uint16.high) or result.iNumber < 0 + of tkInt32Lit: result.iNumber > int32.high or result.iNumber < int32.low + of tkUInt32Lit: result.iNumber > BiggestInt(uint32.high) or result.iNumber < 0 + else: false if outOfRange: - #echo "out of range num: ", result.iNumber, " vs ", xi lexMessageLitNum(L, "number out of range: '$1'", startpos) - else: - case result.tokType - of floatTypes: - result.fNumber = parseFloat(result.literal) - of tkUint64Lit: - xi = 0 - let len = unsafeParseUInt(result.literal, xi) - if len != result.literal.len or len == 0: - raise newException(ValueError, "invalid integer: " & $xi) - result.iNumber = xi - else: - result.iNumber = parseBiggestInt(result.literal) + # Promote int literal to int64? Not always necessary, but more consistent + if result.tokType == tkIntLit: + if result.iNumber > high(int32) or result.iNumber < low(int32): + result.tokType = tkInt64Lit - # Explicit bounds checks - let outOfRange = - case result.tokType - of tkInt8Lit: (result.iNumber < int8.low or result.iNumber > int8.high) - of tkUInt8Lit: (result.iNumber < BiggestInt(uint8.low) or - result.iNumber > BiggestInt(uint8.high)) - of tkInt16Lit: (result.iNumber < int16.low or result.iNumber > int16.high) - of tkUInt16Lit: (result.iNumber < BiggestInt(uint16.low) or - result.iNumber > BiggestInt(uint16.high)) - of tkInt32Lit: (result.iNumber < int32.low or result.iNumber > int32.high) - of tkUInt32Lit: (result.iNumber < BiggestInt(uint32.low) or - result.iNumber > BiggestInt(uint32.high)) - else: false - - if outOfRange: lexMessageLitNum(L, "number out of range: '$1'", startpos) - - # Promote int literal to int64? Not always necessary, but more consistent - if result.tokType == tkIntLit: - if (result.iNumber < low(int32)) or (result.iNumber > high(int32)): - result.tokType = tkInt64Lit - - except ValueError: - lexMessageLitNum(L, "invalid number: '$1'", startpos) - except OverflowError, RangeError: - lexMessageLitNum(L, "number out of range: '$1'", startpos) + except ValueError: + lexMessageLitNum(L, "invalid number: '$1'", startpos) + except OverflowDefect, RangeDefect: + lexMessageLitNum(L, "number out of range: '$1'", startpos) tokenEnd(result, postPos-1) L.bufpos = postPos -proc handleHexChar(L: var TLexer, xi: var int) = +proc handleHexChar(L: var Lexer, xi: var int; position: range[0..4]) = + template invalid() = + lexMessage(L, errGenerated, + "expected a hex digit, but found: " & L.buf[L.bufpos] & + "; maybe prepend with 0") + case L.buf[L.bufpos] of '0'..'9': xi = (xi shl 4) or (ord(L.buf[L.bufpos]) - ord('0')) @@ -616,104 +575,142 @@ proc handleHexChar(L: var TLexer, xi: var int) = of 'A'..'F': xi = (xi shl 4) or (ord(L.buf[L.bufpos]) - ord('A') + 10) inc(L.bufpos) - else: discard + of '"', '\'': + if position <= 1: invalid() + # do not progress the bufpos here. + if position == 0: inc(L.bufpos) + else: + invalid() + # Need to progress for `nim check` + inc(L.bufpos) -proc handleDecChars(L: var TLexer, xi: var int) = +proc handleDecChars(L: var Lexer, xi: var int) = while L.buf[L.bufpos] in {'0'..'9'}: xi = (xi * 10) + (ord(L.buf[L.bufpos]) - ord('0')) inc(L.bufpos) -proc getEscapedChar(L: var TLexer, tok: var TToken) = +proc addUnicodeCodePoint(s: var string, i: int) = + let i = cast[uint](i) + # inlined toUTF-8 to avoid unicode and strutils dependencies. + let pos = s.len + if i <= 127: + s.setLen(pos+1) + s[pos+0] = chr(i) + elif i <= 0x07FF: + s.setLen(pos+2) + s[pos+0] = chr((i shr 6) or 0b110_00000) + s[pos+1] = chr((i and ones(6)) or 0b10_0000_00) + elif i <= 0xFFFF: + s.setLen(pos+3) + s[pos+0] = chr(i shr 12 or 0b1110_0000) + s[pos+1] = chr(i shr 6 and ones(6) or 0b10_0000_00) + s[pos+2] = chr(i and ones(6) or 0b10_0000_00) + elif i <= 0x001FFFFF: + s.setLen(pos+4) + s[pos+0] = chr(i shr 18 or 0b1111_0000) + s[pos+1] = chr(i shr 12 and ones(6) or 0b10_0000_00) + s[pos+2] = chr(i shr 6 and ones(6) or 0b10_0000_00) + s[pos+3] = chr(i and ones(6) or 0b10_0000_00) + elif i <= 0x03FFFFFF: + s.setLen(pos+5) + s[pos+0] = chr(i shr 24 or 0b111110_00) + s[pos+1] = chr(i shr 18 and ones(6) or 0b10_0000_00) + s[pos+2] = chr(i shr 12 and ones(6) or 0b10_0000_00) + s[pos+3] = chr(i shr 6 and ones(6) or 0b10_0000_00) + s[pos+4] = chr(i and ones(6) or 0b10_0000_00) + elif i <= 0x7FFFFFFF: + s.setLen(pos+6) + s[pos+0] = chr(i shr 30 or 0b1111110_0) + s[pos+1] = chr(i shr 24 and ones(6) or 0b10_0000_00) + s[pos+2] = chr(i shr 18 and ones(6) or 0b10_0000_00) + s[pos+3] = chr(i shr 12 and ones(6) or 0b10_0000_00) + s[pos+4] = chr(i shr 6 and ones(6) or 0b10_0000_00) + s[pos+5] = chr(i and ones(6) or 0b10_0000_00) + +proc getEscapedChar(L: var Lexer, tok: var Token) = inc(L.bufpos) # skip '\' case L.buf[L.bufpos] of 'n', 'N': - if L.config.oldNewlines: - if tok.tokType == tkCharLit: - lexMessage(L, errGenerated, "\\n not allowed in character literal") - add(tok.literal, L.config.target.tnl) - else: - add(tok.literal, '\L') + tok.literal.add('\L') inc(L.bufpos) of 'p', 'P': if tok.tokType == tkCharLit: lexMessage(L, errGenerated, "\\p not allowed in character literal") - add(tok.literal, L.config.target.tnl) + tok.literal.add(L.config.target.tnl) inc(L.bufpos) of 'r', 'R', 'c', 'C': - add(tok.literal, CR) + tok.literal.add(CR) inc(L.bufpos) of 'l', 'L': - add(tok.literal, LF) + tok.literal.add(LF) inc(L.bufpos) of 'f', 'F': - add(tok.literal, FF) + tok.literal.add(FF) inc(L.bufpos) of 'e', 'E': - add(tok.literal, ESC) + tok.literal.add(ESC) inc(L.bufpos) of 'a', 'A': - add(tok.literal, BEL) + tok.literal.add(BEL) inc(L.bufpos) of 'b', 'B': - add(tok.literal, BACKSPACE) + tok.literal.add(BACKSPACE) inc(L.bufpos) of 'v', 'V': - add(tok.literal, VT) + tok.literal.add(VT) inc(L.bufpos) of 't', 'T': - add(tok.literal, '\t') + tok.literal.add('\t') inc(L.bufpos) of '\'', '\"': - add(tok.literal, L.buf[L.bufpos]) + tok.literal.add(L.buf[L.bufpos]) inc(L.bufpos) of '\\': - add(tok.literal, '\\') + tok.literal.add('\\') inc(L.bufpos) - of 'x', 'X', 'u', 'U': - var tp = L.buf[L.bufpos] + of 'x', 'X': inc(L.bufpos) var xi = 0 - handleHexChar(L, xi) - handleHexChar(L, xi) - if tp in {'u', 'U'}: - handleHexChar(L, xi) - handleHexChar(L, xi) - # inlined toUTF-8 to avoid unicode and strutils dependencies. - if xi <=% 127: - add(tok.literal, xi.char ) - elif xi <=% 0x07FF: - add(tok.literal, ((xi shr 6) or 0b110_00000).char ) - add(tok.literal, ((xi and ones(6)) or 0b10_0000_00).char ) - elif xi <=% 0xFFFF: - add(tok.literal, (xi shr 12 or 0b1110_0000).char ) - add(tok.literal, (xi shr 6 and ones(6) or 0b10_0000_00).char ) - add(tok.literal, (xi and ones(6) or 0b10_0000_00).char ) - else: # value is 0xFFFF - add(tok.literal, "\xef\xbf\xbf" ) + handleHexChar(L, xi, 1) + handleHexChar(L, xi, 2) + tok.literal.add(chr(xi)) + of 'u', 'U': + if tok.tokType == tkCharLit: + lexMessage(L, errGenerated, "\\u not allowed in character literal") + inc(L.bufpos) + var xi = 0 + if L.buf[L.bufpos] == '{': + inc(L.bufpos) + var start = L.bufpos + while L.buf[L.bufpos] != '}': + handleHexChar(L, xi, 0) + if start == L.bufpos: + lexMessage(L, errGenerated, + "Unicode codepoint cannot be empty") + inc(L.bufpos) + if xi > 0x10FFFF: + let hex = ($L.buf)[start..L.bufpos-2] + lexMessage(L, errGenerated, + "Unicode codepoint must be lower than 0x10FFFF, but was: " & hex) else: - add(tok.literal, chr(xi)) + handleHexChar(L, xi, 1) + handleHexChar(L, xi, 2) + handleHexChar(L, xi, 3) + handleHexChar(L, xi, 4) + addUnicodeCodePoint(tok.literal, xi) of '0'..'9': if matchTwoChars(L, '0', {'0'..'9'}): lexMessage(L, warnOctalEscape) var xi = 0 handleDecChars(L, xi) - if (xi <= 255): add(tok.literal, chr(xi)) + if (xi <= 255): tok.literal.add(chr(xi)) else: lexMessage(L, errGenerated, "invalid character constant") else: lexMessage(L, errGenerated, "invalid character constant") -proc newString(s: cstring, len: int): string = - ## XXX, how come there is no support for this? - result = newString(len) - for i in 0 ..< len: - result[i] = s[i] - -proc handleCRLF(L: var TLexer, pos: int): int = +proc handleCRLF(L: var Lexer, pos: int): int = template registerLine = let col = L.getColNumber(pos) - if col > MaxLineLength: - lexMessagePos(L, hintLineTooLong, pos) - case L.buf[pos] of CR: registerLine() @@ -723,37 +720,40 @@ proc handleCRLF(L: var TLexer, pos: int): int = result = nimlexbase.handleLF(L, pos) else: result = pos -proc getString(L: var TLexer, tok: var TToken, rawMode: bool) = +type + StringMode = enum + normal, + raw, + generalized + +proc getString(L: var Lexer, tok: var Token, mode: StringMode) = var pos = L.bufpos - var buf = L.buf # put `buf` in a register var line = L.lineNumber # save linenumber for better error message - tokenBegin(tok, pos) + tokenBegin(tok, pos - ord(mode == raw)) inc pos # skip " - if buf[pos] == '\"' and buf[pos+1] == '\"': + if L.buf[pos] == '\"' and L.buf[pos+1] == '\"': tok.tokType = tkTripleStrLit # long string literal: inc(pos, 2) # skip "" # skip leading newline: - if buf[pos] in {' ', '\t'}: + if L.buf[pos] in {' ', '\t'}: var newpos = pos+1 - while buf[newpos] in {' ', '\t'}: inc newpos - if buf[newpos] in {CR, LF}: pos = newpos + while L.buf[newpos] in {' ', '\t'}: inc newpos + if L.buf[newpos] in {CR, LF}: pos = newpos pos = handleCRLF(L, pos) - buf = L.buf while true: - case buf[pos] + case L.buf[pos] of '\"': - if buf[pos+1] == '\"' and buf[pos+2] == '\"' and - buf[pos+3] != '\"': + if L.buf[pos+1] == '\"' and L.buf[pos+2] == '\"' and + L.buf[pos+3] != '\"': tokenEndIgnore(tok, pos+2) L.bufpos = pos + 3 # skip the three """ break - add(tok.literal, '\"') + tok.literal.add('\"') inc(pos) of CR, LF: tokenEndIgnore(tok, pos) pos = handleCRLF(L, pos) - buf = L.buf - add(tok.literal, "\n") + tok.literal.add("\n") of nimlexbase.EndOfFile: tokenEndIgnore(tok, pos) var line2 = L.lineNumber @@ -763,18 +763,18 @@ proc getString(L: var TLexer, tok: var TToken, rawMode: bool) = L.bufpos = pos break else: - add(tok.literal, buf[pos]) + tok.literal.add(L.buf[pos]) inc(pos) else: # ordinary string literal - if rawMode: tok.tokType = tkRStrLit + if mode != normal: tok.tokType = tkRStrLit else: tok.tokType = tkStrLit while true: - var c = buf[pos] + let c = L.buf[pos] if c == '\"': - if rawMode and buf[pos+1] == '\"': + if mode != normal and L.buf[pos+1] == '\"': inc(pos, 2) - add(tok.literal, '"') + tok.literal.add('"') else: tokenEndIgnore(tok, pos) inc(pos) # skip '"' @@ -783,145 +783,243 @@ proc getString(L: var TLexer, tok: var TToken, rawMode: bool) = tokenEndIgnore(tok, pos) lexMessage(L, errGenerated, "closing \" expected") break - elif (c == '\\') and not rawMode: + elif (c == '\\') and mode == normal: L.bufpos = pos getEscapedChar(L, tok) pos = L.bufpos else: - add(tok.literal, c) + tok.literal.add(c) inc(pos) L.bufpos = pos -proc getCharacter(L: var TLexer, tok: var TToken) = +proc getCharacter(L: var Lexer; tok: var Token) = tokenBegin(tok, L.bufpos) + let startPos = L.bufpos inc(L.bufpos) # skip ' - var c = L.buf[L.bufpos] + let c = L.buf[L.bufpos] case c - of '\0'..pred(' '), '\'': lexMessage(L, errGenerated, "invalid character literal") + of '\0'..pred(' '), '\'': + lexMessage(L, errGenerated, "invalid character literal") + tok.literal = $c of '\\': getEscapedChar(L, tok) else: tok.literal = $c inc(L.bufpos) - if L.buf[L.bufpos] != '\'': - lexMessage(L, errGenerated, "missing closing ' for character literal") - tokenEndIgnore(tok, L.bufpos) - inc(L.bufpos) # skip ' + if L.buf[L.bufpos] == '\'': + tokenEndIgnore(tok, L.bufpos) + inc(L.bufpos) # skip ' + else: + if startPos > 0 and L.buf[startPos-1] == '`': + tok.literal = "'" + L.bufpos = startPos+1 + else: + lexMessage(L, errGenerated, "missing closing ' for character literal") + tokenEndIgnore(tok, L.bufpos) + +const + UnicodeOperatorStartChars = {'\226', '\194', '\195'} + # the allowed unicode characters ("∙ ∘ × ★ ⊗ ⊘ ⊙ ⊛ ⊠ ⊡ ∩ ∧ ⊓ ± ⊕ ⊖ ⊞ ⊟ ∪ ∨ ⊔") + # all start with one of these. + +type + UnicodeOprPred = enum + Mul, Add + +proc unicodeOprLen(buf: cstring; pos: int): (int8, UnicodeOprPred) = + template m(len): untyped = (int8(len), Mul) + template a(len): untyped = (int8(len), Add) + result = 0.m + case buf[pos] + of '\226': + if buf[pos+1] == '\136': + if buf[pos+2] == '\152': result = 3.m # ∘ + elif buf[pos+2] == '\153': result = 3.m # ∙ + elif buf[pos+2] == '\167': result = 3.m # ∧ + elif buf[pos+2] == '\168': result = 3.a # ∨ + elif buf[pos+2] == '\169': result = 3.m # ∩ + elif buf[pos+2] == '\170': result = 3.a # ∪ + elif buf[pos+1] == '\138': + if buf[pos+2] == '\147': result = 3.m # ⊓ + elif buf[pos+2] == '\148': result = 3.a # ⊔ + elif buf[pos+2] == '\149': result = 3.a # ⊕ + elif buf[pos+2] == '\150': result = 3.a # ⊖ + elif buf[pos+2] == '\151': result = 3.m # ⊗ + elif buf[pos+2] == '\152': result = 3.m # ⊘ + elif buf[pos+2] == '\153': result = 3.m # ⊙ + elif buf[pos+2] == '\155': result = 3.m # ⊛ + elif buf[pos+2] == '\158': result = 3.a # ⊞ + elif buf[pos+2] == '\159': result = 3.a # ⊟ + elif buf[pos+2] == '\160': result = 3.m # ⊠ + elif buf[pos+2] == '\161': result = 3.m # ⊡ + elif buf[pos+1] == '\152' and buf[pos+2] == '\133': result = 3.m # ★ + of '\194': + if buf[pos+1] == '\177': result = 2.a # ± + of '\195': + if buf[pos+1] == '\151': result = 2.m # × + else: + discard -proc getSymbol(L: var TLexer, tok: var TToken) = +proc getSymbol(L: var Lexer, tok: var Token) = var h: Hash = 0 var pos = L.bufpos - var buf = L.buf tokenBegin(tok, pos) + var suspicious = false while true: - var c = buf[pos] + var c = L.buf[pos] case c - of 'a'..'z', '0'..'9', '\x80'..'\xFF': + of 'a'..'z', '0'..'9': h = h !& ord(c) inc(pos) of 'A'..'Z': c = chr(ord(c) + (ord('a') - ord('A'))) # toLower() h = h !& ord(c) inc(pos) + suspicious = true of '_': - if buf[pos+1] notin SymChars: + if L.buf[pos+1] notin SymChars: lexMessage(L, errGenerated, "invalid token: trailing underscore") break inc(pos) + suspicious = true + of '\x80'..'\xFF': + if c in UnicodeOperatorStartChars and unicodeOprLen(L.buf, pos)[0] != 0: + break + else: + h = h !& ord(c) + inc(pos) else: break tokenEnd(tok, pos-1) h = !$h - tok.ident = L.cache.getIdent(addr(L.buf[L.bufpos]), pos - L.bufpos, h) - L.bufpos = pos + tok.ident = L.cache.getIdent(cast[cstring](addr(L.buf[L.bufpos])), pos - L.bufpos, h) if (tok.ident.id < ord(tokKeywordLow) - ord(tkSymbol)) or (tok.ident.id > ord(tokKeywordHigh) - ord(tkSymbol)): tok.tokType = tkSymbol else: - tok.tokType = TTokType(tok.ident.id + ord(tkSymbol)) + tok.tokType = TokType(tok.ident.id + ord(tkSymbol)) + if suspicious and {optStyleHint, optStyleError} * L.config.globalOptions != {}: + lintReport(L.config, getLineInfo(L), tok.ident.s.normalize, tok.ident.s) + L.bufpos = pos + -proc endOperator(L: var TLexer, tok: var TToken, pos: int, +proc endOperator(L: var Lexer, tok: var Token, pos: int, hash: Hash) {.inline.} = var h = !$hash - tok.ident = L.cache.getIdent(addr(L.buf[L.bufpos]), pos - L.bufpos, h) + tok.ident = L.cache.getIdent(cast[cstring](addr(L.buf[L.bufpos])), pos - L.bufpos, h) if (tok.ident.id < oprLow) or (tok.ident.id > oprHigh): tok.tokType = tkOpr - else: tok.tokType = TTokType(tok.ident.id - oprLow + ord(tkColon)) + else: tok.tokType = TokType(tok.ident.id - oprLow + ord(tkColon)) L.bufpos = pos -proc getOperator(L: var TLexer, tok: var TToken) = +proc getOperator(L: var Lexer, tok: var Token) = var pos = L.bufpos - var buf = L.buf tokenBegin(tok, pos) var h: Hash = 0 while true: - var c = buf[pos] - if c notin OpChars: break - h = h !& ord(c) - inc(pos) + let c = L.buf[pos] + if c in OpChars: + h = h !& ord(c) + inc(pos) + elif c in UnicodeOperatorStartChars: + let oprLen = unicodeOprLen(L.buf, pos)[0] + if oprLen == 0: break + for i in 0..<oprLen: + h = h !& ord(L.buf[pos]) + inc pos + else: + break endOperator(L, tok, pos, h) tokenEnd(tok, pos-1) # advance pos but don't store it in L.bufpos so the next token (which might # be an operator too) gets the preceding spaces: - tok.strongSpaceB = 0 - while buf[pos] == ' ': + tok.spacing = tok.spacing - {tsTrailing, tsEof} + var trailing = false + while L.buf[pos] == ' ': inc pos - inc tok.strongSpaceB - if buf[pos] in {CR, LF, nimlexbase.EndOfFile}: - tok.strongSpaceB = -1 - -proc newlineFollows*(L: TLexer): bool = - var pos = L.bufpos - var buf = L.buf - while true: - case buf[pos] - of ' ', '\t': - inc(pos) - of CR, LF: - result = true - break - of '#': - inc(pos) - if buf[pos] == '#': inc(pos) - if buf[pos] != '[': return true - else: - break - -proc skipMultiLineComment(L: var TLexer; tok: var TToken; start: int; + trailing = true + if L.buf[pos] in {CR, LF, nimlexbase.EndOfFile}: + tok.spacing.incl(tsEof) + elif trailing: + tok.spacing.incl(tsTrailing) + +proc getPrecedence*(tok: Token): int = + ## Calculates the precedence of the given token. + const + MulPred = 9 + PlusPred = 8 + case tok.tokType + of tkOpr: + let relevantChar = tok.ident.s[0] + + # arrow like? + if tok.ident.s.len > 1 and tok.ident.s[^1] == '>' and + tok.ident.s[^2] in {'-', '~', '='}: return 0 + + template considerAsgn(value: untyped) = + result = if tok.ident.s[^1] == '=': 1 else: value + + case relevantChar + of '$', '^': considerAsgn(10) + of '*', '%', '/', '\\': considerAsgn(MulPred) + of '~': result = 8 + of '+', '-', '|': considerAsgn(PlusPred) + of '&': considerAsgn(7) + of '=', '<', '>', '!': result = 5 + of '.': considerAsgn(6) + of '?': result = 2 + of UnicodeOperatorStartChars: + if tok.ident.s[^1] == '=': + result = 1 + else: + let (len, pred) = unicodeOprLen(cstring(tok.ident.s), 0) + if len != 0: + result = if pred == Mul: MulPred else: PlusPred + else: + result = 2 + else: considerAsgn(2) + of tkDiv, tkMod, tkShl, tkShr: result = 9 + of tkDotDot: result = 6 + of tkIn, tkNotin, tkIs, tkIsnot, tkOf, tkAs, tkFrom: result = 5 + of tkAnd: result = 4 + of tkOr, tkXor, tkPtr, tkRef: result = 3 + else: return -10 + +proc skipMultiLineComment(L: var Lexer; tok: var Token; start: int; isDoc: bool) = var pos = start - var buf = L.buf var toStrip = 0 tokenBegin(tok, pos) # detect the amount of indentation: if isDoc: toStrip = getColNumber(L, pos) - while buf[pos] == ' ': inc pos - if buf[pos] in {CR, LF}: + while L.buf[pos] == ' ': + inc pos + inc toStrip + while L.buf[pos] in {CR, LF}: # skip blank lines pos = handleCRLF(L, pos) - buf = L.buf toStrip = 0 - while buf[pos] == ' ': + while L.buf[pos] == ' ': inc pos inc toStrip var nesting = 0 while true: - case buf[pos] + case L.buf[pos] of '#': if isDoc: - if buf[pos+1] == '#' and buf[pos+2] == '[': + if L.buf[pos+1] == '#' and L.buf[pos+2] == '[': inc nesting tok.literal.add '#' - elif buf[pos+1] == '[': + elif L.buf[pos+1] == '[': inc nesting inc pos of ']': if isDoc: - if buf[pos+1] == '#' and buf[pos+2] == '#': + if L.buf[pos+1] == '#' and L.buf[pos+2] == '#': if nesting == 0: tokenEndIgnore(tok, pos+2) inc(pos, 3) break dec nesting tok.literal.add ']' - elif buf[pos+1] == '#': + elif L.buf[pos+1] == '#': if nesting == 0: tokenEndIgnore(tok, pos+1) inc(pos, 2) @@ -931,14 +1029,12 @@ proc skipMultiLineComment(L: var TLexer; tok: var TToken; start: int; of CR, LF: tokenEndIgnore(tok, pos) pos = handleCRLF(L, pos) - buf = L.buf # strip leading whitespace: when defined(nimpretty): tok.literal.add "\L" if isDoc: when not defined(nimpretty): tok.literal.add "\n" - inc tok.iNumber var c = toStrip - while buf[pos] == ' ' and c > 0: + while L.buf[pos] == ' ' and c > 0: inc pos dec c of nimlexbase.EndOfFile: @@ -946,57 +1042,57 @@ proc skipMultiLineComment(L: var TLexer; tok: var TToken; start: int; lexMessagePos(L, errGenerated, pos, "end of multiline comment expected") break else: - if isDoc or defined(nimpretty): tok.literal.add buf[pos] + if isDoc or defined(nimpretty): tok.literal.add L.buf[pos] inc(pos) L.bufpos = pos when defined(nimpretty): tok.commentOffsetB = L.offsetBase + pos - 1 -proc scanComment(L: var TLexer, tok: var TToken) = +proc scanComment(L: var Lexer, tok: var Token) = var pos = L.bufpos - var buf = L.buf tok.tokType = tkComment - # iNumber contains the number of '\n' in the token - tok.iNumber = 0 - assert buf[pos+1] == '#' + assert L.buf[pos+1] == '#' when defined(nimpretty): - tok.commentOffsetA = L.offsetBase + pos - 1 + tok.commentOffsetA = L.offsetBase + pos - if buf[pos+2] == '[': + if L.buf[pos+2] == '[': skipMultiLineComment(L, tok, pos+3, true) return tokenBegin(tok, pos) inc(pos, 2) var toStrip = 0 - while buf[pos] == ' ': - inc pos - inc toStrip + var stripInit = false while true: - var lastBackslash = -1 - while buf[pos] notin {CR, LF, nimlexbase.EndOfFile}: - if buf[pos] == '\\': lastBackslash = pos+1 - add(tok.literal, buf[pos]) + if not stripInit: # find baseline indentation inside comment + while L.buf[pos] == ' ': + inc pos + inc toStrip + if L.buf[pos] in {CR, LF}: # don't set toStrip in blank comment lines + toStrip = 0 + else: # found first non-whitespace character + stripInit = true + while L.buf[pos] notin {CR, LF, nimlexbase.EndOfFile}: + tok.literal.add(L.buf[pos]) inc(pos) tokenEndIgnore(tok, pos) pos = handleCRLF(L, pos) - buf = L.buf var indent = 0 - while buf[pos] == ' ': + while L.buf[pos] == ' ': inc(pos) inc(indent) - if buf[pos] == '#' and buf[pos+1] == '#': + if L.buf[pos] == '#' and L.buf[pos+1] == '#': tok.literal.add "\n" inc(pos, 2) - var c = toStrip - while buf[pos] == ' ' and c > 0: - inc pos - dec c - inc tok.iNumber + if stripInit: + var c = toStrip + while L.buf[pos] == ' ' and c > 0: + inc pos + dec c else: - if buf[pos] > ' ': + if L.buf[pos] > ' ': L.indentAhead = indent tokenEndIgnore(tok, pos) break @@ -1004,58 +1100,65 @@ proc scanComment(L: var TLexer, tok: var TToken) = when defined(nimpretty): tok.commentOffsetB = L.offsetBase + pos - 1 -proc skip(L: var TLexer, tok: var TToken) = +proc skip(L: var Lexer, tok: var Token) = var pos = L.bufpos - var buf = L.buf tokenBegin(tok, pos) - tok.strongSpaceA = 0 + tok.spacing.excl(tsLeading) when defined(nimpretty): var hasComment = false + var commentIndent = L.currLineIndent tok.commentOffsetA = L.offsetBase + pos tok.commentOffsetB = tok.commentOffsetA + tok.line = -1 while true: - case buf[pos] + case L.buf[pos] of ' ': inc(pos) - inc(tok.strongSpaceA) + tok.spacing.incl(tsLeading) of '\t': - if not L.allowTabs: lexMessagePos(L, errGenerated, pos, "tabulators are not allowed") + lexMessagePos(L, errGenerated, pos, "tabs are not allowed, use spaces instead") inc(pos) of CR, LF: tokenEndPrevious(tok, pos) - when defined(nimpretty): - # we are not yet in a comment, so update the comment token's line information: - if not hasComment: inc tok.line pos = handleCRLF(L, pos) - buf = L.buf var indent = 0 while true: - if buf[pos] == ' ': + if L.buf[pos] == ' ': inc(pos) inc(indent) - elif buf[pos] == '#' and buf[pos+1] == '[': - when defined(nimpretty): hasComment = true + elif L.buf[pos] == '#' and L.buf[pos+1] == '[': + when defined(nimpretty): + hasComment = true + if tok.line < 0: + tok.line = L.lineNumber + commentIndent = indent skipMultiLineComment(L, tok, pos+2, false) pos = L.bufpos - buf = L.buf else: break - tok.strongSpaceA = 0 - if buf[pos] > ' ' and (buf[pos] != '#' or buf[pos+1] == '#'): + tok.spacing.excl(tsLeading) + when defined(nimpretty): + if L.buf[pos] == '#' and tok.line < 0: commentIndent = indent + if L.buf[pos] > ' ' and (L.buf[pos] != '#' or L.buf[pos+1] == '#'): tok.indent = indent L.currLineIndent = indent break of '#': # do not skip documentation comment: - if buf[pos+1] == '#': break - when defined(nimpretty): hasComment = true - if buf[pos+1] == '[': + if L.buf[pos+1] == '#': break + when defined(nimpretty): + hasComment = true + if tok.line < 0: + tok.line = L.lineNumber + + if L.buf[pos+1] == '[': skipMultiLineComment(L, tok, pos+2, false) pos = L.bufpos - buf = L.buf else: tokenBegin(tok, pos) - while buf[pos] notin {CR, LF, nimlexbase.EndOfFile}: inc(pos) + while L.buf[pos] notin {CR, LF, nimlexbase.EndOfFile}: + when defined(nimpretty): tok.literal.add L.buf[pos] + inc(pos) tokenEndIgnore(tok, pos+1) when defined(nimpretty): tok.commentOffsetB = L.offsetBase + pos + 1 @@ -1067,20 +1170,21 @@ proc skip(L: var TLexer, tok: var TToken) = if hasComment: tok.commentOffsetB = L.offsetBase + pos - 1 tok.tokType = tkComment - if gIndentationWidth <= 0: - gIndentationWidth = tok.indent + tok.indent = commentIndent -proc rawGetTok*(L: var TLexer, tok: var TToken) = +proc rawGetTok*(L: var Lexer, tok: var Token) = template atTokenEnd() {.dirty.} = when defined(nimsuggest): + L.previousTokenEnd.line = L.tokenEnd.line + L.previousTokenEnd.col = L.tokenEnd.col + L.tokenEnd.line = tok.line.uint16 + L.tokenEnd.col = getColNumber(L, L.bufpos).int16 # we attach the cursor to the last *strong* token if tok.tokType notin weakTokens: L.previousToken.line = tok.line.uint16 L.previousToken.col = tok.col.int16 - when defined(nimsuggest): - L.cursor = CursorPosition.None - fillToken(tok) + reset(tok) if L.indentAhead >= 0: tok.indent = L.indentAhead L.currLineIndent = L.indentAhead @@ -1092,13 +1196,18 @@ proc rawGetTok*(L: var TLexer, tok: var TToken) = if tok.tokType == tkComment: L.indentAhead = L.currLineIndent return - var c = L.buf[L.bufpos] + let c = L.buf[L.bufpos] tok.line = L.lineNumber tok.col = getColNumber(L, L.bufpos) - if c in SymStartChars - {'r', 'R'}: + if c in SymStartChars - {'r', 'R'} - UnicodeOperatorStartChars: getSymbol(L, tok) else: case c + of UnicodeOperatorStartChars: + if unicodeOprLen(L.buf, L.bufpos)[0] != 0: + getOperator(L, tok) + else: + getSymbol(L, tok) of '#': scanComment(L, tok) of '*': @@ -1115,7 +1224,7 @@ proc rawGetTok*(L: var TLexer, tok: var TToken) = of 'r', 'R': if L.buf[L.bufpos + 1] == '\"': inc(L.bufpos) - getString(L, tok, true) + getString(L, tok, raw) else: getSymbol(L, tok) of '(': @@ -1150,7 +1259,6 @@ proc rawGetTok*(L: var TLexer, tok: var TToken) = if L.fileIdx == L.config.m.trackPos.fileIndex and tok.col+1 == L.config.m.trackPos.col and tok.line == L.config.m.trackPos.line.int and L.config.ideCmd == ideSug: tok.tokType = tkDot - L.cursor = CursorPosition.InToken L.config.m.trackPos.col = tok.col.int16 inc(L.bufpos) atTokenEnd() @@ -1192,10 +1300,10 @@ proc rawGetTok*(L: var TLexer, tok: var TToken) = tok.tokType = tkInvalid lexMessage(L, errGenerated, "invalid token: " & c & " (\\" & $(ord(c)) & ')') of '\"': - # check for extended raw string literal: - var rawMode = L.bufpos > 0 and L.buf[L.bufpos-1] in SymChars - getString(L, tok, rawMode) - if rawMode: + # check for generalized raw string literal: + let mode = if L.bufpos > 0 and L.buf[L.bufpos-1] in SymChars: generalized else: normal + getString(L, tok, mode) + if mode == generalized: # tkRStrLit -> tkGStrLit # tkTripleStrLit -> tkGTripleStrLit inc(tok.tokType, 2) @@ -1207,7 +1315,28 @@ proc rawGetTok*(L: var TLexer, tok: var TToken) = getNumber(L, tok) let c = L.buf[L.bufpos] if c in SymChars+{'_'}: - lexMessage(L, errGenerated, "invalid token: no whitespace between number and identifier") + if c in UnicodeOperatorStartChars and + unicodeOprLen(L.buf, L.bufpos)[0] != 0: + discard + else: + lexMessage(L, errGenerated, "invalid token: no whitespace between number and identifier") + of '-': + if L.buf[L.bufpos+1] in {'0'..'9'} and + (L.bufpos-1 == 0 or L.buf[L.bufpos-1] in UnaryMinusWhitelist): + # x)-23 # binary minus + # ,-23 # unary minus + # \n-78 # unary minus? Yes. + # =-3 # parsed as `=-` anyway + getNumber(L, tok) + let c = L.buf[L.bufpos] + if c in SymChars+{'_'}: + if c in UnicodeOperatorStartChars and + unicodeOprLen(L.buf, L.bufpos)[0] != 0: + discard + else: + lexMessage(L, errGenerated, "invalid token: no whitespace between number and identifier") + else: + getOperator(L, tok) else: if c in OpChars: getOperator(L, tok) @@ -1223,12 +1352,26 @@ proc rawGetTok*(L: var TLexer, tok: var TToken) = proc getIndentWidth*(fileIdx: FileIndex, inputstream: PLLStream; cache: IdentCache; config: ConfigRef): int = - var lex: TLexer - var tok: TToken - initToken(tok) + result = 0 + var lex: Lexer = default(Lexer) + var tok: Token = default(Token) openLexer(lex, fileIdx, inputstream, cache, config) - while true: + var prevToken = tkEof + while tok.tokType != tkEof: rawGetTok(lex, tok) - result = tok.indent - if result > 0 or tok.tokType == tkEof: break + if tok.indent > 0 and prevToken in {tkColon, tkEquals, tkType, tkConst, tkLet, tkVar, tkUsing}: + result = tok.indent + if result > 0: break + prevToken = tok.tokType closeLexer(lex) + +proc getPrecedence*(ident: PIdent): int = + ## assumes ident is binary operator already + let + tokType = + if ident.id in ord(tokKeywordLow) - ord(tkSymbol)..ord(tokKeywordHigh) - ord(tkSymbol): + TokType(ident.id + ord(tkSymbol)) + else: tkOpr + tok = Token(ident: ident, tokType: tokType) + + getPrecedence(tok) |