diff options
Diffstat (limited to 'compiler/lexer.nim')
-rw-r--r-- | compiler/lexer.nim | 174 |
1 files changed, 99 insertions, 75 deletions
diff --git a/compiler/lexer.nim b/compiler/lexer.nim index bca07e500..591561987 100644 --- a/compiler/lexer.nim +++ b/compiler/lexer.nim @@ -17,7 +17,7 @@ import hashes, options, msgs, strutils, platform, idents, nimlexbase, llstream, - wordrecg + wordrecg, configuration const MaxLineLength* = 80 # lines longer than this lead to a warning @@ -60,7 +60,7 @@ type tkCurlyDotLe, tkCurlyDotRi, # {. and .} tkParDotLe, tkParDotRi, # (. and .) tkComma, tkSemiColon, - tkColon, tkColonColon, tkEquals, tkDot, tkDotDot, + tkColon, tkColonColon, tkEquals, tkDot, tkDotDot, tkBracketLeColon, tkOpr, tkComment, tkAccent, tkSpaces, tkInfixOpr, tkPrefixOpr, tkPostfixOpr @@ -98,7 +98,7 @@ const "tkTripleStrLit", "tkGStrLit", "tkGTripleStrLit", "tkCharLit", "(", ")", "[", "]", "{", "}", "[.", ".]", "{.", ".}", "(.", ".)", ",", ";", - ":", "::", "=", ".", "..", + ":", "::", "=", ".", "..", "[:", "tkOpr", "tkComment", "`", "tkSpaces", "tkInfixOpr", "tkPrefixOpr", "tkPostfixOpr"] @@ -131,9 +131,9 @@ type # like 0b01 or r"\L" are unaffected commentOffsetA*, commentOffsetB*: int - TErrorHandler* = proc (info: TLineInfo; msg: TMsgKind; arg: string) + TErrorHandler* = proc (conf: ConfigRef; info: TLineInfo; msg: TMsgKind; arg: string) TLexer* = object of TBaseLexer - fileIdx*: int32 + fileIdx*: FileIndex indentAhead*: int # if > 0 an indendation has already been read # this is needed because scanning comments # needs so much look-ahead @@ -144,13 +144,12 @@ type cache*: IdentCache when defined(nimsuggest): previousToken: TLineInfo + config*: ConfigRef when defined(nimpretty): var gIndentationWidth*: int -var gLinesCompiled*: int # all lines that have been compiled - proc getLineInfo*(L: TLexer, tok: TToken): TLineInfo {.inline.} = result = newLineInfo(L.fileIdx, tok.line, tok.col) when defined(nimpretty): @@ -165,13 +164,12 @@ proc isKeyword*(kind: TTokType): bool = template ones(n): untyped = ((1 shl n)-1) # for utf-8 conversion proc isNimIdentifier*(s: string): bool = - if s[0] in SymStartChars: + let sLen = s.len + if sLen > 0 and s[0] in SymStartChars: var i = 1 - var sLen = s.len while i < sLen: - if s[i] == '_': - inc(i) - if s[i] notin SymChars: return + if s[i] == '_': inc(i) + if i < sLen and s[i] notin SymChars: return inc(i) result = true @@ -192,8 +190,8 @@ proc prettyTok*(tok: TToken): string = if isKeyword(tok.tokType): result = "keyword " & tok.ident.s else: result = tokToStr(tok) -proc printTok*(tok: TToken) = - msgWriteln($tok.line & ":" & $tok.col & "\t" & +proc printTok*(conf: ConfigRef; tok: TToken) = + msgWriteln(conf, $tok.line & ":" & $tok.col & "\t" & TokTypeToStr[tok.tokType] & " " & tokToStr(tok)) proc initToken*(L: var TToken) = @@ -222,8 +220,8 @@ proc fillToken(L: var TToken) = L.commentOffsetA = 0 L.commentOffsetB = 0 -proc openLexer*(lex: var TLexer, fileIdx: int32, inputstream: PLLStream; - cache: IdentCache) = +proc openLexer*(lex: var TLexer, fileIdx: FileIndex, inputstream: PLLStream; + cache: IdentCache; config: ConfigRef) = openBaseLexer(lex, inputstream) lex.fileIdx = fileidx lex.indentAhead = - 1 @@ -232,13 +230,15 @@ proc openLexer*(lex: var TLexer, fileIdx: int32, inputstream: PLLStream; lex.cache = cache when defined(nimsuggest): lex.previousToken.fileIndex = fileIdx + lex.config = config proc openLexer*(lex: var TLexer, filename: string, inputstream: PLLStream; - cache: IdentCache) = - openLexer(lex, filename.fileInfoIdx, inputstream, cache) + cache: IdentCache; config: ConfigRef) = + openLexer(lex, fileInfoIdx(config, filename), inputstream, cache, config) proc closeLexer*(lex: var TLexer) = - inc(gLinesCompiled, lex.lineNumber) + if lex.config != nil: + inc(lex.config.linesCompiled, lex.lineNumber) closeBaseLexer(lex) proc getLineInfo(L: TLexer): TLineInfo = @@ -246,9 +246,9 @@ proc getLineInfo(L: TLexer): TLineInfo = proc dispMessage(L: TLexer; info: TLineInfo; msg: TMsgKind; arg: string) = if L.errorHandler.isNil: - msgs.message(info, msg, arg) + msgs.message(L.config, info, msg, arg) else: - L.errorHandler(info, msg, arg) + L.errorHandler(L.config, info, msg, arg) proc lexMessage*(L: TLexer, msg: TMsgKind, arg = "") = L.dispMessage(getLineInfo(L), msg, arg) @@ -274,7 +274,7 @@ template tokenEnd(tok, pos) {.dirty.} = when defined(nimsuggest): let colB = getColNumber(L, pos)+1 if L.fileIdx == gTrackPos.fileIndex and gTrackPos.col in colA..colB and - L.lineNumber == gTrackPos.line and gIdeCmd in {ideSug, ideCon}: + L.lineNumber == gTrackPos.line.int and L.config.ideCmd in {ideSug, ideCon}: L.cursor = CursorPosition.InToken gTrackPos.col = colA.int16 colA = 0 @@ -285,9 +285,9 @@ template tokenEndIgnore(tok, pos) = when defined(nimsuggest): let colB = getColNumber(L, pos) if L.fileIdx == gTrackPos.fileIndex and gTrackPos.col in colA..colB and - L.lineNumber == gTrackPos.line and gIdeCmd in {ideSug, ideCon}: + L.lineNumber == gTrackPos.line.int and L.config.ideCmd in {ideSug, ideCon}: gTrackPos.fileIndex = trackPosInvalidFileIdx - gTrackPos.line = -1 + gTrackPos.line = 0'u16 colA = 0 when defined(nimpretty): tok.offsetB = L.offsetBase + pos @@ -299,7 +299,7 @@ template tokenEndPrevious(tok, pos) = # the cursor in a string literal or comment: let colB = getColNumber(L, pos) if L.fileIdx == gTrackPos.fileIndex and gTrackPos.col in colA..colB and - L.lineNumber == gTrackPos.line and gIdeCmd in {ideSug, ideCon}: + L.lineNumber == gTrackPos.line.int and L.config.ideCmd in {ideSug, ideCon}: L.cursor = CursorPosition.BeforeToken gTrackPos = L.previousToken gTrackPosAttached = true @@ -311,12 +311,12 @@ template tokenEndPrevious(tok, pos) = # We need to parse the largest uint literal without overflow checks proc unsafeParseUInt(s: string, b: var BiggestInt, start = 0): int = var i = start - if s[i] in {'0'..'9'}: + if i < s.len and s[i] in {'0'..'9'}: b = 0 - while s[i] in {'0'..'9'}: + while i < s.len and s[i] in {'0'..'9'}: b = b * 10 + (ord(s[i]) - ord('0')) inc(i) - while s[i] == '_': inc(i) # underscores are allowed and ignored + while i < s.len and s[i] == '_': inc(i) # underscores are allowed and ignored result = i - start {.pop.} # overflowChecks @@ -341,7 +341,8 @@ proc getNumber(L: var TLexer, result: var TToken) = break if buf[pos] == '_': if buf[pos+1] notin chars: - lexMessage(L, errInvalidToken, "_") + lexMessage(L, errGenerated, + "only single underscores may occur in a token: '__' is invalid") break add(tok.literal, '_') inc(pos) @@ -355,7 +356,7 @@ proc getNumber(L: var TLexer, result: var TToken) = inc(pos) L.bufpos = pos - proc lexMessageLitNum(L: var TLexer, msg: TMsgKind, startpos: int) = + proc lexMessageLitNum(L: var TLexer, msg: string, startpos: int) = # Used to get slightly human friendlier err messages. # Note: the erroneous 'O' char in the character set is intentional const literalishChars = {'A'..'F', 'a'..'f', '0'..'9', 'X', 'x', 'o', 'O', @@ -376,7 +377,7 @@ proc getNumber(L: var TLexer, result: var TToken) = add(t.literal, L.buf[L.bufpos]) matchChars(L, t, {'0'..'9'}) L.bufpos = msgPos - lexMessage(L, msg, t.literal) + lexMessage(L, errGenerated, msg % t.literal) var startpos, endpos: int @@ -398,7 +399,7 @@ proc getNumber(L: var TLexer, result: var TToken) = eatChar(L, result, '0') case L.buf[L.bufpos] of 'O': - lexMessageLitNum(L, errInvalidNumberOctalCode, startpos) + lexMessageLitNum(L, "$1 is not a valid number; did you mean octal? Then use one of '0o', '0c' or '0C'.", startpos) of 'x', 'X': eatChar(L, result, 'x') matchUnderscoreChars(L, result, {'0'..'9', 'a'..'f', 'A'..'F'}) @@ -409,7 +410,7 @@ proc getNumber(L: var TLexer, result: var TToken) = eatChar(L, result, 'b') matchUnderscoreChars(L, result, {'0'..'1'}) else: - internalError(getLineInfo(L), "getNumber") + internalError(L.config, getLineInfo(L), "getNumber") else: matchUnderscoreChars(L, result, {'0'..'9'}) if (L.buf[L.bufpos] == '.') and (L.buf[L.bufpos + 1] in {'0'..'9'}): @@ -464,7 +465,7 @@ proc getNumber(L: var TLexer, result: var TToken) = result.tokType = tkInt8Lit inc(postPos) else: - lexMessageLitNum(L, errInvalidNumber, startpos) + lexMessageLitNum(L, "invalid number: '$1'", startpos) of 'u', 'U': inc(postPos) if (L.buf[postPos] == '6') and (L.buf[postPos + 1] == '4'): @@ -482,12 +483,12 @@ proc getNumber(L: var TLexer, result: var TToken) = else: result.tokType = tkUIntLit else: - lexMessageLitNum(L, errInvalidNumber, startpos) + lexMessageLitNum(L, "invalid number: '$1'", startpos) # Is there still a literalish char awaiting? Then it's an error! if L.buf[postPos] in literalishChars or (L.buf[postPos] == '.' and L.buf[postPos + 1] in {'0'..'9'}): - lexMessageLitNum(L, errInvalidNumber, startpos) + lexMessageLitNum(L, "invalid number: '$1'", startpos) # Third stage, extract actual number L.bufpos = startpos # restore position @@ -528,7 +529,7 @@ proc getNumber(L: var TLexer, result: var TToken) = else: break else: - internalError(getLineInfo(L), "getNumber") + internalError(L.config, getLineInfo(L), "getNumber") case result.tokType of tkIntLit, tkInt64Lit: result.iNumber = xi @@ -545,7 +546,7 @@ proc getNumber(L: var TLexer, result: var TToken) = # XXX: Test this on big endian machine! of tkFloat64Lit, tkFloatLit: result.fNumber = (cast[PFloat64](addr(xi)))[] - else: internalError(getLineInfo(L), "getNumber") + else: internalError(L.config, getLineInfo(L), "getNumber") # Bounds checks. Non decimal literals are allowed to overflow the range of # the datatype as long as their pattern don't overflow _bitwise_, hence @@ -561,7 +562,7 @@ proc getNumber(L: var TLexer, result: var TToken) = if outOfRange: #echo "out of range num: ", result.iNumber, " vs ", xi - lexMessageLitNum(L, errNumberOutOfRange, startpos) + lexMessageLitNum(L, "number out of range: '$1'", startpos) else: case result.tokType @@ -577,19 +578,20 @@ proc getNumber(L: var TLexer, result: var TToken) = result.iNumber = parseBiggestInt(result.literal) # Explicit bounds checks - let outOfRange = case result.tokType: - of tkInt8Lit: (result.iNumber < int8.low or result.iNumber > int8.high) - of tkUInt8Lit: (result.iNumber < BiggestInt(uint8.low) or - result.iNumber > BiggestInt(uint8.high)) - of tkInt16Lit: (result.iNumber < int16.low or result.iNumber > int16.high) - of tkUInt16Lit: (result.iNumber < BiggestInt(uint16.low) or - result.iNumber > BiggestInt(uint16.high)) - of tkInt32Lit: (result.iNumber < int32.low or result.iNumber > int32.high) - of tkUInt32Lit: (result.iNumber < BiggestInt(uint32.low) or - result.iNumber > BiggestInt(uint32.high)) - else: false - - if outOfRange: lexMessageLitNum(L, errNumberOutOfRange, startpos) + let outOfRange = + case result.tokType + of tkInt8Lit: (result.iNumber < int8.low or result.iNumber > int8.high) + of tkUInt8Lit: (result.iNumber < BiggestInt(uint8.low) or + result.iNumber > BiggestInt(uint8.high)) + of tkInt16Lit: (result.iNumber < int16.low or result.iNumber > int16.high) + of tkUInt16Lit: (result.iNumber < BiggestInt(uint16.low) or + result.iNumber > BiggestInt(uint16.high)) + of tkInt32Lit: (result.iNumber < int32.low or result.iNumber > int32.high) + of tkUInt32Lit: (result.iNumber < BiggestInt(uint32.low) or + result.iNumber > BiggestInt(uint32.high)) + else: false + + if outOfRange: lexMessageLitNum(L, "number out of range: '$1'", startpos) # Promote int literal to int64? Not always necessary, but more consistent if result.tokType == tkIntLit: @@ -597,9 +599,9 @@ proc getNumber(L: var TLexer, result: var TToken) = result.tokType = tkInt64Lit except ValueError: - lexMessageLitNum(L, errInvalidNumber, startpos) + lexMessageLitNum(L, "invalid number: '$1'", startpos) except OverflowError, RangeError: - lexMessageLitNum(L, errNumberOutOfRange, startpos) + lexMessageLitNum(L, "number out of range: '$1'", startpos) tokenEnd(result, postPos-1) L.bufpos = postPos @@ -625,7 +627,16 @@ proc getEscapedChar(L: var TLexer, tok: var TToken) = inc(L.bufpos) # skip '\' case L.buf[L.bufpos] of 'n', 'N': - if tok.tokType == tkCharLit: lexMessage(L, errNnotAllowedInCharacter) + if L.config.oldNewlines: + if tok.tokType == tkCharLit: + lexMessage(L, errGenerated, "\\n not allowed in character literal") + add(tok.literal, tnl) + else: + add(tok.literal, '\L') + inc(L.bufpos) + of 'p', 'P': + if tok.tokType == tkCharLit: + lexMessage(L, errGenerated, "\\p not allowed in character literal") add(tok.literal, tnl) inc(L.bufpos) of 'r', 'R', 'c', 'C': @@ -687,8 +698,8 @@ proc getEscapedChar(L: var TLexer, tok: var TToken) = var xi = 0 handleDecChars(L, xi) if (xi <= 255): add(tok.literal, chr(xi)) - else: lexMessage(L, errInvalidCharacterConstant) - else: lexMessage(L, errInvalidCharacterConstant) + else: lexMessage(L, errGenerated, "invalid character constant") + else: lexMessage(L, errGenerated, "invalid character constant") proc newString(s: cstring, len: int): string = ## XXX, how come there is no support for this? @@ -703,7 +714,7 @@ proc handleCRLF(L: var TLexer, pos: int): int = if col > MaxLineLength: lexMessagePos(L, hintLineTooLong, pos) - if optEmbedOrigSrc in gGlobalOptions: + if optEmbedOrigSrc in L.config.globalOptions: let lineStart = cast[ByteAddress](L.buf) + L.lineStart let line = newString(cast[cstring](lineStart), col) addSourceLine(L.fileIdx, line) @@ -747,12 +758,12 @@ proc getString(L: var TLexer, tok: var TToken, rawMode: bool) = tokenEndIgnore(tok, pos) pos = handleCRLF(L, pos) buf = L.buf - add(tok.literal, tnl) + add(tok.literal, "\n") of nimlexbase.EndOfFile: tokenEndIgnore(tok, pos) var line2 = L.lineNumber L.lineNumber = line - lexMessagePos(L, errClosingTripleQuoteExpected, L.lineStart) + lexMessagePos(L, errGenerated, L.lineStart, "closing \"\"\" expected, but end of file reached") L.lineNumber = line2 L.bufpos = pos break @@ -775,7 +786,7 @@ proc getString(L: var TLexer, tok: var TToken, rawMode: bool) = break elif c in {CR, LF, nimlexbase.EndOfFile}: tokenEndIgnore(tok, pos) - lexMessage(L, errClosingQuoteExpected) + lexMessage(L, errGenerated, "closing \" expected") break elif (c == '\\') and not rawMode: L.bufpos = pos @@ -791,12 +802,13 @@ proc getCharacter(L: var TLexer, tok: var TToken) = inc(L.bufpos) # skip ' var c = L.buf[L.bufpos] case c - of '\0'..pred(' '), '\'': lexMessage(L, errInvalidCharacterConstant) + of '\0'..pred(' '), '\'': lexMessage(L, errGenerated, "invalid character literal") of '\\': getEscapedChar(L, tok) else: tok.literal = $c inc(L.bufpos) - if L.buf[L.bufpos] != '\'': lexMessage(L, errMissingFinalQuote) + if L.buf[L.bufpos] != '\'': + lexMessage(L, errGenerated, "missing closing ' for character literal") tokenEndIgnore(tok, L.bufpos) inc(L.bufpos) # skip ' @@ -817,7 +829,7 @@ proc getSymbol(L: var TLexer, tok: var TToken) = inc(pos) of '_': if buf[pos+1] notin SymChars: - lexMessage(L, errInvalidToken, "_") + lexMessage(L, errGenerated, "invalid token: trailing underscore") break inc(pos) else: break @@ -995,13 +1007,17 @@ proc skip(L: var TLexer, tok: var TToken) = var buf = L.buf tokenBegin(tok, pos) tok.strongSpaceA = 0 + when defined(nimpretty): + var hasComment = false + tok.commentOffsetA = L.offsetBase + pos + tok.commentOffsetB = tok.commentOffsetA while true: case buf[pos] of ' ': inc(pos) inc(tok.strongSpaceA) of '\t': - if not L.allowTabs: lexMessagePos(L, errTabulatorsAreNotAllowed, pos) + if not L.allowTabs: lexMessagePos(L, errGenerated, pos, "tabulators are not allowed") inc(pos) of CR, LF: tokenEndPrevious(tok, pos) @@ -1013,6 +1029,7 @@ proc skip(L: var TLexer, tok: var TToken) = inc(pos) inc(indent) elif buf[pos] == '#' and buf[pos+1] == '[': + when defined(nimpretty): hasComment = true skipMultiLineComment(L, tok, pos+2, false) pos = L.bufpos buf = L.buf @@ -1026,14 +1043,11 @@ proc skip(L: var TLexer, tok: var TToken) = of '#': # do not skip documentation comment: if buf[pos+1] == '#': break - when defined(nimpretty): - tok.commentOffsetA = L.offsetBase + pos + when defined(nimpretty): hasComment = true if buf[pos+1] == '[': skipMultiLineComment(L, tok, pos+2, false) pos = L.bufpos buf = L.buf - when defined(nimpretty): - tok.commentOffsetB = L.offsetBase + pos else: tokenBegin(tok, pos) while buf[pos] notin {CR, LF, nimlexbase.EndOfFile}: inc(pos) @@ -1045,6 +1059,9 @@ proc skip(L: var TLexer, tok: var TToken) = tokenEndPrevious(tok, pos-1) L.bufpos = pos when defined(nimpretty): + if hasComment: + tok.commentOffsetB = L.offsetBase + pos + tok.tokType = tkComment if gIndentationWidth <= 0: gIndentationWidth = tok.indent @@ -1053,7 +1070,7 @@ proc rawGetTok*(L: var TLexer, tok: var TToken) = when defined(nimsuggest): # we attach the cursor to the last *strong* token if tok.tokType notin weakTokens: - L.previousToken.line = tok.line.int16 + L.previousToken.line = tok.line.uint16 L.previousToken.col = tok.col.int16 when defined(nimsuggest): @@ -1066,6 +1083,10 @@ proc rawGetTok*(L: var TLexer, tok: var TToken) = else: tok.indent = -1 skip(L, tok) + when defined(nimpretty): + if tok.tokType == tkComment: + L.indentAhead = L.currLineIndent + return var c = L.buf[L.bufpos] tok.line = L.lineNumber tok.col = getColNumber(L, L.bufpos) @@ -1101,7 +1122,7 @@ proc rawGetTok*(L: var TLexer, tok: var TToken) = tok.tokType = tkParLe when defined(nimsuggest): if L.fileIdx == gTrackPos.fileIndex and tok.col < gTrackPos.col and - tok.line == gTrackPos.line and gIdeCmd == ideCon: + tok.line == gTrackPos.line.int and L.config.ideCmd == ideCon: gTrackPos.col = tok.col.int16 of ')': tok.tokType = tkParRi @@ -1111,6 +1132,9 @@ proc rawGetTok*(L: var TLexer, tok: var TToken) = if L.buf[L.bufpos] == '.' and L.buf[L.bufpos+1] != '.': tok.tokType = tkBracketDotLe inc(L.bufpos) + elif L.buf[L.bufpos] == ':': + tok.tokType = tkBracketLeColon + inc(L.bufpos) else: tok.tokType = tkBracketLe of ']': @@ -1119,7 +1143,7 @@ proc rawGetTok*(L: var TLexer, tok: var TToken) = of '.': when defined(nimsuggest): if L.fileIdx == gTrackPos.fileIndex and tok.col+1 == gTrackPos.col and - tok.line == gTrackPos.line and gIdeCmd == ideSug: + tok.line == gTrackPos.line.int and L.config.ideCmd == ideSug: tok.tokType = tkDot L.cursor = CursorPosition.InToken gTrackPos.col = tok.col.int16 @@ -1161,7 +1185,7 @@ proc rawGetTok*(L: var TLexer, tok: var TToken) = else: tok.literal = $c tok.tokType = tkInvalid - lexMessage(L, errInvalidToken, c & " (\\" & $(ord(c)) & ')') + lexMessage(L, errGenerated, "invalid token: " & c & " (\\" & $(ord(c)) & ')') of '\"': # check for extended raw string literal: var rawMode = L.bufpos > 0 and L.buf[L.bufpos-1] in SymChars @@ -1178,7 +1202,7 @@ proc rawGetTok*(L: var TLexer, tok: var TToken) = getNumber(L, tok) let c = L.buf[L.bufpos] if c in SymChars+{'_'}: - lexMessage(L, errInvalidToken, c & " (\\" & $(ord(c)) & ')') + lexMessage(L, errGenerated, "invalid token: no whitespace between number and identifier") else: if c in OpChars: getOperator(L, tok) @@ -1188,6 +1212,6 @@ proc rawGetTok*(L: var TLexer, tok: var TToken) = else: tok.literal = $c tok.tokType = tkInvalid - lexMessage(L, errInvalidToken, c & " (\\" & $(ord(c)) & ')') + lexMessage(L, errGenerated, "invalid token: " & c & " (\\" & $(ord(c)) & ')') inc(L.bufpos) atTokenEnd() |