diff options
Diffstat (limited to 'compiler/lexer.nim')
-rw-r--r-- | compiler/lexer.nim | 174 |
1 files changed, 67 insertions, 107 deletions
diff --git a/compiler/lexer.nim b/compiler/lexer.nim index 06a6cea67..ad5dd560c 100644 --- a/compiler/lexer.nim +++ b/compiler/lexer.nim @@ -16,14 +16,15 @@ # DOS or Macintosh text files, even when it is not the native format. import - hashes, options, msgs, strutils, platform, idents, nimlexbase, llstream, - wordrecg, lineinfos, pathutils, parseutils + options, msgs, platform, idents, nimlexbase, llstream, + wordrecg, lineinfos, pathutils + +import std/[hashes, parseutils, strutils] when defined(nimPreviewSlimSystem): import std/[assertions, formatfloat] const - MaxLineLength* = 80 # lines longer than this lead to a warning numChars*: set[char] = {'0'..'9', 'a'..'z', 'A'..'Z'} SymChars*: set[char] = {'a'..'z', 'A'..'Z', '0'..'9', '\x80'..'\xFF'} SymStartChars*: set[char] = {'a'..'z', 'A'..'Z', '\x80'..'\xFF'} @@ -93,19 +94,21 @@ type # so that it is the correct default value base2, base8, base16 - Token* = object # a Nim token - tokType*: TokType # the type of the token - indent*: int # the indentation; != -1 if the token has been - # preceded with indentation - ident*: PIdent # the parsed identifier - iNumber*: BiggestInt # the parsed integer literal - fNumber*: BiggestFloat # the parsed floating point literal - base*: NumericalBase # the numerical base; only valid for int - # or float literals - strongSpaceA*: int8 # leading spaces of an operator - strongSpaceB*: int8 # trailing spaces of an operator - literal*: string # the parsed (string) literal; and - # documentation comments are here too + TokenSpacing* = enum + tsLeading, tsTrailing, tsEof + + Token* = object # a Nim token + tokType*: TokType # the type of the token + base*: NumericalBase # the numerical base; only valid for int + # or float literals + spacing*: set[TokenSpacing] # spaces around token + indent*: int # the indentation; != -1 if the token has been + # preceded with indentation + ident*: PIdent # the parsed identifier + iNumber*: BiggestInt # the parsed integer literal + fNumber*: BiggestFloat # the parsed floating point literal + literal*: string # the parsed (string) literal; and + # documentation comments are here too line*, col*: int when defined(nimpretty): offsetA*, offsetB*: int # used for pretty printing so that literals @@ -119,11 +122,12 @@ type # this is needed because scanning comments # needs so much look-ahead currLineIndent*: int - strongSpaces*, allowTabs*: bool errorHandler*: ErrorHandler cache*: IdentCache when defined(nimsuggest): previousToken: TLineInfo + tokenEnd*: TLineInfo + previousTokenEnd*: TLineInfo config*: ConfigRef proc getLineInfo*(L: Lexer, tok: Token): TLineInfo {.inline.} = @@ -145,9 +149,11 @@ proc isNimIdentifier*(s: string): bool = var i = 1 while i < sLen: if s[i] == '_': inc(i) - if i < sLen and s[i] notin SymChars: return + if i < sLen and s[i] notin SymChars: return false inc(i) result = true + else: + result = false proc `$`*(tok: Token): string = case tok.tokType @@ -169,32 +175,6 @@ proc printTok*(conf: ConfigRef; tok: Token) = # xxx factor with toLocation msgWriteln(conf, $tok.line & ":" & $tok.col & "\t" & $tok.tokType & " " & $tok) -proc initToken*(L: var Token) = - L.tokType = tkInvalid - L.iNumber = 0 - L.indent = 0 - L.strongSpaceA = 0 - L.literal = "" - L.fNumber = 0.0 - L.base = base10 - L.ident = nil - when defined(nimpretty): - L.commentOffsetA = 0 - L.commentOffsetB = 0 - -proc fillToken(L: var Token) = - L.tokType = tkInvalid - L.iNumber = 0 - L.indent = 0 - L.strongSpaceA = 0 - setLen(L.literal, 0) - L.fNumber = 0.0 - L.base = base10 - L.ident = nil - when defined(nimpretty): - L.commentOffsetA = 0 - L.commentOffsetB = 0 - proc openLexer*(lex: var Lexer, fileIdx: FileIndex, inputstream: PLLStream; cache: IdentCache; config: ConfigRef) = openBaseLexer(lex, inputstream) @@ -320,8 +300,7 @@ proc getNumber(L: var Lexer, result: var Token) = # Used to get slightly human friendlier err messages. const literalishChars = {'A'..'Z', 'a'..'z', '0'..'9', '_', '.', '\''} var msgPos = L.bufpos - var t: Token - t.literal = "" + var t = Token(literal: "") L.bufpos = startpos # Use L.bufpos as pos because of matchChars matchChars(L, t, literalishChars) # We must verify +/- specifically so that we're not past the literal @@ -505,11 +484,11 @@ proc getNumber(L: var Lexer, result: var Token) = of tkUInt16Lit: setNumber result.iNumber, xi and 0xffff of tkUInt32Lit: setNumber result.iNumber, xi and 0xffffffff of tkFloat32Lit: - setNumber result.fNumber, (cast[PFloat32](addr(xi)))[] + setNumber result.fNumber, (cast[ptr float32](addr(xi)))[] # note: this code is endian neutral! # XXX: Test this on big endian machine! of tkFloat64Lit, tkFloatLit: - setNumber result.fNumber, (cast[PFloat64](addr(xi)))[] + setNumber result.fNumber, (cast[ptr float64](addr(xi)))[] else: internalError(L.config, getLineInfo(L), "getNumber") # Bounds checks. Non decimal literals are allowed to overflow the range of @@ -534,8 +513,8 @@ proc getNumber(L: var Lexer, result: var Token) = of floatTypes: result.fNumber = parseFloat(result.literal) of tkUInt64Lit, tkUIntLit: - var iNumber: uint64 - var len: int + var iNumber: uint64 = uint64(0) + var len: int = 0 try: len = parseBiggestUInt(result.literal, iNumber) except ValueError: @@ -544,8 +523,8 @@ proc getNumber(L: var Lexer, result: var Token) = raise newException(ValueError, "invalid integer: " & result.literal) result.iNumber = cast[int64](iNumber) else: - var iNumber: int64 - var len: int + var iNumber: int64 = int64(0) + var len: int = 0 try: len = parseBiggestInt(result.literal, iNumber) except ValueError: @@ -732,10 +711,6 @@ proc handleCRLF(L: var Lexer, pos: int): int = template registerLine = let col = L.getColNumber(pos) - when not defined(nimpretty): - if col > MaxLineLength: - lexMessagePos(L, hintLineTooLong, pos) - case L.buf[pos] of CR: registerLine() @@ -795,7 +770,7 @@ proc getString(L: var Lexer, tok: var Token, mode: StringMode) = if mode != normal: tok.tokType = tkRStrLit else: tok.tokType = tkStrLit while true: - var c = L.buf[pos] + let c = L.buf[pos] if c == '\"': if mode != normal and L.buf[pos+1] == '\"': inc(pos, 2) @@ -821,7 +796,7 @@ proc getCharacter(L: var Lexer; tok: var Token) = tokenBegin(tok, L.bufpos) let startPos = L.bufpos inc(L.bufpos) # skip ' - var c = L.buf[L.bufpos] + let c = L.buf[L.bufpos] case c of '\0'..pred(' '), '\'': lexMessage(L, errGenerated, "invalid character literal") @@ -907,7 +882,7 @@ proc getSymbol(L: var Lexer, tok: var Token) = inc(pos) suspicious = true of '\x80'..'\xFF': - if c in UnicodeOperatorStartChars and unicodeOperators in L.config.features and unicodeOprLen(L.buf, pos)[0] != 0: + if c in UnicodeOperatorStartChars and unicodeOprLen(L.buf, pos)[0] != 0: break else: h = h !& ord(c) @@ -915,7 +890,7 @@ proc getSymbol(L: var Lexer, tok: var Token) = else: break tokenEnd(tok, pos-1) h = !$h - tok.ident = L.cache.getIdent(addr(L.buf[L.bufpos]), pos - L.bufpos, h) + tok.ident = L.cache.getIdent(cast[cstring](addr(L.buf[L.bufpos])), pos - L.bufpos, h) if (tok.ident.id < ord(tokKeywordLow) - ord(tkSymbol)) or (tok.ident.id > ord(tokKeywordHigh) - ord(tkSymbol)): tok.tokType = tkSymbol @@ -929,7 +904,7 @@ proc getSymbol(L: var Lexer, tok: var Token) = proc endOperator(L: var Lexer, tok: var Token, pos: int, hash: Hash) {.inline.} = var h = !$hash - tok.ident = L.cache.getIdent(addr(L.buf[L.bufpos]), pos - L.bufpos, h) + tok.ident = L.cache.getIdent(cast[cstring](addr(L.buf[L.bufpos])), pos - L.bufpos, h) if (tok.ident.id < oprLow) or (tok.ident.id > oprHigh): tok.tokType = tkOpr else: tok.tokType = TokType(tok.ident.id - oprLow + ord(tkColon)) L.bufpos = pos @@ -939,11 +914,11 @@ proc getOperator(L: var Lexer, tok: var Token) = tokenBegin(tok, pos) var h: Hash = 0 while true: - var c = L.buf[pos] + let c = L.buf[pos] if c in OpChars: h = h !& ord(c) inc(pos) - elif c in UnicodeOperatorStartChars and unicodeOperators in L.config.features: + elif c in UnicodeOperatorStartChars: let oprLen = unicodeOprLen(L.buf, pos)[0] if oprLen == 0: break for i in 0..<oprLen: @@ -955,12 +930,15 @@ proc getOperator(L: var Lexer, tok: var Token) = tokenEnd(tok, pos-1) # advance pos but don't store it in L.bufpos so the next token (which might # be an operator too) gets the preceding spaces: - tok.strongSpaceB = 0 + tok.spacing = tok.spacing - {tsTrailing, tsEof} + var trailing = false while L.buf[pos] == ' ': inc pos - inc tok.strongSpaceB + trailing = true if L.buf[pos] in {CR, LF, nimlexbase.EndOfFile}: - tok.strongSpaceB = -1 + tok.spacing.incl(tsEof) + elif trailing: + tok.spacing.incl(tsTrailing) proc getPrecedence*(tok: Token): int = ## Calculates the precedence of the given token. @@ -1004,22 +982,6 @@ proc getPrecedence*(tok: Token): int = of tkOr, tkXor, tkPtr, tkRef: result = 3 else: return -10 -proc newlineFollows*(L: Lexer): bool = - var pos = L.bufpos - while true: - case L.buf[pos] - of ' ', '\t': - inc(pos) - of CR, LF: - result = true - break - of '#': - inc(pos) - if L.buf[pos] == '#': inc(pos) - if L.buf[pos] != '[': return true - else: - break - proc skipMultiLineComment(L: var Lexer; tok: var Token; start: int; isDoc: bool) = var pos = start @@ -1071,7 +1033,6 @@ proc skipMultiLineComment(L: var Lexer; tok: var Token; start: int; when defined(nimpretty): tok.literal.add "\L" if isDoc: when not defined(nimpretty): tok.literal.add "\n" - inc tok.iNumber var c = toStrip while L.buf[pos] == ' ' and c > 0: inc pos @@ -1090,8 +1051,6 @@ proc skipMultiLineComment(L: var Lexer; tok: var Token; start: int; proc scanComment(L: var Lexer, tok: var Token) = var pos = L.bufpos tok.tokType = tkComment - # iNumber contains the number of '\n' in the token - tok.iNumber = 0 assert L.buf[pos+1] == '#' when defined(nimpretty): tok.commentOffsetA = L.offsetBase + pos @@ -1114,9 +1073,7 @@ proc scanComment(L: var Lexer, tok: var Token) = toStrip = 0 else: # found first non-whitespace character stripInit = true - var lastBackslash = -1 while L.buf[pos] notin {CR, LF, nimlexbase.EndOfFile}: - if L.buf[pos] == '\\': lastBackslash = pos+1 tok.literal.add(L.buf[pos]) inc(pos) tokenEndIgnore(tok, pos) @@ -1134,7 +1091,6 @@ proc scanComment(L: var Lexer, tok: var Token) = while L.buf[pos] == ' ' and c > 0: inc pos dec c - inc tok.iNumber else: if L.buf[pos] > ' ': L.indentAhead = indent @@ -1147,7 +1103,7 @@ proc scanComment(L: var Lexer, tok: var Token) = proc skip(L: var Lexer, tok: var Token) = var pos = L.bufpos tokenBegin(tok, pos) - tok.strongSpaceA = 0 + tok.spacing.excl(tsLeading) when defined(nimpretty): var hasComment = false var commentIndent = L.currLineIndent @@ -1158,9 +1114,9 @@ proc skip(L: var Lexer, tok: var Token) = case L.buf[pos] of ' ': inc(pos) - inc(tok.strongSpaceA) + tok.spacing.incl(tsLeading) of '\t': - if not L.allowTabs: lexMessagePos(L, errGenerated, pos, "tabs are not allowed, use spaces instead") + lexMessagePos(L, errGenerated, pos, "tabs are not allowed, use spaces instead") inc(pos) of CR, LF: tokenEndPrevious(tok, pos) @@ -1180,7 +1136,7 @@ proc skip(L: var Lexer, tok: var Token) = pos = L.bufpos else: break - tok.strongSpaceA = 0 + tok.spacing.excl(tsLeading) when defined(nimpretty): if L.buf[pos] == '#' and tok.line < 0: commentIndent = indent if L.buf[pos] > ' ' and (L.buf[pos] != '#' or L.buf[pos+1] == '#'): @@ -1219,12 +1175,16 @@ proc skip(L: var Lexer, tok: var Token) = proc rawGetTok*(L: var Lexer, tok: var Token) = template atTokenEnd() {.dirty.} = when defined(nimsuggest): + L.previousTokenEnd.line = L.tokenEnd.line + L.previousTokenEnd.col = L.tokenEnd.col + L.tokenEnd.line = tok.line.uint16 + L.tokenEnd.col = getColNumber(L, L.bufpos).int16 # we attach the cursor to the last *strong* token if tok.tokType notin weakTokens: L.previousToken.line = tok.line.uint16 L.previousToken.col = tok.col.int16 - fillToken(tok) + reset(tok) if L.indentAhead >= 0: tok.indent = L.indentAhead L.currLineIndent = L.indentAhead @@ -1236,7 +1196,7 @@ proc rawGetTok*(L: var Lexer, tok: var Token) = if tok.tokType == tkComment: L.indentAhead = L.currLineIndent return - var c = L.buf[L.bufpos] + let c = L.buf[L.bufpos] tok.line = L.lineNumber tok.col = getColNumber(L, L.bufpos) if c in SymStartChars - {'r', 'R'} - UnicodeOperatorStartChars: @@ -1244,7 +1204,7 @@ proc rawGetTok*(L: var Lexer, tok: var Token) = else: case c of UnicodeOperatorStartChars: - if unicodeOperators in L.config.features and unicodeOprLen(L.buf, L.bufpos)[0] != 0: + if unicodeOprLen(L.buf, L.bufpos)[0] != 0: getOperator(L, tok) else: getSymbol(L, tok) @@ -1355,7 +1315,7 @@ proc rawGetTok*(L: var Lexer, tok: var Token) = getNumber(L, tok) let c = L.buf[L.bufpos] if c in SymChars+{'_'}: - if c in UnicodeOperatorStartChars and unicodeOperators in L.config.features and + if c in UnicodeOperatorStartChars and unicodeOprLen(L.buf, L.bufpos)[0] != 0: discard else: @@ -1370,7 +1330,7 @@ proc rawGetTok*(L: var Lexer, tok: var Token) = getNumber(L, tok) let c = L.buf[L.bufpos] if c in SymChars+{'_'}: - if c in UnicodeOperatorStartChars and unicodeOperators in L.config.features and + if c in UnicodeOperatorStartChars and unicodeOprLen(L.buf, L.bufpos)[0] != 0: discard else: @@ -1392,9 +1352,9 @@ proc rawGetTok*(L: var Lexer, tok: var Token) = proc getIndentWidth*(fileIdx: FileIndex, inputstream: PLLStream; cache: IdentCache; config: ConfigRef): int = - var lex: Lexer - var tok: Token - initToken(tok) + result = 0 + var lex: Lexer = default(Lexer) + var tok: Token = default(Token) openLexer(lex, fileIdx, inputstream, cache, config) var prevToken = tkEof while tok.tokType != tkEof: @@ -1407,11 +1367,11 @@ proc getIndentWidth*(fileIdx: FileIndex, inputstream: PLLStream; proc getPrecedence*(ident: PIdent): int = ## assumes ident is binary operator already - var tok: Token - initToken(tok) - tok.ident = ident - tok.tokType = - if tok.ident.id in ord(tokKeywordLow) - ord(tkSymbol)..ord(tokKeywordHigh) - ord(tkSymbol): - TokType(tok.ident.id + ord(tkSymbol)) - else: tkOpr + let + tokType = + if ident.id in ord(tokKeywordLow) - ord(tkSymbol)..ord(tokKeywordHigh) - ord(tkSymbol): + TokType(ident.id + ord(tkSymbol)) + else: tkOpr + tok = Token(ident: ident, tokType: tokType) + getPrecedence(tok) |