summary refs log tree commit diff stats
path: root/compiler/lexer.nim
diff options
context:
space:
mode:
Diffstat (limited to 'compiler/lexer.nim')
-rw-r--r--compiler/lexer.nim174
1 files changed, 67 insertions, 107 deletions
diff --git a/compiler/lexer.nim b/compiler/lexer.nim
index 06a6cea67..ad5dd560c 100644
--- a/compiler/lexer.nim
+++ b/compiler/lexer.nim
@@ -16,14 +16,15 @@
 # DOS or Macintosh text files, even when it is not the native format.
 
 import
-  hashes, options, msgs, strutils, platform, idents, nimlexbase, llstream,
-  wordrecg, lineinfos, pathutils, parseutils
+  options, msgs, platform, idents, nimlexbase, llstream,
+  wordrecg, lineinfos, pathutils
+
+import std/[hashes, parseutils, strutils]
 
 when defined(nimPreviewSlimSystem):
   import std/[assertions, formatfloat]
 
 const
-  MaxLineLength* = 80         # lines longer than this lead to a warning
   numChars*: set[char] = {'0'..'9', 'a'..'z', 'A'..'Z'}
   SymChars*: set[char] = {'a'..'z', 'A'..'Z', '0'..'9', '\x80'..'\xFF'}
   SymStartChars*: set[char] = {'a'..'z', 'A'..'Z', '\x80'..'\xFF'}
@@ -93,19 +94,21 @@ type
                               # so that it is the correct default value
     base2, base8, base16
 
-  Token* = object             # a Nim token
-    tokType*: TokType         # the type of the token
-    indent*: int              # the indentation; != -1 if the token has been
-                              # preceded with indentation
-    ident*: PIdent            # the parsed identifier
-    iNumber*: BiggestInt      # the parsed integer literal
-    fNumber*: BiggestFloat    # the parsed floating point literal
-    base*: NumericalBase      # the numerical base; only valid for int
-                              # or float literals
-    strongSpaceA*: int8       # leading spaces of an operator
-    strongSpaceB*: int8       # trailing spaces of an operator
-    literal*: string          # the parsed (string) literal; and
-                              # documentation comments are here too
+  TokenSpacing* = enum
+    tsLeading, tsTrailing, tsEof
+
+  Token* = object                # a Nim token
+    tokType*: TokType            # the type of the token
+    base*: NumericalBase         # the numerical base; only valid for int
+                                 # or float literals
+    spacing*: set[TokenSpacing]  # spaces around token
+    indent*: int                 # the indentation; != -1 if the token has been
+                                 # preceded with indentation
+    ident*: PIdent               # the parsed identifier
+    iNumber*: BiggestInt         # the parsed integer literal
+    fNumber*: BiggestFloat       # the parsed floating point literal
+    literal*: string             # the parsed (string) literal; and
+                                 # documentation comments are here too
     line*, col*: int
     when defined(nimpretty):
       offsetA*, offsetB*: int # used for pretty printing so that literals
@@ -119,11 +122,12 @@ type
                               # this is needed because scanning comments
                               # needs so much look-ahead
     currLineIndent*: int
-    strongSpaces*, allowTabs*: bool
     errorHandler*: ErrorHandler
     cache*: IdentCache
     when defined(nimsuggest):
       previousToken: TLineInfo
+      tokenEnd*: TLineInfo
+      previousTokenEnd*: TLineInfo
     config*: ConfigRef
 
 proc getLineInfo*(L: Lexer, tok: Token): TLineInfo {.inline.} =
@@ -145,9 +149,11 @@ proc isNimIdentifier*(s: string): bool =
     var i = 1
     while i < sLen:
       if s[i] == '_': inc(i)
-      if i < sLen and s[i] notin SymChars: return
+      if i < sLen and s[i] notin SymChars: return false
       inc(i)
     result = true
+  else:
+    result = false
 
 proc `$`*(tok: Token): string =
   case tok.tokType
@@ -169,32 +175,6 @@ proc printTok*(conf: ConfigRef; tok: Token) =
   # xxx factor with toLocation
   msgWriteln(conf, $tok.line & ":" & $tok.col & "\t" & $tok.tokType & " " & $tok)
 
-proc initToken*(L: var Token) =
-  L.tokType = tkInvalid
-  L.iNumber = 0
-  L.indent = 0
-  L.strongSpaceA = 0
-  L.literal = ""
-  L.fNumber = 0.0
-  L.base = base10
-  L.ident = nil
-  when defined(nimpretty):
-    L.commentOffsetA = 0
-    L.commentOffsetB = 0
-
-proc fillToken(L: var Token) =
-  L.tokType = tkInvalid
-  L.iNumber = 0
-  L.indent = 0
-  L.strongSpaceA = 0
-  setLen(L.literal, 0)
-  L.fNumber = 0.0
-  L.base = base10
-  L.ident = nil
-  when defined(nimpretty):
-    L.commentOffsetA = 0
-    L.commentOffsetB = 0
-
 proc openLexer*(lex: var Lexer, fileIdx: FileIndex, inputstream: PLLStream;
                  cache: IdentCache; config: ConfigRef) =
   openBaseLexer(lex, inputstream)
@@ -320,8 +300,7 @@ proc getNumber(L: var Lexer, result: var Token) =
     # Used to get slightly human friendlier err messages.
     const literalishChars = {'A'..'Z', 'a'..'z', '0'..'9', '_', '.', '\''}
     var msgPos = L.bufpos
-    var t: Token
-    t.literal = ""
+    var t = Token(literal: "")
     L.bufpos = startpos # Use L.bufpos as pos because of matchChars
     matchChars(L, t, literalishChars)
     # We must verify +/- specifically so that we're not past the literal
@@ -505,11 +484,11 @@ proc getNumber(L: var Lexer, result: var Token) =
         of tkUInt16Lit: setNumber result.iNumber, xi and 0xffff
         of tkUInt32Lit: setNumber result.iNumber, xi and 0xffffffff
         of tkFloat32Lit:
-          setNumber result.fNumber, (cast[PFloat32](addr(xi)))[]
+          setNumber result.fNumber, (cast[ptr float32](addr(xi)))[]
           # note: this code is endian neutral!
           # XXX: Test this on big endian machine!
         of tkFloat64Lit, tkFloatLit:
-          setNumber result.fNumber, (cast[PFloat64](addr(xi)))[]
+          setNumber result.fNumber, (cast[ptr float64](addr(xi)))[]
         else: internalError(L.config, getLineInfo(L), "getNumber")
 
         # Bounds checks. Non decimal literals are allowed to overflow the range of
@@ -534,8 +513,8 @@ proc getNumber(L: var Lexer, result: var Token) =
         of floatTypes:
           result.fNumber = parseFloat(result.literal)
         of tkUInt64Lit, tkUIntLit:
-          var iNumber: uint64
-          var len: int
+          var iNumber: uint64 = uint64(0)
+          var len: int = 0
           try:
             len = parseBiggestUInt(result.literal, iNumber)
           except ValueError:
@@ -544,8 +523,8 @@ proc getNumber(L: var Lexer, result: var Token) =
             raise newException(ValueError, "invalid integer: " & result.literal)
           result.iNumber = cast[int64](iNumber)
         else:
-          var iNumber: int64
-          var len: int
+          var iNumber: int64 = int64(0)
+          var len: int = 0
           try:
             len = parseBiggestInt(result.literal, iNumber)
           except ValueError:
@@ -732,10 +711,6 @@ proc handleCRLF(L: var Lexer, pos: int): int =
   template registerLine =
     let col = L.getColNumber(pos)
 
-    when not defined(nimpretty):
-      if col > MaxLineLength:
-        lexMessagePos(L, hintLineTooLong, pos)
-
   case L.buf[pos]
   of CR:
     registerLine()
@@ -795,7 +770,7 @@ proc getString(L: var Lexer, tok: var Token, mode: StringMode) =
     if mode != normal: tok.tokType = tkRStrLit
     else: tok.tokType = tkStrLit
     while true:
-      var c = L.buf[pos]
+      let c = L.buf[pos]
       if c == '\"':
         if mode != normal and L.buf[pos+1] == '\"':
           inc(pos, 2)
@@ -821,7 +796,7 @@ proc getCharacter(L: var Lexer; tok: var Token) =
   tokenBegin(tok, L.bufpos)
   let startPos = L.bufpos
   inc(L.bufpos)               # skip '
-  var c = L.buf[L.bufpos]
+  let c = L.buf[L.bufpos]
   case c
   of '\0'..pred(' '), '\'':
     lexMessage(L, errGenerated, "invalid character literal")
@@ -907,7 +882,7 @@ proc getSymbol(L: var Lexer, tok: var Token) =
       inc(pos)
       suspicious = true
     of '\x80'..'\xFF':
-      if c in UnicodeOperatorStartChars and unicodeOperators in L.config.features and unicodeOprLen(L.buf, pos)[0] != 0:
+      if c in UnicodeOperatorStartChars and unicodeOprLen(L.buf, pos)[0] != 0:
         break
       else:
         h = h !& ord(c)
@@ -915,7 +890,7 @@ proc getSymbol(L: var Lexer, tok: var Token) =
     else: break
   tokenEnd(tok, pos-1)
   h = !$h
-  tok.ident = L.cache.getIdent(addr(L.buf[L.bufpos]), pos - L.bufpos, h)
+  tok.ident = L.cache.getIdent(cast[cstring](addr(L.buf[L.bufpos])), pos - L.bufpos, h)
   if (tok.ident.id < ord(tokKeywordLow) - ord(tkSymbol)) or
       (tok.ident.id > ord(tokKeywordHigh) - ord(tkSymbol)):
     tok.tokType = tkSymbol
@@ -929,7 +904,7 @@ proc getSymbol(L: var Lexer, tok: var Token) =
 proc endOperator(L: var Lexer, tok: var Token, pos: int,
                  hash: Hash) {.inline.} =
   var h = !$hash
-  tok.ident = L.cache.getIdent(addr(L.buf[L.bufpos]), pos - L.bufpos, h)
+  tok.ident = L.cache.getIdent(cast[cstring](addr(L.buf[L.bufpos])), pos - L.bufpos, h)
   if (tok.ident.id < oprLow) or (tok.ident.id > oprHigh): tok.tokType = tkOpr
   else: tok.tokType = TokType(tok.ident.id - oprLow + ord(tkColon))
   L.bufpos = pos
@@ -939,11 +914,11 @@ proc getOperator(L: var Lexer, tok: var Token) =
   tokenBegin(tok, pos)
   var h: Hash = 0
   while true:
-    var c = L.buf[pos]
+    let c = L.buf[pos]
     if c in OpChars:
       h = h !& ord(c)
       inc(pos)
-    elif c in UnicodeOperatorStartChars and unicodeOperators in L.config.features:
+    elif c in UnicodeOperatorStartChars:
       let oprLen = unicodeOprLen(L.buf, pos)[0]
       if oprLen == 0: break
       for i in 0..<oprLen:
@@ -955,12 +930,15 @@ proc getOperator(L: var Lexer, tok: var Token) =
   tokenEnd(tok, pos-1)
   # advance pos but don't store it in L.bufpos so the next token (which might
   # be an operator too) gets the preceding spaces:
-  tok.strongSpaceB = 0
+  tok.spacing = tok.spacing - {tsTrailing, tsEof}
+  var trailing = false
   while L.buf[pos] == ' ':
     inc pos
-    inc tok.strongSpaceB
+    trailing = true
   if L.buf[pos] in {CR, LF, nimlexbase.EndOfFile}:
-    tok.strongSpaceB = -1
+    tok.spacing.incl(tsEof)
+  elif trailing:
+    tok.spacing.incl(tsTrailing)
 
 proc getPrecedence*(tok: Token): int =
   ## Calculates the precedence of the given token.
@@ -1004,22 +982,6 @@ proc getPrecedence*(tok: Token): int =
   of tkOr, tkXor, tkPtr, tkRef: result = 3
   else: return -10
 
-proc newlineFollows*(L: Lexer): bool =
-  var pos = L.bufpos
-  while true:
-    case L.buf[pos]
-    of ' ', '\t':
-      inc(pos)
-    of CR, LF:
-      result = true
-      break
-    of '#':
-      inc(pos)
-      if L.buf[pos] == '#': inc(pos)
-      if L.buf[pos] != '[': return true
-    else:
-      break
-
 proc skipMultiLineComment(L: var Lexer; tok: var Token; start: int;
                           isDoc: bool) =
   var pos = start
@@ -1071,7 +1033,6 @@ proc skipMultiLineComment(L: var Lexer; tok: var Token; start: int;
       when defined(nimpretty): tok.literal.add "\L"
       if isDoc:
         when not defined(nimpretty): tok.literal.add "\n"
-        inc tok.iNumber
         var c = toStrip
         while L.buf[pos] == ' ' and c > 0:
           inc pos
@@ -1090,8 +1051,6 @@ proc skipMultiLineComment(L: var Lexer; tok: var Token; start: int;
 proc scanComment(L: var Lexer, tok: var Token) =
   var pos = L.bufpos
   tok.tokType = tkComment
-  # iNumber contains the number of '\n' in the token
-  tok.iNumber = 0
   assert L.buf[pos+1] == '#'
   when defined(nimpretty):
     tok.commentOffsetA = L.offsetBase + pos
@@ -1114,9 +1073,7 @@ proc scanComment(L: var Lexer, tok: var Token) =
         toStrip = 0
       else:  # found first non-whitespace character
         stripInit = true
-    var lastBackslash = -1
     while L.buf[pos] notin {CR, LF, nimlexbase.EndOfFile}:
-      if L.buf[pos] == '\\': lastBackslash = pos+1
       tok.literal.add(L.buf[pos])
       inc(pos)
     tokenEndIgnore(tok, pos)
@@ -1134,7 +1091,6 @@ proc scanComment(L: var Lexer, tok: var Token) =
         while L.buf[pos] == ' ' and c > 0:
           inc pos
           dec c
-        inc tok.iNumber
     else:
       if L.buf[pos] > ' ':
         L.indentAhead = indent
@@ -1147,7 +1103,7 @@ proc scanComment(L: var Lexer, tok: var Token) =
 proc skip(L: var Lexer, tok: var Token) =
   var pos = L.bufpos
   tokenBegin(tok, pos)
-  tok.strongSpaceA = 0
+  tok.spacing.excl(tsLeading)
   when defined(nimpretty):
     var hasComment = false
     var commentIndent = L.currLineIndent
@@ -1158,9 +1114,9 @@ proc skip(L: var Lexer, tok: var Token) =
     case L.buf[pos]
     of ' ':
       inc(pos)
-      inc(tok.strongSpaceA)
+      tok.spacing.incl(tsLeading)
     of '\t':
-      if not L.allowTabs: lexMessagePos(L, errGenerated, pos, "tabs are not allowed, use spaces instead")
+      lexMessagePos(L, errGenerated, pos, "tabs are not allowed, use spaces instead")
       inc(pos)
     of CR, LF:
       tokenEndPrevious(tok, pos)
@@ -1180,7 +1136,7 @@ proc skip(L: var Lexer, tok: var Token) =
           pos = L.bufpos
         else:
           break
-      tok.strongSpaceA = 0
+      tok.spacing.excl(tsLeading)
       when defined(nimpretty):
         if L.buf[pos] == '#' and tok.line < 0: commentIndent = indent
       if L.buf[pos] > ' ' and (L.buf[pos] != '#' or L.buf[pos+1] == '#'):
@@ -1219,12 +1175,16 @@ proc skip(L: var Lexer, tok: var Token) =
 proc rawGetTok*(L: var Lexer, tok: var Token) =
   template atTokenEnd() {.dirty.} =
     when defined(nimsuggest):
+      L.previousTokenEnd.line = L.tokenEnd.line
+      L.previousTokenEnd.col = L.tokenEnd.col
+      L.tokenEnd.line = tok.line.uint16
+      L.tokenEnd.col = getColNumber(L, L.bufpos).int16
       # we attach the cursor to the last *strong* token
       if tok.tokType notin weakTokens:
         L.previousToken.line = tok.line.uint16
         L.previousToken.col = tok.col.int16
 
-  fillToken(tok)
+  reset(tok)
   if L.indentAhead >= 0:
     tok.indent = L.indentAhead
     L.currLineIndent = L.indentAhead
@@ -1236,7 +1196,7 @@ proc rawGetTok*(L: var Lexer, tok: var Token) =
     if tok.tokType == tkComment:
       L.indentAhead = L.currLineIndent
       return
-  var c = L.buf[L.bufpos]
+  let c = L.buf[L.bufpos]
   tok.line = L.lineNumber
   tok.col = getColNumber(L, L.bufpos)
   if c in SymStartChars - {'r', 'R'} - UnicodeOperatorStartChars:
@@ -1244,7 +1204,7 @@ proc rawGetTok*(L: var Lexer, tok: var Token) =
   else:
     case c
     of UnicodeOperatorStartChars:
-      if unicodeOperators in L.config.features and unicodeOprLen(L.buf, L.bufpos)[0] != 0:
+      if unicodeOprLen(L.buf, L.bufpos)[0] != 0:
         getOperator(L, tok)
       else:
         getSymbol(L, tok)
@@ -1355,7 +1315,7 @@ proc rawGetTok*(L: var Lexer, tok: var Token) =
       getNumber(L, tok)
       let c = L.buf[L.bufpos]
       if c in SymChars+{'_'}:
-        if c in UnicodeOperatorStartChars and unicodeOperators in L.config.features and
+        if c in UnicodeOperatorStartChars and
             unicodeOprLen(L.buf, L.bufpos)[0] != 0:
           discard
         else:
@@ -1370,7 +1330,7 @@ proc rawGetTok*(L: var Lexer, tok: var Token) =
         getNumber(L, tok)
         let c = L.buf[L.bufpos]
         if c in SymChars+{'_'}:
-          if c in UnicodeOperatorStartChars and unicodeOperators in L.config.features and
+          if c in UnicodeOperatorStartChars and
               unicodeOprLen(L.buf, L.bufpos)[0] != 0:
             discard
           else:
@@ -1392,9 +1352,9 @@ proc rawGetTok*(L: var Lexer, tok: var Token) =
 
 proc getIndentWidth*(fileIdx: FileIndex, inputstream: PLLStream;
                      cache: IdentCache; config: ConfigRef): int =
-  var lex: Lexer
-  var tok: Token
-  initToken(tok)
+  result = 0
+  var lex: Lexer = default(Lexer)
+  var tok: Token = default(Token)
   openLexer(lex, fileIdx, inputstream, cache, config)
   var prevToken = tkEof
   while tok.tokType != tkEof:
@@ -1407,11 +1367,11 @@ proc getIndentWidth*(fileIdx: FileIndex, inputstream: PLLStream;
 
 proc getPrecedence*(ident: PIdent): int =
   ## assumes ident is binary operator already
-  var tok: Token
-  initToken(tok)
-  tok.ident = ident
-  tok.tokType =
-    if tok.ident.id in ord(tokKeywordLow) - ord(tkSymbol)..ord(tokKeywordHigh) - ord(tkSymbol):
-      TokType(tok.ident.id + ord(tkSymbol))
-    else: tkOpr
+  let
+    tokType =
+      if ident.id in ord(tokKeywordLow) - ord(tkSymbol)..ord(tokKeywordHigh) - ord(tkSymbol):
+        TokType(ident.id + ord(tkSymbol))
+      else: tkOpr
+    tok = Token(ident: ident, tokType: tokType)
+
   getPrecedence(tok)