summary refs log tree commit diff stats
path: root/compiler/lexer.nim
diff options
context:
space:
mode:
Diffstat (limited to 'compiler/lexer.nim')
-rw-r--r--compiler/lexer.nim1283
1 files changed, 713 insertions, 570 deletions
diff --git a/compiler/lexer.nim b/compiler/lexer.nim
index c5afa6e97..ad5dd560c 100644
--- a/compiler/lexer.nim
+++ b/compiler/lexer.nim
@@ -7,64 +7,78 @@
 #    distribution, for details about the copyright.
 #
 
-# This scanner is handwritten for efficiency. I used an elegant buffering
+# This lexer is handwritten for efficiency. I used an elegant buffering
 # scheme which I have not seen anywhere else:
 # We guarantee that a whole line is in the buffer. Thus only when scanning
-# the \n or \r character we have to check wether we need to read in the next
+# the \n or \r character we have to check whether we need to read in the next
 # chunk. (\n or \r already need special handling for incrementing the line
-# counter; choosing both \n and \r allows the scanner to properly read Unix,
+# counter; choosing both \n and \r allows the lexer to properly read Unix,
 # DOS or Macintosh text files, even when it is not the native format.
 
 import
-  hashes, options, msgs, strutils, platform, idents, nimlexbase, llstream,
-  wordrecg, lineinfos
+  options, msgs, platform, idents, nimlexbase, llstream,
+  wordrecg, lineinfos, pathutils
+
+import std/[hashes, parseutils, strutils]
+
+when defined(nimPreviewSlimSystem):
+  import std/[assertions, formatfloat]
 
 const
-  MaxLineLength* = 80         # lines longer than this lead to a warning
   numChars*: set[char] = {'0'..'9', 'a'..'z', 'A'..'Z'}
   SymChars*: set[char] = {'a'..'z', 'A'..'Z', '0'..'9', '\x80'..'\xFF'}
   SymStartChars*: set[char] = {'a'..'z', 'A'..'Z', '\x80'..'\xFF'}
   OpChars*: set[char] = {'+', '-', '*', '/', '\\', '<', '>', '!', '?', '^', '.',
     '|', '=', '%', '&', '$', '@', '~', ':'}
+  UnaryMinusWhitelist = {' ', '\t', '\n', '\r', ',', ';', '(', '[', '{'}
 
 # don't forget to update the 'highlite' module if these charsets should change
 
 type
-  TTokType* = enum
-    tkInvalid, tkEof,         # order is important here!
-    tkSymbol, # keywords:
-    tkAddr, tkAnd, tkAs, tkAsm,
-    tkBind, tkBlock, tkBreak, tkCase, tkCast,
-    tkConcept, tkConst, tkContinue, tkConverter,
-    tkDefer, tkDiscard, tkDistinct, tkDiv, tkDo,
-    tkElif, tkElse, tkEnd, tkEnum, tkExcept, tkExport,
-    tkFinally, tkFor, tkFrom, tkFunc,
-    tkIf, tkImport, tkIn, tkInclude, tkInterface,
-    tkIs, tkIsnot, tkIterator,
-    tkLet,
-    tkMacro, tkMethod, tkMixin, tkMod, tkNil, tkNot, tkNotin,
-    tkObject, tkOf, tkOr, tkOut,
-    tkProc, tkPtr, tkRaise, tkRef, tkReturn,
-    tkShl, tkShr, tkStatic,
-    tkTemplate,
-    tkTry, tkTuple, tkType, tkUsing,
-    tkVar, tkWhen, tkWhile, tkXor,
-    tkYield, # end of keywords
-    tkIntLit, tkInt8Lit, tkInt16Lit, tkInt32Lit, tkInt64Lit,
-    tkUIntLit, tkUInt8Lit, tkUInt16Lit, tkUInt32Lit, tkUInt64Lit,
-    tkFloatLit, tkFloat32Lit, tkFloat64Lit, tkFloat128Lit,
-    tkStrLit, tkRStrLit, tkTripleStrLit,
-    tkGStrLit, tkGTripleStrLit, tkCharLit, tkParLe, tkParRi, tkBracketLe,
-    tkBracketRi, tkCurlyLe, tkCurlyRi,
-    tkBracketDotLe, tkBracketDotRi, # [. and  .]
-    tkCurlyDotLe, tkCurlyDotRi, # {.  and  .}
-    tkParDotLe, tkParDotRi,   # (. and .)
-    tkComma, tkSemiColon,
-    tkColon, tkColonColon, tkEquals, tkDot, tkDotDot, tkBracketLeColon,
-    tkOpr, tkComment, tkAccent,
-    tkSpaces, tkInfixOpr, tkPrefixOpr, tkPostfixOpr
-
-  TTokTypes* = set[TTokType]
+  TokType* = enum
+    tkInvalid = "tkInvalid", tkEof = "[EOF]", # order is important here!
+    tkSymbol = "tkSymbol", # keywords:
+    tkAddr = "addr", tkAnd = "and", tkAs = "as", tkAsm = "asm",
+    tkBind = "bind", tkBlock = "block", tkBreak = "break", tkCase = "case", tkCast = "cast",
+    tkConcept = "concept", tkConst = "const", tkContinue = "continue", tkConverter = "converter",
+    tkDefer = "defer", tkDiscard = "discard", tkDistinct = "distinct", tkDiv = "div", tkDo = "do",
+    tkElif = "elif", tkElse = "else", tkEnd = "end", tkEnum = "enum", tkExcept = "except", tkExport = "export",
+    tkFinally = "finally", tkFor = "for", tkFrom = "from", tkFunc = "func",
+    tkIf = "if", tkImport = "import", tkIn = "in", tkInclude = "include", tkInterface = "interface",
+    tkIs = "is", tkIsnot = "isnot", tkIterator = "iterator",
+    tkLet = "let",
+    tkMacro = "macro", tkMethod = "method", tkMixin = "mixin", tkMod = "mod", tkNil = "nil", tkNot = "not", tkNotin = "notin",
+    tkObject = "object", tkOf = "of", tkOr = "or", tkOut = "out",
+    tkProc = "proc", tkPtr = "ptr", tkRaise = "raise", tkRef = "ref", tkReturn = "return",
+    tkShl = "shl", tkShr = "shr", tkStatic = "static",
+    tkTemplate = "template",
+    tkTry = "try", tkTuple = "tuple", tkType = "type", tkUsing = "using",
+    tkVar = "var", tkWhen = "when", tkWhile = "while", tkXor = "xor",
+    tkYield = "yield", # end of keywords
+
+    tkIntLit = "tkIntLit", tkInt8Lit = "tkInt8Lit", tkInt16Lit = "tkInt16Lit",
+    tkInt32Lit = "tkInt32Lit", tkInt64Lit = "tkInt64Lit",
+    tkUIntLit = "tkUIntLit", tkUInt8Lit = "tkUInt8Lit", tkUInt16Lit = "tkUInt16Lit",
+    tkUInt32Lit = "tkUInt32Lit", tkUInt64Lit = "tkUInt64Lit",
+    tkFloatLit = "tkFloatLit", tkFloat32Lit = "tkFloat32Lit",
+    tkFloat64Lit = "tkFloat64Lit", tkFloat128Lit = "tkFloat128Lit",
+    tkStrLit = "tkStrLit", tkRStrLit = "tkRStrLit", tkTripleStrLit = "tkTripleStrLit",
+    tkGStrLit = "tkGStrLit", tkGTripleStrLit = "tkGTripleStrLit", tkCharLit = "tkCharLit",
+    tkCustomLit = "tkCustomLit",
+
+    tkParLe = "(", tkParRi = ")", tkBracketLe = "[",
+    tkBracketRi = "]", tkCurlyLe = "{", tkCurlyRi = "}",
+    tkBracketDotLe = "[.", tkBracketDotRi = ".]",
+    tkCurlyDotLe = "{.", tkCurlyDotRi = ".}",
+    tkParDotLe = "(.", tkParDotRi = ".)",
+    tkComma = ",", tkSemiColon = ";",
+    tkColon = ":", tkColonColon = "::", tkEquals = "=",
+    tkDot = ".", tkDotDot = "..", tkBracketLeColon = "[:",
+    tkOpr, tkComment, tkAccent = "`",
+    # these are fake tokens used by renderer.nim
+    tkSpaces, tkInfixOpr, tkPrefixOpr, tkPostfixOpr, tkHideableStart, tkHideableEnd
+
+  TokTypes* = set[TokType]
 
 const
   weakTokens = {tkComma, tkSemiColon, tkColon,
@@ -73,84 +87,50 @@ const
     # tokens that should not be considered for previousToken
   tokKeywordLow* = succ(tkSymbol)
   tokKeywordHigh* = pred(tkIntLit)
-  TokTypeToStr*: array[TTokType, string] = ["tkInvalid", "[EOF]",
-    "tkSymbol",
-    "addr", "and", "as", "asm",
-    "bind", "block", "break", "case", "cast",
-    "concept", "const", "continue", "converter",
-    "defer", "discard", "distinct", "div", "do",
-    "elif", "else", "end", "enum", "except", "export",
-    "finally", "for", "from", "func", "if",
-    "import", "in", "include", "interface", "is", "isnot", "iterator",
-    "let",
-    "macro", "method", "mixin", "mod",
-    "nil", "not", "notin", "object", "of", "or",
-    "out", "proc", "ptr", "raise", "ref", "return",
-    "shl", "shr", "static",
-    "template",
-    "try", "tuple", "type", "using",
-    "var", "when", "while", "xor",
-    "yield",
-    "tkIntLit", "tkInt8Lit", "tkInt16Lit", "tkInt32Lit", "tkInt64Lit",
-    "tkUIntLit", "tkUInt8Lit", "tkUInt16Lit", "tkUInt32Lit", "tkUInt64Lit",
-    "tkFloatLit", "tkFloat32Lit", "tkFloat64Lit", "tkFloat128Lit",
-    "tkStrLit", "tkRStrLit",
-    "tkTripleStrLit", "tkGStrLit", "tkGTripleStrLit", "tkCharLit", "(",
-    ")", "[", "]", "{", "}", "[.", ".]", "{.", ".}", "(.", ".)",
-    ",", ";",
-    ":", "::", "=", ".", "..", "[:",
-    "tkOpr", "tkComment", "`",
-    "tkSpaces", "tkInfixOpr",
-    "tkPrefixOpr", "tkPostfixOpr"]
 
 type
-  TNumericalBase* = enum
+  NumericalBase* = enum
     base10,                   # base10 is listed as the first element,
                               # so that it is the correct default value
     base2, base8, base16
 
-  CursorPosition* {.pure.} = enum ## XXX remove this again
-    None, InToken, BeforeToken, AfterToken
-
-  TToken* = object            # a Nim token
-    tokType*: TTokType        # the type of the token
-    indent*: int              # the indentation; != -1 if the token has been
-                              # preceded with indentation
-    ident*: PIdent            # the parsed identifier
-    iNumber*: BiggestInt      # the parsed integer literal
-    fNumber*: BiggestFloat    # the parsed floating point literal
-    base*: TNumericalBase     # the numerical base; only valid for int
-                              # or float literals
-    strongSpaceA*: int8       # leading spaces of an operator
-    strongSpaceB*: int8       # trailing spaces of an operator
-    literal*: string          # the parsed (string) literal; and
-                              # documentation comments are here too
+  TokenSpacing* = enum
+    tsLeading, tsTrailing, tsEof
+
+  Token* = object                # a Nim token
+    tokType*: TokType            # the type of the token
+    base*: NumericalBase         # the numerical base; only valid for int
+                                 # or float literals
+    spacing*: set[TokenSpacing]  # spaces around token
+    indent*: int                 # the indentation; != -1 if the token has been
+                                 # preceded with indentation
+    ident*: PIdent               # the parsed identifier
+    iNumber*: BiggestInt         # the parsed integer literal
+    fNumber*: BiggestFloat       # the parsed floating point literal
+    literal*: string             # the parsed (string) literal; and
+                                 # documentation comments are here too
     line*, col*: int
     when defined(nimpretty):
-      offsetA*, offsetB*: int   # used for pretty printing so that literals
-                                # like 0b01 or  r"\L" are unaffected
+      offsetA*, offsetB*: int # used for pretty printing so that literals
+                              # like 0b01 or  r"\L" are unaffected
       commentOffsetA*, commentOffsetB*: int
 
-  TErrorHandler* = proc (conf: ConfigRef; info: TLineInfo; msg: TMsgKind; arg: string)
-  TLexer* = object of TBaseLexer
+  ErrorHandler* = proc (conf: ConfigRef; info: TLineInfo; msg: TMsgKind; arg: string)
+  Lexer* = object of TBaseLexer
     fileIdx*: FileIndex
-    indentAhead*: int         # if > 0 an indendation has already been read
+    indentAhead*: int         # if > 0 an indentation has already been read
                               # this is needed because scanning comments
                               # needs so much look-ahead
     currLineIndent*: int
-    strongSpaces*, allowTabs*: bool
-    cursor*: CursorPosition
-    errorHandler*: TErrorHandler
+    errorHandler*: ErrorHandler
     cache*: IdentCache
     when defined(nimsuggest):
       previousToken: TLineInfo
+      tokenEnd*: TLineInfo
+      previousTokenEnd*: TLineInfo
     config*: ConfigRef
 
-when defined(nimpretty):
-  var
-    gIndentationWidth*: int
-
-proc getLineInfo*(L: TLexer, tok: TToken): TLineInfo {.inline.} =
+proc getLineInfo*(L: Lexer, tok: Token): TLineInfo {.inline.} =
   result = newLineInfo(L.fileIdx, tok.line, tok.col)
   when defined(nimpretty):
     result.offsetA = tok.offsetA
@@ -158,8 +138,8 @@ proc getLineInfo*(L: TLexer, tok: TToken): TLineInfo {.inline.} =
     result.commentOffsetA = tok.commentOffsetA
     result.commentOffsetB = tok.commentOffsetB
 
-proc isKeyword*(kind: TTokType): bool =
-  result = (kind >= tokKeywordLow) and (kind <= tokKeywordHigh)
+proc isKeyword*(kind: TokType): bool =
+  (kind >= tokKeywordLow) and (kind <= tokKeywordHigh)
 
 template ones(n): untyped = ((1 shl n)-1) # for utf-8 conversion
 
@@ -169,62 +149,37 @@ proc isNimIdentifier*(s: string): bool =
     var i = 1
     while i < sLen:
       if s[i] == '_': inc(i)
-      if i < sLen and s[i] notin SymChars: return
+      if i < sLen and s[i] notin SymChars: return false
       inc(i)
     result = true
+  else:
+    result = false
 
-proc tokToStr*(tok: TToken): string =
+proc `$`*(tok: Token): string =
   case tok.tokType
-  of tkIntLit..tkInt64Lit: result = $tok.iNumber
-  of tkFloatLit..tkFloat64Lit: result = $tok.fNumber
-  of tkInvalid, tkStrLit..tkCharLit, tkComment: result = tok.literal
-  of tkParLe..tkColon, tkEof, tkAccent:
-    result = TokTypeToStr[tok.tokType]
+  of tkIntLit..tkInt64Lit: $tok.iNumber
+  of tkFloatLit..tkFloat64Lit: $tok.fNumber
+  of tkInvalid, tkStrLit..tkCharLit, tkComment: tok.literal
+  of tkParLe..tkColon, tkEof, tkAccent: $tok.tokType
   else:
     if tok.ident != nil:
-      result = tok.ident.s
+      tok.ident.s
     else:
-      result = ""
-
-proc prettyTok*(tok: TToken): string =
-  if isKeyword(tok.tokType): result = "keyword " & tok.ident.s
-  else: result = tokToStr(tok)
-
-proc printTok*(conf: ConfigRef; tok: TToken) =
-  msgWriteln(conf, $tok.line & ":" & $tok.col & "\t" &
-      TokTypeToStr[tok.tokType] & " " & tokToStr(tok))
-
-proc initToken*(L: var TToken) =
-  L.tokType = tkInvalid
-  L.iNumber = 0
-  L.indent = 0
-  L.strongSpaceA = 0
-  L.literal = ""
-  L.fNumber = 0.0
-  L.base = base10
-  L.ident = nil
-  when defined(nimpretty):
-    L.commentOffsetA = 0
-    L.commentOffsetB = 0
-
-proc fillToken(L: var TToken) =
-  L.tokType = tkInvalid
-  L.iNumber = 0
-  L.indent = 0
-  L.strongSpaceA = 0
-  setLen(L.literal, 0)
-  L.fNumber = 0.0
-  L.base = base10
-  L.ident = nil
-  when defined(nimpretty):
-    L.commentOffsetA = 0
-    L.commentOffsetB = 0
+      ""
+
+proc prettyTok*(tok: Token): string =
+  if isKeyword(tok.tokType): "keyword " & tok.ident.s
+  else: $tok
 
-proc openLexer*(lex: var TLexer, fileIdx: FileIndex, inputstream: PLLStream;
+proc printTok*(conf: ConfigRef; tok: Token) =
+  # xxx factor with toLocation
+  msgWriteln(conf, $tok.line & ":" & $tok.col & "\t" & $tok.tokType & " " & $tok)
+
+proc openLexer*(lex: var Lexer, fileIdx: FileIndex, inputstream: PLLStream;
                  cache: IdentCache; config: ConfigRef) =
   openBaseLexer(lex, inputstream)
-  lex.fileIdx = fileidx
-  lex.indentAhead = - 1
+  lex.fileIdx = fileIdx
+  lex.indentAhead = -1
   lex.currLineIndent = 0
   inc(lex.lineNumber, inputstream.lineOffset)
   lex.cache = cache
@@ -232,36 +187,36 @@ proc openLexer*(lex: var TLexer, fileIdx: FileIndex, inputstream: PLLStream;
     lex.previousToken.fileIndex = fileIdx
   lex.config = config
 
-proc openLexer*(lex: var TLexer, filename: string, inputstream: PLLStream;
+proc openLexer*(lex: var Lexer, filename: AbsoluteFile, inputstream: PLLStream;
                 cache: IdentCache; config: ConfigRef) =
   openLexer(lex, fileInfoIdx(config, filename), inputstream, cache, config)
 
-proc closeLexer*(lex: var TLexer) =
+proc closeLexer*(lex: var Lexer) =
   if lex.config != nil:
     inc(lex.config.linesCompiled, lex.lineNumber)
   closeBaseLexer(lex)
 
-proc getLineInfo(L: TLexer): TLineInfo =
+proc getLineInfo(L: Lexer): TLineInfo =
   result = newLineInfo(L.fileIdx, L.lineNumber, getColNumber(L, L.bufpos))
 
-proc dispMessage(L: TLexer; info: TLineInfo; msg: TMsgKind; arg: string) =
+proc dispMessage(L: Lexer; info: TLineInfo; msg: TMsgKind; arg: string) =
   if L.errorHandler.isNil:
     msgs.message(L.config, info, msg, arg)
   else:
     L.errorHandler(L.config, info, msg, arg)
 
-proc lexMessage*(L: TLexer, msg: TMsgKind, arg = "") =
+proc lexMessage*(L: Lexer, msg: TMsgKind, arg = "") =
   L.dispMessage(getLineInfo(L), msg, arg)
 
-proc lexMessageTok*(L: TLexer, msg: TMsgKind, tok: TToken, arg = "") =
+proc lexMessageTok*(L: Lexer, msg: TMsgKind, tok: Token, arg = "") =
   var info = newLineInfo(L.fileIdx, tok.line, tok.col)
   L.dispMessage(info, msg, arg)
 
-proc lexMessagePos(L: var TLexer, msg: TMsgKind, pos: int, arg = "") =
+proc lexMessagePos(L: var Lexer, msg: TMsgKind, pos: int, arg = "") =
   var info = newLineInfo(L.fileIdx, L.lineNumber, pos - L.lineStart)
   L.dispMessage(info, msg, arg)
 
-proc matchTwoChars(L: TLexer, first: char, second: set[char]): bool =
+proc matchTwoChars(L: Lexer, first: char, second: set[char]): bool =
   result = (L.buf[L.bufpos] == first) and (L.buf[L.bufpos + 1] in second)
 
 template tokenBegin(tok, pos) {.dirty.} =
@@ -275,7 +230,6 @@ template tokenEnd(tok, pos) {.dirty.} =
     let colB = getColNumber(L, pos)+1
     if L.fileIdx == L.config.m.trackPos.fileIndex and L.config.m.trackPos.col in colA..colB and
         L.lineNumber == L.config.m.trackPos.line.int and L.config.ideCmd in {ideSug, ideCon}:
-      L.cursor = CursorPosition.InToken
       L.config.m.trackPos.col = colA.int16
     colA = 0
   when defined(nimpretty):
@@ -300,312 +254,317 @@ template tokenEndPrevious(tok, pos) =
     let colB = getColNumber(L, pos)
     if L.fileIdx == L.config.m.trackPos.fileIndex and L.config.m.trackPos.col in colA..colB and
         L.lineNumber == L.config.m.trackPos.line.int and L.config.ideCmd in {ideSug, ideCon}:
-      L.cursor = CursorPosition.BeforeToken
       L.config.m.trackPos = L.previousToken
       L.config.m.trackPosAttached = true
     colA = 0
   when defined(nimpretty):
     tok.offsetB = L.offsetBase + pos
 
-{.push overflowChecks: off.}
-# We need to parse the largest uint literal without overflow checks
-proc unsafeParseUInt(s: string, b: var BiggestInt, start = 0): int =
-  var i = start
-  if i < s.len and s[i] in {'0'..'9'}:
-    b = 0
-    while i < s.len and s[i] in {'0'..'9'}:
-      b = b * 10 + (ord(s[i]) - ord('0'))
-      inc(i)
-      while i < s.len and s[i] == '_': inc(i) # underscores are allowed and ignored
-    result = i - start
-{.pop.} # overflowChecks
-
-
-template eatChar(L: var TLexer, t: var TToken, replacementChar: char) =
-  add(t.literal, replacementChar)
+template eatChar(L: var Lexer, t: var Token, replacementChar: char) =
+  t.literal.add(replacementChar)
   inc(L.bufpos)
 
-template eatChar(L: var TLexer, t: var TToken) =
-  add(t.literal, L.buf[L.bufpos])
+template eatChar(L: var Lexer, t: var Token) =
+  t.literal.add(L.buf[L.bufpos])
   inc(L.bufpos)
 
-proc getNumber(L: var TLexer, result: var TToken) =
-  proc matchUnderscoreChars(L: var TLexer, tok: var TToken, chars: set[char]) =
+proc getNumber(L: var Lexer, result: var Token) =
+  proc matchUnderscoreChars(L: var Lexer, tok: var Token, chars: set[char]): Natural =
     var pos = L.bufpos              # use registers for pos, buf
-    var buf = L.buf
+    result = 0
     while true:
-      if buf[pos] in chars:
-        add(tok.literal, buf[pos])
+      if L.buf[pos] in chars:
+        tok.literal.add(L.buf[pos])
         inc(pos)
+        inc(result)
       else:
         break
-      if buf[pos] == '_':
-        if buf[pos+1] notin chars:
+      if L.buf[pos] == '_':
+        if L.buf[pos+1] notin chars:
           lexMessage(L, errGenerated,
-            "only single underscores may occur in a token: '__' is invalid")
+            "only single underscores may occur in a token and token may not " &
+            "end with an underscore: e.g. '1__1' and '1_' are invalid")
           break
-        add(tok.literal, '_')
+        tok.literal.add('_')
         inc(pos)
     L.bufpos = pos
 
-  proc matchChars(L: var TLexer, tok: var TToken, chars: set[char]) =
+  proc matchChars(L: var Lexer, tok: var Token, chars: set[char]) =
     var pos = L.bufpos              # use registers for pos, buf
-    var buf = L.buf
-    while buf[pos] in chars:
-      add(tok.literal, buf[pos])
+    while L.buf[pos] in chars:
+      tok.literal.add(L.buf[pos])
       inc(pos)
     L.bufpos = pos
 
-  proc lexMessageLitNum(L: var TLexer, msg: string, startpos: int) =
+  proc lexMessageLitNum(L: var Lexer, msg: string, startpos: int, msgKind = errGenerated) =
     # Used to get slightly human friendlier err messages.
-    # Note: the erroneous 'O' char in the character set is intentional
-    const literalishChars = {'A'..'F', 'a'..'f', '0'..'9', 'X', 'x', 'o', 'O',
-      'c', 'C', 'b', 'B', '_', '.', '\'', 'd', 'i', 'u'}
+    const literalishChars = {'A'..'Z', 'a'..'z', '0'..'9', '_', '.', '\''}
     var msgPos = L.bufpos
-    var t: TToken
-    t.literal = ""
+    var t = Token(literal: "")
     L.bufpos = startpos # Use L.bufpos as pos because of matchChars
     matchChars(L, t, literalishChars)
     # We must verify +/- specifically so that we're not past the literal
     if  L.buf[L.bufpos] in {'+', '-'} and
         L.buf[L.bufpos - 1] in {'e', 'E'}:
-      add(t.literal, L.buf[L.bufpos])
+      t.literal.add(L.buf[L.bufpos])
       inc(L.bufpos)
       matchChars(L, t, literalishChars)
-    if L.buf[L.bufpos] in {'\'', 'f', 'F', 'd', 'D', 'i', 'I', 'u', 'U'}:
+    if L.buf[L.bufpos] in literalishChars:
+      t.literal.add(L.buf[L.bufpos])
       inc(L.bufpos)
-      add(t.literal, L.buf[L.bufpos])
       matchChars(L, t, {'0'..'9'})
     L.bufpos = msgPos
-    lexMessage(L, errGenerated, msg % t.literal)
+    lexMessage(L, msgKind, msg % t.literal)
 
   var
-    startpos, endpos: int
     xi: BiggestInt
     isBase10 = true
+    numDigits = 0
   const
-    baseCodeChars = {'X', 'x', 'o', 'c', 'C', 'b', 'B'}
+    # 'c', 'C' is deprecated
+    baseCodeChars = {'X', 'x', 'o', 'b', 'B', 'c', 'C'}
     literalishChars = baseCodeChars + {'A'..'F', 'a'..'f', '0'..'9', '_', '\''}
     floatTypes = {tkFloatLit, tkFloat32Lit, tkFloat64Lit, tkFloat128Lit}
   result.tokType = tkIntLit   # int literal until we know better
   result.literal = ""
   result.base = base10
-  startpos = L.bufpos
-  tokenBegin(result, startPos)
+  tokenBegin(result, L.bufpos)
+
+  var isPositive = true
+  if L.buf[L.bufpos] == '-':
+    eatChar(L, result)
+    isPositive = false
+
+  let startpos = L.bufpos
+
+  template setNumber(field, value) =
+    field = (if isPositive: value else: -value)
 
   # First stage: find out base, make verifications, build token literal string
-  if L.buf[L.bufpos] == '0' and L.buf[L.bufpos + 1] in baseCodeChars + {'O'}:
+  # {'c', 'C'} is added for deprecation reasons to provide a clear error message
+  if L.buf[L.bufpos] == '0' and L.buf[L.bufpos + 1] in baseCodeChars + {'c', 'C', 'O'}:
     isBase10 = false
     eatChar(L, result, '0')
     case L.buf[L.bufpos]
+    of 'c', 'C':
+      lexMessageLitNum(L,
+                       "$1 will soon be invalid for oct literals; Use '0o' " &
+                       "for octals. 'c', 'C' prefix",
+                       startpos,
+                       warnDeprecated)
+      eatChar(L, result, 'c')
+      numDigits = matchUnderscoreChars(L, result, {'0'..'7'})
     of 'O':
-      lexMessageLitNum(L, "$1 is not a valid number; did you mean octal? Then use one of '0o', '0c' or '0C'.", startpos)
+      lexMessageLitNum(L, "$1 is an invalid int literal; For octal literals " &
+                          "use the '0o' prefix.", startpos)
     of 'x', 'X':
       eatChar(L, result, 'x')
-      matchUnderscoreChars(L, result, {'0'..'9', 'a'..'f', 'A'..'F'})
-    of 'o', 'c', 'C':
-      eatChar(L, result, 'c')
-      matchUnderscoreChars(L, result, {'0'..'7'})
+      numDigits = matchUnderscoreChars(L, result, {'0'..'9', 'a'..'f', 'A'..'F'})
+    of 'o':
+      eatChar(L, result, 'o')
+      numDigits = matchUnderscoreChars(L, result, {'0'..'7'})
     of 'b', 'B':
       eatChar(L, result, 'b')
-      matchUnderscoreChars(L, result, {'0'..'1'})
+      numDigits = matchUnderscoreChars(L, result, {'0'..'1'})
     else:
       internalError(L.config, getLineInfo(L), "getNumber")
+    if numDigits == 0:
+      lexMessageLitNum(L, "invalid number: '$1'", startpos)
   else:
-    matchUnderscoreChars(L, result, {'0'..'9'})
+    discard matchUnderscoreChars(L, result, {'0'..'9'})
     if (L.buf[L.bufpos] == '.') and (L.buf[L.bufpos + 1] in {'0'..'9'}):
       result.tokType = tkFloatLit
       eatChar(L, result, '.')
-      matchUnderscoreChars(L, result, {'0'..'9'})
+      discard matchUnderscoreChars(L, result, {'0'..'9'})
     if L.buf[L.bufpos] in {'e', 'E'}:
       result.tokType = tkFloatLit
-      eatChar(L, result, 'e')
+      eatChar(L, result)
       if L.buf[L.bufpos] in {'+', '-'}:
         eatChar(L, result)
-      matchUnderscoreChars(L, result, {'0'..'9'})
-  endpos = L.bufpos
+      discard matchUnderscoreChars(L, result, {'0'..'9'})
+  let endpos = L.bufpos
 
   # Second stage, find out if there's a datatype suffix and handle it
   var postPos = endpos
+
   if L.buf[postPos] in {'\'', 'f', 'F', 'd', 'D', 'i', 'I', 'u', 'U'}:
+    let errPos = postPos
+    var customLitPossible = false
     if L.buf[postPos] == '\'':
       inc(postPos)
+      customLitPossible = true
 
-    case L.buf[postPos]
-    of 'f', 'F':
-      inc(postPos)
-      if (L.buf[postPos] == '3') and (L.buf[postPos + 1] == '2'):
-        result.tokType = tkFloat32Lit
-        inc(postPos, 2)
-      elif (L.buf[postPos] == '6') and (L.buf[postPos + 1] == '4'):
-        result.tokType = tkFloat64Lit
-        inc(postPos, 2)
-      elif (L.buf[postPos] == '1') and
-           (L.buf[postPos + 1] == '2') and
-           (L.buf[postPos + 2] == '8'):
-        result.tokType = tkFloat128Lit
-        inc(postPos, 3)
-      else:   # "f" alone defaults to float32
-        result.tokType = tkFloat32Lit
-    of 'd', 'D':  # ad hoc convenience shortcut for f64
-      inc(postPos)
-      result.tokType = tkFloat64Lit
-    of 'i', 'I':
-      inc(postPos)
-      if (L.buf[postPos] == '6') and (L.buf[postPos + 1] == '4'):
-        result.tokType = tkInt64Lit
-        inc(postPos, 2)
-      elif (L.buf[postPos] == '3') and (L.buf[postPos + 1] == '2'):
-        result.tokType = tkInt32Lit
-        inc(postPos, 2)
-      elif (L.buf[postPos] == '1') and (L.buf[postPos + 1] == '6'):
-        result.tokType = tkInt16Lit
-        inc(postPos, 2)
-      elif (L.buf[postPos] == '8'):
-        result.tokType = tkInt8Lit
-        inc(postPos)
-      else:
-        lexMessageLitNum(L, "invalid number: '$1'", startpos)
-    of 'u', 'U':
-      inc(postPos)
-      if (L.buf[postPos] == '6') and (L.buf[postPos + 1] == '4'):
-        result.tokType = tkUInt64Lit
-        inc(postPos, 2)
-      elif (L.buf[postPos] == '3') and (L.buf[postPos + 1] == '2'):
-        result.tokType = tkUInt32Lit
-        inc(postPos, 2)
-      elif (L.buf[postPos] == '1') and (L.buf[postPos + 1] == '6'):
-        result.tokType = tkUInt16Lit
-        inc(postPos, 2)
-      elif (L.buf[postPos] == '8'):
-        result.tokType = tkUInt8Lit
-        inc(postPos)
+    if L.buf[postPos] in SymChars:
+      var suffix = newStringOfCap(10)
+      while true:
+        suffix.add L.buf[postPos]
+        inc postPos
+        if L.buf[postPos] notin SymChars+{'_'}: break
+      let suffixAsLower = suffix.toLowerAscii
+      case suffixAsLower
+      of "f", "f32": result.tokType = tkFloat32Lit
+      of "d", "f64": result.tokType = tkFloat64Lit
+      of "f128": result.tokType = tkFloat128Lit
+      of "i8": result.tokType = tkInt8Lit
+      of "i16": result.tokType = tkInt16Lit
+      of "i32": result.tokType = tkInt32Lit
+      of "i64": result.tokType = tkInt64Lit
+      of "u": result.tokType = tkUIntLit
+      of "u8": result.tokType = tkUInt8Lit
+      of "u16": result.tokType = tkUInt16Lit
+      of "u32": result.tokType = tkUInt32Lit
+      of "u64": result.tokType = tkUInt64Lit
+      elif customLitPossible:
+        # remember the position of the `'` so that the parser doesn't
+        # have to reparse the custom literal:
+        result.iNumber = len(result.literal)
+        result.literal.add '\''
+        result.literal.add suffix
+        result.tokType = tkCustomLit
       else:
-        result.tokType = tkUIntLit
+        lexMessageLitNum(L, "invalid number suffix: '$1'", errPos)
     else:
-      lexMessageLitNum(L, "invalid number: '$1'", startpos)
+      lexMessageLitNum(L, "invalid number suffix: '$1'", errPos)
 
   # Is there still a literalish char awaiting? Then it's an error!
   if  L.buf[postPos] in literalishChars or
      (L.buf[postPos] == '.' and L.buf[postPos + 1] in {'0'..'9'}):
     lexMessageLitNum(L, "invalid number: '$1'", startpos)
 
-  # Third stage, extract actual number
-  L.bufpos = startpos            # restore position
-  var pos: int = startpos
-  try:
-    if (L.buf[pos] == '0') and (L.buf[pos + 1] in baseCodeChars):
-      inc(pos, 2)
-      xi = 0                  # it is a base prefix
-
-      case L.buf[pos - 1]
-      of 'b', 'B':
-        result.base = base2
-        while pos < endpos:
-          if L.buf[pos] != '_':
-            xi = `shl`(xi, 1) or (ord(L.buf[pos]) - ord('0'))
-          inc(pos)
-      of 'o', 'c', 'C':
-        result.base = base8
-        while pos < endpos:
-          if L.buf[pos] != '_':
-            xi = `shl`(xi, 3) or (ord(L.buf[pos]) - ord('0'))
-          inc(pos)
-      of 'x', 'X':
-        result.base = base16
-        while pos < endpos:
-          case L.buf[pos]
-          of '_':
-            inc(pos)
-          of '0'..'9':
-            xi = `shl`(xi, 4) or (ord(L.buf[pos]) - ord('0'))
+  if result.tokType != tkCustomLit:
+    # Third stage, extract actual number
+    L.bufpos = startpos            # restore position
+    var pos = startpos
+    try:
+      if (L.buf[pos] == '0') and (L.buf[pos + 1] in baseCodeChars):
+        inc(pos, 2)
+        xi = 0                  # it is a base prefix
+
+        case L.buf[pos - 1]
+        of 'b', 'B':
+          result.base = base2
+          while pos < endpos:
+            if L.buf[pos] != '_':
+              xi = `shl`(xi, 1) or (ord(L.buf[pos]) - ord('0'))
             inc(pos)
-          of 'a'..'f':
-            xi = `shl`(xi, 4) or (ord(L.buf[pos]) - ord('a') + 10)
+        # 'c', 'C' is deprecated (a warning is issued elsewhere)
+        of 'o', 'c', 'C':
+          result.base = base8
+          while pos < endpos:
+            if L.buf[pos] != '_':
+              xi = `shl`(xi, 3) or (ord(L.buf[pos]) - ord('0'))
             inc(pos)
-          of 'A'..'F':
-            xi = `shl`(xi, 4) or (ord(L.buf[pos]) - ord('A') + 10)
-            inc(pos)
-          else:
-            break
+        of 'x', 'X':
+          result.base = base16
+          while pos < endpos:
+            case L.buf[pos]
+            of '_':
+              inc(pos)
+            of '0'..'9':
+              xi = `shl`(xi, 4) or (ord(L.buf[pos]) - ord('0'))
+              inc(pos)
+            of 'a'..'f':
+              xi = `shl`(xi, 4) or (ord(L.buf[pos]) - ord('a') + 10)
+              inc(pos)
+            of 'A'..'F':
+              xi = `shl`(xi, 4) or (ord(L.buf[pos]) - ord('A') + 10)
+              inc(pos)
+            else:
+              break
+        else:
+          internalError(L.config, getLineInfo(L), "getNumber")
+
+        case result.tokType
+        of tkIntLit, tkInt64Lit: setNumber result.iNumber, xi
+        of tkInt8Lit: setNumber result.iNumber, ashr(xi shl 56, 56)
+        of tkInt16Lit: setNumber result.iNumber, ashr(xi shl 48, 48)
+        of tkInt32Lit: setNumber result.iNumber, ashr(xi shl 32, 32)
+        of tkUIntLit, tkUInt64Lit: setNumber result.iNumber, xi
+        of tkUInt8Lit: setNumber result.iNumber, xi and 0xff
+        of tkUInt16Lit: setNumber result.iNumber, xi and 0xffff
+        of tkUInt32Lit: setNumber result.iNumber, xi and 0xffffffff
+        of tkFloat32Lit:
+          setNumber result.fNumber, (cast[ptr float32](addr(xi)))[]
+          # note: this code is endian neutral!
+          # XXX: Test this on big endian machine!
+        of tkFloat64Lit, tkFloatLit:
+          setNumber result.fNumber, (cast[ptr float64](addr(xi)))[]
+        else: internalError(L.config, getLineInfo(L), "getNumber")
+
+        # Bounds checks. Non decimal literals are allowed to overflow the range of
+        # the datatype as long as their pattern don't overflow _bitwise_, hence
+        # below checks of signed sizes against uint*.high is deliberate:
+        # (0x80'u8 = 128, 0x80'i8 = -128, etc == OK)
+        if result.tokType notin floatTypes:
+          let outOfRange =
+            case result.tokType
+            of tkUInt8Lit, tkUInt16Lit, tkUInt32Lit: result.iNumber != xi
+            of tkInt8Lit:  (xi > BiggestInt(uint8.high))
+            of tkInt16Lit: (xi > BiggestInt(uint16.high))
+            of tkInt32Lit: (xi > BiggestInt(uint32.high))
+            else: false
+
+          if outOfRange:
+            #echo "out of range num: ", result.iNumber, " vs ", xi
+            lexMessageLitNum(L, "number out of range: '$1'", startpos)
+
       else:
-        internalError(L.config, getLineInfo(L), "getNumber")
-
-      case result.tokType
-      of tkIntLit, tkInt64Lit: result.iNumber = xi
-      of tkInt8Lit: result.iNumber = BiggestInt(int8(toU8(int(xi))))
-      of tkInt16Lit: result.iNumber = BiggestInt(int16(toU16(int(xi))))
-      of tkInt32Lit: result.iNumber = BiggestInt(int32(toU32(int64(xi))))
-      of tkUIntLit, tkUInt64Lit: result.iNumber = xi
-      of tkUInt8Lit: result.iNumber = BiggestInt(uint8(toU8(int(xi))))
-      of tkUInt16Lit: result.iNumber = BiggestInt(uint16(toU16(int(xi))))
-      of tkUInt32Lit: result.iNumber = BiggestInt(uint32(toU32(int64(xi))))
-      of tkFloat32Lit:
-        result.fNumber = (cast[PFloat32](addr(xi)))[]
-        # note: this code is endian neutral!
-        # XXX: Test this on big endian machine!
-      of tkFloat64Lit, tkFloatLit:
-        result.fNumber = (cast[PFloat64](addr(xi)))[]
-      else: internalError(L.config, getLineInfo(L), "getNumber")
-
-      # Bounds checks. Non decimal literals are allowed to overflow the range of
-      # the datatype as long as their pattern don't overflow _bitwise_, hence
-      # below checks of signed sizes against uint*.high is deliberate:
-      # (0x80'u8 = 128, 0x80'i8 = -128, etc == OK)
-      if result.tokType notin floatTypes:
-        let outOfRange = case result.tokType:
-        of tkUInt8Lit, tkUInt16Lit, tkUInt32Lit: result.iNumber != xi
-        of tkInt8Lit: (xi > BiggestInt(uint8.high))
-        of tkInt16Lit: (xi > BiggestInt(uint16.high))
-        of tkInt32Lit: (xi > BiggestInt(uint32.high))
-        else: false
+        case result.tokType
+        of floatTypes:
+          result.fNumber = parseFloat(result.literal)
+        of tkUInt64Lit, tkUIntLit:
+          var iNumber: uint64 = uint64(0)
+          var len: int = 0
+          try:
+            len = parseBiggestUInt(result.literal, iNumber)
+          except ValueError:
+            raise newException(OverflowDefect, "number out of range: " & result.literal)
+          if len != result.literal.len:
+            raise newException(ValueError, "invalid integer: " & result.literal)
+          result.iNumber = cast[int64](iNumber)
+        else:
+          var iNumber: int64 = int64(0)
+          var len: int = 0
+          try:
+            len = parseBiggestInt(result.literal, iNumber)
+          except ValueError:
+            raise newException(OverflowDefect, "number out of range: " & result.literal)
+          if len != result.literal.len:
+            raise newException(ValueError, "invalid integer: " & result.literal)
+          result.iNumber = iNumber
+
+        # Explicit bounds checks.
+        let outOfRange =
+          case result.tokType
+          of tkInt8Lit: result.iNumber > int8.high or result.iNumber < int8.low
+          of tkUInt8Lit: result.iNumber > BiggestInt(uint8.high) or result.iNumber < 0
+          of tkInt16Lit: result.iNumber > int16.high or result.iNumber < int16.low
+          of tkUInt16Lit: result.iNumber > BiggestInt(uint16.high) or result.iNumber < 0
+          of tkInt32Lit: result.iNumber > int32.high or result.iNumber < int32.low
+          of tkUInt32Lit: result.iNumber > BiggestInt(uint32.high) or result.iNumber < 0
+          else: false
 
         if outOfRange:
-          #echo "out of range num: ", result.iNumber, " vs ", xi
           lexMessageLitNum(L, "number out of range: '$1'", startpos)
 
-    else:
-      case result.tokType
-      of floatTypes:
-        result.fNumber = parseFloat(result.literal)
-      of tkUint64Lit:
-        xi = 0
-        let len = unsafeParseUInt(result.literal, xi)
-        if len != result.literal.len or len == 0:
-          raise newException(ValueError, "invalid integer: " & $xi)
-        result.iNumber = xi
-      else:
-        result.iNumber = parseBiggestInt(result.literal)
+      # Promote int literal to int64? Not always necessary, but more consistent
+      if result.tokType == tkIntLit:
+        if result.iNumber > high(int32) or result.iNumber < low(int32):
+          result.tokType = tkInt64Lit
 
-      # Explicit bounds checks
-      let outOfRange =
-        case result.tokType
-        of tkInt8Lit: (result.iNumber < int8.low or result.iNumber > int8.high)
-        of tkUInt8Lit: (result.iNumber < BiggestInt(uint8.low) or
-                        result.iNumber > BiggestInt(uint8.high))
-        of tkInt16Lit: (result.iNumber < int16.low or result.iNumber > int16.high)
-        of tkUInt16Lit: (result.iNumber < BiggestInt(uint16.low) or
-                        result.iNumber > BiggestInt(uint16.high))
-        of tkInt32Lit: (result.iNumber < int32.low or result.iNumber > int32.high)
-        of tkUInt32Lit: (result.iNumber < BiggestInt(uint32.low) or
-                        result.iNumber > BiggestInt(uint32.high))
-        else: false
-
-      if outOfRange: lexMessageLitNum(L, "number out of range: '$1'", startpos)
-
-    # Promote int literal to int64? Not always necessary, but more consistent
-    if result.tokType == tkIntLit:
-      if (result.iNumber < low(int32)) or (result.iNumber > high(int32)):
-        result.tokType = tkInt64Lit
-
-  except ValueError:
-    lexMessageLitNum(L, "invalid number: '$1'", startpos)
-  except OverflowError, RangeError:
-    lexMessageLitNum(L, "number out of range: '$1'", startpos)
+    except ValueError:
+      lexMessageLitNum(L, "invalid number: '$1'", startpos)
+    except OverflowDefect, RangeDefect:
+      lexMessageLitNum(L, "number out of range: '$1'", startpos)
   tokenEnd(result, postPos-1)
   L.bufpos = postPos
 
-proc handleHexChar(L: var TLexer, xi: var int) =
+proc handleHexChar(L: var Lexer, xi: var int; position: range[0..4]) =
+  template invalid() =
+    lexMessage(L, errGenerated,
+      "expected a hex digit, but found: " & L.buf[L.bufpos] &
+        "; maybe prepend with 0")
+
   case L.buf[L.bufpos]
   of '0'..'9':
     xi = (xi shl 4) or (ord(L.buf[L.bufpos]) - ord('0'))
@@ -616,104 +575,142 @@ proc handleHexChar(L: var TLexer, xi: var int) =
   of 'A'..'F':
     xi = (xi shl 4) or (ord(L.buf[L.bufpos]) - ord('A') + 10)
     inc(L.bufpos)
-  else: discard
+  of '"', '\'':
+    if position <= 1: invalid()
+    # do not progress the bufpos here.
+    if position == 0: inc(L.bufpos)
+  else:
+    invalid()
+    # Need to progress for `nim check`
+    inc(L.bufpos)
 
-proc handleDecChars(L: var TLexer, xi: var int) =
+proc handleDecChars(L: var Lexer, xi: var int) =
   while L.buf[L.bufpos] in {'0'..'9'}:
     xi = (xi * 10) + (ord(L.buf[L.bufpos]) - ord('0'))
     inc(L.bufpos)
 
-proc getEscapedChar(L: var TLexer, tok: var TToken) =
+proc addUnicodeCodePoint(s: var string, i: int) =
+  let i = cast[uint](i)
+  # inlined toUTF-8 to avoid unicode and strutils dependencies.
+  let pos = s.len
+  if i <= 127:
+    s.setLen(pos+1)
+    s[pos+0] = chr(i)
+  elif i <= 0x07FF:
+    s.setLen(pos+2)
+    s[pos+0] = chr((i shr 6) or 0b110_00000)
+    s[pos+1] = chr((i and ones(6)) or 0b10_0000_00)
+  elif i <= 0xFFFF:
+    s.setLen(pos+3)
+    s[pos+0] = chr(i shr 12 or 0b1110_0000)
+    s[pos+1] = chr(i shr 6 and ones(6) or 0b10_0000_00)
+    s[pos+2] = chr(i and ones(6) or 0b10_0000_00)
+  elif i <= 0x001FFFFF:
+    s.setLen(pos+4)
+    s[pos+0] = chr(i shr 18 or 0b1111_0000)
+    s[pos+1] = chr(i shr 12 and ones(6) or 0b10_0000_00)
+    s[pos+2] = chr(i shr 6 and ones(6) or 0b10_0000_00)
+    s[pos+3] = chr(i and ones(6) or 0b10_0000_00)
+  elif i <= 0x03FFFFFF:
+    s.setLen(pos+5)
+    s[pos+0] = chr(i shr 24 or 0b111110_00)
+    s[pos+1] = chr(i shr 18 and ones(6) or 0b10_0000_00)
+    s[pos+2] = chr(i shr 12 and ones(6) or 0b10_0000_00)
+    s[pos+3] = chr(i shr 6 and ones(6) or 0b10_0000_00)
+    s[pos+4] = chr(i and ones(6) or 0b10_0000_00)
+  elif i <= 0x7FFFFFFF:
+    s.setLen(pos+6)
+    s[pos+0] = chr(i shr 30 or 0b1111110_0)
+    s[pos+1] = chr(i shr 24 and ones(6) or 0b10_0000_00)
+    s[pos+2] = chr(i shr 18 and ones(6) or 0b10_0000_00)
+    s[pos+3] = chr(i shr 12 and ones(6) or 0b10_0000_00)
+    s[pos+4] = chr(i shr 6 and ones(6) or 0b10_0000_00)
+    s[pos+5] = chr(i and ones(6) or 0b10_0000_00)
+
+proc getEscapedChar(L: var Lexer, tok: var Token) =
   inc(L.bufpos)               # skip '\'
   case L.buf[L.bufpos]
   of 'n', 'N':
-    if L.config.oldNewlines:
-      if tok.tokType == tkCharLit:
-        lexMessage(L, errGenerated, "\\n not allowed in character literal")
-      add(tok.literal, L.config.target.tnl)
-    else:
-      add(tok.literal, '\L')
+    tok.literal.add('\L')
     inc(L.bufpos)
   of 'p', 'P':
     if tok.tokType == tkCharLit:
       lexMessage(L, errGenerated, "\\p not allowed in character literal")
-    add(tok.literal, L.config.target.tnl)
+    tok.literal.add(L.config.target.tnl)
     inc(L.bufpos)
   of 'r', 'R', 'c', 'C':
-    add(tok.literal, CR)
+    tok.literal.add(CR)
     inc(L.bufpos)
   of 'l', 'L':
-    add(tok.literal, LF)
+    tok.literal.add(LF)
     inc(L.bufpos)
   of 'f', 'F':
-    add(tok.literal, FF)
+    tok.literal.add(FF)
     inc(L.bufpos)
   of 'e', 'E':
-    add(tok.literal, ESC)
+    tok.literal.add(ESC)
     inc(L.bufpos)
   of 'a', 'A':
-    add(tok.literal, BEL)
+    tok.literal.add(BEL)
     inc(L.bufpos)
   of 'b', 'B':
-    add(tok.literal, BACKSPACE)
+    tok.literal.add(BACKSPACE)
     inc(L.bufpos)
   of 'v', 'V':
-    add(tok.literal, VT)
+    tok.literal.add(VT)
     inc(L.bufpos)
   of 't', 'T':
-    add(tok.literal, '\t')
+    tok.literal.add('\t')
     inc(L.bufpos)
   of '\'', '\"':
-    add(tok.literal, L.buf[L.bufpos])
+    tok.literal.add(L.buf[L.bufpos])
     inc(L.bufpos)
   of '\\':
-    add(tok.literal, '\\')
+    tok.literal.add('\\')
     inc(L.bufpos)
-  of 'x', 'X', 'u', 'U':
-    var tp = L.buf[L.bufpos]
+  of 'x', 'X':
     inc(L.bufpos)
     var xi = 0
-    handleHexChar(L, xi)
-    handleHexChar(L, xi)
-    if tp in {'u', 'U'}:
-      handleHexChar(L, xi)
-      handleHexChar(L, xi)
-      # inlined toUTF-8 to avoid unicode and strutils dependencies.
-      if xi <=% 127:
-        add(tok.literal, xi.char )
-      elif xi <=% 0x07FF:
-        add(tok.literal, ((xi shr 6) or 0b110_00000).char )
-        add(tok.literal, ((xi and ones(6)) or 0b10_0000_00).char )
-      elif xi <=% 0xFFFF:
-        add(tok.literal, (xi shr 12 or 0b1110_0000).char )
-        add(tok.literal, (xi shr 6 and ones(6) or 0b10_0000_00).char )
-        add(tok.literal, (xi and ones(6) or 0b10_0000_00).char )
-      else: # value is 0xFFFF
-        add(tok.literal, "\xef\xbf\xbf" )
+    handleHexChar(L, xi, 1)
+    handleHexChar(L, xi, 2)
+    tok.literal.add(chr(xi))
+  of 'u', 'U':
+    if tok.tokType == tkCharLit:
+      lexMessage(L, errGenerated, "\\u not allowed in character literal")
+    inc(L.bufpos)
+    var xi = 0
+    if L.buf[L.bufpos] == '{':
+      inc(L.bufpos)
+      var start = L.bufpos
+      while L.buf[L.bufpos] != '}':
+        handleHexChar(L, xi, 0)
+      if start == L.bufpos:
+        lexMessage(L, errGenerated,
+          "Unicode codepoint cannot be empty")
+      inc(L.bufpos)
+      if xi > 0x10FFFF:
+        let hex = ($L.buf)[start..L.bufpos-2]
+        lexMessage(L, errGenerated,
+          "Unicode codepoint must be lower than 0x10FFFF, but was: " & hex)
     else:
-      add(tok.literal, chr(xi))
+      handleHexChar(L, xi, 1)
+      handleHexChar(L, xi, 2)
+      handleHexChar(L, xi, 3)
+      handleHexChar(L, xi, 4)
+    addUnicodeCodePoint(tok.literal, xi)
   of '0'..'9':
     if matchTwoChars(L, '0', {'0'..'9'}):
       lexMessage(L, warnOctalEscape)
     var xi = 0
     handleDecChars(L, xi)
-    if (xi <= 255): add(tok.literal, chr(xi))
+    if (xi <= 255): tok.literal.add(chr(xi))
     else: lexMessage(L, errGenerated, "invalid character constant")
   else: lexMessage(L, errGenerated, "invalid character constant")
 
-proc newString(s: cstring, len: int): string =
-  ## XXX, how come there is no support for this?
-  result = newString(len)
-  for i in 0 ..< len:
-    result[i] = s[i]
-
-proc handleCRLF(L: var TLexer, pos: int): int =
+proc handleCRLF(L: var Lexer, pos: int): int =
   template registerLine =
     let col = L.getColNumber(pos)
 
-    if col > MaxLineLength:
-      lexMessagePos(L, hintLineTooLong, pos)
-
   case L.buf[pos]
   of CR:
     registerLine()
@@ -723,37 +720,40 @@ proc handleCRLF(L: var TLexer, pos: int): int =
     result = nimlexbase.handleLF(L, pos)
   else: result = pos
 
-proc getString(L: var TLexer, tok: var TToken, rawMode: bool) =
+type
+  StringMode = enum
+    normal,
+    raw,
+    generalized
+
+proc getString(L: var Lexer, tok: var Token, mode: StringMode) =
   var pos = L.bufpos
-  var buf = L.buf                 # put `buf` in a register
   var line = L.lineNumber         # save linenumber for better error message
-  tokenBegin(tok, pos)
+  tokenBegin(tok, pos - ord(mode == raw))
   inc pos # skip "
-  if buf[pos] == '\"' and buf[pos+1] == '\"':
+  if L.buf[pos] == '\"' and L.buf[pos+1] == '\"':
     tok.tokType = tkTripleStrLit # long string literal:
     inc(pos, 2)               # skip ""
     # skip leading newline:
-    if buf[pos] in {' ', '\t'}:
+    if L.buf[pos] in {' ', '\t'}:
       var newpos = pos+1
-      while buf[newpos] in {' ', '\t'}: inc newpos
-      if buf[newpos] in {CR, LF}: pos = newpos
+      while L.buf[newpos] in {' ', '\t'}: inc newpos
+      if L.buf[newpos] in {CR, LF}: pos = newpos
     pos = handleCRLF(L, pos)
-    buf = L.buf
     while true:
-      case buf[pos]
+      case L.buf[pos]
       of '\"':
-        if buf[pos+1] == '\"' and buf[pos+2] == '\"' and
-            buf[pos+3] != '\"':
+        if L.buf[pos+1] == '\"' and L.buf[pos+2] == '\"' and
+            L.buf[pos+3] != '\"':
           tokenEndIgnore(tok, pos+2)
           L.bufpos = pos + 3 # skip the three """
           break
-        add(tok.literal, '\"')
+        tok.literal.add('\"')
         inc(pos)
       of CR, LF:
         tokenEndIgnore(tok, pos)
         pos = handleCRLF(L, pos)
-        buf = L.buf
-        add(tok.literal, "\n")
+        tok.literal.add("\n")
       of nimlexbase.EndOfFile:
         tokenEndIgnore(tok, pos)
         var line2 = L.lineNumber
@@ -763,18 +763,18 @@ proc getString(L: var TLexer, tok: var TToken, rawMode: bool) =
         L.bufpos = pos
         break
       else:
-        add(tok.literal, buf[pos])
+        tok.literal.add(L.buf[pos])
         inc(pos)
   else:
     # ordinary string literal
-    if rawMode: tok.tokType = tkRStrLit
+    if mode != normal: tok.tokType = tkRStrLit
     else: tok.tokType = tkStrLit
     while true:
-      var c = buf[pos]
+      let c = L.buf[pos]
       if c == '\"':
-        if rawMode and buf[pos+1] == '\"':
+        if mode != normal and L.buf[pos+1] == '\"':
           inc(pos, 2)
-          add(tok.literal, '"')
+          tok.literal.add('"')
         else:
           tokenEndIgnore(tok, pos)
           inc(pos) # skip '"'
@@ -783,145 +783,243 @@ proc getString(L: var TLexer, tok: var TToken, rawMode: bool) =
         tokenEndIgnore(tok, pos)
         lexMessage(L, errGenerated, "closing \" expected")
         break
-      elif (c == '\\') and not rawMode:
+      elif (c == '\\') and mode == normal:
         L.bufpos = pos
         getEscapedChar(L, tok)
         pos = L.bufpos
       else:
-        add(tok.literal, c)
+        tok.literal.add(c)
         inc(pos)
     L.bufpos = pos
 
-proc getCharacter(L: var TLexer, tok: var TToken) =
+proc getCharacter(L: var Lexer; tok: var Token) =
   tokenBegin(tok, L.bufpos)
+  let startPos = L.bufpos
   inc(L.bufpos)               # skip '
-  var c = L.buf[L.bufpos]
+  let c = L.buf[L.bufpos]
   case c
-  of '\0'..pred(' '), '\'': lexMessage(L, errGenerated, "invalid character literal")
+  of '\0'..pred(' '), '\'':
+    lexMessage(L, errGenerated, "invalid character literal")
+    tok.literal = $c
   of '\\': getEscapedChar(L, tok)
   else:
     tok.literal = $c
     inc(L.bufpos)
-  if L.buf[L.bufpos] != '\'':
-    lexMessage(L, errGenerated, "missing closing ' for character literal")
-  tokenEndIgnore(tok, L.bufpos)
-  inc(L.bufpos)               # skip '
+  if L.buf[L.bufpos] == '\'':
+    tokenEndIgnore(tok, L.bufpos)
+    inc(L.bufpos)               # skip '
+  else:
+    if startPos > 0 and L.buf[startPos-1] == '`':
+      tok.literal = "'"
+      L.bufpos = startPos+1
+    else:
+      lexMessage(L, errGenerated, "missing closing ' for character literal")
+    tokenEndIgnore(tok, L.bufpos)
+
+const
+  UnicodeOperatorStartChars = {'\226', '\194', '\195'}
+    # the allowed unicode characters ("∙ ∘ × ★ ⊗ ⊘ ⊙ ⊛ ⊠ ⊡ ∩ ∧ ⊓ ± ⊕ ⊖ ⊞ ⊟ ∪ ∨ ⊔")
+    # all start with one of these.
+
+type
+  UnicodeOprPred = enum
+    Mul, Add
+
+proc unicodeOprLen(buf: cstring; pos: int): (int8, UnicodeOprPred) =
+  template m(len): untyped = (int8(len), Mul)
+  template a(len): untyped = (int8(len), Add)
+  result = 0.m
+  case buf[pos]
+  of '\226':
+    if buf[pos+1] == '\136':
+      if buf[pos+2] == '\152': result = 3.m # ∘
+      elif buf[pos+2] == '\153': result = 3.m # ∙
+      elif buf[pos+2] == '\167': result = 3.m # ∧
+      elif buf[pos+2] == '\168': result = 3.a # ∨
+      elif buf[pos+2] == '\169': result = 3.m # ∩
+      elif buf[pos+2] == '\170': result = 3.a # ∪
+    elif buf[pos+1] == '\138':
+      if buf[pos+2] == '\147': result = 3.m # ⊓
+      elif buf[pos+2] == '\148': result = 3.a # ⊔
+      elif buf[pos+2] == '\149': result = 3.a # ⊕
+      elif buf[pos+2] == '\150': result = 3.a # ⊖
+      elif buf[pos+2] == '\151': result = 3.m # ⊗
+      elif buf[pos+2] == '\152': result = 3.m # ⊘
+      elif buf[pos+2] == '\153': result = 3.m # ⊙
+      elif buf[pos+2] == '\155': result = 3.m # ⊛
+      elif buf[pos+2] == '\158': result = 3.a # ⊞
+      elif buf[pos+2] == '\159': result = 3.a # ⊟
+      elif buf[pos+2] == '\160': result = 3.m # ⊠
+      elif buf[pos+2] == '\161': result = 3.m # ⊡
+    elif buf[pos+1] == '\152' and buf[pos+2] == '\133': result = 3.m # ★
+  of '\194':
+    if buf[pos+1] == '\177': result = 2.a # ±
+  of '\195':
+    if buf[pos+1] == '\151': result = 2.m # ×
+  else:
+    discard
 
-proc getSymbol(L: var TLexer, tok: var TToken) =
+proc getSymbol(L: var Lexer, tok: var Token) =
   var h: Hash = 0
   var pos = L.bufpos
-  var buf = L.buf
   tokenBegin(tok, pos)
+  var suspicious = false
   while true:
-    var c = buf[pos]
+    var c = L.buf[pos]
     case c
-    of 'a'..'z', '0'..'9', '\x80'..'\xFF':
+    of 'a'..'z', '0'..'9':
       h = h !& ord(c)
       inc(pos)
     of 'A'..'Z':
       c = chr(ord(c) + (ord('a') - ord('A'))) # toLower()
       h = h !& ord(c)
       inc(pos)
+      suspicious = true
     of '_':
-      if buf[pos+1] notin SymChars:
+      if L.buf[pos+1] notin SymChars:
         lexMessage(L, errGenerated, "invalid token: trailing underscore")
         break
       inc(pos)
+      suspicious = true
+    of '\x80'..'\xFF':
+      if c in UnicodeOperatorStartChars and unicodeOprLen(L.buf, pos)[0] != 0:
+        break
+      else:
+        h = h !& ord(c)
+        inc(pos)
     else: break
   tokenEnd(tok, pos-1)
   h = !$h
-  tok.ident = L.cache.getIdent(addr(L.buf[L.bufpos]), pos - L.bufpos, h)
-  L.bufpos = pos
+  tok.ident = L.cache.getIdent(cast[cstring](addr(L.buf[L.bufpos])), pos - L.bufpos, h)
   if (tok.ident.id < ord(tokKeywordLow) - ord(tkSymbol)) or
       (tok.ident.id > ord(tokKeywordHigh) - ord(tkSymbol)):
     tok.tokType = tkSymbol
   else:
-    tok.tokType = TTokType(tok.ident.id + ord(tkSymbol))
+    tok.tokType = TokType(tok.ident.id + ord(tkSymbol))
+    if suspicious and {optStyleHint, optStyleError} * L.config.globalOptions != {}:
+      lintReport(L.config, getLineInfo(L), tok.ident.s.normalize, tok.ident.s)
+  L.bufpos = pos
+
 
-proc endOperator(L: var TLexer, tok: var TToken, pos: int,
+proc endOperator(L: var Lexer, tok: var Token, pos: int,
                  hash: Hash) {.inline.} =
   var h = !$hash
-  tok.ident = L.cache.getIdent(addr(L.buf[L.bufpos]), pos - L.bufpos, h)
+  tok.ident = L.cache.getIdent(cast[cstring](addr(L.buf[L.bufpos])), pos - L.bufpos, h)
   if (tok.ident.id < oprLow) or (tok.ident.id > oprHigh): tok.tokType = tkOpr
-  else: tok.tokType = TTokType(tok.ident.id - oprLow + ord(tkColon))
+  else: tok.tokType = TokType(tok.ident.id - oprLow + ord(tkColon))
   L.bufpos = pos
 
-proc getOperator(L: var TLexer, tok: var TToken) =
+proc getOperator(L: var Lexer, tok: var Token) =
   var pos = L.bufpos
-  var buf = L.buf
   tokenBegin(tok, pos)
   var h: Hash = 0
   while true:
-    var c = buf[pos]
-    if c notin OpChars: break
-    h = h !& ord(c)
-    inc(pos)
+    let c = L.buf[pos]
+    if c in OpChars:
+      h = h !& ord(c)
+      inc(pos)
+    elif c in UnicodeOperatorStartChars:
+      let oprLen = unicodeOprLen(L.buf, pos)[0]
+      if oprLen == 0: break
+      for i in 0..<oprLen:
+        h = h !& ord(L.buf[pos])
+        inc pos
+    else:
+      break
   endOperator(L, tok, pos, h)
   tokenEnd(tok, pos-1)
   # advance pos but don't store it in L.bufpos so the next token (which might
   # be an operator too) gets the preceding spaces:
-  tok.strongSpaceB = 0
-  while buf[pos] == ' ':
+  tok.spacing = tok.spacing - {tsTrailing, tsEof}
+  var trailing = false
+  while L.buf[pos] == ' ':
     inc pos
-    inc tok.strongSpaceB
-  if buf[pos] in {CR, LF, nimlexbase.EndOfFile}:
-    tok.strongSpaceB = -1
-
-proc newlineFollows*(L: TLexer): bool =
-  var pos = L.bufpos
-  var buf = L.buf
-  while true:
-    case buf[pos]
-    of ' ', '\t':
-      inc(pos)
-    of CR, LF:
-      result = true
-      break
-    of '#':
-      inc(pos)
-      if buf[pos] == '#': inc(pos)
-      if buf[pos] != '[': return true
-    else:
-      break
-
-proc skipMultiLineComment(L: var TLexer; tok: var TToken; start: int;
+    trailing = true
+  if L.buf[pos] in {CR, LF, nimlexbase.EndOfFile}:
+    tok.spacing.incl(tsEof)
+  elif trailing:
+    tok.spacing.incl(tsTrailing)
+
+proc getPrecedence*(tok: Token): int =
+  ## Calculates the precedence of the given token.
+  const
+    MulPred = 9
+    PlusPred = 8
+  case tok.tokType
+  of tkOpr:
+    let relevantChar = tok.ident.s[0]
+
+    # arrow like?
+    if tok.ident.s.len > 1 and tok.ident.s[^1] == '>' and
+      tok.ident.s[^2] in {'-', '~', '='}: return 0
+
+    template considerAsgn(value: untyped) =
+      result = if tok.ident.s[^1] == '=': 1 else: value
+
+    case relevantChar
+    of '$', '^': considerAsgn(10)
+    of '*', '%', '/', '\\': considerAsgn(MulPred)
+    of '~': result = 8
+    of '+', '-', '|': considerAsgn(PlusPred)
+    of '&': considerAsgn(7)
+    of '=', '<', '>', '!': result = 5
+    of '.': considerAsgn(6)
+    of '?': result = 2
+    of UnicodeOperatorStartChars:
+      if tok.ident.s[^1] == '=':
+        result = 1
+      else:
+        let (len, pred) = unicodeOprLen(cstring(tok.ident.s), 0)
+        if len != 0:
+          result = if pred == Mul: MulPred else: PlusPred
+        else:
+          result = 2
+    else: considerAsgn(2)
+  of tkDiv, tkMod, tkShl, tkShr: result = 9
+  of tkDotDot: result = 6
+  of tkIn, tkNotin, tkIs, tkIsnot, tkOf, tkAs, tkFrom: result = 5
+  of tkAnd: result = 4
+  of tkOr, tkXor, tkPtr, tkRef: result = 3
+  else: return -10
+
+proc skipMultiLineComment(L: var Lexer; tok: var Token; start: int;
                           isDoc: bool) =
   var pos = start
-  var buf = L.buf
   var toStrip = 0
   tokenBegin(tok, pos)
   # detect the amount of indentation:
   if isDoc:
     toStrip = getColNumber(L, pos)
-    while buf[pos] == ' ': inc pos
-    if buf[pos] in {CR, LF}:
+    while L.buf[pos] == ' ':
+      inc pos
+      inc toStrip
+    while L.buf[pos] in {CR, LF}:  # skip blank lines
       pos = handleCRLF(L, pos)
-      buf = L.buf
       toStrip = 0
-      while buf[pos] == ' ':
+      while L.buf[pos] == ' ':
         inc pos
         inc toStrip
   var nesting = 0
   while true:
-    case buf[pos]
+    case L.buf[pos]
     of '#':
       if isDoc:
-        if buf[pos+1] == '#' and buf[pos+2] == '[':
+        if L.buf[pos+1] == '#' and L.buf[pos+2] == '[':
           inc nesting
         tok.literal.add '#'
-      elif buf[pos+1] == '[':
+      elif L.buf[pos+1] == '[':
         inc nesting
       inc pos
     of ']':
       if isDoc:
-        if buf[pos+1] == '#' and buf[pos+2] == '#':
+        if L.buf[pos+1] == '#' and L.buf[pos+2] == '#':
           if nesting == 0:
             tokenEndIgnore(tok, pos+2)
             inc(pos, 3)
             break
           dec nesting
         tok.literal.add ']'
-      elif buf[pos+1] == '#':
+      elif L.buf[pos+1] == '#':
         if nesting == 0:
           tokenEndIgnore(tok, pos+1)
           inc(pos, 2)
@@ -931,14 +1029,12 @@ proc skipMultiLineComment(L: var TLexer; tok: var TToken; start: int;
     of CR, LF:
       tokenEndIgnore(tok, pos)
       pos = handleCRLF(L, pos)
-      buf = L.buf
       # strip leading whitespace:
       when defined(nimpretty): tok.literal.add "\L"
       if isDoc:
         when not defined(nimpretty): tok.literal.add "\n"
-        inc tok.iNumber
         var c = toStrip
-        while buf[pos] == ' ' and c > 0:
+        while L.buf[pos] == ' ' and c > 0:
           inc pos
           dec c
     of nimlexbase.EndOfFile:
@@ -946,57 +1042,57 @@ proc skipMultiLineComment(L: var TLexer; tok: var TToken; start: int;
       lexMessagePos(L, errGenerated, pos, "end of multiline comment expected")
       break
     else:
-      if isDoc or defined(nimpretty): tok.literal.add buf[pos]
+      if isDoc or defined(nimpretty): tok.literal.add L.buf[pos]
       inc(pos)
   L.bufpos = pos
   when defined(nimpretty):
     tok.commentOffsetB = L.offsetBase + pos - 1
 
-proc scanComment(L: var TLexer, tok: var TToken) =
+proc scanComment(L: var Lexer, tok: var Token) =
   var pos = L.bufpos
-  var buf = L.buf
   tok.tokType = tkComment
-  # iNumber contains the number of '\n' in the token
-  tok.iNumber = 0
-  assert buf[pos+1] == '#'
+  assert L.buf[pos+1] == '#'
   when defined(nimpretty):
-    tok.commentOffsetA = L.offsetBase + pos - 1
+    tok.commentOffsetA = L.offsetBase + pos
 
-  if buf[pos+2] == '[':
+  if L.buf[pos+2] == '[':
     skipMultiLineComment(L, tok, pos+3, true)
     return
   tokenBegin(tok, pos)
   inc(pos, 2)
 
   var toStrip = 0
-  while buf[pos] == ' ':
-    inc pos
-    inc toStrip
+  var stripInit = false
 
   while true:
-    var lastBackslash = -1
-    while buf[pos] notin {CR, LF, nimlexbase.EndOfFile}:
-      if buf[pos] == '\\': lastBackslash = pos+1
-      add(tok.literal, buf[pos])
+    if not stripInit:  # find baseline indentation inside comment
+      while L.buf[pos] == ' ':
+        inc pos
+        inc toStrip
+      if L.buf[pos] in {CR, LF}:  # don't set toStrip in blank comment lines
+        toStrip = 0
+      else:  # found first non-whitespace character
+        stripInit = true
+    while L.buf[pos] notin {CR, LF, nimlexbase.EndOfFile}:
+      tok.literal.add(L.buf[pos])
       inc(pos)
     tokenEndIgnore(tok, pos)
     pos = handleCRLF(L, pos)
-    buf = L.buf
     var indent = 0
-    while buf[pos] == ' ':
+    while L.buf[pos] == ' ':
       inc(pos)
       inc(indent)
 
-    if buf[pos] == '#' and buf[pos+1] == '#':
+    if L.buf[pos] == '#' and L.buf[pos+1] == '#':
       tok.literal.add "\n"
       inc(pos, 2)
-      var c = toStrip
-      while buf[pos] == ' ' and c > 0:
-        inc pos
-        dec c
-      inc tok.iNumber
+      if stripInit:
+        var c = toStrip
+        while L.buf[pos] == ' ' and c > 0:
+          inc pos
+          dec c
     else:
-      if buf[pos] > ' ':
+      if L.buf[pos] > ' ':
         L.indentAhead = indent
       tokenEndIgnore(tok, pos)
       break
@@ -1004,58 +1100,65 @@ proc scanComment(L: var TLexer, tok: var TToken) =
   when defined(nimpretty):
     tok.commentOffsetB = L.offsetBase + pos - 1
 
-proc skip(L: var TLexer, tok: var TToken) =
+proc skip(L: var Lexer, tok: var Token) =
   var pos = L.bufpos
-  var buf = L.buf
   tokenBegin(tok, pos)
-  tok.strongSpaceA = 0
+  tok.spacing.excl(tsLeading)
   when defined(nimpretty):
     var hasComment = false
+    var commentIndent = L.currLineIndent
     tok.commentOffsetA = L.offsetBase + pos
     tok.commentOffsetB = tok.commentOffsetA
+    tok.line = -1
   while true:
-    case buf[pos]
+    case L.buf[pos]
     of ' ':
       inc(pos)
-      inc(tok.strongSpaceA)
+      tok.spacing.incl(tsLeading)
     of '\t':
-      if not L.allowTabs: lexMessagePos(L, errGenerated, pos, "tabulators are not allowed")
+      lexMessagePos(L, errGenerated, pos, "tabs are not allowed, use spaces instead")
       inc(pos)
     of CR, LF:
       tokenEndPrevious(tok, pos)
-      when defined(nimpretty):
-        # we are not yet in a comment, so update the comment token's line information:
-        if not hasComment: inc tok.line
       pos = handleCRLF(L, pos)
-      buf = L.buf
       var indent = 0
       while true:
-        if buf[pos] == ' ':
+        if L.buf[pos] == ' ':
           inc(pos)
           inc(indent)
-        elif buf[pos] == '#' and buf[pos+1] == '[':
-          when defined(nimpretty): hasComment = true
+        elif L.buf[pos] == '#' and L.buf[pos+1] == '[':
+          when defined(nimpretty):
+            hasComment = true
+            if tok.line < 0:
+              tok.line = L.lineNumber
+              commentIndent = indent
           skipMultiLineComment(L, tok, pos+2, false)
           pos = L.bufpos
-          buf = L.buf
         else:
           break
-      tok.strongSpaceA = 0
-      if buf[pos] > ' ' and (buf[pos] != '#' or buf[pos+1] == '#'):
+      tok.spacing.excl(tsLeading)
+      when defined(nimpretty):
+        if L.buf[pos] == '#' and tok.line < 0: commentIndent = indent
+      if L.buf[pos] > ' ' and (L.buf[pos] != '#' or L.buf[pos+1] == '#'):
         tok.indent = indent
         L.currLineIndent = indent
         break
     of '#':
       # do not skip documentation comment:
-      if buf[pos+1] == '#': break
-      when defined(nimpretty): hasComment = true
-      if buf[pos+1] == '[':
+      if L.buf[pos+1] == '#': break
+      when defined(nimpretty):
+        hasComment = true
+        if tok.line < 0:
+          tok.line = L.lineNumber
+
+      if L.buf[pos+1] == '[':
         skipMultiLineComment(L, tok, pos+2, false)
         pos = L.bufpos
-        buf = L.buf
       else:
         tokenBegin(tok, pos)
-        while buf[pos] notin {CR, LF, nimlexbase.EndOfFile}: inc(pos)
+        while L.buf[pos] notin {CR, LF, nimlexbase.EndOfFile}:
+          when defined(nimpretty): tok.literal.add L.buf[pos]
+          inc(pos)
         tokenEndIgnore(tok, pos+1)
         when defined(nimpretty):
           tok.commentOffsetB = L.offsetBase + pos + 1
@@ -1067,20 +1170,21 @@ proc skip(L: var TLexer, tok: var TToken) =
     if hasComment:
       tok.commentOffsetB = L.offsetBase + pos - 1
       tok.tokType = tkComment
-    if gIndentationWidth <= 0:
-      gIndentationWidth = tok.indent
+      tok.indent = commentIndent
 
-proc rawGetTok*(L: var TLexer, tok: var TToken) =
+proc rawGetTok*(L: var Lexer, tok: var Token) =
   template atTokenEnd() {.dirty.} =
     when defined(nimsuggest):
+      L.previousTokenEnd.line = L.tokenEnd.line
+      L.previousTokenEnd.col = L.tokenEnd.col
+      L.tokenEnd.line = tok.line.uint16
+      L.tokenEnd.col = getColNumber(L, L.bufpos).int16
       # we attach the cursor to the last *strong* token
       if tok.tokType notin weakTokens:
         L.previousToken.line = tok.line.uint16
         L.previousToken.col = tok.col.int16
 
-  when defined(nimsuggest):
-    L.cursor = CursorPosition.None
-  fillToken(tok)
+  reset(tok)
   if L.indentAhead >= 0:
     tok.indent = L.indentAhead
     L.currLineIndent = L.indentAhead
@@ -1092,13 +1196,18 @@ proc rawGetTok*(L: var TLexer, tok: var TToken) =
     if tok.tokType == tkComment:
       L.indentAhead = L.currLineIndent
       return
-  var c = L.buf[L.bufpos]
+  let c = L.buf[L.bufpos]
   tok.line = L.lineNumber
   tok.col = getColNumber(L, L.bufpos)
-  if c in SymStartChars - {'r', 'R'}:
+  if c in SymStartChars - {'r', 'R'} - UnicodeOperatorStartChars:
     getSymbol(L, tok)
   else:
     case c
+    of UnicodeOperatorStartChars:
+      if unicodeOprLen(L.buf, L.bufpos)[0] != 0:
+        getOperator(L, tok)
+      else:
+        getSymbol(L, tok)
     of '#':
       scanComment(L, tok)
     of '*':
@@ -1115,7 +1224,7 @@ proc rawGetTok*(L: var TLexer, tok: var TToken) =
     of 'r', 'R':
       if L.buf[L.bufpos + 1] == '\"':
         inc(L.bufpos)
-        getString(L, tok, true)
+        getString(L, tok, raw)
       else:
         getSymbol(L, tok)
     of '(':
@@ -1150,7 +1259,6 @@ proc rawGetTok*(L: var TLexer, tok: var TToken) =
         if L.fileIdx == L.config.m.trackPos.fileIndex and tok.col+1 == L.config.m.trackPos.col and
             tok.line == L.config.m.trackPos.line.int and L.config.ideCmd == ideSug:
           tok.tokType = tkDot
-          L.cursor = CursorPosition.InToken
           L.config.m.trackPos.col = tok.col.int16
           inc(L.bufpos)
           atTokenEnd()
@@ -1192,10 +1300,10 @@ proc rawGetTok*(L: var TLexer, tok: var TToken) =
         tok.tokType = tkInvalid
         lexMessage(L, errGenerated, "invalid token: " & c & " (\\" & $(ord(c)) & ')')
     of '\"':
-      # check for extended raw string literal:
-      var rawMode = L.bufpos > 0 and L.buf[L.bufpos-1] in SymChars
-      getString(L, tok, rawMode)
-      if rawMode:
+      # check for generalized raw string literal:
+      let mode = if L.bufpos > 0 and L.buf[L.bufpos-1] in SymChars: generalized else: normal
+      getString(L, tok, mode)
+      if mode == generalized:
         # tkRStrLit -> tkGStrLit
         # tkTripleStrLit -> tkGTripleStrLit
         inc(tok.tokType, 2)
@@ -1207,7 +1315,28 @@ proc rawGetTok*(L: var TLexer, tok: var TToken) =
       getNumber(L, tok)
       let c = L.buf[L.bufpos]
       if c in SymChars+{'_'}:
-        lexMessage(L, errGenerated, "invalid token: no whitespace between number and identifier")
+        if c in UnicodeOperatorStartChars and
+            unicodeOprLen(L.buf, L.bufpos)[0] != 0:
+          discard
+        else:
+          lexMessage(L, errGenerated, "invalid token: no whitespace between number and identifier")
+    of '-':
+      if L.buf[L.bufpos+1] in {'0'..'9'} and
+          (L.bufpos-1 == 0 or L.buf[L.bufpos-1] in UnaryMinusWhitelist):
+        # x)-23 # binary minus
+        # ,-23  # unary minus
+        # \n-78 # unary minus? Yes.
+        # =-3   # parsed as `=-` anyway
+        getNumber(L, tok)
+        let c = L.buf[L.bufpos]
+        if c in SymChars+{'_'}:
+          if c in UnicodeOperatorStartChars and
+              unicodeOprLen(L.buf, L.bufpos)[0] != 0:
+            discard
+          else:
+            lexMessage(L, errGenerated, "invalid token: no whitespace between number and identifier")
+      else:
+        getOperator(L, tok)
     else:
       if c in OpChars:
         getOperator(L, tok)
@@ -1223,12 +1352,26 @@ proc rawGetTok*(L: var TLexer, tok: var TToken) =
 
 proc getIndentWidth*(fileIdx: FileIndex, inputstream: PLLStream;
                      cache: IdentCache; config: ConfigRef): int =
-  var lex: TLexer
-  var tok: TToken
-  initToken(tok)
+  result = 0
+  var lex: Lexer = default(Lexer)
+  var tok: Token = default(Token)
   openLexer(lex, fileIdx, inputstream, cache, config)
-  while true:
+  var prevToken = tkEof
+  while tok.tokType != tkEof:
     rawGetTok(lex, tok)
-    result = tok.indent
-    if result > 0 or tok.tokType == tkEof: break
+    if tok.indent > 0 and prevToken in {tkColon, tkEquals, tkType, tkConst, tkLet, tkVar, tkUsing}:
+      result = tok.indent
+      if result > 0: break
+    prevToken = tok.tokType
   closeLexer(lex)
+
+proc getPrecedence*(ident: PIdent): int =
+  ## assumes ident is binary operator already
+  let
+    tokType =
+      if ident.id in ord(tokKeywordLow) - ord(tkSymbol)..ord(tokKeywordHigh) - ord(tkSymbol):
+        TokType(ident.id + ord(tkSymbol))
+      else: tkOpr
+    tok = Token(ident: ident, tokType: tokType)
+
+  getPrecedence(tok)