# # # The Nim Compiler # (c) Copyright 2015 Andreas Rumpf # # See the file "copying.txt", included in this # distribution, for details about the copyright. # # This lexer is handwritten for efficiency. I used an elegant buffering # scheme which I have not seen anywhere else: # We guarantee that a whole line is in the buffer. Thus only when scanning # the \n or \r character we have to check whether we need to read in the next # chunk. (\n or \r already need special handling for incrementing the line # counter; choosing both \n and \r allows the lexer to properly read Unix, # DOS or Macintosh text files, even when it is not the native format. import options, msgs, platform, idents, nimlexbase, llstream, wordrecg, lineinfos, pathutils import std/[hashes, parseutils, strutils] when defined(nimPreviewSlimSystem): import std/[assertions, formatfloat] const numChars*: set[char] = {'0'..'9', 'a'..'z', 'A'..'Z'} SymChars*: set[char] = {'a'..'z', 'A'..'Z', '0'..'9', '\x80'..'\xFF'} SymStartChars*: set[char] = {'a'..'z', 'A'..'Z', '\x80'..'\xFF'} OpChars*: set[char] = {'+', '-', '*', '/', '\\', '<', '>', '!', '?', '^', '.', '|', '=', '%', '&', '$', '@', '~', ':'} UnaryMinusWhitelist = {' ', '\t', '\n', '\r', ',', ';', '(', '[', '{'} # don't forget to update the 'highlite' module if these charsets should change type TokType* = enum tkInvalid = "tkInvalid", tkEof = "[EOF]", # order is important here! tkSymbol = "tkSymbol", # keywords: tkAddr = "addr", tkAnd = "and", tkAs = "as", tkAsm = "asm", tkBind = "bind", tkBlock = "block", tkBreak = "break", tkCase = "case", tkCast = "cast", tkConcept = "concept", tkConst = "const", tkContinue = "continue", tkConverter = "converter", tkDefer = "defer", tkDiscard = "discard", tkDistinct = "distinct", tkDiv = "div", tkDo = "do", tkElif = "elif", tkElse = "else", tkEnd = "end", tkEnum = "enum", tkExcept = "except", tkExport = "export", tkFinally = "finally", tkFor = "for", tkFrom = "from", tkFunc = "func", tkIf = "if", tkImport = "import", tkIn = "in", tkInclude = "include", tkInterface = "interface", tkIs = "is", tkIsnot = "isnot", tkIterator = "iterator", tkLet = "let", tkMacro = "macro", tkMethod = "method", tkMixin = "mixin", tkMod = "mod", tkNil = "nil", tkNot = "not", tkNotin = "notin", tkObject = "object", tkOf = "of", tkOr = "or", tkOut = "out", tkProc = "proc", tkPtr = "ptr", tkRaise = "raise", tkRef = "ref", tkReturn = "return", tkShl = "shl", tkShr = "shr", tkStatic = "static", tkTemplate = "template", tkTry = "try", tkTuple = "tuple", tkType = "type", tkUsing = "using", tkVar = "var", tkWhen = "when", tkWhile = "while", tkXor = "xor", tkYield = "yield", # end of keywords tkIntLit = "tkIntLit", tkInt8Lit = "tkInt8Lit", tkInt16Lit = "tkInt16Lit", tkInt32Lit = "tkInt32Lit", tkInt64Lit = "tkInt64Lit", tkUIntLit = "tkUIntLit", tkUInt8Lit = "tkUInt8Lit", tkUInt16Lit = "tkUInt16Lit", tkUInt32Lit = "tkUInt32Lit", tkUInt64Lit = "tkUInt64Lit", tkFloatLit = "tkFloatLit", tkFloat32Lit = "tkFloat32Lit", tkFloat64Lit = "tkFloat64Lit", tkFloat128Lit = "tkFloat128Lit", tkStrLit = "tkStrLit", tkRStrLit = "tkRStrLit", tkTripleStrLit = "tkTripleStrLit", tkGStrLit = "tkGStrLit", tkGTripleStrLit = "tkGTripleStrLit", tkCharLit = "tkCharLit", tkCustomLit = "tkCustomLit", tkParLe = "(", tkParRi = ")", tkBracketLe = "[", tkBracketRi = "]", tkCurlyLe = "{", tkCurlyRi = "}", tkBracketDotLe = "[.", tkBracketDotRi = ".]", tkCurlyDotLe = "{.", tkCurlyDotRi = ".}", tkParDotLe = "(.", tkParDotRi = ".)", tkComma = ",", tkSemiColon = ";", tkColon = ":", tkColonColon = "::", tkEquals = "=", tkDot = ".", tkDotDot = "..", tkBracketLeColon = "[:", tkOpr, tkComment, tkAccent = "`", # these are fake tokens used by renderer.nim tkSpaces, tkInfixOpr, tkPrefixOpr, tkPostfixOpr, tkHideableStart, tkHideableEnd TokTypes* = set[TokType] const weakTokens = {tkComma, tkSemiColon, tkColon, tkParRi, tkParDotRi, tkBracketRi, tkBracketDotRi, tkCurlyRi} # \ # tokens that should not be considered for previousToken tokKeywordLow* = succ(tkSymbol) tokKeywordHigh* = pred(tkIntLit) type NumericalBase* = enum base10, # base10 is listed as the first element, # so that it is the correct default value base2, base8, base16 TokenSpacing* = enum tsLeading, tsTrailing, tsEof Token* = object # a Nim token tokType*: TokType # the type of the token base*: NumericalBase # the numerical base; only valid for int # or float literals spacing*: set[TokenSpacing] # spaces around token indent*: int # the indentation; != -1 if the token has been # preceded with indentation ident*: PIdent # the parsed identifier iNumber*: BiggestInt # the parsed integer literal fNumber*: BiggestFloat # the parsed floating point literal literal*: string # the parsed (string) literal; and # documentation comments are here too line*, col*: int when defined(nimpretty): offsetA*, offsetB*: int # used for pretty printing so that literals # like 0b01 or r"\L" are unaffected commentOffsetA*, commentOffsetB*: int ErrorHandler* = proc (conf: ConfigRef; info: TLineInfo; msg: TMsgKind; arg: string) Lexer* = object of TBaseLexer fileIdx*: FileIndex indentAhead*: int # if > 0 an indentation has already been read # this is needed because scanning comments # needs so much look-ahead currLineIndent*: int errorHandler*: ErrorHandler cache*: IdentCache when defined(nimsuggest): previousToken: TLineInfo tokenEnd*: TLineInfo previousTokenEnd*: TLineInfo config*: ConfigRef proc getLineInfo*(L: Lexer, tok: Token): TLineInfo {.inline.} = result = newLineInfo(L.fileIdx, tok.line, tok.col) when defined(nimpretty): result.offsetA = tok.offsetA result.offsetB = tok.offsetB result.commentOffsetA = tok.commentOffsetA result.commentOffsetB = tok.commentOffsetB proc isKeyword*(kind: TokType): bool = (kind >= tokKeywordLow) and (kind <= tokKeywordHigh) template ones(n): untyped = ((1 shl n)-1) # for utf-8 conversion proc isNimIdentifier*(s: string): bool = let sLen = s.len if sLen > 0 and s[0] in SymStartChars: var i = 1 while i < sLen: if s[i] == '_': inc(i) if i < sLen and s[i] notin SymChars: return false inc(i) result = true else: result = false proc `$`*(tok: Token): string = case tok.tokType of tkIntLit..tkInt64Lit: $tok.iNumber of tkFloatLit..tkFloat64Lit: $tok.fNumber of tkInvalid, tkStrLit..tkCharLit, tkComment: tok.literal of tkParLe..tkColon, tkEof, tkAccent: $tok.tokType else: if tok.ident != nil: tok.ident.s else: "" proc prettyTok*(tok: Token): string = if isKeyword(tok.tokType): "keyword " & tok.ident.s else: $tok proc printTok*(conf: ConfigRef; tok: Token) = # xxx factor with toLocation msgWriteln(conf, $tok.line & ":" & $tok.col & "\t" & $tok.tokType & " " & $tok) proc openLexer*(lex: var Lexer, fileIdx: FileIndex, inputstream: PLLStream; cache: IdentCache; config: ConfigRef) = openBaseLexer(lex, inputstream) lex.fileIdx = fileIdx lex.indentAhead = -1 lex.currLineIndent = 0 inc(lex.lineNumber, inputstream.lineOffset) lex.cache = cache when defined(nimsuggest): lex.previousToken.fileIndex = fileIdx lex.config = config proc openLexer*(lex: var Lexer, filename: AbsoluteFile, inputstream: PLLStream; cache: IdentCache; config: ConfigRef) = openLexer(lex, fileInfoIdx(config, filename), inputstream, cache, config) proc closeLexer*(lex: var Lexer) = if lex.config != nil: inc(lex.config.linesCompiled, lex.lineNumber) closeBaseLexer(lex) proc getLineInfo(L: Lexer): TLineInfo = result = newLineInfo(L.fileIdx, L.lineNumber, getColNumber(L, L.bufpos)) proc dispMessage(L: Lexer; info: TLineInfo; msg: TMsgKind; arg: string) = if L.errorHandler.isNil: msgs.message(L.config, info, msg, arg) else: L.errorHandler(L.config, info, msg, arg) proc lexMessage*(L: Lexer, msg: TMsgKind, arg = "") = L.dispMessage(getLineInfo(L), msg, arg) proc lexMessageTok*(L: Lexer, msg: TMsgKind, tok: Token, arg = "") = var info = newLineInfo(L.fileIdx, tok.line, tok.col) L.dispMessage(info, msg, arg) proc lexMessagePos(L: var Lexer, msg: TMsgKind, pos: int, arg = "") = var info = newLineInfo(L.fileIdx, L.lineNumber, pos - L.lineStart) L.dispMessage(info, msg, arg) proc matchTwoChars(L: Lexer, first: char, second: set[char]): bool = result = (L.buf[L.bufpos] == first) and (L.buf[L.bufpos + 1] in second) template tokenBegin(tok, pos) {.dirty.} = when defined(nimsuggest): var colA = getColNumber(L, pos) when defined(nimpretty): tok.offsetA = L.offsetBase + pos template tokenEnd(tok, pos) {.dirty.} = when defined(nimsuggest): let colB = getColNumber(L, pos)+1 if L.fileIdx == L.config.m.trackPos.fileIndex and L.config.m.trackPos.col in colA..colB and L.lineNumber == L.config.m.trackPos.line.int and L.config.ideCmd in {ideSug, ideCon}: L.config.m.trackPos.col = colA.int16 colA = 0 when defined(nimpretty): tok.offsetB = L.offsetBase + pos template tokenEndIgnore(tok, pos) = when defined(nimsuggest): let colB = getColNumber(L, pos) if L.fileIdx == L.config.m.trackPos.fileIndex and L.config.m.trackPos.col in colA..colB and L.lineNumber == L.config.m.trackPos.line.int and L.config.ideCmd in {ideSug, ideCon}: L.config.m.trackPos.fileIndex = trackPosInvalidFileIdx L.config.m.trackPos.line = 0'u16 colA = 0 when defined(nimpretty): tok.offsetB = L.offsetBase + pos template tokenEndPrevious(tok, pos) = when defined(nimsuggest): # when we detect the cursor in whitespace, we attach the track position # to the token that came before that, but only if we haven't detected # the cursor in a string literal or comment: let colB = getColNumber(L, pos) if L.fileIdx == L.config.m.trackPos.fileIndex and L.config.m.trackPos.col in colA..colB and L.lineNumber == L.config.m.trackPos.line.int and L.config.ideCmd in {ideSug, ideCon}: L.config.m.trackPos = L.previousToken L.config.m.trackPosAttached = true colA = 0 when defined(nimpretty): tok.offsetB = L.offsetBase + pos template eatChar(L: var Lexer, t: var Token, replacementChar: char) = t.literal.add(replacementChar) inc(L.bufpos) template eatChar(L: var Lexer, t: var Token) = t.literal.add(L.buf[L.bufpos]) inc(L.bufpos) proc getNumber(L: var Lexer, result: var Token) = proc matchUnderscoreChars(L: var Lexer, tok: var Token, chars: set[char]): Natural = var pos = L.bufpos # use registers for pos, buf result = 0 while true: if L.buf[pos] in chars: tok.literal.add(L.buf[pos]) inc(pos) inc(result) else: break if L.buf[pos] == '_': if L.buf[pos+1] notin chars: lexMessage(L, errGenerated, "only single underscores may occur in a token and token may not " & "end with an underscore: e.g. '1__1' and '1_' are invalid") break tok.literal.add('_') inc(pos) L.bufpos = pos proc matchChars(L: var Lexer, tok: var Token, chars: set[char]) = var pos = L.bufpos # use registers for pos, buf while L.buf[pos] in chars: tok.literal.add(L.buf[pos]) inc(pos) L.bufpos = pos proc lexMessageLitNum(L: var Lexer, msg: string, startpos: int, msgKind = errGenerated) = # Used to get slightly human friendlier err messages. const literalishChars = {'A'..'Z', 'a'..'z', '0'..'9', '_', '.', '\''} var msgPos = L.bufpos var t = Token(literal: "") L.bufpos = startpos # Use L.bufpos as pos because of matchChars matchChars(L, t, literalishChars) # We must verify +/- specifically so that we're not past the literal if L.buf[L.bufpos] in {'+', '-'} and L.buf[L.bufpos - 1] in {'e', 'E'}: t.literal.add(L.buf[L.bufpos]) inc(L.bufpos) matchChars(L, t, literalishChars) if L.buf[L.bufpos] in literalishChars: t.literal.add(L.buf[L.bufpos]) inc(L.bufpos) matchChars(L, t, {'0'..'9'}) L.bufpos = msgPos lexMessage(L, msgKind, msg % t.literal) var xi: BiggestInt isBase10 = true numDigits = 0 const # 'c', 'C' is deprecated baseCodeChars = {'X', 'x', 'o', 'b', 'B', 'c', 'C'} literalishChars = baseCodeChars + {'A'..'F', 'a'..'f', '0'..'9', '_', '\''} floatTypes = {tkFloatLit, tkFloat32Lit, tkFloat64Lit, tkFloat128Lit} result.tokType = tkIntLit # int literal until we know better result.literal = "" result.base = base10 tokenBegin(result, L.bufpos) var isPositive = true if L.buf[L.bufpos] == '-': eatChar(L, result) isPositive = false let startpos = L.bufpos template setNumber(field, value) = field = (if isPositive: value else: -value) # First stage: find out base, make verifications, build token literal string # {'c', 'C'} is added for deprecation reasons to provide a clear error message if L.buf[L.bufpos] == '0' and L.buf[L.bufpos + 1] in baseCodeChars + {'c', 'C', 'O'}: isBase10 = false eatChar(L, result, '0') case L.buf[L.bufpos] of 'c', 'C': lexMessageLitNum(L, "$1 will soon be invalid for oct literals; Use '0o' " & "for octals. 'c', 'C' prefix", startpos, warnDeprecated) eatChar(L, result, 'c') numDigits = matchUnderscoreChars(L, result, {'0'..'7'}) of 'O': lexMessageLitNum(L, "$1 is an invalid int literal; For octal literals " & "use the '0o' prefix.", startpos) of 'x', 'X': eatChar(L, result, 'x') numDigits = matchUnderscoreChars(L, result, {'0'..'9', 'a'..'f', 'A'..'F'}) of 'o': eatChar(L, result, 'o') numDigits = matchUnderscoreChars(L, result, {'0'..'7'}) of 'b', 'B': eatChar(L, result, 'b') numDigits = matchUnderscoreChars(L, result, {'0'..'1'}) else: internalError(L.config, getLineInfo(L), "getNumber") if numDigits == 0: lexMessageLitNum(L, "invalid number: '$1'", startpos) else: discard matchUnderscoreChars(L, result, {'0'..'9'}) if (L.buf[L.bufpos] == '.') and (L.buf[L.bufpos + 1] in {'0'..'9'}): result.tokType = tkFloatLit eatChar(L, result, '.') discard matchUnderscoreChars(L, result, {'0'..'9'}) if L.buf[L.bufpos] in {'e', 'E'}: result.tokType = tkFloatLit eatChar(L, result) if L.buf[L.bufpos] in {'+', '-'}: eatChar(L, result) discard matchUnderscoreChars(L, result, {'0'..'9'}) let endpos = L.bufpos # Second stage, find out if there's a datatype suffix and handle it var postPos = endpos if L.buf[postPos] in {'\'', 'f', 'F', 'd', 'D', 'i', 'I', 'u', 'U'}: let errPos = postPos var customLitPossible = false if L.buf[postPos] == '\'': inc(postPos) customLitPossible = true if L.buf[postPos] in SymChars: var suffix = newStringOfCap(10) while true: suffix.add L.buf[postPos] inc postPos if L.buf[postPos] notin SymChars+{'_'}: break let suffixAsLower = suffix.toLowerAscii case suffixAsLower of "f", "f32": result.tokType = tkFloat32Lit of "d", "f64": result.tokType = tkFloat64Lit of "f128": result.tokType = tkFloat128Lit of "i8": result.tokType = tkInt8Lit of "i16": result.tokType = tkInt16Lit of "i32": result.tokType = tkInt32Lit of "i64": result.tokType = tkInt64Lit of "u": result.tokType = tkUIntLit of "u8": result.tokType = tkUInt8Lit of "u16": result.tokType = tkUInt16Lit of "u32": result.tokType = tkUInt32Lit of "u64": result.tokType = tkUInt64Lit elif customLitPossible: # remember the position of the `'` so that the parser doesn't # have to reparse the custom literal: result.iNumber = len(result.literal) result.literal.add '\'' result.literal.add suffix result.tokType = tkCustomLit else: lexMessageLitNum(L, "invalid number suffix: '$1'", errPos) else: lexMessageLitNum(L, "invalid number suffix: '$1'", errPos) # Is there still a literalish char awaiting? Then it's an error! if L.buf[postPos] in literalishChars or (L.buf[postPos] == '.' and L.buf[postPos + 1] in {'0'..'9'}): lexMessageLitNum(L, "invalid number: '$1'", startpos) if result.tokType != tkCustomLit: # Third stage, extract actual number L.bufpos = startpos # restore position var pos = startpos try: if (L.buf[pos] == '0') and (L.buf[pos + 1] in baseCodeChars): inc(pos, 2) xi = 0 # it is a base prefix case L.buf[pos - 1] of 'b', 'B': result.base = base2 while pos < endpos: if L.buf[pos] != '_': xi = `shl`(xi, 1) or (ord(L.buf[pos]) - ord('0')) inc(pos) # 'c', 'C' is deprecated (a warning is issued elsewhere) of 'o', 'c', 'C': result.base = base8 while pos < endpos: if L.buf[pos] != '_': xi = `shl`(xi, 3) or (ord(L.buf[pos]) - ord('0')) inc(pos) of 'x', 'X': result.base = base16 while pos < endpos: case L.buf[pos] of '_': inc(pos) of '0'..'9': xi = `shl`(xi, 4) or (ord(L.buf[pos]) - ord('0')) inc(pos) of 'a'..'f': xi = `shl`(xi, 4) or (ord(L.buf[pos]) - ord('a') + 10) inc(pos) of 'A'..'F': xi = `shl`(xi, 4) or (ord(L.buf[pos]) - ord('A') + 10) inc(pos) else: break else: internalError(L.config, getLineInfo(L), "getNumber") case result.tokType of tkIntLit, tkInt64Lit: setNumber result.iNumber, xi of tkInt8Lit: setNumber result.iNumber, ashr(xi shl 56, 56) of tkInt16Lit: setNumber result.iNumber, ashr(xi shl 48, 48) of tkInt32Lit: setNumber result.iNumber, ashr(xi shl 32, 32) of tkUIntLit, tkUInt64Lit: setNumber result.iNumber, xi of tkUInt8Lit: setNumber result.iNumber, xi and 0xff of tkUInt16Lit: setNumber result.iNumber, xi and 0xffff of tkUInt32Lit: setNumber result.iNumber, xi and 0xffffffff of tkFloat32Lit: setNumber result.fNumber, (cast[ptr float32](addr(xi)))[] # note: this code is endian neutral! # XXX: Test this on big endian machine! of tkFloat64Lit, tkFloatLit: setNumber result.fNumber, (cast[ptr float64](addr(xi)))[] else: internalError(L.config, getLineInfo(L), "getNumber") # Bounds checks. Non decimal literals are allowed to overflow the range of # the datatype as long as their pattern don't overflow _bitwise_, hence # below checks of signed sizes against uint*.high is deliberate: # (0x80'u8 = 128, 0x80'i8 = -128, etc == OK) if result.tokType notin floatTypes: let outOfRange = case result.tokType of tkUInt8Lit, tkUInt16Lit, tkUInt32Lit: result.iNumber != xi of tkInt8Lit: (xi > BiggestInt(uint8.high)) of tkInt16Lit: (xi > BiggestInt(uint16.high)) of tkInt32Lit: (xi > BiggestInt(uint32.high)) else: false if outOfRange: #echo "out of range num: ", result.iNumber, " vs ", xi lexMessageLitNum(L, "number out of range: '$1'", startpos) else: case result.tokType of floatTypes: result.fNumber = parseFloat(result.literal) of tkUInt64Lit, tkUIntLit: var iNumber: uint64 = uint64(0) var len: int = 0 try: len = parseBiggestUInt(result.literal, iNumber) except ValueError: raise newException(OverflowDefect, "number out of range: " & result.literal) if len != result.literal.len: raise newException(ValueError, "invalid integer: " & result.literal) result.iNumber = cast[int64](iNumber) else: var iNumber: int64 = int64(0) var len: int = 0 try: len = parseBiggestInt(result.literal, iNumber) except ValueError: raise newException(OverflowDefect, "number out of range: " & result.literal) if len != result.literal.len: raise newException(ValueError, "invalid integer: " & result.literal) result.iNumber = iNumber # Explicit bounds checks. let outOfRange = case result.tokType of tkInt8Lit: result.iNumber > int8.high or result.iNumber < int8.low of tkUInt8Lit: result.iNumber > BiggestInt(uint8.high) or result.iNumber < 0 of tkInt16Lit: result.iNumber > int16.high or result.iNumber < int16.low of tkUInt16Lit: result.iNumber > BiggestInt(uint16.high) or result.iNumber < 0 of tkInt32Lit: result.iNumber > int32.high or result.iNumber < int32.low of tkUInt32Lit: result.iNumber > BiggestInt(uint32.high) or result.iNumber < 0 else: false if outOfRange: lexMessageLitNum(L, "number out of range: '$1'", startpos) # Promote int literal to int64? Not always necessary, but more consistent if result.tokType == tkIntLit: if result.iNumber > high(int32) or result.iNumber < low(int32): result.tokType = tkInt64Lit except ValueError: lexMessageLitNum(L, "invalid number: '$1'", startpos) except OverflowDefect, RangeDefect: lexMessageLitNum(L, "number out of range: '$1'", startpos) tokenEnd(result, postPos-1) L.bufpos = postPos proc handleHexChar(L: var Lexer, xi: var int; position: range[0..4]) = template invalid() = lexMessage(L, errGenerated, "expected a hex digit, but found: " & L.buf[L.bufpos] & "; maybe prepend with 0") case L.buf[L.bufpos] of '0'..'9': xi = (xi shl 4) or (ord(L.buf[L.bufpos]) - ord('0')) inc(L.bufpos) of 'a'..'f': xi = (xi shl 4) or (ord(L.buf[L.bufpos]) - ord('a') + 10) inc(L.bufpos) of 'A'..'F': xi = (xi shl 4) or (ord(L.buf[L.bufpos]) - ord('A') + 10) inc(L.bufpos) of '"', '\'': if position <= 1: invalid() # do not progress the bufpos here. if position == 0: inc(L.bufpos) else: invalid() # Need to progress for `nim check` inc(L.bufpos) proc handleDecChars(L: var Lexer, xi: var int) = while L.buf[L.bufpos] in {'0'..'9'}: xi = (xi * 10) + (ord(L.buf[L.bufpos]) - ord('0')) inc(L.bufpos) proc addUnicodeCodePoint(s: var string, i: int) = let i = cast[uint](i) # inlined toUTF-8 to avoid unicode and strutils dependencies. let pos = s.len if i <= 127: s.setLen(pos+1) s[pos+0] = chr(i) elif i <= 0x07FF: s.setLen(pos+2) s[pos+0] = chr((i shr 6) or 0b110_00000) s[pos+1] = chr((i and ones(6)) or 0b10_0000_00) elif i <= 0xFFFF: s.setLen(pos+3) s[pos+0] = chr(i shr 12 or 0b1110_0000) s[pos+1] = chr(i shr 6 and ones(6) or 0b10_0000_00) s[pos+2] = chr(i and ones(6) or 0b10_0000_00) elif i <= 0x001FFFFF: s.setLen(pos+4) s[pos+0] = chr(i shr 18 or 0b1111_0000) s[pos+1] = chr(i shr 12 and ones(6) or 0b10_0000_00) s[pos+2] = chr(i shr 6 and ones(6) or 0b10_0000_00) s[pos+3] = chr(i and ones(6) or 0b10_0000_00) elif i <= 0x03FFFFFF: s.setLen(pos+5) s[pos+0] = chr(i shr 24 or 0b111110_00) s[pos+1] = chr(i shr 18 and ones(6) or 0b10_0000_00) s[pos+2] = chr(i shr 12 and ones(6) or 0b10_0000_00) s[pos+3] = chr(i shr 6 and ones(6) or 0b10_0000_00) s[pos+4] = chr(i and ones(6) or 0b10_0000_00) elif i <= 0x7FFFFFFF: s.setLen(pos+6) s[pos+0] = chr(i shr 30 or 0b1111110_0) s[pos+1] = chr(i shr 24 and ones(6) or 0b10_0000_00) s[pos+2] = chr(i shr 18 and ones(6) or 0b10_0000_00) s[pos+3] = chr(i shr 12 and ones(6) or 0b10_0000_00) s[pos+4] = chr(i shr 6 and ones(6) or 0b10_0000_00) s[pos+5] = chr(i and ones(6) or 0b10_0000_00) proc getEscapedChar(L: var Lexer, tok: var Token) = inc(L.bufpos) # skip '\' case L.buf[L.bufpos] of 'n', 'N': tok.literal.add('\L') inc(L.bufpos) of 'p', 'P': if tok.tokType == tkCharLit: lexMessage(L, errGenerated, "\\p not allowed in character literal") tok.literal.add(L.config.target.tnl) inc(L.bufpos) of 'r', 'R', 'c', 'C': tok.literal.add(CR) inc(L.bufpos) of 'l', 'L': tok.literal.add(LF) inc(L.bufpos) of 'f', 'F': tok.literal.add(FF) inc(L.bufpos) of 'e', 'E': tok.literal.add(ESC) inc(L.bufpos) of 'a', 'A': tok.literal.add(BEL) inc(L.bufpos) of 'b', 'B': tok.literal.add(BACKSPACE) inc(L.bufpos) of 'v', 'V': tok.literal.add(VT) inc(L.bufpos) of 't', 'T': tok.literal.add('\t') inc(L.bufpos) of '\'', '\"': tok.literal.add(L.buf[L.bufpos]) inc(L.bufpos) of '\\': tok.literal.add('\\') inc(L.bufpos) of 'x', 'X': inc(L.bufpos) var xi = 0 handleHexChar(L, xi, 1) handleHexChar(L, xi, 2) tok.literal.add(chr(xi)) of 'u', 'U': if tok.tokType == tkCharLit: lexMessage(L, errGenerated, "\\u not allowed in character literal") inc(L.bufpos) var xi = 0 if L.buf[L.bufpos] == '{': inc(L.bufpos) var start = L.bufpos while L.buf[L.bufpos] != '}': handleHexChar(L, xi, 0) if start == L.bufpos: lexMessage(L, errGenerated, "Unicode codepoint cannot be empty") inc(L.bufpos) if xi > 0x10FFFF: let hex = ($L.buf)[start..L.bufpos-2] lexMessage(L, errGenerated, "Unicode codepoint must be lower than 0x10FFFF, but was: " & hex) else: handleHexChar(L, xi, 1) handleHexChar(L, xi, 2) handleHexChar(L, xi, 3) handleHexChar(L, xi, 4) addUnicodeCodePoint(tok.literal, xi) of '0'..'9': if matchTwoChars(L, '0', {'0'..'9'}): lexMessage(L, warnOctalEscape) var xi = 0 handleDecChars(L, xi) if (xi <= 255): tok.literal.add(chr(xi)) else: lexMessage(L, errGenerated, "invalid character constant") else: lexMessage(L, errGenerated, "invalid character constant") proc handleCRLF(L: var Lexer, pos: int): int = template registerLine = let col = L.getColNumber(pos) case L.buf[pos] of CR: registerLine() result = nimlexbase.handleCR(L, pos) of LF: registerLine() result = nimlexbase.handleLF(L, pos) else: result = pos type StringMode = enum normal, raw, generalized proc getString(L: var Lexer, tok: var Token, mode: StringMode) = var pos = L.bufpos var line = L.lineNumber # save linenumber for better error message tokenBegin(tok, pos - ord(mode == raw)) inc pos # skip " if L.buf[pos] == '\"' and L.buf[pos+1] == '\"': tok.tokType = tkTripleStrLit # long string literal: inc(pos, 2) # skip "" # skip leading newline: if L.buf[pos] in {' ', '\t'}: var newpos = pos+1 while L.buf[newpos] in {' ', '\t'}: inc newpos if L.buf[newpos] in {CR, LF}: pos = newpos pos = handleCRLF(L, pos) while true: case L.buf[pos] of '\"': if L.buf[pos+1] == '\"' and L.buf[pos+2] == '\"' and L.buf[pos+3] != '\"': tokenEndIgnore(tok, pos+2) L.bufpos = pos + 3 # skip the three """ break tok.literal.add('\"') inc(pos) of CR, LF: tokenEndIgnore(tok, pos) pos = handleCRLF(L, pos) tok.literal.add("\n") of nimlexbase.EndOfFile: tokenEndIgnore(tok, pos) var line2 = L.lineNumber L.lineNumber = line lexMessagePos(L, errGenerated, L.lineStart, "closing \"\"\" expected, but end of file reached") L.lineNumber = line2 L.bufpos = pos break else: tok.literal.add(L.buf[pos]) inc(pos) else: # ordinary string literal if mode != normal: tok.tokType = tkRStrLit else: tok.tokType = tkStrLit while true: let c = L.buf[pos] if c == '\"': if mode != normal and L.buf[pos+1] == '\"': inc(pos, 2) tok.literal.add('"') else: tokenEndIgnore(tok, pos) inc(pos) # skip '"' break elif c in {CR, LF, nimlexbase.EndOfFile}: tokenEndIgnore(tok, pos) lexMessage(L, errGenerated, "closing \" expected") break elif (c == '\\') and mode == normal: L.bufpos = pos getEscapedChar(L, tok) pos = L.bufpos else: tok.literal.add(c) inc(pos) L.bufpos = pos proc getCharacter(L: var Lexer; tok: var Token) = tokenBegin(tok, L.bufpos) let startPos = L.bufpos inc(L.bufpos) # skip ' let c = L.buf[L.bufpos] case c of '\0'..pred(' '), '\'': lexMessage(L, errGenerated, "invalid character literal") tok.literal = $c of '\\': getEscapedChar(L, tok) else: tok.literal = $c inc(L.bufpos) if L.buf[L.bufpos] == '\'': tokenEndIgnore(tok, L.bufpos) inc(L.bufpos) # skip ' else: if startPos > 0 and L.buf[startPos-1] == '`': tok.literal = "'" L.bufpos = startPos+1 else: lexMessage(L, errGenerated, "missing closing ' for character literal") tokenEndIgnore(tok, L.bufpos) const UnicodeOperatorStartChars = {'\226', '\194', '\195'} # the allowed unicode characters ("∙ ∘ × ★ ⊗ ⊘ ⊙ ⊛ ⊠ ⊡ ∩ ∧ ⊓ ± ⊕ ⊖ ⊞ ⊟ ∪ ∨ ⊔") # all start with one of these. type UnicodeOprPred = enum Mul, Add proc unicodeOprLen(buf: cstring; pos: int): (int8, UnicodeOprPred) = template m(len): untyped = (int8(len), Mul) template a(len): untyped = (int8(len), Add) result = 0.m case buf[pos] of '\226': if buf[pos+1] == '\136': if buf[pos+2] == '\152': result = 3.m # ∘ elif buf[pos+2] == '\153': result = 3.m # ∙ elif buf[pos+2] == '\167': result = 3.m # ∧ elif buf[pos+2] == '\168': result = 3.a # ∨ elif buf[pos+2] == '\169': result = 3.m # ∩ elif buf[pos+2] == '\170': result = 3.a # ∪ elif buf[pos+1] == '\138': if buf[pos+2] == '\147': result = 3.m # ⊓ elif buf[pos+2] == '\148': result = 3.a # ⊔ elif buf[pos+2] == '\149': result = 3.a # ⊕ elif buf[pos+2] == '\150': result = 3.a # ⊖ elif buf[pos+2] == '\151': result = 3.m # ⊗ elif buf[pos+2] == '\152': result = 3.m # ⊘ elif buf[pos+2] == '\153': result = 3.m # ⊙ elif buf[pos+2] == '\155': result = 3.m # ⊛ elif buf[pos+2] == '\158': result = 3.a # ⊞ elif buf[pos+2] == '\159': result = 3.a # ⊟ elif buf[pos+2] == '\160': result = 3.m # ⊠ elif buf[pos+2] == '\161': result = 3.m # ⊡ elif buf[pos+1] == '\152' and buf[pos+2] == '\133': result = 3.m # ★ of '\194': if buf[pos+1] == '\177': result = 2.a # ± of '\195': if buf[pos+1] == '\151': result = 2.m # × else: discard proc getSymbol(L: var Lexer, tok: var Token) = var h: Hash = 0 var pos = L.bufpos tokenBegin(tok, pos) var suspicious = false while true: var c = L.buf[pos] case c of 'a'..'z', '0'..'9': h = h !& ord(c) inc(pos) of 'A'..'Z': c = chr(ord(c) + (ord('a') - ord('A'))) # toLower() h = h !& ord(c) inc(pos) suspicious = true of '_': if L.buf[pos+1] notin SymChars: lexMessage(L, errGenerated, "invalid token: trailing underscore") break inc(pos) suspicious = true of '\x80'..'\xFF': if c in UnicodeOperatorStartChars and unicodeOprLen(L.buf, pos)[0] != 0: break else: h = h !& ord(c) inc(pos) else: break tokenEnd(tok, pos-1) h = !$h tok.ident = L.cache.getIdent(cast[cstring](addr(L.buf[L.bufpos])), pos - L.bufpos, h) if (tok.ident.id < ord(tokKeywordLow) - ord(tkSymbol)) or (tok.ident.id > ord(tokKeywordHigh) - ord(tkSymbol)): tok.tokType = tkSymbol else: tok.tokType = TokType(tok.ident.id + ord(tkSymbol)) if suspicious and {optStyleHint, optStyleError} * L.config.globalOptions != {}: lintReport(L.config, getLineInfo(L), tok.ident.s.normalize, tok.ident.s) L.bufpos = pos proc endOperator(L: var Lexer, tok: var Token, pos: int, hash: Hash) {.inline.} = var h = !$hash tok.ident = L.cache.getIdent(cast[cstring](addr(L.buf[L.bufpos])), pos - L.bufpos, h) if (tok.ident.id < oprLow) or (tok.ident.id > oprHigh): tok.tokType = tkOpr else: tok.tokType = TokType(tok.ident.id - oprLow + ord(tkColon)) L.bufpos = pos proc getOperator(L: var Lexer, tok: var Token) = var pos = L.bufpos tokenBegin(tok, pos) var h: Hash = 0 while true: let c = L.buf[pos] if c in OpChars: h = h !& ord(c) inc(pos) elif c in UnicodeOperatorStartChars: let oprLen = unicodeOprLen(L.buf, pos)[0] if oprLen == 0: break for i in 0.. 1 and tok.ident.s[^1] == '>' and tok.ident.s[^2] in {'-', '~', '='}: return 0 template considerAsgn(value: untyped) = result = if tok.ident.s[^1] == '=': 1 else: value case relevantChar of '$', '^': considerAsgn(10) of '*', '%', '/', '\\': considerAsgn(MulPred) of '~': result = 8 of '+', '-', '|': considerAsgn(PlusPred) of '&': considerAsgn(7) of '=', '<', '>', '!': result = 5 of '.': considerAsgn(6) of '?': result = 2 of UnicodeOperatorStartChars: if tok.ident.s[^1] == '=': result = 1 else: let (len, pred) = unicodeOprLen(cstring(tok.ident.s), 0) if len != 0: result = if pred == Mul: MulPred else: PlusPred else: result = 2 else: considerAsgn(2) of tkDiv, tkMod, tkShl, tkShr: result = 9 of tkDotDot: result = 6 of tkIn, tkNotin, tkIs, tkIsnot, tkOf, tkAs, tkFrom: result = 5 of tkAnd: result = 4 of tkOr, tkXor, tkPtr, tkRef: result = 3 else: return -10 proc skipMultiLineComment(L: var Lexer; tok: var Token; start: int; isDoc: bool) = var pos = start var toStrip = 0 tokenBegin(tok, pos) # detect the amount of indentation: if isDoc: toStrip = getColNumber(L, pos) while L.buf[pos] == ' ': inc pos inc toStrip while L.buf[pos] in {CR, LF}: # skip blank lines pos = handleCRLF(L, pos) toStrip = 0 while L.buf[pos] == ' ': inc pos inc toStrip var nesting = 0 while true: case L.buf[pos] of '#': if isDoc: if L.buf[pos+1] == '#' and L.buf[pos+2] == '[': inc nesting tok.literal.add '#' elif L.buf[pos+1] == '[': inc nesting inc pos of ']': if isDoc: if L.buf[pos+1] == '#' and L.buf[pos+2] == '#': if nesting == 0: tokenEndIgnore(tok, pos+2) inc(pos, 3) break dec nesting tok.literal.add ']' elif L.buf[pos+1] == '#': if nesting == 0: tokenEndIgnore(tok, pos+1) inc(pos, 2) break dec nesting inc pos of CR, LF: tokenEndIgnore(tok, pos) pos = handleCRLF(L, pos) # strip leading whitespace: when defined(nimpretty): tok.literal.add "\L" if isDoc: when not defined(nimpretty): tok.literal.add "\n" var c = toStrip while L.buf[pos] == ' ' and c > 0: inc pos dec c of nimlexbase.EndOfFile: tokenEndIgnore(tok, pos) lexMessagePos(L, errGenerated, pos, "end of multiline comment expected") break else: if isDoc or defined(nimpretty): tok.literal.add L.buf[pos] inc(pos) L.bufpos = pos when defined(nimpretty): tok.commentOffsetB = L.offsetBase + pos - 1 proc scanComment(L: var Lexer, tok: var Token) = var pos = L.bufpos tok.tokType = tkComment assert L.buf[pos+1] == '#' when defined(nimpretty): tok.commentOffsetA = L.offsetBase + pos if L.buf[pos+2] == '[': skipMultiLineComment(L, tok, pos+3, true) return tokenBegin(tok, pos) inc(pos, 2) var toStrip = 0 var stripInit = false while true: if not stripInit: # find baseline indentation inside comment while L.buf[pos] == ' ': inc pos inc toStrip if L.buf[pos] in {CR, LF}: # don't set toStrip in blank comment lines toStrip = 0 else: # found first non-whitespace character stripInit = true while L.buf[pos] notin {CR, LF, nimlexbase.EndOfFile}: tok.literal.add(L.buf[pos]) inc(pos) tokenEndIgnore(tok, pos) pos = handleCRLF(L, pos) var indent = 0 while L.buf[pos] == ' ': inc(pos) inc(indent) if L.buf[pos] == '#' and L.buf[pos+1] == '#': tok.literal.add "\n" inc(pos, 2) if stripInit: var c = toStrip while L.buf[pos] == ' ' and c > 0: inc pos dec c else: if L.buf[pos] > ' ': L.indentAhead = indent tokenEndIgnore(tok, pos) break L.bufpos = pos when defined(nimpretty): tok.commentOffsetB = L.offsetBase + pos - 1 proc skip(L: var Lexer, tok: var Token) = var pos = L.bufpos tokenBegin(tok, pos) tok.spacing.excl(tsLeading) when defined(nimpretty): var hasComment = false var commentIndent = L.currLineIndent tok.commentOffsetA = L.offsetBase + pos tok.commentOffsetB = tok.commentOffsetA tok.line = -1 while true: case L.buf[pos] of ' ': inc(pos) tok.spacing.incl(tsLeading) of '\t': lexMessagePos(L, errGenerated, pos, "tabs are not allowed, use spaces instead") inc(pos) of CR, LF: tokenEndPrevious(tok, pos) pos = handleCRLF(L, pos) var indent = 0 while true: if L.buf[pos] == ' ': inc(pos) inc(indent) elif L.buf[pos] == '#' and L.buf[pos+1] == '[': when defined(nimpretty): hasComment = true if tok.line < 0: tok.line = L.lineNumber commentIndent = indent skipMultiLineComment(L, tok, pos+2, false) pos = L.bufpos else: break tok.spacing.excl(tsLeading) when defined(nimpretty): if L.buf[pos] == '#' and tok.line < 0: commentIndent = indent if L.buf[pos] > ' ' and (L.buf[pos] != '#' or L.buf[pos+1] == '#'): tok.indent = indent L.currLineIndent = indent break of '#': # do not skip documentation comment: if L.buf[pos+1] == '#': break when defined(nimpretty): hasComment = true if tok.line < 0: tok.line = L.lineNumber if L.buf[pos+1] == '[': skipMultiLineComment(L, tok, pos+2, false) pos = L.bufpos else: tokenBegin(tok, pos) while L.buf[pos] notin {CR, LF, nimlexbase.EndOfFile}: when defined(nimpretty): tok.literal.add L.buf[pos] inc(pos) tokenEndIgnore(tok, pos+1) when defined(nimpretty): tok.commentOffsetB = L.offsetBase + pos + 1 else: break # EndOfFile also leaves the loop tokenEndPrevious(tok, pos-1) L.bufpos = pos when defined(nimpretty): if hasComment: tok.commentOffsetB = L.offsetBase + pos - 1 tok.tokType = tkComment tok.indent = commentIndent proc rawGetTok*(L: var Lexer, tok: var Token) = template atTokenEnd() {.dirty.} = when defined(nimsuggest): L.previousTokenEnd.line = L.tokenEnd.line L.previousTokenEnd.col = L.tokenEnd.col L.tokenEnd.line = tok.line.uint16 L.tokenEnd.col = getColNumber(L, L.bufpos).int16 # we attach the cursor to the last *strong* token if tok.tokType notin weakTokens: L.previousToken.line = tok.line.uint16 L.previousToken.col = tok.col.int16 reset(tok) if L.indentAhead >= 0: tok.indent = L.indentAhead L.currLineIndent = L.indentAhead L.indentAhead = -1 else: tok.indent = -1 skip(L, tok) when defined(nimpretty): if tok.tokType == tkComment: L.indentAhead = L.currLineIndent return let c = L.buf[L.bufpos] tok.line = L.lineNumber tok.col = getColNumber(L, L.bufpos) if c in SymStartChars - {'r', 'R'} - UnicodeOperatorStartChars: getSymbol(L, tok) else: case c of UnicodeOperatorStartChars: if unicodeOprLen(L.buf, L.bufpos)[0] != 0: getOperator(L, tok) else: getSymbol(L, tok) of '#': scanComment(L, tok) of '*': # '*:' is unfortunately a special case, because it is two tokens in # 'var v*: int'. if L.buf[L.bufpos+1] == ':' and L.buf[L.bufpos+2] notin OpChars: var h = 0 !& ord('*') endOperator(L, tok, L.bufpos+1, h) else: getOperator(L, tok) of ',': tok.tokType = tkComma inc(L.bufpos) of 'r', 'R': if L.buf[L.bufpos + 1] == '\"': inc(L.bufpos) getString(L, tok, raw) else: getSymbol(L, tok) of '(': inc(L.bufpos) if L.buf[L.bufpos] == '.' and L.buf[L.bufpos+1] != '.': tok.tokType = tkParDotLe inc(L.bufpos) else: tok.tokType = tkParLe when defined(nimsuggest): if L.fileIdx == L.config.m.trackPos.fileIndex and tok.col < L.config.m.trackPos.col and tok.line == L.config.m.trackPos.line.int and L.config.ideCmd == ideCon: L.config.m.trackPos.col = tok.col.int16 of ')': tok.tokType = tkParRi inc(L.bufpos) of '[': inc(L.bufpos) if L.buf[L.bufpos] == '.' and L.buf[L.bufpos+1] != '.': tok.tokType = tkBracketDotLe inc(L.bufpos) elif L.buf[L.bufpos] == ':': tok.tokType = tkBracketLeColon inc(L.bufpos) else: tok.tokType = tkBracketLe of ']': tok.tokType = tkBracketRi inc(L.bufpos) of '.': when defined(nimsuggest): if L.fileIdx == L.config.m.trackPos.fileIndex and tok.col+1 == L.config.m.trackPos.col and tok.line == L.config.m.trackPos.line.int and L.config.ideCmd == ideSug: tok.tokType = tkDot L.config.m.trackPos.col = tok.col.int16 inc(L.bufpos) atTokenEnd() return if L.buf[L.bufpos+1] == ']': tok.tokType = tkBracketDotRi inc(L.bufpos, 2) elif L.buf[L.bufpos+1] == '}': tok.tokType = tkCurlyDotRi inc(L.bufpos, 2) elif L.buf[L.bufpos+1] == ')': tok.tokType = tkParDotRi inc(L.bufpos, 2) else: getOperator(L, tok) of '{': inc(L.bufpos) if L.buf[L.bufpos] == '.' and L.buf[L.bufpos+1] != '.': tok.tokType = tkCurlyDotLe inc(L.bufpos) else: tok.tokType = tkCurlyLe of '}': tok.tokType = tkCurlyRi inc(L.bufpos) of ';': tok.tokType = tkSemiColon inc(L.bufpos) of '`': tok.tokType = tkAccent inc(L.bufpos) of '_': inc(L.bufpos) if L.buf[L.bufpos] notin SymChars+{'_'}: tok.tokType = tkSymbol tok.ident = L.cache.getIdent("_") else: tok.literal = $c tok.tokType = tkInvalid lexMessage(L, errGenerated, "invalid token: " & c & " (\\" & $(ord(c)) & ')') of '\"': # check for generalized raw string literal: let mode = if L.bufpos > 0 and L.buf[L.bufpos-1] in SymChars: generalized else: normal getString(L, tok, mode) if mode == generalized: # tkRStrLit -> tkGStrLit # tkTripleStrLit -> tkGTripleStrLit inc(tok.tokType, 2) of '\'': tok.tokType = tkCharLit getCharacter(L, tok) tok.tokType = tkCharLit of '0'..'9': getNumber(L, tok) let c = L.buf[L.bufpos] if c in SymChars+{'_'}: if c in UnicodeOperatorStartChars and unicodeOprLen(L.buf, L.bufpos)[0] != 0: discard else: lexMessage(L, errGenerated, "invalid token: no whitespace between number and identifier") of '-': if L.buf[L.bufpos+1] in {'0'..'9'} and (L.bufpos-1 == 0 or L.buf[L.bufpos-1] in UnaryMinusWhitelist): # x)-23 # binary minus # ,-23 # unary minus # \n-78 # unary minus? Yes. # =-3 # parsed as `=-` anyway getNumber(L, tok) let c = L.buf[L.bufpos] if c in SymChars+{'_'}: if c in UnicodeOperatorStartChars and unicodeOprLen(L.buf, L.bufpos)[0] != 0: discard else: lexMessage(L, errGenerated, "invalid token: no whitespace between number and identifier") else: getOperator(L, tok) else: if c in OpChars: getOperator(L, tok) elif c == nimlexbase.EndOfFile: tok.tokType = tkEof tok.indent = 0 else: tok.literal = $c tok.tokType = tkInvalid lexMessage(L, errGenerated, "invalid token: " & c & " (\\" & $(ord(c)) & ')') inc(L.bufpos) atTokenEnd() proc getIndentWidth*(fileIdx: FileIndex, inputstream: PLLStream; cache: IdentCache; config: ConfigRef): int = result = 0 var lex: Lexer = default(Lexer) var tok: Token = default(Token) openLexer(lex, fileIdx, inputstream, cache, config) var prevToken = tkEof while tok.tokType != tkEof: rawGetTok(lex, tok) if tok.indent > 0 and prevToken in {tkColon, tkEquals, tkType, tkConst, tkLet, tkVar, tkUsing}: result = tok.indent if result > 0: break prevToken = tok.tokType closeLexer(lex) proc getPrecedence*(ident: PIdent): int = ## assumes ident is binary operator already let tokType = if ident.id in ord(tokKeywordLow) - ord(tkSymbol)..ord(tokKeywordHigh) - ord(tkSymbol): TokType(ident.id + ord(tkSymbol)) else: tkOpr tok = Token(ident: ident, tokType: tokType) getPrecedence(tok)