#
#
#           The Nim Compiler
#        (c) Copyright 2015 Andreas Rumpf
#
#    See the file "copying.txt", included in this
#    distribution, for details about the copyright.
#

# This lexer is handwritten for efficiency. I used an elegant buffering
# scheme which I have not seen anywhere else:
# We guarantee that a whole line is in the buffer. Thus only when scanning
# the \n or \r character we have to check whether we need to read in the next
# chunk. (\n or \r already need special handling for incrementing the line
# counter; choosing both \n and \r allows the lexer to properly read Unix,
# DOS or Macintosh text files, even when it is not the native format.

import
  hashes, options, msgs, strutils, platform, idents, nimlexbase, llstream,
  wordrecg, lineinfos, pathutils, parseutils

when defined(nimPreviewSlimSystem):
  import std/[assertions, formatfloat]

const
  MaxLineLength* = 80         # lines longer than this lead to a warning
  numChars*: set[char] = {'0'..'9', 'a'..'z', 'A'..'Z'}
  SymChars*: set[char] = {'a'..'z', 'A'..'Z', '0'..'9', '\x80'..'\xFF'}
  SymStartChars*: set[char] = {'a'..'z', 'A'..'Z', '\x80'..'\xFF'}
  OpChars*: set[char] = {'+', '-', '*', '/', '\\', '<', '>', '!', '?', '^', '.',
    '|', '=', '%', '&', '$', '@', '~', ':'}
  UnaryMinusWhitelist = {' ', '\t', '\n', '\r', ',', ';', '(', '[', '{'}

# don't forget to update the 'highlite' module if these charsets should change

type
  TokType* = enum
    tkInvalid = "tkInvalid", tkEof = "[EOF]", # order is important here!
    tkSymbol = "tkSymbol", # keywords:
    tkAddr = "addr", tkAnd = "and", tkAs = "as", tkAsm = "asm",
    tkBind = "bind", tkBlock = "block", tkBreak = "break", tkCase = "case", tkCast = "cast",
    tkConcept = "concept", tkConst = "const", tkContinue = "continue", tkConverter = "converter",
    tkDefer = "defer", tkDiscard = "discard", tkDistinct = "distinct", tkDiv = "div", tkDo = "do",
    tkElif = "elif", tkElse = "else", tkEnd = "end", tkEnum = "enum", tkExcept = "except", tkExport = "export",
    tkFinally = "finally", tkFor = "for", tkFrom = "from", tkFunc = "func",
    tkIf = "if", tkImport = "import", tkIn = "in", tkInclude = "include", tkInterface = "interface",
    tkIs = "is", tkIsnot = "isnot", tkIterator = "iterator",
    tkLet = "let",
    tkMacro = "macro", tkMethod = "method", tkMixin = "mixin", tkMod = "mod",
    tkNil = "nil", tkNot = "not", tkNotin = "notin",
    tkObject = "object", tkOf = "of", tkOr = "or", tkOut = "out",
    tkProc = "proc", tkPtr = "ptr", tkRaise = "raise", tkRef = "ref", tkReturn = "return",
    tkShl = "shl", tkShr = "shr", tkStatic = "static",
    tkTemplate = "template",
    tkTry = "try", tkTuple = "tuple", tkType = "type", tkUsing = "using",
    tkVar = "var", tkWhen = "when", tkWhile = "while", tkXor = "xor",
    tkYield = "yield", # end of keywords

    tkIntLit = "tkIntLit", tkInt8Lit = "tkInt8Lit", tkInt16Lit = "tkInt16Lit",
    tkInt32Lit = "tkInt32Lit", tkInt64Lit = "tkInt64Lit",
    tkUIntLit = "tkUIntLit", tkUInt8Lit = "tkUInt8Lit", tkUInt16Lit = "tkUInt16Lit",
    tkUInt32Lit = "tkUInt32Lit", tkUInt64Lit = "tkUInt64Lit",
    tkFloatLit = "tkFloatLit", tkFloat32Lit = "tkFloat32Lit",
    tkFloat64Lit = "tkFloat64Lit", tkFloat128Lit = "tkFloat128Lit",
    tkStrLit = "tkStrLit", tkRStrLit = "tkRStrLit", tkTripleStrLit = "tkTripleStrLit",
    tkGStrLit = "tkGStrLit", tkGTripleStrLit = "tkGTripleStrLit", tkCharLit = "tkCharLit",
    tkCustomLit = "tkCustomLit",

    tkParLe = "(", tkParRi = ")", tkBracketLe = "[",
    tkBracketRi = "]", tkCurlyLe = "{", tkCurlyRi = "}",
    tkBracketDotLe = "[.", tkBracketDotRi = ".]",
    tkCurlyDotLe = "{.", tkCurlyDotRi = ".}",
    tkParDotLe = "(.", tkParDotRi = ".)",
    tkComma = ",", tkSemiColon = ";",
    tkColon = ":", tkColonColon = "::", tkEquals = "=",
    tkDot = ".", tkDotDot = "..", tkBracketLeColon = "[:",
    tkOpr, tkComment, tkAccent = "`",
    # these are fake tokens used by renderer.nim
    tkSpaces, tkInfixOpr, tkPrefixOpr, tkPostfixOpr, tkHideableStart, tkHideableEnd

  TokTypes* = set[TokType]

const
  weakTokens = {tkComma, tkSemiColon, tkColon,
                tkParRi, tkParDotRi, tkBracketRi, tkBracketDotRi,
                tkCurlyRi} # \
    # tokens that should not be considered for previousToken
  tokKeywordLow* = succ(tkSymbol)
  tokKeywordHigh* = pred(tkIntLit)

type
  NumericalBase* = enum
    base10,                   # base10 is listed as the first element,
                              # so that it is the correct default value
    base2, base8, base16

  Token* = object             # a Nim token
    tokType*: TokType         # the type of the token
    indent*: int              # the indentation; != -1 if the token has been
                              # preceded with indentation
    ident*: PIdent            # the parsed identifier
    iNumber*: BiggestInt      # the parsed integer literal
    fNumber*: BiggestFloat    # the parsed floating point literal
    base*: NumericalBase      # the numerical base; only valid for int
                              # or float literals
    strongSpaceA*: int8       # leading spaces of an operator
    strongSpaceB*: int8       # trailing spaces of an operator
    literal*: string          # the parsed (string) literal; and
                              # documentation comments are here too
    line*, col*: int
    when defined(nimpretty):
      offsetA*, offsetB*: int # used for pretty printing so that literals
                              # like 0b01 or r"\L" are unaffected
      commentOffsetA*, commentOffsetB*: int

  ErrorHandler* = proc (conf: ConfigRef; info: TLineInfo; msg: TMsgKind; arg: string)
  Lexer* = object of TBaseLexer
    fileIdx*: FileIndex
    indentAhead*: int         # if > 0 an indentation has already been read
                              # this is needed because scanning comments
                              # needs so much look-ahead
    currLineIndent*: int
    strongSpaces*, allowTabs*: bool
    errorHandler*: ErrorHandler
    cache*: IdentCache
    when defined(nimsuggest):
      previousToken: TLineInfo
    config*: ConfigRef

proc getLineInfo*(L: Lexer, tok: Token): TLineInfo {.inline.} =
  result = newLineInfo(L.fileIdx, tok.line, tok.col)
  when defined(nimpretty):
    result.offsetA = tok.offsetA
    result.offsetB = tok.offsetB
    result.commentOffsetA = tok.commentOffsetA
    result.commentOffsetB = tok.commentOffsetB

proc isKeyword*(kind: TokType): bool =
  (kind >= tokKeywordLow) and (kind <= tokKeywordHigh)

template ones(n): untyped = ((1 shl n)-1) # for utf-8 conversion

proc isNimIdentifier*(s: string): bool =
  let sLen = s.len
  if sLen > 0 and s[0] in SymStartChars:
    var i = 1
    while i < sLen:
      if s[i] == '_': inc(i)
      if i < sLen and s[i] notin SymChars: return
      inc(i)
    result = true

proc `$`*(tok: Token): string =
  case tok.tokType
  of tkIntLit..tkInt64Lit: $tok.iNumber
  of tkFloatLit..tkFloat64Lit: $tok.fNumber
  of tkInvalid, tkStrLit..tkCharLit, tkComment: tok.literal
  of tkParLe..tkColon, tkEof, tkAccent: $tok.tokType
  else:
    if tok.ident != nil: tok.ident.s
    else: ""

proc prettyTok*(tok: Token): string =
  if isKeyword(tok.tokType): "keyword " & tok.ident.s
  else: $tok

proc printTok*(conf: ConfigRef; tok: Token) =
  # xxx factor with toLocation
  msgWriteln(conf, $tok.line & ":" & $tok.col & "\t" & $tok.tokType & " " & $tok)

proc initToken*(L: var Token) =
  L.tokType = tkInvalid
  L.iNumber = 0
  L.indent = 0
  L.strongSpaceA = 0
  L.literal = ""
  L.fNumber = 0.0
  L.base = base10
  L.ident = nil
  when defined(nimpretty):
    L.commentOffsetA = 0
    L.commentOffsetB = 0

proc fillToken(L: var Token) =
  L.tokType = tkInvalid
  L.iNumber = 0
  L.indent = 0
  L.strongSpaceA = 0
  setLen(L.literal, 0)
  L.fNumber = 0.0
  L.base = base10
  L.ident = nil
  when defined(nimpretty):
    L.commentOffsetA = 0
    L.commentOffsetB = 0

proc openLexer*(lex: var Lexer, fileIdx: FileIndex, inputstream: PLLStream;
                cache: IdentCache; config: ConfigRef) =
  openBaseLexer(lex, inputstream)
  lex.fileIdx = fileIdx
  lex.indentAhead = -1
  lex.currLineIndent = 0
  inc(lex.lineNumber, inputstream.lineOffset)
  lex.cache = cache
  when defined(nimsuggest):
    lex.previousToken.fileIndex = fileIdx
  lex.config = config

proc openLexer*(lex: var Lexer, filename: AbsoluteFile, inputstream: PLLStream;
                cache: IdentCache; config: ConfigRef) =
  openLexer(lex, fileInfoIdx(config, filename), inputstream, cache, config)

proc closeLexer*(lex: var Lexer) =
  if lex.config != nil:
    inc(lex.config.linesCompiled, lex.lineNumber)
  closeBaseLexer(lex)

proc getLineInfo(L: Lexer): TLineInfo =
  result = newLineInfo(L.fileIdx, L.lineNumber, getColNumber(L, L.bufpos))

proc dispMessage(L: Lexer; info: TLineInfo; msg: TMsgKind; arg: string) =
  if L.errorHandler.isNil:
    msgs.message(L.config, info, msg, arg)
  else:
    L.errorHandler(L.config, info, msg, arg)

proc lexMessage*(L: Lexer, msg: TMsgKind, arg = "") =
  L.dispMessage(getLineInfo(L), msg, arg)

proc lexMessageTok*(L: Lexer, msg: TMsgKind, tok: Token, arg = "") =
  var info = newLineInfo(L.fileIdx, tok.line, tok.col)
  L.dispMessage(info, msg, arg)

proc lexMessagePos(L: var Lexer, msg: TMsgKind, pos: int, arg = "") =
  var info = newLineInfo(L.fileIdx, L.lineNumber, pos - L.lineStart)
  L.dispMessage(info, msg, arg)

proc matchTwoChars(L: Lexer, first: char, second: set[char]): bool =
  result = (L.buf[L.bufpos] == first) and (L.buf[L.bufpos + 1] in second)

template tokenBegin(tok, pos) {.dirty.} =
  when defined(nimsuggest):
    var colA = getColNumber(L, pos)
  when defined(nimpretty):
    tok.offsetA = L.offsetBase + pos

template tokenEnd(tok, pos) {.dirty.} =
  when defined(nimsuggest):
    let colB = getColNumber(L, pos)+1
    if L.fileIdx == L.config.m.trackPos.fileIndex and L.config.m.trackPos.col in colA..colB and
        L.lineNumber == L.config.m.trackPos.line.int and L.config.ideCmd in {ideSug, ideCon}:
      L.config.m.trackPos.col = colA.int16
    colA = 0
  when defined(nimpretty):
    tok.offsetB = L.offsetBase + pos

template tokenEndIgnore(tok, pos) =
  when defined(nimsuggest):
    let colB = getColNumber(L, pos)
    if L.fileIdx == L.config.m.trackPos.fileIndex and L.config.m.trackPos.col in colA..colB and
        L.lineNumber == L.config.m.trackPos.line.int and L.config.ideCmd in {ideSug, ideCon}:
      L.config.m.trackPos.fileIndex = trackPosInvalidFileIdx
      L.config.m.trackPos.line = 0'u16
    colA = 0
  when defined(nimpretty):
    tok.offsetB = L.offsetBase + pos

template tokenEndPrevious(tok, pos) =
  when defined(nimsuggest):
    # when we detect the cursor in whitespace, we attach the track position
    # to the token that came before that, but only if we haven't detected
    # the cursor in a string literal or comment:
    let colB = getColNumber(L, pos)
    if L.fileIdx == L.config.m.trackPos.fileIndex and L.config.m.trackPos.col in colA..colB and
        L.lineNumber == L.config.m.trackPos.line.int and L.config.ideCmd in {ideSug, ideCon}:
      L.config.m.trackPos = L.previousToken
      L.config.m.trackPosAttached = true
    colA = 0
  when defined(nimpretty):
    tok.offsetB = L.offsetBase + pos

template eatChar(L: var Lexer, t: var Token, replacementChar: char) =
  t.literal.add(replacementChar)
  inc(L.bufpos)

template eatChar(L: var Lexer, t: var Token) =
  t.literal.add(L.buf[L.bufpos])
  inc(L.bufpos)

proc getNumber(L: var Lexer, result: var Token) =
  proc matchUnderscoreChars(L: var Lexer, tok: var Token, chars: set[char]): Natural =
    var pos = L.bufpos              # use registers for pos, buf
    result = 0
    while true:
      if L.buf[pos] in chars:
        tok.literal.add(L.buf[pos])
        inc(pos)
        inc(result)
      else:
        break
      if L.buf[pos] == '_':
        if L.buf[pos+1] notin chars:
          lexMessage(L, errGenerated,
            "only single underscores may occur in a token and token may not " &
            "end with an underscore: e.g. '1__1' and '1_' are invalid")
          break
        tok.literal.add('_')
        inc(pos)
    L.bufpos = pos

  proc matchChars(L: var Lexer, tok: var Token, chars: set[char]) =
    var pos = L.bufpos              # use registers for pos, buf
    while L.buf[pos] in chars:
      tok.literal.add(L.buf[pos])
      inc(pos)
    L.bufpos = pos

  proc lexMessageLitNum(L: var Lexer, msg: string, startpos: int, msgKind = errGenerated) =
    # Used to get slightly human friendlier err messages.
    const literalishChars = {'A'..'Z', 'a'..'z', '0'..'9', '_', '.', '\''}
    var msgPos = L.bufpos
    var t: Token
    t.literal = ""
    L.bufpos = startpos # Use L.bufpos as pos because of matchChars
    matchChars(L, t, literalishChars)
    # We must verify +/- specifically so that we're not past the literal
    if L.buf[L.bufpos] in {'+', '-'} and
        L.buf[L.bufpos - 1] in {'e', 'E'}:
      t.literal.add(L.buf[L.bufpos])
      inc(L.bufpos)
      matchChars(L, t, literalishChars)
    if L.buf[L.bufpos] in literalishChars:
      t.literal.add(L.buf[L.bufpos])
      inc(L.bufpos)
      matchChars(L, t, {'0'..'9'})
    L.bufpos = msgPos
    lexMessage(L, msgKind, msg % t.literal)

  var
    xi: BiggestInt
    isBase10 = true
    numDigits = 0
  const
    # 'c', 'C' is deprecated
    baseCodeChars = {'X', 'x', 'o', 'b', 'B', 'c', 'C'}
    literalishChars = baseCodeChars + {'A'..'F', 'a'..'f', '0'..'9', '_', '\''}
    floatTypes = {tkFloatLit, tkFloat32Lit, tkFloat64Lit, tkFloat128Lit}
  result.tokType = tkIntLit   # int literal until we know better
  result.literal = ""
  result.base = base10
  tokenBegin(result, L.bufpos)

  var isPositive = true
  if L.buf[L.bufpos] == '-':
    eatChar(L, result)
    isPositive = false

  let startpos = L.bufpos

  template setNumber(field, value) =
    field = (if isPositive: value else: -value)

  # First stage: find out base, make verifications, build token literal string
  # {'c', 'C'} is added for deprecation reasons to provide a clear error message
  if L.buf[L.bufpos] == '0' and L.buf[L.bufpos + 1] in baseCodeChars + {'c', 'C', 'O'}:
    isBase10 = false
    eatChar(L, result, '0')
    case L.buf[L.bufpos]
    of 'c', 'C':
      lexMessageLitNum(L,
                       "$1 will soon be invalid for oct literals; Use '0o' " &
                       "for octals. 'c', 'C' prefix",
                       startpos,
                       warnDeprecated)
      eatChar(L, result, 'c')
      numDigits = matchUnderscoreChars(L, result, {'0'..'7'})
    of 'O':
      lexMessageLitNum(L, "$1 is an invalid int literal; For octal literals " &
                          "use the '0o' prefix.", startpos)
    of 'x', 'X':
      eatChar(L, result, 'x')
      numDigits = matchUnderscoreChars(L, result, {'0'..'9', 'a'..'f', 'A'..'F'})
    of 'o':
      eatChar(L, result, 'o')
      numDigits = matchUnderscoreChars(L, result, {'0'..'7'})
    of 'b', 'B':
      eatChar(L, result, 'b')
      numDigits = matchUnderscoreChars(L, result, {'0'..'1'})
    else:
      internalError(L.config, getLineInfo(L), "getNumber")
    if numDigits == 0:
      lexMessageLitNum(L, "invalid number: '$1'", startpos)
  else:
    discard matchUnderscoreChars(L, result, {'0'..'9'})
    if (L.buf[L.bufpos] == '.') and (L.buf[L.bufpos + 1] in {'0'..'9'}):
      result.tokType = tkFloatLit
      eatChar(L, result, '.')
      discard matchUnderscoreChars(L, result, {'0'..'9'})
    if L.buf[L.bufpos] in {'e', 'E'}:
      result.tokType = tkFloatLit
      eatChar(L, result)
      if L.buf[L.bufpos] in {'+', '-'}:
        eatChar(L, result)
      discard matchUnderscoreChars(L, result, {
#
#
#           The Nimrod Compiler
#        (c) Copyright 2012 Andreas Rumpf
#
#    See the file "copying.txt", included in this
#    distribution, for details about the copyright.
#
## Semantic analysis that deals with threads: Possible race conditions should
## be reported some day.
##
##
## ========================
## No heap sharing analysis
## ========================
##
## The only crucial operation that can violate the heap invariants is the
## write access. The analysis needs to distinguish between 'unknown', 'mine',
## and 'theirs' memory and pointers. Assignments 'whatever <- unknown' are
## invalid, and so are 'theirs <- whatever' but not 'mine <- theirs'. Since
## strings and sequences are heap allocated, they are affected too:
##
## .. code-block:: nimrod
##   proc p() =
##     global = "alloc this string" # ugh!
##
## Thus the analysis is concerned with any type that contains a GC'ed
## reference...
## If the type system distinguished between 'ref' and '!ref' and threads
## could not take '!ref' as input parameters, the analysis would simply need to
## reject any write access to a global variable which contains GC'ed data.
## Thanks to the write barrier of the GC, this is exactly what needs to be
## done! Every write access to a global that contains GC'ed data needs to
## be prevented! Unfortunately '!ref' is not implemented yet...
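##
## In other words, the intended check boils down to something like this
## (illustrative only; 'gData', 'gCounter' and 'worker' are invented names):
##
## .. code-block:: nimrod
##   var gData: seq[string]   # global containing GC'ed data
##   var gCounter: int        # global without GC'ed data
##
##   proc worker() {.thread.} =
##     gData.add("x")         # write access to a GC'ed global: must be rejected
##     inc(gCounter)          # no GC'ed data: not rejected by this analysis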
##
## The assignment target is essential for the algorithm: only
## write accesses to heap locations and global variables are critical and need
## to be checked. Access via 'var' parameters is no problem to analyse since
## the arguments' locations are available to the analysis.
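##
## As a purely illustrative sketch of which targets matter ('TObj' and
## 'update' are invented names):
##
## .. code-block:: nimrod
##   type TObj = object
##     field: string
##
##   proc update(dest: var string, r: ref TObj, s: string) =
##     dest = s       # 'var' parameter: analysed via the argument's location
##     r.field = s    # heap location: this write access must be checked
##     var tmp = s    # plain local of value type: harmless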
##
## However, this is tricky:
##
##  var x = globalVar      # 'x' points to 'theirs'
##  while true:
##    globalVar = x        # NOT OK: 'theirs <- theirs' invalid due to
##                         # write barrier!
##    x = "new string"     # ugh: 'x is toUnknown'!
##
## --> Solution: toUnknown is never allowed anywhere!
##
##
## Beware that the same proc might need to be
## analysed multiple times! Oh, and watch out for recursion! Recursion is handled
## by a stack of symbols that we are currently processing; if we come back to the
## same symbol, we skip this check (and assume no error in the recursive case).
## However, this is wrong: we need to check for the particular combination
## of (procsym, threadOwner(arg1), threadOwner(arg2), ...)!
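##
## For example (an informal sketch; 'p' and 'TBox' are invented for this
## illustration and are not part of the module):
##
## .. code-block:: nimrod
##   type TBox = object
##     next: ref TBox
##
##   proc p(dest: ref TBox, x: ref TBox) =
##     dest.next = x
##
## A call to 'p' is harmless when both arguments live on the caller's own
## heap, but it is a foreign-heap write when 'dest' is 'theirs'; hence the
## cached verdict must depend on the owners of the arguments and not on the
## proc symbol alone.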
import
  ast, astalgo, strutils, hashes, options, msgs, idents, types, os,
  renderer, tables, rodread

type
  TThreadOwner = enum
    toUndefined, # not computed yet
    toVoid,      # no return type
    toNil,       # cycle in computation or nil: can be overwritten
    toTheirs,    # some other heap
    toMine       # mine heap
  TCall = object {.pure.}
    callee: PSym # what if callee is an indirect call?
    args: seq[TThreadOwner]
  PProcCtx = ref TProcCtx
  TProcCtx = object {.pure.}
    nxt: PProcCtx # can be stacked
    mapping: tables.TTable[int, TThreadOwner] # int = symbol ID
    owner: PSym # current owner

var
  computed = tables.initTable[TCall, TThreadOwner]()

proc hash(c: TCall): THash =
  result = hash(c.callee.id)
  for a in items(c.args): result = result !& hash(ord(a))
  result = !$result

proc `==`(a, b: TCall): bool =
  if a.callee != b.callee: return
  if a.args.len != b.args.len: return
  for i in 0..a.args.len-1:
    if a.args[i] != b.args[i]: return
  result = true

proc newProcCtx(owner: PSym): PProcCtx =
  assert owner != nil
  new(result)
  result.mapping = tables.initTable[int, TThreadOwner]()
  result.owner = owner

proc analyse(c: PProcCtx, n: PNode): TThreadOwner

proc analyseSym(c: PProcCtx, n: PNode): TThreadOwner =
  var v = n.sym
  result = c.mapping[v.id]
  if result != toUndefined: return
  case v.kind
  of skVar, skLet, skResult:
    result = toNil
    if sfGlobal in v.flags:
      if sfThread in v.flags:
        result = toMine
      elif containsGarbageCollectedRef(v.typ):
        result = toTheirs
  of skTemp, skForVar: result = toNil
  of skConst: result = toMine
  of skParam:
    result = c.mapping[v.id]
    if result == toUndefined:
      InternalError(n.info, "param not set: " & v.name.s)
  else:
    result = toNil
  c.mapping[v.id] = result

proc lvalueSym(n: PNode): PNode =
  result = n
  while result.kind in {nkDotExpr, nkCheckedFieldExpr,
                        nkBracketExpr, nkDerefExpr, nkHiddenDeref}:
    result = result.sons[0]

proc writeAccess(c: PProcCtx, n: PNode, owner: TThreadOwner) =
  if owner notin {toNil, toMine, toTheirs}:
    InternalError(n.info, "writeAccess: " & $owner)
  var a = lvalueSym(n)
  if a.kind == nkSym:
    var v = a.sym
    var lastOwner = analyseSym(c, a)
    case lastOwner
    of toNil:
      # fine, toNil can be overwritten
      var newOwner: TThreadOwner
      if sfGlobal in v.flags:
        newOwner = owner
      elif containsTyRef(v.typ):
        # ``var local = gNode`` --> ok, but ``local`` is theirs!
        newOwner = owner
      else:
        # ``var local = gString`` --> string copy: ``local`` is mine!
        newOwner = toMine
        # XXX BUG what if the tuple contains both ``tyRef`` and ``tyString``?
      c.mapping[v.id] = newOwner
    of toVoid, toUndefined: InternalError(n.info, "writeAccess")
    of toTheirs: Message(n.info, warnWriteToForeignHeap)
    of toMine:
      if lastOwner != owner and owner !=<