diff options
Diffstat (limited to 'compiler/c2nim/clex.nim')
-rwxr-xr-x | compiler/c2nim/clex.nim | 752 |
1 files changed, 752 insertions, 0 deletions
# Source: diff --git a/compiler/c2nim/clex.nim b/compiler/c2nim/clex.nim
# (new file mode 100755, index 000000000..5a67f9475)
#
#
#      c2nim - C to Nimrod source converter
#        (c) Copyright 2010 Andreas Rumpf
#
#    See the file "copying.txt", included in this
#    distribution, for details about the copyright.
#

# This module implements an Ansi C scanner. This is an adaption from
# the scanner module. Keywords are not handled here, but in the parser to make
# it more flexible.


import
  options, msgs, strutils, platform, lexbase, llstream

const
  MaxLineLength* = 80         # lines longer than this lead to a warning
  numChars*: TCharSet = {'0'..'9', 'a'..'z', 'A'..'Z'}
  SymChars*: TCharSet = {'a'..'z', 'A'..'Z', '0'..'9', '_', '\x80'..'\xFF'}
  SymStartChars*: TCharSet = {'a'..'z', 'A'..'Z', '_', '\x80'..'\xFF'}

type
  TTokKind* = enum
    pxInvalid, pxEof,
    pxMacroParam,             # fake token: macro parameter (with its index)
    pxStarComment,            # /* */ comment
    pxLineComment,            # // comment
    pxDirective,              # #define, etc.
    pxDirectiveParLe,         # #define m( with parle (yes, C is that ugly!)
    pxDirConc,                # ##
    pxNewLine,                # newline: end of directive
    pxAmp,                    # &
    pxAmpAmp,                 # &&
    pxAmpAsgn,                # &=
    pxAmpAmpAsgn,             # &&=
    pxBar,                    # |
    pxBarBar,                 # ||
    pxBarAsgn,                # |=
    pxBarBarAsgn,             # ||=
    pxNot,                    # !
    pxPlusPlus,               # ++
    pxMinusMinus,             # --
    pxPlus,                   # +
    pxPlusAsgn,               # +=
    pxMinus,                  # -
    pxMinusAsgn,              # -=
    pxMod,                    # %
    pxModAsgn,                # %=
    pxSlash,                  # /
    pxSlashAsgn,              # /=
    pxStar,                   # *
    pxStarAsgn,               # *=
    pxHat,                    # ^
    pxHatAsgn,                # ^=
    pxAsgn,                   # =
    pxEquals,                 # ==
    pxDot,                    # .
    pxDotDotDot,              # ...
    pxLe,                     # <=
    pxLt,                     # <
    pxGe,                     # >=
    pxGt,                     # >
    pxNeq,                    # !=
    pxConditional,            # ?
    pxShl,                    # <<
    pxShlAsgn,                # <<=
    pxShr,                    # >>
    pxShrAsgn,                # >>=
    pxTilde,                  # ~
    pxTildeAsgn,              # ~=
    pxArrow,                  # ->
    pxScope,                  # ::

    pxStrLit,
    pxCharLit,
    pxSymbol,                 # a symbol
    pxIntLit,
    pxInt64Lit,               # long constant like 0x70fffffff or out of int range
    pxFloatLit,
    pxParLe, pxBracketLe, pxCurlyLe, # this order is important
    pxParRi, pxBracketRi, pxCurlyRi, # for macro argument parsing!
    pxComma, pxSemiColon, pxColon
  TTokKinds* = set[TTokKind]

type
  TNumericalBase* = enum base10, base2, base8, base16
  TToken* = object
    xkind*: TTokKind          # the type of the token
    s*: string                # parsed symbol, char or string literal
    iNumber*: BiggestInt      # the parsed integer literal;
                              # if xkind == pxMacroParam: parameter's position
    fNumber*: BiggestFloat    # the parsed floating point literal
    base*: TNumericalBase     # the numerical base; only valid for int
                              # or float literals
    next*: ref TToken         # for C we need arbitrary look-ahead :-(

  TLexer* = object of TBaseLexer
    filename*: string
    inDirective: bool         # set by `getDirective`; cleared by `skip` when
                              # the directive's terminating newline is reached

proc getTok*(L: var TLexer, tok: var TToken)
proc PrintTok*(tok: TToken)
proc `$`*(tok: TToken): string
# implementation

var
  gLinesCompiled*: int        # incremented by `closeLexer` with the lexer's
                              # line number

proc fillToken(L: var TToken) =
  ## Resets the scalar fields of the token `L` to their defaults.
  ## The `next` link is left untouched.
  L.xkind = pxInvalid
  L.iNumber = 0
  L.s = ""
  L.fNumber = 0.0
  L.base = base10

proc openLexer*(lex: var TLexer, filename: string, inputstream: PLLStream) =
  ## Opens the underlying base lexer on `inputstream` and remembers
  ## `filename` for later line information.
  openBaseLexer(lex, inputstream)
  lex.filename = filename

proc closeLexer*(lex: var TLexer) =
  ## Adds the lexer's current line number to `gLinesCompiled` and closes the
  ## underlying base lexer.
  inc(gLinesCompiled, lex.LineNumber)
  closeBaseLexer(lex)

proc getColumn*(L: TLexer): int =
  ## Returns the column number of the current buffer position.
  result = getColNumber(L, L.bufPos)

proc getLineInfo*(L: TLexer): TLineInfo =
  ## Builds a `TLineInfo` from the lexer's filename, current line number and
  ## current column.
  result = newLineInfo(L.filename, L.linenumber, getColNumber(L, L.bufpos))

proc lexMessage*(L: TLexer, msg: TMsgKind, arg = "") =
  ## Reports `msg` (with optional argument `arg`) at the lexer's current
  ## position.
  msgs.GenericMessage(getLineInfo(L), msg, arg)

proc lexMessagePos(L: var TLexer, msg: TMsgKind, pos: int, arg = "") =
  ## Reports `msg` at buffer position `pos` of the current line; the column
  ## is computed as ``pos - L.lineStart``.
  var info = newLineInfo(L.filename, L.linenumber, pos - L.lineStart)
  msgs.GenericMessage(info, msg, arg)

proc TokKindToStr*(k: TTokKind): string =
  ## Returns a printable representation of the token kind `k`: the literal
  ## spelling for operators and punctuation, a bracketed description for
  ## everything else.
  case k
  of pxEof: result = "[EOF]"
  of pxInvalid: result = "[invalid]"
  of pxMacroParam: result = "[macro param]"
  of pxStarComment, pxLineComment: result = "[comment]"
  of pxStrLit: result = "[string literal]"
  of pxCharLit: result = "[char literal]"

  of pxDirective, pxDirectiveParLe: result = "#" # #define, etc.
  of pxDirConc: result = "##"
  of pxNewLine: result = "[NewLine]"
  of pxAmp: result = "&"
  of pxAmpAmp: result = "&&"
  of pxAmpAsgn: result = "&="
  of pxAmpAmpAsgn: result = "&&="
  of pxBar: result = "|"
  of pxBarBar: result = "||"
  of pxBarAsgn: result = "|="
  of pxBarBarAsgn: result = "||="
  of pxNot: result = "!"
  of pxPlusPlus: result = "++"
  of pxMinusMinus: result = "--"
  of pxPlus: result = "+"
  of pxPlusAsgn: result = "+="
  of pxMinus: result = "-"
  of pxMinusAsgn: result = "-="
  of pxMod: result = "%"
  of pxModAsgn: result = "%="
  of pxSlash: result = "/"
  of pxSlashAsgn: result = "/="
  of pxStar: result = "*"
  of pxStarAsgn: result = "*="
  of pxHat: result = "^"
  of pxHatAsgn: result = "^="
  of pxAsgn: result = "="
  of pxEquals: result = "=="
  of pxDot: result = "."
  of pxDotDotDot: result = "..."
  of pxLe: result = "<="
  of pxLt: result = "<"
  of pxGe: result = ">="
  of pxGt: result = ">"
  of pxNeq: result = "!="
  of pxConditional: result = "?"
  of pxShl: result = "<<"
  of pxShlAsgn: result = "<<="
  of pxShr: result = ">>"
  of pxShrAsgn: result = ">>="
  of pxTilde: result = "~"
  of pxTildeAsgn: result = "~="
  of pxArrow: result = "->"
  of pxScope: result = "::"

  of pxSymbol: result = "[identifier]"
  of pxIntLit, pxInt64Lit: result = "[integer literal]"
  of pxFloatLit: result = "[floating point literal]"
  of pxParLe: result = "("
  of pxParRi: result = ")"
  of pxBracketLe: result = "["
  of pxBracketRi: result = "]"
  of pxComma: result = ","
  of pxSemiColon: result = ";"
  of pxColon: result = ":"
  of pxCurlyLe: result = "{"
  of pxCurlyRi: result = "}"

proc `$`(tok: TToken): string =
  ## Renders a token: its text for symbols, comments, string literals and
  ## invalid tokens, its value for numeric literals, and `TokKindToStr`
  ## of its kind otherwise.
  case tok.xkind
  of pxSymbol, pxInvalid, pxStarComment, pxLineComment, pxStrLit: result = tok.s
  of pxIntLit, pxInt64Lit: result = $tok.iNumber
  of pxFloatLit: result = $tok.fNumber
  else: result = TokKindToStr(tok.xkind)

proc PrintTok(tok: TToken) =
  ## Writes the string form of `tok` as one line to stdout.
  writeln(stdout, $tok)

proc matchUnderscoreChars(L: var TLexer, tok: var TToken, chars: TCharSet) =
  ## Matches ``([chars]_)*``: appends every consumed character (including
  ## the underscores) to `tok.s` and advances the lexer past them.
  var pos = L.bufpos          # use registers for pos, buf
  var buf = L.buf
  while true:
    if buf[pos] in chars:
      add(tok.s, buf[pos])
      Inc(pos)
    else:
      break
    if buf[pos] == '_':
      add(tok.s, '_')
      Inc(pos)
  L.bufPos = pos

proc isFloatLiteral(s: string): bool =
  ## Returns true if `s` contains a '.', 'e' or 'E'; false otherwise.
  for i in countup(0, len(s)-1):
    if s[i] in {'.', 'e', 'E'}:
      return true

proc getNumber2(L: var TLexer, tok: var TToken) =
  ## Scans a binary literal (``0b...``). Letters are skipped as a type
  ## suffix, underscores are ignored, and the digits '2'..'9' as well as '.'
  ## are reported as `errInvalidNumber`. More than 32 binary digits yield
  ## `pxInt64Lit`, otherwise `pxIntLit`.
  var pos = L.bufpos + 2      # skip 0b
  tok.base = base2
  var xi: biggestInt = 0
  var bits = 0
  while true:
    case L.buf[pos]
    of 'A'..'Z', 'a'..'z':
      # ignore type suffix:
      inc(pos)
    of '2'..'9', '.':
      lexMessage(L, errInvalidNumber)
      inc(pos)
    of '_':
      inc(pos)
    of '0', '1':
      xi = `shl`(xi, 1) or (ord(L.buf[pos]) - ord('0'))
      inc(pos)
      inc(bits)
    else: break
  tok.iNumber = xi
  if (bits > 32): tok.xkind = pxInt64Lit
  else: tok.xkind = pxIntLit
  L.bufpos = pos

proc getNumber8(L: var TLexer, tok: var TToken) =
  ## Scans an octal literal (leading ``0``). Letters are skipped as a type
  ## suffix, underscores are ignored, and '8', '9' and '.' are reported as
  ## `errInvalidNumber`.
  var pos = L.bufpos + 1      # skip 0
  tok.base = base8
  var xi: biggestInt = 0
  var bits = 0
  while true:
    case L.buf[pos]
    of 'A'..'Z', 'a'..'z':
      # ignore type suffix:
      inc(pos)
    of '8'..'9', '.':
      lexMessage(L, errInvalidNumber)
      inc(pos)
    of '_':
      inc(pos)
    of '0'..'7':
      xi = `shl`(xi, 3) or (ord(L.buf[pos]) - ord('0'))
      inc(pos)
      inc(bits)
    else: break
  tok.iNumber = xi
  # NOTE(review): unlike getNumber2/getNumber16, `bits` counts *digits* here
  # (3 bits each), so the threshold of 12 corresponds to 36 bits -- confirm
  # whether this is intended before relying on pxIntLit fitting in 32 bits.
  if (bits > 12): tok.xkind = pxInt64Lit
  else: tok.xkind = pxIntLit
  L.bufpos = pos

proc getNumber16(L: var TLexer, tok: var TToken) =
  ## Scans a hexadecimal literal (``0x...``). The letters 'g'..'z' (either
  ## case) are skipped as a type suffix and underscores are ignored. More
  ## than 32 bits worth of hex digits yield `pxInt64Lit`, otherwise
  ## `pxIntLit`.
  var pos = L.bufpos + 2      # skip 0x
  tok.base = base16
  var xi: biggestInt = 0
  var bits = 0
  while true:
    case L.buf[pos]
    of 'G'..'Z', 'g'..'z':
      # ignore type suffix:
      inc(pos)
    of '_': inc(pos)
    of '0'..'9':
      xi = `shl`(xi, 4) or (ord(L.buf[pos]) - ord('0'))
      inc(pos)
      inc(bits, 4)
    of 'a'..'f':
      xi = `shl`(xi, 4) or (ord(L.buf[pos]) - ord('a') + 10)
      inc(pos)
      inc(bits, 4)
    of 'A'..'F':
      xi = `shl`(xi, 4) or (ord(L.buf[pos]) - ord('A') + 10)
      inc(pos)
      inc(bits, 4)
    else: break
  tok.iNumber = xi
  if bits > 32: tok.xkind = pxInt64Lit
  else: tok.xkind = pxIntLit
  L.bufpos = pos

proc getNumber(L: var TLexer, tok: var TToken) =
  ## Scans a decimal integer or floating point literal. The literal's text
  ## is collected in `tok.s` and then parsed; integers outside the `int32`
  ## range become `pxInt64Lit`. Parse failures are reported through
  ## `lexMessage`, in which case `tok.xkind` is not updated. A trailing run
  ## of letters is skipped as a type suffix.
  tok.base = base10
  matchUnderscoreChars(L, tok, {'0'..'9'})
  if (L.buf[L.bufpos] == '.') and (L.buf[L.bufpos + 1] in {'0'..'9'}):
    add(tok.s, '.')
    inc(L.bufpos)
    matchUnderscoreChars(L, tok, {'e', 'E', '+', '-', '0'..'9'})
  try:
    if isFloatLiteral(tok.s):
      tok.fNumber = parseFloat(tok.s)
      tok.xkind = pxFloatLit
    else:
      tok.iNumber = ParseInt(tok.s)
      if (tok.iNumber < low(int32)) or (tok.iNumber > high(int32)):
        tok.xkind = pxInt64Lit
      else:
        tok.xkind = pxIntLit
  except EInvalidValue:
    lexMessage(L, errInvalidNumber, tok.s)
  except EOverflow:
    lexMessage(L, errNumberOutOfRange, tok.s)
  # ignore type suffix:
  while L.buf[L.bufpos] in {'A'..'Z', 'a'..'z'}: inc(L.bufpos)

proc HandleCRLF(L: var TLexer, pos: int): int =
  ## Dispatches to `lexbase.HandleCR` / `lexbase.HandleLF` when the
  ## character at `pos` is CR / LF and returns their result; returns `pos`
  ## unchanged for any other character.
  case L.buf[pos]
  of CR: result = lexbase.HandleCR(L, pos)
  of LF: result = lexbase.HandleLF(L, pos)
  else: result = pos

proc escape(L: var TLexer, tok: var TToken, allowEmpty=false) =
  ## Decodes the escape sequence starting at the backslash under
  ## `L.bufpos` and appends the resulting character to `tok.s`.
  ## Octal escapes consume up to three octal digits. For an unrecognised
  ## character nothing but the backslash is consumed; unless `allowEmpty`
  ## is true, `errInvalidCharacterConstant` is reported in that case.
  inc(L.bufpos)               # skip \
  case L.buf[L.bufpos]
  of 'b', 'B':
    add(tok.s, '\b')
    inc(L.bufpos)
  of 't', 'T':
    add(tok.s, '\t')
    inc(L.bufpos)
  of 'n', 'N':
    add(tok.s, '\L')
    inc(L.bufpos)
  of 'f', 'F':
    add(tok.s, '\f')
    inc(L.bufpos)
  of 'r', 'R':
    add(tok.s, '\r')
    inc(L.bufpos)
  of '\'':
    add(tok.s, '\'')
    inc(L.bufpos)
  of '"':
    add(tok.s, '"')
    inc(L.bufpos)
  of '\\':
    # an escaped backslash denotes a literal backslash (this branch used to
    # append a backspace character by mistake)
    add(tok.s, '\\')
    inc(L.bufpos)
  of '0'..'7':
    var xi = ord(L.buf[L.bufpos]) - ord('0')
    inc(L.bufpos)
    if L.buf[L.bufpos] in {'0'..'7'}:
      xi = (xi shl 3) or (ord(L.buf[L.bufpos]) - ord('0'))
      inc(L.bufpos)
      if L.buf[L.bufpos] in {'0'..'7'}:
        xi = (xi shl 3) or (ord(L.buf[L.bufpos]) - ord('0'))
        inc(L.bufpos)
    add(tok.s, chr(xi))
  elif not allowEmpty:
    lexMessage(L, errInvalidCharacterConstant)

proc getCharLit(L: var TLexer, tok: var TToken) =
  ## Scans a character literal: a single character or one escape sequence
  ## followed by the closing quote. A missing closing quote is reported as
  ## `errMissingFinalQuote`; the token kind is `pxCharLit` either way.
  inc(L.bufpos)               # skip '
  if L.buf[L.bufpos] == '\\':
    escape(L, tok)
  else:
    add(tok.s, L.buf[L.bufpos])
    inc(L.bufpos)
  if L.buf[L.bufpos] == '\'':
    inc(L.bufpos)
  else:
    lexMessage(L, errMissingFinalQuote)
  tok.xkind = pxCharLit

proc getString(L: var TLexer, tok: var TToken) =
  ## Scans a string literal into `tok.s`. Line breaks inside the literal are
  ## consumed without being added to the text. Reaching end of file before
  ## the closing quote reports `errClosingQuoteExpected` against the line
  ## on which the literal started.
  var pos = L.bufPos + 1      # skip "
  var buf = L.buf             # put `buf` in a register
  var line = L.linenumber     # save linenumber for better error message
  while true:
    case buf[pos]
    of '\"':
      Inc(pos)
      break
    of CR:
      pos = lexbase.HandleCR(L, pos)
      buf = L.buf
    of LF:
      pos = lexbase.HandleLF(L, pos)
      buf = L.buf
    of lexbase.EndOfFile:
      # temporarily restore the starting line for the error message
      var line2 = L.linenumber
      L.LineNumber = line
      lexMessagePos(L, errClosingQuoteExpected, L.lineStart)
      L.LineNumber = line2
      break
    of '\\':
      # we allow an empty \ for line concatenation, but we don't require it
      # for line concatenation
      L.bufpos = pos
      escape(L, tok, allowEmpty=true)
      pos = L.bufpos
    else:
      add(tok.s, buf[pos])
      Inc(pos)
  L.bufpos = pos
  tok.xkind = pxStrLit

proc getSymbol(L: var TLexer, tok: var TToken) =
  ## Scans the longest run of `SymChars` into `tok.s` and marks the token
  ## as `pxSymbol`.
  var pos = L.bufpos
  var buf = L.buf
  while true:
    var c = buf[pos]
    if c notin SymChars: break
    add(tok.s, c)
    Inc(pos)
  L.bufpos = pos
  tok.xkind = pxSymbol

proc scanLineComment(L: var TLexer, tok: var TToken) =
  ## Scans one or more consecutive ``//`` comment lines into a single
  ## `pxLineComment` token. Each ``//`` is replaced by '#' in `tok.s` and
  ## merged lines are separated by "\n".
  var pos = L.bufpos
  var buf = L.buf
  # a comment ends if the next line does not start with the // on the same
  # column after only whitespace
  tok.xkind = pxLineComment
  var col = getColNumber(L, pos)
  while true:
    inc(pos, 2)               # skip //
    add(tok.s, '#')
    while not (buf[pos] in {CR, LF, lexbase.EndOfFile}):
      add(tok.s, buf[pos])
      inc(pos)
    pos = HandleCRLF(L, pos)
    buf = L.buf
    var indent = 0
    while buf[pos] == ' ':
      inc(pos)
      inc(indent)
    if (col == indent) and (buf[pos] == '/') and (buf[pos + 1] == '/'):
      add(tok.s, "\n")
    else:
      break
  L.bufpos = pos

proc scanStarComment(L: var TLexer, tok: var TToken) =
  ## Scans the body of a ``/* ... */`` comment (the caller has already
  ## skipped the opening ``/*``) into a `pxStarComment` token. The text
  ## starts with '#', and every line break becomes "\n#". An unterminated
  ## comment is reported as `errTokenExpected`.
  var pos = L.bufpos
  var buf = L.buf
  tok.s = "#"
  tok.xkind = pxStarComment
  while true:
    case buf[pos]
    of CR, LF:
      pos = HandleCRLF(L, pos)
      buf = L.buf
      add(tok.s, "\n#")
      # skip annoying stars as line prefix: (eg.
      # /*
      #  * ugly comment <-- this star
      #  */
      while buf[pos] in {' ', '\t'}:
        add(tok.s, ' ')
        inc(pos)
      if buf[pos] == '*' and buf[pos+1] != '/': inc(pos)
    of '*':
      inc(pos)
      if buf[pos] == '/':
        inc(pos)
        break
      else:
        add(tok.s, '*')
    of lexbase.EndOfFile:
      lexMessage(L, errTokenExpected, "*/")
      # leave the loop: `pos` is not advanced past EndOfFile, so without
      # this the scanner would spin forever if lexMessage returns
      break
    else:
      add(tok.s, buf[pos])
      inc(pos)
  L.bufpos = pos

proc skip(L: var TLexer, tok: var TToken) =
  ## Skips blanks, tabs, line breaks and backslashes. When a line break is
  ## reached while `L.inDirective` is set, `tok.xkind` becomes `pxNewLine`
  ## and the flag is cleared; skipping still continues until a character
  ## that is not whitespace is found.
  var pos = L.bufpos
  var buf = L.buf
  while true:
    case buf[pos]
    of '\\':
      # Ignore \ line continuation characters when not inDirective
      inc(pos)
      if L.inDirective:
        # inside a directive the line break after the backslash is consumed
        # here, so it does not terminate the directive
        while buf[pos] in {' ', '\t'}: inc(pos)
        if buf[pos] in {CR, LF}:
          pos = HandleCRLF(L, pos)
          buf = L.buf
    of ' ', Tabulator:
      Inc(pos)                # newline is special:
    of CR, LF:
      pos = HandleCRLF(L, pos)
      buf = L.buf
      if L.inDirective:
        tok.xkind = pxNewLine
        L.inDirective = false
    else:
      break                   # EndOfFile also leaves the loop
  L.bufpos = pos

proc getDirective(L: var TLexer, tok: var TToken) =
  ## Scans a preprocessor directive name (the identifier after '#') into
  ## `tok.s` and sets `L.inDirective`. The token kind is `pxDirectiveParLe`
  ## if the identifier following the directive name is immediately followed
  ## by '(', otherwise `pxDirective`.
  var pos = L.bufpos + 1      # skip #
  var buf = L.buf
  while buf[pos] in {' ', '\t'}: inc(pos)
  while buf[pos] in SymChars:
    add(tok.s, buf[pos])
    inc(pos)
  # a HACK: we need to distinguish
  # #define x (...)
  # from:
  # #define x(...)
  #
  L.bufpos = pos
  # look ahead (does not move L.bufpos):
  while buf[pos] in {' ', '\t'}: inc(pos)
  while buf[pos] in SymChars: inc(pos)
  if buf[pos] == '(': tok.xkind = pxDirectiveParLe
  else: tok.xkind = pxDirective
  L.inDirective = true

proc getTok(L: var TLexer, tok: var TToken) =
  ## Reads the next token from `L` into `tok`. `tok` is reset first (its
  ## `next` link is kept). If skipping whitespace ended a directive, a
  ## `pxNewLine` token is returned without consuming anything else.
  ## Unknown characters produce a `pxInvalid` token and an
  ## `errInvalidToken` message; end of input yields `pxEof`.
  fillToken(tok)              # also sets tok.xkind to pxInvalid
  skip(L, tok)
  if tok.xkind == pxNewLine: return
  var c = L.buf[L.bufpos]
  if c in SymStartChars:
    getSymbol(L, tok)
  elif c == '0':
    case L.buf[L.bufpos+1]
    of 'x', 'X': getNumber16(L, tok)
    of 'b', 'B': getNumber2(L, tok)
    of '1'..'7': getNumber8(L, tok)
    else: getNumber(L, tok)
  elif c in {'1'..'9'}:
    getNumber(L, tok)
  else:
    case c
    of ';':
      tok.xkind = pxSemicolon
      Inc(L.bufpos)
    of '/':
      if L.buf[L.bufpos + 1] == '/':
        scanLineComment(L, tok)
      elif L.buf[L.bufpos+1] == '*':
        inc(L.bufpos, 2)
        scanStarComment(L, tok)
      elif L.buf[L.bufpos+1] == '=':
        inc(L.bufpos, 2)
        tok.xkind = pxSlashAsgn
      else:
        tok.xkind = pxSlash
        inc(L.bufpos)
    of ',':
      tok.xkind = pxComma
      Inc(L.bufpos)
    of '(':
      Inc(L.bufpos)
      tok.xkind = pxParLe
    of '*':
      inc(L.bufpos)
      if L.buf[L.bufpos] == '=':
        inc(L.bufpos)
        tok.xkind = pxStarAsgn
      else:
        tok.xkind = pxStar
    of ')':
      Inc(L.bufpos)
      tok.xkind = pxParRi
    of '[':
      Inc(L.bufpos)
      tok.xkind = pxBracketLe
    of ']':
      Inc(L.bufpos)
      tok.xkind = pxBracketRi
    of '.':
      inc(L.bufpos)
      if L.buf[L.bufpos] == '.' and L.buf[L.bufpos+1] == '.':
        tok.xkind = pxDotDotDot
        inc(L.bufpos, 2)
      else:
        tok.xkind = pxDot
    of '{':
      Inc(L.bufpos)
      tok.xkind = pxCurlyLe
    of '}':
      Inc(L.bufpos)
      tok.xkind = pxCurlyRi
    of '+':
      inc(L.bufpos)
      if L.buf[L.bufpos] == '=':
        tok.xkind = pxPlusAsgn
        inc(L.bufpos)
      elif L.buf[L.bufpos] == '+':
        tok.xkind = pxPlusPlus
        inc(L.bufpos)
      else:
        tok.xkind = pxPlus
    of '-':
      inc(L.bufpos)
      case L.buf[L.bufpos]
      of '>':
        tok.xkind = pxArrow
        inc(L.bufpos)
      of '=':
        tok.xkind = pxMinusAsgn
        inc(L.bufpos)
      of '-':
        tok.xkind = pxMinusMinus
        inc(L.bufpos)
      else:
        tok.xkind = pxMinus
    of '?':
      inc(L.bufpos)
      tok.xkind = pxConditional
    of ':':
      inc(L.bufpos)
      if L.buf[L.bufpos] == ':':
        tok.xkind = pxScope
        inc(L.bufpos)
      else:
        tok.xkind = pxColon
    of '!':
      inc(L.bufpos)
      if L.buf[L.bufpos] == '=':
        tok.xkind = pxNeq
        inc(L.bufpos)
      else:
        tok.xkind = pxNot
    of '<':
      inc(L.bufpos)
      if L.buf[L.bufpos] == '=':
        inc(L.bufpos)
        tok.xkind = pxLe
      elif L.buf[L.bufpos] == '<':
        inc(L.bufpos)
        if L.buf[L.bufpos] == '=':
          inc(L.bufpos)
          tok.xkind = pxShlAsgn
        else:
          tok.xkind = pxShl
      else:
        tok.xkind = pxLt
    of '>':
      inc(L.bufpos)
      if L.buf[L.bufpos] == '=':
        inc(L.bufpos)
        tok.xkind = pxGe
      elif L.buf[L.bufpos] == '>':
        inc(L.bufpos)
        if L.buf[L.bufpos] == '=':
          inc(L.bufpos)
          tok.xkind = pxShrAsgn
        else:
          tok.xkind = pxShr
      else:
        tok.xkind = pxGt
    of '=':
      inc(L.bufpos)
      if L.buf[L.bufpos] == '=':
        tok.xkind = pxEquals
        inc(L.bufpos)
      else:
        tok.xkind = pxAsgn
    of '&':
      inc(L.bufpos)
      if L.buf[L.bufpos] == '=':
        tok.xkind = pxAmpAsgn
        inc(L.bufpos)
      elif L.buf[L.bufpos] == '&':
        inc(L.bufpos)
        if L.buf[L.bufpos] == '=':
          inc(L.bufpos)
          tok.xkind = pxAmpAmpAsgn
        else:
          tok.xkind = pxAmpAmp
      else:
        tok.xkind = pxAmp
    of '|':
      inc(L.bufpos)
      if L.buf[L.bufpos] == '=':
        tok.xkind = pxBarAsgn
        inc(L.bufpos)
      elif L.buf[L.bufpos] == '|':
        inc(L.bufpos)
        if L.buf[L.bufpos] == '=':
          inc(L.bufpos)
          tok.xkind = pxBarBarAsgn
        else:
          tok.xkind = pxBarBar
      else:
        tok.xkind = pxBar
    of '^':
      inc(L.bufpos)
      if L.buf[L.bufpos] == '=':
        tok.xkind = pxHatAsgn
        inc(L.bufpos)
      else:
        tok.xkind = pxHat
    of '%':
      inc(L.bufpos)
      if L.buf[L.bufpos] == '=':
        tok.xkind = pxModAsgn
        inc(L.bufpos)
      else:
        tok.xkind = pxMod
    of '~':
      inc(L.bufpos)
      if L.buf[L.bufpos] == '=':
        tok.xkind = pxTildeAsgn
        inc(L.bufpos)
      else:
        tok.xkind = pxTilde
    of '#':
      if L.buf[L.bufpos+1] == '#':
        inc(L.bufpos, 2)
        tok.xkind = pxDirConc
      else:
        getDirective(L, tok)
    of '"': getString(L, tok)
    of '\'': getCharLit(L, tok)
    of lexbase.EndOfFile:
      tok.xkind = pxEof
    else:
      tok.s = $c
      tok.xkind = pxInvalid
      lexMessage(L, errInvalidToken, c & " (\\" & $(ord(c)) & ')')
      Inc(L.bufpos)