#
#
#      c2nim - C to Nimrod source converter
#        (c) Copyright 2010 Andreas Rumpf
#
#    See the file "copying.txt", included in this
#    distribution, for details about the copyright.
#

# This module implements an Ansi C scanner. This is an adaptation of
# the scanner module. Keywords are not handled here, but in the parser to make
# it more flexible.

import
  options, msgs, strutils, platform, lexbase, llstream

const
  MaxLineLength* = 80         # lines longer than this lead to a warning
  numChars*: TCharSet = {'0'..'9', 'a'..'z', 'A'..'Z'}
  SymChars*: TCharSet = {'a'..'z', 'A'..'Z', '0'..'9', '_', '\x80'..'\xFF'}
  SymStartChars*: TCharSet = {'a'..'z', 'A'..'Z', '_', '\x80'..'\xFF'}

type
  TTokKind* = enum
    pxInvalid, pxEof,
    pxMacroParam,             # fake token: macro parameter (with its index)
    pxStarComment,            # /* */ comment
    pxLineComment,            # // comment
    pxDirective,              # #define, etc.
    pxDirectiveParLe,         # #define m( with parle (yes, C is that ugly!)
    pxDirConc,                # ##
    pxNewLine,                # newline: end of directive
    pxAmp,                    # &
    pxAmpAmp,                 # &&
    pxAmpAsgn,                # &=
    pxAmpAmpAsgn,             # &&=
    pxBar,                    # |
    pxBarBar,                 # ||
    pxBarAsgn,                # |=
    pxBarBarAsgn,             # ||=
    pxNot,                    # !
    pxPlusPlus,               # ++
    pxMinusMinus,             # --
    pxPlus,                   # +
    pxPlusAsgn,               # +=
    pxMinus,                  # -
    pxMinusAsgn,              # -=
    pxMod,                    # %
    pxModAsgn,                # %=
    pxSlash,                  # /
    pxSlashAsgn,              # /=
    pxStar,                   # *
    pxStarAsgn,               # *=
    pxHat,                    # ^
    pxHatAsgn,                # ^=
    pxAsgn,                   # =
    pxEquals,                 # ==
    pxDot,                    # .
    pxDotDotDot,              # ...
    pxLe,                     # <=
    pxLt,                     # <
    pxGe,                     # >=
    pxGt,                     # >
    pxNeq,                    # !=
    pxConditional,            # ?
    pxShl,                    # <<
    pxShlAsgn,                # <<=
    pxShr,                    # >>
    pxShrAsgn,                # >>=
    pxTilde,                  # ~
    pxTildeAsgn,              # ~=
    pxArrow,                  # ->
    pxScope,                  # ::
    pxStrLit,
    pxCharLit,
    pxSymbol,                 # a symbol
    pxIntLit,
    pxInt64Lit,               # long constant like 0x70fffffff or out of
                              # int range
    pxFloatLit,
    pxParLe, pxBracketLe, pxCurlyLe, # this order is important
    pxParRi, pxBracketRi, pxCurlyRi, # for macro argument parsing!
    pxComma, pxSemiColon, pxColon
  TTokKinds* = set[TTokKind]

type
  TNumericalBase* = enum
    base10, base2, base8, base16
  TToken* = object
    xkind*: TTokKind          # the type of the token
    s*: string                # parsed symbol, char or string literal
    iNumber*: BiggestInt      # the parsed integer literal;
                              # if xkind == pxMacroParam: parameter's position
    fNumber*: BiggestFloat    # the parsed floating point literal
    base*: TNumericalBase     # the numerical base; only valid for int
                              # or float literals
    next*: ref TToken         # for C we need arbitrary look-ahead :-(

  TLexer* = object of TBaseLexer
    filename*: string
    inDirective: bool         # set by getDirective; cleared by skip at the
                              # newline that ends the directive

proc getTok*(L: var TLexer, tok: var TToken)
proc PrintTok*(tok: TToken)
proc `$`*(tok: TToken): string
# implementation

var
  gLinesCompiled*: int        # incremented by closeLexer with the lexer's
                              # line number

proc fillToken(L: var TToken) =
  ## Resets the token to its neutral state: invalid kind, empty string,
  ## zero numbers and base 10.
  L.xkind = pxInvalid
  L.iNumber = 0
  L.s = ""
  L.fNumber = 0.0
  L.base = base10

proc openLexer*(lex: var TLexer, filename: string, inputstream: PLLStream) =
  ## Opens the underlying base lexer on `inputstream` and records `filename`
  ## for use in line information.
  openBaseLexer(lex, inputstream)
  lex.filename = filename

proc closeLexer*(lex: var TLexer) =
  ## Adds the lexer's current line number to `gLinesCompiled` and closes the
  ## underlying base lexer.
  inc(gLinesCompiled, lex.LineNumber)
  closeBaseLexer(lex)

proc getColumn*(L: TLexer): int =
  ## Returns the column number of the current buffer position.
  result = getColNumber(L, L.bufPos)

proc getLineInfo*(L: TLexer): TLineInfo =
  ## Builds line information (file, line, column) for the current buffer
  ## position.
  result = newLineInfo(L.filename, L.linenumber, getColNumber(L, L.bufpos))

proc lexMessage*(L: TLexer, msg: TMsgKind, arg = "") =
  ## Reports `msg` (with optional argument `arg`) at the current position.
  msgs.liMessage(getLineInfo(L), msg, arg)

proc lexMessagePos(L: var TLexer, msg: TMsgKind, pos: int, arg = "") =
  ## Reports `msg` at buffer position `pos`; the column is computed relative
  ## to the start of the current line.
  var info = newLineInfo(L.filename, L.linenumber, pos - L.lineStart)
  msgs.liMessage(info, msg, arg)

proc TokKindToStr*(k: TTokKind): string =
  ## Returns a printable representation of the token kind `k`: the operator
  ## text for punctuation and a bracketed description for everything else.
  case k
  of pxEof: result = "[EOF]"
  of pxInvalid: result = "[invalid]"
  of pxMacroParam: result = "[macro param]"
  of pxStarComment, pxLineComment: result = "[comment]"
  of pxStrLit: result = "[string literal]"
  of pxCharLit: result = "[char literal]"
  of pxDirective, pxDirectiveParLe: result = "#" # #define, etc.
  of pxDirConc: result = "##"
  of pxNewLine: result = "[NewLine]"
  of pxAmp: result = "&"
  of pxAmpAmp: result = "&&"
  of pxAmpAsgn: result = "&="
  of pxAmpAmpAsgn: result = "&&="
  of pxBar: result = "|"
  of pxBarBar: result = "||"
  of pxBarAsgn: result = "|="
  of pxBarBarAsgn: result = "||="
  of pxNot: result = "!"
  of pxPlusPlus: result = "++"
  of pxMinusMinus: result = "--"
  of pxPlus: result = "+"
  of pxPlusAsgn: result = "+="
  of pxMinus: result = "-"
  of pxMinusAsgn: result = "-="
  of pxMod: result = "%"
  of pxModAsgn: result = "%="
  of pxSlash: result = "/"
  of pxSlashAsgn: result = "/="
  of pxStar: result = "*"
  of pxStarAsgn: result = "*="
  of pxHat: result = "^"
  of pxHatAsgn: result = "^="
  of pxAsgn: result = "="
  of pxEquals: result = "=="
  of pxDot: result = "."
  of pxDotDotDot: result = "..."
  of pxLe: result = "<="
  of pxLt: result = "<"
  of pxGe: result = ">="
  of pxGt: result = ">"
  of pxNeq: result = "!="
  of pxConditional: result = "?"
  of pxShl: result = "<<"
  of pxShlAsgn: result = "<<="
  of pxShr: result = ">>"
  of pxShrAsgn: result = ">>="
  of pxTilde: result = "~"
  of pxTildeAsgn: result = "~="
  of pxArrow: result = "->"
  of pxScope: result = "::"
  of pxSymbol: result = "[identifier]"
  of pxIntLit, pxInt64Lit: result = "[integer literal]"
  of pxFloatLit: result = "[floating point literal]"
  of pxParLe: result = "("
  of pxParRi: result = ")"
  of pxBracketLe: result = "["
  of pxBracketRi: result = "]"
  of pxComma: result = ","
  of pxSemiColon: result = ";"
  of pxColon: result = ":"
  of pxCurlyLe: result = "{"
  of pxCurlyRi: result = "}"

proc `$`(tok: TToken): string =
  ## Renders a token: the collected text for symbols, comments, string
  ## literals and invalid tokens, the numeric value for number literals and
  ## `TokKindToStr` for every other kind.
  case tok.xkind
  of pxSymbol, pxInvalid, pxStarComment, pxLineComment, pxStrLit:
    result = tok.s
  of pxIntLit, pxInt64Lit:
    result = $tok.iNumber
  of pxFloatLit:
    result = $tok.fNumber
  else:
    result = TokKindToStr(tok.xkind)

proc PrintTok(tok: TToken) =
  ## Writes the string form of `tok` to stdout, followed by a newline.
  writeln(stdout, $tok)

proc matchUnderscoreChars(L: var TLexer, tok: var TToken, chars: TCharSet) =
  ## Matches ``([chars]_)*`` at the current position, appending every
  ## consumed character (underscores included) to `tok.s`.
  var pos = L.bufpos              # use registers for pos, buf
  var buf = L.buf
  while true:
    if buf[pos] in chars:
      add(tok.s, buf[pos])
      Inc(pos)
    else:
      break
    if buf[pos] == '_':
      add(tok.s, '_')
      Inc(pos)
  L.bufPos = pos

proc isFloatLiteral(s: string): bool =
  ## Returns true if `s` contains '.', 'e' or 'E'; false otherwise.
  for i in countup(0, len(s)-1):
    if s[i] in {'.', 'e', 'E'}:
      return true

proc getNumber2(L: var TLexer, tok: var TToken) =
  ## Scans a binary literal. The lexer must be positioned on the leading
  ## ``0b``/``0B`` prefix. Letters are skipped as type suffix and
  ## underscores are ignored.
  var pos = L.bufpos + 2 # skip 0b
  tok.base = base2
  var xi: biggestInt = 0
  var bits = 0
  while true:
    case L.buf[pos]
    of 'A'..'Z', 'a'..'z':
      # ignore type suffix:
      inc(pos)
    of '2'..'9', '.':
      lexMessage(L, errInvalidNumber)
      inc(pos)
    of '_':
      inc(pos)
    of '0', '1':
      xi = `shl`(xi, 1) or (ord(L.buf[pos]) - ord('0'))
      inc(pos)
      inc(bits)
    else: break
  tok.iNumber = xi
  if (bits > 32): tok.xkind = pxInt64Lit
  else: tok.xkind = pxIntLit
  L.bufpos = pos

proc getNumber8(L: var TLexer, tok: var TToken) =
  ## Scans an octal literal. The lexer must be positioned on the leading
  ## ``0``. Letters are skipped as type suffix and underscores are ignored.
  # Only the single leading '0' is a prefix here (unlike "0b"/"0x"), so skip
  # exactly one character; skipping two would drop the first octal digit
  # ("017" would yield 7 instead of 15).
  var pos = L.bufpos + 1 # skip 0
  tok.base = base8
  var xi: biggestInt = 0
  var bits = 0                    # counts the octal digits seen
  while true:
    case L.buf[pos]
    of 'A'..'Z', 'a'..'z':
      # ignore type suffix:
      inc(pos)
    of '8'..'9', '.':
      lexMessage(L, errInvalidNumber)
      inc(pos)
    of '_':
      inc(pos)
    of '0'..'7':
      xi = `shl`(xi, 3) or (ord(L.buf[pos]) - ord('0'))
      inc(pos)
      inc(bits)
    else: break
  tok.iNumber = xi
  if (bits > 12): tok.xkind = pxInt64Lit
  else: tok.xkind = pxIntLit
  L.bufpos = pos

proc getNumber16(L: var TLexer, tok: var TToken) =
  ## Scans a hexadecimal literal. The lexer must be positioned on the
  ## leading ``0x``/``0X`` prefix. Letters outside a-f/A-F are skipped as
  ## type suffix and underscores are ignored.
  var pos = L.bufpos + 2 # skip 0x
  tok.base = base16
  var xi: biggestInt = 0
  var bits = 0
  while true:
    case L.buf[pos]
    of 'G'..'Z', 'g'..'z':
      # ignore type suffix:
      inc(pos)
    of '_':
      inc(pos)
    of '0'..'9':
      xi = `shl`(xi, 4) or (ord(L.buf[pos]) - ord('0'))
      inc(pos)
      inc(bits, 4)
    of 'a'..'f':
      xi = `shl`(xi, 4) or (ord(L.buf[pos]) - ord('a') + 10)
      inc(pos)
      inc(bits, 4)
    of 'A'..'F':
      xi = `shl`(xi, 4) or (ord(L.buf[pos]) - ord('A') + 10)
      inc(pos)
      inc(bits, 4)
    else: break
  tok.iNumber = xi
  if bits > 32: tok.xkind = pxInt64Lit
  else: tok.xkind = pxIntLit
  L.bufpos = pos

proc getNumber(L: var TLexer, tok: var TToken) =
  ## Scans a decimal integer or floating point literal. The text is
  ## collected in `tok.s` and then parsed; values outside the int32 range
  ## become `pxInt64Lit`. Trailing letters are skipped as type suffix.
  tok.base = base10
  matchUnderscoreChars(L, tok, {'0'..'9'})
  if (L.buf[L.bufpos] == '.') and (L.buf[L.bufpos + 1] in {'0'..'9'}):
    add(tok.s, '.')
    inc(L.bufpos)
    matchUnderscoreChars(L, tok, {'e', 'E', '+', '-', '0'..'9'})
  try:
    if isFloatLiteral(tok.s):
      tok.fnumber = parseFloat(tok.s)
      tok.xkind = pxFloatLit
    else:
      tok.iNumber = ParseInt(tok.s)
      if (tok.iNumber < low(int32)) or (tok.iNumber > high(int32)):
        tok.xkind = pxInt64Lit
      else:
        tok.xkind = pxIntLit
  except EInvalidValue:
    lexMessage(L, errInvalidNumber, tok.s)
  except EOverflow:
    lexMessage(L, errNumberOutOfRange, tok.s)
  # ignore type suffix:
  while L.buf[L.bufpos] in {'A'..'Z', 'a'..'z'}: inc(L.bufpos)

proc HandleCRLF(L: var TLexer, pos: int): int =
  ## Dispatches to the base lexer's CR or LF handler when `pos` is on a line
  ## ending and returns the handler's result; otherwise returns `pos`
  ## unchanged.
  case L.buf[pos]
  of CR: result = lexbase.HandleCR(L, pos)
  of LF: result = lexbase.HandleLF(L, pos)
  else: result = pos

proc escape(L: var TLexer, tok: var TToken, allowEmpty=false) =
  ## Scans a backslash escape sequence and appends the denoted character to
  ## `tok.s`. The lexer must be positioned on the backslash. Up to three
  ## octal digits are accepted for numeric escapes. If the character after
  ## the backslash is not a recognized escape, an error is reported unless
  ## `allowEmpty` is true; only the backslash is consumed in that case.
  inc(L.bufpos) # skip \
  case L.buf[L.bufpos]
  of 'b', 'B':
    add(tok.s, '\b')
    inc(L.bufpos)
  of 't', 'T':
    add(tok.s, '\t')
    inc(L.bufpos)
  of 'n', 'N':
    add(tok.s, '\L')
    inc(L.bufpos)
  of 'f', 'F':
    add(tok.s, '\f')
    inc(L.bufpos)
  of 'r', 'R':
    add(tok.s, '\r')
    inc(L.bufpos)
  of '\'':
    add(tok.s, '\'')
    inc(L.bufpos)
  of '"':
    add(tok.s, '"')
    inc(L.bufpos)
  of '\\':
    # an escaped backslash denotes a backslash, not a backspace
    add(tok.s, '\\')
    inc(L.bufpos)
  of '0'..'7':
    var xi = ord(L.buf[L.bufpos]) - ord('0')
    inc(L.bufpos)
    if L.buf[L.bufpos] in {'0'..'7'}:
      xi = (xi shl 3) or (ord(L.buf[L.bufpos]) - ord('0'))
      inc(L.bufpos)
      if L.buf[L.bufpos] in {'0'..'7'}:
        xi = (xi shl 3) or (ord(L.buf[L.bufpos]) - ord('0'))
        inc(L.bufpos)
    add(tok.s, chr(xi))
  elif not allowEmpty:
    lexMessage(L, errInvalidCharacterConstant)

proc getCharLit(L: var TLexer, tok: var TToken) =
  ## Scans a character literal. The lexer must be positioned on the opening
  ## quote. Reports an error if the closing quote is missing.
  inc(L.bufpos) # skip '
  if L.buf[L.bufpos] == '\\':
    escape(L, tok)
  else:
    add(tok.s, L.buf[L.bufpos])
    inc(L.bufpos)
  if L.buf[L.bufpos] == '\'':
    inc(L.bufpos)
  else:
    lexMessage(L, errMissingFinalQuote)
  tok.xkind = pxCharLit

proc getString(L: var TLexer, tok: var TToken) =
  ## Scans a string literal. The lexer must be positioned on the opening
  ## quote. Line endings inside the literal are passed to the base lexer and
  ## are not added to `tok.s`.
  var pos = L.bufPos + 1          # skip "
  var buf = L.buf                 # put `buf` in a register
  var line = L.linenumber         # save linenumber for better error message
  while true:
    case buf[pos]
    of '\"':
      Inc(pos)
      break
    of CR:
      pos = lexbase.HandleCR(L, pos)
      buf = L.buf
    of LF:
      pos = lexbase.HandleLF(L, pos)
      buf = L.buf
    of lexbase.EndOfFile:
      # report the error on the line where the string started:
      var line2 = L.linenumber
      L.LineNumber = line
      lexMessagePos(L, errClosingQuoteExpected, L.lineStart)
      L.LineNumber = line2
      break
    of '\\':
      # we allow an empty \ for line concatenation, but we don't require it
      # for line concatenation
      L.bufpos = pos
      escape(L, tok, allowEmpty=true)
      pos = L.bufpos
    else:
      add(tok.s, buf[pos])
      Inc(pos)
  L.bufpos = pos
  tok.xkind = pxStrLit

proc getSymbol(L: var TLexer, tok: var TToken) =
  ## Scans an identifier: consumes characters as long as they are in
  ## `SymChars` and appends them to `tok.s`.
  var pos = L.bufpos
  var buf = L.buf
  while true:
    var c = buf[pos]
    if c notin SymChars: break
    add(tok.s, c)
    Inc(pos)
  L.bufpos = pos
  tok.xkind = pxSymbol

proc scanLineComment(L: var TLexer, tok: var TToken) =
  ## Scans one or more consecutive ``//`` comment lines into a single token.
  ## Each comment line is stored with '#' in place of the ``//``; lines are
  ## separated by "\n".
  var pos = L.bufpos
  var buf = L.buf
  # a comment ends if the next line does not start with the // on the same
  # column after only whitespace
  tok.xkind = pxLineComment
  var col = getColNumber(L, pos)
  while true:
    inc(pos, 2)               # skip //
    add(tok.s, '#')
    while not (buf[pos] in {CR, LF, lexbase.EndOfFile}):
      add(tok.s, buf[pos])
      inc(pos)
    pos = handleCRLF(L, pos)
    buf = L.buf
    var indent = 0
    while buf[pos] == ' ':
      inc(pos)
      inc(indent)
    if (col == indent) and (buf[pos] == '/') and (buf[pos + 1] == '/'):
      add(tok.s, "\n")
    else:
      break
  L.bufpos = pos

proc scanStarComment(L: var TLexer, tok: var TToken) =
  ## Scans the body of a ``/* */`` comment. The lexer must be positioned just
  ## after the opening ``/*``. The text is stored with a leading '#', and
  ## every line ending becomes "\n#". Reports an error if the end of the
  ## file is reached before the closing ``*/``.
  var pos = L.bufpos
  var buf = L.buf
  tok.s = "#"
  tok.xkind = pxStarComment
  while true:
    case buf[pos]
    of CR, LF:
      pos = HandleCRLF(L, pos)
      buf = L.buf
      add(tok.s, "\n#")
      # skip annoying stars as line prefix: (eg.
      # /*
      #  * ugly comment <-- this star
      #  */
      while buf[pos] in {' ', '\t'}:
        add(tok.s, ' ')
        inc(pos)
      if buf[pos] == '*' and buf[pos+1] != '/': inc(pos)
    of '*':
      inc(pos)
      if buf[pos] == '/':
        inc(pos)
        break
      else:
        add(tok.s, '*')
    of lexbase.EndOfFile:
      lexMessage(L, errTokenExpected, "*/")
      # `pos` is not advanced in this branch, so leave the loop in case the
      # error handler returns; otherwise the loop would never terminate
      break
    else:
      add(tok.s, buf[pos])
      inc(pos)
  L.bufpos = pos

proc skip(L: var TLexer, tok: var TToken) =
  ## Skips whitespace, line endings and backslashes. If a line ending is
  ## passed while inside a directive, `tok.xkind` is set to `pxNewLine` and
  ## the directive state is cleared.
  var pos = L.bufpos
  var buf = L.buf
  while true:
    case buf[pos]
    of '\\':
      # Ignore \ line continuation characters when not inDirective
      inc(pos)
      if L.inDirective:
        # inside a directive the backslash also swallows the line ending,
        # so the directive continues on the next line
        while buf[pos] in {' ', '\t'}: inc(pos)
        if buf[pos] in {CR, LF}:
          pos = HandleCRLF(L, pos)
          buf = L.buf
    of ' ', Tabulator:
      Inc(pos)
    # newline is special:
    of CR, LF:
      pos = HandleCRLF(L, pos)
      buf = L.buf
      if L.inDirective:
        tok.xkind = pxNewLine
        L.inDirective = false
    else:
      break                   # EndOfFile also leaves the loop
  L.bufpos = pos

proc getDirective(L: var TLexer, tok: var TToken) =
  ## Scans a preprocessor directive name into `tok.s`. The lexer must be
  ## positioned on the '#'. Sets the directive state so that the next line
  ## ending is reported as `pxNewLine`.
  var pos = L.bufpos + 1          # skip #
  var buf = L.buf
  while buf[pos] in {' ', '\t'}: inc(pos)
  while buf[pos] in SymChars:
    add(tok.s, buf[pos])
    inc(pos)
  # a HACK: we need to distinguish
  # #define x (...)
  # from:
  # #define x(...)
  #
  L.bufpos = pos
  # look ahead (nothing below is consumed; `L.bufpos` is already set):
  while buf[pos] in {' ', '\t'}: inc(pos)
  while buf[pos] in SymChars: inc(pos)
  if buf[pos] == '(': tok.xkind = pxDirectiveParLe
  else: tok.xkind = pxDirective
  L.inDirective = true

proc getTok(L: var TLexer, tok: var TToken) =
  ## Reads the next token from the lexer into `tok`. A line ending that
  ## terminates a directive is returned as `pxNewLine`; the end of the
  ## input yields `pxEof`. Unknown characters are reported and returned as
  ## `pxInvalid`.
  fillToken(tok)                  # also resets tok.xkind to pxInvalid
  skip(L, tok)
  if tok.xkind == pxNewLine: return
  var c = L.buf[L.bufpos]
  if c in SymStartChars:
    getSymbol(L, tok)
  elif c == '0':
    case L.buf[L.bufpos+1]
    of 'x', 'X': getNumber16(L, tok)
    of 'b', 'B': getNumber2(L, tok)
    of '1'..'7': getNumber8(L, tok)
    else: getNumber(L, tok)
  elif c in {'1'..'9'}:
    getNumber(L, tok)
  else:
    case c
    of ';':
      tok.xkind = pxSemicolon
      Inc(L.bufpos)
    of '/':
      if L.buf[L.bufpos + 1] == '/':
        scanLineComment(L, tok)
      elif L.buf[L.bufpos+1] == '*':
        inc(L.bufpos, 2)
        scanStarComment(L, tok)
      elif L.buf[L.bufpos+1] == '=':
        inc(L.bufpos, 2)
        tok.xkind = pxSlashAsgn
      else:
        tok.xkind = pxSlash
        inc(L.bufpos)
    of ',':
      tok.xkind = pxComma
      Inc(L.bufpos)
    of '(':
      Inc(L.bufpos)
      tok.xkind = pxParLe
    of '*':
      inc(L.bufpos)
      if L.buf[L.bufpos] == '=':
        inc(L.bufpos)
        tok.xkind = pxStarAsgn
      else:
        tok.xkind = pxStar
    of ')':
      Inc(L.bufpos)
      tok.xkind = pxParRi
    of '[':
      Inc(L.bufpos)
      tok.xkind = pxBracketLe
    of ']':
      Inc(L.bufpos)
      tok.xkind = pxBracketRi
    of '.':
      inc(L.bufpos)
      if L.buf[L.bufpos] == '.' and L.buf[L.bufpos+1] == '.':
        tok.xkind = pxDotDotDot
        inc(L.bufpos, 2)
      else:
        tok.xkind = pxDot
    of '{':
      Inc(L.bufpos)
      tok.xkind = pxCurlyLe
    of '}':
      Inc(L.bufpos)
      tok.xkind = pxCurlyRi
    of '+':
      inc(L.bufpos)
      if L.buf[L.bufpos] == '=':
        tok.xkind = pxPlusAsgn
        inc(L.bufpos)
      elif L.buf[L.bufpos] == '+':
        tok.xkind = pxPlusPlus
        inc(L.bufpos)
      else:
        tok.xkind = pxPlus
    of '-':
      inc(L.bufpos)
      case L.buf[L.bufpos]
      of '>':
        tok.xkind = pxArrow
        inc(L.bufpos)
      of '=':
        tok.xkind = pxMinusAsgn
        inc(L.bufpos)
      of '-':
        tok.xkind = pxMinusMinus
        inc(L.bufpos)
      else:
        tok.xkind = pxMinus
    of '?':
      inc(L.bufpos)
      tok.xkind = pxConditional
    of ':':
      inc(L.bufpos)
      if L.buf[L.bufpos] == ':':
        tok.xkind = pxScope
        inc(L.bufpos)
      else:
        tok.xkind = pxColon
    of '!':
      inc(L.bufpos)
      if L.buf[L.bufpos] == '=':
        tok.xkind = pxNeq
        inc(L.bufpos)
      else:
        tok.xkind = pxNot
    of '<':
      inc(L.bufpos)
      if L.buf[L.bufpos] == '=':
        inc(L.bufpos)
        tok.xkind = pxLe
      elif L.buf[L.bufpos] == '<':
        inc(L.bufpos)
        if L.buf[L.bufpos] == '=':
          inc(L.bufpos)
          tok.xkind = pxShlAsgn
        else:
          tok.xkind = pxShl
      else:
        tok.xkind = pxLt
    of '>':
      inc(L.bufpos)
      if L.buf[L.bufpos] == '=':
        inc(L.bufpos)
        tok.xkind = pxGe
      elif L.buf[L.bufpos] == '>':
        inc(L.bufpos)
        if L.buf[L.bufpos] == '=':
          inc(L.bufpos)
          tok.xkind = pxShrAsgn
        else:
          tok.xkind = pxShr
      else:
        tok.xkind = pxGt
    of '=':
      inc(L.bufpos)
      if L.buf[L.bufpos] == '=':
        tok.xkind = pxEquals
        inc(L.bufpos)
      else:
        tok.xkind = pxAsgn
    of '&':
      inc(L.bufpos)
      if L.buf[L.bufpos] == '=':
        tok.xkind = pxAmpAsgn
        inc(L.bufpos)
      elif L.buf[L.bufpos] == '&':
        inc(L.bufpos)
        if L.buf[L.bufpos] == '=':
          inc(L.bufpos)
          tok.xkind = pxAmpAmpAsgn
        else:
          tok.xkind = pxAmpAmp
      else:
        tok.xkind = pxAmp
    of '|':
      inc(L.bufpos)
      if L.buf[L.bufpos] == '=':
        tok.xkind = pxBarAsgn
        inc(L.bufpos)
      elif L.buf[L.bufpos] == '|':
        inc(L.bufpos)
        if L.buf[L.bufpos] == '=':
          inc(L.bufpos)
          tok.xkind = pxBarBarAsgn
        else:
          tok.xkind = pxBarBar
      else:
        tok.xkind = pxBar
    of '^':
      inc(L.bufpos)
      if L.buf[L.bufpos] == '=':
        tok.xkind = pxHatAsgn
        inc(L.bufpos)
      else:
        tok.xkind = pxHat
    of '%':
      inc(L.bufpos)
      if L.buf[L.bufpos] == '=':
        tok.xkind = pxModAsgn
        inc(L.bufpos)
      else:
        tok.xkind = pxMod
    of '~':
      inc(L.bufpos)
      if L.buf[L.bufpos] == '=':
        tok.xkind = pxTildeAsgn
        inc(L.bufpos)
      else:
        tok.xkind = pxTilde
    of '#':
      if L.buf[L.bufpos+1] == '#':
        inc(L.bufpos, 2)
        tok.xkind = pxDirConc
      else:
        getDirective(L, tok)
    of '"': getString(L, tok)
    of '\'': getCharLit(L, tok)
    of lexbase.EndOfFile:
      tok.xkind = pxEof
    else:
      tok.s = $c
      tok.xkind = pxInvalid
      lexMessage(L, errInvalidToken, c & " (\\" & $(ord(c)) & ')')
      Inc(L.bufpos)