#
#
# c2nim - C to Nimrod source converter
# (c) Copyright 2012 Andreas Rumpf
#
# See the file "copying.txt", included in this
# distribution, for details about the copyright.
#
# This module implements an ANSI C scanner. It is an adaptation of
# the scanner module. Keywords are not handled here but in the parser, which
# makes the scanner more flexible.
import
options, msgs, strutils, platform, lexbase, llstream
const
  MaxLineLength* = 80 # lines longer than this lead to a warning
  # decimal digits plus the ASCII letters
  numChars*: TCharSet = {'0'..'9', 'a'..'z', 'A'..'Z'}
  # characters that may continue a symbol; getSymbol and getDirective
  # consume characters from this set
  SymChars*: TCharSet = {'a'..'z', 'A'..'Z', '0'..'9', '_', '\x80'..'\xFF'}
  # characters that may start a symbol: SymChars without the digits
  SymStartChars*: TCharSet = {'a'..'z', 'A'..'Z', '_', '\x80'..'\xFF'}
type
  # The kinds of token the scanner produces. Keywords have no kinds of their
  # own; they are scanned as pxSymbol.
  TTokKind* = enum
    pxInvalid, pxEof,
    pxMacroParam, # fake token: macro parameter (with its index)
    pxStarComment, # /* */ comment
    pxLineComment, # // comment
    pxDirective, # #define, etc.
    pxDirectiveParLe, # #define m( with parle (yes, C is that ugly!)
    pxDirConc, # ##
    pxNewLine, # newline: end of directive
    pxAmp, # &
    pxAmpAmp, # &&
    pxAmpAsgn, # &=
    pxAmpAmpAsgn, # &&=
    pxBar, # |
    pxBarBar, # ||
    pxBarAsgn, # |=
    pxBarBarAsgn, # ||=
    pxNot, # !
    pxPlusPlus, # ++
    pxMinusMinus, # --
    pxPlus, # +
    pxPlusAsgn, # +=
    pxMinus, # -
    pxMinusAsgn, # -=
    pxMod, # %
    pxModAsgn, # %=
    pxSlash, # /
    pxSlashAsgn, # /=
    pxStar, # *
    pxStarAsgn, # *=
    pxHat, # ^
    pxHatAsgn, # ^=
    pxAsgn, # =
    pxEquals, # ==
    pxDot, # .
    pxDotDotDot, # ...
    pxLe, # <=
    pxLt, # <
    pxGe, # >=
    pxGt, # >
    pxNeq, # !=
    pxConditional, # ?
    pxShl, # <<
    pxShlAsgn, # <<=
    pxShr, # >>
    pxShrAsgn, # >>=
    pxTilde, # ~
    pxTildeAsgn, # ~=
    pxArrow, # ->
    pxScope, # ::
    pxStrLit, # "..." string literal
    pxCharLit, # '.' character literal
    pxSymbol, # a symbol
    pxIntLit, # integer literal
    pxInt64Lit, # long constant like 0x70fffffff or out of int range
    pxFloatLit, # floating point literal
    pxParLe, pxBracketLe, pxCurlyLe, # this order is important
    pxParRi, pxBracketRi, pxCurlyRi, # for macro argument parsing!
    pxComma, pxSemiColon, pxColon,
  TTokKinds* = set[TTokKind] # a set of token kinds
type
  # the base an integer literal was written in
  TNumericalBase* = enum base10, base2, base8, base16
  # a single scanned token together with its payload
  TToken* = object
    xkind*: TTokKind # the type of the token
    s*: string # parsed symbol, char or string literal
    iNumber*: BiggestInt # the parsed integer literal;
                         # if xkind == pxMacroParam: parameter's position
    fNumber*: BiggestFloat # the parsed floating point literal
    base*: TNumericalBase # the numerical base; only valid for int
                          # or float literals
    next*: ref TToken # for C we need arbitrary look-ahead :-(
  # scanner state on top of the generic buffered base lexer
  TLexer* = object of TBaseLexer
    fileIdx*: int32 # assigned from the file name in openLexer
    inDirective: bool # set by getDirective; cleared again by skip when the
                      # line of the directive ends
# forward declarations; the bodies follow in the implementation part below
proc getTok*(L: var TLexer, tok: var TToken)
proc PrintTok*(tok: TToken)
proc `$`*(tok: TToken): string
# implementation
var
  gLinesCompiled*: int # closeLexer adds the line number of each closed
                       # lexer to this counter
proc fillToken(L: var TToken) =
  ## Resets the kind, text, numeric values and base of the token `L`.
  ## The `next` link is left untouched.
  L.s = ""
  L.xkind = pxInvalid
  L.base = base10
  L.iNumber = 0
  L.fNumber = 0.0
proc openLexer*(lex: var TLexer, filename: string, inputstream: PLLStream) =
  ## Opens the base lexer of `lex` on `inputstream` and stores the file index
  ## that belongs to `filename`.
  lex.openBaseLexer(inputstream)
  lex.fileIdx = fileInfoIdx(filename)
proc closeLexer*(lex: var TLexer) =
  ## Adds the lexer's line number to `gLinesCompiled`, then closes the
  ## underlying base lexer.
  gLinesCompiled.inc(lex.LineNumber)
  lex.closeBaseLexer()
proc getColumn*(L: TLexer): int =
  ## Returns the column number of the lexer's current buffer position.
  return L.getColNumber(L.bufPos)
proc getLineInfo*(L: TLexer): TLineInfo =
  ## Builds the line information (file index, line number, column) of the
  ## lexer's current buffer position.
  return newLineInfo(L.fileIdx, L.linenumber, L.getColNumber(L.bufpos))
proc lexMessage*(L: TLexer, msg: TMsgKind, arg = "") =
  ## Reports `msg` with the argument `arg` as a global error located at the
  ## lexer's current position.
  msgs.GlobalError(L.getLineInfo(), msg, arg)
proc lexMessagePos(L: var TLexer, msg: TMsgKind, pos: int, arg = "") =
  ## Reports `msg` as a global error on the current line; the column is the
  ## buffer position `pos` taken relative to the start of that line.
  msgs.GlobalError(newLineInfo(L.fileIdx, L.linenumber, pos - L.lineStart),
                   msg, arg)
proc TokKindToStr*(k: TTokKind): string =
  ## Returns a printable name for the token kind `k`: the literal spelling
  ## for punctuation and operators, a bracketed placeholder for kinds whose
  ## text varies from token to token.
  case k
  # kinds without a fixed spelling
  of pxInvalid: result = "[invalid]"
  of pxEof: result = "[EOF]"
  of pxNewLine: result = "[NewLine]"
  of pxMacroParam: result = "[macro param]"
  of pxLineComment, pxStarComment: result = "[comment]"
  of pxSymbol: result = "[identifier]"
  of pxStrLit: result = "[string literal]"
  of pxCharLit: result = "[char literal]"
  of pxInt64Lit, pxIntLit: result = "[integer literal]"
  of pxFloatLit: result = "[floating point literal]"
  # preprocessor
  of pxDirectiveParLe, pxDirective: result = "#"
  of pxDirConc: result = "##"
  # brackets, separators and member access
  of pxParLe: result = "("
  of pxParRi: result = ")"
  of pxBracketLe: result = "["
  of pxBracketRi: result = "]"
  of pxCurlyLe: result = "{"
  of pxCurlyRi: result = "}"
  of pxComma: result = ","
  of pxSemiColon: result = ";"
  of pxColon: result = ":"
  of pxScope: result = "::"
  of pxDot: result = "."
  of pxDotDotDot: result = "..."
  of pxArrow: result = "->"
  # arithmetic
  of pxPlus: result = "+"
  of pxPlusPlus: result = "++"
  of pxPlusAsgn: result = "+="
  of pxMinus: result = "-"
  of pxMinusMinus: result = "--"
  of pxMinusAsgn: result = "-="
  of pxStar: result = "*"
  of pxStarAsgn: result = "*="
  of pxSlash: result = "/"
  of pxSlashAsgn: result = "/="
  of pxMod: result = "%"
  of pxModAsgn: result = "%="
  # bitwise and logical
  of pxAmp: result = "&"
  of pxAmpAmp: result = "&&"
  of pxAmpAsgn: result = "&="
  of pxAmpAmpAsgn: result = "&&="
  of pxBar: result = "|"
  of pxBarBar: result = "||"
  of pxBarAsgn: result = "|="
  of pxBarBarAsgn: result = "||="
  of pxHat: result = "^"
  of pxHatAsgn: result = "^="
  of pxTilde: result = "~"
  of pxTildeAsgn: result = "~="
  of pxShl: result = "<<"
  of pxShlAsgn: result = "<<="
  of pxShr: result = ">>"
  of pxShrAsgn: result = ">>="
  of pxNot: result = "!"
  # assignment, comparison and the conditional operator
  of pxAsgn: result = "="
  of pxEquals: result = "=="
  of pxNeq: result = "!="
  of pxLt: result = "<"
  of pxLe: result = "<="
  of pxGt: result = ">"
  of pxGe: result = ">="
  of pxConditional: result = "?"
proc `$`(tok: TToken): string =
  ## Converts a token to text: the stored string for symbols, invalid
  ## tokens, comments and string literals; the number for numeric literals;
  ## the name of the token kind for everything else.
  if tok.xkind in {pxSymbol, pxInvalid, pxStarComment, pxLineComment,
                   pxStrLit}:
    result = tok.s
  elif tok.xkind in {pxIntLit, pxInt64Lit}:
    result = $tok.iNumber
  elif tok.xkind == pxFloatLit:
    result = $tok.fNumber
  else:
    result = TokKindToStr(tok.xkind)
proc PrintTok(tok: TToken) =
  ## Writes the text form of `tok`, followed by a line break, to stdout.
  stdout.writeln($tok)
proc matchUnderscoreChars(L: var TLexer, tok: var TToken, chars: TCharSet) =
  ## Consumes a run of characters from `chars`, each optionally followed by
  ## a single '_', and appends everything consumed to `tok.s`. Stops at the
  ## first character that is not in `chars`.
  var cur = L.bufpos
  var text = L.buf
  while text[cur] in chars:
    tok.s.add(text[cur])
    inc(cur)
    if text[cur] == '_':
      tok.s.add('_')
      inc(cur)
  L.bufPos = cur
proc isFloatLiteral(s: string): bool =
  ## Returns true if `s` contains a '.', an 'e' or an 'E'.
  result = false
  for ch in items(s):
    if ch == '.' or ch == 'e' or ch == 'E':
      result = true
      break
proc getNumber2(L: var TLexer, tok: var TToken) =
  ## Scans a binary literal whose two-character prefix starts at the current
  ## position. Letters and underscores are skipped, the characters '2'..'9'
  ## and '.' are reported as errInvalidNumber. Literals with more than 32
  ## binary digits become pxInt64Lit, all others pxIntLit.
  tok.base = base2
  var value: biggestInt = 0
  var digits = 0
  var cur = L.bufpos + 2 # step over the 0b prefix
  while true:
    var c = L.buf[cur]
    if c in {'0', '1'}:
      value = (value shl 1) or (ord(c) - ord('0'))
      inc(digits)
    elif c in {'2'..'9', '.'}:
      lexMessage(L, errInvalidNumber)
    elif c notin {'A'..'Z', 'a'..'z', '_'}:
      break
    # binary digits, bad digits, type suffix letters and underscores all
    # advance by one character
    inc(cur)
  tok.iNumber = value
  tok.xkind = if digits > 32: pxInt64Lit else: pxIntLit
  L.bufpos = cur
proc getNumber8(L: var TLexer, tok: var TToken) =
  ## Scans an octal literal that starts with a '0' at the current position.
  ## Letters (type suffix) and underscores are skipped, the characters '8',
  ## '9' and '.' are reported as errInvalidNumber. The value is stored in
  ## `tok.iNumber`; the token becomes pxInt64Lit if the value does not fit
  ## into 32 bits and pxIntLit otherwise.
  var pos = L.bufpos + 1 # skip 0
  tok.base = base8
  var xi: biggestInt = 0
  while true:
    case L.buf[pos]
    of 'A'..'Z', 'a'..'z':
      # ignore type suffix:
      inc(pos)
    of '8'..'9', '.':
      lexMessage(L, errInvalidNumber)
      inc(pos)
    of '_':
      inc(pos)
    of '0'..'7':
      xi = `shl`(xi, 3) or (ord(L.buf[pos]) - ord('0'))
      inc(pos)
    else: break
  tok.iNumber = xi
  # An octal digit carries three bits, so a digit count cannot tell a 32 bit
  # value from a wider one (eleven digits span 31 to 33 bits). Decide by the
  # value instead; as in getNumber16, anything that fits into 32 bits is a
  # pxIntLit. A negative value means the shifts ran into the sign bit.
  if (xi < 0) or (xi > 0xFFFFFFFF'i64): tok.xkind = pxInt64Lit
  else: tok.xkind = pxIntLit
  L.bufpos = pos
proc getNumber16(L: var TLexer, tok: var TToken) =
  ## Scans a hexadecimal literal whose two-character prefix starts at the
  ## current position. The letters G..Z and g..z (type suffix) and
  ## underscores are skipped. Literals with more than eight hex digits
  ## (more than 32 bits) become pxInt64Lit, all others pxIntLit.
  tok.base = base16
  var value: biggestInt = 0
  var bitCount = 0
  var cur = L.bufpos + 2 # step over the 0x prefix
  while true:
    var c = L.buf[cur]
    var digit: int
    case c
    of '0'..'9': digit = ord(c) - ord('0')
    of 'a'..'f': digit = ord(c) - ord('a') + 10
    of 'A'..'F': digit = ord(c) - ord('A') + 10
    of 'G'..'Z', 'g'..'z', '_': digit = -1 # skipped, contributes nothing
    else: break
    if digit >= 0:
      value = (value shl 4) or digit
      inc(bitCount, 4)
    inc(cur)
  tok.iNumber = value
  tok.xkind = if bitCount > 32: pxInt64Lit else: pxIntLit
  L.bufpos = cur
proc getNumber(L: var TLexer, tok: var TToken) =
  ## Scans a decimal integer or floating point literal. The text of the
  ## literal is collected in `tok.s` and then parsed: a literal containing a
  ## '.' or an exponent becomes pxFloatLit, an integer outside the int32
  ## range pxInt64Lit, any other integer pxIntLit. Parse failures are
  ## reported as errInvalidNumber or errNumberOutOfRange. Trailing letters
  ## (type suffix) are skipped.
  tok.base = base10
  matchUnderscoreChars(L, tok, {'0'..'9'})
  if (L.buf[L.bufpos] == '.') and (L.buf[L.bufpos + 1] in {'0'..'9'}):
    add(tok.s, '.')
    inc(L.bufpos)
    # only digits belong to the fraction; a sign here is the next operator
    # (as in ``1.5-2``) and must not be swallowed
    matchUnderscoreChars(L, tok, {'0'..'9'})
  if L.buf[L.bufpos] in {'e', 'E'}:
    # exponent: e|E, an optional sign, at least one digit. Without a digit
    # the letter is left alone and ends up skipped as a type suffix below.
    var digitsAt = L.bufpos + 1
    if L.buf[digitsAt] in {'+', '-'}: inc(digitsAt)
    if L.buf[digitsAt] in {'0'..'9'}:
      while L.bufpos < digitsAt:
        add(tok.s, L.buf[L.bufpos])
        inc(L.bufpos)
      matchUnderscoreChars(L, tok, {'0'..'9'})
  try:
    if isFloatLiteral(tok.s):
      tok.fnumber = parseFloat(tok.s)
      tok.xkind = pxFloatLit
    else:
      tok.iNumber = ParseInt(tok.s)
      if (tok.iNumber < low(int32)) or (tok.iNumber > high(int32)):
        tok.xkind = pxInt64Lit
      else:
        tok.xkind = pxIntLit
  except EInvalidValue:
    lexMessage(L, errInvalidNumber, tok.s)
  except EOverflow:
    lexMessage(L, errNumberOutOfRange, tok.s)
  # ignore type suffix:
  while L.buf[L.bufpos] in {'A'..'Z', 'a'..'z'}: inc(L.bufpos)
proc HandleCRLF(L: var TLexer, pos: int): int =
  ## Passes a CR or LF at `pos` on to the matching lexbase handler and
  ## returns the position that handler yields. For any other character
  ## `pos` is returned unchanged.
  var c = L.buf[pos]
  if c == CR:
    result = lexbase.HandleCR(L, pos)
  elif c == LF:
    result = lexbase.HandleLF(L, pos)
  else:
    result = pos
proc escape(L: var TLexer, tok: var TToken, allowEmpty=false) =
  ## Decodes the escape sequence that starts with the backslash at the
  ## current position and appends the resulting character to `tok.s`.
  ## Handles the single-letter escapes, the escaped quotes, the escaped
  ## backslash and octal escapes of one to three digits. For any other
  ## character only the backslash is consumed; this is reported as
  ## errInvalidCharacterConstant unless `allowEmpty` is true.
  inc(L.bufpos) # skip \
  case L.buf[L.bufpos]
  of 'a':
    add(tok.s, '\a')
    inc(L.bufpos)
  of 'b', 'B':
    add(tok.s, '\b')
    inc(L.bufpos)
  of 't', 'T':
    add(tok.s, '\t')
    inc(L.bufpos)
  of 'n', 'N':
    add(tok.s, '\L')
    inc(L.bufpos)
  of 'v':
    add(tok.s, '\v')
    inc(L.bufpos)
  of 'f', 'F':
    add(tok.s, '\f')
    inc(L.bufpos)
  of 'r', 'R':
    add(tok.s, '\r')
    inc(L.bufpos)
  of '\'':
    add(tok.s, '\'')
    inc(L.bufpos)
  of '"':
    add(tok.s, '"')
    inc(L.bufpos)
  of '\\':
    # an escaped backslash stands for the backslash itself
    add(tok.s, '\\')
    inc(L.bufpos)
  of '0'..'7':
    # octal escape: up to three octal digits
    var xi = ord(L.buf[L.bufpos]) - ord('0')
    inc(L.bufpos)
    if L.buf[L.bufpos] in {'0'..'7'}:
      xi = (xi shl 3) or (ord(L.buf[L.bufpos]) - ord('0'))
      inc(L.bufpos)
      if L.buf[L.bufpos] in {'0'..'7'}:
        xi = (xi shl 3) or (ord(L.buf[L.bufpos]) - ord('0'))
        inc(L.bufpos)
    add(tok.s, chr(xi))
  elif not allowEmpty:
    lexMessage(L, errInvalidCharacterConstant)
proc getCharLit(L: var TLexer, tok: var TToken) =
  ## Scans a character literal whose opening quote is at the current
  ## position. The (possibly escaped) character is appended to `tok.s`; a
  ## missing closing quote is reported as errMissingFinalQuote.
  inc(L.bufpos) # step over the opening '
  if L.buf[L.bufpos] != '\\':
    add(tok.s, L.buf[L.bufpos])
    inc(L.bufpos)
  else:
    escape(L, tok)
  if L.buf[L.bufpos] != '\'':
    lexMessage(L, errMissingFinalQuote)
  else:
    inc(L.bufpos)
  tok.xkind = pxCharLit
proc getString(L: var TLexer, tok: var TToken) =
  ## Scans a string literal whose opening quote is at the current position
  ## and appends its content to `tok.s`. Line breaks inside the literal are
  ## passed to the lexbase handlers and add nothing to the string. Reaching
  ## the end of the file first is reported as errClosingQuoteExpected, using
  ## the line on which the literal started.
  var cur = L.bufPos + 1 # step over the opening "
  var text = L.buf
  var startLine = L.linenumber # for the error message
  while true:
    var c = text[cur]
    if c == '\"':
      inc(cur)
      break
    elif c == CR:
      cur = lexbase.HandleCR(L, cur)
      text = L.buf
    elif c == LF:
      cur = lexbase.HandleLF(L, cur)
      text = L.buf
    elif c == lexbase.EndOfFile:
      # report with the line number the literal started on, then restore
      var endLine = L.linenumber
      L.LineNumber = startLine
      lexMessagePos(L, errClosingQuoteExpected, L.lineStart)
      L.LineNumber = endLine
      break
    elif c == '\\':
      # a lone \ is accepted (line concatenation) but not required
      L.bufpos = cur
      escape(L, tok, allowEmpty=true)
      cur = L.bufpos
    else:
      add(tok.s, c)
      inc(cur)
  L.bufpos = cur
  tok.xkind = pxStrLit
proc getSymbol(L: var TLexer, tok: var TToken) =
  ## Appends the run of SymChars characters at the current position to
  ## `tok.s` and marks the token as pxSymbol.
  var cur = L.bufpos
  var text = L.buf
  while text[cur] in SymChars:
    add(tok.s, text[cur])
    inc(cur)
  L.bufpos = cur
  tok.xkind = pxSymbol
proc scanLineComment(L: var TLexer, tok: var TToken) =
  ## Scans one or more consecutive // comments into a single pxLineComment
  ## token. Each line is stored with a leading '#'. The comment continues
  ## on the next line only if that line, after nothing but spaces, has its
  ## // in the same column as the first one.
  var cur = L.bufpos
  var text = L.buf
  tok.xkind = pxLineComment
  var startCol = getColNumber(L, cur)
  var more = true
  while more:
    inc(cur, 2) # step over the //
    add(tok.s, '#')
    while text[cur] notin {CR, LF, lexbase.EndOfFile}:
      add(tok.s, text[cur])
      inc(cur)
    cur = handleCRLF(L, cur)
    text = L.buf
    # count the leading spaces of the following line
    var leading = 0
    while text[cur] == ' ':
      inc(cur)
      inc(leading)
    more = (startCol == leading) and (text[cur] == '/') and
           (text[cur + 1] == '/')
    if more: add(tok.s, "\n")
  L.bufpos = cur
proc scanStarComment(L: var TLexer, tok: var TToken) =
  ## Scans the body of a /* */ comment; the current position must be just
  ## behind the opening /*. The text is stored in `tok.s` with a '#' in
  ## front of every line and the token is marked as pxStarComment. Reaching
  ## the end of the file before the closing */ is reported as
  ## errTokenExpected and ends the scan.
  var pos = L.bufpos
  var buf = L.buf
  tok.s = "#"
  tok.xkind = pxStarComment
  while true:
    case buf[pos]
    of CR, LF:
      pos = HandleCRLF(L, pos)
      buf = L.buf
      add(tok.s, "\n#")
      # skip annoying stars as line prefix: (eg.
      # /*
      #  * ugly comment <-- this star
      #  */
      while buf[pos] in {' ', '\t'}:
        add(tok.s, ' ')
        inc(pos)
      if buf[pos] == '*' and buf[pos+1] != '/': inc(pos)
    of '*':
      inc(pos)
      if buf[pos] == '/':
        inc(pos)
        break
      else:
        add(tok.s, '*')
    of lexbase.EndOfFile:
      lexMessage(L, errTokenExpected, "*/")
      # nothing is left to scan: leave the loop (as getString does) so that
      # a lexMessage that returns cannot make it spin forever
      break
    else:
      add(tok.s, buf[pos])
      inc(pos)
  L.bufpos = pos
proc skip(L: var TLexer, tok: var TToken) =
  ## Skips spaces, tabs, line breaks and backslashes in front of the next
  ## token. A line break that is reached while inside a directive ends the
  ## directive: `tok.xkind` is set to pxNewLine and `inDirective` is
  ## cleared. Inside a directive a backslash also swallows the blanks and
  ## the line break behind it, so that the directive continues on the next
  ## line.
  var cur = L.bufpos
  var text = L.buf
  while true:
    case text[cur]
    of ' ', Tabulator:
      inc(cur)
    of CR, LF:
      # newline is special: it terminates a directive
      cur = HandleCRLF(L, cur)
      text = L.buf
      if L.inDirective:
        tok.xkind = pxNewLine
        L.inDirective = false
    of '\\':
      # outside of a directive the backslash is simply dropped
      inc(cur)
      if L.inDirective:
        while text[cur] in {' ', '\t'}: inc(cur)
        if text[cur] in {CR, LF}:
          cur = HandleCRLF(L, cur)
          text = L.buf
    else:
      break # EndOfFile ends the loop as well
  L.bufpos = cur
proc getDirective(L: var TLexer, tok: var TToken) =
  ## Scans a preprocessor directive that starts at the current position:
  ## the first character and any blanks are skipped and the directive name
  ## is appended to `tok.s`. The lexer is left directly behind the name and
  ## switched into directive mode. If the symbol following the name is
  ## immediately followed by '(', the token becomes pxDirectiveParLe,
  ## otherwise pxDirective.
  const blanks = {' ', '\t'}
  var cur = L.bufpos + 1
  var text = L.buf
  while text[cur] in blanks: inc(cur)
  while text[cur] in SymChars:
    add(tok.s, text[cur])
    inc(cur)
  # the lexer continues behind the directive name; everything below is
  # look-ahead only
  L.bufpos = cur
  # a HACK: ``#define x (...)`` has to be told apart from ``#define x(...)``
  while text[cur] in blanks: inc(cur)
  while text[cur] in SymChars: inc(cur)
  tok.xkind = if text[cur] == '(': pxDirectiveParLe else: pxDirective
  L.inDirective = true
proc getTok(L: var TLexer, tok: var TToken) =
  ## Scans the next token from `L` into `tok`. The token is reset first and
  ## leading whitespace is skipped. If skipping ended a directive line, the
  ## token is returned as pxNewLine and nothing else is read. Otherwise the
  ## first character decides which scanner runs; a character that starts no
  ## known token yields pxInvalid and is reported as errInvalidToken.
  tok.xkind = pxInvalid
  fillToken(tok)
  skip(L, tok)
  if tok.xkind == pxNewLine: return
  var c = L.buf[L.bufpos]
  if c in SymStartChars:
    getSymbol(L, tok)
  elif c == '0':
    # the character behind a leading 0 selects the number scanner
    case L.buf[L.bufpos+1]
    of 'x', 'X': getNumber16(L, tok)
    of 'b', 'B': getNumber2(L, tok)
    of '1'..'7': getNumber8(L, tok)
    else: getNumber(L, tok)
  elif c in {'1'..'9'}:
    getNumber(L, tok)
  else:
    # punctuation and operators: the longest match wins
    case c
    of ';':
      tok.xkind = pxSemicolon
      Inc(L.bufpos)
    of '/':
      # // comment, /* comment, /= or a plain /
      if L.buf[L.bufpos + 1] == '/':
        scanLineComment(L, tok)
      elif L.buf[L.bufpos+1] == '*':
        inc(L.bufpos, 2)
        scanStarComment(L, tok)
      elif L.buf[L.bufpos+1] == '=':
        inc(L.bufpos, 2)
        tok.xkind = pxSlashAsgn
      else:
        tok.xkind = pxSlash
        inc(L.bufpos)
    of ',':
      tok.xkind = pxComma
      Inc(L.bufpos)
    of '(':
      Inc(L.bufpos)
      tok.xkind = pxParLe
    of '*':
      inc(L.bufpos)
      if L.buf[L.bufpos] == '=':
        inc(L.bufpos)
        tok.xkind = pxStarAsgn
      else:
        tok.xkind = pxStar
    of ')':
      Inc(L.bufpos)
      tok.xkind = pxParRi
    of '[':
      Inc(L.bufpos)
      tok.xkind = pxBracketLe
    of ']':
      Inc(L.bufpos)
      tok.xkind = pxBracketRi
    of '.':
      # ... needs two more dots; anything else is a single dot
      inc(L.bufpos)
      if L.buf[L.bufpos] == '.' and L.buf[L.bufpos+1] == '.':
        tok.xkind = pxDotDotDot
        inc(L.bufpos, 2)
      else:
        tok.xkind = pxDot
    of '{':
      Inc(L.bufpos)
      tok.xkind = pxCurlyLe
    of '}':
      Inc(L.bufpos)
      tok.xkind = pxCurlyRi
    of '+':
      inc(L.bufpos)
      if L.buf[L.bufpos] == '=':
        tok.xkind = pxPlusAsgn
        inc(L.bufpos)
      elif L.buf[L.bufpos] == '+':
        tok.xkind = pxPlusPlus
        inc(L.bufpos)
      else:
        tok.xkind = pxPlus
    of '-':
      inc(L.bufpos)
      case L.buf[L.bufpos]
      of '>':
        tok.xkind = pxArrow
        inc(L.bufpos)
      of '=':
        tok.xkind = pxMinusAsgn
        inc(L.bufpos)
      of '-':
        tok.xkind = pxMinusMinus
        inc(L.bufpos)
      else:
        tok.xkind = pxMinus
    of '?':
      inc(L.bufpos)
      tok.xkind = pxConditional
    of ':':
      inc(L.bufpos)
      if L.buf[L.bufpos] == ':':
        tok.xkind = pxScope
        inc(L.bufpos)
      else:
        tok.xkind = pxColon
    of '!':
      inc(L.bufpos)
      if L.buf[L.bufpos] == '=':
        tok.xkind = pxNeq
        inc(L.bufpos)
      else:
        tok.xkind = pxNot
    of '<':
      # <=, <<=, << or <
      inc(L.bufpos)
      if L.buf[L.bufpos] == '=':
        inc(L.bufpos)
        tok.xkind = pxLe
      elif L.buf[L.bufpos] == '<':
        inc(L.bufpos)
        if L.buf[L.bufpos] == '=':
          inc(L.bufpos)
          tok.xkind = pxShlAsgn
        else:
          tok.xkind = pxShl
      else:
        tok.xkind = pxLt
    of '>':
      # >=, >>=, >> or >
      inc(L.bufpos)
      if L.buf[L.bufpos] == '=':
        inc(L.bufpos)
        tok.xkind = pxGe
      elif L.buf[L.bufpos] == '>':
        inc(L.bufpos)
        if L.buf[L.bufpos] == '=':
          inc(L.bufpos)
          tok.xkind = pxShrAsgn
        else:
          tok.xkind = pxShr
      else:
        tok.xkind = pxGt
    of '=':
      inc(L.bufpos)
      if L.buf[L.bufpos] == '=':
        tok.xkind = pxEquals
        inc(L.bufpos)
      else:
        tok.xkind = pxAsgn
    of '&':
      # &=, &&=, && or &
      inc(L.bufpos)
      if L.buf[L.bufpos] == '=':
        tok.xkind = pxAmpAsgn
        inc(L.bufpos)
      elif L.buf[L.bufpos] == '&':
        inc(L.bufpos)
        if L.buf[L.bufpos] == '=':
          inc(L.bufpos)
          tok.xkind = pxAmpAmpAsgn
        else:
          tok.xkind = pxAmpAmp
      else:
        tok.xkind = pxAmp
    of '|':
      # |=, ||=, || or |
      inc(L.bufpos)
      if L.buf[L.bufpos] == '=':
        tok.xkind = pxBarAsgn
        inc(L.bufpos)
      elif L.buf[L.bufpos] == '|':
        inc(L.bufpos)
        if L.buf[L.bufpos] == '=':
          inc(L.bufpos)
          tok.xkind = pxBarBarAsgn
        else:
          tok.xkind = pxBarBar
      else:
        tok.xkind = pxBar
    of '^':
      inc(L.bufpos)
      if L.buf[L.bufpos] == '=':
        tok.xkind = pxHatAsgn
        inc(L.bufpos)
      else:
        tok.xkind = pxHat
    of '%':
      inc(L.bufpos)
      if L.buf[L.bufpos] == '=':
        tok.xkind = pxModAsgn
        inc(L.bufpos)
      else:
        tok.xkind = pxMod
    of '~':
      inc(L.bufpos)
      if L.buf[L.bufpos] == '=':
        tok.xkind = pxTildeAsgn
        inc(L.bufpos)
      else:
        tok.xkind = pxTilde
    of '#':
      # ## is the concatenation operator, a single # starts a directive
      if L.buf[L.bufpos+1] == '#':
        inc(L.bufpos, 2)
        tok.xkind = pxDirConc
      else:
        getDirective(L, tok)
    of '"': getString(L, tok)
    of '\'': getCharLit(L, tok)
    of lexbase.EndOfFile:
      # the position is not advanced at the end of the file
      tok.xkind = pxEof
    else:
      # unknown character: keep it as the token text, report it together
      # with its ordinal value and step over it
      tok.s = $c
      tok.xkind = pxInvalid
      lexMessage(L, errInvalidToken, c & " (\\" & $(ord(c)) & ')')
      Inc(L.bufpos)