#
#
# Pas2nim - Pascal to Nimrod source converter
# (c) Copyright 2010 Andreas Rumpf
#
# See the file "copying.txt", included in this
# distribution, for details about the copyright.
#
# This module implements a FreePascal scanner. This is an adaption from
# the scanner module.
import
hashes, options, msgs, strutils, platform, idents, lexbase, llstream
const
MaxLineLength* = 80 # lines longer than this lead to a warning
numChars*: TCharSet = {'0'..'9', 'a'..'z', 'A'..'Z'}
SymChars*: TCharSet = {'a'..'z', 'A'..'Z', '0'..'9', '\x80'..'\xFF'}
SymStartChars*: TCharSet = {'a'..'z', 'A'..'Z', '\x80'..'\xFF'}
OpChars*: TCharSet = {'+', '-', '*', '/', '<', '>', '!', '?', '^', '.', '|',
'=', ':', '%', '&', '$', '@', '~', '\x80'..'\xFF'}
# keywords are sorted!
type
TTokKind* = enum
pxInvalid, pxEof,
pxAnd, pxArray, pxAs, pxAsm, pxBegin, pxCase, pxClass, pxConst,
pxConstructor, pxDestructor, pxDiv, pxDo, pxDownto, pxElse, pxEnd, pxExcept,
pxExports, pxFinalization, pxFinally, pxFor, pxFunction, pxGoto, pxIf,
pxImplementation, pxIn, pxInherited, pxInitialization, pxInline,
pxInterface, pxIs, pxLabel, pxLibrary, pxMod, pxNil, pxNot, pxObject, pxOf,
pxOr, pxOut, pxPacked, pxProcedure, pxProgram, pxProperty, pxRaise,
pxRecord, pxRepeat, pxResourcestring, pxSet, pxShl, pxShr, pxThen,
pxThreadvar, pxTo, pxTry, pxType, pxUnit, pxUntil, pxUses, pxVar, pxWhile,
pxWith, pxXor,
pxComment, # ordinary comment
pxCommand, # {@}
pxAmp, # {&}
pxPer, # {%}
pxStrLit, pxSymbol, # a symbol
pxIntLit, pxInt64Lit, # long constant like 0x70fffffff or out of int range
pxFloatLit, pxParLe, pxParRi, pxBracketLe, pxBracketRi, pxComma,
pxSemiColon, pxColon, # operators
pxAsgn, pxEquals, pxDot, pxDotDot, pxHat, pxPlus, pxMinus, pxStar, pxSlash,
pxLe, pxLt, pxGe, pxGt, pxNeq, pxAt, pxStarDirLe, pxStarDirRi, pxCurlyDirLe,
pxCurlyDirRi
TTokKinds* = set[TTokKind]
const
Keywords = ["and", "array", "as", "asm", "begin", "case", "class", "const",
"constructor", "destructor", "div", "do", "downto", "else", "end", "except",
"exports", "finalization", "finally", "for", "function", "goto", "if",
"implementation", "in", "inherited", "initialization", "inline",
"interface", "is", "label", "library", "mod", "nil", "not", "object", "of",
"or", "out", "packed", "procedure", "program", "property", "raise",
"record", "repeat", "resourcestring", "set", "shl", "shr", "then",
"threadvar", "to", "try", "type", "unit", "until", "uses", "var", "while",
"with", "xor"]
firstKeyword = pxAnd
lastKeyword = pxXor
type
TNumericalBase* = enum base10, base2, base8, base16
TToken* = object
xkind*: TTokKind # the type of the token
ident*: PIdent # the parsed identifier
iNumber*: BiggestInt # the parsed integer literal
fNumber*: BiggestFloat # the parsed floating point literal
base*: TNumericalBase # the numerical base; only valid for int
# or float literals
literal*: string # the parsed (string) literal
TLexer* = object of TBaseLexer
filename*: string
proc getTok*(L: var TLexer, tok: var TToken)
proc PrintTok*(tok: TToken)
proc `$`*(tok: TToken): string
# implementation
var
dummyIdent: PIdent
gLinesCompiled: int
proc fillToken(L: var TToken) =
L.xkind = pxInvalid
L.iNumber = 0
L.literal = ""
L.fNumber = 0.0
L.base = base10
L.ident = dummyIdent # this prevents many bugs!
proc openLexer*(lex: var TLexer, filename: string, inputstream: PLLStream) =
openBaseLexer(lex, inputstream)
lex.filename = filename
proc closeLexer*(lex: var TLexer) =
inc(gLinesCompiled, lex.LineNumber)
closeBaseLexer(lex)
proc getColumn(L: TLexer): int =
result = getColNumber(L, L.bufPos)
proc getLineInfo*(L: TLexer): TLineInfo =
result = newLineInfo(L.filename, L.linenumber, getColNumber(L, L.bufpos))
proc lexMessage*(L: TLexer, msg: TMsgKind, arg = "") =
msgs.GenericMessage(getLineInfo(L), msg, arg)
proc lexMessagePos(L: var TLexer, msg: TMsgKind, pos: int, arg = "") =
var info = newLineInfo(L.filename, L.linenumber, pos - L.lineStart)
msgs.GenericMessage(info, msg, arg)
proc TokKindToStr*(k: TTokKind): string =
case k
of pxEof: result = "[EOF]"
of firstKeyword..lastKeyword:
result = keywords[ord(k)-ord(firstKeyword)]
of pxInvalid, pxComment, pxStrLit: result = "string literal"
of pxCommand: result = "{@"
of pxAmp: result = "{&"
of pxPer: result = "{%"
of pxSymbol: result = "identifier"
of pxIntLit, pxInt64Lit: result = "integer literal"
of pxFloatLit: result = "floating point literal"
of pxParLe: result = "("
of pxParRi: result = ")"
of pxBracketLe: result = "["
of pxBracketRi: result = "]"
of pxComma: result = ","
of pxSemiColon: result = ";"
of pxColon: result = ":"
of pxAsgn: result = ":="
of pxEquals: result = "="
of pxDot: result = "."
of pxDotDot: result = ".."
of pxHat: result = "^"
of pxPlus: result = "+"
of pxMinus: result = "-"
of pxStar: result = "*"
of pxSlash: result = "/"
of pxLe: result = "<="
of pxLt: result = "<"
of pxGe: result = ">="
of pxGt: result = ">"
of pxNeq: result = "<>"
of pxAt: result = "@"
of pxStarDirLe: result = "(*$"
of pxStarDirRi: result = "*)"
of pxCurlyDirLe: result = "{$"
of pxCurlyDirRi: result = "}"
proc `$`(tok: TToken): string =
case tok.xkind
of pxInvalid, pxComment, pxStrLit: result = tok.literal
of pxSymbol: result = tok.ident.s
of pxIntLit, pxInt64Lit: result = $tok.iNumber
of pxFloatLit: result = $tok.fNumber
else: result = TokKindToStr(tok.xkind)
proc PrintTok(tok: TToken) =
writeln(stdout, $tok)
proc setKeyword(L: var TLexer, tok: var TToken) =
var x = binaryStrSearch(keywords, toLower(tok.ident.s))
if x < 0: tok.xkind = pxSymbol
else: tok.xKind = TTokKind(x + ord(firstKeyword))
proc matchUnderscoreChars(L: var TLexer, tok: var TToken, chars: TCharSet) =
# matches ([chars]_)*
var pos = L.bufpos # use registers for pos, buf
var buf = L.buf
while true:
if buf[pos] in chars:
add(tok.literal, buf[pos])
Inc(pos)
else:
break
if buf[pos] == '_':
add(tok.literal, '_')
Inc(pos)
L.bufPos = pos
proc isFloatLiteral(s: string): bool =
for i in countup(0, len(s)-1):
if s[i] in {'.', 'e', 'E'}:
return true
proc getNumber2(L: var TLexer, tok: var TToken) =
var pos = L.bufpos + 1 # skip %
if not (L.buf[pos] in {'0'..'1'}):
# BUGFIX for %date%
tok.xkind = pxInvalid
add(tok.literal, '%')
inc(L.bufpos)
return
tok.base = base2
var xi: biggestInt = 0
var bits = 0
while true:
case L.buf[pos]
of 'A'..'Z', 'a'..'z', '2'..'9', '.':
lexMessage(L, errInvalidNumber)
inc(pos)
of '_':
inc(pos)
of '0', '1':
xi = `shl`(xi, 1) or (ord(L.buf[pos]) - ord('0'))
inc(pos)
inc(bits)
else: break
tok.iNumber = xi
if (bits > 32): tok.xkind = pxInt64Lit
else: tok.xkind = pxIntLit
L.bufpos = pos
proc getNumber16(L: var TLexer, tok: var TToken) =
var pos = L.bufpos + 1 # skip $
tok.base = base16
var xi: biggestInt = 0
var bits = 0
while true:
case L.buf[pos]
of 'G'..'Z', 'g'..'z', '.':
lexMessage(L, errInvalidNumber)
inc(pos)
of '_': inc(pos)
of '0'..'9':
xi = `shl`(xi, 4) or (ord(L.buf[pos]) - ord('0'))
inc(pos)
inc(bits, 4)
of 'a'..'f':
xi = `shl`(xi, 4) or (ord(L.buf[pos]) - ord('a') + 10)
inc(pos)
inc(bits, 4)
of 'A'..'F':
xi = `shl`(xi, 4) or (ord(L.buf[pos]) - ord('A') + 10)
inc(pos)
inc(bits, 4)
else: break
tok.iNumber = xi
if (bits > 32):
tok.xkind = pxInt64Lit
else:
tok.xkind = pxIntLit
L.bufpos = pos
proc getNumber10(L: var TLexer, tok: var TToken) =
tok.base = base10
matchUnderscoreChars(L, tok, {'0'..'9'})
if (L.buf[L.bufpos] == '.') and (L.buf[L.bufpos + 1] in {'0'..'9'}):
add(tok.literal, '.')
inc(L.bufpos)
matchUnderscoreChars(L, tok, {'e', 'E', '+', '-', '0'..'9'})
try:
if isFloatLiteral(tok.literal):
tok.fnumber = parseFloat(tok.literal)
tok.xkind = pxFloatLit
else:
tok.iNumber = ParseInt(tok.literal)
if (tok.iNumber < low(int32)) or (tok.iNumber > high(int32)):
tok.xkind = pxInt64Lit
else:
tok.xkind = pxIntLit
except EInvalidValue:
lexMessage(L, errInvalidNumber, tok.literal)
except EOverflow:
lexMessage(L, errNumberOutOfRange, tok.literal)
proc HandleCRLF(L: var TLexer, pos: int): int =
case L.buf[pos]
of CR: result = lexbase.HandleCR(L, pos)
of LF: result = lexbase.HandleLF(L, pos)
else: result = pos
proc getString(L: var TLexer, tok: var TToken) =
var xi: int
var pos = L.bufPos
var buf = L.buf
while true:
if buf[pos] == '\'':
inc(pos)
while true:
case buf[pos]
of CR, LF, lexbase.EndOfFile:
lexMessage(L, errClosingQuoteExpected)
break
of '\'':
inc(pos)
if buf[pos] == '\'':
inc(pos)
add(tok.literal, '\'')
else:
break
else:
add(tok.literal, buf[pos])
inc(pos)
elif buf[pos] == '#':
inc(pos)
xi = 0
case buf[pos]
of '$':
inc(pos)
xi = 0
while true:
case buf[pos]
of '0'..'9': xi = (xi shl 4) or (ord(buf[pos]) - ord('0'))
of 'a'..'f': xi = (xi shl 4) or (ord(buf[pos]) - ord('a') + 10)
of 'A'..'F': xi = (xi shl 4) or (ord(buf[pos]) - ord('A') + 10)
else: break
inc(pos)
of '0'..'9':
xi = 0
while buf[pos] in {'0'..'9'}:
xi = (xi * 10) + (ord(buf[pos]) - ord('0'))
inc(pos)
else: lexMessage(L, errInvalidCharacterConstant)
if (xi <= 255): add(tok.literal, Chr(xi))
else: lexMessage(L, errInvalidCharacterConstant)
else:
break
tok.xkind = pxStrLit
L.bufpos = pos
proc getSymbol(L: var TLexer, tok: var TToken) =
var h: THash = 0
var pos = L.bufpos
var buf = L.buf
while true:
var c = buf[pos]
case c
of 'a'..'z', '0'..'9', '\x80'..'\xFF':
h = h +% Ord(c)
h = h +% h shl 10
h = h xor (h shr 6)
of 'A'..'Z':
c = chr(ord(c) + (ord('a') - ord('A'))) # toLower()
h = h +% Ord(c)
h = h +% h shl 10
h = h xor (h shr 6)
of '_': nil
else: break
Inc(pos)
h = h +% h shl 3
h = h xor (h shr 11)
h = h +% h shl 15
tok.ident = getIdent(addr(L.buf[L.bufpos]), pos - L.bufpos, h)
L.bufpos = pos
setKeyword(L, tok)
proc scanLineComment(L: var TLexer, tok: var TToken) =
var pos = L.bufpos
var buf = L.buf
# a comment ends if the next line does not start with the // on the same
# column after only whitespace
tok.xkind = pxComment
var col = getColNumber(L, pos)
while true:
inc(pos, 2) # skip //
add(tok.literal, '#')
while not (buf[pos] in {CR, LF, lexbase.EndOfFile}):
add(tok.literal, buf[pos])
inc(pos)
pos = handleCRLF(L, pos)
buf = L.buf
var indent = 0
while buf[pos] == ' ':
inc(pos)
inc(indent)
if (col == indent) and (buf[pos] == '/') and (buf[pos + 1] == '/'):
tok.literal = tok.literal & "\n"
else:
break
L.bufpos = pos
proc scanCurlyComment(L: var TLexer, tok: var TToken) =
var pos = L.bufpos
var buf = L.buf
tok.literal = "#"
tok.xkind = pxComment
while true:
case buf[pos]
of CR, LF:
pos = HandleCRLF(L, pos)
buf = L.buf
add(tok.literal, "\n#")
of '}':
inc(pos)
break
of lexbase.EndOfFile: lexMessage(L, errTokenExpected, "}")
else:
add(tok.literal, buf[pos])
inc(pos)
L.bufpos = pos
proc scanStarComment(L: var TLexer, tok: var TToken) =
var pos = L.bufpos
var buf = L.buf
tok.literal = "#"
tok.xkind = pxComment
while true:
case buf[pos]
of CR, LF:
pos = HandleCRLF(L, pos)
buf = L.buf
add(tok.literal, "\n#")
of '*':
inc(pos)
if buf[pos] == ')':
inc(pos)
break
else:
add(tok.literal, '*')
of lexbase.EndOfFile:
lexMessage(L, errTokenExpected, "*)")
else:
add(tok.literal, buf[pos])
inc(pos)
L.bufpos = pos
proc skip(L: var TLexer, tok: var TToken) =
var pos = L.bufpos
var buf = L.buf
while true:
case buf[pos]
of ' ', Tabulator:
Inc(pos) # newline is special:
of CR, LF:
pos = HandleCRLF(L, pos)
buf = L.buf
else:
break # EndOfFile also leaves the loop
L.bufpos = pos
proc getTok(L: var TLexer, tok: var TToken) =
tok.xkind = pxInvalid
fillToken(tok)
skip(L, tok)
var c = L.buf[L.bufpos]
if c in SymStartChars:
getSymbol(L, tok)
elif c in {'0'..'9'}:
getNumber10(L, tok)
else:
case c
of ';':
tok.xkind = pxSemicolon
Inc(L.bufpos)
of '/':
if L.buf[L.bufpos + 1] == '/':
scanLineComment(L, tok)
else:
tok.xkind = pxSlash
inc(L.bufpos)
of ',':
tok.xkind = pxComma
Inc(L.bufpos)
of '(':
Inc(L.bufpos)
if (L.buf[L.bufPos] == '*'):
if (L.buf[L.bufPos + 1] == '$'):
Inc(L.bufpos, 2)
skip(L, tok)
getSymbol(L, tok)
tok.xkind = pxStarDirLe
else:
inc(L.bufpos)
scanStarComment(L, tok)
else:
tok.xkind = pxParLe
of '*':
inc(L.bufpos)
if L.buf[L.bufpos] == ')':
inc(L.bufpos)
tok.xkind = pxStarDirRi
else:
tok.xkind = pxStar
of ')':
tok.xkind = pxParRi
Inc(L.bufpos)
of '[':
Inc(L.bufpos)
tok.xkind = pxBracketLe
of ']':
Inc(L.bufpos)
tok.xkind = pxBracketRi
of '.':
inc(L.bufpos)
if L.buf[L.bufpos] == '.':
tok.xkind = pxDotDot
inc(L.bufpos)
else:
tok.xkind = pxDot
of '{':
Inc(L.bufpos)
case L.buf[L.bufpos]
of '$':
Inc(L.bufpos)
skip(L, tok)
getSymbol(L, tok)
tok.xkind = pxCurlyDirLe
of '&':
Inc(L.bufpos)
tok.xkind = pxAmp
of '%':
Inc(L.bufpos)
tok.xkind = pxPer
of '@':
Inc(L.bufpos)
tok.xkind = pxCommand
else: scanCurlyComment(L, tok)
of '+':
tok.xkind = pxPlus
inc(L.bufpos)
of '-':
tok.xkind = pxMinus
inc(L.bufpos)
of ':':
inc(L.bufpos)
if L.buf[L.bufpos] == '=':
inc(L.bufpos)
tok.xkind = pxAsgn
else:
tok.xkind = pxColon
of '<':
inc(L.bufpos)
if L.buf[L.bufpos] == '>':
inc(L.bufpos)
tok.xkind = pxNeq
elif L.buf[L.bufpos] == '=':
inc(L.bufpos)
tok.xkind = pxLe
else:
tok.xkind = pxLt
of '>':
inc(L.bufpos)
if L.buf[L.bufpos] == '=':
inc(L.bufpos)
tok.xkind = pxGe
else:
tok.xkind = pxGt
of '=':
tok.xkind = pxEquals
inc(L.bufpos)
of '@':
tok.xkind = pxAt
inc(L.bufpos)
of '^':
tok.xkind = pxHat
inc(L.bufpos)
of '}':
tok.xkind = pxCurlyDirRi
Inc(L.bufpos)
of '\'', '#':
getString(L, tok)
of '$':
getNumber16(L, tok)
of '%':
getNumber2(L, tok)
of lexbase.EndOfFile:
tok.xkind = pxEof
else:
tok.literal = c & ""
tok.xkind = pxInvalid
lexMessage(L, errInvalidToken, c & " (\\" & $(ord(c)) & ')')
Inc(L.bufpos)