#
#
# Pas2nim - Pascal to Nimrod source converter
# (c) Copyright 2010 Andreas Rumpf
#
# See the file "copying.txt", included in this
# distribution, for details about the copyright.
#
# This module implements a FreePascal scanner. This is an adaption from
# the scanner module.
import
nhashes, options, msgs, strutils, platform, idents, lexbase
const
MaxLineLength* = 80 # lines longer than this lead to a warning
numChars*: TCharSet = {'0'..'9', 'a'..'z', 'A'..'Z'}
SymChars*: TCharSet = {'a'..'z', 'A'..'Z', '0'..'9', '\x80'..'\xFF'}
SymStartChars*: TCharSet = {'a'..'z', 'A'..'Z', '\x80'..'\xFF'}
OpChars*: TCharSet = {'+', '-', '*', '/', '<', '>', '!', '?', '^', '.', '|',
'=', ':', '%', '&', '$', '@', '~', '\x80'..'\xFF'}
# keywords are sorted!
type
TPasTokKind* = enum
pxInvalid, pxEof,
pxAnd, pxArray, pxAs, pxAsm, pxBegin, pxCase, pxClass, pxConst,
pxConstructor, pxDestructor, pxDiv, pxDo, pxDownto, pxElse, pxEnd, pxExcept,
pxExports, pxFinalization, pxFinally, pxFor, pxFunction, pxGoto, pxIf,
pxImplementation, pxIn, pxInherited, pxInitialization, pxInline,
pxInterface, pxIs, pxLabel, pxLibrary, pxMod, pxNil, pxNot, pxObject, pxOf,
pxOr, pxOut, pxPacked, pxProcedure, pxProgram, pxProperty, pxRaise,
pxRecord, pxRepeat, pxResourcestring, pxSet, pxShl, pxShr, pxThen,
pxThreadvar, pxTo, pxTry, pxType, pxUnit, pxUntil, pxUses, pxVar, pxWhile,
pxWith, pxXor,
pxComment, # ordinary comment
pxCommand, # {@}
pxAmp, # {&}
pxPer, # {%}
pxStrLit, pxSymbol, # a symbol
pxIntLit, pxInt64Lit, # long constant like 0x70fffffff or out of int range
pxFloatLit, pxParLe, pxParRi, pxBracketLe, pxBracketRi, pxComma,
pxSemiColon, pxColon, # operators
pxAsgn, pxEquals, pxDot, pxDotDot, pxHat, pxPlus, pxMinus, pxStar, pxSlash,
pxLe, pxLt, pxGe, pxGt, pxNeq, pxAt, pxStarDirLe, pxStarDirRi, pxCurlyDirLe,
pxCurlyDirRi
TPasTokKinds* = set[TPasTokKind]
const
Keywords = ["and", "array", "as", "asm", "begin", "case", "class", "const",
"constructor", "destructor", "div", "do", "downto", "else", "end", "except",
"exports", "finalization", "finally", "for", "function", "goto", "if",
"implementation", "in", "inherited", "initialization", "inline",
"interface", "is", "label", "library", "mod", "nil", "not", "object", "of",
"or", "out", "packed", "procedure", "program", "property", "raise",
"record", "repeat", "resourcestring", "set", "shl", "shr", "then",
"threadvar", "to", "try", "type", "unit", "until", "uses", "var", "while",
"with", "xor"]
firstKeyword = pxAnd
lastKeyword = pxXor
type
TPasTok* = object
xkind*: TPasTokKind # the type of the token
ident*: PIdent # the parsed identifier
iNumber*: BiggestInt # the parsed integer literal
fNumber*: BiggestFloat # the parsed floating point literal
base*: TNumericalBase # the numerical base; only valid for int
# or float literals
literal*: string # the parsed (string) literal
TPasLex* = object
filename*: string
proc getPasTok*(L: var TPasLex, tok: var TPasTok)
proc PrintPasTok*(tok: TPasTok)
proc pasTokToStr*(tok: TPasTok): string
# implementation
var dummyIdent: PIdent
proc fillToken(L: var TToken) =
L.TokType = tkInvalid
L.iNumber = 0
L.Indent = 0
L.literal = ""
L.fNumber = 0.0
L.base = base10
L.ident = dummyIdent # this prevents many bugs!
proc openLexer(lex: var TLexer, filename: string, inputstream: PLLStream) =
openBaseLexer(lex, inputstream)
lex.indentStack = @[0]
lex.filename = filename
lex.indentAhead = - 1
proc closeLexer(lex: var TLexer) =
inc(gLinesCompiled, lex.LineNumber)
closeBaseLexer(lex)
proc getColumn(L: TLexer): int =
result = getColNumber(L, L.bufPos)
proc getLineInfo(L: TLexer): TLineInfo =
result = newLineInfo(L.filename, L.linenumber, getColNumber(L, L.bufpos))
proc lexMessage(L: TLexer, msg: TMsgKind, arg = "") =
msgs.liMessage(getLineInfo(L), msg, arg)
proc lexMessagePos(L: var TLexer, msg: TMsgKind, pos: int, arg = "") =
var info = newLineInfo(L.filename, L.linenumber, pos - L.lineStart)
msgs.liMessage(info, msg, arg)
proc binaryStrSearch(x: openarray[string], y: string): int =
var a = 0
var b = len(x)
while a < b:
var mid = (a + b) div 2
if x[mid] < y: a = mid + 1
else: b = mid
if a < len(x) and x[a] == y: result = a
else: result = -1
proc pastokToStr(tok: TPasTok): string =
case tok.xkind
of pxIntLit, pxInt64Lit: result = $(tok.iNumber)
of pxFloatLit: result = $(tok.fNumber)
of pxInvalid, pxComment..pxStrLit: result = tok.literal
else:
if (tok.ident.s != ""): result = tok.ident.s
else: result = pasTokKindToStr[tok.xkind]
proc PrintPasTok(tok: TPasTok) =
write(stdout, pasTokKindToStr[tok.xkind])
write(stdout, ' ')
writeln(stdout, pastokToStr(tok))
proc setKeyword(L: var TPasLex, tok: var TPasTok) =
var x = binaryStrSearch(keywords, toLower(tok.ident.s))
if x < 0: tok.xkind = pxSymbol
else: tok.xKind = TPasTokKind(x + ord(firstKeyword))
proc matchUnderscoreChars(L: var TPasLex, tok: var TPasTok, chars: TCharSet) =
# matches ([chars]_)*
var pos = L.bufpos # use registers for pos, buf
var buf = L.buf
while true:
if buf[pos] in chars:
add(tok.literal, buf[pos])
Inc(pos)
else:
break
if buf[pos] == '_':
add(tok.literal, '_')
Inc(pos)
L.bufPos = pos
proc isFloatLiteral(s: string): bool =
for i in countup(0, len(s)-1):
if s[i] in {'.', 'e', 'E'}:
return true
proc getNumber2(L: var TPasLex, tok: var TPasTok) =
var pos = L.bufpos + 1 # skip %
if not (L.buf[pos] in {'0'..'1'}):
# BUGFIX for %date%
tok.xkind = pxInvalid
add(tok.literal, '%')
inc(L.bufpos)
return
tok.base = base2
var xi: biggestInt = 0
var bits = 0
while true:
case L.buf[pos]
of 'A'..'Z', 'a'..'z', '2'..'9', '.':
lexMessage(L, errInvalidNumber)
inc(pos)
of '_':
inc(pos)
of '0', '1':
xi = `shl`(xi, 1) or (ord(L.buf[pos]) - ord('0'))
inc(pos)
inc(bits)
else: break
tok.iNumber = xi
if (bits > 32): tok.xkind = pxInt64Lit
else: tok.xkind = pxIntLit
L.bufpos = pos
proc getNumber16(L: var TPasLex, tok: var TPasTok) =
var pos = L.bufpos + 1 # skip $
tok.base = base16
var xi: biggestInt = 0
var bits = 0
while true:
case L.buf[pos]
of 'G'..'Z', 'g'..'z', '.':
lexMessage(L, errInvalidNumber)
inc(pos)
of '_': inc(pos)
of '0'..'9':
xi = `shl`(xi, 4) or (ord(L.buf[pos]) - ord('0'))
inc(pos)
inc(bits, 4)
of 'a'..'f':
xi = `shl`(xi, 4) or (ord(L.buf[pos]) - ord('a') + 10)
inc(pos)
inc(bits, 4)
of 'A'..'F':
xi = `shl`(xi, 4) or (ord(L.buf[pos]) - ord('A') + 10)
inc(pos)
inc(bits, 4)
else: break
tok.iNumber = xi
if (bits > 32):
tok.xkind = pxInt64Lit
else:
tok.xkind = pxIntLit
L.bufpos = pos
proc getNumber10(L: var TPasLex, tok: var TPasTok) =
tok.base = base10
matchUnderscoreChars(L, tok, {'0'..'9'})
if (L.buf[L.bufpos] == '.') and (L.buf[L.bufpos + 1] in {'0'..'9'}):
add(tok.literal, '.')
inc(L.bufpos)
matchUnderscoreChars(L, tok, {'e', 'E', '+', '-', '0'..'9'})
try:
if isFloatLiteral(tok.literal):
tok.fnumber = parseFloat(tok.literal)
tok.xkind = pxFloatLit
else:
tok.iNumber = ParseInt(tok.literal)
if (tok.iNumber < low(int32)) or (tok.iNumber > high(int32)):
tok.xkind = pxInt64Lit
else:
tok.xkind = pxIntLit
except EInvalidValue:
lexMessage(L, errInvalidNumber, tok.literal)
except EOverflow:
lexMessage(L, errNumberOutOfRange, tok.literal)
proc HandleCRLF(L: var TLexer, pos: int): int =
case L.buf[pos]
of CR: result = lexbase.HandleCR(L, pos)
of LF: result = lexbase.HandleLF(L, pos)
else: result = pos
proc getString(L: var TPasLex, tok: var TPasTok) =
var xi: int
var pos = L.bufPos
var buf = L.buf
while true:
if buf[pos] == '\'':
inc(pos)
while true:
case buf[pos]
of CR, LF, lexbase.EndOfFile:
lexMessage(L, errClosingQuoteExpected)
break
of '\'':
inc(pos)
if buf[pos] == '\'':
inc(pos)
add(tok.literal, '\'')
else:
break
else:
add(tok.literal, buf[pos])
inc(pos)
elif buf[pos] == '#':
inc(pos)
xi = 0
case buf[pos]
of '$':
inc(pos)
xi = 0
while true:
case buf[pos]
of '0'..'9': xi = (xi shl 4) or (ord(buf[pos]) - ord('0'))
of 'a'..'f': xi = (xi shl 4) or (ord(buf[pos]) - ord('a') + 10)
of 'A'..'F': xi = (xi shl 4) or (ord(buf[pos]) - ord('A') + 10)
else: break
inc(pos)
of '0'..'9':
xi = 0
while buf[pos] in {'0'..'9'}:
xi = (xi * 10) + (ord(buf[pos]) - ord('0'))
inc(pos)
else: lexMessage(L, errInvalidCharacterConstant)
if (xi <= 255): add(tok.literal, Chr(xi))
else: lexMessage(L, errInvalidCharacterConstant)
else:
break
tok.xkind = pxStrLit
L.bufpos = pos
proc getSymbol(L: var TPasLex, tok: var TPasTok) =
var h: THash = 0
var pos = L.bufpos
var buf = L.buf
while true:
var c = buf[pos]
case c
of 'a'..'z', '0'..'9', '\x80'..'\xFF':
h = h +% Ord(c)
h = h +% h shl 10
h = h xor (h shr 6)
of 'A'..'Z':
c = chr(ord(c) + (ord('a') - ord('A'))) # toLower()
h = h +% Ord(c)
h = h +% h shl 10
h = h xor (h shr 6)
of '_': nil
else: break
Inc(pos)
h = h +% h shl 3
h = h xor (h shr 11)
h = h +% h shl 15
tok.ident = getIdent(addr(L.buf[L.bufpos]), pos - L.bufpos, h)
L.bufpos = pos
setKeyword(L, tok)
proc scanLineComment(L: var TPasLex, tok: var TPasTok) =
var pos = L.bufpos
var buf = L.buf
# a comment ends if the next line does not start with the // on the same
# column after only whitespace
tok.xkind = pxComment
var col = getColNumber(L, pos)
while true:
inc(pos, 2) # skip //
add(tok.literal, '#')
while not (buf[pos] in {CR, LF, lexbase.EndOfFile}):
add(tok.literal, buf[pos])
inc(pos)
pos = handleCRLF(L, pos)
buf = L.buf
var indent = 0
while buf[pos] == ' ':
inc(pos)
inc(indent)
if (col == indent) and (buf[pos] == '/') and (buf[pos + 1] == '/'):
tok.literal = tok.literal & "\n"
else:
break
L.bufpos = pos
proc scanCurlyComment(L: var TPasLex, tok: var TPasTok) =
var pos = L.bufpos
var buf = L.buf
tok.literal = "#"
tok.xkind = pxComment
while true:
case buf[pos]
of CR, LF:
pos = HandleCRLF(L, pos)
buf = L.buf
add(tok.literal, "\n#")
of '}':
inc(pos)
break
of lexbase.EndOfFile: lexMessage(L, errTokenExpected, "}")
else:
add(tok.literal, buf[pos])
inc(pos)
L.bufpos = pos
proc scanStarComment(L: var TPasLex, tok: var TPasTok) =
var pos = L.bufpos
var buf = L.buf
tok.literal = "#"
tok.xkind = pxComment
while true:
case buf[pos]
of CR, LF:
pos = HandleCRLF(L, pos)
buf = L.buf
add(tok.literal, "\n#")
of '*':
inc(pos)
if buf[pos] == ')':
inc(pos)
break
else:
add(tok.literal, '*')
of lexbase.EndOfFile:
lexMessage(L, errTokenExpected, "*)")
else:
add(tok.literal, buf[pos])
inc(pos)
L.bufpos = pos
proc skip(L: var TPasLex, tok: var TPasTok) =
var pos = L.bufpos
var buf = L.buf
while true:
case buf[pos]
of ' ', Tabulator:
Inc(pos) # newline is special:
of CR, LF:
pos = HandleCRLF(L, pos)
buf = L.buf
else:
break # EndOfFile also leaves the loop
L.bufpos = pos
proc getPasTok(L: var TPasLex, tok: var TPasTok) =
tok.xkind = pxInvalid
fillToken(tok)
skip(L, tok)
var c = L.buf[L.bufpos]
if c in SymStartChars:
getSymbol(L, tok)
elif c in {'0'..'9'}:
getNumber10(L, tok)
else:
case c
of ';':
tok.xkind = pxSemicolon
Inc(L.bufpos)
of '/':
if L.buf[L.bufpos + 1] == '/':
scanLineComment(L, tok)
else:
tok.xkind = pxSlash
inc(L.bufpos)
of ',':
tok.xkind = pxComma
Inc(L.bufpos)
of '(':
Inc(L.bufpos)
if (L.buf[L.bufPos] == '*'):
if (L.buf[L.bufPos + 1] == '$'):
Inc(L.bufpos, 2)
skip(L, tok)
getSymbol(L, tok)
tok.xkind = pxStarDirLe
else:
inc(L.bufpos)
scanStarComment(L, tok)
else:
tok.xkind = pxParLe
of '*':
inc(L.bufpos)
if L.buf[L.bufpos] == ')':
inc(L.bufpos)
tok.xkind = pxStarDirRi
else:
tok.xkind = pxStar
of ')':
tok.xkind = pxParRi
Inc(L.bufpos)
of '[':
Inc(L.bufpos)
tok.xkind = pxBracketLe
of ']':
Inc(L.bufpos)
tok.xkind = pxBracketRi
of '.':
inc(L.bufpos)
if L.buf[L.bufpos] == '.':
tok.xkind = pxDotDot
inc(L.bufpos)
else:
tok.xkind = pxDot
of '{':
Inc(L.bufpos)
case L.buf[L.bufpos]
of '$':
Inc(L.bufpos)
skip(L, tok)
getSymbol(L, tok)
tok.xkind = pxCurlyDirLe
of '&':
Inc(L.bufpos)
tok.xkind = pxAmp
of '%':
Inc(L.bufpos)
tok.xkind = pxPer
of '@':
Inc(L.bufpos)
tok.xkind = pxCommand
else: scanCurlyComment(L, tok)
of '+':
tok.xkind = pxPlus
inc(L.bufpos)
of '-':
tok.xkind = pxMinus
inc(L.bufpos)
of ':':
inc(L.bufpos)
if L.buf[L.bufpos] == '=':
inc(L.bufpos)
tok.xkind = pxAsgn
else:
tok.xkind = pxColon
of '<':
inc(L.bufpos)
if L.buf[L.bufpos] == '>':
inc(L.bufpos)
tok.xkind = pxNeq
elif L.buf[L.bufpos] == '=':
inc(L.bufpos)
tok.xkind = pxLe
else:
tok.xkind = pxLt
of '>':
inc(L.bufpos)
if L.buf[L.bufpos] == '=':
inc(L.bufpos)
tok.xkind = pxGe
else:
tok.xkind = pxGt
of '=':
tok.xkind = pxEquals
inc(L.bufpos)
of '@':
tok.xkind = pxAt
inc(L.bufpos)
of '^':
tok.xkind = pxHat
inc(L.bufpos)
of '}':
tok.xkind = pxCurlyDirRi
Inc(L.bufpos)
of '\'', '#':
getString(L, tok)
of '$':
getNumber16(L, tok)
of '%':
getNumber2(L, tok)
of lexbase.EndOfFile:
tok.xkind = pxEof
else:
tok.literal = c & ""
tok.xkind = pxInvalid
lexMessage(L, errInvalidToken, c & " (\\" & $(ord(c)) & ')')
Inc(L.bufpos)