//
//
// The Nimrod Compiler
// (c) Copyright 2009 Andreas Rumpf
//
// See the file "copying.txt", included in this
// distribution, for details about the copyright.
//
unit paslex;
// This module implements a FreePascal scanner. This is a adaption from
// the scanner module.
interface
{$include 'config.inc'}
uses
charsets, nsystem, sysutils,
nhashes, options, msgs, strutils, platform, idents,
lexbase, wordrecg, scanner;
const
MaxLineLength = 80; // lines longer than this lead to a warning
numChars: TCharSet = ['0'..'9','a'..'z','A'..'Z']; // we support up to base 36
SymChars: TCharSet = ['a'..'z', 'A'..'Z', '0'..'9', #128..#255];
SymStartChars: TCharSet = ['a'..'z', 'A'..'Z', #128..#255];
OpChars: TCharSet = ['+', '-', '*', '/', '<', '>', '!', '?', '^', '.',
'|', '=', ':', '%', '&', '$', '@', '~', #128..#255];
type
// order is important for TPasTokKind
TPasTokKind = (pxInvalid, pxEof,
// keywords:
//[[[cog
//from string import capitalize
//keywords = eval(open("data/pas_keyw.yml").read())
//idents = ""
//strings = ""
//i = 1
//for k in keywords:
// idents = idents + "px" + capitalize(k) + ", "
// strings = strings + "'" + k + "', "
// if i % 4 == 0:
// idents = idents + "\n"
// strings = strings + "\n"
// i = i + 1
//cog.out(idents)
//]]]
pxAnd, pxArray, pxAs, pxAsm,
pxBegin, pxCase, pxClass, pxConst,
pxConstructor, pxDestructor, pxDiv, pxDo,
pxDownto, pxElse, pxEnd, pxExcept,
pxExports, pxFinalization, pxFinally, pxFor,
pxFunction, pxGoto, pxIf, pxImplementation,
pxIn, pxInherited, pxInitialization, pxInline,
pxInterface, pxIs, pxLabel, pxLibrary,
pxMod, pxNil, pxNot, pxObject,
pxOf, pxOr, pxOut, pxPacked,
pxProcedure, pxProgram, pxProperty, pxRaise,
pxRecord, pxRepeat, pxResourcestring, pxSet,
pxShl, pxShr, pxThen, pxThreadvar,
pxTo, pxTry, pxType, pxUnit,
pxUntil, pxUses, pxVar, pxWhile,
pxWith, pxXor,
//[[[end]]]
pxComment, // ordinary comment
pxCommand, // {@}
pxAmp, // {&}
pxPer, // {%}
pxStrLit,
pxSymbol, // a symbol
pxIntLit,
pxInt64Lit, // long constant like 0x00000070fffffff or out of int range
pxFloatLit,
pxParLe, pxParRi, pxBracketLe, pxBracketRi,
pxComma, pxSemiColon, pxColon,
// operators
pxAsgn,
pxEquals, pxDot, pxDotDot, pxHat, pxPlus, pxMinus, pxStar, pxSlash,
pxLe, pxLt, pxGe, pxGt, pxNeq, pxAt,
pxStarDirLe,
pxStarDirRi,
pxCurlyDirLe,
pxCurlyDirRi
);
TPasTokKinds = set of TPasTokKind;
const
PasTokKindToStr: array [TPasTokKind] of string = (
'pxInvalid', '[EOF]',
//[[[cog
//cog.out(strings)
//]]]
'and', 'array', 'as', 'asm',
'begin', 'case', 'class', 'const',
'constructor', 'destructor', 'div', 'do',
'downto', 'else', 'end', 'except',
'exports', 'finalization', 'finally', 'for',
'function', 'goto', 'if', 'implementation',
'in', 'inherited', 'initialization', 'inline',
'interface', 'is', 'label', 'library',
'mod', 'nil', 'not', 'object',
'of', 'or', 'out', 'packed',
'procedure', 'program', 'property', 'raise',
'record', 'repeat', 'resourcestring', 'set',
'shl', 'shr', 'then', 'threadvar',
'to', 'try', 'type', 'unit',
'until', 'uses', 'var', 'while',
'with', 'xor',
//[[[end]]]
'pxComment', 'pxCommand',
'{&}', '{%}', 'pxStrLit', '[IDENTIFIER]', 'pxIntLit', 'pxInt64Lit',
'pxFloatLit',
'('+'', ')'+'', '['+'', ']'+'',
','+'', ';'+'', ':'+'',
':=', '='+'', '.'+'', '..', '^'+'', '+'+'', '-'+'', '*'+'', '/'+'',
'<=', '<'+'', '>=', '>'+'', '<>', '@'+'', '(*$', '*)', '{$', '}'+''
);
type
TPasTok = object(TToken) // a Pascal token
xkind: TPasTokKind; // the type of the token
end;
TPasLex = object(TLexer)
end;
procedure getPasTok(var L: TPasLex; out tok: TPasTok);
procedure PrintPasTok(const tok: TPasTok);
function pasTokToStr(const tok: TPasTok): string;
implementation
function pastokToStr(const tok: TPasTok): string;
begin
case tok.xkind of
pxIntLit, pxInt64Lit:
result := toString(tok.iNumber);
pxFloatLit:
result := toStringF(tok.fNumber);
pxInvalid, pxComment..pxStrLit:
result := tok.literal;
else if (tok.ident.s <> '') then
result := tok.ident.s
else
result := pasTokKindToStr[tok.xkind];
end
end;
procedure PrintPasTok(const tok: TPasTok);
begin
write(output, pasTokKindToStr[tok.xkind]);
write(output, ' ');
writeln(output, pastokToStr(tok))
end;
// ----------------------------------------------------------------------------
procedure setKeyword(var L: TPasLex; var tok: TPasTok);
begin
case tok.ident.id of
//[[[cog
//for k in keywords:
// m = capitalize(k)
// cog.outl("ord(w%s):%s tok.xkind := px%s;" % (m, ' '*(18-len(m)), m))
//]]]
ord(wAnd): tok.xkind := pxAnd;
ord(wArray): tok.xkind := pxArray;
ord(wAs): tok.xkind := pxAs;
ord(wAsm): tok.xkind := pxAsm;
ord(wBegin): tok.xkind := pxBegin;
ord(wCase): tok.xkind := pxCase;
ord(wClass): tok.xkind := pxClass;
ord(wConst): tok.xkind := pxConst;
ord(wConstructor): tok.xkind := pxConstructor;
ord(wDestructor): tok.xkind := pxDestructor;
ord(wDiv): tok.xkind := pxDiv;
ord(wDo): tok.xkind := pxDo;
ord(wDownto): tok.xkind := pxDownto;
ord(wElse): tok.xkind := pxElse;
ord(wEnd): tok.xkind := pxEnd;
ord(wExcept): tok.xkind := pxExcept;
ord(wExports): tok.xkind := pxExports;
ord(wFinalization): tok.xkind := pxFinalization;
ord(wFinally): tok.xkind := pxFinally;
ord(wFor): tok.xkind := pxFor;
ord(wFunction): tok.xkind := pxFunction;
ord(wGoto): tok.xkind := pxGoto;
ord(wIf): tok.xkind := pxIf;
ord(wImplementation): tok.xkind := pxImplementation;
ord(wIn): tok.xkind := pxIn;
ord(wInherited): tok.xkind := pxInherited;
ord(wInitialization): tok.xkind := pxInitialization;
ord(wInline): tok.xkind := pxInline;
ord(wInterface): tok.xkind := pxInterface;
ord(wIs): tok.xkind := pxIs;
ord(wLabel): tok.xkind := pxLabel;
ord(wLibrary): tok.xkind := pxLibrary;
ord(wMod): tok.xkind := pxMod;
ord(wNil): tok.xkind := pxNil;
ord(wNot): tok.xkind := pxNot;
ord(wObject): tok.xkind := pxObject;
ord(wOf): tok.xkind := pxOf;
ord(wOr): tok.xkind := pxOr;
ord(wOut): tok.xkind := pxOut;
ord(wPacked): tok.xkind := pxPacked;
ord(wProcedure): tok.xkind := pxProcedure;
ord(wProgram): tok.xkind := pxProgram;
ord(wProperty): tok.xkind := pxProperty;
ord(wRaise): tok.xkind := pxRaise;
ord(wRecord): tok.xkind := pxRecord;
ord(wRepeat): tok.xkind := pxRepeat;
ord(wResourcestring): tok.xkind := pxResourcestring;
ord(wSet): tok.xkind := pxSet;
ord(wShl): tok.xkind := pxShl;
ord(wShr): tok.xkind := pxShr;
ord(wThen): tok.xkind := pxThen;
ord(wThreadvar): tok.xkind := pxThreadvar;
ord(wTo): tok.xkind := pxTo;
ord(wTry): tok.xkind := pxTry;
ord(wType): tok.xkind := pxType;
ord(wUnit): tok.xkind := pxUnit;
ord(wUntil): tok.xkind := pxUntil;
ord(wUses): tok.xkind := pxUses;
ord(wVar): tok.xkind := pxVar;
ord(wWhile): tok.xkind := pxWhile;
ord(wWith): tok.xkind := pxWith;
ord(wXor): tok.xkind := pxXor;
//[[[end]]]
else tok.xkind := pxSymbol
end
end;
// ----------------------------------------------------------------------------
procedure matchUnderscoreChars(var L: TPasLex; var tok: TPasTok;
const chars: TCharSet);
// matches ([chars]_)*
var
pos: int;
buf: PChar;
begin
pos := L.bufpos; // use registers for pos, buf
buf := L.buf;
repeat
if buf[pos] in chars then begin
addChar(tok.literal, buf[pos]);
Inc(pos)
end
else break;
if buf[pos] = '_' then begin
addChar(tok.literal, '_');
Inc(pos);
end;
until false;
L.bufPos := pos;
end;
function isFloatLiteral(const s: string): boolean;
var
i: int;
begin
for i := strStart to length(s)+strStart-1 do
if s[i] in ['.','e','E'] then begin
result := true; exit
end;
result := false
end;
procedure getNumber2(var L: TPasLex; var tok: TPasTok);
var
pos, bits: int;
xi: biggestInt;
begin
pos := L.bufpos+1; // skip %
if not (L.buf[pos] in ['0'..'1']) then begin // BUGFIX for %date%
tok.xkind := pxInvalid;
addChar(tok.literal, '%');
inc(L.bufpos);
exit;
end;
tok.base := base2;
xi := 0;
bits := 0;
while true do begin
case L.buf[pos] of
'A'..'Z', 'a'..'z', '2'..'9', '.': begin
lexMessage(L, errInvalidNumber);
inc(pos)
end;
'_': inc(pos);
'0', '1': begin
xi := shlu(xi, 1) or (ord(L.buf[pos]) - ord('0'));
inc(pos);
inc(bits);
end;
else break;
end
end;
tok.iNumber := xi;
if (bits > 32) then //or (xi < low(int32)) or (xi > high(int32)) then
tok.xkind := pxInt64Lit
else
tok.xkind := pxIntLit;
L.bufpos := pos;
end;
procedure getNumber16(var L: TPasLex; var tok: TPasTok);
var
pos, bits: int;
xi: biggestInt;
begin
pos := L.bufpos+1; // skip $
tok.base := base16;
xi := 0;
bits := 0;
while true do begin
case L.buf[pos] of
'G'..'Z', 'g'..'z', '.': begin
lexMessage(L, errInvalidNumber);
inc(pos);
end;
'_': inc(pos);
'0'..'9': begin
xi := shlu(xi, 4) or (ord(L.buf[pos]) - ord('0'));
inc(pos);
inc(bits, 4);
end;
'a'..'f': begin
xi := shlu(xi, 4) or (ord(L.buf[pos]) - ord('a') + 10);
inc(pos);
inc(bits, 4);
end;
'A'..'F': begin
xi := shlu(xi, 4) or (ord(L.buf[pos]) - ord('A') + 10);
inc(pos);
inc(bits, 4);
end;
else break;
end
end;
tok.iNumber := xi;
if (bits > 32) then // (xi < low(int32)) or (xi > high(int32)) then
tok.xkind := pxInt64Lit
else
tok.xkind := pxIntLit;
L.bufpos := pos;
end;
procedure getNumber10(var L: TPasLex; var tok: TPasTok);
begin
tok.base := base10;
matchUnderscoreChars(L, tok, ['0'..'9']);
if (L.buf[L.bufpos] = '.') and (L.buf[L.bufpos+1] in ['0'..'9']) then begin
addChar(tok.literal, '.');
inc(L.bufpos);
matchUnderscoreChars(L, tok, ['e', 'E', '+', '-', '0'..'9'])
end;
try
if isFloatLiteral(tok.literal) then begin
tok.fnumber := parseFloat(tok.literal);
tok.xkind := pxFloatLit;
end
else begin
tok.iNumber := ParseInt(tok.literal);
if (tok.iNumber < low(int32)) or (tok.iNumber > high(int32)) then
tok.xkind := pxInt64Lit
else
tok.xkind := pxIntLit;
end;
except
on EInvalidValue do
lexMessage(L, errInvalidNumber, tok.literal);
on EOverflow do
lexMessage(L, errNumberOutOfRange, tok.literal);
{@ignore}
on sysutils.EIntOverflow do
lexMessage(L, errNumberOutOfRange, tok.literal);
{@emit}
end;
end;
function HandleCRLF(var L: TLexer; pos: int): int;
begin
case L.buf[pos] of
CR: result := lexbase.HandleCR(L, pos);
LF: result := lexbase.HandleLF(L, pos);
else result := pos
end
end;
procedure getString(var L: TPasLex; var tok: TPasTok);
var
pos, xi: int;
buf: PChar;
begin
pos := L.bufPos;
buf := L.buf;
while true do begin
if buf[pos] = '''' then begin
inc(pos);
while true do begin
case buf[pos] of
CR, LF, lexbase.EndOfFile: begin
lexMessage(L, errClosingQuoteExpected);
break
end;
'''': begin
inc(pos);
if buf[pos] = '''' then begin
inc(pos);
addChar(tok.literal, '''');
end
else break;
end;
else begin
addChar(tok.literal, buf[pos]);
inc(pos);
end
end
end
end
else if buf[pos] = '#' then begin
inc(pos);
xi := 0;
case buf[pos] of
'$': begin
inc(pos);
xi := 0;
while true do begin
case buf[pos] of
'0'..'9': xi := (xi shl 4) or (ord(buf[pos]) - ord('0'));
'a'..'f': xi := (xi shl 4) or (ord(buf[pos]) - ord('a') + 10);
'A'..'F': xi := (xi shl 4) or (ord(buf[pos]) - ord('A') + 10);
else break;
end;
inc(pos)
end
end;
'0'..'9': begin
xi := 0;
while buf[pos] in ['0'..'9'] do begin
xi := (xi * 10) + (ord(buf[pos]) - ord('0'));
inc(pos);
end;
end
else lexMessage(L, errInvalidCharacterConstant)
end;
if (xi <= 255) then
addChar(tok.literal, Chr(xi))
else
lexMessage(L, errInvalidCharacterConstant)
end
else break
end;
tok.xkind := pxStrLit;
L.bufpos := pos;
end;
{@ignore}
{$ifopt Q+} {$define Q_on} {$Q-} {$endif}
{$ifopt R+} {$define R_on} {$R-} {$endif}
{@emit}
procedure getSymbol(var L: TPasLex; var tok: TPasTok);
var
pos: int;
c: Char;
buf: pchar;
h: THash; // hashing algorithm inlined
begin
h := 0;
pos := L.bufpos;
buf := L.buf;
while true do begin
c := buf[pos];
case c of
'a'..'z', '0'..'9', #128..#255: begin
h := h +{%} Ord(c);
h := h +{%} h shl 10;
h := h xor (h shr 6)
end;
'A'..'Z': begin
c := chr(ord(c) + (ord('a')-ord('A'))); // toLower()
h := h +{%} Ord(c);
h := h +{%} h shl 10;
h := h xor (h shr 6)
end;
'_': begin end;
else break
end;
Inc(pos)
end;
h := h +{%} h shl 3;
h := h xor (h shr 11);
h := h +{%} h shl 15;
tok.ident := getIdent(addr(L.buf[L.bufpos]), pos-L.bufpos, h);
L.bufpos := pos;
setKeyword(L, tok);
end;
{@ignore}
{$ifdef Q_on} {$undef Q_on} {$Q+} {$endif}
{$ifdef R_on} {$undef R_on} {$R+} {$endif}
{@emit}
procedure scanLineComment(var L: TPasLex; var tok: TPasTok);
var
buf: PChar;
pos, col: int;
indent: int;
begin
pos := L.bufpos;
buf := L.buf;
// a comment ends if the next line does not start with the // on the same
// column after only whitespace
tok.xkind := pxComment;
col := getColNumber(L, pos);
while true do begin
inc(pos, 2); // skip //
addChar(tok.literal, '#');
while not (buf[pos] in [CR, LF, lexbase.EndOfFile]) do begin
addChar(tok.literal, buf[pos]); inc(pos);
end;
pos := handleCRLF(L, pos);
buf := L.buf;
indent := 0;
while buf[pos] = ' ' do begin inc(pos); inc(indent) end;
if (col = indent) and (buf[pos] = '/') and (buf[pos+1] = '/') then
tok.literal := tok.literal +{&} nl
else
break
end;
L.bufpos := pos;
end;
procedure scanCurlyComment(var L: TPasLex; var tok: TPasTok);
var
buf: PChar;
pos: int;
begin
pos := L.bufpos;
buf := L.buf;
tok.literal := '#'+'';
tok.xkind := pxComment;
repeat
case buf[pos] of
CR, LF: begin
pos := HandleCRLF(L, pos);
buf := L.buf;
tok.literal := tok.literal +{&} nl + '#';
end;
'}': begin inc(pos); break end;
lexbase.EndOfFile: lexMessage(L, errTokenExpected, '}'+'');
else begin
addChar(tok.literal, buf[pos]);
inc(pos)
end
end
until false;
L.bufpos := pos;
end;
procedure scanStarComment(var L: TPasLex; var tok: TPasTok);
var
buf: PChar;
pos: int;
begin
pos := L.bufpos;
buf := L.buf;
tok.literal := '#'+'';
tok.xkind := pxComment;
repeat
case buf[pos] of
CR, LF: begin
pos := HandleCRLF(L, pos);
buf := L.buf;
tok.literal := tok.literal +{&} nl + '#';
end;
'*': begin
inc(pos);
if buf[pos] = ')' then begin inc(pos); break end
else addChar(tok.literal, '*')
end;
lexbase.EndOfFile: lexMessage(L, errTokenExpected, '*)');
else begin
addChar(tok.literal, buf[pos]);
inc(pos)
end
end
until false;
L.bufpos := pos;
end;
procedure skip(var L: TPasLex; var tok: TPasTok);
var
buf: PChar;
pos: int;
begin
pos := L.bufpos;
buf := L.buf;
repeat
case buf[pos] of
' ', Tabulator: Inc(pos);
// newline is special:
CR, LF: begin
pos := HandleCRLF(L, pos);
buf := L.buf;
end
else break // EndOfFile also leaves the loop
end
until false;
L.bufpos := pos;
end;
procedure getPasTok(var L: TPasLex; out tok: TPasTok);
var
c: Char;
begin
tok.xkind := pxInvalid;
fillToken(tok);
skip(L, tok);
c := L.buf[L.bufpos];
if c in SymStartChars then // common case first
getSymbol(L, tok)
else if c in ['0'..'9'] then
getNumber10(L, tok)
else begin
case c of
';': begin tok.xkind := pxSemicolon; Inc(L.bufpos) end;
'/': begin
if L.buf[L.bufpos+1] = '/' then scanLineComment(L, tok)
else begin tok.xkind := pxSlash; inc(L.bufpos) end;
end;
',': begin tok.xkind := pxComma; Inc(L.bufpos) end;
'(': begin
Inc(L.bufpos);
if (L.buf[L.bufPos] = '*') then begin
if (L.buf[L.bufPos+1] = '$') then begin
Inc(L.bufpos, 2);
skip(L, tok);
getSymbol(L, tok);
tok.xkind := pxStarDirLe;
end
else begin
inc(L.bufpos);
scanStarComment(L, tok)
end
end
else
tok.xkind := pxParLe;
end;
'*': begin
inc(L.bufpos);
if L.buf[L.bufpos] = ')' then begin
inc(L.bufpos); tok.xkind := pxStarDirRi
end
else tok.xkind := pxStar
end;
')': begin tok.xkind := pxParRi; Inc(L.bufpos) end;
'[': begin Inc(L.bufpos); tok.xkind := pxBracketLe end;
']': begin Inc(L.bufpos); tok.xkind := pxBracketRi end;
'.': begin
inc(L.bufpos);
if L.buf[L.bufpos] = '.' then begin
tok.xkind := pxDotDot; inc(L.bufpos)
end
else tok.xkind := pxDot
end;
'{': begin
Inc(L.bufpos);
case L.buf[L.bufpos] of
'$': begin
Inc(L.bufpos);
skip(L, tok);
getSymbol(L, tok);
tok.xkind := pxCurlyDirLe
end;
'&': begin Inc(L.bufpos); tok.xkind := pxAmp end;
'%': begin Inc(L.bufpos); tok.xkind := pxPer end;
'@': begin Inc(L.bufpos); tok.xkind := pxCommand end;
else scanCurlyComment(L, tok);
end;
end;
'+': begin tok.xkind := pxPlus; inc(L.bufpos) end;
'-': begin tok.xkind := pxMinus; inc(L.bufpos) end;
':': begin
inc(L.bufpos);
if L.buf[L.bufpos] = '=' then begin
inc(L.bufpos); tok.xkind := pxAsgn;
end
else tok.xkind := pxColon
end;
'<': begin
inc(L.bufpos);
if L.buf[L.bufpos] = '>' then begin
inc(L.bufpos);
tok.xkind := pxNeq
end
else if L.buf[L.bufpos] = '=' then begin
inc(L.bufpos);
tok.xkind := pxLe
end
else tok.xkind := pxLt
end;
'>': begin
inc(L.bufpos);
if L.buf[L.bufpos] = '=' then begin
inc(L.bufpos);
tok.xkind := pxGe
end
else tok.xkind := pxGt
end;
'=': begin tok.xkind := pxEquals; inc(L.bufpos) end;
'@': begin tok.xkind := pxAt; inc(L.bufpos) end;
'^': begin tok.xkind := pxHat; inc(L.bufpos) end;
'}': begin tok.xkind := pxCurlyDirRi; Inc(L.bufpos) end;
'''', '#': getString(L, tok);
'$': getNumber16(L, tok);
'%': getNumber2(L, tok);
lexbase.EndOfFile: tok.xkind := pxEof;
else begin
tok.literal := c + '';
tok.xkind := pxInvalid;
lexMessage(L, errInvalidToken, c + ' (\' +{&} toString(ord(c)) + ')');
Inc(L.bufpos);
end
end
end
end;
end.