//
//
// The Nimrod Compiler
// (c) Copyright 2009 Andreas Rumpf
//
// See the file "copying.txt", included in this
// distribution, for details about the copyright.
//
unit scanner;
// This scanner is handwritten for efficiency. I used an elegant buffering
// scheme which I have not seen anywhere else:
// We guarantee that a whole line is in the buffer. Thus only when scanning
// the \n or \r character we have to check wether we need to read in the next
// chunk. (\n or \r already need special handling for incrementing the line
// counter; choosing both \n and \r allows the scanner to properly read Unix,
// DOS or Macintosh text files, even when it is not the native format.
interface
{$include 'config.inc'}
uses
charsets, nsystem, sysutils, nhashes, options, msgs, strutils, platform,
idents, lexbase, llstream, wordrecg;
const
MaxLineLength = 80; // lines longer than this lead to a warning
numChars: TCharSet = ['0'..'9','a'..'z','A'..'Z'];
SymChars: TCharSet = ['a'..'z', 'A'..'Z', '0'..'9', #128..#255];
SymStartChars: TCharSet = ['a'..'z', 'A'..'Z', #128..#255];
OpChars: TCharSet = ['+', '-', '*', '/', '\', '<', '>', '!', '?', '^', '.',
'|', '=', '%', '&', '$', '@', '~', #128..#255];
type
TTokType = (tkInvalid, tkEof, // order is important here!
tkSymbol,
// keywords:
//[[[cog
//from string import split, capitalize
//keywords = split(open("data/keywords.txt").read())
//idents = ""
//strings = ""
//i = 1
//for k in keywords:
// idents = idents + "tk" + capitalize(k) + ", "
// strings = strings + "'" + k + "', "
// if i % 4 == 0:
// idents = idents + "\n"
// strings = strings + "\n"
// i = i + 1
//cog.out(idents)
//]]]
tkAddr, tkAnd, tkAs, tkAsm,
tkBind, tkBlock, tkBreak, tkCase,
tkCast, tkConst, tkContinue, tkConverter,
tkDiscard, tkDistinct, tkDiv, tkElif,
tkElse, tkEnd, tkEnum, tkExcept,
tkFinally, tkFor, tkFrom, tkGeneric,
tkIf, tkImplies, tkImport, tkIn,
tkInclude, tkIs, tkIsnot, tkIterator,
tkLambda, tkMacro, tkMethod, tkMod,
tkNil, tkNot, tkNotin, tkObject,
tkOf, tkOr, tkOut, tkProc,
tkPtr, tkRaise, tkRef, tkReturn,
tkShl, tkShr, tkTemplate, tkTry,
tkTuple, tkType, tkVar, tkWhen,
tkWhile, tkWith, tkWithout, tkXor,
tkYield,
//[[[end]]]
tkIntLit, tkInt8Lit, tkInt16Lit, tkInt32Lit, tkInt64Lit,
tkFloatLit, tkFloat32Lit, tkFloat64Lit,
tkStrLit, tkRStrLit, tkTripleStrLit, tkCallRStrLit, tkCallTripleStrLit,
tkCharLit, tkParLe, tkParRi, tkBracketLe, tkBracketRi, tkCurlyLe, tkCurlyRi,
tkBracketDotLe, tkBracketDotRi, // [. and .]
tkCurlyDotLe, tkCurlyDotRi, // {. and .}
tkParDotLe, tkParDotRi, // (. and .)
tkComma, tkSemiColon, tkColon,
tkEquals, tkDot, tkDotDot, tkHat, tkOpr,
tkComment, tkAccent, tkInd, tkSad, tkDed,
// pseudo token types used by the source renderers:
tkSpaces, tkInfixOpr, tkPrefixOpr, tkPostfixOpr
);
TTokTypes = set of TTokType;
const
tokKeywordLow = succ(tkSymbol);
tokKeywordHigh = pred(tkIntLit);
tokOperators: TTokTypes = {@set}[tkOpr, tkSymbol, tkBracketLe, tkBracketRi,
tkIn, tkIs, tkIsNot, tkEquals, tkDot, tkHat, tkNot, tkAnd, tkOr, tkXor,
tkShl, tkShr, tkDiv, tkMod, tkNotIn];
TokTypeToStr: array [TTokType] of string = (
'tkInvalid', '[EOF]',
'tkSymbol',
//[[[cog
//cog.out(strings)
//]]]
'addr', 'and', 'as', 'asm',
'bind', 'block', 'break', 'case',
'cast', 'const', 'continue', 'converter',
'discard', 'distinct', 'div', 'elif',
'else', 'end', 'enum', 'except',
'finally', 'for', 'from', 'generic',
'if', 'implies', 'import', 'in',
'include', 'is', 'isnot', 'iterator',
'lambda', 'macro', 'method', 'mod',
'nil', 'not', 'notin', 'object',
'of', 'or', 'out', 'proc',
'ptr', 'raise', 'ref', 'return',
'shl', 'shr', 'template', 'try',
'tuple', 'type', 'var', 'when',
'while', 'with', 'without', 'xor',
'yield',
//[[[end]]]
'tkIntLit', 'tkInt8Lit', 'tkInt16Lit', 'tkInt32Lit', 'tkInt64Lit',
'tkFloatLit', 'tkFloat32Lit', 'tkFloat64Lit',
'tkStrLit', 'tkRStrLit', 'tkTripleStrLit',
'tkCallRStrLit', 'tkCallTripleStrLit',
'tkCharLit',
'('+'', ')'+'', '['+'', ']'+'', '{'+'', '}'+'',
'[.', '.]', '{.', '.}', '(.', '.)', ','+'', ';'+'', ':'+'',
'='+'', '.'+'', '..', '^'+'', 'tkOpr',
'tkComment', '`'+'', '[new indentation]', '[same indentation]',
'[dedentation]',
'tkSpaces', 'tkInfixOpr', 'tkPrefixOpr', 'tkPostfixOpr'
);
type
TNumericalBase = (base10, // base10 is listed as the first element,
// so that it is the correct default value
base2,
base8,
base16);
PToken = ^TToken;
TToken = object // a Nimrod token
tokType: TTokType; // the type of the token
indent: int; // the indentation; only valid if tokType = tkIndent
ident: PIdent; // the parsed identifier
iNumber: BiggestInt; // the parsed integer literal
fNumber: BiggestFloat; // the parsed floating point literal
base: TNumericalBase; // the numerical base; only valid for int
// or float literals
literal: string; // the parsed (string) literal; and
// documentation comments are here too
next: PToken; // next token; can be used for arbitrary look-ahead
end;
PLexer = ^TLexer;
TLexer = object(TBaseLexer)
filename: string;
indentStack: array of int; // the indentation stack
dedent: int; // counter for DED token generation
indentAhead: int; // if > 0 an indendation has already been read
// this is needed because scanning comments
// needs so much look-ahead
end;
var
gLinesCompiled: int; // all lines that have been compiled
procedure pushInd(var L: TLexer; indent: int);
procedure popInd(var L: TLexer);
function isKeyword(kind: TTokType): boolean;
procedure openLexer(out lex: TLexer; const filename: string;
inputstream: PLLStream);
procedure rawGetTok(var L: TLexer; var tok: TToken);
// reads in the next token into tok and skips it
function getColumn(const L: TLexer): int;
function getLineInfo(const L: TLexer): TLineInfo;
procedure closeLexer(var lex: TLexer);
procedure PrintTok(tok: PToken);
function tokToStr(tok: PToken): string;
// auxiliary functions:
procedure lexMessage(const L: TLexer; const msg: TMsgKind;
const arg: string = '');
// the Pascal scanner uses this too:
procedure fillToken(var L: TToken);
implementation
function isKeyword(kind: TTokType): boolean;
begin
result := (kind >= tokKeywordLow) and (kind <= tokKeywordHigh)
end;
procedure pushInd(var L: TLexer; indent: int);
var
len: int;
begin
len := length(L.indentStack);
setLength(L.indentStack, len+1);
if (indent > L.indentStack[len-1]) then
L.indentstack[len] := indent
else
InternalError('pushInd');
//writeln('push indent ', indent);
end;
procedure popInd(var L: TLexer);
var
len: int;
begin
len := length(L.indentStack);
setLength(L.indentStack, len-1);
end;
function findIdent(const L: TLexer; indent: int): boolean;
var
i: int;
begin
for i := length(L.indentStack)-1 downto 0 do
if L.indentStack[i] = indent then begin result := true; exit end;
result := false
end;
function tokToStr(tok: PToken): string;
begin
case tok.tokType of
tkIntLit..tkInt64Lit:
result := toString(tok.iNumber);
tkFloatLit..tkFloat64Lit:
result := toStringF(tok.fNumber);
tkInvalid, tkStrLit..tkCharLit, tkComment:
result := tok.literal;
tkParLe..tkColon, tkEof, tkInd, tkSad, tkDed, tkAccent:
result := tokTypeToStr[tok.tokType];
else if (tok.ident <> nil) then
result := tok.ident.s
else begin
InternalError('tokToStr');
result := ''
end
end
end;
procedure PrintTok(tok: PToken);
begin
write(output, TokTypeToStr[tok.tokType]);
write(output, ' '+'');
writeln(output, tokToStr(tok))
end;
// ----------------------------------------------------------------------------
var
dummyIdent: PIdent;
procedure fillToken(var L: TToken);
begin
L.TokType := tkInvalid;
L.iNumber := 0;
L.Indent := 0;
L.literal := '';
L.fNumber := 0.0;
L.base := base10;
L.ident := dummyIdent; // this prevents many bugs!
end;
procedure openLexer(out lex: TLexer; const filename: string;
inputstream: PLLStream);
begin
{@ignore}
FillChar(lex, sizeof(lex), 0);
{@emit}
openBaseLexer(lex, inputstream);
{@ignore}
setLength(lex.indentStack, 1);
lex.indentStack[0] := 0;
{@emit lex.indentStack := @[0]; }
lex.filename := filename;
lex.indentAhead := -1;
end;
procedure closeLexer(var lex: TLexer);
begin
inc(gLinesCompiled, lex.LineNumber);
closeBaseLexer(lex);
end;
function getColumn(const L: TLexer): int;
begin
result := getColNumber(L, L.bufPos)
end;
function getLineInfo(const L: TLexer): TLineInfo;
begin
result := newLineInfo(L.filename, L.linenumber, getColNumber(L, L.bufpos))
end;
procedure lexMessage(const L: TLexer; const msg: TMsgKind;
const arg: string = '');
begin
msgs.liMessage(getLineInfo(L), msg, arg)
end;
procedure lexMessagePos(var L: TLexer; const msg: TMsgKind; pos: int;
const arg: string = '');
var
info: TLineInfo;
begin
info := newLineInfo(L.filename, L.linenumber, pos - L.lineStart);
msgs.liMessage(info, msg, arg);
end;
// ----------------------------------------------------------------------------
procedure matchUnderscoreChars(var L: TLexer; var tok: TToken;
const chars: TCharSet);
// matches ([chars]_)*
var
pos: int;
buf: PChar;
begin
pos := L.bufpos; // use registers for pos, buf
buf := L.buf;
repeat
if buf[pos] in chars then begin
addChar(tok.literal, buf[pos]);
Inc(pos)
end
else break;
if buf[pos] = '_' then begin
addChar(tok.literal, '_');
Inc(pos);
end;
until false;
L.bufPos := pos;
end;
function matchTwoChars(const L: TLexer; first: Char;
const second: TCharSet): Boolean;
begin
result := (L.buf[L.bufpos] = first) and (L.buf[L.bufpos+1] in Second);
end;
function isFloatLiteral(const s: string): boolean;
var
i: int;
begin
for i := strStart to length(s)+strStart-1 do
if s[i] in ['.','e','E'] then begin
result := true; exit
end;
result := false
end;
function GetNumber(var L: TLexer): TToken;
var
pos, endpos: int;
xi: biggestInt;
begin
// get the base:
result.tokType := tkIntLit; // int literal until we know better
result.literal := '';
result.base := base10; // BUGFIX
pos := L.bufpos;
// make sure the literal is correct for error messages:
matchUnderscoreChars(L, result, ['A'..'Z', 'a'..'z', '0'..'9']);
if (L.buf[L.bufpos] = '.') and (L.buf[L.bufpos+1] in ['0'..'9']) then begin
addChar(result.literal, '.');
inc(L.bufpos);
//matchUnderscoreChars(L, result, ['A'..'Z', 'a'..'z', '0'..'9'])
matchUnderscoreChars(L, result, ['0'..'9']);
if L.buf[L.bufpos] in ['e', 'E'] then begin
addChar(result.literal, 'e');
inc(L.bufpos);
if L.buf[L.bufpos] in ['+', '-'] then begin
addChar(result.literal, L.buf[L.bufpos]);
inc(L.bufpos);
end;
matchUnderscoreChars(L, result, ['0'..'9']);
end
end;
endpos := L.bufpos;
if L.buf[endpos] = '''' then begin
//matchUnderscoreChars(L, result, ['''', 'f', 'F', 'i', 'I', '0'..'9']);
inc(endpos);
L.bufpos := pos; // restore position
case L.buf[endpos] of
'f', 'F': begin
inc(endpos);
if (L.buf[endpos] = '6') and (L.buf[endpos+1] = '4') then begin
result.tokType := tkFloat64Lit;
inc(endpos, 2);
end
else if (L.buf[endpos] = '3') and (L.buf[endpos+1] = '2') then begin
result.tokType := tkFloat32Lit;
inc(endpos, 2);
end
else lexMessage(L, errInvalidNumber, result.literal);
end;
'i', 'I': begin
inc(endpos);
if (L.buf[endpos] = '6') and (L.buf[endpos+1] = '4') then begin
result.tokType := tkInt64Lit;
inc(endpos, 2);
end
else if (L.buf[endpos] = '3') and (L.buf[endpos+1] = '2') then begin
result.tokType := tkInt32Lit;
inc(endpos, 2);
end
else if (L.buf[endpos] = '1') and (L.buf[endpos+1] = '6') then begin
result.tokType := tkInt16Lit;
inc(endpos, 2);
end
else if (L.buf[endpos] = '8') then begin
result.tokType := tkInt8Lit;
inc(endpos);
end
else lexMessage(L, errInvalidNumber, result.literal);
end;
else lexMessage(L, errInvalidNumber, result.literal);
end
end
else
L.bufpos := pos; // restore position
try
if (L.buf[pos] = '0') and (L.buf[pos+1] in ['x','X','b','B','o','O','c','C'])
then begin
inc(pos, 2);
xi := 0;
// it may be a base prefix
case L.buf[pos-1] of
'b', 'B': begin
result.base := base2;
while true do begin
case L.buf[pos] of
'A'..'Z', 'a'..'z', '2'..'9', '.': begin
lexMessage(L, errInvalidNumber, result.literal);
inc(pos)
end;
'_': inc(pos);
'0', '1': begin
xi := shlu(xi, 1) or (ord(L.buf[pos]) - ord('0'));
inc(pos);
end;
else break;
end
end
end;
'o', 'c', 'C': begin
result.base := base8;
while true do begin
case L.buf[pos] of
'A'..'Z', 'a'..'z', '8'..'9', '.': begin
lexMessage(L, errInvalidNumber, result.literal);
inc(pos)
end;
'_': inc(pos);
'0'..'7': begin
xi := shlu(xi, 3) or (ord(L.buf[pos]) - ord('0'));
inc(pos);
end;
else break;
end
end
end;
'O': lexMessage(L, errInvalidNumber, result.literal);
'x', 'X': begin
result.base := base16;
while true do begin
case L.buf[pos] of
'G'..'Z', 'g'..'z', '.': begin
lexMessage(L, errInvalidNumber, result.literal);
inc(pos);
end;
'_': inc(pos);
'0'..'9': begin
xi := shlu(xi, 4) or (ord(L.buf[pos]) - ord('0'));
inc(pos);
end;
'a'..'f': begin
xi := shlu(xi, 4) or (ord(L.buf[pos]) - ord('a') + 10);
inc(pos);
end;
'A'..'F': begin
xi := shlu(xi, 4) or (ord(L.buf[pos]) - ord('A') + 10);
inc(pos);
end;
else break;
end
end
end;
else InternalError(getLineInfo(L), 'getNumber');
end;
// now look at the optional type suffix:
case result.tokType of
tkIntLit, tkInt64Lit:
result.iNumber := xi;
tkInt8Lit:
result.iNumber := biggestInt(int8(toU8(int(xi))));
tkInt16Lit:
result.iNumber := biggestInt(toU16(int(xi)));
tkInt32Lit:
result.iNumber := biggestInt(toU32(xi));
tkFloat32Lit:
result.fNumber := ({@cast}PFloat32(addr(xi)))^;
// note: this code is endian neutral!
// XXX: Test this on big endian machine!
tkFloat64Lit:
result.fNumber := ({@cast}PFloat64(addr(xi)))^;
else InternalError(getLineInfo(L), 'getNumber');
end
end
else if isFloatLiteral(result.literal)
or (result.tokType = tkFloat32Lit)
or (result.tokType = tkFloat64Lit) then begin
result.fnumber := parseFloat(result.literal);
if result.tokType = tkIntLit then result.tokType := tkFloatLit;
end
else begin
result.iNumber := ParseBiggestInt(result.literal);
if (result.iNumber < low(int32)) or (result.iNumber > high(int32)) then
begin
if result.tokType = tkIntLit then result.tokType := tkInt64Lit
else if result.tokType <> tkInt64Lit then
lexMessage(L, errInvalidNumber, result.literal);
end
end;
except
on EInvalidValue do
lexMessage(L, errInvalidNumber, result.literal);
{@ignore}
on sysutils.EIntOverflow do
lexMessage(L, errNumberOutOfRange, result.literal);
{@emit}
on EOverflow do
lexMessage(L, errNumberOutOfRange, result.literal);
on EOutOfRange do
lexMessage(L, errNumberOutOfRange, result.literal);
end;
L.bufpos := endpos;
end;
procedure handleHexChar(var L: TLexer; var xi: int);
begin
case L.buf[L.bufpos] of
'0'..'9': begin
xi := (xi shl 4) or (ord(L.buf[L.bufpos]) - ord('0'));
inc(L.bufpos);
end;
'a'..'f': begin
xi := (xi shl 4) or (ord(L.buf[L.bufpos]) - ord('a') + 10);
inc(L.bufpos);
end;
'A'..'F': begin
xi := (xi shl 4) or (ord(L.buf[L.bufpos]) - ord('A') + 10);
inc(L.bufpos);
end;
else begin end // do nothing
end
end;
procedure handleDecChars(var L: TLexer; var xi: int);
begin
while L.buf[L.bufpos] in ['0'..'9'] do begin
xi := (xi * 10) + (ord(L.buf[L.bufpos]) - ord('0'));
inc(L.bufpos);
end;
end;
procedure getEscapedChar(var L: TLexer; var tok: TToken);
var
xi: int;
begin
inc(L.bufpos); // skip '\'
case L.buf[L.bufpos] of
'n', 'N': begin
if tok.toktype = tkCharLit then
lexMessage(L, errNnotAllowedInCharacter);
tok.literal := tok.literal +{&} tnl;
Inc(L.bufpos);
end;
'r', 'R', 'c', 'C': begin addChar(tok.literal, CR); Inc(L.bufpos); end;
'l', 'L': begin addChar(tok.literal, LF); Inc(L.bufpos); end;
'f', 'F': begin addChar(tok.literal, FF); inc(L.bufpos); end;
'e', 'E': begin addChar(tok.literal, ESC); Inc(L.bufpos); end;
'a', 'A': begin addChar(tok.literal, BEL); Inc(L.bufpos); end;
'b', 'B': begin addChar(tok.literal, BACKSPACE); Inc(L.bufpos); end;
'v', 'V': begin addChar(tok.literal, VT); Inc(L.bufpos); end;
't', 'T': begin addChar(tok.literal, Tabulator); Inc(L.bufpos); end;
'''', '"': begin addChar(tok.literal, L.buf[L.bufpos]); Inc(L.bufpos); end;
'\': begin addChar(tok.literal, '\'); Inc(L.bufpos) end;
'x', 'X': begin
inc(L.bufpos);
xi := 0;
handleHexChar(L, xi);
handleHexChar(L, xi);
addChar(tok.literal, Chr(xi));
end;
'0'..'9': begin
if matchTwoChars(L, '0', ['0'..'9']) then
// this warning will make it easier for newcomers:
lexMessage(L, warnOctalEscape);
xi := 0;
handleDecChars(L, xi);
if (xi <= 255) then
addChar(tok.literal, Chr(xi))
else
lexMessage(L, errInvalidCharacterConstant)
end
else lexMessage(L, errInvalidCharacterConstant)
end
end;
function HandleCRLF(var L: TLexer; pos: int): int;
begin
case L.buf[pos] of
CR: begin
if getColNumber(L, pos) > MaxLineLength then
lexMessagePos(L, hintLineTooLong, pos);
result := lexbase.HandleCR(L, pos)
end;
LF: begin
if getColNumber(L, pos) > MaxLineLength then
lexMessagePos(L, hintLineTooLong, pos);
result := lexbase.HandleLF(L, pos)
end;
else result := pos
end
end;
procedure getString(var L: TLexer; var tok: TToken; rawMode: Boolean);
var
line, line2, pos: int;
c: Char;
buf: PChar;
begin
pos := L.bufPos + 1; // skip "
buf := L.buf; // put `buf` in a register
line := L.linenumber; // save linenumber for better error message
if (buf[pos] = '"') and (buf[pos+1] = '"') then begin
tok.tokType := tkTripleStrLit;
// long string literal:
inc(pos, 2); // skip ""
// skip leading newline:
pos := HandleCRLF(L, pos);
buf := L.buf;
repeat
case buf[pos] of
'"': begin
if (buf[pos+1] = '"') and (buf[pos+2] = '"') then
break;
addChar(tok.literal, '"');
Inc(pos)
end;
CR, LF: begin
pos := HandleCRLF(L, pos);
buf := L.buf;
tok.literal := tok.literal +{&} tnl;
end;
lexbase.EndOfFile: begin
line2 := L.linenumber;
L.LineNumber := line;
lexMessagePos(L, errClosingTripleQuoteExpected, L.lineStart);
L.LineNumber := line2;
break
end
else begin
addChar(tok.literal, buf[pos]);
Inc(pos)
end
end
until false;
L.bufpos := pos + 3 // skip the three """
end
else begin // ordinary string literal
if rawMode then tok.tokType := tkRStrLit
else tok.tokType := tkStrLit;
repeat
c := buf[pos];
if c = '"' then begin
inc(pos); // skip '"'
break
end;
if c in [CR, LF, lexbase.EndOfFile] then begin
lexMessage(L, errClosingQuoteExpected);
break
end;
if (c = '\') and not rawMode then begin
L.bufPos := pos;
getEscapedChar(L, tok);
pos := L.bufPos;
end
else begin
addChar(tok.literal, c);
Inc(pos)
end
until false;
L.bufpos := pos;
end
end;
procedure getCharacter(var L: TLexer; var tok: TToken);
var
c: Char;
begin
Inc(L.bufpos); // skip '
c := L.buf[L.bufpos];
case c of
#0..Pred(' '), '''': lexMessage(L, errInvalidCharacterConstant);
'\': getEscapedChar(L, tok);
else begin
tok.literal := c + '';
Inc(L.bufpos);
end
end;
if L.buf[L.bufpos] <> '''' then lexMessage(L, errMissingFinalQuote);
inc(L.bufpos); // skip '
end;
{@ignore}
{$ifopt Q+} {$define Q_on} {$Q-} {$endif}
{$ifopt R+} {$define R_on} {$R-} {$endif}
{@emit}
procedure getSymbol(var L: TLexer; var tok: TToken);
var
pos: int;
c: Char;
buf: pchar;
h: THash; // hashing algorithm inlined
begin
h := 0;
pos := L.bufpos;
buf := L.buf;
while true do begin
c := buf[pos];
case c of
'a'..'z', '0'..'9', #128..#255: begin
h := h +{%} Ord(c);
h := h +{%} h shl 10;
h := h xor (h shr 6)
end;
'A'..'Z': begin
c := chr(ord(c) + (ord('a')-ord('A'))); // toLower()
h := h +{%} Ord(c);
h := h +{%} h shl 10;
h := h xor (h shr 6)
end;
'_': begin end;
else break
end;
Inc(pos)
end;
h := h +{%} h shl 3;
h := h xor (h shr 11);
h := h +{%} h shl 15;
tok.ident := getIdent(addr(L.buf[L.bufpos]), pos-L.bufpos, h);
L.bufpos := pos;
if (tok.ident.id < ord(tokKeywordLow)-ord(tkSymbol)) or
(tok.ident.id > ord(tokKeywordHigh)-ord(tkSymbol)) then
tok.tokType := tkSymbol
else
tok.tokType := TTokType(tok.ident.id+ord(tkSymbol));
if buf[pos] = '"' then begin
getString(L, tok, true);
if tok.tokType = tkRStrLit then tok.tokType := tkCallRStrLit
else tok.tokType := tkCallTripleStrLit
end
end;
procedure getOperator(var L: TLexer; var tok: TToken);
var
pos: int;
c: Char;
buf: pchar;
h: THash; // hashing algorithm inlined
begin
pos := L.bufpos;
buf := L.buf;
h := 0;
while true do begin
c := buf[pos];
if c in OpChars then begin
h := h +{%} Ord(c);
h := h +{%} h shl 10;
h := h xor (h shr 6)
end
else break;
Inc(pos)
end;
h := h +{%} h shl 3;
h := h xor (h shr 11);
h := h +{%} h shl 15;
tok.ident := getIdent(addr(L.buf[L.bufpos]), pos-L.bufpos, h);
if (tok.ident.id < oprLow) or (tok.ident.id > oprHigh) then
tok.tokType := tkOpr
else
tok.tokType := TTokType(tok.ident.id - oprLow + ord(tkColon));
L.bufpos := pos
end;
{@ignore}
{$ifdef Q_on} {$undef Q_on} {$Q+} {$endif}
{$ifdef R_on} {$undef R_on} {$R+} {$endif}
{@emit}
procedure handleIndentation(var L: TLexer; var tok: TToken; indent: int);
var
i: int;
begin
tok.indent := indent;
i := high(L.indentStack);
if indent > L.indentStack[i] then
tok.tokType := tkInd
else if indent = L.indentStack[i] then
tok.tokType := tkSad
else begin
// check we have the indentation somewhere in the stack:
while (i >= 0) and (indent <> L.indentStack[i]) do begin
dec(i);
inc(L.dedent);
end;
dec(L.dedent);
tok.tokType := tkDed;
if i < 0 then begin
tok.tokType := tkSad; // for the parser it is better as SAD
lexMessage(L, errInvalidIndentation);
end
end
end;
procedure scanComment(var L: TLexer; var tok: TToken);
var
buf: PChar;
pos, col: int;
indent: int;
begin
pos := L.bufpos;
buf := L.buf;
// a comment ends if the next line does not start with the # on the same
// column after only whitespace
tok.tokType := tkComment;
col := getColNumber(L, pos);
while true do begin
while not (buf[pos] in [CR, LF, lexbase.EndOfFile]) do begin
addChar(tok.literal, buf[pos]); inc(pos);
end;
pos := handleCRLF(L, pos);
buf := L.buf;
indent := 0;
while buf[pos] = ' ' do begin inc(pos); inc(indent) end;
if (buf[pos] = '#') and (col = indent) then begin
tok.literal := tok.literal +{&} nl;
end
else begin
if buf[pos] > ' ' then begin
L.indentAhead := indent;
inc(L.dedent)
end;
break
end
end;
L.bufpos := pos;
end;
procedure skip(var L: TLexer; var tok: TToken);
var
buf: PChar;
indent, pos: int;
begin
pos := L.bufpos;
buf := L.buf;
repeat
case buf[pos] of
' ': Inc(pos);
Tabulator: begin
lexMessagePos(L, errTabulatorsAreNotAllowed, pos);
inc(pos); // BUGFIX
end;
// newline is special:
CR, LF: begin
pos := HandleCRLF(L, pos);
buf := L.buf;
indent := 0;
while buf[pos] = ' ' do begin
Inc(pos); Inc(indent)
end;
if (buf[pos] > ' ') then begin
handleIndentation(L, tok, indent);
break;
end
end;
else break // EndOfFile also leaves the loop
end
until false;
L.bufpos := pos;
end;
procedure rawGetTok(var L: TLexer; var tok: TToken);
var
c: Char;
begin
fillToken(tok);
if L.dedent > 0 then begin
dec(L.dedent);
if L.indentAhead >= 0 then begin
handleIndentation(L, tok, L.indentAhead);
L.indentAhead := -1;
end
else
tok.tokType := tkDed;
exit;
end;
// Skip whitespace, comments:
skip(L, tok); // skip
// got an documentation comment or tkIndent, return that:
if tok.toktype <> tkInvalid then exit;
c := L.buf[L.bufpos];
if c in SymStartChars - ['r', 'R', 'l'] then // common case first
getSymbol(L, tok)
else if c in ['0'..'9'] then
tok := getNumber(L)
else begin
case c of
'#': scanComment(L, tok);
':': begin
tok.tokType := tkColon;
inc(L.bufpos);
end;
',': begin
tok.toktype := tkComma;
Inc(L.bufpos)
end;
'l': begin
// if we parsed exactly one character and its a small L (l), this
// is treated as a warning because it may be confused with the number 1
if not (L.buf[L.bufpos+1] in (SymChars+['_'])) then
lexMessage(L, warnSmallLshouldNotBeUsed);
getSymbol(L, tok);
end;
'r', 'R': begin
if L.buf[L.bufPos+1] = '"' then begin
Inc(L.bufPos);
getString(L, tok, true);
end
else getSymbol(L, tok);
end;
'(': begin
Inc(L.bufpos);
if (L.buf[L.bufPos] = '.')
and (L.buf[L.bufPos+1] <> '.') then begin
tok.toktype := tkParDotLe;
Inc(L.bufpos);
end
else
tok.toktype := tkParLe;
end;
')': begin
tok.toktype := tkParRi;
Inc(L.bufpos)
end;
'[': begin
Inc(L.bufpos);
if (L.buf[L.bufPos] = '.')
and (L.buf[L.bufPos+1] <> '.') then begin
tok.toktype := tkBracketDotLe;
Inc(L.bufpos);
end
else
tok.toktype := tkBracketLe;
end;
']': begin
tok.toktype := tkBracketRi;
Inc(L.bufpos)
end;
'.': begin
if L.buf[L.bufPos+1] = ']' then begin
tok.tokType := tkBracketDotRi;
Inc(L.bufpos, 2);
end
else if L.buf[L.bufPos+1] = '}' then begin
tok.tokType := tkCurlyDotRi;
Inc(L.bufpos, 2);
end
else if L.buf[L.bufPos+1] = ')' then begin
tok.tokType := tkParDotRi;
Inc(L.bufpos, 2);
end
else
getOperator(L, tok)
end;
'{': begin
Inc(L.bufpos);
if (L.buf[L.bufPos] = '.')
and (L.buf[L.bufPos+1] <> '.') then begin
tok.toktype := tkCurlyDotLe;
Inc(L.bufpos);
end
else
tok.toktype := tkCurlyLe;
end;
'}': begin
tok.toktype := tkCurlyRi;
Inc(L.bufpos)
end;
';': begin
tok.toktype := tkSemiColon;
Inc(L.bufpos)
end;
'`': begin
tok.tokType := tkAccent;
Inc(L.bufpos);
end;
'"': getString(L, tok, false);
'''': begin
getCharacter(L, tok);
tok.tokType := tkCharLit;
end;
lexbase.EndOfFile: tok.toktype := tkEof;
else if c in OpChars then
getOperator(L, tok)
else begin
tok.literal := c + '';
tok.tokType := tkInvalid;
lexMessage(L, errInvalidToken, c +{&} ' (\' +{&} toString(ord(c)) + ')');
Inc(L.bufpos);
end
end
end
end;
initialization
dummyIdent := getIdent('');
end.