//
//
// The Nimrod Compiler
// (c) Copyright 2008 Andreas Rumpf
//
// See the file "copying.txt", included in this
// distribution, for details about the copyright.
//
unit lexbase;
// Base Object of a lexer with efficient buffer handling. In fact
// I believe that this is the most efficient method of buffer
// handling that exists! Only at line endings checks are necessary
// if the buffer needs refilling.
interface
uses
nsystem, llstream, charsets, strutils;
{@emit
const
Lrz = ' ';
Apo = '''';
Tabulator = #9;
ESC = #27;
CR = #13;
FF = #12;
LF = #10;
BEL = #7;
BACKSPACE = #8;
VT = #11;
}
const
EndOfFile = #0; // end of file marker
{ A little picture makes everything clear :-)
buf:
"Example Text\n ha!" bufLen = 17
^pos = 0 ^ sentinel = 12
}
NewLines = {@set}[CR, LF];
type
TBaseLexer = object(NObject)
bufpos: int;
buf: PChar;
bufLen: int; // length of buffer in characters
stream: PLLStream; // we read from this stream
LineNumber: int; // the current line number
// private data:
sentinel: int;
lineStart: int; // index of last line start in buffer
end;
procedure openBaseLexer(out L: TBaseLexer;
inputstream: PLLStream;
bufLen: int = 8192);
// 8K is a reasonable buffer size
procedure closeBaseLexer(var L: TBaseLexer);
function getCurrentLine(const L: TBaseLexer; marker: boolean = true): string;
function getColNumber(const L: TBaseLexer; pos: int): int;
function HandleCR(var L: TBaseLexer; pos: int): int;
// Call this if you scanned over CR in the buffer; it returns the
// position to continue the scanning from. `pos` must be the position
// of the CR.
function HandleLF(var L: TBaseLexer; pos: int): int;
// Call this if you scanned over LF in the buffer; it returns the the
// position to continue the scanning from. `pos` must be the position
// of the LF.
implementation
const
chrSize = sizeof(char);
procedure closeBaseLexer(var L: TBaseLexer);
begin
dealloc(L.buf);
LLStreamClose(L.stream);
end;
{@ignore}
{$ifdef false}
procedure printBuffer(const L: TBaseLexer);
var
i: int;
begin
writeln('____________________________________');
writeln('sentinel: ', L.sentinel);
writeln('bufLen: ', L.bufLen);
writeln('buf: ');
for i := 0 to L.bufLen-1 do write(L.buf[i]);
writeln(NL + '____________________________________');
end;
{$endif}
{@emit}
procedure FillBuffer(var L: TBaseLexer);
var
charsRead, toCopy, s: int; // all are in characters,
// not bytes (in case this
// is not the same)
oldBufLen: int;
begin
// we know here that pos == L.sentinel, but not if this proc
// is called the first time by initBaseLexer()
assert(L.sentinel < L.bufLen);
toCopy := L.BufLen - L.sentinel - 1;
assert(toCopy >= 0);
if toCopy > 0 then
MoveMem(L.buf, addr(L.buf[L.sentinel+1]), toCopy * chrSize);
// "moveMem" handles overlapping regions
charsRead := LLStreamRead(L.stream, addr(L.buf[toCopy]),
(L.sentinel+1) * chrSize) div chrSize;
s := toCopy + charsRead;
if charsRead < L.sentinel+1 then begin
L.buf[s] := EndOfFile; // set end marker
L.sentinel := s
end
else begin
// compute sentinel:
dec(s); // BUGFIX (valgrind)
while true do begin
assert(s < L.bufLen);
while (s >= 0) and not (L.buf[s] in NewLines) do Dec(s);
if s >= 0 then begin
// we found an appropriate character for a sentinel:
L.sentinel := s;
break
end
else begin
// rather than to give up here because the line is too long,
// double the buffer's size and try again:
oldBufLen := L.BufLen;
L.bufLen := L.BufLen * 2;
L.buf := {@cast}PChar(realloc(L.buf, L.bufLen*chrSize));
assert(L.bufLen - oldBuflen = oldBufLen);
charsRead := LLStreamRead(L.stream, addr(L.buf[oldBufLen]),
oldBufLen*chrSize) div chrSize;
if charsRead < oldBufLen then begin
L.buf[oldBufLen+charsRead] := EndOfFile;
L.sentinel := oldBufLen+charsRead;
break
end;
s := L.bufLen - 1
end
end
end
end;
function fillBaseLexer(var L: TBaseLexer; pos: int): int;
begin
assert(pos <= L.sentinel);
if pos < L.sentinel then begin
result := pos+1; // nothing to do
end
else begin
fillBuffer(L);
L.bufpos := 0; // XXX: is this really correct?
result := 0;
end;
L.lineStart := result;
end;
function HandleCR(var L: TBaseLexer; pos: int): int;
begin
assert(L.buf[pos] = CR);
inc(L.linenumber);
result := fillBaseLexer(L, pos);
if L.buf[result] = LF then begin
result := fillBaseLexer(L, result);
end;
//L.lastNL := result-1; // BUGFIX: was: result;
end;
function HandleLF(var L: TBaseLexer; pos: int): int;
begin
assert(L.buf[pos] = LF);
inc(L.linenumber);
result := fillBaseLexer(L, pos);
//L.lastNL := result-1; // BUGFIX: was: result;
end;
procedure skip_UTF_8_BOM(var L: TBaseLexer);
begin
if (L.buf[0] = #239) and (L.buf[1] = #187) and (L.buf[2] = #191) then begin
inc(L.bufpos, 3);
inc(L.lineStart, 3)
end
end;
procedure openBaseLexer(out L: TBaseLexer; inputstream: PLLStream;
bufLen: int = 8192);
begin
assert(bufLen > 0);
L.bufpos := 0;
L.bufLen := bufLen;
L.buf := {@cast}PChar(alloc(bufLen * chrSize));
L.sentinel := bufLen-1;
L.lineStart := 0;
L.linenumber := 1; // lines start at 1
L.stream := inputstream;
fillBuffer(L);
skip_UTF_8_BOM(L);
end;
function getColNumber(const L: TBaseLexer; pos: int): int;
begin
result := abs(pos - L.lineStart);
end;
function getCurrentLine(const L: TBaseLexer; marker: boolean = true): string;
var
i: int;
begin
result := '';
i := L.lineStart;
while not (L.buf[i] in [CR, LF, EndOfFile]) do begin
addChar(result, L.buf[i]);
inc(i)
end;
result := result +{&} NL;
if marker then
result := result +{&} RepeatChar(getColNumber(L, L.bufpos)) +{&} '^' +{&} NL
end;
end.