// // // The Nimrod Compiler // (c) Copyright 2008 Andreas Rumpf // // See the file "copying.txt", included in this // distribution, for details about the copyright. // unit lexbase; // Base Object of a lexer with efficient buffer handling. In fact // I believe that this is the most efficient method of buffer // handling that exists! Only at line endings checks are necessary // if the buffer needs refilling. interface uses nsystem, llstream, charsets, strutils; {@emit const Lrz = ' '; Apo = ''''; Tabulator = #9; ESC = #27; CR = #13; FF = #12; LF = #10; BEL = #7; BACKSPACE = #8; VT = #11; } const EndOfFile = #0; // end of file marker { A little picture makes everything clear :-) buf: "Example Text\n ha!" bufLen = 17 ^pos = 0 ^ sentinel = 12 } NewLines = {@set}[CR, LF]; type TBaseLexer = object(NObject) bufpos: int; buf: PChar; bufLen: int; // length of buffer in characters stream: PLLStream; // we read from this stream LineNumber: int; // the current line number // private data: sentinel: int; lineStart: int; // index of last line start in buffer end; procedure openBaseLexer(out L: TBaseLexer; inputstream: PLLStream; bufLen: int = 8192); // 8K is a reasonable buffer size procedure closeBaseLexer(var L: TBaseLexer); function getCurrentLine(const L: TBaseLexer; marker: boolean = true): string; function getColNumber(const L: TBaseLexer; pos: int): int; function HandleCR(var L: TBaseLexer; pos: int): int; // Call this if you scanned over CR in the buffer; it returns the // position to continue the scanning from. `pos` must be the position // of the CR. function HandleLF(var L: TBaseLexer; pos: int): int; // Call this if you scanned over LF in the buffer; it returns the the // position to continue the scanning from. `pos` must be the position // of the LF. implementation const chrSize = sizeof(char); procedure closeBaseLexer(var L: TBaseLexer); begin dealloc(L.buf); LLStreamClose(L.stream); end; {@ignore} {$ifdef false} procedure printBuffer(const L: TBaseLexer); var i: int; begin writeln('____________________________________'); writeln('sentinel: ', L.sentinel); writeln('bufLen: ', L.bufLen); writeln('buf: '); for i := 0 to L.bufLen-1 do write(L.buf[i]); writeln(NL + '____________________________________'); end; {$endif} {@emit} procedure FillBuffer(var L: TBaseLexer); var charsRead, toCopy, s: int; // all are in characters, // not bytes (in case this // is not the same) oldBufLen: int; begin // we know here that pos == L.sentinel, but not if this proc // is called the first time by initBaseLexer() assert(L.sentinel < L.bufLen); toCopy := L.BufLen - L.sentinel - 1; assert(toCopy >= 0); if toCopy > 0 then MoveMem(L.buf, addr(L.buf[L.sentinel+1]), toCopy * chrSize); // "moveMem" handles overlapping regions charsRead := LLStreamRead(L.stream, addr(L.buf[toCopy]), (L.sentinel+1) * chrSize) div chrSize; s := toCopy + charsRead; if charsRead < L.sentinel+1 then begin L.buf[s] := EndOfFile; // set end marker L.sentinel := s end else begin // compute sentinel: dec(s); // BUGFIX (valgrind) while true do begin assert(s < L.bufLen); while (s >= 0) and not (L.buf[s] in NewLines) do Dec(s); if s >= 0 then begin // we found an appropriate character for a sentinel: L.sentinel := s; break end else begin // rather than to give up here because the line is too long, // double the buffer's size and try again: oldBufLen := L.BufLen; L.bufLen := L.BufLen * 2; L.buf := {@cast}PChar(realloc(L.buf, L.bufLen*chrSize)); assert(L.bufLen - oldBuflen = oldBufLen); charsRead := LLStreamRead(L.stream, addr(L.buf[oldBufLen]), oldBufLen*chrSize) div chrSize; if charsRead < oldBufLen then begin L.buf[oldBufLen+charsRead] := EndOfFile; L.sentinel := oldBufLen+charsRead; break end; s := L.bufLen - 1 end end end end; function fillBaseLexer(var L: TBaseLexer; pos: int): int; begin assert(pos <= L.sentinel); if pos < L.sentinel then begin result := pos+1; // nothing to do end else begin fillBuffer(L); L.bufpos := 0; // XXX: is this really correct? result := 0; end; L.lineStart := result; end; function HandleCR(var L: TBaseLexer; pos: int): int; begin assert(L.buf[pos] = CR); inc(L.linenumber); result := fillBaseLexer(L, pos); if L.buf[result] = LF then begin result := fillBaseLexer(L, result); end; //L.lastNL := result-1; // BUGFIX: was: result; end; function HandleLF(var L: TBaseLexer; pos: int): int; begin assert(L.buf[pos] = LF); inc(L.linenumber); result := fillBaseLexer(L, pos); //L.lastNL := result-1; // BUGFIX: was: result; end; procedure skip_UTF_8_BOM(var L: TBaseLexer); begin if (L.buf[0] = #239) and (L.buf[1] = #187) and (L.buf[2] = #191) then begin inc(L.bufpos, 3); inc(L.lineStart, 3) end end; procedure openBaseLexer(out L: TBaseLexer; inputstream: PLLStream; bufLen: int = 8192); begin assert(bufLen > 0); L.bufpos := 0; L.bufLen := bufLen; L.buf := {@cast}PChar(alloc(bufLen * chrSize)); L.sentinel := bufLen-1; L.lineStart := 0; L.linenumber := 1; // lines start at 1 L.stream := inputstream; fillBuffer(L); skip_UTF_8_BOM(L); end; function getColNumber(const L: TBaseLexer; pos: int): int; begin result := abs(pos - L.lineStart); end; function getCurrentLine(const L: TBaseLexer; marker: boolean = true): string; var i: int; begin result := ''; i := L.lineStart; while not (L.buf[i] in [CR, LF, EndOfFile]) do begin addChar(result, L.buf[i]); inc(i) end; result := result +{&} NL; if marker then result := result +{&} RepeatChar(getColNumber(L, L.bufpos)) +{&} '^' +{&} NL end; end.