summary refs log tree commit diff stats
path: root/nim/lexbase.pas
blob: 2b056c04f1f91fd27b05b57e04e26b49758a57bd (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
pre { line-height: 125%; }
td.linenos .normal { color: inherit; background-color: transparent; padding-left: 5px; padding-right: 5px; }
span.linenos { color: inherit; background-color: transparent; padding-left: 5px; padding-right: 5px; }
td.linenos .special { color: #000000; background-color: #ffffc0; padding-left: 5px; padding-right: 5px; }
span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: 5px; padding-right: 5px; }
.highlight .hll { background-color: #ffffcc }
.highlight .c { color: #888888 } /* Comment */
.highlight .err { color: #a61717; background-color: #e3d2d2 } /* Error */
.highlight .k { color: #008800; font-weight: bold } /* Keyword */
.highlight .ch { color: #888888 } /* Comment.Hashbang */
.highlight .cm { color: #888888 } /* Comment.Multiline */
.highlight .cp { color: #cc0000; font-weight: bold } /* Comment.Preproc */
.highlight .cpf { color: #888888 } /* Comment.PreprocFile */
.highlight .c1 { color: #888888 } /* Comment.Single */
.highlight .cs { color: #cc0000; font-weight: bold; background-color: #fff0f0 } /* Comment.Special */
.highlight .gd { color: #000000; background-color: #ffdddd } /* Generic.Deleted */
.highlight .ge { font-style: italic } /* Generic.Emph */
.highlight .ges { font-weight: bold; font-style: italic } /* Generic.EmphStrong */
.highlight .gr { color: #aa0000 } /* Generic.Error */
.highlight .gh { color: #333333 } /* Generic.Heading */
.highlight .gi { color: #000000; background-color: #ddffdd } /* Generic.Inserted */
.highlight .go { color: #888888 } /* Generic.Output */
.highlight .gp { color: #555555 } /* Generic.Prompt */
.highlight .gs { font-weight: bold } /* Generic.Strong */
.highlight .gu { color: #666666 } /* Generic.Subheading */
.highlight .gt { color: #aa0000 } /* Generic.Traceback */
.highlight .kc { color: #008800; font-weight: bold } /* Keyword.Constant */
.highlight .kd { color: #008800; font-weight: bold } /* Keyword.Declaration */
.highlight .kn { color: #008800; font-weight: bold } /* Keyword.Namespace */
.highlight .kp { color: #008800 } /* Keyword.Pseudo */
.highlight .kr { color: #008800; font-weight: bold } /* Keyword.Reserved */
.highlight .kt { color: #888888; font-weight: bold } /* Keyword.Type */
.highlight .m { color: #0000DD; font-weight: bold } /* Literal.Number */
.highlight .s { color: #dd2200; background-color: #fff0f0 } /* Literal.String */
.highlight .na { color: #336699 } /* Name.Attribute */
.highlight .nb { color: #003388 } /* Name.Builtin */
.highlight .nc { color: #bb0066; font-weight: bold } /* Name.Class */
.highlight .no { color: #003366; font-weight: bold } /* Name.Constant */
.highlight .nd { color: #555555 } /* Name.Decorator */
.highlight .ne { color: #bb0066; font-weight: bold } /* Name.Exception */
.highlight .nf { color: #0066bb; font-weight: bold } /* Name.Function */
.highlight .nl { color: #336699; font-style: italic } /* Name.Label */
.highlight .nn { color: #bb0066; font-weight: bold } /* Name.Namespace */
.highlight .py { color: #336699; font-weight: bold } /* Name.Property */
.highlight .nt { color: #bb0066; font-weight: bold } /* Name.Tag */
.highlight .nv { color: #336699 } /* Name.Variable */
.highlight .ow { color: #008800 } /* Operator.Word */
.highlight .w { color: #bbbbbb } /* Text.Whitespace */
.highlight .mb { color: #0000DD; font-weight: bold } /* Literal.Number.Bin */
.highlight .mf { color: #0000DD; font-weight: bold } /* Literal.Number.Float */
.highlight .mh { color: #0000DD; font-weight: bold } /* Literal.Number.Hex */
.highlight .mi { color: #0000DD; font-weight: bold } /* Literal.Number.Integer */
.highlight .mo { color: #0000DD; font-weight: bold } /* Literal.Number.Oct */
.highlight .sa { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Affix */
.highlight .sb { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Backtick */
.highlight .sc { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Char */
.highlight .dl { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Delimiter */
.highlight .sd { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Doc */
.highlight .s2 { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Double */
.highlight .se { color: #0044dd; background-color: #fff0f0 } /* Literal.String.Escape */
.highlight .
//
//
//           The Nimrod Compiler
//        (c) Copyright 2008 Andreas Rumpf
//
//    See the file "copying.txt", included in this
//    distribution, for details about the copyright.
//
unit lexbase;

// Base Object of a lexer with efficient buffer handling. In fact
// I believe that this is the most efficient method of buffer
// handling that exists! Only at line endings checks are necessary
// if the buffer needs refilling.

interface

uses
  nsystem, llstream, charsets, strutils;

{@emit
const
  Lrz = ' ';
  Apo = '''';
  Tabulator = #9;
  ESC = #27;
  CR = #13;
  FF = #12;
  LF = #10;
  BEL = #7;
  BACKSPACE = #8;
  VT = #11;
}

const
  EndOfFile = #0;          // end of file marker
{ A little picture makes everything clear :-)
  buf:
  "Example Text\n ha!"   bufLen = 17
   ^pos = 0     ^ sentinel = 12
}
  NewLines = {@set}[CR, LF];

type
  TBaseLexer = object(NObject)
    bufpos: int;
    buf: PChar;
    bufLen: int;     // length of buffer in characters
    stream: PLLStream; // we read from this stream
    LineNumber: int; // the current line number
    // private data:
    sentinel: int;
    lineStart: int;     // index of last line start in buffer
  end;

procedure openBaseLexer(out L: TBaseLexer;
                        inputstream: PLLStream;
                        bufLen: int = 8192);
    // 8K is a reasonable buffer size

procedure closeBaseLexer(var L: TBaseLexer);

function getCurrentLine(const L: TBaseLexer; marker: boolean = true): string;
function getColNumber(const L: TBaseLexer; pos: int): int;

function HandleCR(var L: TBaseLexer; pos: int): int;
// Call this if you scanned over CR in the buffer; it returns the
// position to continue the scanning from. `pos` must be the position
// of the CR.

function HandleLF(var L: TBaseLexer; pos: int): int;
// Call this if you scanned over LF in the buffer; it returns the the
// position to continue the scanning from. `pos` must be the position
// of the LF.

implementation

const
  chrSize = sizeof(char);

procedure closeBaseLexer(var L: TBaseLexer);
begin
  dealloc(L.buf);
  LLStreamClose(L.stream);
end;

{@ignore}
{$ifdef false}
procedure printBuffer(const L: TBaseLexer);
var
  i: int;
begin
  writeln('____________________________________');
  writeln('sentinel: ', L.sentinel);
  writeln('bufLen: ', L.bufLen);
  writeln('buf: ');
  for i := 0 to L.bufLen-1 do write(L.buf[i]);
  writeln(NL + '____________________________________');
end;
{$endif}
{@emit}

procedure FillBuffer(var L: TBaseLexer);
var
  charsRead, toCopy, s: int; // all are in characters,
                             // not bytes (in case this
                             // is not the same)
  oldBufLen: int;
begin
  // we know here that pos == L.sentinel, but not if this proc
  // is called the first time by initBaseLexer()
  assert(L.sentinel < L.bufLen);
  toCopy := L.BufLen - L.sentinel - 1;
  assert(toCopy >= 0);
  if toCopy > 0 then
    MoveMem(L.buf, addr(L.buf[L.sentinel+1]), toCopy * chrSize);
    // "moveMem" handles overlapping regions
  charsRead := LLStreamRead(L.stream, addr(L.buf[toCopy]), 
                            (L.sentinel+1) * chrSize) div chrSize;
  s := toCopy + charsRead;
  if charsRead < L.sentinel+1 then begin
    L.buf[s] := EndOfFile; // set end marker
    L.sentinel := s
  end
  else begin
    // compute sentinel:
    dec(s); // BUGFIX (valgrind)
    while true do begin
      assert(s < L.bufLen);
      while (s >= 0) and not (L.buf[s] in NewLines) do Dec(s);
      if s >= 0 then begin
        // we found an appropriate character for a sentinel:
        L.sentinel := s;
        break
      end
      else begin
        // rather than to give up here because the line is too long,
        // double the buffer's size and try again:
        oldBufLen := L.BufLen;
        L.bufLen := L.BufLen * 2;
        L.buf := {@cast}PChar(realloc(L.buf, L.bufLen*chrSize));
        assert(L.bufLen - oldBuflen = oldBufLen);
        charsRead := LLStreamRead(L.stream, addr(L.buf[oldBufLen]), 
                                  oldBufLen*chrSize) div chrSize;
        if charsRead < oldBufLen then begin
          L.buf[oldBufLen+charsRead] := EndOfFile;
          L.sentinel := oldBufLen+charsRead;
          break
        end;
        s := L.bufLen - 1
      end
    end
  end
end;

function fillBaseLexer(var L: TBaseLexer; pos: int): int;
begin
  assert(pos <= L.sentinel);
  if pos < L.sentinel then begin
    result := pos+1;          // nothing to do
  end
  else begin
    fillBuffer(L);
    L.bufpos := 0;          // XXX: is this really correct?
    result := 0;
  end;
  L.lineStart := result;
end;

function HandleCR(var L: TBaseLexer; pos: int): int;
begin
  assert(L.buf[pos] = CR);
  inc(L.linenumber);
  result := fillBaseLexer(L, pos);
  if L.buf[result] = LF then begin
    result := fillBaseLexer(L, result);
  end;
  //L.lastNL := result-1; // BUGFIX: was: result;
end;

function HandleLF(var L: TBaseLexer; pos: int): int;
begin
  assert(L.buf[pos] = LF);
  inc(L.linenumber);
  result := fillBaseLexer(L, pos);
  //L.lastNL := result-1; // BUGFIX: was: result;
end;

procedure skip_UTF_8_BOM(var L: TBaseLexer);
begin
  if (L.buf[0] = #239) and (L.buf[1] = #187) and (L.buf[2] = #191) then begin
    inc(L.bufpos, 3);
    inc(L.lineStart, 3)
  end
end;

procedure openBaseLexer(out L: TBaseLexer; inputstream: PLLStream;
                        bufLen: int = 8192);
begin
  assert(bufLen > 0);
  L.bufpos := 0;
  L.bufLen := bufLen;
  L.buf := {@cast}PChar(alloc(bufLen * chrSize));
  L.sentinel := bufLen-1;
  L.lineStart := 0;
  L.linenumber := 1; // lines start at 1
  L.stream := inputstream;
  fillBuffer(L);
  skip_UTF_8_BOM(L);
end;

function getColNumber(const L: TBaseLexer; pos: int): int;
begin
  result := abs(pos - L.lineStart);
end;

function getCurrentLine(const L: TBaseLexer; marker: boolean = true): string;
var
  i: int;
begin
  result := '';
  i := L.lineStart;
  while not (L.buf[i] in [CR, LF, EndOfFile]) do begin
    addChar(result, L.buf[i]);
    inc(i)
  end;
  result := result +{&} NL;
  if marker then
    result := result +{&} RepeatChar(getColNumber(L, L.bufpos)) +{&} '^' +{&} NL
end;

end.