diff options
author | Andreas Rumpf <andreas@andreas-desktop> | 2009-12-07 01:23:19 +0100 |
---|---|---|
committer | Andreas Rumpf <andreas@andreas-desktop> | 2009-12-07 01:23:19 +0100 |
commit | e254741541b0389dfb0b675116c76a6a144b90b7 (patch) | |
tree | c6769c04b3bdc6a77bcc5075b0df011252e3702b /rod/lexbase.nim | |
parent | 90119066adf6a9a2e8d779d4955637c6dd99aba3 (diff) | |
download | Nim-e254741541b0389dfb0b675116c76a6a144b90b7.tar.gz |
version 0.8.5: added Nimrod version of the compiler
Diffstat (limited to 'rod/lexbase.nim')
-rwxr-xr-x | rod/lexbase.nim | 170 |
1 files changed, 170 insertions, 0 deletions
diff --git a/rod/lexbase.nim b/rod/lexbase.nim new file mode 100755 index 000000000..0c16e986f --- /dev/null +++ b/rod/lexbase.nim @@ -0,0 +1,170 @@ +# +# +# The Nimrod Compiler +# (c) Copyright 2008 Andreas Rumpf +# +# See the file "copying.txt", included in this +# distribution, for details about the copyright. +# + +# Base Object of a lexer with efficient buffer handling. In fact +# I believe that this is the most efficient method of buffer +# handling that exists! Only at line endings checks are necessary +# if the buffer needs refilling. + +import + llstream, strutils + +const + Lrz* = ' ' + Apo* = '\'' + Tabulator* = '\x09' + ESC* = '\x1B' + CR* = '\x0D' + FF* = '\x0C' + LF* = '\x0A' + BEL* = '\x07' + BACKSPACE* = '\x08' + VT* = '\x0B' + +const + EndOfFile* = '\0' # end of file marker + # A little picture makes everything clear :-) + # buf: + # "Example Text\n ha!" bufLen = 17 + # ^pos = 0 ^ sentinel = 12 + # + NewLines* = {CR, LF} + +type + TBaseLexer* = object of TObject + bufpos*: int + buf*: cstring + bufLen*: int # length of buffer in characters + stream*: PLLStream # we read from this stream + LineNumber*: int # the current line number + # private data: + sentinel*: int + lineStart*: int # index of last line start in buffer + + +proc openBaseLexer*(L: var TBaseLexer, inputstream: PLLStream, + bufLen: int = 8192) + # 8K is a reasonable buffer size +proc closeBaseLexer*(L: var TBaseLexer) +proc getCurrentLine*(L: TBaseLexer, marker: bool = true): string +proc getColNumber*(L: TBaseLexer, pos: int): int +proc HandleCR*(L: var TBaseLexer, pos: int): int + # Call this if you scanned over CR in the buffer; it returns the + # position to continue the scanning from. `pos` must be the position + # of the CR. +proc HandleLF*(L: var TBaseLexer, pos: int): int + # Call this if you scanned over LF in the buffer; it returns the the + # position to continue the scanning from. `pos` must be the position + # of the LF. +# implementation + +const + chrSize = sizeof(char) + +proc closeBaseLexer(L: var TBaseLexer) = + dealloc(L.buf) + LLStreamClose(L.stream) + +proc FillBuffer(L: var TBaseLexer) = + var + charsRead, toCopy, s: int # all are in characters, + # not bytes (in case this + # is not the same) + oldBufLen: int + # we know here that pos == L.sentinel, but not if this proc + # is called the first time by initBaseLexer() + assert(L.sentinel < L.bufLen) + toCopy = L.BufLen - L.sentinel - 1 + assert(toCopy >= 0) + if toCopy > 0: + MoveMem(L.buf, addr(L.buf[L.sentinel + 1]), toCopy * chrSize) # "moveMem" handles overlapping regions + charsRead = LLStreamRead(L.stream, addr(L.buf[toCopy]), + (L.sentinel + 1) * chrSize) div chrSize + s = toCopy + charsRead + if charsRead < L.sentinel + 1: + L.buf[s] = EndOfFile # set end marker + L.sentinel = s + else: + # compute sentinel: + dec(s) # BUGFIX (valgrind) + while true: + assert(s < L.bufLen) + while (s >= 0) and not (L.buf[s] in NewLines): Dec(s) + if s >= 0: + # we found an appropriate character for a sentinel: + L.sentinel = s + break + else: + # rather than to give up here because the line is too long, + # double the buffer's size and try again: + oldBufLen = L.BufLen + L.bufLen = L.BufLen * 2 + L.buf = cast[cstring](realloc(L.buf, L.bufLen * chrSize)) + assert(L.bufLen - oldBuflen == oldBufLen) + charsRead = LLStreamRead(L.stream, addr(L.buf[oldBufLen]), + oldBufLen * chrSize) div chrSize + if charsRead < oldBufLen: + L.buf[oldBufLen + charsRead] = EndOfFile + L.sentinel = oldBufLen + charsRead + break + s = L.bufLen - 1 + +proc fillBaseLexer(L: var TBaseLexer, pos: int): int = + assert(pos <= L.sentinel) + if pos < L.sentinel: + result = pos + 1 # nothing to do + else: + fillBuffer(L) + L.bufpos = 0 # XXX: is this really correct? + result = 0 + L.lineStart = result + +proc HandleCR(L: var TBaseLexer, pos: int): int = + assert(L.buf[pos] == CR) + inc(L.linenumber) + result = fillBaseLexer(L, pos) + if L.buf[result] == LF: + result = fillBaseLexer(L, result) + +proc HandleLF(L: var TBaseLexer, pos: int): int = + assert(L.buf[pos] == LF) + inc(L.linenumber) + result = fillBaseLexer(L, pos) #L.lastNL := result-1; // BUGFIX: was: result; + +proc skip_UTF_8_BOM(L: var TBaseLexer) = + if (L.buf[0] == '\xEF') and (L.buf[1] == '\xBB') and (L.buf[2] == '\xBF'): + inc(L.bufpos, 3) + inc(L.lineStart, 3) + +proc openBaseLexer(L: var TBaseLexer, inputstream: PLLStream, bufLen: int = 8192) = + assert(bufLen > 0) + L.bufpos = 0 + L.bufLen = bufLen + L.buf = cast[cstring](alloc(bufLen * chrSize)) + L.sentinel = bufLen - 1 + L.lineStart = 0 + L.linenumber = 1 # lines start at 1 + L.stream = inputstream + fillBuffer(L) + skip_UTF_8_BOM(L) + +proc getColNumber(L: TBaseLexer, pos: int): int = + result = abs(pos - L.lineStart) + +proc getCurrentLine(L: TBaseLexer, marker: bool = true): string = + var i: int + result = "" + i = L.lineStart + while not (L.buf[i] in {CR, LF, EndOfFile}): + add(result, L.buf[i]) + inc(i) + result = result & "\n" + if marker: + result = result & RepeatChar(getColNumber(L, L.bufpos)) & '^' & "\n" + \ No newline at end of file |