summary refs log tree commit diff stats
path: root/compiler/nimlexbase.nim
diff options
context:
space:
mode:
Diffstat (limited to 'compiler/nimlexbase.nim')
-rw-r--r--compiler/nimlexbase.nim174
1 files changed, 174 insertions, 0 deletions
diff --git a/compiler/nimlexbase.nim b/compiler/nimlexbase.nim
new file mode 100644
index 000000000..6708b57f8
--- /dev/null
+++ b/compiler/nimlexbase.nim
@@ -0,0 +1,174 @@
+#
+#
+#           The Nim Compiler
+#        (c) Copyright 2012 Andreas Rumpf
+#
+#    See the file "copying.txt", included in this
+#    distribution, for details about the copyright.
+#
+
+# Base Object of a lexer with efficient buffer handling. In fact
+# I believe that this is the most efficient method of buffer
+# handling that exists! Only at line endings checks are necessary
+# if the buffer needs refilling.
+
+import llstream
+
+import std/strutils
+
+when defined(nimPreviewSlimSystem):
+  import std/assertions
+
+const
+  Lrz* = ' '
+  Apo* = '\''
+  Tabulator* = '\x09'
+  ESC* = '\x1B'
+  CR* = '\x0D'
+  FF* = '\x0C'
+  LF* = '\x0A'
+  BEL* = '\x07'
+  BACKSPACE* = '\x08'
+  VT* = '\x0B'
+
+const
+  EndOfFile* = '\0'           # end of file marker
+                              # A little picture makes everything clear :-)
+                              #  buf:
+                              #  "Example Text\n ha!"   bufLen = 17
+                              #   ^pos = 0     ^ sentinel = 12
+                              #
+  NewLines* = {CR, LF}
+
+type
+  TBaseLexer* = object of RootObj
+    bufpos*: int
+    buf*: cstring
+    bufStorage: string
+    bufLen: int
+    stream*: PLLStream        # we read from this stream
+    lineNumber*: int          # the current line number
+                              # private data:
+    sentinel*: int
+    lineStart*: int           # index of last line start in buffer
+    offsetBase*: int          # use ``offsetBase + bufpos`` to get the offset
+
+
+proc openBaseLexer*(L: var TBaseLexer, inputstream: PLLStream,
+                    bufLen: int = 8192)
+  # 8K is a reasonable buffer size
+proc closeBaseLexer*(L: var TBaseLexer)
+proc getCurrentLine*(L: TBaseLexer, marker: bool = true): string
+proc getColNumber*(L: TBaseLexer, pos: int): int
+proc handleCR*(L: var TBaseLexer, pos: int): int
+  # Call this if you scanned over CR in the buffer; it returns the
+  # position to continue the scanning from. `pos` must be the position
+  # of the CR.
+proc handleLF*(L: var TBaseLexer, pos: int): int
+  # Call this if you scanned over LF in the buffer; it returns the
+  # position to continue the scanning from. `pos` must be the position
+  # of the LF.
+# implementation
+
+proc closeBaseLexer(L: var TBaseLexer) =
+  llStreamClose(L.stream)
+
+proc fillBuffer(L: var TBaseLexer) =
+  var
+    charsRead, toCopy, s: int # all are in characters,
+                              # not bytes (in case this
+                              # is not the same)
+    oldBufLen: int
+  # we know here that pos == L.sentinel, but not if this proc
+  # is called the first time by initBaseLexer()
+  assert(L.sentinel < L.bufLen)
+  toCopy = L.bufLen - L.sentinel - 1
+  assert(toCopy >= 0)
+  if toCopy > 0:
+    moveMem(addr L.buf[0], addr L.buf[L.sentinel + 1], toCopy)
+    # "moveMem" handles overlapping regions
+  charsRead = llStreamRead(L.stream, addr L.buf[toCopy], L.sentinel + 1)
+  s = toCopy + charsRead
+  if charsRead < L.sentinel + 1:
+    L.buf[s] = EndOfFile      # set end marker
+    L.sentinel = s
+  else:
+    # compute sentinel:
+    dec(s)                    # BUGFIX (valgrind)
+    while true:
+      assert(s < L.bufLen)
+      while (s >= 0) and not (L.buf[s] in NewLines): dec(s)
+      if s >= 0:
+        # we found an appropriate character for a sentinel:
+        L.sentinel = s
+        break
+      else:
+        # rather than to give up here because the line is too long,
+        # double the buffer's size and try again:
+        oldBufLen = L.bufLen
+        L.bufLen = L.bufLen * 2
+        L.bufStorage.setLen(L.bufLen)
+        L.buf = L.bufStorage.cstring
+        assert(L.bufLen - oldBufLen == oldBufLen)
+        charsRead = llStreamRead(L.stream, addr(L.buf[oldBufLen]),
+                                 oldBufLen)
+        if charsRead < oldBufLen:
+          L.buf[oldBufLen + charsRead] = EndOfFile
+          L.sentinel = oldBufLen + charsRead
+          break
+        s = L.bufLen - 1
+
+proc fillBaseLexer(L: var TBaseLexer, pos: int): int =
+  assert(pos <= L.sentinel)
+  if pos < L.sentinel:
+    result = pos + 1          # nothing to do
+  else:
+    fillBuffer(L)
+    L.offsetBase += pos + 1
+    L.bufpos = 0
+    result = 0
+  L.lineStart = result
+
+proc handleCR(L: var TBaseLexer, pos: int): int =
+  assert(L.buf[pos] == CR)
+  inc(L.lineNumber)
+  result = fillBaseLexer(L, pos)
+  if L.buf[result] == LF:
+    result = fillBaseLexer(L, result)
+
+proc handleLF(L: var TBaseLexer, pos: int): int =
+  assert(L.buf[pos] == LF)
+  inc(L.lineNumber)
+  result = fillBaseLexer(L, pos) #L.lastNL := result-1; // BUGFIX: was: result;
+
+proc skipUTF8BOM(L: var TBaseLexer) =
+  if L.buf[0] == '\xEF' and L.buf[1] == '\xBB' and L.buf[2] == '\xBF':
+    inc(L.bufpos, 3)
+    inc(L.lineStart, 3)
+
+proc openBaseLexer(L: var TBaseLexer, inputstream: PLLStream, bufLen = 8192) =
+  assert(bufLen > 0)
+  L.bufpos = 0
+  L.offsetBase = 0
+  L.bufStorage = newString(bufLen)
+  L.buf = L.bufStorage.cstring
+  L.bufLen = bufLen
+  L.sentinel = bufLen - 1
+  L.lineStart = 0
+  L.lineNumber = 1            # lines start at 1
+  L.stream = inputstream
+  fillBuffer(L)
+  skipUTF8BOM(L)
+
+proc getColNumber(L: TBaseLexer, pos: int): int =
+  result = abs(pos - L.lineStart)
+
+proc getCurrentLine(L: TBaseLexer, marker: bool = true): string =
+  result = ""
+  var i = L.lineStart
+  while L.buf[i] notin {CR, LF, EndOfFile}:
+    result.add L.buf[i]
+    inc i
+  result.add "\n"
+  if marker:
+    result.add spaces(getColNumber(L, L.bufpos)) & '^' & "\n"