summary refs log blame commit diff stats
path: root/lib/pure/lexbase.nim
blob: cf2e8bb89dcf14b42cfd4c5c39a75f5be53bbed0 (plain) (tree)
1
2
3
4

 
                            
                                         






















                                                                        

                                                                           
                                                                       



                                                  
                                                               
                                                
                                                        

                                                                  
                          
 

                                       


                        

                                                                    

                       
                
 
                                   







                                                             
                                    

                     




                                                                  

                                                              








                                                 
                                                           






                                                                   

                               



                                                                   
                                                 
                                                             
                                                             





                                                  
                                                     






                                                            
 



                                                                         
                            
                   


                                     
                      
 



                                                                         
                            
                   
                                                                               





                                                                               
 
                                    



                                                                            


                                                               




                      
                             



                                                  

                         
                                                
               
                
 

                                                 
                                 
 

                                                                 







                                                  
                                                          
#
#
#           The Nim Compiler
#        (c) Copyright 2009 Andreas Rumpf
#
#    See the file "copying.txt", included in this
#    distribution, for details about the copyright.
#

## This module implements a base object of a lexer with efficient buffer
## handling. Only at line endings checks are necessary if the buffer
## needs refilling.

import
  strutils, streams

const
  EndOfFile* = '\0'           ## end of file marker
  NewLines* = {'\c', '\L'}

# Buffer handling:
#  buf:
#  "Example Text\n ha!"   bufLen = 17
#   ^pos = 0     ^ sentinel = 12
#

type
  BaseLexer* = object of RootObj ## the base lexer. Inherit your lexer from
                                 ## this object.
    bufpos*: int              ## the current position within the buffer
    when defined(js):         ## the buffer itself
      buf*: string
    else:
      buf*: cstring
    bufLen*: int              ## length of buffer in characters
    input: Stream            ## the input stream
    lineNumber*: int          ## the current line number
    sentinel: int
    lineStart: int            # index of last line start in buffer
    refillChars: set[char]

{.deprecated: [TBaseLexer: BaseLexer].}

const
  chrSize = sizeof(char)

proc close*(L: var BaseLexer) =
  ## closes the base lexer. This closes `L`'s associated stream too.
  when not defined(js):
    dealloc(L.buf)
  close(L.input)

proc fillBuffer(L: var BaseLexer) =
  var
    charsRead, toCopy, s: int # all are in characters,
                              # not bytes (in case this
                              # is not the same)
    oldBufLen: int
  # we know here that pos == L.sentinel, but not if this proc
  # is called the first time by initBaseLexer()
  assert(L.sentinel < L.bufLen)
  toCopy = L.bufLen - L.sentinel - 1
  assert(toCopy >= 0)
  if toCopy > 0:
    when defined(js):
      for i in 0 ..< toCopy: L.buf[i] = L.buf[L.sentinel + 1 + i]
    else:
      # "moveMem" handles overlapping regions
      moveMem(L.buf, addr L.buf[L.sentinel + 1], toCopy * chrSize)
  charsRead = readData(L.input, addr(L.buf[toCopy]),
                       (L.sentinel + 1) * chrSize) div chrSize
  s = toCopy + charsRead
  if charsRead < L.sentinel + 1:
    L.buf[s] = EndOfFile      # set end marker
    L.sentinel = s
  else:
    # compute sentinel:
    dec(s)                    # BUGFIX (valgrind)
    while true:
      assert(s < L.bufLen)
      while s >= 0 and L.buf[s] notin L.refillChars: dec(s)
      if s >= 0:
        # we found an appropriate character for a sentinel:
        L.sentinel = s
        break
      else:
        # rather than to give up here because the line is too long,
        # double the buffer's size and try again:
        oldBufLen = L.bufLen
        L.bufLen = L.bufLen * 2
        when defined(js):
          L.buf.setLen(L.bufLen)
        else:
          L.buf = cast[cstring](realloc(L.buf, L.bufLen * chrSize))
        assert(L.bufLen - oldBufLen == oldBufLen)
        charsRead = readData(L.input, addr(L.buf[oldBufLen]),
                             oldBufLen * chrSize) div chrSize
        if charsRead < oldBufLen:
          L.buf[oldBufLen + charsRead] = EndOfFile
          L.sentinel = oldBufLen + charsRead
          break
        s = L.bufLen - 1

proc fillBaseLexer(L: var BaseLexer, pos: int): int =
  assert(pos <= L.sentinel)
  if pos < L.sentinel:
    result = pos + 1          # nothing to do
  else:
    fillBuffer(L)
    L.bufpos = 0              # XXX: is this really correct?
    result = 0

proc handleCR*(L: var BaseLexer, pos: int): int =
  ## Call this if you scanned over '\c' in the buffer; it returns the the
  ## position to continue the scanning from. `pos` must be the position
  ## of the '\c'.
  assert(L.buf[pos] == '\c')
  inc(L.lineNumber)
  result = fillBaseLexer(L, pos)
  if L.buf[result] == '\L':
    result = fillBaseLexer(L, result)
  L.lineStart = result

proc handleLF*(L: var BaseLexer, pos: int): int =
  ## Call this if you scanned over '\L' in the buffer; it returns the the
  ## position to continue the scanning from. `pos` must be the position
  ## of the '\L'.
  assert(L.buf[pos] == '\L')
  inc(L.lineNumber)
  result = fillBaseLexer(L, pos) #L.lastNL := result-1; // BUGFIX: was: result;
  L.lineStart = result

proc handleRefillChar*(L: var BaseLexer, pos: int): int =
  ## To be documented.
  assert(L.buf[pos] in L.refillChars)
  result = fillBaseLexer(L, pos) #L.lastNL := result-1; // BUGFIX: was: result;

proc skipUtf8Bom(L: var BaseLexer) =
  if (L.buf[0] == '\xEF') and (L.buf[1] == '\xBB') and (L.buf[2] == '\xBF'):
    inc(L.bufpos, 3)
    inc(L.lineStart, 3)

proc open*(L: var BaseLexer, input: Stream, bufLen: int = 8192;
           refillChars: set[char] = NewLines) =
  ## inits the BaseLexer with a stream to read from.
  assert(bufLen > 0)
  assert(input != nil)
  L.input = input
  L.bufpos = 0
  L.bufLen = bufLen
  L.refillChars = refillChars
  when defined(js):
    L.buf = newString(bufLen)
  else:
    L.buf = cast[cstring](alloc(bufLen * chrSize))
  L.sentinel = bufLen - 1
  L.lineStart = 0
  L.lineNumber = 1            # lines start at 1
  fillBuffer(L)
  skipUtf8Bom(L)

proc getColNumber*(L: BaseLexer, pos: int): int =
  ## retrieves the current column.
  result = abs(pos - L.lineStart)

proc getCurrentLine*(L: BaseLexer, marker: bool = true): string =
  ## retrieves the current line.
  var i: int
  result = ""
  i = L.lineStart
  while not (L.buf[i] in {'\c', '\L', EndOfFile}):
    add(result, L.buf[i])
    inc(i)
  add(result, "\n")
  if marker:
    add(result, spaces(getColNumber(L, L.bufpos)) & "^\n")