summary refs log tree commit diff stats
path: root/lib/pure/memfiles.nim
diff options
context:
space:
mode:
Diffstat (limited to 'lib/pure/memfiles.nim')
-rw-r--r--lib/pure/memfiles.nim119
1 files changed, 110 insertions, 9 deletions
diff --git a/lib/pure/memfiles.nim b/lib/pure/memfiles.nim
index d49dfae9f..b9c574944 100644
--- a/lib/pure/memfiles.nim
+++ b/lib/pure/memfiles.nim
@@ -11,6 +11,9 @@
 ##
 ## This module provides support for `memory mapped files`:idx:
 ## (Posix's `mmap`:idx:) on the different operating systems.
+##
+## It also provides some fast iterators over lines in text files (or
+## other "line-like", variable length, delimited records).
 
 when defined(windows):
   import winlean
@@ -29,8 +32,9 @@ type
     size*: int       ## size of the memory mapped file
 
     when defined(windows):
-      fHandle: int
-      mapHandle: int 
+      fHandle: Handle
+      mapHandle: Handle
+      wasOpened: bool   ## only close if wasOpened
     else:
       handle: cint
 
@@ -112,7 +116,8 @@ proc open*(filename: string, mode: FileMode = fmRead,
     template callCreateFile(winApiProc, filename: expr): expr =
       winApiProc(
         filename,
-        if readonly: GENERIC_READ else: GENERIC_ALL,
+        # GENERIC_ALL != (GENERIC_READ or GENERIC_WRITE)
+        if readonly: GENERIC_READ else: GENERIC_READ or GENERIC_WRITE,
         FILE_SHARE_READ,
         nil,
         if newFileSize != -1: CREATE_ALWAYS else: OPEN_EXISTING,
@@ -128,7 +133,7 @@ proc open*(filename: string, mode: FileMode = fmRead,
       fail(osLastError(), "error opening file")
 
     if newFileSize != -1:
-      var 
+      var
         sizeHigh = int32(newFileSize shr 32)
         sizeLow  = int32(newFileSize and 0xffffffff)
 
@@ -169,12 +174,14 @@ proc open*(filename: string, mode: FileMode = fmRead,
       if mappedSize != -1: result.size = min(fileSize, mappedSize).int
       else: result.size = fileSize.int
 
+    result.wasOpened = true
+
   else:
     template fail(errCode: OSErrorCode, msg: expr) =
       rollback()
       if result.handle != 0: discard close(result.handle)
       raiseOSError(errCode)
-  
+
     var flags = if readonly: O_RDONLY else: O_RDWR
 
     if newFileSize != -1:
@@ -196,7 +203,7 @@ proc open*(filename: string, mode: FileMode = fmRead,
     if mappedSize != -1:
       result.size = mappedSize
     else:
-      var stat: TStat
+      var stat: Stat
       if fstat(result.handle, stat) != -1:
         # XXX: Hmm, this could be unsafe
         # Why is mmap taking int anyway?
@@ -218,12 +225,12 @@ proc open*(filename: string, mode: FileMode = fmRead,
 proc close*(f: var MemFile) =
   ## closes the memory mapped file `f`. All changes are written back to the
   ## file system, if `f` was opened with write access.
-  
+
   var error = false
   var lastErr: OSErrorCode
 
   when defined(windows):
-    if f.fHandle != INVALID_HANDLE_VALUE:
+    if f.fHandle != INVALID_HANDLE_VALUE and f.wasOpened:
       error = unmapViewOfFile(f.mem) == 0
       lastErr = osLastError()
       error = (closeHandle(f.mapHandle) == 0) or error
@@ -240,8 +247,102 @@ proc close*(f: var MemFile) =
   when defined(windows):
     f.fHandle = 0
     f.mapHandle = 0
+    f.wasOpened = false
   else:
     f.handle = 0
-  
+
   if error: raiseOSError(lastErr)
 
+type MemSlice* = object  ## represents a slice of a MemFile for iteration over delimited lines/records
+  data*: pointer  ## start of the slice's bytes; not a terminated C string
+  size*: int      ## number of bytes in the slice (delimiter excluded)
+
+proc c_memcpy(a, b: pointer, n: int) {.importc: "memcpy", header: "<string.h>".}  # C memcpy: `a` is the destination, `b` the source
+
+proc `$`*(ms: MemSlice): string {.inline.} =
+  ## Return a Nim string holding a copy of the `ms.size` bytes at `ms.data`.
+  ## An empty slice yields an empty string.
+  result = newString(ms.size)   # already terminated; no manual '\0' write
+  if ms.size > 0:               # `result[0]` is invalid on an empty string
+    c_memcpy(addr(result[0]), ms.data, ms.size)
+
+iterator memSlices*(mfile: MemFile, delim='\l', eat='\r'): MemSlice {.inline.} =
+  ## Iterates over [optional `eat`] `delim`-delimited slices in MemFile `mfile`.
+  ##
+  ## Default parameters parse lines ending in either Unix(\\l) or Windows(\\r\\l)
+  ## style on a line-by-line basis.  I.e., not every line needs the same ending.
+  ## Unlike readLine(File) & lines(File), archaic MacOS9 \\r-delimited lines
+  ## are not supported as a third option for each line.  Such archaic MacOS9
+  ## files can be handled by passing delim='\\r', eat='\\0', though.
+  ##
+  ## Delimiters are not part of the returned slice.  A final, unterminated line
+  ## or record is returned just like any other.
+  ##
+  ## Non-default delimiters can be passed to allow iteration over other sorts
+  ## of "line-like" variable length records.  Pass eat='\\0' to be strictly
+  ## `delim`-delimited. (Eating an optional prefix equal to '\\0' is not
+  ## supported.)
+  ##
+  ## This zero copy, memchr-limited interface is probably the fastest way to
+  ## iterate over line-like records in a file.  However, returned (data,size)
+  ## objects are not Nim strings, bounds checked Nim arrays, or even terminated
+  ## C strings.  So, care is required to access the data (e.g., think C mem*
+  ## functions, not str* functions).  Example:
+  ##
+  ## .. code-block:: nim
+  ##   var count = 0
+  ##   for slice in memSlices(memfiles.open("foo")):
+  ##     if slice.size > 0 and cast[cstring](slice.data)[0] != '#':
+  ##       inc(count)
+  ##   echo count
+
+  proc c_memchr(cstr: pointer, c: char, n: csize): pointer {.
+       importc: "memchr", header: "<string.h>" .}
+  proc `-!`(p, q: pointer): int {.inline.} = return cast[int](p) -% cast[int](q)
+  var ms: MemSlice
+  var ending: pointer
+  ms.data = mfile.mem
+  var remaining = mfile.size
+  while remaining > 0:
+    ending = c_memchr(ms.data, delim, remaining)
+    if ending == nil:                               # no delim left: unterminated final slice
+      ms.size = remaining                           # NOTE: a trailing `eat` char is NOT trimmed here
+      yield ms
+      break
+    ms.size = ending -! ms.data                     # byte distance to delim; delim is NOT included
+    if eat != '\0' and ms.size > 0 and cast[cstring](ms.data)[ms.size - 1] == eat:
+      dec(ms.size)                                  # drop the `eat` char just before delim
+    yield ms
+    ms.data = cast[pointer](cast[int](ending) +% 1)     # next slice starts one byte past delim
+    remaining = mfile.size - (ms.data -! mfile.mem)
+
+iterator lines*(mfile: MemFile, buf: var TaintedString, delim='\l', eat='\r'): TaintedString {.inline.} =
+  ## Replace contents of passed buffer with each new line, like
+  ## `readLine(File) <system.html#readLine,File,TaintedString>`_.
+  ## `delim`, `eat`, and delimiting logic are exactly as for
+  ## `memSlices <#memSlices>`_, but Nim strings are returned.  Example:
+  ##
+  ## .. code-block:: nim
+  ##   var buffer: TaintedString = ""
+  ##   for line in lines(memfiles.open("foo"), buffer):
+  ##     echo line
+
+  for ms in memSlices(mfile, delim, eat):
+    buf.setLen(ms.size)       # resize only; `buf[ms.size]` is past the end
+    if ms.size > 0:           # blank line: nothing to copy, `buf[0]` invalid
+      c_memcpy(addr(buf[0]), ms.data, ms.size)
+    yield buf
+
+iterator lines*(mfile: MemFile, delim='\l', eat='\r'): TaintedString {.inline.} =
+  ## Yield every line of `mfile` as a Nim string, in the manner of
+  ## `lines(File) <system.html#lines.i,File>`_.
+  ## Splitting follows `memSlices <#memSlices>`_ exactly, including the
+  ## meaning of `delim` and `eat`.  Example:
+  ##
+  ## .. code-block:: nim
+  ##   for line in lines(memfiles.open("foo")):
+  ##     echo line
+
+  var lineBuf = TaintedString(newStringOfCap(80))  # one buffer for all lines
+  for unusedLine in lines(mfile, lineBuf, delim, eat):
+    yield lineBuf