summary refs log tree commit diff stats
path: root/lib/pure/parsecsv.nim
diff options
context:
space:
mode:
Diffstat (limited to 'lib/pure/parsecsv.nim')
-rw-r--r-- lib/pure/parsecsv.nim 348
1 files changed, 222 insertions, 126 deletions
diff --git a/lib/pure/parsecsv.nim b/lib/pure/parsecsv.nim
index 796114d37..c7bf0c9c1 100644
--- a/lib/pure/parsecsv.nim
+++ b/lib/pure/parsecsv.nim
@@ -10,13 +10,18 @@
 ## This module implements a simple high performance `CSV`:idx:
 ## (`comma separated value`:idx:) parser.
 ##
-## Example: How to use the parser
-## ==============================
+## Basic usage
+## ===========
+##
+##   ```nim
+##   import std/parsecsv
+##   from std/os import paramStr
+##   from std/streams import newFileStream
 ##
-## .. code-block:: nim
-##   import os, parsecsv, streams
 ##   var s = newFileStream(paramStr(1), fmRead)
-##   if s == nil: quit("cannot open the file" & paramStr(1))
+##   if s == nil:
+##     quit("cannot open the file" & paramStr(1))
+##
 ##   var x: CsvParser
 ##   open(x, s, paramStr(1))
 ##   while readRow(x):
@@ -24,13 +29,14 @@
 ##     for val in items(x.row):
 ##       echo "##", val, "##"
 ##   close(x)
+##   ```
 ##
 ## For CSV files with a header row, the header can be read and then used as a
-## reference for item access with `rowEntry <#rowEntry.CsvParser.string>`_:
+## reference for item access with `rowEntry <#rowEntry,CsvParser,string>`_:
+##
+##   ```nim
+##   import std/parsecsv
 ##
-## .. code-block:: nim
-##   import parsecsv
-##   import os
 ##   # Prepare a file
 ##   let content = """One,Two,Three,Four
 ##   1,2,3,4
@@ -47,24 +53,43 @@
 ##     for col in items(p.headers):
 ##       echo "##", col, ":", p.rowEntry(col), "##"
 ##   p.close()
+##   ```
+##
+## See also
+## ========
+##
+## * `streams module <streams.html>`_ for using
+##   `open proc <#open,CsvParser,Stream,string,char,char,char>`_
+##   and other stream processing (like `close proc <streams.html#close,Stream>`_)
+## * `parseopt module <parseopt.html>`_ for a command line parser
+## * `parsecfg module <parsecfg.html>`_ for a configuration file parser
+## * `parsexml module <parsexml.html>`_ for an XML / HTML parser
+## * `parsesql module <parsesql.html>`_ for a SQL parser
+## * `other parsers <lib.html#pure-libraries-parsers>`_ for other parsers
+
+import std/[lexbase, streams]
 
-import
-  lexbase, streams
+when defined(nimPreviewSlimSystem):
+  import std/syncio
 
 type
-  CsvRow* = seq[string] ## a row in a CSV file
-  CsvParser* = object of BaseLexer ## the parser object.
-    row*: CsvRow                    ## the current row
+  CsvRow* = seq[string] ## A row in a CSV file.
+  CsvParser* = object of BaseLexer ## The parser object.
+                                   ##
+                                   ## It consists of two public fields:
+                                   ## * `row` is the current row
+                                   ## * `headers` are the columns that are defined in the csv file
+                                   ##   (read using `readHeaderRow <#readHeaderRow,CsvParser>`_).
+                                   ##   Used with `rowEntry <#rowEntry,CsvParser,string>`_.
+    row*: CsvRow
     filename: string
     sep, quote, esc: char
     skipWhite: bool
     currRow: int
-    headers*: seq[string] ## The columns that are defined in the csv file
-                          ## (read using `readHeaderRow <#readHeaderRow.CsvParser>`_).
-                          ## Used with `rowEntry <#rowEntry.CsvParser.string>`_).
+    headers*: seq[string]
 
-  CsvError* = object of IOError ## exception that is raised if
-                                ## a parsing error occurs
+  CsvError* = object of IOError ## An exception that is raised if
+                                ## a parsing error occurs.
 
 proc raiseEInvalidCsv(filename: string, line, col: int,
                       msg: string) {.noreturn.} =
@@ -76,151 +101,255 @@ proc raiseEInvalidCsv(filename: string, line, col: int,
     e.msg = filename & "(" & $line & ", " & $col & ") Error: " & msg
   raise e
 
-proc error(my: CsvParser, pos: int, msg: string) =
-  raiseEInvalidCsv(my.filename, my.lineNumber, getColNumber(my, pos), msg)
+proc error(self: CsvParser, pos: int, msg: string) =
+  raiseEInvalidCsv(self.filename, self.lineNumber, getColNumber(self, pos), msg)
 
-proc open*(my: var CsvParser, input: Stream, filename: string,
+proc open*(self: var CsvParser, input: Stream, filename: string,
            separator = ',', quote = '"', escape = '\0',
            skipInitialSpace = false) =
-  ## initializes the parser with an input stream. `Filename` is only used
+  ## Initializes the parser with an input stream. `Filename` is only used
   ## for nice error messages. The parser's behaviour can be controlled by
   ## the diverse optional parameters:
   ## - `separator`: character used to separate fields
   ## - `quote`: Used to quote fields containing special characters like
-  ##   `separator`, `quote` or new-line characters. '\0' disables the parsing
+  ##   `separator`, `quote` or new-line characters. '\\0' disables the parsing
   ##   of quotes.
   ## - `escape`: removes any special meaning from the following character;
-  ##   '\0' disables escaping; if escaping is disabled and `quote` is not '\0',
+  ##   '\\0' disables escaping; if escaping is disabled and `quote` is not '\\0',
   ##   two `quote` characters are parsed one literal `quote` character.
   ## - `skipInitialSpace`: If true, whitespace immediately following the
   ##   `separator` is ignored.
-  lexbase.open(my, input)
-  my.filename = filename
-  my.sep = separator
-  my.quote = quote
-  my.esc = escape
-  my.skipWhite = skipInitialSpace
-  my.row = @[]
-  my.currRow = 0
+  ##
+  ## See also:
+  ## * `open proc <#open,CsvParser,string,char,char,char>`_ which creates the
+  ##   file stream for you
+  runnableExamples:
+    import std/streams
+    var strm = newStringStream("One,Two,Three\n1,2,3\n10,20,30")
+    var parser: CsvParser
+    parser.open(strm, "tmp.csv")
+    parser.close()
+    strm.close()
 
-proc open*(my: var CsvParser, filename: string,
+  lexbase.open(self, input)
+  self.filename = filename
+  self.sep = separator
+  self.quote = quote
+  self.esc = escape
+  self.skipWhite = skipInitialSpace
+
+proc open*(self: var CsvParser, filename: string,
            separator = ',', quote = '"', escape = '\0',
            skipInitialSpace = false) =
-  ## same as the other `open` but creates the file stream for you.
+  ## Similar to the `other open proc <#open,CsvParser,Stream,string,char,char,char>`_,
+  ## but creates the file stream for you.
+  runnableExamples:
+    from std/os import removeFile
+    writeFile("tmp.csv", "One,Two,Three\n1,2,3\n10,20,300")
+    var parser: CsvParser
+    parser.open("tmp.csv")
+    parser.close()
+    removeFile("tmp.csv")
+
   var s = newFileStream(filename, fmRead)
-  if s == nil: my.error(0, "cannot open: " & filename)
-  open(my, s, filename, separator,
+  if s == nil: self.error(0, "cannot open: " & filename)
+  open(self, s, filename, separator,
        quote, escape, skipInitialSpace)
 
-proc parseField(my: var CsvParser, a: var string) =
-  var pos = my.bufpos
-  var buf = my.buf
-  if my.skipWhite:
-    while buf[pos] in {' ', '\t'}: inc(pos)
+proc parseField(self: var CsvParser, a: var string) =
+  var pos = self.bufpos
+  if self.skipWhite:
+    while self.buf[pos] in {' ', '\t'}: inc(pos)
   setLen(a, 0) # reuse memory
-  if buf[pos] == my.quote and my.quote != '\0':
+  if self.buf[pos] == self.quote and self.quote != '\0':
     inc(pos)
     while true:
-      let c = buf[pos]
+      let c = self.buf[pos]
       if c == '\0':
-        my.bufpos = pos # can continue after exception?
-        error(my, pos, my.quote & " expected")
+        self.bufpos = pos # can continue after exception?
+        error(self, pos, self.quote & " expected")
         break
-      elif c == my.quote:
-        if my.esc == '\0' and buf[pos+1] == my.quote:
-          add(a, my.quote)
+      elif c == self.quote:
+        if self.esc == '\0' and self.buf[pos + 1] == self.quote:
+          add(a, self.quote)
           inc(pos, 2)
         else:
           inc(pos)
           break
-      elif c == my.esc:
-        add(a, buf[pos+1])
+      elif c == self.esc:
+        add(a, self.buf[pos + 1])
         inc(pos, 2)
       else:
         case c
         of '\c':
-          pos = handleCR(my, pos)
-          buf = my.buf
+          pos = handleCR(self, pos)
           add(a, "\n")
         of '\l':
-          pos = handleLF(my, pos)
-          buf = my.buf
+          pos = handleLF(self, pos)
           add(a, "\n")
         else:
           add(a, c)
           inc(pos)
   else:
     while true:
-      let c = buf[pos]
-      if c == my.sep: break
+      let c = self.buf[pos]
+      if c == self.sep: break
       if c in {'\c', '\l', '\0'}: break
       add(a, c)
       inc(pos)
-  my.bufpos = pos
+  self.bufpos = pos
+
+proc processedRows*(self: var CsvParser): int {.inline.} =
+  ## Returns number of the processed rows.
+  ##
+  ## Note that the counter is incremented even when
+  ## `readRow <#readRow,CsvParser,int>`_ has reached EOF and returns false.
+  runnableExamples:
+    import std/streams
 
-proc processedRows*(my: var CsvParser): int =
-  ## returns number of the processed rows
-  return my.currRow
+    var strm = newStringStream("One,Two,Three\n1,2,3")
+    var parser: CsvParser
+    parser.open(strm, "tmp.csv")
+    doAssert parser.readRow()
+    doAssert parser.processedRows() == 1
+    doAssert parser.readRow()
+    doAssert parser.processedRows() == 2
+    ## `processedRows` is incremented even after `readRow` has reached EOF.
+    doAssert parser.readRow() == false
+    doAssert parser.processedRows() == 3
+    doAssert parser.readRow() == false
+    doAssert parser.processedRows() == 4
+    parser.close()
+    strm.close()
 
-proc readRow*(my: var CsvParser, columns = 0): bool =
-  ## reads the next row; if `columns` > 0, it expects the row to have
+  self.currRow
+
+proc readRow*(self: var CsvParser, columns = 0): bool =
+  ## Reads the next row; if `columns` > 0, it expects the row to have
   ## exactly this many columns. Returns false if the end of the file
   ## has been encountered else true.
   ##
   ## Blank lines are skipped.
+  runnableExamples:
+    import std/streams
+    var strm = newStringStream("One,Two,Three\n1,2,3\n\n10,20,30")
+    var parser: CsvParser
+    parser.open(strm, "tmp.csv")
+    doAssert parser.readRow()
+    doAssert parser.row == @["One", "Two", "Three"]
+    doAssert parser.readRow()
+    doAssert parser.row == @["1", "2", "3"]
+    ## Blank lines are skipped.
+    doAssert parser.readRow()
+    doAssert parser.row == @["10", "20", "30"]
+
+    var emptySeq: seq[string]
+    doAssert parser.readRow() == false
+    doAssert parser.row == emptySeq
+    doAssert parser.readRow() == false
+    doAssert parser.row == emptySeq
+
+    parser.close()
+    strm.close()
+
   var col = 0 # current column
-  let oldpos = my.bufpos
-  while my.buf[my.bufpos] != '\0':
-    let oldlen = my.row.len
-    if oldlen < col+1:
-      setLen(my.row, col+1)
-      my.row[col] = ""
-    parseField(my, my.row[col])
+  let oldpos = self.bufpos
+  # skip initial empty lines #8365
+  while true:
+    case self.buf[self.bufpos]
+    of '\c': self.bufpos = handleCR(self, self.bufpos)
+    of '\l': self.bufpos = handleLF(self, self.bufpos)
+    else: break
+  while self.buf[self.bufpos] != '\0':
+    let oldlen = self.row.len
+    if oldlen < col + 1:
+      setLen(self.row, col + 1)
+      self.row[col] = ""
+    parseField(self, self.row[col])
     inc(col)
-    if my.buf[my.bufpos] == my.sep:
-      inc(my.bufpos)
+    if self.buf[self.bufpos] == self.sep:
+      inc(self.bufpos)
     else:
-      case my.buf[my.bufpos]
+      case self.buf[self.bufpos]
       of '\c', '\l':
         # skip empty lines:
         while true:
-          case my.buf[my.bufpos]
-          of '\c': my.bufpos = handleCR(my, my.bufpos)
-          of '\l': my.bufpos = handleLF(my, my.bufpos)
+          case self.buf[self.bufpos]
+          of '\c': self.bufpos = handleCR(self, self.bufpos)
+          of '\l': self.bufpos = handleLF(self, self.bufpos)
           else: break
       of '\0': discard
-      else: error(my, my.bufpos, my.sep & " expected")
+      else: error(self, self.bufpos, self.sep & " expected")
       break
 
-  setLen(my.row, col)
+  setLen(self.row, col)
   result = col > 0
   if result and col != columns and columns > 0:
-    error(my, oldpos+1, $columns & " columns expected, but found " &
+    error(self, oldpos + 1, $columns & " columns expected, but found " &
           $col & " columns")
-  inc(my.currRow)
+  inc(self.currRow)
 
-proc close*(my: var CsvParser) {.inline.} =
-  ## closes the parser `my` and its associated input stream.
-  lexbase.close(my)
+proc close*(self: var CsvParser) {.inline.} =
+  ## Closes the parser `self` and its associated input stream.
+  lexbase.close(self)
 
-proc readHeaderRow*(my: var CsvParser) =
+proc readHeaderRow*(self: var CsvParser) =
   ## Reads the first row and creates a look-up table for column numbers
-  ## See also `rowEntry <#rowEntry.CsvParser.string>`_.
-  let present = my.readRow()
+  ## See also:
+  ## * `rowEntry proc <#rowEntry,CsvParser,string>`_
+  runnableExamples:
+    import std/streams
+
+    var strm = newStringStream("One,Two,Three\n1,2,3")
+    var parser: CsvParser
+    parser.open(strm, "tmp.csv")
+
+    parser.readHeaderRow()
+    doAssert parser.headers == @["One", "Two", "Three"]
+    doAssert parser.row == @["One", "Two", "Three"]
+
+    doAssert parser.readRow()
+    doAssert parser.headers == @["One", "Two", "Three"]
+    doAssert parser.row == @["1", "2", "3"]
+
+    parser.close()
+    strm.close()
+
+  let present = self.readRow()
   if present:
-    my.headers = my.row
+    self.headers = self.row
 
-proc rowEntry*(my: var CsvParser, entry: string): var string =
-  ## Acceses a specified `entry` from the current row.
+proc rowEntry*(self: var CsvParser, entry: string): var string =
+  ## Accesses a specified `entry` from the current row.
   ##
-  ## Assumes that `readHeaderRow <#readHeaderRow.CsvParser>`_ has already been
+  ## Assumes that `readHeaderRow <#readHeaderRow,CsvParser>`_ has already been
   ## called.
-  let index = my.headers.find(entry)
+  ##
+  ## If specified `entry` does not exist, raises KeyError.
+  runnableExamples:
+    import std/streams
+    var strm = newStringStream("One,Two,Three\n1,2,3\n\n10,20,30")
+    var parser: CsvParser
+    parser.open(strm, "tmp.csv")
+    ## Requires calling `readHeaderRow`.
+    parser.readHeaderRow()
+    doAssert parser.readRow()
+    doAssert parser.rowEntry("One") == "1"
+    doAssert parser.rowEntry("Two") == "2"
+    doAssert parser.rowEntry("Three") == "3"
+    doAssertRaises(KeyError):
+      discard parser.rowEntry("NonexistentEntry")
+    parser.close()
+    strm.close()
+
+  let index = self.headers.find(entry)
   if index >= 0:
-    result = my.row[index]
+    result = self.row[index]
+  else:
+    raise newException(KeyError, "Entry `" & entry & "` doesn't exist")
 
 when not defined(testing) and isMainModule:
-  import os
+  import std/os
   var s = newFileStream(paramStr(1), fmRead)
   if s == nil: quit("cannot open the file" & paramStr(1))
   var x: CsvParser
@@ -230,36 +359,3 @@ when not defined(testing) and isMainModule:
     for val in items(x.row):
       echo "##", val, "##"
   close(x)
-
-when isMainModule:
-  import os
-  import strutils
-  block: # Tests for reading the header row
-    let content = "One,Two,Three,Four\n1,2,3,4\n10,20,30,40,\n100,200,300,400\n"
-    writeFile("temp.csv", content)
-
-    var p: CsvParser
-    p.open("temp.csv")
-    p.readHeaderRow()
-    while p.readRow():
-      let zeros = repeat('0', p.currRow-2)
-      doAssert p.rowEntry("One") == "1" & zeros
-      doAssert p.rowEntry("Two") == "2" & zeros
-      doAssert p.rowEntry("Three") == "3" & zeros
-      doAssert p.rowEntry("Four") == "4" & zeros
-    p.close()
-
-    when not defined(testing):
-      var parser: CsvParser
-      parser.open("temp.csv")
-      parser.readHeaderRow()
-      while parser.readRow():
-        echo "new row: "
-        for col in items(parser.headers):
-          echo "##", col, ":", parser.rowEntry(col), "##"
-      parser.close()
-      removeFile("temp.csv")
-
-    # Tidy up
-    removeFile("temp.csv")
-