summary refs log tree commit diff stats
path: root/lib/pure/parsecsv.nim
diff options
context:
space:
mode:
Diffstat (limited to 'lib/pure/parsecsv.nim')
-rw-r--r--[-rwxr-xr-x]lib/pure/parsecsv.nim393
1 files changed, 288 insertions, 105 deletions
diff --git a/lib/pure/parsecsv.nim b/lib/pure/parsecsv.nim
index 5970f2090..c7bf0c9c1 100755..100644
--- a/lib/pure/parsecsv.nim
+++ b/lib/pure/parsecsv.nim
@@ -1,6 +1,6 @@
 #
 #
-#            Nimrod's Runtime Library
+#            Nim's Runtime Library
 #        (c) Copyright 2009 Andreas Rumpf
 #
 #    See the file "copying.txt", included in this
@@ -8,171 +8,354 @@
 #
 
 ## This module implements a simple high performance `CSV`:idx:
-## (`comma separated value`:idx:) parser. 
+## (`comma separated value`:idx:) parser.
 ##
-## Example: How to use the parser
-## ==============================
+## Basic usage
+## ===========
 ##
-## .. code-block:: nimrod
-##   import os, parsecsv, streams
-##   var s = newFileStream(ParamStr(1), fmRead)
-##   if s == nil: quit("cannot open the file" & ParamStr(1))
-##   var x: TCsvParser
-##   open(x, s, ParamStr(1))
+##   ```nim
+##   import std/parsecsv
+##   from std/os import paramStr
+##   from std/streams import newFileStream
+##
+##   var s = newFileStream(paramStr(1), fmRead)
+##   if s == nil:
+##     quit("cannot open the file" & paramStr(1))
+##
+##   var x: CsvParser
+##   open(x, s, paramStr(1))
 ##   while readRow(x):
-##     Echo "new row: "
+##     echo "new row: "
 ##     for val in items(x.row):
-##       Echo "##", val, "##"
+##       echo "##", val, "##"
 ##   close(x)
+##   ```
+##
+## For CSV files with a header row, the header can be read and then used as a
+## reference for item access with `rowEntry <#rowEntry,CsvParser,string>`_:
+##
+##   ```nim
+##   import std/parsecsv
+##
+##   # Prepare a file
+##   let content = """One,Two,Three,Four
+##   1,2,3,4
+##   10,20,30,40
+##   100,200,300,400
+##   """
+##   writeFile("temp.csv", content)
 ##
+##   var p: CsvParser
+##   p.open("temp.csv")
+##   p.readHeaderRow()
+##   while p.readRow():
+##     echo "new row: "
+##     for col in items(p.headers):
+##       echo "##", col, ":", p.rowEntry(col), "##"
+##   p.close()
+##   ```
+##
+## See also
+## ========
+##
+## * `streams module <streams.html>`_ for using
+##   `open proc <#open,CsvParser,Stream,string,char,char,char>`_
+##   and other stream processing (like `close proc <streams.html#close,Stream>`_)
+## * `parseopt module <parseopt.html>`_ for a command line parser
+## * `parsecfg module <parsecfg.html>`_ for a configuration file parser
+## * `parsexml module <parsexml.html>`_ for a XML / HTML parser
+## * `parsesql module <parsesql.html>`_ for a SQL parser
+## * `other parsers <lib.html#pure-libraries-parsers>`_ for other parsers
 
-import
-  lexbase, streams
+import std/[lexbase, streams]
+
+when defined(nimPreviewSlimSystem):
+  import std/syncio
 
 type
-  TCsvRow* = seq[string] ## a row in a CSV file
-  TCsvParser* = object of TBaseLexer ## the parser object.
-    row*: TCsvRow                    ## the current row
+  CsvRow* = seq[string] ## A row in a CSV file.
+  CsvParser* = object of BaseLexer ## The parser object.
+                                   ##
+                                   ## It consists of two public fields:
+                                   ## * `row` is the current row
+                                   ## * `headers` are the columns that are defined in the csv file
+                                   ##   (read using `readHeaderRow <#readHeaderRow,CsvParser>`_).
+                                   ##   Used with `rowEntry <#rowEntry,CsvParser,string>`_).
+    row*: CsvRow
     filename: string
     sep, quote, esc: char
     skipWhite: bool
     currRow: int
+    headers*: seq[string]
 
-  EInvalidCsv* = object of EIO ## exception that is raised if
-                               ## a parsing error occurs
+  CsvError* = object of IOError ## An exception that is raised if
+                                ## a parsing error occurs.
 
-proc raiseEInvalidCsv(filename: string, line, col: int, 
+proc raiseEInvalidCsv(filename: string, line, col: int,
                       msg: string) {.noreturn.} =
-  var e: ref EInvalidCsv
+  var e: ref CsvError
   new(e)
-  e.msg = filename & "(" & $line & ", " & $col & ") Error: " & msg
+  if filename.len == 0:
+    e.msg = "Error: " & msg
+  else:
+    e.msg = filename & "(" & $line & ", " & $col & ") Error: " & msg
   raise e
 
-proc error(my: TCsvParser, pos: int, msg: string) = 
-  raiseEInvalidCsv(my.filename, my.LineNumber, getColNumber(my, pos), msg)
+proc error(self: CsvParser, pos: int, msg: string) =
+  raiseEInvalidCsv(self.filename, self.lineNumber, getColNumber(self, pos), msg)
 
-proc open*(my: var TCsvParser, input: PStream, filename: string,
+proc open*(self: var CsvParser, input: Stream, filename: string,
            separator = ',', quote = '"', escape = '\0',
            skipInitialSpace = false) =
-  ## initializes the parser with an input stream. `Filename` is only used
+  ## Initializes the parser with an input stream. `Filename` is only used
   ## for nice error messages. The parser's behaviour can be controlled by
   ## the diverse optional parameters:
   ## - `separator`: character used to separate fields
-  ## - `quote`: Used to quote fields containing special characters like 
-  ##   `separator`, `quote` or new-line characters. '\0' disables the parsing
+  ## - `quote`: Used to quote fields containing special characters like
+  ##   `separator`, `quote` or new-line characters. '\\0' disables the parsing
   ##   of quotes.
-  ## - `escape`: removes any special meaning from the following character; 
-  ##   '\0' disables escaping; if escaping is disabled and `quote` is not '\0',
+  ## - `escape`: removes any special meaning from the following character;
+  ##   '\\0' disables escaping; if escaping is disabled and `quote` is not '\\0',
   ##   two `quote` characters are parsed one literal `quote` character.
-  ## - `skipInitialSpace`: If true, whitespace immediately following the 
+  ## - `skipInitialSpace`: If true, whitespace immediately following the
   ##   `separator` is ignored.
-  lexbase.open(my, input)
-  my.filename = filename
-  my.sep = separator
-  my.quote = quote
-  my.esc = escape
-  my.skipWhite = skipInitialSpace
-  my.row = @[]
-  my.currRow = 0
-
-proc parseField(my: var TCsvParser, a: var string) = 
-  var pos = my.bufpos
-  var buf = my.buf
-  if my.skipWhite:
-    while buf[pos] in {' ', '\t'}: inc(pos)
+  ##
+  ## See also:
+  ## * `open proc <#open,CsvParser,string,char,char,char>`_ which creates the
+  ##   file stream for you
+  runnableExamples:
+    import std/streams
+    var strm = newStringStream("One,Two,Three\n1,2,3\n10,20,30")
+    var parser: CsvParser
+    parser.open(strm, "tmp.csv")
+    parser.close()
+    strm.close()
+
+  lexbase.open(self, input)
+  self.filename = filename
+  self.sep = separator
+  self.quote = quote
+  self.esc = escape
+  self.skipWhite = skipInitialSpace
+
+proc open*(self: var CsvParser, filename: string,
+           separator = ',', quote = '"', escape = '\0',
+           skipInitialSpace = false) =
+  ## Similar to the `other open proc<#open,CsvParser,Stream,string,char,char,char>`_,
+  ## but creates the file stream for you.
+  runnableExamples:
+    from std/os import removeFile
+    writeFile("tmp.csv", "One,Two,Three\n1,2,3\n10,20,300")
+    var parser: CsvParser
+    parser.open("tmp.csv")
+    parser.close()
+    removeFile("tmp.csv")
+
+  var s = newFileStream(filename, fmRead)
+  if s == nil: self.error(0, "cannot open: " & filename)
+  open(self, s, filename, separator,
+       quote, escape, skipInitialSpace)
+
+proc parseField(self: var CsvParser, a: var string) =
+  var pos = self.bufpos
+  if self.skipWhite:
+    while self.buf[pos] in {' ', '\t'}: inc(pos)
   setLen(a, 0) # reuse memory
-  if buf[pos] == my.quote and my.quote != '\0': 
+  if self.buf[pos] == self.quote and self.quote != '\0':
     inc(pos)
-    while true: 
-      var c = buf[pos]
+    while true:
+      let c = self.buf[pos]
       if c == '\0':
-        my.bufpos = pos # can continue after exception?
-        error(my, pos, my.quote & " expected")
+        self.bufpos = pos # can continue after exception?
+        error(self, pos, self.quote & " expected")
         break
-      elif c == my.quote: 
-        if my.esc == '\0' and buf[pos+1] == my.quote:
-          add(a, my.quote)
+      elif c == self.quote:
+        if self.esc == '\0' and self.buf[pos + 1] == self.quote:
+          add(a, self.quote)
           inc(pos, 2)
         else:
           inc(pos)
           break
-      elif c == my.esc:
-        add(a, buf[pos+1])
+      elif c == self.esc:
+        add(a, self.buf[pos + 1])
         inc(pos, 2)
       else:
         case c
-        of '\c': 
-          pos = handleCR(my, pos)
-          buf = my.buf
+        of '\c':
+          pos = handleCR(self, pos)
           add(a, "\n")
-        of '\l': 
-          pos = handleLF(my, pos)
-          buf = my.buf
+        of '\l':
+          pos = handleLF(self, pos)
           add(a, "\n")
         else:
           add(a, c)
           inc(pos)
   else:
     while true:
-      var c = buf[pos]
-      if c == my.sep: break
+      let c = self.buf[pos]
+      if c == self.sep: break
       if c in {'\c', '\l', '\0'}: break
       add(a, c)
       inc(pos)
-  my.bufpos = pos
+  self.bufpos = pos
+
+proc processedRows*(self: var CsvParser): int {.inline.} =
+  ## Returns number of the processed rows.
+  ##
+  ## But even if `readRow <#readRow,CsvParser,int>`_ arrived at EOF then
+  ## processed rows counter is incremented.
+  runnableExamples:
+    import std/streams
 
-proc processedRows*(my: var TCsvParser): int = 
-  ## returns number of the processed rows
-  return my.currRow
+    var strm = newStringStream("One,Two,Three\n1,2,3")
+    var parser: CsvParser
+    parser.open(strm, "tmp.csv")
+    doAssert parser.readRow()
+    doAssert parser.processedRows() == 1
+    doAssert parser.readRow()
+    doAssert parser.processedRows() == 2
+    ## Even if `readRow` arrived at EOF then `processedRows` is incremented.
+    doAssert parser.readRow() == false
+    doAssert parser.processedRows() == 3
+    doAssert parser.readRow() == false
+    doAssert parser.processedRows() == 4
+    parser.close()
+    strm.close()
 
-proc readRow*(my: var TCsvParser, columns = 0): bool = 
-  ## reads the next row; if `columns` > 0, it expects the row to have
+  self.currRow
+
+proc readRow*(self: var CsvParser, columns = 0): bool =
+  ## Reads the next row; if `columns` > 0, it expects the row to have
   ## exactly this many columns. Returns false if the end of the file
   ## has been encountered else true.
+  ##
+  ## Blank lines are skipped.
+  runnableExamples:
+    import std/streams
+    var strm = newStringStream("One,Two,Three\n1,2,3\n\n10,20,30")
+    var parser: CsvParser
+    parser.open(strm, "tmp.csv")
+    doAssert parser.readRow()
+    doAssert parser.row == @["One", "Two", "Three"]
+    doAssert parser.readRow()
+    doAssert parser.row == @["1", "2", "3"]
+    ## Blank lines are skipped.
+    doAssert parser.readRow()
+    doAssert parser.row == @["10", "20", "30"]
+
+    var emptySeq: seq[string]
+    doAssert parser.readRow() == false
+    doAssert parser.row == emptySeq
+    doAssert parser.readRow() == false
+    doAssert parser.row == emptySeq
+
+    parser.close()
+    strm.close()
+
   var col = 0 # current column
-  var oldpos = my.bufpos
-  while my.buf[my.bufpos] != '\0':
-    var oldlen = my.row.len
-    if oldlen < col+1:
-      setLen(my.row, col+1)
-      my.row[col] = ""
-    parseField(my, my.row[col])
+  let oldpos = self.bufpos
+  # skip initial empty lines #8365
+  while true:
+    case self.buf[self.bufpos]
+    of '\c': self.bufpos = handleCR(self, self.bufpos)
+    of '\l': self.bufpos = handleLF(self, self.bufpos)
+    else: break
+  while self.buf[self.bufpos] != '\0':
+    let oldlen = self.row.len
+    if oldlen < col + 1:
+      setLen(self.row, col + 1)
+      self.row[col] = ""
+    parseField(self, self.row[col])
     inc(col)
-    if my.buf[my.bufpos] == my.sep: 
-      inc(my.bufpos)
+    if self.buf[self.bufpos] == self.sep:
+      inc(self.bufpos)
     else:
-      case my.buf[my.bufpos]
-      of '\c', '\l': 
+      case self.buf[self.bufpos]
+      of '\c', '\l':
         # skip empty lines:
-        while true: 
-          case my.buf[my.bufpos]
-          of '\c': my.bufpos = handleCR(my, my.bufpos)
-          of '\l': my.bufpos = handleLF(my, my.bufpos)
+        while true:
+          case self.buf[self.bufpos]
+          of '\c': self.bufpos = handleCR(self, self.bufpos)
+          of '\l': self.bufpos = handleLF(self, self.bufpos)
           else: break
-      of '\0': nil
-      else: error(my, my.bufpos, my.sep & " expected")
+      of '\0': discard
+      else: error(self, self.bufpos, self.sep & " expected")
       break
-  
-  setlen(my.row, col)
+
+  setLen(self.row, col)
   result = col > 0
-  if result and col != columns and columns > 0: 
-    error(my, oldpos+1, $columns & " columns expected, but found " & 
+  if result and col != columns and columns > 0:
+    error(self, oldpos + 1, $columns & " columns expected, but found " &
           $col & " columns")
-  inc(my.currRow)
-  
-proc close*(my: var TCsvParser) {.inline.} = 
-  ## closes the parser `my` and its associated input stream.
-  lexbase.close(my)
-
-when isMainModule:
-  import os
-  var s = newFileStream(ParamStr(1), fmRead)
-  if s == nil: quit("cannot open the file" & ParamStr(1))
-  var x: TCsvParser
-  open(x, s, ParamStr(1))
+  inc(self.currRow)
+
+proc close*(self: var CsvParser) {.inline.} =
+  ## Closes the parser `self` and its associated input stream.
+  lexbase.close(self)
+
+proc readHeaderRow*(self: var CsvParser) =
+  ## Reads the first row and creates a look-up table for column numbers
+  ## See also:
+  ## * `rowEntry proc <#rowEntry,CsvParser,string>`_
+  runnableExamples:
+    import std/streams
+
+    var strm = newStringStream("One,Two,Three\n1,2,3")
+    var parser: CsvParser
+    parser.open(strm, "tmp.csv")
+
+    parser.readHeaderRow()
+    doAssert parser.headers == @["One", "Two", "Three"]
+    doAssert parser.row == @["One", "Two", "Three"]
+
+    doAssert parser.readRow()
+    doAssert parser.headers == @["One", "Two", "Three"]
+    doAssert parser.row == @["1", "2", "3"]
+
+    parser.close()
+    strm.close()
+
+  let present = self.readRow()
+  if present:
+    self.headers = self.row
+
+proc rowEntry*(self: var CsvParser, entry: string): var string =
+  ## Accesses a specified `entry` from the current row.
+  ##
+  ## Assumes that `readHeaderRow <#readHeaderRow,CsvParser>`_ has already been
+  ## called.
+  ##
+  ## If specified `entry` does not exist, raises KeyError.
+  runnableExamples:
+    import std/streams
+    var strm = newStringStream("One,Two,Three\n1,2,3\n\n10,20,30")
+    var parser: CsvParser
+    parser.open(strm, "tmp.csv")
+    ## Requires calling `readHeaderRow`.
+    parser.readHeaderRow()
+    doAssert parser.readRow()
+    doAssert parser.rowEntry("One") == "1"
+    doAssert parser.rowEntry("Two") == "2"
+    doAssert parser.rowEntry("Three") == "3"
+    doAssertRaises(KeyError):
+      discard parser.rowEntry("NonexistentEntry")
+    parser.close()
+    strm.close()
+
+  let index = self.headers.find(entry)
+  if index >= 0:
+    result = self.row[index]
+  else:
+    raise newException(KeyError, "Entry `" & entry & "` doesn't exist")
+
+when not defined(testing) and isMainModule:
+  import std/os
+  var s = newFileStream(paramStr(1), fmRead)
+  if s == nil: quit("cannot open the file" & paramStr(1))
+  var x: CsvParser
+  open(x, s, paramStr(1))
   while readRow(x):
-    Echo "new row: "
+    echo "new row: "
     for val in items(x.row):
-      Echo "##", val, "##"
+      echo "##", val, "##"
   close(x)
-