summary refs log tree commit diff stats
path: root/lib/pure/parsecsv.nim
diff options
context:
space:
mode:
Diffstat (limited to 'lib/pure/parsecsv.nim')
-rw-r--r--lib/pure/parsecsv.nim170
1 files changed, 147 insertions, 23 deletions
diff --git a/lib/pure/parsecsv.nim b/lib/pure/parsecsv.nim
index 796114d37..e0c4f38a4 100644
--- a/lib/pure/parsecsv.nim
+++ b/lib/pure/parsecsv.nim
@@ -10,13 +10,18 @@
 ## This module implements a simple high performance `CSV`:idx:
 ## (`comma separated value`:idx:) parser.
 ##
-## Example: How to use the parser
-## ==============================
+## Basic usage
+## ===========
 ##
 ## .. code-block:: nim
-##   import os, parsecsv, streams
+##   import parsecsv
+##   from os import paramStr
+##   from streams import newFileStream
+##
 ##   var s = newFileStream(paramStr(1), fmRead)
-##   if s == nil: quit("cannot open the file" & paramStr(1))
+##   if s == nil:
+##     quit("cannot open the file" & paramStr(1))
+##
 ##   var x: CsvParser
 ##   open(x, s, paramStr(1))
 ##   while readRow(x):
@@ -26,11 +31,11 @@
 ##   close(x)
 ##
 ## For CSV files with a header row, the header can be read and then used as a
-## reference for item access with `rowEntry <#rowEntry.CsvParser.string>`_:
+## reference for item access with `rowEntry <#rowEntry,CsvParser,string>`_:
 ##
 ## .. code-block:: nim
 ##   import parsecsv
-##   import os
+##
 ##   # Prepare a file
 ##   let content = """One,Two,Three,Four
 ##   1,2,3,4
@@ -47,24 +52,40 @@
 ##     for col in items(p.headers):
 ##       echo "##", col, ":", p.rowEntry(col), "##"
 ##   p.close()
+##
+## See also
+## ========
+##
+## * `streams module <streams.html>`_ for using
+##   `open proc <#open,CsvParser,Stream,string,Char,Char,Char>`_
+##   and other stream processing (like `close proc <streams.html#close,Stream>`_)
+## * `parseopt module <parseopt.html>`_ for a command line parser
+## * `parsecfg module <parsecfg.html>`_ for a configuration file parser
+## * `parsexml module <parsexml.html>`_ for a XML / HTML parser
+## * `parsesql module <parsesql.html>`_ for a SQL parser
+## * `other parsers <lib.html#pure-libraries-parsers>`_ for other parsers
 
 import
   lexbase, streams
 
 type
-  CsvRow* = seq[string] ## a row in a CSV file
-  CsvParser* = object of BaseLexer ## the parser object.
-    row*: CsvRow                    ## the current row
+  CsvRow* = seq[string] ## A row in a CSV file.
+  CsvParser* = object of BaseLexer ## The parser object.
+    ##
+    ## It consists of two public fields:
+    ## * `row` is the current row
+    ## * `headers` are the columns that are defined in the csv file
+    ##   (read using `readHeaderRow <#readHeaderRow,CsvParser>`_).
+    ##   Used with `rowEntry <#rowEntry,CsvParser,string>`_).
+    row*: CsvRow
     filename: string
     sep, quote, esc: char
     skipWhite: bool
     currRow: int
-    headers*: seq[string] ## The columns that are defined in the csv file
-                          ## (read using `readHeaderRow <#readHeaderRow.CsvParser>`_).
-                          ## Used with `rowEntry <#rowEntry.CsvParser.string>`_).
+    headers*: seq[string]
 
-  CsvError* = object of IOError ## exception that is raised if
-                                ## a parsing error occurs
+  CsvError* = object of IOError ## An exception that is raised if
+                                ## a parsing error occurs.
 
 proc raiseEInvalidCsv(filename: string, line, col: int,
                       msg: string) {.noreturn.} =
@@ -82,7 +103,7 @@ proc error(my: CsvParser, pos: int, msg: string) =
 proc open*(my: var CsvParser, input: Stream, filename: string,
            separator = ',', quote = '"', escape = '\0',
            skipInitialSpace = false) =
-  ## initializes the parser with an input stream. `Filename` is only used
+  ## Initializes the parser with an input stream. `Filename` is only used
   ## for nice error messages. The parser's behaviour can be controlled by
   ## the diverse optional parameters:
   ## - `separator`: character used to separate fields
@@ -94,6 +115,18 @@ proc open*(my: var CsvParser, input: Stream, filename: string,
   ##   two `quote` characters are parsed one literal `quote` character.
   ## - `skipInitialSpace`: If true, whitespace immediately following the
   ##   `separator` is ignored.
+  ##
+  ## See also:
+  ## * `open proc <#open,CsvParser,string,Char,Char,Char>`_ which creates the
+  ##   file stream for you
+  runnableExamples:
+    import streams
+    var strm = newStringStream("One,Two,Three\n1,2,3\n10,20,30")
+    var parser: CsvParser
+    parser.open(strm, "tmp.csv")
+    parser.close()
+    strm.close()
+
   lexbase.open(my, input)
   my.filename = filename
   my.sep = separator
@@ -106,7 +139,16 @@ proc open*(my: var CsvParser, input: Stream, filename: string,
 proc open*(my: var CsvParser, filename: string,
            separator = ',', quote = '"', escape = '\0',
            skipInitialSpace = false) =
-  ## same as the other `open` but creates the file stream for you.
+  ## Similar to the `other open proc<#open,CsvParser,Stream,string,Char,Char,Char>`_,
+  ## but creates the file stream for you.
+  runnableExamples:
+    from os import removeFile
+    writeFile("tmp.csv", "One,Two,Three\n1,2,3\n10,20,300")
+    var parser: CsvParser
+    parser.open("tmp.csv")
+    parser.close()
+    removeFile("tmp.csv")
+
   var s = newFileStream(filename, fmRead)
   if s == nil: my.error(0, "cannot open: " & filename)
   open(my, s, filename, separator,
@@ -159,17 +201,66 @@ proc parseField(my: var CsvParser, a: var string) =
   my.bufpos = pos
 
 proc processedRows*(my: var CsvParser): int =
-  ## returns number of the processed rows
+  ## Returns number of the processed rows.
+  ##
+  ## But even if `readRow <#readRow,CsvParser,int>`_ arrived at EOF then
+  ## processed rows counter is incremented.
+  runnableExamples:
+    import streams
+
+    var strm = newStringStream("One,Two,Three\n1,2,3")
+    var parser: CsvParser
+    parser.open(strm, "tmp.csv")
+    doAssert parser.readRow()
+    doAssert parser.processedRows() == 1
+    doAssert parser.readRow()
+    doAssert parser.processedRows() == 2
+    ## Even if `readRow` arrived at EOF then `processedRows` is incremented.
+    doAssert parser.readRow() == false
+    doAssert parser.processedRows() == 3
+    doAssert parser.readRow() == false
+    doAssert parser.processedRows() == 4
+    parser.close()
+    strm.close()
+
   return my.currRow
 
 proc readRow*(my: var CsvParser, columns = 0): bool =
-  ## reads the next row; if `columns` > 0, it expects the row to have
+  ## Reads the next row; if `columns` > 0, it expects the row to have
   ## exactly this many columns. Returns false if the end of the file
   ## has been encountered else true.
   ##
   ## Blank lines are skipped.
+  runnableExamples:
+    import streams
+    var strm = newStringStream("One,Two,Three\n1,2,3\n\n10,20,30")
+    var parser: CsvParser
+    parser.open(strm, "tmp.csv")
+    doAssert parser.readRow()
+    doAssert parser.row == @["One", "Two", "Three"]
+    doAssert parser.readRow()
+    doAssert parser.row == @["1", "2", "3"]
+    ## Blank lines are skipped.
+    doAssert parser.readRow()
+    doAssert parser.row == @["10", "20", "30"]
+
+    var emptySeq: seq[string]
+    doAssert parser.readRow() == false
+    doAssert parser.row == emptySeq
+    doAssert parser.readRow() == false
+    doAssert parser.row == emptySeq
+
+    parser.close()
+    strm.close()
+
   var col = 0 # current column
   let oldpos = my.bufpos
+  # skip initial empty lines #8365
+  while true:
+    case my.buf[my.bufpos]
+    of '\c': my.bufpos = handleCR(my, my.bufpos)
+    of '\l': my.bufpos = handleLF(my, my.bufpos)
+    else: break
   while my.buf[my.bufpos] != '\0':
     let oldlen = my.row.len
     if oldlen < col+1:
@@ -200,12 +291,31 @@ proc readRow*(my: var CsvParser, columns = 0): bool =
   inc(my.currRow)
 
 proc close*(my: var CsvParser) {.inline.} =
-  ## closes the parser `my` and its associated input stream.
+  ## Closes the parser `my` and its associated input stream.
   lexbase.close(my)
 
 proc readHeaderRow*(my: var CsvParser) =
   ## Reads the first row and creates a look-up table for column numbers
-  ## See also `rowEntry <#rowEntry.CsvParser.string>`_.
+  ## See also:
+  ## * `rowEntry proc <#rowEntry,CsvParser,string>`_
+  runnableExamples:
+    import streams
+
+    var strm = newStringStream("One,Two,Three\n1,2,3")
+    var parser: CsvParser
+    parser.open(strm, "tmp.csv")
+
+    parser.readHeaderRow()
+    doAssert parser.headers == @["One", "Two", "Three"]
+    doAssert parser.row == @["One", "Two", "Three"]
+
+    doAssert parser.readRow()
+    doAssert parser.headers == @["One", "Two", "Three"]
+    doAssert parser.row == @["1", "2", "3"]
+
+    parser.close()
+    strm.close()
+
   let present = my.readRow()
   if present:
     my.headers = my.row
@@ -213,8 +323,23 @@ proc readHeaderRow*(my: var CsvParser) =
 proc rowEntry*(my: var CsvParser, entry: string): var string =
   ## Acceses a specified `entry` from the current row.
   ##
-  ## Assumes that `readHeaderRow <#readHeaderRow.CsvParser>`_ has already been
+  ## Assumes that `readHeaderRow <#readHeaderRow,CsvParser>`_ has already been
   ## called.
+  runnableExamples:
+    import streams
+    var strm = newStringStream("One,Two,Three\n1,2,3\n\n10,20,30")
+    var parser: CsvParser
+    parser.open(strm, "tmp.csv")
+    ## Need calling `readHeaderRow`.
+    parser.readHeaderRow()
+    doAssert parser.readRow()
+    doAssert parser.rowEntry("One") == "1"
+    doAssert parser.rowEntry("Two") == "2"
+    doAssert parser.rowEntry("Three") == "3"
+    ## `parser.rowEntry("NotExistEntry")` causes SIGSEGV fault.
+    parser.close()
+    strm.close()
+
   let index = my.headers.find(entry)
   if index >= 0:
     result = my.row[index]
@@ -235,7 +360,7 @@ when isMainModule:
   import os
   import strutils
   block: # Tests for reading the header row
-    let content = "One,Two,Three,Four\n1,2,3,4\n10,20,30,40,\n100,200,300,400\n"
+    let content = "\nOne,Two,Three,Four\n1,2,3,4\n10,20,30,40,\n100,200,300,400\n"
     writeFile("temp.csv", content)
 
     var p: CsvParser
@@ -262,4 +387,3 @@ when isMainModule:
 
     # Tidy up
     removeFile("temp.csv")
-