diff options
Diffstat (limited to 'lib/pure/parsecsv.nim')
-rw-r--r-- | lib/pure/parsecsv.nim | 348 |
1 files changed, 222 insertions, 126 deletions
diff --git a/lib/pure/parsecsv.nim b/lib/pure/parsecsv.nim index 796114d37..c7bf0c9c1 100644 --- a/lib/pure/parsecsv.nim +++ b/lib/pure/parsecsv.nim @@ -10,13 +10,18 @@ ## This module implements a simple high performance `CSV`:idx: ## (`comma separated value`:idx:) parser. ## -## Example: How to use the parser -## ============================== +## Basic usage +## =========== +## +## ```nim +## import std/parsecsv +## from std/os import paramStr +## from std/streams import newFileStream ## -## .. code-block:: nim -## import os, parsecsv, streams ## var s = newFileStream(paramStr(1), fmRead) -## if s == nil: quit("cannot open the file" & paramStr(1)) +## if s == nil: +## quit("cannot open the file" & paramStr(1)) +## ## var x: CsvParser ## open(x, s, paramStr(1)) ## while readRow(x): @@ -24,13 +29,14 @@ ## for val in items(x.row): ## echo "##", val, "##" ## close(x) +## ``` ## ## For CSV files with a header row, the header can be read and then used as a -## reference for item access with `rowEntry <#rowEntry.CsvParser.string>`_: +## reference for item access with `rowEntry <#rowEntry,CsvParser,string>`_: +## +## ```nim +## import std/parsecsv ## -## .. code-block:: nim -## import parsecsv -## import os ## # Prepare a file ## let content = """One,Two,Three,Four ## 1,2,3,4 @@ -47,24 +53,43 @@ ## for col in items(p.headers): ## echo "##", col, ":", p.rowEntry(col), "##" ## p.close() +## ``` +## +## See also +## ======== +## +## * `streams module <streams.html>`_ for using +## `open proc <#open,CsvParser,Stream,string,char,char,char>`_ +## and other stream processing (like `close proc <streams.html#close,Stream>`_) +## * `parseopt module <parseopt.html>`_ for a command line parser +## * `parsecfg module <parsecfg.html>`_ for a configuration file parser +## * `parsexml module <parsexml.html>`_ for a XML / HTML parser +## * `parsesql module <parsesql.html>`_ for a SQL parser +## * `other parsers <lib.html#pure-libraries-parsers>`_ for other parsers + +import std/[lexbase, streams] -import - lexbase, streams +when defined(nimPreviewSlimSystem): + import std/syncio type - CsvRow* = seq[string] ## a row in a CSV file - CsvParser* = object of BaseLexer ## the parser object. - row*: CsvRow ## the current row + CsvRow* = seq[string] ## A row in a CSV file. + CsvParser* = object of BaseLexer ## The parser object. + ## + ## It consists of two public fields: + ## * `row` is the current row + ## * `headers` are the columns that are defined in the csv file + ## (read using `readHeaderRow <#readHeaderRow,CsvParser>`_). + ## Used with `rowEntry <#rowEntry,CsvParser,string>`_). + row*: CsvRow filename: string sep, quote, esc: char skipWhite: bool currRow: int - headers*: seq[string] ## The columns that are defined in the csv file - ## (read using `readHeaderRow <#readHeaderRow.CsvParser>`_). - ## Used with `rowEntry <#rowEntry.CsvParser.string>`_). + headers*: seq[string] - CsvError* = object of IOError ## exception that is raised if - ## a parsing error occurs + CsvError* = object of IOError ## An exception that is raised if + ## a parsing error occurs. proc raiseEInvalidCsv(filename: string, line, col: int, msg: string) {.noreturn.} = @@ -76,151 +101,255 @@ proc raiseEInvalidCsv(filename: string, line, col: int, e.msg = filename & "(" & $line & ", " & $col & ") Error: " & msg raise e -proc error(my: CsvParser, pos: int, msg: string) = - raiseEInvalidCsv(my.filename, my.lineNumber, getColNumber(my, pos), msg) +proc error(self: CsvParser, pos: int, msg: string) = + raiseEInvalidCsv(self.filename, self.lineNumber, getColNumber(self, pos), msg) -proc open*(my: var CsvParser, input: Stream, filename: string, +proc open*(self: var CsvParser, input: Stream, filename: string, separator = ',', quote = '"', escape = '\0', skipInitialSpace = false) = - ## initializes the parser with an input stream. `Filename` is only used + ## Initializes the parser with an input stream. `Filename` is only used ## for nice error messages. The parser's behaviour can be controlled by ## the diverse optional parameters: ## - `separator`: character used to separate fields ## - `quote`: Used to quote fields containing special characters like - ## `separator`, `quote` or new-line characters. '\0' disables the parsing + ## `separator`, `quote` or new-line characters. '\\0' disables the parsing ## of quotes. ## - `escape`: removes any special meaning from the following character; - ## '\0' disables escaping; if escaping is disabled and `quote` is not '\0', + ## '\\0' disables escaping; if escaping is disabled and `quote` is not '\\0', ## two `quote` characters are parsed one literal `quote` character. ## - `skipInitialSpace`: If true, whitespace immediately following the ## `separator` is ignored. - lexbase.open(my, input) - my.filename = filename - my.sep = separator - my.quote = quote - my.esc = escape - my.skipWhite = skipInitialSpace - my.row = @[] - my.currRow = 0 + ## + ## See also: + ## * `open proc <#open,CsvParser,string,char,char,char>`_ which creates the + ## file stream for you + runnableExamples: + import std/streams + var strm = newStringStream("One,Two,Three\n1,2,3\n10,20,30") + var parser: CsvParser + parser.open(strm, "tmp.csv") + parser.close() + strm.close() -proc open*(my: var CsvParser, filename: string, + lexbase.open(self, input) + self.filename = filename + self.sep = separator + self.quote = quote + self.esc = escape + self.skipWhite = skipInitialSpace + +proc open*(self: var CsvParser, filename: string, separator = ',', quote = '"', escape = '\0', skipInitialSpace = false) = - ## same as the other `open` but creates the file stream for you. + ## Similar to the `other open proc<#open,CsvParser,Stream,string,char,char,char>`_, + ## but creates the file stream for you. + runnableExamples: + from std/os import removeFile + writeFile("tmp.csv", "One,Two,Three\n1,2,3\n10,20,300") + var parser: CsvParser + parser.open("tmp.csv") + parser.close() + removeFile("tmp.csv") + var s = newFileStream(filename, fmRead) - if s == nil: my.error(0, "cannot open: " & filename) - open(my, s, filename, separator, + if s == nil: self.error(0, "cannot open: " & filename) + open(self, s, filename, separator, quote, escape, skipInitialSpace) -proc parseField(my: var CsvParser, a: var string) = - var pos = my.bufpos - var buf = my.buf - if my.skipWhite: - while buf[pos] in {' ', '\t'}: inc(pos) +proc parseField(self: var CsvParser, a: var string) = + var pos = self.bufpos + if self.skipWhite: + while self.buf[pos] in {' ', '\t'}: inc(pos) setLen(a, 0) # reuse memory - if buf[pos] == my.quote and my.quote != '\0': + if self.buf[pos] == self.quote and self.quote != '\0': inc(pos) while true: - let c = buf[pos] + let c = self.buf[pos] if c == '\0': - my.bufpos = pos # can continue after exception? - error(my, pos, my.quote & " expected") + self.bufpos = pos # can continue after exception? + error(self, pos, self.quote & " expected") break - elif c == my.quote: - if my.esc == '\0' and buf[pos+1] == my.quote: - add(a, my.quote) + elif c == self.quote: + if self.esc == '\0' and self.buf[pos + 1] == self.quote: + add(a, self.quote) inc(pos, 2) else: inc(pos) break - elif c == my.esc: - add(a, buf[pos+1]) + elif c == self.esc: + add(a, self.buf[pos + 1]) inc(pos, 2) else: case c of '\c': - pos = handleCR(my, pos) - buf = my.buf + pos = handleCR(self, pos) add(a, "\n") of '\l': - pos = handleLF(my, pos) - buf = my.buf + pos = handleLF(self, pos) add(a, "\n") else: add(a, c) inc(pos) else: while true: - let c = buf[pos] - if c == my.sep: break + let c = self.buf[pos] + if c == self.sep: break if c in {'\c', '\l', '\0'}: break add(a, c) inc(pos) - my.bufpos = pos + self.bufpos = pos + +proc processedRows*(self: var CsvParser): int {.inline.} = + ## Returns number of the processed rows. + ## + ## But even if `readRow <#readRow,CsvParser,int>`_ arrived at EOF then + ## processed rows counter is incremented. + runnableExamples: + import std/streams -proc processedRows*(my: var CsvParser): int = - ## returns number of the processed rows - return my.currRow + var strm = newStringStream("One,Two,Three\n1,2,3") + var parser: CsvParser + parser.open(strm, "tmp.csv") + doAssert parser.readRow() + doAssert parser.processedRows() == 1 + doAssert parser.readRow() + doAssert parser.processedRows() == 2 + ## Even if `readRow` arrived at EOF then `processedRows` is incremented. + doAssert parser.readRow() == false + doAssert parser.processedRows() == 3 + doAssert parser.readRow() == false + doAssert parser.processedRows() == 4 + parser.close() + strm.close() -proc readRow*(my: var CsvParser, columns = 0): bool = - ## reads the next row; if `columns` > 0, it expects the row to have + self.currRow + +proc readRow*(self: var CsvParser, columns = 0): bool = + ## Reads the next row; if `columns` > 0, it expects the row to have ## exactly this many columns. Returns false if the end of the file ## has been encountered else true. ## ## Blank lines are skipped. + runnableExamples: + import std/streams + var strm = newStringStream("One,Two,Three\n1,2,3\n\n10,20,30") + var parser: CsvParser + parser.open(strm, "tmp.csv") + doAssert parser.readRow() + doAssert parser.row == @["One", "Two", "Three"] + doAssert parser.readRow() + doAssert parser.row == @["1", "2", "3"] + ## Blank lines are skipped. + doAssert parser.readRow() + doAssert parser.row == @["10", "20", "30"] + + var emptySeq: seq[string] + doAssert parser.readRow() == false + doAssert parser.row == emptySeq + doAssert parser.readRow() == false + doAssert parser.row == emptySeq + + parser.close() + strm.close() + var col = 0 # current column - let oldpos = my.bufpos - while my.buf[my.bufpos] != '\0': - let oldlen = my.row.len - if oldlen < col+1: - setLen(my.row, col+1) - my.row[col] = "" - parseField(my, my.row[col]) + let oldpos = self.bufpos + # skip initial empty lines #8365 + while true: + case self.buf[self.bufpos] + of '\c': self.bufpos = handleCR(self, self.bufpos) + of '\l': self.bufpos = handleLF(self, self.bufpos) + else: break + while self.buf[self.bufpos] != '\0': + let oldlen = self.row.len + if oldlen < col + 1: + setLen(self.row, col + 1) + self.row[col] = "" + parseField(self, self.row[col]) inc(col) - if my.buf[my.bufpos] == my.sep: - inc(my.bufpos) + if self.buf[self.bufpos] == self.sep: + inc(self.bufpos) else: - case my.buf[my.bufpos] + case self.buf[self.bufpos] of '\c', '\l': # skip empty lines: while true: - case my.buf[my.bufpos] - of '\c': my.bufpos = handleCR(my, my.bufpos) - of '\l': my.bufpos = handleLF(my, my.bufpos) + case self.buf[self.bufpos] + of '\c': self.bufpos = handleCR(self, self.bufpos) + of '\l': self.bufpos = handleLF(self, self.bufpos) else: break of '\0': discard - else: error(my, my.bufpos, my.sep & " expected") + else: error(self, self.bufpos, self.sep & " expected") break - setLen(my.row, col) + setLen(self.row, col) result = col > 0 if result and col != columns and columns > 0: - error(my, oldpos+1, $columns & " columns expected, but found " & + error(self, oldpos + 1, $columns & " columns expected, but found " & $col & " columns") - inc(my.currRow) + inc(self.currRow) -proc close*(my: var CsvParser) {.inline.} = - ## closes the parser `my` and its associated input stream. - lexbase.close(my) +proc close*(self: var CsvParser) {.inline.} = + ## Closes the parser `self` and its associated input stream. + lexbase.close(self) -proc readHeaderRow*(my: var CsvParser) = +proc readHeaderRow*(self: var CsvParser) = ## Reads the first row and creates a look-up table for column numbers - ## See also `rowEntry <#rowEntry.CsvParser.string>`_. - let present = my.readRow() + ## See also: + ## * `rowEntry proc <#rowEntry,CsvParser,string>`_ + runnableExamples: + import std/streams + + var strm = newStringStream("One,Two,Three\n1,2,3") + var parser: CsvParser + parser.open(strm, "tmp.csv") + + parser.readHeaderRow() + doAssert parser.headers == @["One", "Two", "Three"] + doAssert parser.row == @["One", "Two", "Three"] + + doAssert parser.readRow() + doAssert parser.headers == @["One", "Two", "Three"] + doAssert parser.row == @["1", "2", "3"] + + parser.close() + strm.close() + + let present = self.readRow() if present: - my.headers = my.row + self.headers = self.row -proc rowEntry*(my: var CsvParser, entry: string): var string = - ## Acceses a specified `entry` from the current row. +proc rowEntry*(self: var CsvParser, entry: string): var string = + ## Accesses a specified `entry` from the current row. ## - ## Assumes that `readHeaderRow <#readHeaderRow.CsvParser>`_ has already been + ## Assumes that `readHeaderRow <#readHeaderRow,CsvParser>`_ has already been ## called. - let index = my.headers.find(entry) + ## + ## If specified `entry` does not exist, raises KeyError. + runnableExamples: + import std/streams + var strm = newStringStream("One,Two,Three\n1,2,3\n\n10,20,30") + var parser: CsvParser + parser.open(strm, "tmp.csv") + ## Requires calling `readHeaderRow`. + parser.readHeaderRow() + doAssert parser.readRow() + doAssert parser.rowEntry("One") == "1" + doAssert parser.rowEntry("Two") == "2" + doAssert parser.rowEntry("Three") == "3" + doAssertRaises(KeyError): + discard parser.rowEntry("NonexistentEntry") + parser.close() + strm.close() + + let index = self.headers.find(entry) if index >= 0: - result = my.row[index] + result = self.row[index] + else: + raise newException(KeyError, "Entry `" & entry & "` doesn't exist") when not defined(testing) and isMainModule: - import os + import std/os var s = newFileStream(paramStr(1), fmRead) if s == nil: quit("cannot open the file" & paramStr(1)) var x: CsvParser @@ -230,36 +359,3 @@ when not defined(testing) and isMainModule: for val in items(x.row): echo "##", val, "##" close(x) - -when isMainModule: - import os - import strutils - block: # Tests for reading the header row - let content = "One,Two,Three,Four\n1,2,3,4\n10,20,30,40,\n100,200,300,400\n" - writeFile("temp.csv", content) - - var p: CsvParser - p.open("temp.csv") - p.readHeaderRow() - while p.readRow(): - let zeros = repeat('0', p.currRow-2) - doAssert p.rowEntry("One") == "1" & zeros - doAssert p.rowEntry("Two") == "2" & zeros - doAssert p.rowEntry("Three") == "3" & zeros - doAssert p.rowEntry("Four") == "4" & zeros - p.close() - - when not defined(testing): - var parser: CsvParser - parser.open("temp.csv") - parser.readHeaderRow() - while parser.readRow(): - echo "new row: " - for col in items(parser.headers): - echo "##", col, ":", parser.rowEntry(col), "##" - parser.close() - removeFile("temp.csv") - - # Tidy up - removeFile("temp.csv") - |