diff options
Diffstat (limited to 'lib/pure/parsecsv.nim')
-rw-r--r--[-rwxr-xr-x] | lib/pure/parsecsv.nim | 393 |
1 files changed, 288 insertions, 105 deletions
diff --git a/lib/pure/parsecsv.nim b/lib/pure/parsecsv.nim index 5970f2090..c7bf0c9c1 100755..100644 --- a/lib/pure/parsecsv.nim +++ b/lib/pure/parsecsv.nim @@ -1,6 +1,6 @@ # # -# Nimrod's Runtime Library +# Nim's Runtime Library # (c) Copyright 2009 Andreas Rumpf # # See the file "copying.txt", included in this @@ -8,171 +8,354 @@ # ## This module implements a simple high performance `CSV`:idx: -## (`comma separated value`:idx:) parser. +## (`comma separated value`:idx:) parser. ## -## Example: How to use the parser -## ============================== +## Basic usage +## =========== ## -## .. code-block:: nimrod -## import os, parsecsv, streams -## var s = newFileStream(ParamStr(1), fmRead) -## if s == nil: quit("cannot open the file" & ParamStr(1)) -## var x: TCsvParser -## open(x, s, ParamStr(1)) +## ```nim +## import std/parsecsv +## from std/os import paramStr +## from std/streams import newFileStream +## +## var s = newFileStream(paramStr(1), fmRead) +## if s == nil: +## quit("cannot open the file" & paramStr(1)) +## +## var x: CsvParser +## open(x, s, paramStr(1)) ## while readRow(x): -## Echo "new row: " +## echo "new row: " ## for val in items(x.row): -## Echo "##", val, "##" +## echo "##", val, "##" ## close(x) +## ``` +## +## For CSV files with a header row, the header can be read and then used as a +## reference for item access with `rowEntry <#rowEntry,CsvParser,string>`_: +## +## ```nim +## import std/parsecsv +## +## # Prepare a file +## let content = """One,Two,Three,Four +## 1,2,3,4 +## 10,20,30,40 +## 100,200,300,400 +## """ +## writeFile("temp.csv", content) ## +## var p: CsvParser +## p.open("temp.csv") +## p.readHeaderRow() +## while p.readRow(): +## echo "new row: " +## for col in items(p.headers): +## echo "##", col, ":", p.rowEntry(col), "##" +## p.close() +## ``` +## +## See also +## ======== +## +## * `streams module <streams.html>`_ for using +## `open proc <#open,CsvParser,Stream,string,char,char,char>`_ +## and other stream processing (like `close proc <streams.html#close,Stream>`_) +## * `parseopt module <parseopt.html>`_ for a command line parser +## * `parsecfg module <parsecfg.html>`_ for a configuration file parser +## * `parsexml module <parsexml.html>`_ for a XML / HTML parser +## * `parsesql module <parsesql.html>`_ for a SQL parser +## * `other parsers <lib.html#pure-libraries-parsers>`_ for other parsers -import - lexbase, streams +import std/[lexbase, streams] + +when defined(nimPreviewSlimSystem): + import std/syncio type - TCsvRow* = seq[string] ## a row in a CSV file - TCsvParser* = object of TBaseLexer ## the parser object. - row*: TCsvRow ## the current row + CsvRow* = seq[string] ## A row in a CSV file. + CsvParser* = object of BaseLexer ## The parser object. + ## + ## It consists of two public fields: + ## * `row` is the current row + ## * `headers` are the columns that are defined in the csv file + ## (read using `readHeaderRow <#readHeaderRow,CsvParser>`_). + ## Used with `rowEntry <#rowEntry,CsvParser,string>`_). + row*: CsvRow filename: string sep, quote, esc: char skipWhite: bool currRow: int + headers*: seq[string] - EInvalidCsv* = object of EIO ## exception that is raised if - ## a parsing error occurs + CsvError* = object of IOError ## An exception that is raised if + ## a parsing error occurs. -proc raiseEInvalidCsv(filename: string, line, col: int, +proc raiseEInvalidCsv(filename: string, line, col: int, msg: string) {.noreturn.} = - var e: ref EInvalidCsv + var e: ref CsvError new(e) - e.msg = filename & "(" & $line & ", " & $col & ") Error: " & msg + if filename.len == 0: + e.msg = "Error: " & msg + else: + e.msg = filename & "(" & $line & ", " & $col & ") Error: " & msg raise e -proc error(my: TCsvParser, pos: int, msg: string) = - raiseEInvalidCsv(my.filename, my.LineNumber, getColNumber(my, pos), msg) +proc error(self: CsvParser, pos: int, msg: string) = + raiseEInvalidCsv(self.filename, self.lineNumber, getColNumber(self, pos), msg) -proc open*(my: var TCsvParser, input: PStream, filename: string, +proc open*(self: var CsvParser, input: Stream, filename: string, separator = ',', quote = '"', escape = '\0', skipInitialSpace = false) = - ## initializes the parser with an input stream. `Filename` is only used + ## Initializes the parser with an input stream. `Filename` is only used ## for nice error messages. The parser's behaviour can be controlled by ## the diverse optional parameters: ## - `separator`: character used to separate fields - ## - `quote`: Used to quote fields containing special characters like - ## `separator`, `quote` or new-line characters. '\0' disables the parsing + ## - `quote`: Used to quote fields containing special characters like + ## `separator`, `quote` or new-line characters. '\\0' disables the parsing ## of quotes. - ## - `escape`: removes any special meaning from the following character; - ## '\0' disables escaping; if escaping is disabled and `quote` is not '\0', + ## - `escape`: removes any special meaning from the following character; + ## '\\0' disables escaping; if escaping is disabled and `quote` is not '\\0', ## two `quote` characters are parsed one literal `quote` character. - ## - `skipInitialSpace`: If true, whitespace immediately following the + ## - `skipInitialSpace`: If true, whitespace immediately following the ## `separator` is ignored. - lexbase.open(my, input) - my.filename = filename - my.sep = separator - my.quote = quote - my.esc = escape - my.skipWhite = skipInitialSpace - my.row = @[] - my.currRow = 0 - -proc parseField(my: var TCsvParser, a: var string) = - var pos = my.bufpos - var buf = my.buf - if my.skipWhite: - while buf[pos] in {' ', '\t'}: inc(pos) + ## + ## See also: + ## * `open proc <#open,CsvParser,string,char,char,char>`_ which creates the + ## file stream for you + runnableExamples: + import std/streams + var strm = newStringStream("One,Two,Three\n1,2,3\n10,20,30") + var parser: CsvParser + parser.open(strm, "tmp.csv") + parser.close() + strm.close() + + lexbase.open(self, input) + self.filename = filename + self.sep = separator + self.quote = quote + self.esc = escape + self.skipWhite = skipInitialSpace + +proc open*(self: var CsvParser, filename: string, + separator = ',', quote = '"', escape = '\0', + skipInitialSpace = false) = + ## Similar to the `other open proc<#open,CsvParser,Stream,string,char,char,char>`_, + ## but creates the file stream for you. + runnableExamples: + from std/os import removeFile + writeFile("tmp.csv", "One,Two,Three\n1,2,3\n10,20,300") + var parser: CsvParser + parser.open("tmp.csv") + parser.close() + removeFile("tmp.csv") + + var s = newFileStream(filename, fmRead) + if s == nil: self.error(0, "cannot open: " & filename) + open(self, s, filename, separator, + quote, escape, skipInitialSpace) + +proc parseField(self: var CsvParser, a: var string) = + var pos = self.bufpos + if self.skipWhite: + while self.buf[pos] in {' ', '\t'}: inc(pos) setLen(a, 0) # reuse memory - if buf[pos] == my.quote and my.quote != '\0': + if self.buf[pos] == self.quote and self.quote != '\0': inc(pos) - while true: - var c = buf[pos] + while true: + let c = self.buf[pos] if c == '\0': - my.bufpos = pos # can continue after exception? - error(my, pos, my.quote & " expected") + self.bufpos = pos # can continue after exception? + error(self, pos, self.quote & " expected") break - elif c == my.quote: - if my.esc == '\0' and buf[pos+1] == my.quote: - add(a, my.quote) + elif c == self.quote: + if self.esc == '\0' and self.buf[pos + 1] == self.quote: + add(a, self.quote) inc(pos, 2) else: inc(pos) break - elif c == my.esc: - add(a, buf[pos+1]) + elif c == self.esc: + add(a, self.buf[pos + 1]) inc(pos, 2) else: case c - of '\c': - pos = handleCR(my, pos) - buf = my.buf + of '\c': + pos = handleCR(self, pos) add(a, "\n") - of '\l': - pos = handleLF(my, pos) - buf = my.buf + of '\l': + pos = handleLF(self, pos) add(a, "\n") else: add(a, c) inc(pos) else: while true: - var c = buf[pos] - if c == my.sep: break + let c = self.buf[pos] + if c == self.sep: break if c in {'\c', '\l', '\0'}: break add(a, c) inc(pos) - my.bufpos = pos + self.bufpos = pos + +proc processedRows*(self: var CsvParser): int {.inline.} = + ## Returns number of the processed rows. + ## + ## But even if `readRow <#readRow,CsvParser,int>`_ arrived at EOF then + ## processed rows counter is incremented. + runnableExamples: + import std/streams -proc processedRows*(my: var TCsvParser): int = - ## returns number of the processed rows - return my.currRow + var strm = newStringStream("One,Two,Three\n1,2,3") + var parser: CsvParser + parser.open(strm, "tmp.csv") + doAssert parser.readRow() + doAssert parser.processedRows() == 1 + doAssert parser.readRow() + doAssert parser.processedRows() == 2 + ## Even if `readRow` arrived at EOF then `processedRows` is incremented. + doAssert parser.readRow() == false + doAssert parser.processedRows() == 3 + doAssert parser.readRow() == false + doAssert parser.processedRows() == 4 + parser.close() + strm.close() -proc readRow*(my: var TCsvParser, columns = 0): bool = - ## reads the next row; if `columns` > 0, it expects the row to have + self.currRow + +proc readRow*(self: var CsvParser, columns = 0): bool = + ## Reads the next row; if `columns` > 0, it expects the row to have ## exactly this many columns. Returns false if the end of the file ## has been encountered else true. + ## + ## Blank lines are skipped. + runnableExamples: + import std/streams + var strm = newStringStream("One,Two,Three\n1,2,3\n\n10,20,30") + var parser: CsvParser + parser.open(strm, "tmp.csv") + doAssert parser.readRow() + doAssert parser.row == @["One", "Two", "Three"] + doAssert parser.readRow() + doAssert parser.row == @["1", "2", "3"] + ## Blank lines are skipped. + doAssert parser.readRow() + doAssert parser.row == @["10", "20", "30"] + + var emptySeq: seq[string] + doAssert parser.readRow() == false + doAssert parser.row == emptySeq + doAssert parser.readRow() == false + doAssert parser.row == emptySeq + + parser.close() + strm.close() + var col = 0 # current column - var oldpos = my.bufpos - while my.buf[my.bufpos] != '\0': - var oldlen = my.row.len - if oldlen < col+1: - setLen(my.row, col+1) - my.row[col] = "" - parseField(my, my.row[col]) + let oldpos = self.bufpos + # skip initial empty lines #8365 + while true: + case self.buf[self.bufpos] + of '\c': self.bufpos = handleCR(self, self.bufpos) + of '\l': self.bufpos = handleLF(self, self.bufpos) + else: break + while self.buf[self.bufpos] != '\0': + let oldlen = self.row.len + if oldlen < col + 1: + setLen(self.row, col + 1) + self.row[col] = "" + parseField(self, self.row[col]) inc(col) - if my.buf[my.bufpos] == my.sep: - inc(my.bufpos) + if self.buf[self.bufpos] == self.sep: + inc(self.bufpos) else: - case my.buf[my.bufpos] - of '\c', '\l': + case self.buf[self.bufpos] + of '\c', '\l': # skip empty lines: - while true: - case my.buf[my.bufpos] - of '\c': my.bufpos = handleCR(my, my.bufpos) - of '\l': my.bufpos = handleLF(my, my.bufpos) + while true: + case self.buf[self.bufpos] + of '\c': self.bufpos = handleCR(self, self.bufpos) + of '\l': self.bufpos = handleLF(self, self.bufpos) else: break - of '\0': nil - else: error(my, my.bufpos, my.sep & " expected") + of '\0': discard + else: error(self, self.bufpos, self.sep & " expected") break - - setlen(my.row, col) + + setLen(self.row, col) result = col > 0 - if result and col != columns and columns > 0: - error(my, oldpos+1, $columns & " columns expected, but found " & + if result and col != columns and columns > 0: + error(self, oldpos + 1, $columns & " columns expected, but found " & $col & " columns") - inc(my.currRow) - -proc close*(my: var TCsvParser) {.inline.} = - ## closes the parser `my` and its associated input stream. - lexbase.close(my) - -when isMainModule: - import os - var s = newFileStream(ParamStr(1), fmRead) - if s == nil: quit("cannot open the file" & ParamStr(1)) - var x: TCsvParser - open(x, s, ParamStr(1)) + inc(self.currRow) + +proc close*(self: var CsvParser) {.inline.} = + ## Closes the parser `self` and its associated input stream. + lexbase.close(self) + +proc readHeaderRow*(self: var CsvParser) = + ## Reads the first row and creates a look-up table for column numbers + ## See also: + ## * `rowEntry proc <#rowEntry,CsvParser,string>`_ + runnableExamples: + import std/streams + + var strm = newStringStream("One,Two,Three\n1,2,3") + var parser: CsvParser + parser.open(strm, "tmp.csv") + + parser.readHeaderRow() + doAssert parser.headers == @["One", "Two", "Three"] + doAssert parser.row == @["One", "Two", "Three"] + + doAssert parser.readRow() + doAssert parser.headers == @["One", "Two", "Three"] + doAssert parser.row == @["1", "2", "3"] + + parser.close() + strm.close() + + let present = self.readRow() + if present: + self.headers = self.row + +proc rowEntry*(self: var CsvParser, entry: string): var string = + ## Accesses a specified `entry` from the current row. + ## + ## Assumes that `readHeaderRow <#readHeaderRow,CsvParser>`_ has already been + ## called. + ## + ## If specified `entry` does not exist, raises KeyError. + runnableExamples: + import std/streams + var strm = newStringStream("One,Two,Three\n1,2,3\n\n10,20,30") + var parser: CsvParser + parser.open(strm, "tmp.csv") + ## Requires calling `readHeaderRow`. + parser.readHeaderRow() + doAssert parser.readRow() + doAssert parser.rowEntry("One") == "1" + doAssert parser.rowEntry("Two") == "2" + doAssert parser.rowEntry("Three") == "3" + doAssertRaises(KeyError): + discard parser.rowEntry("NonexistentEntry") + parser.close() + strm.close() + + let index = self.headers.find(entry) + if index >= 0: + result = self.row[index] + else: + raise newException(KeyError, "Entry `" & entry & "` doesn't exist") + +when not defined(testing) and isMainModule: + import std/os + var s = newFileStream(paramStr(1), fmRead) + if s == nil: quit("cannot open the file" & paramStr(1)) + var x: CsvParser + open(x, s, paramStr(1)) while readRow(x): - Echo "new row: " + echo "new row: " for val in items(x.row): - Echo "##", val, "##" + echo "##", val, "##" close(x) - |