#
#
#            Nim's Runtime Library
#        (c) Copyright 2009 Andreas Rumpf
#
#    See the file "copying.txt", included in this
#    distribution, for details about the copyright.
#

## This module implements a simple high performance `CSV`:idx:
## (`comma separated value`:idx:) parser.
##
## Basic usage
## ===========
##
## .. code-block:: nim
##   import std/parsecsv
##   from std/os import paramStr
##   from std/streams import newFileStream
##
##   var s = newFileStream(paramStr(1), fmRead)
##   if s == nil:
##     quit("cannot open the file: " & paramStr(1))
##
##   var x: CsvParser
##   open(x, s, paramStr(1))
##   while readRow(x):
##     echo "new row: "
##     for val in items(x.row):
##       echo "##", val, "##"
##   close(x)
##
## For CSV files with a header row, the header can be read and then used as a
## reference for item access with `rowEntry <#rowEntry,CsvParser,string>`_:
##
## .. code-block:: nim
##   import std/parsecsv
##
##   # Prepare a file
##   let content = """One,Two,Three,Four
##   1,2,3,4
##   10,20,30,40
##   100,200,300,400
##   """
##   writeFile("temp.csv", content)
##
##   var p: CsvParser
##   p.open("temp.csv")
##   p.readHeaderRow()
##   while p.readRow():
##     echo "new row: "
##     for col in items(p.headers):
##       echo "##", col, ":", p.rowEntry(col), "##"
##   p.close()
##
## See also
## ========
##
## * `streams module <streams.html>`_ for using
##   `open proc <#open,CsvParser,Stream,string,char,char,char>`_
##   and other stream processing (like `close proc <streams.html#close,Stream>`_)
## * `parseopt module <parseopt.html>`_ for a command line parser
## * `parsecfg module <parsecfg.html>`_ for a configuration file parser
## * `parsexml module <parsexml.html>`_ for a XML / HTML parser
## * `parsesql module <parsesql.html>`_ for a SQL parser
## * `other parsers <lib.html#pure-libraries-parsers>`_ for other parsers

import lexbase, streams

when defined(nimPreviewSlimSystem):
  import std/syncio

type
  CsvRow* = seq[string] ## A row in a CSV file.
  CsvParser* = object of BaseLexer ## The parser object.
                                   ##
                                   ## It consists of two public fields:
                                   ## * `row` is the current row
                                   ## * `headers` are the columns that are defined in the csv file
                                   ##   (read using `readHeaderRow <#readHeaderRow,CsvParser>`_).
                                   ##   Used with `rowEntry <#rowEntry,CsvParser,string>`_.
    row*: CsvRow
    filename: string      # only used to prefix error messages
    sep, quote, esc: char # separator, quote and escape characters
    skipWhite: bool       # skip spaces/tabs at the start of a field
    currRow: int          # number of `readRow` calls that ran to completion
    headers*: seq[string]

  CsvError* = object of IOError ## An exception that is raised if
                                ## a parsing error occurs.

proc raiseEInvalidCsv(filename: string, line, col: int,
                      msg: string) {.noreturn.} =
  ## Raises a `CsvError` carrying `msg`. When `filename` is non-empty the
  ## message is prefixed with `filename(line, col)`; otherwise only with
  ## `Error: `.
  var e: ref CsvError
  new(e)
  if filename.len == 0:
    e.msg = "Error: " & msg
  else:
    e.msg = filename & "(" & $line & ", " & $col & ") Error: " & msg
  raise e

proc error(self: CsvParser, pos: int, msg: string) =
  ## Raises a `CsvError` for buffer position `pos`, using the parser's
  ## filename, its current line number and the column computed from `pos`.
  raiseEInvalidCsv(self.filename, self.lineNumber, getColNumber(self, pos), msg)

proc open*(self: var CsvParser, input: Stream, filename: string,
           separator = ',', quote = '"', escape = '\0',
           skipInitialSpace = false) =
  ## Initializes the parser with an input stream. `Filename` is only used
  ## for nice error messages. The parser's behaviour can be controlled by
  ## the diverse optional parameters:
  ## - `separator`: character used to separate fields
  ## - `quote`: Used to quote fields containing special characters like
  ##   `separator`, `quote` or new-line characters. '\\0' disables the parsing
  ##   of quotes.
  ## - `escape`: removes any special meaning from the following character;
  ##   '\\0' disables escaping; if escaping is disabled and `quote` is not '\\0',
  ##   two `quote` characters are parsed as one literal `quote` character.
  ## - `skipInitialSpace`: If true, whitespace immediately following the
  ##   `separator` is ignored.
  ##
  ## See also:
  ## * `open proc <#open,CsvParser,string,char,char,char>`_ which creates the
  ##   file stream for you
  runnableExamples:
    import std/streams
    var strm = newStringStream("One,Two,Three\n1,2,3\n10,20,30")
    var parser: CsvParser
    parser.open(strm, "tmp.csv")
    parser.close()
    strm.close()

  lexbase.open(self, input)
  self.filename = filename
  self.sep = separator
  self.quote = quote
  self.esc = escape
  self.skipWhite = skipInitialSpace

proc open*(self: var CsvParser, filename: string,
           separator = ',', quote = '"', escape = '\0',
           skipInitialSpace = false) =
  ## Similar to the `other open proc<#open,CsvParser,Stream,string,char,char,char>`_,
  ## but creates the file stream for you.
  ##
  ## Raises `CsvError` if the file cannot be opened.
  runnableExamples:
    from std/os import removeFile
    writeFile("tmp.csv", "One,Two,Three\n1,2,3\n10,20,300")
    var parser: CsvParser
    parser.open("tmp.csv")
    parser.close()
    removeFile("tmp.csv")

  var s = newFileStream(filename, fmRead)
  if s == nil:
    # The parser has not been (re)initialised for `filename` at this point, so
    # `self.filename` and the line/column counters may still describe a
    # previously parsed input. Report the failure without a position prefix
    # instead of attributing it to that older input.
    raiseEInvalidCsv("", 0, 0, "cannot open: " & filename)
  open(self, s, filename, separator, quote, escape, skipInitialSpace)

proc parseField(self: var CsvParser, a: var string) =
  ## Parses one field starting at `self.bufpos` into `a` (which is cleared
  ## first) and leaves `self.bufpos` on the character that ended the field.
  var pos = self.bufpos
  if self.skipWhite:
    while self.buf[pos] in {' ', '\t'}: inc(pos)
  setLen(a, 0) # reuse memory
  if self.buf[pos] == self.quote and self.quote != '\0':
    # Quoted field: consume everything up to the closing quote.
    inc(pos)
    while true:
      let c = self.buf[pos]
      if c == '\0':
        # Input ended before the closing quote was seen.
        self.bufpos = pos # can continue after exception?
        error(self, pos, self.quote & " expected")
        break
      elif c == self.quote:
        if self.esc == '\0' and self.buf[pos + 1] == self.quote:
          # Doubled quote stands for one literal quote (escaping disabled).
          add(a, self.quote)
          inc(pos, 2)
        else:
          inc(pos)
          break
      elif c == self.esc:
        # Escape character: take the following character literally.
        add(a, self.buf[pos + 1])
        inc(pos, 2)
      else:
        case c
        of '\c':
          # Line breaks inside quotes are kept, stored as "\n".
          pos = handleCR(self, pos)
          add(a, "\n")
        of '\l':
          pos = handleLF(self, pos)
          add(a, "\n")
        else:
          add(a, c)
          inc(pos)
  else:
    # Unquoted field: runs until the separator, a line break or end of input.
    while true:
      let c = self.buf[pos]
      if c == self.sep: break
      if c in {'\c', '\l', '\0'}: break
      add(a, c)
      inc(pos)
  self.bufpos = pos

proc processedRows*(self: var CsvParser): int {.inline.} =
  ## Returns number of the processed rows.
  ##
  ## Note that the counter is also incremented by calls to
  ## `readRow <#readRow,CsvParser,int>`_ that arrive at EOF.
  runnableExamples:
    import std/streams

    var strm = newStringStream("One,Two,Three\n1,2,3")
    var parser: CsvParser
    parser.open(strm, "tmp.csv")
    doAssert parser.readRow()
    doAssert parser.processedRows() == 1
    doAssert parser.readRow()
    doAssert parser.processedRows() == 2
    ## Even if `readRow` arrived at EOF then `processedRows` is incremented.
    doAssert parser.readRow() == false
    doAssert parser.processedRows() == 3
    doAssert parser.readRow() == false
    doAssert parser.processedRows() == 4
    parser.close()
    strm.close()

  self.currRow

proc readRow*(self: var CsvParser, columns = 0): bool =
  ## Reads the next row; if `columns` > 0, it expects the row to have
  ## exactly this many columns. Returns false if the end of the file
  ## has been encountered else true.
  ##
  ## Blank lines are skipped.
  ##
  ## Raises `CsvError` if a field is malformed or if the column count does
  ## not match `columns`.
  runnableExamples:
    import std/streams
    var strm = newStringStream("One,Two,Three\n1,2,3\n\n10,20,30")
    var parser: CsvParser
    parser.open(strm, "tmp.csv")
    doAssert parser.readRow()
    doAssert parser.row == @["One", "Two", "Three"]
    doAssert parser.readRow()
    doAssert parser.row == @["1", "2", "3"]
    ## Blank lines are skipped.
    doAssert parser.readRow()
    doAssert parser.row == @["10", "20", "30"]

    var emptySeq: seq[string]
    doAssert parser.readRow() == false
    doAssert parser.row == emptySeq
    doAssert parser.readRow() == false
    doAssert parser.row == emptySeq

    parser.close()
    strm.close()

  var col = 0 # current column
  let oldpos = self.bufpos
  # skip initial empty lines #8365
  while true:
    case self.buf[self.bufpos]
    of '\c': self.bufpos = handleCR(self, self.bufpos)
    of '\l': self.bufpos = handleLF(self, self.bufpos)
    else: break
  while self.buf[self.bufpos] != '\0':
    # Grow `row` only when needed so existing strings are reused across rows.
    let oldlen = self.row.len
    if oldlen < col + 1:
      setLen(self.row, col + 1)
      self.row[col] = ""
    parseField(self, self.row[col])
    inc(col)
    if self.buf[self.bufpos] == self.sep:
      inc(self.bufpos)
    else:
      case self.buf[self.bufpos]
      of '\c', '\l':
        # skip empty lines:
        while true:
          case self.buf[self.bufpos]
          of '\c': self.bufpos = handleCR(self, self.bufpos)
          of '\l': self.bufpos = handleLF(self, self.bufpos)
          else: break
      of '\0': discard
      else: error(self, self.bufpos, self.sep & " expected")
      break

  # Drop fields left over from a previous, longer row.
  setLen(self.row, col)
  result = col > 0
  if result and col != columns and columns > 0:
    error(self, oldpos + 1, $columns & " columns expected, but found " &
          $col & " columns")
  inc(self.currRow)

proc close*(self: var CsvParser) {.inline.} =
  ## Closes the parser `self` and its associated input stream.
  lexbase.close(self)

proc readHeaderRow*(self: var CsvParser) =
  ## Reads the first row and creates a look-up table for column numbers.
  ##
  ## If no row could be read, `headers` is left unchanged.
  ##
  ## See also:
  ## * `rowEntry proc <#rowEntry,CsvParser,string>`_
  runnableExamples:
    import std/streams

    var strm = newStringStream("One,Two,Three\n1,2,3")
    var parser: CsvParser
    parser.open(strm, "tmp.csv")

    parser.readHeaderRow()
    doAssert parser.headers == @["One", "Two", "Three"]
    doAssert parser.row == @["One", "Two", "Three"]

    doAssert parser.readRow()
    doAssert parser.headers == @["One", "Two", "Three"]
    doAssert parser.row == @["1", "2", "3"]

    parser.close()
    strm.close()

  let present = self.readRow()
  if present:
    self.headers = self.row

proc rowEntry*(self: var CsvParser, entry: string): var string =
  ## Accesses a specified `entry` from the current row.
  ##
  ## Assumes that `readHeaderRow <#readHeaderRow,CsvParser>`_ has already been
  ## called.
  ##
  ## If specified `entry` does not exist, raises KeyError. KeyError is also
  ## raised when `entry` is a known header but the current row has too few
  ## fields to contain a value for it.
  runnableExamples:
    import std/streams
    var strm = newStringStream("One,Two,Three\n1,2,3\n\n10,20,30")
    var parser: CsvParser
    parser.open(strm, "tmp.csv")
    ## Requires calling `readHeaderRow`.
    parser.readHeaderRow()
    doAssert parser.readRow()
    doAssert parser.rowEntry("One") == "1"
    doAssert parser.rowEntry("Two") == "2"
    doAssert parser.rowEntry("Three") == "3"
    doAssertRaises(KeyError):
      discard parser.rowEntry("NonexistentEntry")
    parser.close()
    strm.close()

  let index = self.headers.find(entry)
  if index < 0:
    raise newException(KeyError, "Entry `" & entry & "` doesn't exist")
  if index >= self.row.len:
    # A short row must not surface as an IndexDefect (or as an out-of-bounds
    # read when bound checks are disabled); report it like a missing entry.
    raise newException(KeyError,
      "Entry `" & entry & "` is missing in the current row")
  result = self.row[index]

when not defined(testing) and isMainModule:
  # Small demo driver: prints every field of the CSV file named by the first
  # command line argument.
  import os
  var s = newFileStream(paramStr(1), fmRead)
  if s == nil: quit("cannot open the file: " & paramStr(1))
  var x: CsvParser
  open(x, s, paramStr(1))
  while readRow(x):
    echo "new row: "
    for val in items(x.row):
      echo "##", val, "##"
  close(x)