diff options
author | Andreas Rumpf <rumpf_a@web.de> | 2009-06-08 08:06:25 +0200 |
---|---|---|
committer | Andreas Rumpf <rumpf_a@web.de> | 2009-06-08 08:06:25 +0200 |
commit | 4d4b3b1c04d41868ebb58bd9ccba7b303007e900 (patch) | |
tree | 909ed0aad0b145733521f4ac2bfb938dd4b43785 /lib/pure/parsecsv.nim | |
parent | ce88dc3e67436939b03f97e624c11ca6058fedce (diff) | |
download | Nim-4d4b3b1c04d41868ebb58bd9ccba7b303007e900.tar.gz |
version0.7.10
Diffstat (limited to 'lib/pure/parsecsv.nim')
-rw-r--r-- | lib/pure/parsecsv.nim | 178 |
1 files changed, 178 insertions, 0 deletions
# diff --git a/lib/pure/parsecsv.nim b/lib/pure/parsecsv.nim
# new file mode 100644 index 000000000..5970f2090
# --- /dev/null +++ b/lib/pure/parsecsv.nim @@ -0,0 +1,178 @@
#
#
#            Nimrod's Runtime Library
#        (c) Copyright 2009 Andreas Rumpf
#
#    See the file "copying.txt", included in this
#    distribution, for details about the copyright.
#

## This module implements a simple high performance `CSV`:idx:
## (`comma separated value`:idx:) parser.
##
## Example: How to use the parser
## ==============================
##
## .. code-block:: nimrod
##   import os, parsecsv, streams
##   var s = newFileStream(ParamStr(1), fmRead)
##   if s == nil: quit("cannot open the file " & ParamStr(1))
##   var x: TCsvParser
##   open(x, s, ParamStr(1))
##   while readRow(x):
##     Echo "new row: "
##     for val in items(x.row):
##       Echo "##", val, "##"
##   close(x)
##

import
  lexbase, streams

type
  TCsvRow* = seq[string] ## a row in a CSV file
  TCsvParser* = object of TBaseLexer ## the parser object.
    row*: TCsvRow                    ## the current row
    filename: string                 # only used in error messages
    sep, quote, esc: char            # separator, quote and escape characters
    skipWhite: bool                  # skip blanks at the start of a field?
    currRow: int                     # number of `readRow` calls so far

  EInvalidCsv* = object of EIO ## exception that is raised if
                               ## a parsing error occurs

proc raiseEInvalidCsv(filename: string, line, col: int,
                      msg: string) {.noreturn.} =
  ## raises `EInvalidCsv` with a message of the form
  ## ``filename(line, col) Error: msg``.
  var e: ref EInvalidCsv
  new(e)
  e.msg = filename & "(" & $line & ", " & $col & ") Error: " & msg
  raise e

proc error(my: TCsvParser, pos: int, msg: string) =
  ## raises `EInvalidCsv` for buffer position `pos`; the reported location is
  ## the parser's current line number and the column computed from `pos`.
  raiseEInvalidCsv(my.filename, my.LineNumber, getColNumber(my, pos), msg)

proc open*(my: var TCsvParser, input: PStream, filename: string,
           separator = ',', quote = '"', escape = '\0',
           skipInitialSpace = false) =
  ## initializes the parser with an input stream. `Filename` is only used
  ## for nice error messages. The parser's behaviour can be controlled by
  ## the diverse optional parameters:
  ## - `separator`: character used to separate fields
  ## - `quote`: Used to quote fields containing special characters like
  ##   `separator`, `quote` or new-line characters. '\0' disables the parsing
  ##   of quotes.
  ## - `escape`: removes any special meaning from the following character;
  ##   '\0' disables escaping; if escaping is disabled and `quote` is not '\0',
  ##   two `quote` characters are parsed as one literal `quote` character.
  ## - `skipInitialSpace`: If true, whitespace immediately following the
  ##   `separator` is ignored.
  lexbase.open(my, input)
  my.filename = filename
  my.sep = separator
  my.quote = quote
  my.esc = escape
  my.skipWhite = skipInitialSpace
  my.row = @[]
  my.currRow = 0

proc parseField(my: var TCsvParser, a: var string) =
  ## parses a single field starting at `my.bufpos` and stores its contents
  ## in `a` (whose memory is reused). On return `my.bufpos` points just past
  ## the field; the separator or line end itself is not consumed.
  ## Raises `EInvalidCsv` if a quoted field is not terminated before the end
  ## of the input.
  var pos = my.bufpos
  var buf = my.buf
  if my.skipWhite:
    while buf[pos] in {' ', '\t'}: inc(pos)
  setLen(a, 0) # reuse memory
  if buf[pos] == my.quote and my.quote != '\0':
    # quoted field: read up to the matching closing quote
    inc(pos)
    while true:
      var c = buf[pos]
      if c == '\0':
        # end of input inside a quoted field
        my.bufpos = pos # can continue after exception?
        error(my, pos, my.quote & " expected")
        break
      elif c == my.quote:
        if my.esc == '\0' and buf[pos+1] == my.quote:
          # escaping is disabled: a doubled quote is one literal quote
          add(a, my.quote)
          inc(pos, 2)
        else:
          # closing quote
          inc(pos)
          break
      elif c == my.esc:
        # the escaped character is taken literally
        # NOTE(review): the character after the escape is not checked for
        # '\0' or a line end before it is consumed -- confirm this is safe.
        add(a, buf[pos+1])
        inc(pos, 2)
      else:
        case c
        of '\c':
          # line ends inside quotes are stored as "\n"
          pos = handleCR(my, pos)
          # buffer is re-read here as in the original code; presumably
          # handleCR/handleLF may replace it -- TODO confirm
          buf = my.buf
          add(a, "\n")
        of '\l':
          pos = handleLF(my, pos)
          buf = my.buf
          add(a, "\n")
        else:
          add(a, c)
          inc(pos)
  else:
    # unquoted field: everything up to the separator, line end or end of input
    while true:
      var c = buf[pos]
      if c == my.sep: break
      if c in {'\c', '\l', '\0'}: break
      add(a, c)
      inc(pos)
  my.bufpos = pos

proc processedRows*(my: var TCsvParser): int =
  ## returns number of the processed rows. The counter is incremented by
  ## every call of `readRow`, including the call that returns false.
  return my.currRow

proc skipEmptyLines(my: var TCsvParser) =
  ## advances `my.bufpos` over any run of line terminators.
  while true:
    case my.buf[my.bufpos]
    of '\c': my.bufpos = handleCR(my, my.bufpos)
    of '\l': my.bufpos = handleLF(my, my.bufpos)
    else: break

proc readRow*(my: var TCsvParser, columns = 0): bool =
  ## reads the next row; if `columns` > 0, it expects the row to have
  ## exactly this many columns. Returns false if the end of the file
  ## has been encountered else true. Empty lines are skipped.
  ## Raises `EInvalidCsv` if a field is not followed by a separator, a line
  ## end or the end of the input, or if the column count does not match.
  var col = 0 # current column
  # skip empty lines in front of the row; otherwise a leading blank line
  # would be returned as a row consisting of a single empty field:
  skipEmptyLines(my)
  var oldpos = my.bufpos # start of the row, used for the column count error
  while my.buf[my.bufpos] != '\0':
    var oldlen = my.row.len
    if oldlen < col+1:
      # grow the row; existing strings are reused by `parseField`
      setLen(my.row, col+1)
      my.row[col] = ""
    parseField(my, my.row[col])
    inc(col)
    if my.buf[my.bufpos] == my.sep:
      inc(my.bufpos)
    else:
      case my.buf[my.bufpos]
      of '\c', '\l':
        # skip empty lines:
        skipEmptyLines(my)
      of '\0': nil
      else: error(my, my.bufpos, my.sep & " expected")
      break

  setlen(my.row, col)
  result = col > 0
  if result and col != columns and columns > 0:
    error(my, oldpos+1, $columns & " columns expected, but found " &
          $col & " columns")
  inc(my.currRow)

proc close*(my: var TCsvParser) {.inline.} =
  ## closes the parser `my` and its associated input stream.
  lexbase.close(my)

when isMainModule:
  # small test driver: prints every field of the CSV file given as the
  # first command line argument
  import os
  var s = newFileStream(ParamStr(1), fmRead)
  if s == nil: quit("cannot open the file " & ParamStr(1))
  var x: TCsvParser
  open(x, s, ParamStr(1))
  while readRow(x):
    Echo "new row: "
    for val in items(x.row):
      Echo "##", val, "##"
  close(x)