#
#
#            Nimrod's Runtime Library
#        (c) Copyright 2009 Andreas Rumpf
#
#    See the file "copying.txt", included in this
#    distribution, for details about the copyright.
#
## This module implements a simple high performance `CSV`:idx:
## (`comma separated value`:idx:) parser.
##
## Example: How to use the parser
## ==============================
##
## .. code-block:: nimrod
##   import os, parsecsv, streams
##   var s = newFileStream(ParamStr(1), fmRead)
##   if s == nil: quit("cannot open the file: " & ParamStr(1))
##   var x: TCsvParser
##   open(x, s, ParamStr(1))
##   while readRow(x):
##     Echo "new row: "
##     for val in items(x.row):
##       Echo "##", val, "##"
##   close(x)
##
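## If the input is malformed (for example, a closing `quote` is missing or a
## row has the wrong number of columns), `readRow` raises `EInvalidCsv`.
## A minimal sketch of handling this, using the parser `x` from the example
## above:
##
## .. code-block:: nimrod
##   try:
##     while readRow(x):
##       Echo "new row: "
##   except EInvalidCsv:
##     Echo "malformed CSV input"
##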
import
  lexbase, streams

type
  TCsvRow* = seq[string]   ## a row in a CSV file
  TCsvParser* = object of TBaseLexer ## the parser object.
    row*: TCsvRow          ## the current row
    filename: string
    sep, quote, esc: char
    skipWhite: bool
    currRow: int

  EInvalidCsv* = object of EIO ## exception that is raised if
                               ## a parsing error occurs

proc raiseEInvalidCsv(filename: string, line, col: int,
                      msg: string) {.noreturn.} =
  var e: ref EInvalidCsv
  new(e)
  e.msg = filename & "(" & $line & ", " & $col & ") Error: " & msg
  raise e

proc error(my: TCsvParser, pos: int, msg: string) =
  raiseEInvalidCsv(my.filename, my.LineNumber, getColNumber(my, pos), msg)

proc open*(my: var TCsvParser, input: PStream, filename: string,
           separator = ',', quote = '"', escape = '\0',
           skipInitialSpace = false) =
  ## initializes the parser with an input stream. `Filename` is only used
  ## for nice error messages. The parser's behaviour can be controlled by
  ## the diverse optional parameters:
  ## - `separator`: character used to separate fields
  ## - `quote`: used to quote fields containing special characters like
  ##   `separator`, `quote` or new-line characters. '\0' disables the parsing
  ##   of quotes.
  ## - `escape`: removes any special meaning from the following character;
  ##   '\0' disables escaping; if escaping is disabled and `quote` is not '\0',
  ##   two `quote` characters are parsed as one literal `quote` character.
  ## - `skipInitialSpace`: if true, whitespace immediately following the
  ##   `separator` is ignored.
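  ##
  ## For example, a minimal sketch of opening a semicolon separated file
  ## (assuming `streams` is imported as in the module example; the filename
  ## is just a placeholder):
  ##
  ## .. code-block:: nimrod
  ##   var p: TCsvParser
  ##   open(p, newFileStream("temp.csv", fmRead), "temp.csv",
  ##        separator = ';', skipInitialSpace = true)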
  lexbase.open(my, input)
  my.filename = filename
  my.sep = separator
  my.quote = quote
  my.esc = escape
  my.skipWhite = skipInitialSpace
  my.row = @[]
  my.currRow = 0

proc parseField(my: var TCsvParser, a: var string) =
  var pos = my.bufpos
  var buf = my.buf
  if my.skipWhite:
    while buf[pos] in {' ', '\t'}: inc(pos)
  setLen(a, 0) # reuse memory
  if buf[pos] == my.quote and my.quote != '\0':
    # quoted field: read until the matching closing quote
    inc(pos)
    while true:
      var c = buf[pos]
      if c == '\0':
        my.bufpos = pos # can continue after exception?
        error(my, pos, my.quote & " expected")
        break
      elif c == my.quote:
        if my.esc == '\0' and buf[pos+1] == my.quote:
          # doubled quote stands for one literal quote character
          add(a, my.quote)
          inc(pos, 2)
        else:
          # closing quote: the field is complete
          inc(pos)
          break
      elif c == my.esc:
        add(a, buf[pos+1])
        inc(pos, 2)
      else:
        case c
        of '\c':
          pos = handleCR(my, pos)
          buf = my.buf
          add(a, "\n")
        of '\l':
          pos = handleLF(my, pos)
          buf = my.buf
          add(a, "\n")
        else:
          add(a, c)
          inc(pos)
  else:
    # unquoted field: read until separator, newline or end of input
    while true:
      var c = buf[pos]
      if c == my.sep: break
      if c in {'\c', '\l', '\0'}: break
      add(a, c)
      inc(pos)
  my.bufpos = pos

proc processedRows*(my: var TCsvParser): int =
  ## returns the number of processed rows so far.
  return my.currRow

proc readRow*(my: var TCsvParser, columns = 0): bool =
  ## reads the next row; if `columns` > 0, it expects the row to have
  ## exactly this many columns. Returns false if the end of the file
  ## has been encountered, otherwise true.
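  ##
  ## For instance, a small sketch using the `columns` check (assuming a
  ## parser `p` opened on a file that should contain exactly 3 columns;
  ## a mismatch raises `EInvalidCsv`):
  ##
  ## .. code-block:: nimrod
  ##   while readRow(p, columns = 3):
  ##     Echo "first column: ", p.row[0]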
  var col = 0 # current column
  var oldpos = my.bufpos
  while my.buf[my.bufpos] != '\0':
    var oldlen = my.row.len
    if oldlen < col+1:
      setLen(my.row, col+1)
      my.row[col] = ""
    parseField(my, my.row[col])
    inc(col)
    if my.buf[my.bufpos] == my.sep:
      inc(my.bufpos)
    else:
      case my.buf[my.bufpos]
      of '\c', '\l':
        # skip empty lines:
        while true:
          case my.buf[my.bufpos]
          of '\c': my.bufpos = handleCR(my, my.bufpos)
          of '\l': my.bufpos = handleLF(my, my.bufpos)
          else: break
      of '\0': nil # end of file: nothing to do
      else: error(my, my.bufpos, my.sep & " expected")
      break
  setLen(my.row, col)
  result = col > 0
  if result and col != columns and columns > 0:
    error(my, oldpos+1, $columns & " columns expected, but found " &
          $col & " columns")
  inc(my.currRow)

proc close*(my: var TCsvParser) {.inline.} =
  ## closes the parser `my` and its associated input stream.
  lexbase.close(my)

when isMainModule:
  import os
  var s = newFileStream(ParamStr(1), fmRead)
  if s == nil: quit("cannot open the file: " & ParamStr(1))
  var x: TCsvParser
  open(x, s, ParamStr(1))
  while readRow(x):
    Echo "new row: "
    for val in items(x.row):
      Echo "##", val, "##"
  close(x)