#
#
#            Nimrod's Runtime Library
#        (c) Copyright 2009 Andreas Rumpf
#
#    See the file "copying.txt", included in this
#    distribution, for details about the copyright.
#

## This module implements a simple high-performance `CSV`:idx:
## (`comma separated value`:idx:) parser.
##
## Example: How to use the parser
## ==============================
##
## .. code-block:: nimrod
##   import os, parsecsv, streams
##   var s = newFileStream(ParamStr(1), fmRead)
##   if s == nil: quit("cannot open the file: " & ParamStr(1))
##   var x: TCsvParser
##   open(x, s, ParamStr(1))
##   while readRow(x):
##     Echo "new row: "
##     for val in items(x.row):
##       Echo "##", val, "##"
##   close(x)
##
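## Parsing errors (for example a missing closing `quote`, or a wrong number
## of columns when the `columns` parameter of `readRow` is used) are reported
## by raising `EInvalidCsv`. A sketch of catching it, using the same setup as
## above:
##
## .. code-block:: nimrod
##   try:
##     while readRow(x):
##       Echo "new row: "
##       for val in items(x.row):
##         Echo "##", val, "##"
##   except EInvalidCsv:
##     Echo "invalid CSV in file ", ParamStr(1)
##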

import
  lexbase, streams

type
  TCsvRow* = seq[string] ## a row in a CSV file
  TCsvParser* = object of TBaseLexer ## the parser object.
    row*: TCsvRow                    ## the current row
    filename: string
    sep, quote, esc: char
    skipWhite: bool
    currRow: int

  EInvalidCsv* = object of EIO ## exception that is raised if
                               ## a parsing error occurs

proc raiseEInvalidCsv(filename: string, line, col: int, 
                      msg: string) {.noreturn.} =
  var e: ref EInvalidCsv
  new(e)
  e.msg = filename & "(" & $line & ", " & $col & ") Error: " & msg
  raise e

proc error(my: TCsvParser, pos: int, msg: string) = 
  raiseEInvalidCsv(my.filename, my.LineNumber, getColNumber(my, pos), msg)

proc open*(my: var TCsvParser, input: PStream, filename: string,
           separator = ',', quote = '"', escape = '\0',
           skipInitialSpace = false) =
  ## initializes the parser with an input stream. `Filename` is only used
  ## for nice error messages. The parser's behaviour can be controlled by
  ## the following optional parameters:
  ## - `separator`: character used to separate fields
  ## - `quote`: used to quote fields containing special characters like
  ##   `separator`, `quote` or new-line characters. '\0' disables the parsing
  ##   of quotes.
  ## - `escape`: removes any special meaning from the following character;
  ##   '\0' disables escaping; if escaping is disabled and `quote` is not '\0',
  ##   two `quote` characters are parsed as one literal `quote` character.
  ## - `skipInitialSpace`: if true, whitespace immediately following the
  ##   `separator` is ignored.
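  ##
  ## For example, a file that uses ``;`` as separator and ``\`` as escape
  ## character could be parsed like this (a sketch; `s` is assumed to be an
  ## already opened `PStream` and ``data.csv`` a placeholder filename):
  ##
  ## .. code-block:: nimrod
  ##   var p: TCsvParser
  ##   open(p, s, "data.csv", separator = ';', escape = '\\')
  ##   while readRow(p):
  ##     for val in items(p.row): Echo val
  ##   close(p)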
  lexbase.open(my, input)
  my.filename = filename
  my.sep = separator
  my.quote = quote
  my.esc = escape
  my.skipWhite = skipInitialSpace
  my.row = @[]
  my.currRow = 0

proc parseField(my: var TCsvParser, a: var string) =
  # parses a single field into `a`, handling quoting, escaping and optional
  # skipping of initial whitespace; `my.bufpos` is left on the character
  # that terminated the field (separator, newline or '\0')
  var pos = my.bufpos
  var buf = my.buf
  if my.skipWhite:
    while buf[pos] in {' ', '\t'}: inc(pos)
  setLen(a, 0) # reuse memory
  if buf[pos] == my.quote and my.quote != '\0': 
    inc(pos)
    while true: 
      var c = buf[pos]
      if c == '\0':
        my.bufpos = pos # can continue after exception?
        error(my, pos, my.quote & " expected")
        break
      elif c == my.quote: 
        if my.esc == '\0' and buf[pos+1] == my.quote:
          add(a, my.quote)
          inc(pos, 2)
        else:
          inc(pos)
          break
      elif c == my.esc:
        add(a, buf[pos+1])
        inc(pos, 2)
      else:
        case c
        of '\c': 
          pos = handleCR(my, pos)
          buf = my.buf
          add(a, "\n")
        of '\l': 
          pos = handleLF(my, pos)
          buf = my.buf
          add(a, "\n")
        else:
          add(a, c)
          inc(pos)
  else:
    while true:
      var c = buf[pos]
      if c == my.sep: break
      if c in {'\c', '\l', '\0'}: break
      add(a, c)
      inc(pos)
  my.bufpos = pos

proc processedRows*(my: var TCsvParser): int =
  ## returns the number of rows processed so far
  return my.currRow

proc readRow*(my: var TCsvParser, columns = 0): bool =
  ## reads the next row; if `columns` > 0, the row is expected to have
  ## exactly this many columns. Returns false if the end of the file
  ## has been reached, true otherwise.
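  ##
  ## For example, to insist on exactly 3 columns per row (a sketch; `p` is
  ## assumed to be an open `TCsvParser`; a row with a different number of
  ## columns raises `EInvalidCsv`):
  ##
  ## .. code-block:: nimrod
  ##   while readRow(p, columns = 3):
  ##     Echo "rows read so far: " & $processedRows(p)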
  var col = 0 # current column
  var oldpos = my.bufpos
  while my.buf[my.bufpos] != '\0':
    var oldlen = my.row.len
    if oldlen < col+1:
      setLen(my.row, col+1)
      my.row[col] = ""
    parseField(my, my.row[col])
    inc(col)
    if my.buf[my.bufpos] == my.sep: 
      inc(my.bufpos)
    else:
      case my.buf[my.bufpos]
      of '\c', '\l': 
        # skip empty lines:
        while true: 
          case my.buf[my.bufpos]
          of '\c': my.bufpos = handleCR(my, my.bufpos)
          of '\l': my.bufpos = handleLF(my, my.bufpos)
          else: break
      of '\0': nil
      else: error(my, my.bufpos, my.sep & " expected")
      break
  
  setLen(my.row, col)
  result = col > 0
  if result and col != columns and columns > 0: 
    error(my, oldpos+1, $columns & " columns expected, but found " & 
          $col & " columns")
  inc(my.currRow)
  
proc close*(my: var TCsvParser) {.inline.} = 
  ## closes the parser `my` and its associated input stream.
  lexbase.close(my)

when isMainModule:
  import os
  var s = newFileStream(ParamStr(1), fmRead)
  if s == nil: quit("cannot open the file: " & ParamStr(1))
  var x: TCsvParser
  open(x, s, ParamStr(1))
  while readRow(x):
    Echo "new row: "
    for val in items(x.row):
      Echo "##", val, "##"
  close(x)