version0.7.10

author: Andreas Rumpf <rumpf_a@web.de> 2009-06-08 08:06:25 +0200
committer: Andreas Rumpf <rumpf_a@web.de> 2009-06-08 08:06:25 +0200
commit: 4d4b3b1c04d41868ebb58bd9ccba7b303007e900 (patch)
tree: 909ed0aad0b145733521f4ac2bfb938dd4b43785 /lib/pure/parsecfg.nim
parent: ce88dc3e67436939b03f97e624c11ca6058fedce (diff)
download: Nim-4d4b3b1c04d41868ebb58bd9ccba7b303007e900.tar.gz
1 files changed, 352 insertions, 0 deletions
diff --git a/lib/pure/parsecfg.nim b/lib/pure/parsecfg.nim
new file mode 100644
index 000000000..5704f591c
--- /dev/null
+++ b/lib/pure/parsecfg.nim
@@ -0,0 +1,352 @@
+#
+#
+#            Nimrod's Runtime Library
+#        (c) Copyright 2008 Andreas Rumpf
+#
+#    See the file "copying.txt", included in this
+#    distribution, for details about the copyright.
+#
+
+## The ``parsecfg`` module implements a high performance configuration file 
+## parser. The configuration file's syntax is similar to the Windows ``.ini`` 
+## format, but much more powerful, as it is not a line based parser. String 
+## literals, raw string literals and triple quoted string literals are supported 
+## as in the Nimrod programming language.
+
+## This is an example of how a configuration file may look like:
+##
+## .. include:: doc/mytest.cfg
+##     :literal:
+## The file ``tests/tparscfg.nim`` demonstrates how to use the 
+## configuration file parser:
+##
+## .. code-block:: nimrod
+##     :file: tests/tparscfg.nim
+
+
+import 
+  hashes, strutils, lexbase, streams
+
+type 
+  TCfgEventKind* = enum ## enumation of all events that may occur when parsing
+    cfgEof,             ## end of file reached
+    cfgSectionStart,    ## a ``[section]`` has been parsed
+    cfgKeyValuePair,    ## a ``key=value`` pair has been detected
+    cfgOption,          ## a ``--key=value`` command line option
+    cfgError            ## an error ocurred during parsing
+    
+  TCfgEvent* = object of TObject ## describes a parsing event
+    case kind*: TCfgEventKind    ## the kind of the event
+    of cfgEof: nil
+    of cfgSectionStart: 
+      section*: string           ## `section` contains the name of the 
+                                 ## parsed section start (syntax: ``[section]``)
+    of cfgKeyValuePair, cfgOption: 
+      key*, value*: string       ## contains the (key, value) pair if an option
+                                 ## of the form ``--key: value`` or an ordinary
+                                 ## ``key= value`` pair has been parsed.
+                                 ## ``value==""`` if it was not specified in the
+                                 ## configuration file.
+    of cfgError:                 ## the parser encountered an error: `msg`
+      msg*: string               ## contains the error message. No exceptions
+                                 ## are thrown if a parse error occurs.
+  
+  TTokKind = enum 
+    tkInvalid, tkEof,        
+    tkSymbol, tkEquals, tkColon, tkBracketLe, tkBracketRi, tkDashDash
+  TToken {.final.} = object  # a token
+    kind: TTokKind           # the type of the token
+    literal: string          # the parsed (string) literal
+  
+  TParserState = enum 
+    startState # , commaState # not yet used
+  TCfgParser* = object of TBaseLexer ## the parser object.
+    tok: TToken
+    state: TParserState
+    filename: string
+
+proc open*(c: var TCfgParser, input: PStream, filename: string)
+  ## initializes the parser with an input stream. `Filename` is only used
+  ## for nice error messages.
+
+proc close*(c: var TCfgParser)
+  ## closes the parser `c` and its associated input stream.
+
+proc next*(c: var TCfgParser): TCfgEvent
+  ## retrieves the first/next event. This controls the parser.
+
+proc getColumn*(c: TCfgParser): int
+  ## get the current column the parser has arrived at.
+
+proc getLine*(c: TCfgParser): int
+  ## get the current line the parser has arrived at.
+  
+proc getFilename*(c: TCfgParser): string
+  ## get the filename of the file that the parser processes.
+
+proc errorStr*(c: TCfgParser, msg: string): string
+  ## returns a properly formated error message containing current line and
+  ## column information.
+
+
+# implementation
+
+const 
+  SymChars: TCharSet = {'a'..'z', 'A'..'Z', '0'..'9', '_', '\x80'..'\xFF'} 
+  
+proc rawGetTok(c: var TCfgParser, tok: var TToken)
+proc open(c: var TCfgParser, input: PStream, filename: string) = 
+  lexbase.open(c, input)
+  c.filename = filename
+  c.state = startState
+  c.tok.kind = tkInvalid
+  c.tok.literal = ""
+  rawGetTok(c, c.tok)
+  
+proc close(c: var TCfgParser) = 
+  lexbase.close(c)
+
+proc getColumn(c: TCfgParser): int = 
+  result = getColNumber(c, c.bufPos)
+
+proc getLine(c: TCfgParser): int = 
+  result = c.linenumber
+
+proc getFilename(c: TCfgParser): string = 
+  result = c.filename
+
+proc handleHexChar(c: var TCfgParser, xi: var int) = 
+  case c.buf[c.bufpos]
+  of '0'..'9': 
+    xi = (xi shl 4) or (ord(c.buf[c.bufpos]) - ord('0'))
+    inc(c.bufpos)
+  of 'a'..'f': 
+    xi = (xi shl 4) or (ord(c.buf[c.bufpos]) - ord('a') + 10)
+    inc(c.bufpos)
+  of 'A'..'F': 
+    xi = (xi shl 4) or (ord(c.buf[c.bufpos]) - ord('A') + 10)
+    inc(c.bufpos)
+  else: 
+    nil
+
+proc handleDecChars(c: var TCfgParser, xi: var int) = 
+  while c.buf[c.bufpos] in {'0'..'9'}: 
+    xi = (xi * 10) + (ord(c.buf[c.bufpos]) - ord('0'))
+    inc(c.bufpos)
+
+proc getEscapedChar(c: var TCfgParser, tok: var TToken) = 
+  inc(c.bufpos)               # skip '\'
+  case c.buf[c.bufpos]
+  of 'n', 'N': 
+    add(tok.literal, nl)
+    Inc(c.bufpos)
+  of 'r', 'R', 'c', 'C': 
+    add(tok.literal, '\c')
+    Inc(c.bufpos)
+  of 'l', 'L': 
+    add(tok.literal, '\L')
+    Inc(c.bufpos)
+  of 'f', 'F': 
+    add(tok.literal, '\f')
+    inc(c.bufpos)
+  of 'e', 'E': 
+    add(tok.literal, '\e')
+    Inc(c.bufpos)
+  of 'a', 'A': 
+    add(tok.literal, '\a')
+    Inc(c.bufpos)
+  of 'b', 'B': 
+    add(tok.literal, '\b')
+    Inc(c.bufpos)
+  of 'v', 'V': 
+    add(tok.literal, '\v')
+    Inc(c.bufpos)
+  of 't', 'T': 
+    add(tok.literal, '\t')
+    Inc(c.bufpos)
+  of '\'', '\"': 
+    add(tok.literal, c.buf[c.bufpos])
+    Inc(c.bufpos)
+  of '\\': 
+    add(tok.literal, '\\')
+    Inc(c.bufpos)
+  of 'x', 'X': 
+    inc(c.bufpos)
+    var xi = 0
+    handleHexChar(c, xi)
+    handleHexChar(c, xi)
+    add(tok.literal, Chr(xi))
+  of '0'..'9': 
+    var xi = 0
+    handleDecChars(c, xi)
+    if (xi <= 255): add(tok.literal, Chr(xi))
+    else: tok.kind = tkInvalid
+  else: tok.kind = tkInvalid
+  
+proc HandleCRLF(c: var TCfgParser, pos: int): int = 
+  case c.buf[pos]
+  of '\c': result = lexbase.HandleCR(c, pos)
+  of '\L': result = lexbase.HandleLF(c, pos)
+  else: result = pos
+  
+proc getString(c: var TCfgParser, tok: var TToken, rawMode: bool) = 
+  var pos = c.bufPos + 1          # skip "
+  var buf = c.buf                 # put `buf` in a register
+  tok.kind = tkSymbol
+  if (buf[pos] == '\"') and (buf[pos + 1] == '\"'): 
+    # long string literal:
+    inc(pos, 2)               # skip ""
+                              # skip leading newline:
+    pos = HandleCRLF(c, pos)
+    buf = c.buf
+    while true: 
+      case buf[pos]
+      of '\"': 
+        if (buf[pos + 1] == '\"') and (buf[pos + 2] == '\"'): break 
+        add(tok.literal, '\"')
+        Inc(pos)
+      of '\c', '\L': 
+        pos = HandleCRLF(c, pos)
+        buf = c.buf
+        add(tok.literal, nl)
+      of lexbase.EndOfFile: 
+        tok.kind = tkInvalid
+        break 
+      else: 
+        add(tok.literal, buf[pos])
+        Inc(pos)
+    c.bufpos = pos + 3       # skip the three """
+  else: 
+    # ordinary string literal
+    while true: 
+      var ch = buf[pos]
+      if ch == '\"': 
+        inc(pos)              # skip '"'
+        break 
+      if ch in {'\c', '\L', lexbase.EndOfFile}: 
+        tok.kind = tkInvalid
+        break 
+      if (ch == '\\') and not rawMode: 
+        c.bufPos = pos
+        getEscapedChar(c, tok)
+        pos = c.bufPos
+      else: 
+        add(tok.literal, ch)
+        Inc(pos)
+    c.bufpos = pos
+
+proc getSymbol(c: var TCfgParser, tok: var TToken) = 
+  var pos = c.bufpos
+  var buf = c.buf
+  while true: 
+    add(tok.literal, buf[pos])
+    Inc(pos)
+    if not (buf[pos] in SymChars): break 
+  c.bufpos = pos
+  tok.kind = tkSymbol
+
+proc skip(c: var TCfgParser) = 
+  var pos = c.bufpos
+  var buf = c.buf
+  while true: 
+    case buf[pos]
+    of ' ', '\t': 
+      Inc(pos)
+    of '#', ';': 
+      while not (buf[pos] in {'\c', '\L', lexbase.EndOfFile}): inc(pos)
+    of '\c', '\L': 
+      pos = HandleCRLF(c, pos)
+      buf = c.buf
+    else: 
+      break                   # EndOfFile also leaves the loop
+  c.bufpos = pos
+
+proc rawGetTok(c: var TCfgParser, tok: var TToken) = 
+  tok.kind = tkInvalid
+  setlen(tok.literal, 0)
+  skip(c)
+  case c.buf[c.bufpos]
+  of '=': 
+    tok.kind = tkEquals
+    inc(c.bufpos)
+    tok.literal = "="
+  of '-': 
+    inc(c.bufPos)
+    if c.buf[c.bufPos] == '-': inc(c.bufPos)
+    tok.kind = tkDashDash
+    tok.literal = "--"
+  of ':': 
+    tok.kind = tkColon
+    inc(c.bufpos)
+    tok.literal = ":"
+  of 'r', 'R': 
+    if c.buf[c.bufPos + 1] == '\"': 
+      Inc(c.bufPos)
+      getString(c, tok, true)
+    else: 
+      getSymbol(c, tok)
+  of '[': 
+    tok.kind = tkBracketLe
+    inc(c.bufpos)
+    tok.literal = "]"
+  of ']': 
+    tok.kind = tkBracketRi
+    Inc(c.bufpos)
+    tok.literal = "]"
+  of '\"': 
+    getString(c, tok, false)
+  of lexbase.EndOfFile: 
+    tok.kind = tkEof
+    tok.literal = "[EOF]"
+  else: getSymbol(c, tok)
+  
+proc errorStr(c: TCfgParser, msg: string): string = 
+  result = `%`("$1($2, $3) Error: $4", 
+               [c.filename, toString(getLine(c)), toString(getColumn(c)), msg])
+
+proc getKeyValPair(c: var TCfgParser, kind: TCfgEventKind): TCfgEvent = 
+  if c.tok.kind == tkSymbol: 
+    result.kind = kind
+    result.key = c.tok.literal
+    result.value = ""
+    rawGetTok(c, c.tok)
+    if c.tok.kind in {tkEquals, tkColon}: 
+      rawGetTok(c, c.tok)
+      if c.tok.kind == tkSymbol: 
+        result.value = c.tok.literal
+      else: 
+        result.kind = cfgError
+        result.msg = errorStr(c, "symbol expected, but found: " & c.tok.literal)
+      rawGetTok(c, c.tok)
+  else: 
+    result.kind = cfgError
+    result.msg = errorStr(c, "symbol expected, but found: " & c.tok.literal)
+    rawGetTok(c, c.tok)
+
+proc next(c: var TCfgParser): TCfgEvent = 
+  case c.tok.kind  
+  of tkEof: 
+    result.kind = cfgEof
+  of tkDashDash: 
+    rawGetTok(c, c.tok)
+    result = getKeyValPair(c, cfgOption)
+  of tkSymbol: 
+    result = getKeyValPair(c, cfgKeyValuePair)
+  of tkBracketLe: 
+    rawGetTok(c, c.tok)
+    if c.tok.kind == tkSymbol: 
+      result.kind = cfgSectionStart
+      result.section = c.tok.literal
+    else: 
+      result.kind = cfgError
+      result.msg = errorStr(c, "symbol expected, but found: " & c.tok.literal)
+    rawGetTok(c, c.tok)
+    if c.tok.kind == tkBracketRi: 
+      rawGetTok(c, c.tok)
+    else: 
+      result.kind = cfgError
+      result.msg = errorStr(c, "\']\' expected, but found: " & c.tok.literal)
+  of tkInvalid, tkEquals, tkColon, tkBracketRi: 
+    result.kind = cfgError
+    result.msg = errorStr(c, "invalid token: " & c.tok.literal)
+    rawGetTok(c, c.tok)
author	Andreas Rumpf <rumpf_a@web.de>	2009-06-08 08:06:25 +0200
committer	Andreas Rumpf <rumpf_a@web.de>	2009-06-08 08:06:25 +0200
commit	4d4b3b1c04d41868ebb58bd9ccba7b303007e900 (patch)
tree	909ed0aad0b145733521f4ac2bfb938dd4b43785 /lib/pure/parsecfg.nim
parent	ce88dc3e67436939b03f97e624c11ca6058fedce (diff)
download	Nim-4d4b3b1c04d41868ebb58bd9ccba7b303007e900.tar.gz