summary refs log tree commit diff stats
path: root/lib/pure/pegs.nim
diff options
context:
space:
mode:
authorrumpf_a@web.de <>2009-10-21 10:20:15 +0200
committerrumpf_a@web.de <>2009-10-21 10:20:15 +0200
commit053309e60aee1eda594a4817ac8ac2fb8c18fb04 (patch)
tree0f1ce8b0de0b493045eb97eeca6ebf06542de601 /lib/pure/pegs.nim
parent581572b28c65bc9fe47974cfd625210a69be0f3f (diff)
downloadNim-053309e60aee1eda594a4817ac8ac2fb8c18fb04.tar.gz
version 0.8.2
Diffstat (limited to 'lib/pure/pegs.nim')
-rwxr-xr-xlib/pure/pegs.nim1320
1 files changed, 1320 insertions, 0 deletions
diff --git a/lib/pure/pegs.nim b/lib/pure/pegs.nim
new file mode 100755
index 000000000..c029f92a2
--- /dev/null
+++ b/lib/pure/pegs.nim
@@ -0,0 +1,1320 @@
+#
+#
+#            Nimrod's Runtime Library
+#        (c) Copyright 2009 Andreas Rumpf
+#
+#    See the file "copying.txt", included in this
+#    distribution, for details about the copyright.
+#
+
+## Simple PEG (Parsing expression grammar) matching. Uses no memorization, but
+## uses superoperators and symbol inlining to improve performance. Note:
+## Matching performance is hopefully competitive with optimized regular
+## expression engines.
+##
+## .. include:: ../doc/pegdocs.txt
+##
+
+const
+  useUnicode = true ## change this to deactivate proper UTF-8 support
+
+import
+  strutils
+
+when useUnicode:
+  import unicode
+
+const
+  InlineThreshold = 5   ## number of leaves; -1 to disable inlining
+
+type
+  TPegKind = enum
+    pkEmpty,
+    pkAny,              ## any character (.)
+    pkAnyRune,          ## any Unicode character (_)
+    pkNewLine,          ## CR-LF, LF, CR
+    pkTerminal,
+    pkTerminalIgnoreCase,
+    pkTerminalIgnoreStyle,
+    pkChar,             ## single character to match
+    pkCharChoice,
+    pkNonTerminal,
+    pkSequence,         ## a b c ... --> Internal DSL: peg(a, b, c)
+    pkOrderedChoice,    ## a / b / ... --> Internal DSL: a / b or /[a, b, c]
+    pkGreedyRep,        ## a*     --> Internal DSL: *a
+                        ## a+     --> Internal DSL: +a; represented as (a a*)
+    pkGreedyRepChar,    ## x* where x is a single character (superop)
+    pkGreedyRepSet,     ## [set]* (superop)
+    pkGreedyAny,        ## .* or _* (superop)
+    pkOption,           ## a?     --> Internal DSL: ?a
+    pkAndPredicate,     ## &a     --> Internal DSL: &a
+    pkNotPredicate,     ## !a     --> Internal DSL: !a
+    pkCapture,          ## {a}    --> Internal DSL: capture(a)
+    pkRule,             ## a <- b
+    pkList              ## a, b
+  TNonTerminalFlag = enum
+    ntDeclared, ntUsed
+  TNonTerminal {.final.} = object ## represents a non terminal symbol
+    name: string        ## the name of the symbol
+    line: int           ## the line the symbol has been declared/used in
+    col: int            ## the column the symbol has been declared/used in
+    flags: set[TNonTerminalFlag] ## the nonterminal's flags
+    rule: TNode         ## the rule that the symbol refers to
+  TNode {.final.} = object
+    case kind: TPegKind
+    of pkEmpty, pkAny, pkAnyRune, pkGreedyAny, pkNewLine: nil
+    of pkTerminal, pkTerminalIgnoreCase, pkTerminalIgnoreStyle: term: string
+    of pkChar, pkGreedyRepChar: ch: char
+    of pkCharChoice, pkGreedyRepSet: charChoice: ref set[char]
+    of pkNonTerminal: nt: PNonTerminal
+    else: sons: seq[TNode]
+  PNonTerminal* = ref TNonTerminal
+  
+  TPeg* = TNode ## type that represents a PEG
+
+proc term*(t: string): TPeg =
+  ## constructs a PEG from a terminal string
+  if t.len != 1:  
+    result.kind = pkTerminal
+    result.term = t
+  else:
+    result.kind = pkChar
+    result.ch = t[0]
+
+proc termIgnoreCase*(t: string): TPeg =
+  ## constructs a PEG from a terminal string; ignore case for matching
+  result.kind = pkTerminalIgnoreCase
+  result.term = t
+
+proc termIgnoreStyle*(t: string): TPeg =
+  ## constructs a PEG from a terminal string; ignore style for matching
+  result.kind = pkTerminalIgnoreStyle
+  result.term = t
+
+proc term*(t: char): TPeg =
+  ## constructs a PEG from a terminal char
+  assert t != '\0'
+  result.kind = pkChar
+  result.ch = t
+  
+proc charSet*(s: set[char]): TPeg =
+  ## constructs a PEG from a character set `s`
+  assert '\0' notin s
+  result.kind = pkCharChoice
+  new(result.charChoice)
+  result.charChoice^ = s
+
+proc len(a: TPeg): int {.inline.} = return a.sons.len
+proc add(d: var TPeg, s: TPeg) {.inline.} = add(d.sons, s)
+
+proc addChoice(dest: var TPeg, elem: TPeg) =
+  var L = dest.len-1
+  if L >= 0 and dest.sons[L].kind == pkCharChoice: 
+    case elem.kind
+    of pkCharChoice:
+      dest.sons[L].charChoice^ = dest.sons[L].charChoice^ + elem.charChoice^
+    of pkChar: incl(dest.sons[L].charChoice^, elem.ch)
+    else: add(dest, elem)
+  else: add(dest, elem)
+
+template multipleOp(k: TPegKind, localOpt: expr) =
+  result.kind = k
+  result.sons = @[]
+  for x in items(a):
+    if x.kind == k:
+      for y in items(x.sons):
+        localOpt(result, y)
+    else:
+      localOpt(result, x)
+  if result.len == 1:
+    result = result.sons[0]
+
+proc `/`*(a: openArray[TPeg]): TPeg =
+  ## constructs an ordered choice with the PEGs in `a`
+  multipleOp(pkOrderedChoice, addChoice)
+
+proc addSequence(dest: var TPeg, elem: TPeg) =
+  var L = dest.len-1
+  if L >= 0 and dest.sons[L].kind == pkTerminal: 
+    case elem.kind
+    of pkTerminal: add(dest.sons[L].term, elem.term)
+    of pkChar: add(dest.sons[L].term, elem.ch)
+    else: add(dest, elem)
+  else: add(dest, elem)
+
+proc sequence*(a: openArray[TPeg]): TPeg =
+  ## constructs a sequence with all the PEGs from `a`
+  multipleOp(pkSequence, addSequence)
+ 
+proc `?`*(a: TPeg): TPeg =
+  ## constructs an optional piece with the PEG `a`
+  if a.kind in {pkOption, pkGreedyRep, pkGreedyAny, pkGreedyRepChar,
+                pkGreedyRepSet}:
+    # a* ?  --> a*
+    # a? ?  --> a?
+    result = a
+  else:
+    result.kind = pkOption
+    result.sons = @[a]
+
+proc `*`*(a: TPeg): TPeg =
+  ## constructs a "greedy repetition" piece the PEG `a`
+  case a.kind
+  of pkGreedyRep, pkGreedyRepChar, pkGreedyRepSet, pkGreedyAny, pkOption:
+    assert false
+    # produces endless loop!
+  of pkChar:
+    result.kind = pkGreedyRepChar
+    result.ch = a.ch
+  of pkCharChoice:
+    result.kind = pkGreedyRepSet
+    result.charChoice = a.charChoice # copying a reference suffices!
+  of pkAny, pkAnyRune:
+    result.kind = pkGreedyAny
+  else:
+    result.kind = pkGreedyRep
+    result.sons = @[a]
+  
+proc `+`*(a: TPeg): TPeg =
+  ## constructs a "greedy positive repetition" with the PEG `a`
+  return sequence(a, *a)
+  
+proc `&`*(a: TPeg): TPeg =
+  ## constructs an "and predicate" with the PEG `a`
+  result.kind = pkAndPredicate
+  result.sons = @[a]
+
+proc `!`*(a: TPeg): TPeg =
+  ## constructs a "not predicate" with the PEG `a`
+  result.kind = pkNotPredicate
+  result.sons = @[a]
+
+proc any*: TPeg {.inline.} =
+  ## constructs the PEG `any character`:idx: (``.``)
+  result.kind = pkAny
+
+proc anyRune*: TPeg {.inline.} =
+  ## constructs the PEG `any rune`:idx: (``_``)
+  result.kind = pkAnyRune
+
+proc newLine*: TPeg {.inline.} =
+  ## constructs the PEG `newline`:idx: (``\n``)
+  result.kind = pkNewline
+
+proc capture*(a: TPeg): TPeg =
+  ## constructs a capture with the PEG `a`
+  result.kind = pkCapture
+  result.sons = @[a]
+
+proc spaceCost(n: TPeg): int =
+  case n.kind
+  of pkEmpty: nil
+  of pkTerminal, pkTerminalIgnoreCase, pkTerminalIgnoreStyle, pkChar,
+     pkGreedyRepChar, pkCharChoice, pkGreedyRepSet, pkAny, pkAnyRune,
+     pkNewLine, pkGreedyAny:
+    result = 1
+  of pkNonTerminal:
+    # we cannot inline a rule with a non-terminal
+    result = InlineThreshold+1
+  else:
+    for i in 0..n.len-1:
+      inc(result, spaceCost(n.sons[i]))
+      if result >= InlineThreshold: break
+
+proc nonterminal*(n: PNonTerminal): TPeg = 
+  ## constructs a PEG that consists of the nonterminal symbol
+  assert n != nil
+  if ntDeclared in n.flags and spaceCost(n.rule) < InlineThreshold:
+    when false: echo "inlining symbol: ", n.name
+    result = n.rule # inlining of rule enables better optimizations
+  else:
+    result.kind = pkNonTerminal
+    result.nt = n
+
+proc newNonTerminal*(name: string, line, column: int): PNonTerminal =
+  ## constructs a nonterminal symbol
+  new(result)
+  result.name = name
+  result.line = line
+  result.col = column
+
+template letters*: expr =
+  ## expands to ``charset({'A'..'Z', 'a'..'z'})``
+  charset({'A'..'Z', 'a'..'z'})
+  
+template digits*: expr =
+  ## expands to ``charset({'0'..'9'})``
+  charset({'0'..'9'})
+
+template whitespace*: expr =
+  ## expands to ``charset({' ', '\9'..'\13'})``
+  charset({' ', '\9'..'\13'})
+  
+template identChars*: expr =
+  ## expands to ``charset({'a'..'z', 'A'..'Z', '0'..'9', '_'})``
+  charset({'a'..'z', 'A'..'Z', '0'..'9', '_'})
+  
+template identStartChars*: expr =
+  ## expands to ``charset({'A'..'Z', 'a'..'z', '_'})``
+  charset({'a'..'z', 'A'..'Z', '_'})
+
+template ident*: expr =
+  ## same as ``[a-zA-Z_][a-zA-z_0-9]*``; standard identifier
+  sequence(charset({'a'..'z', 'A'..'Z', '_'}),
+           *charset({'a'..'z', 'A'..'Z', '0'..'9', '_'}))
+  
+template natural*: expr =
+  ## same as ``\d+``
+  +digits
+
+const
+  MaxSubpatterns* = 10 ## defines the maximum number of subpatterns that
+                       ## can be captured. More subpatterns cannot be captured! 
+
+# --------------------- core engine -------------------------------------------
+
+type
+  TMatchClosure {.final.} = object
+    matches: array[0..maxSubpatterns-1, tuple[first, last: int]]
+    ml: int
+
+when not useUnicode:
+  type
+    TRune = char
+  template fastRuneAt(s, i, ch: expr) =
+    ch = s[i]
+    inc(i)
+  template runeLenAt(s, i: expr): expr = 1
+
+proc m(s: string, p: TPeg, start: int, c: var TMatchClosure): int =
+  ## this implements a simple PEG interpreter. Thanks to superoperators it
+  ## has competitive performance nevertheless.
+  ## Returns -1 if it does not match, else the length of the match
+  case p.kind
+  of pkEmpty: result = 0 # match of length 0
+  of pkAny:
+    if s[start] != '\0': result = 1
+    else: result = -1
+  of pkAnyRune:
+    if s[start] != '\0':
+      result = runeLenAt(s, start)
+    else:
+      result = -1
+  of pkGreedyAny:
+    result = len(s) - start
+  of pkNewLine:
+    if s[start] == '\L': result = 1
+    elif s[start] == '\C':
+      if s[start+1] == '\L': result = 2
+      else: result = 1
+    else: result = -1
+  of pkTerminal:
+    result = len(p.term)
+    for i in 0..result-1:
+      if p.term[i] != s[start+i]:
+        result = -1
+        break
+  of pkTerminalIgnoreCase:
+    var
+      i = 0
+      a, b: TRune
+    result = start
+    while i < len(p.term):
+      fastRuneAt(p.term, i, a)
+      fastRuneAt(s, result, b)
+      if toLower(a) != toLower(b):
+        result = -1
+        break
+    dec(result, start)
+  of pkTerminalIgnoreStyle:
+    var
+      i = 0
+      a, b: TRune
+    result = start
+    while i < len(p.term):
+      while true:
+        fastRuneAt(p.term, i, a)
+        if a != TRune('_'): break
+      while true:
+        fastRuneAt(s, result, b)
+        if b != TRune('_'): break
+      if toLower(a) != toLower(b):
+        result = -1
+        break
+    dec(result, start)
+  of pkChar:
+    if p.ch == s[start]: result = 1
+    else: result = -1
+  of pkCharChoice:
+    if contains(p.charChoice^, s[start]): result = 1
+    else: result = -1
+  of pkNonTerminal:
+    var oldMl = c.ml
+    when false: echo "enter: ", p.nt.name
+    result = m(s, p.nt.rule, start, c)
+    when false: echo "leave: ", p.nt.name
+    if result < 0: c.ml = oldMl
+  of pkSequence:
+    var oldMl = c.ml  
+    result = 0
+    for i in 0..high(p.sons):
+      var x = m(s, p.sons[i], start+result, c)
+      if x < 0:
+        c.ml = oldMl
+        result = -1
+        break
+      else: inc(result, x)
+  of pkOrderedChoice:
+    var oldMl = c.ml
+    for i in 0..high(p.sons):
+      result = m(s, p.sons[i], start, c)
+      if result >= 0: break
+      c.ml = oldMl
+  of pkGreedyRep:
+    result = 0
+    while true:
+      var x = m(s, p.sons[0], start+result, c)
+      # if x == 0, we have an endless loop; so the correct behaviour would be
+      # not to break. But endless loops can be easily introduced:
+      # ``(comment / \w*)*`` is such an example. Breaking for x == 0 does the
+      # expected thing in this case.
+      if x <= 0: break
+      inc(result, x)
+  of pkGreedyRepChar:
+    result = 0
+    var ch = p.ch
+    while ch == s[start+result]: inc(result)
+  of pkGreedyRepSet:
+    result = 0
+    while contains(p.charChoice^, s[start+result]): inc(result)
+  of pkOption:
+    result = max(0, m(s, p.sons[0], start, c))
+  of pkAndPredicate:
+    var oldMl = c.ml
+    result = m(s, p.sons[0], start, c)
+    if result >= 0: result = 0 # do not consume anything
+    else: c.ml = oldMl
+  of pkNotPredicate:
+    var oldMl = c.ml
+    result = m(s, p.sons[0], start, c)
+    if result < 0: result = 0
+    else:
+      c.ml = oldMl
+      result = -1
+  of pkCapture:
+    var idx = c.ml # reserve a slot for the subpattern
+    inc(c.ml)
+    result = m(s, p.sons[0], start, c)
+    if result >= 0:
+      if idx < maxSubpatterns:
+        c.matches[idx] = (start, start+result-1)
+      #else: silently ignore the capture
+    else:
+      c.ml = idx
+  of pkRule, pkList: assert false
+
+proc match*(s: string, pattern: TPeg, matches: var openarray[string],
+            start = 0): bool =
+  ## returns ``true`` if ``s[start..]`` matches the ``pattern`` and
+  ## the captured substrings in the array ``matches``. If it does not
+  ## match, nothing is written into ``matches`` and ``false`` is
+  ## returned.
+  var c: TMatchClosure
+  result = m(s, pattern, start, c) == len(s)
+  if result:
+    for i in 0..c.ml-1:
+      matches[i] = copy(s, c.matches[i][0], c.matches[i][1])
+
+proc match*(s: string, pattern: TPeg, start = 0): bool =
+  ## returns ``true`` if ``s`` matches the ``pattern`` beginning from ``start``.
+  var c: TMatchClosure
+  result = m(s, pattern, start, c) == len(s)
+
+proc matchLen*(s: string, pattern: TPeg, matches: var openarray[string],
+               start = 0): int =
+  ## the same as ``match``, but it returns the length of the match,
+  ## if there is no match, -1 is returned. Note that a match length
+  ## of zero can happen. It's possible that a suffix of `s` remains
+  ## that does not belong to the match.
+  var c: TMatchClosure
+  result = m(s, pattern, start, c)
+  if result >= 0:
+    for i in 0..c.ml-1:
+      matches[i] = copy(s, c.matches[i][0], c.matches[i][1])
+
+proc matchLen*(s: string, pattern: TPeg, start = 0): int =
+  ## the same as ``match``, but it returns the length of the match,
+  ## if there is no match, -1 is returned. Note that a match length
+  ## of zero can happen. It's possible that a suffix of `s` remains
+  ## that does not belong to the match.
+  var c: TMatchClosure
+  result = m(s, pattern, start, c)
+
+proc find*(s: string, pattern: TPeg, matches: var openarray[string],
+           start = 0): int =
+  ## returns the starting position of ``pattern`` in ``s`` and the captured
+  ## substrings in the array ``matches``. If it does not match, nothing
+  ## is written into ``matches`` and -1 is returned.
+  for i in 0 .. s.len-1:
+    if matchLen(s, pattern, matches, i) >= 0: return i
+  return -1
+  # could also use the pattern here: (!P .)* P
+  
+proc find*(s: string, pattern: TPeg, start = 0): int =
+  ## returns the starting position of ``pattern`` in ``s``. If it does not
+  ## match, -1 is returned.
+  for i in 0 .. s.len-1:
+    if matchLen(s, pattern, i) >= 0: return i
+  return -1
+  
+template `=~`*(s: string, pattern: TPeg): expr =
+  ## This calls ``match`` with an implicit declared ``matches`` array that 
+  ## can be used in the scope of the ``=~`` call: 
+  ## 
+  ## .. code-block:: nimrod
+  ##
+  ##   if line =~ peg"\s* {\w+} \s* '=' \s* {\w+}": 
+  ##     # matches a key=value pair:
+  ##     echo("Key: ", matches[0])
+  ##     echo("Value: ", matches[1])
+  ##   elif line =~ peg"\s*{'#'.*}":
+  ##     # matches a comment
+  ##     # note that the implicit ``matches`` array is different from the
+  ##     # ``matches`` array of the first branch
+  ##     echo("comment: ", matches[0])
+  ##   else:
+  ##     echo("syntax error")
+  ##  
+  when not definedInScope(matches):
+    var matches: array[0..maxSubpatterns-1, string]
+  match(s, pattern, matches)
+
+# ------------------------- more string handling ------------------------------
+
+proc contains*(s: string, pattern: TPeg, start = 0): bool =
+  ## same as ``find(s, pattern, start) >= 0``
+  return find(s, pattern, start) >= 0
+
+proc contains*(s: string, pattern: TPeg, matches: var openArray[string],
+              start = 0): bool =
+  ## same as ``find(s, pattern, matches, start) >= 0``
+  return find(s, pattern, matches, start) >= 0
+
+proc startsWith*(s: string, prefix: TPeg): bool =
+  ## returns true if `s` starts with the pattern `prefix`
+  result = matchLen(s, prefix) >= 0
+
+proc endsWith*(s: string, suffix: TPeg): bool =
+  ## returns true if `s` ends with the pattern `prefix`
+  for i in 0 .. s.len-1:
+    if matchLen(s, suffix, i) == s.len - i: return true
+
+proc replace*(s: string, sub: TPeg, by: string): string =
+  ## Replaces `sub` in `s` by the string `by`. Captures can be accessed in `by`
+  ## with the notation ``$i`` and ``$#`` (see strutils.`%`). Examples:
+  ##
+  ## .. code-block:: nimrod
+  ##   "var1=key; var2=key2".replace(peg"{\ident}'='{\ident}", "$1<-$2$2")
+  ##
+  ## Results in:
+  ##
+  ## .. code-block:: nimrod
+  ##
+  ##   "var1<-keykey; val2<-key2key2"
+  result = ""
+  var i = 0
+  var caps: array[0..maxSubpatterns-1, string]
+  while i < s.len:
+    var x = matchLen(s, sub, caps, i)
+    if x <= 0:
+      add(result, s[i])
+      inc(i)
+    else:
+      addf(result, by, caps)
+      inc(i, x)
+  # copy the rest:
+  add(result, copy(s, i))
+  
+proc parallelReplace*(s: string, subs: openArray[
+                      tuple[pattern: TPeg, repl: string]]): string = 
+  ## Returns a modified copy of `s` with the substitutions in `subs`
+  ## applied in parallel.
+  result = ""
+  var i = 0
+  var caps: array[0..maxSubpatterns-1, string]
+  while i < s.len:
+    block searchSubs:
+      for j in 0..high(subs):
+        var x = matchLen(s, subs[j][0], caps, i)
+        if x > 0:
+          addf(result, subs[j][1], caps)
+          inc(i, x)
+          break searchSubs
+      add(result, s[i])
+      inc(i)
+  # copy the rest:
+  add(result, copy(s, i))  
+  
+proc transformFile*(infile, outfile: string,
+                    subs: openArray[tuple[pattern: TPeg, repl: string]]) =
+  ## reads in the file `infile`, performs a parallel replacement (calls
+  ## `parallelReplace`) and writes back to `outfile`. Calls ``quit`` if an
+  ## error occurs. This is supposed to be used for quick scripting.
+  var x = readFile(infile)
+  if not isNil(x):
+    var f: TFile
+    if open(f, outfile, fmWrite):
+      write(f, x.parallelReplace(subs))
+      close(f)
+    else:
+      quit("cannot open for writing: " & outfile)
+  else:
+    quit("cannot open for reading: " & infile)
+  
+iterator split*(s: string, sep: TPeg): string =
+  ## Splits the string `s` into substrings.
+  ##
+  ## Substrings are separated by the PEG `sep`.
+  ## Examples:
+  ##
+  ## .. code-block:: nimrod
+  ##   for word in split("00232this02939is39an22example111", peg"\d+"):
+  ##     writeln(stdout, word)
+  ##
+  ## Results in:
+  ##
+  ## .. code-block:: nimrod
+  ##   "this"
+  ##   "is"
+  ##   "an"
+  ##   "example"
+  ##
+  var
+    first = 0
+    last = 0
+  while last < len(s):
+    var x = matchLen(s, sep, last)
+    if x > 0: inc(last, x)
+    first = last
+    while last < len(s):
+      inc(last)
+      x = matchLen(s, sep, last)
+      if x > 0: break
+    if first < last:
+      yield copy(s, first, last-1)
+
+proc split*(s: string, sep: TPeg): seq[string] {.noSideEffect.} =
+  ## Splits the string `s` into substrings.
+  accumulateResult(split(s, sep))
+
+# ------------------------- debugging -----------------------------------------
+
+proc esc(c: char, reserved = {'\0'..'\255'}): string = 
+  case c
+  of '\b': result = "\\b"
+  of '\t': result = "\\t"
+  of '\c': result = "\\c"
+  of '\L': result = "\\l"
+  of '\v': result = "\\v"
+  of '\f': result = "\\f"
+  of '\e': result = "\\e"
+  of '\a': result = "\\a"
+  of '\\': result = "\\\\"
+  of 'a'..'z', 'A'..'Z', '0'..'9', '_': result = $c
+  elif c < ' ' or c >= '\128': result = '\\' & $ord(c)
+  elif c in reserved: result = '\\' & c
+  else: result = $c
+  
+proc singleQuoteEsc(c: Char): string = return "'" & esc(c, {'\''}) & "'"
+
+proc singleQuoteEsc(str: string): string = 
+  result = "'"
+  for c in items(str): add result, esc(c, {'\''})
+  add result, '\''
+  
+proc charSetEscAux(cc: set[char]): string = 
+  const reserved = {'^', '-', ']'}
+  result = ""
+  var c1 = 0
+  while c1 <= 0xff: 
+    if chr(c1) in cc: 
+      var c2 = c1
+      while c2 < 0xff and chr(succ(c2)) in cc: inc(c2)
+      if c1 == c2: 
+        add result, esc(chr(c1), reserved)
+      elif c2 == succ(c1): 
+        add result, esc(chr(c1), reserved) & esc(chr(c2), reserved)
+      else: 
+        add result, esc(chr(c1), reserved) & '-' & esc(chr(c2), reserved)
+      c1 = c2
+    inc(c1)
+  
+proc CharSetEsc(cc: set[char]): string =
+  if card(cc) >= 128+64: 
+    result = "[^" & CharSetEscAux({'\1'..'\xFF'} - cc) & ']'
+  else: 
+    result = '[' & CharSetEscAux(cc) & ']'
+  
+proc toStrAux(r: TPeg, res: var string) = 
+  case r.kind
+  of pkEmpty: add(res, "()")
+  of pkAny: add(res, '.')
+  of pkAnyRune: add(res, '_')
+  of pkNewline: add(res, "\\n")
+  of pkTerminal: add(res, singleQuoteEsc(r.term))
+  of pkTerminalIgnoreCase:
+    add(res, 'i')
+    add(res, singleQuoteEsc(r.term))
+  of pkTerminalIgnoreStyle:
+    add(res, 'y')
+    add(res, singleQuoteEsc(r.term))
+  of pkChar: add(res, singleQuoteEsc(r.ch))
+  of pkCharChoice: add(res, charSetEsc(r.charChoice^))
+  of pkNonTerminal: add(res, r.nt.name)
+  of pkSequence:
+    add(res, '(')
+    toStrAux(r.sons[0], res)
+    for i in 1 .. high(r.sons):
+      add(res, ' ')
+      toStrAux(r.sons[i], res)
+    add(res, ')')
+  of pkOrderedChoice:
+    add(res, '(')
+    toStrAux(r.sons[0], res)
+    for i in 1 .. high(r.sons):
+      add(res, " / ")
+      toStrAux(r.sons[i], res)
+    add(res, ')')
+  of pkGreedyRep:
+    toStrAux(r.sons[0], res)
+    add(res, '*')
+  of pkGreedyRepChar:
+    add(res, singleQuoteEsc(r.ch))
+    add(res, '*')
+  of pkGreedyRepSet:
+    add(res, charSetEsc(r.charChoice^))
+    add(res, '*')
+  of pkGreedyAny:
+    add(res, ".*")
+  of pkOption:
+    toStrAux(r.sons[0], res)
+    add(res, '?')
+  of pkAndPredicate:
+    add(res, '&')
+    toStrAux(r.sons[0], res)
+  of pkNotPredicate:
+    add(res, '!')
+    toStrAux(r.sons[0], res)
+  of pkCapture:
+    add(res, '{')
+    toStrAux(r.sons[0], res)    
+    add(res, '}')
+  of pkRule:
+    toStrAux(r.sons[0], res)    
+    add(res, " <- ")
+    toStrAux(r.sons[1], res)
+  of pkList:
+    for i in 0 .. high(r.sons):
+      toStrAux(r.sons[i], res)
+      add(res, "\n")  
+
+proc `$` *(r: TPeg): string =
+  ## converts a PEG to its string representation
+  result = ""
+  toStrAux(r, result)
+
+
+# ------------------- scanner -------------------------------------------------
+
+type
+  TModifier = enum
+    modNone,
+    modVerbatim,
+    modIgnoreCase,
+    modIgnoreStyle
+  TTokKind = enum       ## enumeration of all tokens
+    tkInvalid,          ## invalid token
+    tkEof,              ## end of file reached
+    tkAny,              ## .
+    tkAnyRune,          ## _
+    tkIdentifier,       ## abc
+    tkStringLit,        ## "abc" or 'abc'
+    tkCharSet,          ## [^A-Z]
+    tkParLe,            ## '('
+    tkParRi,            ## ')'
+    tkCurlyLe,          ## '{'
+    tkCurlyRi,          ## '}'
+    tkArrow,            ## '<-'
+    tkBar,              ## '/'
+    tkStar,             ## '*'
+    tkPlus,             ## '+'
+    tkAmp,              ## '&'
+    tkNot,              ## '!'
+    tkOption,           ## '?'
+    tkBuiltin,          ## \identifier
+    tkEscaped           ## \\
+  
+  TToken {.final.} = object  ## a token
+    kind: TTokKind           ## the type of the token
+    modifier: TModifier
+    literal: string          ## the parsed (string) literal
+    charset: set[char]       ## if kind == tkCharSet
+  
+  TPegLexer = object          ## the lexer object.
+    bufpos: int               ## the current position within the buffer
+    buf: cstring              ## the buffer itself
+    LineNumber: int           ## the current line number
+    lineStart: int            ## index of last line start in buffer
+    colOffset: int            ## column to add
+    filename: string
+
+const
+  tokKindToStr: array[TTokKind, string] = [
+    "invalid", "[EOF]", ".", "_", "identifier", "string literal",
+    "character set", "(", ")", "{", "}", "<-", "/", "*", "+", "&", "!", "?",
+    "built-in", "escaped"
+  ]
+
+proc HandleCR(L: var TPegLexer, pos: int): int =
+  assert(L.buf[pos] == '\c')
+  inc(L.linenumber)
+  result = pos+1
+  if L.buf[result] == '\L': inc(result)
+  L.lineStart = result
+
+proc HandleLF(L: var TPegLexer, pos: int): int =
+  assert(L.buf[pos] == '\L')
+  inc(L.linenumber)
+  result = pos+1
+  L.lineStart = result
+
+proc init(L: var TPegLexer, input, filename: string, line = 1, col = 0) = 
+  L.buf = input
+  L.bufpos = 0
+  L.lineNumber = line
+  L.colOffset = col
+  L.lineStart = 0
+  L.filename = filename
+
+proc getColumn(L: TPegLexer): int {.inline.} = 
+  result = abs(L.bufpos - L.lineStart) + L.colOffset
+
+proc getLine(L: TPegLexer): int {.inline.} = 
+  result = L.linenumber
+  
+proc errorStr(L: TPegLexer, msg: string, line = -1, col = -1): string =
+  var line = if line < 0: getLine(L) else: line
+  var col = if col < 0: getColumn(L) else: col
+  result = "$1($2, $3) Error: $4" % [L.filename, $line, $col, msg]
+
+proc handleHexChar(c: var TPegLexer, xi: var int) = 
+  case c.buf[c.bufpos]
+  of '0'..'9': 
+    xi = (xi shl 4) or (ord(c.buf[c.bufpos]) - ord('0'))
+    inc(c.bufpos)
+  of 'a'..'f': 
+    xi = (xi shl 4) or (ord(c.buf[c.bufpos]) - ord('a') + 10)
+    inc(c.bufpos)
+  of 'A'..'F': 
+    xi = (xi shl 4) or (ord(c.buf[c.bufpos]) - ord('A') + 10)
+    inc(c.bufpos)
+  else: nil
+
+proc getEscapedChar(c: var TPegLexer, tok: var TToken) = 
+  inc(c.bufpos)
+  case c.buf[c.bufpos]
+  of 'r', 'R', 'c', 'C': 
+    add(tok.literal, '\c')
+    Inc(c.bufpos)
+  of 'l', 'L': 
+    add(tok.literal, '\L')
+    Inc(c.bufpos)
+  of 'f', 'F': 
+    add(tok.literal, '\f')
+    inc(c.bufpos)
+  of 'e', 'E': 
+    add(tok.literal, '\e')
+    Inc(c.bufpos)
+  of 'a', 'A': 
+    add(tok.literal, '\a')
+    Inc(c.bufpos)
+  of 'b', 'B': 
+    add(tok.literal, '\b')
+    Inc(c.bufpos)
+  of 'v', 'V': 
+    add(tok.literal, '\v')
+    Inc(c.bufpos)
+  of 't', 'T': 
+    add(tok.literal, '\t')
+    Inc(c.bufpos)
+  of 'x', 'X': 
+    inc(c.bufpos)
+    var xi = 0
+    handleHexChar(c, xi)
+    handleHexChar(c, xi)
+    if xi == 0: tok.kind = tkInvalid
+    else: add(tok.literal, Chr(xi))
+  of '0'..'9': 
+    var val = ord(c.buf[c.bufpos]) - ord('0')
+    Inc(c.bufpos)
+    var i = 1
+    while (i <= 3) and (c.buf[c.bufpos] in {'0'..'9'}): 
+      val = val * 10 + ord(c.buf[c.bufpos]) - ord('0')
+      inc(c.bufpos)
+      inc(i)
+    if val > 0 and val <= 255: add(tok.literal, chr(val))
+    else: tok.kind = tkInvalid
+  of '\0'..'\31':
+    tok.kind = tkInvalid
+  elif c.buf[c.bufpos] in strutils.letters:
+    tok.kind = tkInvalid
+  else:
+    add(tok.literal, c.buf[c.bufpos])
+    Inc(c.bufpos)
+  
+proc skip(c: var TPegLexer) = 
+  var pos = c.bufpos
+  var buf = c.buf
+  while true: 
+    case buf[pos]
+    of ' ', '\t': 
+      Inc(pos)
+    of '#':
+      while not (buf[pos] in {'\c', '\L', '\0'}): inc(pos)
+    of '\c':
+      pos = HandleCR(c, pos)
+      buf = c.buf
+    of '\L': 
+      pos = HandleLF(c, pos)
+      buf = c.buf
+    else: 
+      break                   # EndOfFile also leaves the loop
+  c.bufpos = pos
+  
+proc getString(c: var TPegLexer, tok: var TToken) = 
+  tok.kind = tkStringLit
+  var pos = c.bufPos + 1
+  var buf = c.buf
+  var quote = buf[pos-1]
+  while true: 
+    case buf[pos]
+    of '\\':
+      c.bufpos = pos
+      getEscapedChar(c, tok)
+      pos = c.bufpos
+    of '\c', '\L', '\0':
+      tok.kind = tkInvalid
+      break
+    elif buf[pos] == quote:
+      inc(pos)
+      break      
+    else:
+      add(tok.literal, buf[pos])
+      Inc(pos)
+  c.bufpos = pos
+  
+proc getCharSet(c: var TPegLexer, tok: var TToken) = 
+  tok.kind = tkCharSet
+  tok.charset = {}
+  var pos = c.bufPos + 1
+  var buf = c.buf
+  var caret = false
+  if buf[pos] == '^':
+    inc(pos)
+    caret = true
+  while true:
+    var ch: char
+    case buf[pos]
+    of ']':
+      inc(pos)
+      break
+    of '\\':
+      c.bufpos = pos
+      getEscapedChar(c, tok)
+      pos = c.bufpos
+      ch = tok.literal[tok.literal.len-1]
+    of '\C', '\L', '\0':
+      tok.kind = tkInvalid
+      break
+    else: 
+      ch = buf[pos]
+      Inc(pos)
+    incl(tok.charset, ch)
+    if buf[pos] == '-':
+      if buf[pos+1] == ']':
+        incl(tok.charset, '-')
+        inc(pos)
+      else:
+        inc(pos)
+        var ch2: char
+        case buf[pos]
+        of '\\':
+          c.bufpos = pos
+          getEscapedChar(c, tok)
+          pos = c.bufpos
+          ch2 = tok.literal[tok.literal.len-1]
+        of '\C', '\L', '\0':
+          tok.kind = tkInvalid
+          break
+        else: 
+          ch2 = buf[pos]
+          Inc(pos)
+        for i in ord(ch)+1 .. ord(ch2):
+          incl(tok.charset, chr(i))
+  c.bufpos = pos
+  if caret: tok.charset = {'\1'..'\xFF'} - tok.charset
+  
+proc getSymbol(c: var TPegLexer, tok: var TToken) = 
+  var pos = c.bufpos
+  var buf = c.buf
+  while true: 
+    add(tok.literal, buf[pos])
+    Inc(pos)
+    if buf[pos] notin strutils.IdentChars: break
+  c.bufpos = pos
+  tok.kind = tkIdentifier
+
+proc getBuiltin(c: var TPegLexer, tok: var TToken) =
+  if c.buf[c.bufpos+1] in strutils.Letters:
+    inc(c.bufpos)
+    getSymbol(c, tok)
+    tok.kind = tkBuiltin
+  else:
+    tok.kind = tkEscaped
+    getEscapedChar(c, tok) # may set tok.kind to tkInvalid
+
+proc getTok(c: var TPegLexer, tok: var TToken) = 
+  tok.kind = tkInvalid
+  tok.modifier = modNone
+  setlen(tok.literal, 0)
+  skip(c)
+  case c.buf[c.bufpos]
+  of '{':
+    tok.kind = tkCurlyLe
+    inc(c.bufpos)
+    add(tok.literal, '{')
+  of '}': 
+    tok.kind = tkCurlyRi
+    inc(c.bufpos)
+    add(tok.literal, '}')
+  of '[': 
+    getCharset(c, tok)
+  of '(':
+    tok.kind = tkParLe
+    Inc(c.bufpos)
+    add(tok.literal, '(')
+  of ')':
+    tok.kind = tkParRi
+    Inc(c.bufpos)
+    add(tok.literal, ')')
+  of '.': 
+    tok.kind = tkAny
+    inc(c.bufpos)
+    add(tok.literal, '.')
+  of '_':
+    tok.kind = tkAnyRune
+    inc(c.bufpos)
+    add(tok.literal, '_')
+  of '\\': 
+    getBuiltin(c, tok)
+  of '\'', '"': getString(c, tok)
+  of '\0': 
+    tok.kind = tkEof
+    tok.literal = "[EOF]"
+  of 'a'..'z', 'A'..'Z', '\128'..'\255':
+    getSymbol(c, tok)
+    if c.buf[c.bufpos] in {'\'', '"'}:
+      case tok.literal
+      of "i": tok.modifier = modIgnoreCase
+      of "y": tok.modifier = modIgnoreStyle
+      of "v": tok.modifier = modVerbatim
+      else: nil
+      setLen(tok.literal, 0)
+      getString(c, tok)
+      if tok.modifier == modNone: tok.kind = tkInvalid
+  of '+':
+    tok.kind = tkPlus
+    inc(c.bufpos)
+    add(tok.literal, '+')
+  of '*':
+    tok.kind = tkStar
+    inc(c.bufpos)
+    add(tok.literal, '+')
+  of '<':
+    if c.buf[c.bufpos+1] == '-':
+      inc(c.bufpos, 2)
+      tok.kind = tkArrow
+      add(tok.literal, "<-")
+    else:
+      add(tok.literal, '<')
+  of '/':
+    tok.kind = tkBar
+    inc(c.bufpos)
+    add(tok.literal, '/')
+  of '?':
+    tok.kind = tkOption
+    inc(c.bufpos)
+    add(tok.literal, '?')
+  of '!':
+    tok.kind = tkNot
+    inc(c.bufpos)
+    add(tok.literal, '!')
+  of '&':
+    tok.kind = tkAmp
+    inc(c.bufpos)
+    add(tok.literal, '!')
+  else:
+    add(tok.literal, c.buf[c.bufpos])
+    inc(c.bufpos)
+
+proc arrowIsNextTok(c: TPegLexer): bool =
+  # the only look ahead we need
+  var pos = c.bufpos
+  while c.buf[pos] in {'\t', ' '}: inc(pos)
+  result = c.buf[pos] == '<' and c.buf[pos+1] == '-'
+
+# ----------------------------- parser ----------------------------------------
+    
+type
+  EInvalidPeg* = object of EBase ## raised if an invalid PEG has been detected
+  TPegParser = object of TPegLexer ## the PEG parser object
+    tok: TToken
+    nonterms: seq[PNonTerminal]
+    modifier: TModifier
+
+proc getTok(p: var TPegParser) = getTok(p, p.tok)
+
+proc pegError(p: TPegParser, msg: string, line = -1, col = -1) =
+  var e: ref EInvalidPeg
+  new(e)
+  e.msg = errorStr(p, msg, line, col)
+  raise e
+
+proc eat(p: var TPegParser, kind: TTokKind) =
+  if p.tok.kind == kind: getTok(p)
+  else: pegError(p, tokKindToStr[kind] & " expected")
+
+proc parseExpr(p: var TPegParser): TPeg
+
+proc getNonTerminal(p: TPegParser, name: string): PNonTerminal =
+  for i in 0..high(p.nonterms):
+    result = p.nonterms[i]
+    if cmpIgnoreStyle(result.name, name) == 0: return
+  # forward reference:
+  result = newNonTerminal(name, getLine(p), getColumn(p))
+  add(p.nonterms, result)
+
+proc modifiedTerm(s: string, m: TModifier): TPeg =
+  case m
+  of modNone, modVerbatim: result = term(s)
+  of modIgnoreCase: result = termIgnoreCase(s)
+  of modIgnoreStyle: result = termIgnoreStyle(s)
+
+proc primary(p: var TPegParser): TPeg =
+  case p.tok.kind
+  of tkAmp:
+    getTok(p)
+    return &primary(p)
+  of tkNot:
+    getTok(p)
+    return !primary(p)
+  else: nil
+  case p.tok.kind
+  of tkIdentifier:
+    if not arrowIsNextTok(p):
+      var nt = getNonTerminal(p, p.tok.literal)
+      incl(nt.flags, ntUsed)
+      result = nonTerminal(nt)
+      getTok(p)
+    else:
+      pegError(p, "expression expected, but found: " & p.tok.literal)
+  of tkStringLit:
+    var m = p.tok.modifier
+    if m == modNone: m = p.modifier
+    result = modifiedTerm(p.tok.literal, m)
+    getTok(p)
+  of tkCharSet:
+    if '\0' in p.tok.charset:
+      pegError(p, "binary zero ('\\0') not allowed in character class")
+    result = charset(p.tok.charset)
+    getTok(p)
+  of tkParLe:
+    getTok(p)
+    result = parseExpr(p)
+    eat(p, tkParRi)
+  of tkCurlyLe:
+    getTok(p)
+    result = capture(parseExpr(p))
+    eat(p, tkCurlyRi)
+  of tkAny:
+    result = any()
+    getTok(p)
+  of tkAnyRune:
+    result = anyRune()
+    getTok(p)
+  of tkBuiltin:
+    case p.tok.literal
+    of "n": result = newLine()
+    of "d": result = charset({'0'..'9'})
+    of "D": result = charset({'\1'..'\xff'} - {'0'..'9'})
+    of "s": result = charset({' ', '\9'..'\13'})
+    of "S": result = charset({'\1'..'\xff'} - {' ', '\9'..'\13'})
+    of "w": result = charset({'a'..'z', 'A'..'Z', '_'})
+    of "W": result = charset({'\1'..'\xff'} - {'a'..'z', 'A'..'Z', '_'})
+    of "ident": result = pegs.ident
+    else: pegError(p, "unknown built-in: " & p.tok.literal)
+    getTok(p)
+  of tkEscaped:
+    result = term(p.tok.literal[0])
+    getTok(p)
+  else:
+    pegError(p, "expression expected, but found: " & p.tok.literal)
+    getTok(p) # we must consume a token here to prevent endless loops!
+  while true:
+    case p.tok.kind
+    of tkOption:
+      result = ?result
+      getTok(p)
+    of tkStar:
+      result = *result
+      getTok(p)
+    of tkPlus:
+      result = +result
+      getTok(p)
+    else: break
+
+proc seqExpr(p: var TPegParser): TPeg =
+  result = primary(p)
+  while true:
+    case p.tok.kind
+    of tkAmp, tkNot, tkStringLit, tkCharset, tkParLe, tkCurlyLe,
+       tkAny, tkAnyRune, tkBuiltin, tkEscaped:
+      result = sequence(result, primary(p))
+    of tkIdentifier:
+      if not arrowIsNextTok(p):
+        result = sequence(result, primary(p))
+      else: break
+    else: break
+
+proc parseExpr(p: var TPegParser): TPeg =
+  result = seqExpr(p)
+  while p.tok.kind == tkBar:
+    getTok(p)
+    result = result / seqExpr(p)
+  
+proc parseRule(p: var TPegParser): PNonTerminal =
+  if p.tok.kind == tkIdentifier and arrowIsNextTok(p):
+    result = getNonTerminal(p, p.tok.literal)
+    if ntDeclared in result.flags:
+      pegError(p, "attempt to redefine: " & result.name)
+    result.line = getLine(p)
+    result.col = getColumn(p)
+    getTok(p)
+    eat(p, tkArrow)
+    result.rule = parseExpr(p)
+    incl(result.flags, ntDeclared) # NOW inlining may be attempted
+  else:
+    pegError(p, "rule expected, but found: " & p.tok.literal)
+  
+proc rawParse(p: var TPegParser): TPeg =
+  ## parses a rule or a PEG expression
+  if p.tok.kind == tkBuiltin:
+    case p.tok.literal
+    of "i":
+      p.modifier = modIgnoreCase
+      getTok(p)
+    of "y":
+      p.modifier = modIgnoreStyle
+      getTok(p)
+    else: nil
+  if p.tok.kind == tkIdentifier and arrowIsNextTok(p):
+    result = parseRule(p).rule
+    while p.tok.kind != tkEof:
+      discard parseRule(p)
+  else:
+    result = parseExpr(p)
+  if p.tok.kind != tkEof:
+    pegError(p, "EOF expected, but found: " & p.tok.literal)
+  for i in 0..high(p.nonterms):
+    var nt = p.nonterms[i]
+    if ntDeclared notin nt.flags:
+      pegError(p, "undeclared identifier: " & nt.name, nt.line, nt.col)
+    elif ntUsed notin nt.flags and i > 0:
+      pegError(p, "unused rule: " & nt.name, nt.line, nt.col)
+
+proc parsePeg*(input: string, filename = "pattern", line = 1, col = 0): TPeg =
+  var p: TPegParser
+  init(TPegLexer(p), input, filename, line, col)
+  p.tok.kind = tkInvalid
+  p.tok.modifier = modNone
+  p.tok.literal = ""
+  p.tok.charset = {}
+  p.nonterms = @[]
+  getTok(p)
+  result = rawParse(p)
+
+proc peg*(pattern: string): TPeg =
+  ## constructs a TPeg object from the `pattern`. The short name has been
+  ## chosen to encourage its use as a raw string modifier::
+  ##
+  ##   peg"{\ident} \s* '=' \s* {.*}"
+  result = parsePeg(pattern, "pattern")
+
+when isMainModule:
+  assert match("W_HI_Le", peg"\y 'while'")
+  assert(not match("W_HI_L", peg"\y 'while'"))
+  assert(not match("W_HI_Le", peg"\y v'while'"))
+  assert match("W_HI_Le", peg"y'while'")
+  
+  assert($ +digits == $peg"\d+")
+  assert "0158787".match(peg"\d+")
+  assert "ABC 0232".match(peg"\w+\s+\d+")
+  assert "ABC".match(peg"\d+ / \w+")
+
+  for word in split("00232this02939is39an22example111", peg"\d+"):
+    writeln(stdout, word)
+
+  assert matchLen("key", ident) == 3
+
+  var pattern = sequence(ident, *whitespace, term('='), *whitespace, ident)
+  assert matchLen("key1=  cal9", pattern) == 11
+  
+  var ws = newNonTerminal("ws", 1, 1)
+  ws.rule = *whitespace
+  
+  var expr = newNonTerminal("expr", 1, 1)
+  expr.rule = sequence(capture(ident), *sequence(
+                nonterminal(ws), term('+'), nonterminal(ws), nonterminal(expr)))
+  
+  var c: TMatchClosure
+  var s = "a+b +  c +d+e+f"
+  assert m(s, expr.rule, 0, c) == len(s)
+  var a = ""
+  for i in 0..c.ml-1:
+    a.add(copy(s, c.matches[i][0], c.matches[i][1]))
+  assert a == "abcdef"
+  #echo expr.rule
+
+  #const filename = "lib/devel/peg/grammar.txt"
+  #var grammar = parsePeg(newFileStream(filename, fmRead), filename)
+  #echo "a <- [abc]*?".match(grammar)
+  assert find("_____abc_______", term("abc")) == 5
+  assert match("_______ana", peg"A <- 'ana' / . A")
+  assert match("abcs%%%", peg"A <- ..A / .A / '%'")
+
+  if "abc" =~ peg"{'a'}'bc' 'xyz' / {\ident}":
+    assert matches[0] == "abc"
+  else:
+    assert false
+  
+  var g2 = peg"""S <- A B / C D
+                 A <- 'a'+
+                 B <- 'b'+
+                 C <- 'c'+
+                 D <- 'd'+
+              """
+  assert($g2 == "((A B) / (C D))")
+  assert match("cccccdddddd", g2)
+  echo "var1=key; var2=key2".replace(peg"{\ident}'='{\ident}", "$1<-$2$2")
+  assert "var1=key; var2=key2".endsWith(peg"{\ident}'='{\ident}")
+