Peg captures get stack-like behavior (#18369)

* Implements reverse capture indexing. * Now works for modified backrefs too. * Changed reverse indexing syntax prefix for back-references to '$^'.
author: gemath <33119724+gemath@users.noreply.github.com> 2021-06-28 12:33:20 +0200
committer: GitHub <noreply@github.com> 2021-06-28 12:33:20 +0200
commit: e720bbdd76303e2cc0a3a169e870859470a1da84 (patch)
tree: b8cad0c1c8b90ebafb0e74c134ec303d026078b5
parent: 908b2cc2e44bededdcb943778f44b908a4f3c1df (diff)
download: Nim-e720bbdd76303e2cc0a3a169e870859470a1da84.tar.gz
3 files changed, 94 insertions, 34 deletions
diff --git a/doc/pegdocs.txt b/doc/pegdocs.txt
index 4c557aed8..0363d4874 100644
--- a/doc/pegdocs.txt
+++ b/doc/pegdocs.txt
@@ -27,7 +27,10 @@ notation           meaning
 ``{E}``            Capture: Apply expression `E` and store the substring
                    that matched `E` into a *capture* that can be accessed
                    after the matching process.
-``$i``             Back reference to the ``i``th capture. ``i`` counts from 1.
+``{}``             Empty capture: Delete the last capture. No character
+                   is consumed.
+``$i``             Back reference to the ``i``th capture. ``i`` counts forwards
+                   from 1 or backwards (last capture to first) from ^1.
 ``$``              Anchor: Matches at the end of the input. No character
                    is consumed. Same as ``!.``.
 ``^``              Anchor: Matches at the start of the input. No character
@@ -149,14 +152,15 @@ The PEG parser implements this grammar (written in PEG syntax)::
   rule <- identifier \s* "<-" expr ig
   identNoArrow <- identifier !(\s* "<-")
   prefixOpr <- ig '&' / ig '!' / ig '@' / ig '{@}' / ig '@@'
-  literal <- ig identifier? '$' [0-9]+ / '$' / '^' /
+  literal <- ig identifier? '$' '^'? [0-9]+ / '$' / '^' /
              ig identNoArrow /
              ig charset /
              ig stringlit /
              ig builtin /
              ig '.' /
              ig '_' /
-             (ig "(" expr ig ")")
+             (ig "(" expr ig ")") /
+             (ig "{" expr? ig "}")
   postfixOpr <- ig '?' / ig '*' / ig '+'
   primary <- prefixOpr* (literal postfixOpr*)
 
diff --git a/lib/pure/pegs.nim b/lib/pure/pegs.nim
index b5d9b1d07..b6b05cdc1 100644
--- a/lib/pure/pegs.nim
+++ b/lib/pure/pegs.nim
@@ -83,7 +83,7 @@ type
     of pkChar, pkGreedyRepChar: ch: char
     of pkCharChoice, pkGreedyRepSet: charChoice: ref set[char]
     of pkNonTerminal: nt: NonTerminal
-    of pkBackRef..pkBackRefIgnoreStyle: index: range[0..MaxSubpatterns]
+    of pkBackRef..pkBackRefIgnoreStyle: index: range[-MaxSubpatterns..MaxSubpatterns-1]
     else: sons: seq[Peg]
   NonTerminal* = ref NonTerminalObj
 
@@ -106,7 +106,7 @@ proc nt*(p: Peg): NonTerminal = p.nt
   ## Returns the *NonTerminal* object of a given *Peg* variant object
   ## where present.
 
-proc index*(p: Peg): range[0..MaxSubpatterns] = p.index
+proc index*(p: Peg): range[-MaxSubpatterns..MaxSubpatterns-1] = p.index
   ## Returns the back-reference index of a captured sub-pattern in the
   ## *Captures* object for a given *Peg* variant object where present.
 
@@ -304,34 +304,37 @@ proc endAnchor*: Peg {.inline.} =
   ## constructs the PEG ``$`` which matches the end of the input.
   result = !any()
 
-proc capture*(a: Peg): Peg {.noSideEffect, rtl, extern: "npegsCapture".} =
+proc capture*(a: Peg = Peg(kind: pkEmpty)): Peg {.noSideEffect, rtl, extern: "npegsCapture".} =
   ## constructs a capture with the PEG `a`
   result = Peg(kind: pkCapture, sons: @[a])
 
-proc backref*(index: range[1..MaxSubpatterns]): Peg {.
+proc backref*(index: range[1..MaxSubpatterns], reverse: bool = false): Peg {.
   noSideEffect, rtl, extern: "npegs$1".} =
   ## constructs a back reference of the given `index`. `index` starts counting
-  ## from 1.
-  result = Peg(kind: pkBackRef, index: index-1)
+  ## from 1. `reverse` specifies wether indexing starts from the end of the
+  ## capture list.
+  result = Peg(kind: pkBackRef, index: (if reverse: -index else: index - 1))
 
-proc backrefIgnoreCase*(index: range[1..MaxSubpatterns]): Peg {.
+proc backrefIgnoreCase*(index: range[1..MaxSubpatterns], reverse: bool = false): Peg {.
   noSideEffect, rtl, extern: "npegs$1".} =
   ## constructs a back reference of the given `index`. `index` starts counting
-  ## from 1. Ignores case for matching.
-  result = Peg(kind: pkBackRefIgnoreCase, index: index-1)
+  ## from 1. `reverse` specifies wether indexing starts from the end of the
+  ## capture list. Ignores case for matching.
+  result = Peg(kind: pkBackRefIgnoreCase, index: (if reverse: -index else: index - 1))
 
-proc backrefIgnoreStyle*(index: range[1..MaxSubpatterns]): Peg {.
+proc backrefIgnoreStyle*(index: range[1..MaxSubpatterns], reverse: bool = false): Peg {.
   noSideEffect, rtl, extern: "npegs$1".} =
   ## constructs a back reference of the given `index`. `index` starts counting
-  ## from 1. Ignores style for matching.
-  result = Peg(kind: pkBackRefIgnoreStyle, index: index-1)
+  ## from 1. `reverse` specifies wether indexing starts from the end of the
+  ## capture list. Ignores style for matching.
+  result = Peg(kind: pkBackRefIgnoreStyle, index: (if reverse: -index else: index - 1))
 
 proc spaceCost(n: Peg): int =
   case n.kind
   of pkEmpty: discard
   of pkTerminal, pkTerminalIgnoreCase, pkTerminalIgnoreStyle, pkChar,
      pkGreedyRepChar, pkCharChoice, pkGreedyRepSet,
-     pkAny..pkWhitespace, pkGreedyAny:
+     pkAny..pkWhitespace, pkGreedyAny, pkBackRef..pkBackRefIgnoreStyle:
     result = 1
   of pkNonTerminal:
     # we cannot inline a rule with a non-terminal
@@ -561,8 +564,10 @@ template matchOrParse(mopProc: untyped) =
       # Parse handler code must run in an *of* clause of its own for each
       # *PegKind*, so we encapsulate the identical clause body for
       # *pkBackRef..pkBackRefIgnoreStyle* here.
-      if p.index >= c.ml: return -1
-      var (a, b) = c.matches[p.index]
+      var index = p.index
+      if index < 0: index.inc(c.ml)
+      if index < 0 or index >= c.ml: return -1
+      var (a, b) = c.matches[index]
       var n: Peg
       case p.kind
       of pkBackRef:
@@ -822,15 +827,19 @@ template matchOrParse(mopProc: untyped) =
       leave(pkNotPredicate, s, p, start, result)
     of pkCapture:
       enter(pkCapture, s, p, start)
-      var idx = c.ml # reserve a slot for the subpattern
-      inc(c.ml)
-      result = mopProc(s, p.sons[0], start, c)
-      if result >= 0:
-        if idx < MaxSubpatterns:
-          c.matches[idx] = (start, start+result-1)
-        #else: silently ignore the capture
+      if p.sons.len == 0 or p.sons[0].kind == pkEmpty:
+        # empty capture removes last match
+        dec(c.ml)
+        c.matches[c.ml] = (0, 0)
+        result = 0 # match of length 0
       else:
-        c.ml = idx
+        var idx = c.ml # reserve a slot for the subpattern
+        result = mopProc(s, p.sons[0], start, c)
+        if result >= 0:
+          inc(c.ml)
+          if idx < MaxSubpatterns:
+            c.matches[idx] = (start, start+result-1)
+          #else: silently ignore the capture
       leave(pkCapture, s, p, start, result)
     of pkBackRef:
       enter(pkBackRef, s, p, start)
@@ -1395,6 +1404,7 @@ type
     tkCurlyLe,    ## '{'
     tkCurlyRi,    ## '}'
     tkCurlyAt,    ## '{@}'
+    tkEmptyCurl,  ## '{}'
     tkArrow,      ## '<-'
     tkBar,        ## '/'
     tkStar,       ## '*'
@@ -1427,7 +1437,7 @@ type
 const
   tokKindToStr: array[TokKind, string] = [
     "invalid", "[EOF]", ".", "_", "identifier", "string literal",
-    "character set", "(", ")", "{", "}", "{@}",
+    "character set", "(", ")", "{", "}", "{@}", "{}",
     "<-", "/", "*", "+", "&", "!", "?",
     "@", "built-in", "escaped", "$", "$", "^"
   ]
@@ -1564,13 +1574,21 @@ proc getString(c: var PegLexer, tok: var Token) =
 
 proc getDollar(c: var PegLexer, tok: var Token) =
   var pos = c.bufpos + 1
+  var neg = false
+  if pos < c.buf.len and c.buf[pos] == '^':
+    neg = true
+    inc(pos)
   if pos < c.buf.len and c.buf[pos] in {'0'..'9'}:
     tok.kind = tkBackref
     tok.index = 0
     while pos < c.buf.len and c.buf[pos] in {'0'..'9'}:
       tok.index = tok.index * 10 + ord(c.buf[pos]) - ord('0')
       inc(pos)
+    if neg:
+      tok.index = -tok.index
   else:
+    if neg:
+      dec(pos)
     tok.kind = tkDollar
   c.bufpos = pos
 
@@ -1670,6 +1688,10 @@ proc getTok(c: var PegLexer, tok: var Token) =
       tok.kind = tkCurlyAt
       inc(c.bufpos, 2)
       add(tok.literal, "{@}")
+    elif c.buf[c.bufpos] == '}' and c.bufpos < c.buf.len:
+      tok.kind = tkEmptyCurl
+      inc(c.bufpos)
+      add(tok.literal, "{}")
     else:
       tok.kind = tkCurlyLe
       add(tok.literal, '{')
@@ -1705,7 +1727,7 @@ proc getTok(c: var PegLexer, tok: var Token) =
       return
     if c.buf[c.bufpos] in {'\'', '"'} or
         c.buf[c.bufpos] == '$' and c.bufpos+1 < c.buf.len and
-        c.buf[c.bufpos+1] in {'0'..'9'}:
+        c.buf[c.bufpos+1] in {'^', '0'..'9'}:
       case tok.literal
       of "i": tok.modifier = modIgnoreCase
       of "y": tok.modifier = modIgnoreStyle
@@ -1819,10 +1841,13 @@ proc modifiedTerm(s: string, m: Modifier): Peg =
   of modIgnoreStyle: result = termIgnoreStyle(s)
 
 proc modifiedBackref(s: int, m: Modifier): Peg =
+  var
+    reverse = s < 0
+    index = if reverse: -s else: s
   case m
-  of modNone, modVerbatim: result = backref(s)
-  of modIgnoreCase: result = backrefIgnoreCase(s)
-  of modIgnoreStyle: result = backrefIgnoreStyle(s)
+  of modNone, modVerbatim: result = backref(index, reverse)
+  of modIgnoreCase: result = backrefIgnoreCase(index, reverse)
+  of modIgnoreStyle: result = backrefIgnoreStyle(index, reverse)
 
 proc builtin(p: var PegParser): Peg =
   # do not use "y", "skip" or "i" as these would be ambiguous
@@ -1896,6 +1921,9 @@ proc primary(p: var PegParser): Peg =
     result = capture(parseExpr(p)).token(p)
     eat(p, tkCurlyRi)
     inc(p.captures)
+  of tkEmptyCurl:
+    result = capture()
+    getTok(p)
   of tkAny:
     result = any().token(p)
     getTok(p)
@@ -1915,11 +1943,11 @@ proc primary(p: var PegParser): Peg =
     result = startAnchor()
     getTok(p)
   of tkBackref:
+    if abs(p.tok.index) > p.captures or p.tok.index == 0:
+      pegError(p, "invalid back reference index: " & $p.tok.index)
     var m = p.tok.modifier
     if m == modNone: m = p.modifier
     result = modifiedBackref(p.tok.index, m).token(p)
-    if p.tok.index < 0 or p.tok.index > p.captures:
-      pegError(p, "invalid back reference index: " & $p.tok.index)
     getTok(p)
   else:
     pegError(p, "expression expected, but found: " & p.tok.literal)
@@ -1943,7 +1971,7 @@ proc seqExpr(p: var PegParser): Peg =
     case p.tok.kind
     of tkAmp, tkNot, tkAt, tkStringLit, tkCharSet, tkParLe, tkCurlyLe,
        tkAny, tkAnyRune, tkBuiltin, tkEscaped, tkDollar, tkBackref,
-       tkHat, tkCurlyAt:
+       tkHat, tkCurlyAt, tkEmptyCurl:
       result = sequence(result, primary(p))
     of tkIdentifier:
       if not arrowIsNextTok(p):
diff --git a/tests/stdlib/tpegs.nim b/tests/stdlib/tpegs.nim
index f2c93c2e5..1261d55b8 100644
--- a/tests/stdlib/tpegs.nim
+++ b/tests/stdlib/tpegs.nim
@@ -293,6 +293,34 @@ block:
 
     doAssert "test1".match(peg"""{@}$""")
     doAssert "test2".match(peg"""{(!$ .)*} $""")
+
+    doAssert "abbb".match(peg"{a} {b} $2 $^1")
+    doAssert "abBA".match(peg"{a} {b} i$2 i$^2")
+
+    doAssert "abba".match(peg"{a} {b} $^1 {} $^1")
+
+    block:
+      let grammar = peg"""
+program <- {''} stmt* $
+stmt <- call / block
+call <- 'call()' EOL
+EOL <- \n / $
+block <- 'block:' \n indBody
+indBody <- {$^1 ' '+} stmt ($^1 stmt)* {}
+"""
+      let program = """
+call()
+block:
+  block:
+    call()
+    call()
+  call()
+call()
+"""
+      var c: Captures
+      doAssert program.len == program.rawMatch(grammar, 0, c)
+      doAssert c.ml == 1
+
   pegsTest()
   static:
     pegsTest()
author	gemath <33119724+gemath@users.noreply.github.com>	2021-06-28 12:33:20 +0200
committer	GitHub <noreply@github.com>	2021-06-28 12:33:20 +0200
commit	e720bbdd76303e2cc0a3a169e870859470a1da84 (patch)
tree	b8cad0c1c8b90ebafb0e74c134ec303d026078b5
parent	908b2cc2e44bededdcb943778f44b908a4f3c1df (diff)
download	Nim-e720bbdd76303e2cc0a3a169e870859470a1da84.tar.gz