summary refs log tree commit diff stats
path: root/lib/pure/pegs.nim
diff options
context:
space:
mode:
Diffstat (limited to 'lib/pure/pegs.nim')
-rwxr-xr-x lib/pure/pegs.nim 287
1 files changed, 166 insertions, 121 deletions
diff --git a/lib/pure/pegs.nim b/lib/pure/pegs.nim
index c029f92a2..21205bdb2 100755
--- a/lib/pure/pegs.nim
+++ b/lib/pure/pegs.nim
@@ -42,7 +42,7 @@ type
     pkSequence,         ## a b c ... --> Internal DSL: peg(a, b, c)
     pkOrderedChoice,    ## a / b / ... --> Internal DSL: a / b or /[a, b, c]
     pkGreedyRep,        ## a*     --> Internal DSL: *a
-                        ## a+     --> Internal DSL: +a; represented as (a a*)
+                        ## a+     --> (a a*)
     pkGreedyRepChar,    ## x* where x is a single character (superop)
     pkGreedyRepSet,     ## [set]* (superop)
     pkGreedyAny,        ## .* or _* (superop)
@@ -50,6 +50,7 @@ type
     pkAndPredicate,     ## &a     --> Internal DSL: &a
     pkNotPredicate,     ## !a     --> Internal DSL: !a
     pkCapture,          ## {a}    --> Internal DSL: capture(a)
+    pkSearch,           ## @a     --> Internal DSL: @a
     pkRule,             ## a <- b
     pkList              ## a, b
   TNonTerminalFlag = enum
@@ -174,7 +175,24 @@ proc `*`*(a: TPeg): TPeg =
   else:
     result.kind = pkGreedyRep
     result.sons = @[a]
+
+proc `@`*(a: TPeg): TPeg =
+  ## constructs a "search" for the PEG `a`: a `pkSearch` node wrapping `a`
+  result.kind = pkSearch
+  result.sons = @[a]  # the pattern searched for is the node's only son
   
+when false:  # disabled: the helper below is never compiled
+  proc contains(a: TPeg, k: TPegKind): bool =
+    if a.kind == k: return true  # the node itself is of the wanted kind
+    case a.kind
+    of pkEmpty, pkAny, pkAnyRune, pkGreedyAny, pkNewLine, pkTerminal,
+       pkTerminalIgnoreCase, pkTerminalIgnoreStyle, pkChar, pkGreedyRepChar,
+       pkCharChoice, pkGreedyRepSet: nil  # these kinds are not searched further
+    of pkNonTerminal: return true  # answered without inspecting the rule
+    else:
+      for i in 0..a.sons.len-1:  # otherwise recurse into every son
+        if contains(a.sons[i], k): return true
+
 proc `+`*(a: TPeg): TPeg =
   ## constructs a "greedy positive repetition" with the PEG `a`
   return sequence(a, *a)
@@ -271,6 +289,125 @@ const
   MaxSubpatterns* = 10 ## defines the maximum number of subpatterns that
                        ## can be captured. More subpatterns cannot be captured! 
 
+# ------------------------- debugging -----------------------------------------
+
+proc esc(c: char, reserved = {'\0'..'\255'}): string = # escaped text for `c`
+  case c  # named escapes first, then the generic rules below
+  of '\b': result = "\\b"
+  of '\t': result = "\\t"
+  of '\c': result = "\\c"
+  of '\L': result = "\\l"
+  of '\v': result = "\\v"
+  of '\f': result = "\\f"
+  of '\e': result = "\\e"
+  of '\a': result = "\\a"
+  of '\\': result = "\\\\"
+  of 'a'..'z', 'A'..'Z', '0'..'9', '_': result = $c  # never escaped
+  elif c < ' ' or c >= '\128': result = '\\' & $ord(c)  # '\' + decimal code
+  elif c in reserved: result = '\\' & c  # caller-supplied chars get a '\'
+  else: result = $c
+  
+proc singleQuoteEsc(c: Char): string = return "'" & esc(c, {'\''}) & "'"
+
+proc singleQuoteEsc(str: string): string = # `str` escaped, in single quotes
+  result = "'"
+  for c in items(str): add result, esc(c, {'\''})  # only `'` is reserved
+  add result, '\''
+  
+proc charSetEscAux(cc: set[char]): string = # members of `cc` as chars/ranges
+  const reserved = {'^', '-', ']'}  # escaped via `esc` inside a set
+  result = ""
+  var c1 = 0
+  while c1 <= 0xff: 
+    if chr(c1) in cc: 
+      var c2 = c1
+      while c2 < 0xff and chr(succ(c2)) in cc: inc(c2)  # extend the run
+      if c1 == c2:  # single member
+        add result, esc(chr(c1), reserved)
+      elif c2 == succ(c1):  # run of two: written as two chars, no '-'
+        add result, esc(chr(c1), reserved) & esc(chr(c2), reserved)
+      else:  # longer run: written as first-last
+        add result, esc(chr(c1), reserved) & '-' & esc(chr(c2), reserved)
+      c1 = c2  # continue after the run just emitted
+    inc(c1)
+  
+proc CharSetEsc(cc: set[char]): string =
+  if card(cc) >= 128+64:  # 192 or more members: print the complement
+    result = "[^" & CharSetEscAux({'\1'..'\xFF'} - cc) & ']'  # '\0' excluded
+  else: 
+    result = '[' & CharSetEscAux(cc) & ']'
+  
+proc toStrAux(r: TPeg, res: var string) = # appends the text for `r` to `res`
+  case r.kind
+  of pkEmpty: add(res, "()")
+  of pkAny: add(res, '.')
+  of pkAnyRune: add(res, '_')
+  of pkNewline: add(res, "\\n")
+  of pkTerminal: add(res, singleQuoteEsc(r.term))
+  of pkTerminalIgnoreCase:
+    add(res, 'i')  # prefix marking the ignore-case terminal
+    add(res, singleQuoteEsc(r.term))
+  of pkTerminalIgnoreStyle:
+    add(res, 'y')  # prefix marking the ignore-style terminal
+    add(res, singleQuoteEsc(r.term))
+  of pkChar: add(res, singleQuoteEsc(r.ch))
+  of pkCharChoice: add(res, charSetEsc(r.charChoice^))
+  of pkNonTerminal: add(res, r.nt.name)
+  of pkSequence:  # parenthesized, sons separated by a space
+    add(res, '(')
+    toStrAux(r.sons[0], res)
+    for i in 1 .. high(r.sons):
+      add(res, ' ')
+      toStrAux(r.sons[i], res)
+    add(res, ')')
+  of pkOrderedChoice:  # parenthesized, sons separated by " / "
+    add(res, '(')
+    toStrAux(r.sons[0], res)
+    for i in 1 .. high(r.sons):
+      add(res, " / ")
+      toStrAux(r.sons[i], res)
+    add(res, ')')
+  of pkGreedyRep:
+    toStrAux(r.sons[0], res)
+    add(res, '*')
+  of pkGreedyRepChar:
+    add(res, singleQuoteEsc(r.ch))
+    add(res, '*')
+  of pkGreedyRepSet:
+    add(res, charSetEsc(r.charChoice^))
+    add(res, '*')
+  of pkGreedyAny:
+    add(res, ".*")
+  of pkOption:
+    toStrAux(r.sons[0], res)
+    add(res, '?')
+  of pkAndPredicate:
+    add(res, '&')
+    toStrAux(r.sons[0], res)
+  of pkNotPredicate:
+    add(res, '!')
+    toStrAux(r.sons[0], res)
+  of pkSearch:
+    add(res, '@')
+    toStrAux(r.sons[0], res)
+  of pkCapture:
+    add(res, '{')
+    toStrAux(r.sons[0], res)    
+    add(res, '}')
+  of pkRule:  # first son, then " <- ", then second son
+    toStrAux(r.sons[0], res)    
+    add(res, " <- ")
+    toStrAux(r.sons[1], res)
+  of pkList:  # every son followed by a newline
+    for i in 0 .. high(r.sons):
+      toStrAux(r.sons[i], res)
+      add(res, "\n")  
+
+proc `$` *(r: TPeg): string =
+  ## converts the PEG `r` to its string representation
+  result = ""  # buffer that `toStrAux` appends to
+  toStrAux(r, result)
+
 # --------------------- core engine -------------------------------------------
 
 type
@@ -370,6 +507,17 @@ proc m(s: string, p: TPeg, start: int, c: var TMatchClosure): int =
       result = m(s, p.sons[i], start, c)
       if result >= 0: break
       c.ml = oldMl
+  of pkSearch:  # try the son at successive offsets until it matches
+    var oldMl = c.ml  # saved so it can be restored if the search fails
+    result = 0  # number of characters skipped so far
+    while start+result <= s.len:  # `<=`: also try at the very end of `s`
+      var x = m(s, p.sons[0], start+result, c)
+      if x >= 0:
+        inc(result, x)  # total = skipped characters + length of the match
+        return
+      inc(result)
+    result = -1  # the son matched at no offset
+    c.ml = oldMl
   of pkGreedyRep:
     result = 0
     while true:
@@ -607,123 +755,6 @@ proc split*(s: string, sep: TPeg): seq[string] {.noSideEffect.} =
   ## Splits the string `s` into substrings.
   accumulateResult(split(s, sep))
 
-# ------------------------- debugging -----------------------------------------
-
-proc esc(c: char, reserved = {'\0'..'\255'}): string = 
-  case c
-  of '\b': result = "\\b"
-  of '\t': result = "\\t"
-  of '\c': result = "\\c"
-  of '\L': result = "\\l"
-  of '\v': result = "\\v"
-  of '\f': result = "\\f"
-  of '\e': result = "\\e"
-  of '\a': result = "\\a"
-  of '\\': result = "\\\\"
-  of 'a'..'z', 'A'..'Z', '0'..'9', '_': result = $c
-  elif c < ' ' or c >= '\128': result = '\\' & $ord(c)
-  elif c in reserved: result = '\\' & c
-  else: result = $c
-  
-proc singleQuoteEsc(c: Char): string = return "'" & esc(c, {'\''}) & "'"
-
-proc singleQuoteEsc(str: string): string = 
-  result = "'"
-  for c in items(str): add result, esc(c, {'\''})
-  add result, '\''
-  
-proc charSetEscAux(cc: set[char]): string = 
-  const reserved = {'^', '-', ']'}
-  result = ""
-  var c1 = 0
-  while c1 <= 0xff: 
-    if chr(c1) in cc: 
-      var c2 = c1
-      while c2 < 0xff and chr(succ(c2)) in cc: inc(c2)
-      if c1 == c2: 
-        add result, esc(chr(c1), reserved)
-      elif c2 == succ(c1): 
-        add result, esc(chr(c1), reserved) & esc(chr(c2), reserved)
-      else: 
-        add result, esc(chr(c1), reserved) & '-' & esc(chr(c2), reserved)
-      c1 = c2
-    inc(c1)
-  
-proc CharSetEsc(cc: set[char]): string =
-  if card(cc) >= 128+64: 
-    result = "[^" & CharSetEscAux({'\1'..'\xFF'} - cc) & ']'
-  else: 
-    result = '[' & CharSetEscAux(cc) & ']'
-  
-proc toStrAux(r: TPeg, res: var string) = 
-  case r.kind
-  of pkEmpty: add(res, "()")
-  of pkAny: add(res, '.')
-  of pkAnyRune: add(res, '_')
-  of pkNewline: add(res, "\\n")
-  of pkTerminal: add(res, singleQuoteEsc(r.term))
-  of pkTerminalIgnoreCase:
-    add(res, 'i')
-    add(res, singleQuoteEsc(r.term))
-  of pkTerminalIgnoreStyle:
-    add(res, 'y')
-    add(res, singleQuoteEsc(r.term))
-  of pkChar: add(res, singleQuoteEsc(r.ch))
-  of pkCharChoice: add(res, charSetEsc(r.charChoice^))
-  of pkNonTerminal: add(res, r.nt.name)
-  of pkSequence:
-    add(res, '(')
-    toStrAux(r.sons[0], res)
-    for i in 1 .. high(r.sons):
-      add(res, ' ')
-      toStrAux(r.sons[i], res)
-    add(res, ')')
-  of pkOrderedChoice:
-    add(res, '(')
-    toStrAux(r.sons[0], res)
-    for i in 1 .. high(r.sons):
-      add(res, " / ")
-      toStrAux(r.sons[i], res)
-    add(res, ')')
-  of pkGreedyRep:
-    toStrAux(r.sons[0], res)
-    add(res, '*')
-  of pkGreedyRepChar:
-    add(res, singleQuoteEsc(r.ch))
-    add(res, '*')
-  of pkGreedyRepSet:
-    add(res, charSetEsc(r.charChoice^))
-    add(res, '*')
-  of pkGreedyAny:
-    add(res, ".*")
-  of pkOption:
-    toStrAux(r.sons[0], res)
-    add(res, '?')
-  of pkAndPredicate:
-    add(res, '&')
-    toStrAux(r.sons[0], res)
-  of pkNotPredicate:
-    add(res, '!')
-    toStrAux(r.sons[0], res)
-  of pkCapture:
-    add(res, '{')
-    toStrAux(r.sons[0], res)    
-    add(res, '}')
-  of pkRule:
-    toStrAux(r.sons[0], res)    
-    add(res, " <- ")
-    toStrAux(r.sons[1], res)
-  of pkList:
-    for i in 0 .. high(r.sons):
-      toStrAux(r.sons[i], res)
-      add(res, "\n")  
-
-proc `$` *(r: TPeg): string =
-  ## converts a PEG to its string representation
-  result = ""
-  toStrAux(r, result)
-
-
 # ------------------- scanner -------------------------------------------------
 
 type
@@ -751,6 +782,7 @@ type
     tkAmp,              ## '&'
     tkNot,              ## '!'
     tkOption,           ## '?'
+    tkAt,               ## '@'
     tkBuiltin,          ## \identifier
     tkEscaped           ## \\
   
@@ -772,7 +804,7 @@ const
   tokKindToStr: array[TTokKind, string] = [
     "invalid", "[EOF]", ".", "_", "identifier", "string literal",
     "character set", "(", ")", "{", "}", "<-", "/", "*", "+", "&", "!", "?",
-    "built-in", "escaped"
+    "@", "built-in", "escaped"
   ]
 
 proc HandleCR(L: var TPegLexer, pos: int): int =
@@ -1063,6 +1095,10 @@ proc getTok(c: var TPegLexer, tok: var TToken) =
     tok.kind = tkAmp
     inc(c.bufpos)
     add(tok.literal, '!')
+  of '@':
+    tok.kind = tkAt
+    inc(c.bufpos)
+    add(tok.literal, '@')
   else:
     add(tok.literal, c.buf[c.bufpos])
     inc(c.bufpos)
@@ -1118,6 +1154,9 @@ proc primary(p: var TPegParser): TPeg =
   of tkNot:
     getTok(p)
     return !primary(p)
+  of tkAt:
+    getTok(p)
+    return @primary(p)
   else: nil
   case p.tok.kind
   of tkIdentifier:
@@ -1187,7 +1226,7 @@ proc seqExpr(p: var TPegParser): TPeg =
   result = primary(p)
   while true:
     case p.tok.kind
-    of tkAmp, tkNot, tkStringLit, tkCharset, tkParLe, tkCurlyLe,
+    of tkAmp, tkNot, tkAt, tkStringLit, tkCharset, tkParLe, tkCurlyLe,
        tkAny, tkAnyRune, tkBuiltin, tkEscaped:
       result = sequence(result, primary(p))
     of tkIdentifier:
@@ -1261,6 +1300,7 @@ proc peg*(pattern: string): TPeg =
   result = parsePeg(pattern, "pattern")
 
 when isMainModule:
+  assert match("(a b c)", peg"'(' @ ')'")
   assert match("W_HI_Le", peg"\y 'while'")
   assert(not match("W_HI_L", peg"\y 'while'"))
   assert(not match("W_HI_Le", peg"\y v'while'"))
@@ -1315,6 +1355,11 @@ when isMainModule:
               """
   assert($g2 == "((A B) / (C D))")
   assert match("cccccdddddd", g2)
-  echo "var1=key; var2=key2".replace(peg"{\ident}'='{\ident}", "$1<-$2$2")
+  assert("var1=key; var2=key2".replace(peg"{\ident}'='{\ident}", "$1<-$2$2") ==
+         "var1<-keykey; var2<-key2key2")
   assert "var1=key; var2=key2".endsWith(peg"{\ident}'='{\ident}")
 
+  if "aaaaaa" =~ peg"'aa' !. / ({'a'})+":
+    assert matches[0] == "a"
+  else:
+    assert false