summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rwxr-xr-xdoc/pegdocs.txt8
-rwxr-xr-xlib/impure/re.nim18
-rwxr-xr-xlib/pure/pegs.nim55
-rwxr-xr-xlib/pure/strutils.nim4
-rwxr-xr-xlib/system.nim8
-rwxr-xr-xtools/nimgrep.nim126
-rwxr-xr-xweb/news.txt1
7 files changed, 174 insertions, 46 deletions
diff --git a/doc/pegdocs.txt b/doc/pegdocs.txt
index 05a7fdc58..eb7f4562f 100755
--- a/doc/pegdocs.txt
+++ b/doc/pegdocs.txt
@@ -27,7 +27,11 @@ notation           meaning
 ``{E}``            Capture: Apply expression `E` and store the substring
                    that matched `E` into a *capture* that can be accessed
                    after the matching process.
-``$i``             back reference to the ``i``th capture. ``i`` counts from 1. 
+``$i``             Back reference to the ``i``th capture. ``i`` counts from 1. 
+``$``              Anchor: Matches at the end of the input. No character 
+                   is consumed. Same as ``!.``. 
+``^``              Anchor: Matches at the start of the input. No character 
+                   is consumed. 
 ``&E``             And predicate: Indicate success if expression `E` matches
                    the text ahead; otherwise indicate failure. Do not consume
                    any text.
@@ -145,7 +149,7 @@ The PEG parser implements this grammar (written in PEG syntax)::
   rule <- identifier \s* "<-" expr ig
   identNoArrow <- identifier !(\s* "<-")
   prefixOpr <- ig '&' / ig '!' / ig '@' / ig '{@}' / ig '@@'
-  literal <- ig identifier? '$' [0-9]+
+  literal <- ig identifier? '$' [0-9]+ / '$' / '^' /
              ig identNoArrow / 
              ig charset / 
              ig stringlit / 
diff --git a/lib/impure/re.nim b/lib/impure/re.nim
index bce786087..9198a5bfe 100755
--- a/lib/impure/re.nim
+++ b/lib/impure/re.nim
@@ -80,6 +80,24 @@ proc matchOrFind(s: string, pattern: TRegEx, matches: var openarray[string],
     if a >= 0'i32: matches[i-1] = copy(s, int(a), int(b)-1)
     else: matches[i-1] = ""
   return rawMatches[1] - rawMatches[0]
+  
+proc findBounds*(s: string, pattern: TRegEx, matches: var openarray[string],
+                 start = 0): tuple[first, last: int] =
+  ## returns the first and the last (inclusive) index of the match of 
+  ## ``pattern`` in ``s`` and stores the captured
+  ## substrings in the array ``matches``. If it does not match, nothing
+  ## is written into ``matches`` and (-1,0) is returned.
+  var
+    rawMatches: array[0..maxSubpatterns * 3 - 1, cint]
+    res = pcre.Exec(pattern.h, nil, s, len(s), start, 0'i32,
+      cast[ptr cint](addr(rawMatches)), maxSubpatterns * 3)
+  if res < 0'i32: return (-1, 0)
+  for i in 1..int(res)-1:
+    var a = rawMatches[i * 2]
+    var b = rawMatches[i * 2 + 1]
+    if a >= 0'i32: matches[i-1] = copy(s, int(a), int(b)-1)
+    else: matches[i-1] = ""
+  return (rawMatches[0].int, rawMatches[1].int - 1)
 
 proc matchOrFind(s: string, pattern: TRegEx, start, flags: cint): cint =
   var rawMatches: array [0..maxSubpatterns * 3 - 1, cint]
diff --git a/lib/pure/pegs.nim b/lib/pure/pegs.nim
index 4628a3ff9..de968bff4 100755
--- a/lib/pure/pegs.nim
+++ b/lib/pure/pegs.nim
@@ -1,7 +1,7 @@
 #
 #
 #            Nimrod's Runtime Library
-#        (c) Copyright 2010 Andreas Rumpf
+#        (c) Copyright 2011 Andreas Rumpf
 #
 #    See the file "copying.txt", included in this
 #    distribution, for details about the copyright.
@@ -65,7 +65,8 @@ type
     pkSearch,           ## @a     --> Internal DSL: @a
     pkCapturedSearch,   ## {@} a  --> Internal DSL: @@a
     pkRule,             ## a <- b
-    pkList              ## a, b
+    pkList,             ## a, b
+    pkStartAnchor       ## ^      --> Internal DSL: startAnchor()
   TNonTerminalFlag = enum
     ntDeclared, ntUsed
   TNonTerminal {.final.} = object ## represents a non terminal symbol
@@ -264,6 +265,14 @@ proc UnicodeWhitespace*: TPeg {.inline.} =
   ## whitespace character.
   result.kind = pkWhitespace
 
+proc startAnchor*: TPeg {.inline.} = 
+  ## constructs the PEG ``^`` which matches the start of the input.  
+  result.kind = pkStartAnchor
+
+proc endAnchor*: TPeg {.inline.} = 
+  ## constructs the PEG ``$`` which matches the end of the input.  
+  result = !any()
+
 proc capture*(a: TPeg): TPeg {.nosideEffect, rtl, extern: "npegsCapture".} =
   ## constructs a capture with the PEG `a`
   result.kind = pkCapture
@@ -484,6 +493,8 @@ proc toStrAux(r: TPeg, res: var string) =
     for i in 0 .. high(r.sons):
       toStrAux(r.sons[i], res)
       add(res, "\n")  
+  of pkStartAnchor:
+    add(res, '^')
 
 proc `$` *(r: TPeg): string {.nosideEffect, rtl, extern: "npegsToString".} =
   ## converts a PEG to its string representation
@@ -496,6 +507,7 @@ type
   TCaptures* {.final.} = object ## contains the captured substrings.
     matches: array[0..maxSubpatterns-1, tuple[first, last: int]]
     ml: int
+    origStart: int
 
 proc bounds*(c: TCaptures, 
              i: range[0..maxSubpatterns-1]): tuple[first, last: int] = 
@@ -721,6 +733,9 @@ proc rawMatch*(s: string, p: TPeg, start: int, c: var TCaptures): int {.
     n.kind = succ(pkTerminal, ord(p.kind)-ord(pkBackRef)) 
     n.term = s.copy(a, b)
     result = rawMatch(s, n, start, c)
+  of pkStartAnchor:
+    if c.origStart == start: result = 0
+    else: result = -1
   of pkRule, pkList: assert false
 
 proc match*(s: string, pattern: TPeg, matches: var openarray[string],
@@ -730,6 +745,7 @@ proc match*(s: string, pattern: TPeg, matches: var openarray[string],
   ## match, nothing is written into ``matches`` and ``false`` is
   ## returned.
   var c: TCaptures
+  c.origStart = start
   result = rawMatch(s, pattern, start, c) == len(s) -start
   if result:
     for i in 0..c.ml-1:
@@ -739,6 +755,7 @@ proc match*(s: string, pattern: TPeg,
             start = 0): bool {.nosideEffect, rtl, extern: "npegs$1".} =
   ## returns ``true`` if ``s`` matches the ``pattern`` beginning from ``start``.
   var c: TCaptures
+  c.origStart = start
   result = rawMatch(s, pattern, start, c) == len(s)-start
 
 proc matchLen*(s: string, pattern: TPeg, matches: var openarray[string],
@@ -748,6 +765,7 @@ proc matchLen*(s: string, pattern: TPeg, matches: var openarray[string],
   ## of zero can happen. It's possible that a suffix of `s` remains
   ## that does not belong to the match.
   var c: TCaptures
+  c.origStart = start
   result = rawMatch(s, pattern, start, c)
   if result >= 0:
     for i in 0..c.ml-1:
@@ -760,6 +778,7 @@ proc matchLen*(s: string, pattern: TPeg,
   ## of zero can happen. It's possible that a suffix of `s` remains
   ## that does not belong to the match.
   var c: TCaptures
+  c.origStart = start
   result = rawMatch(s, pattern, start, c)
 
 proc find*(s: string, pattern: TPeg, matches: var openarray[string],
@@ -988,14 +1007,16 @@ type
     tkAt,               ## '@'
     tkBuiltin,          ## \identifier
     tkEscaped,          ## \\
-    tkDollar            ## '$'
+    tkBackref,          ## '$' followed by digits: back reference ``$i``
+    tkDollar,           ## '$' on its own: end anchor
+    tkHat               ## '^'
   
   TToken {.final.} = object  ## a token
     kind: TTokKind           ## the type of the token
     modifier: TModifier
     literal: string          ## the parsed (string) literal
     charset: set[char]       ## if kind == tkCharSet
-    index: int               ## if kind == tkDollar
+    index: int               ## if kind == tkBackref
   
   TPegLexer = object          ## the lexer object.
     bufpos: int               ## the current position within the buffer
@@ -1010,7 +1031,7 @@ const
     "invalid", "[EOF]", ".", "_", "identifier", "string literal",
     "character set", "(", ")", "{", "}", "{@}",
     "<-", "/", "*", "+", "&", "!", "?",
-    "@", "built-in", "escaped", "$"
+    "@", "built-in", "escaped", "$", "$", "^"
   ]
 
 proc HandleCR(L: var TPegLexer, pos: int): int =
@@ -1155,13 +1176,13 @@ proc getDollar(c: var TPegLexer, tok: var TToken) =
   var pos = c.bufPos + 1
   var buf = c.buf
   if buf[pos] in {'0'..'9'}:
-    tok.kind = tkDollar
+    tok.kind = tkBackref
     tok.index = 0
     while buf[pos] in {'0'..'9'}:
       tok.index = tok.index * 10 + ord(buf[pos]) - ord('0')
       inc(pos)
   else:
-    tok.kind = tkInvalid
+    tok.kind = tkDollar
   c.bufpos = pos
   
 proc getCharSet(c: var TPegLexer, tok: var TToken) = 
@@ -1280,7 +1301,8 @@ proc getTok(c: var TPegLexer, tok: var TToken) =
     tok.literal = "[EOF]"
   of 'a'..'z', 'A'..'Z', '\128'..'\255':
     getSymbol(c, tok)
-    if c.buf[c.bufpos] in {'\'', '"', '$'}:
+    if c.buf[c.bufpos] in {'\'', '"'} or 
+        c.buf[c.bufpos] == '$' and c.buf[c.bufpos+1] in {'0'..'9'}:
       case tok.literal
       of "i": tok.modifier = modIgnoreCase
       of "y": tok.modifier = modIgnoreStyle
@@ -1331,6 +1353,10 @@ proc getTok(c: var TPegLexer, tok: var TToken) =
       tok.kind = tkCurlyAt
       inc(c.bufpos)
       add(tok.literal, '@')
+  of '^':
+    tok.kind = tkHat
+    inc(c.bufpos)
+    add(tok.literal, '^')
   else:
     add(tok.literal, c.buf[c.bufpos])
     inc(c.bufpos)
@@ -1474,7 +1500,13 @@ proc primary(p: var TPegParser): TPeg =
   of tkEscaped:
     result = term(p.tok.literal[0]).token(p)
     getTok(p)
-  of tkDollar:
+  of tkDollar: 
+    result = endAnchor()
+    getTok(p)
+  of tkHat: 
+    result = startAnchor()
+    getTok(p)
+  of tkBackref:
     var m = p.tok.modifier
     if m == modNone: m = p.modifier
     result = modifiedBackRef(p.tok.index, m).token(p)
@@ -1502,7 +1534,8 @@ proc seqExpr(p: var TPegParser): TPeg =
   while true:
     case p.tok.kind
     of tkAmp, tkNot, tkAt, tkStringLit, tkCharset, tkParLe, tkCurlyLe,
-       tkAny, tkAnyRune, tkBuiltin, tkEscaped, tkDollar, tkCurlyAt:
+       tkAny, tkAnyRune, tkBuiltin, tkEscaped, tkDollar, tkBackref, 
+       tkHat, tkCurlyAt:
       result = sequence(result, primary(p))
     of tkIdentifier:
       if not arrowIsNextTok(p):
@@ -1693,3 +1726,5 @@ when isMainModule:
     peg"\skip(\s*) {\ident}'='{\ident}", "$1<-$2$2") ==
          "var1<-keykey;var2<-key2key2")
 
+  assert match("prefix/start", peg"^start$", 7)
+
diff --git a/lib/pure/strutils.nim b/lib/pure/strutils.nim
index 76ea068df..6a7f128c5 100755
--- a/lib/pure/strutils.nim
+++ b/lib/pure/strutils.nim
@@ -1,7 +1,7 @@
 #

 #

 #            Nimrod's Runtime Library

-#        (c) Copyright 2010 Andreas Rumpf

+#        (c) Copyright 2011 Andreas Rumpf

 #

 #    See the file "copying.txt", included in this

 #    distribution, for details about the copyright.

@@ -42,6 +42,8 @@ const
   IdentStartChars* = {'a'..'z', 'A'..'Z', '_'}

     ## the set of characters an identifier can start with

 

+  NewLines* = {'\13', '\10'}

+    ## the set of characters a newline terminator can start with

 

 proc toLower*(c: Char): Char {.noSideEffect, procvar,

   rtl, extern: "nsuToLowerChar".} =

diff --git a/lib/system.nim b/lib/system.nim
index d632b7367..7822e6cad 100755
--- a/lib/system.nim
+++ b/lib/system.nim
@@ -734,16 +734,16 @@ proc compileOption*(option: string): bool {.
   ## can be used to determine an on|off compile-time option. Example:
   ##
   ## .. code-block:: nimrod
-  ## when compileOption("floatchecks"): 
-  ##   echo "compiled with floating point NaN and Inf checks"
+  ##   when compileOption("floatchecks"): 
+  ##     echo "compiled with floating point NaN and Inf checks"
   
 proc compileOption*(option, arg: string): bool {.
   magic: "CompileOptionArg", noSideEffect.}
   ## can be used to determine an enum compile-time option. Example:
   ##
   ## .. code-block:: nimrod
-  ## when compileOption("opt", "size") and compileOption("gc", "boehm"): 
-  ##   echo "compiled with optimization for size and uses Boehm's GC"
+  ##   when compileOption("opt", "size") and compileOption("gc", "boehm"): 
+  ##     echo "compiled with optimization for size and uses Boehm's GC"
   
 include "system/inclrtl"
 
diff --git a/tools/nimgrep.nim b/tools/nimgrep.nim
index cc1f89a74..8ee1b8a76 100755
--- a/tools/nimgrep.nim
+++ b/tools/nimgrep.nim
@@ -1,7 +1,7 @@
 #
 #
 #           Nimrod Grep Utility
-#        (c) Copyright 2010 Andreas Rumpf
+#        (c) Copyright 2011 Andreas Rumpf
 #
 #    See the file "copying.txt", included in this
 #    distribution, for details about the copyright.
@@ -11,21 +11,28 @@ import
   os, strutils, parseopt, pegs, re, terminal
 
 const
-  Usage = """
-Usage: nimgrep [options] [pattern] [files/directory]
+  Version = "0.7"
+  Usage = "nimgrep - Nimrod Grep Utility Version " & version & """
+
+  (c) 2011 Andreas Rumpf
+Usage:
+  nimgrep [options] [pattern] [files/directory]
 Options:
   --find, -f          find the pattern (default)
   --replace, -r       replace the pattern
   --peg               pattern is a peg (default)
-  --re                pattern is a regular expression
+  --re                pattern is a regular expression; extended syntax for
+                      the regular expression is always turned on
   --recursive         process directories recursively
   --confirm           confirm each occurence/replacement; there is a chance 
-                      to abort any time without touching the file(s)
+                      to abort any time without touching the file
   --stdin             read pattern from stdin (to avoid the shell's confusing
                       quoting rules)
   --word, -w          the pattern should have word boundaries
   --ignore_case, -i   be case insensitive
   --ignore_style, -y  be style insensitive
+  --help, -h          shows this help
+  --version, -v       shows the version
 """
 
 type
@@ -48,7 +55,7 @@ proc ask(msg: string): string =
 
 proc Confirm: TConfirmEnum = 
   while true:
-    case normalize(ask("[a]bort; [y]es, a[l]l, [n]o, non[e]: "))
+    case normalize(ask("     [a]bort; [y]es, a[l]l, [n]o, non[e]: "))
     of "a", "abort": return ceAbort 
     of "y", "yes": return ceYes
     of "l", "all": return ceAll
@@ -56,12 +63,7 @@ proc Confirm: TConfirmEnum =
     of "e", "none": return ceNone
     else: nil
 
-proc highlight(a, b, c: string) = 
-  stdout.write(a)
-  terminal.WriteStyled(b)
-  stdout.writeln(c)
-
-proc countLines(s: string, first = 0, last = s.high): int = 
+proc countLines(s: string, first, last: int): int = 
   var i = first
   while i <= last:
     if s[i] == '\13': 
@@ -71,6 +73,37 @@ proc countLines(s: string, first = 0, last = s.high): int =
       inc result
     inc i
 
+proc beforePattern(s: string, first: int): int = 
+  result = first-1
+  while result >= 0:
+    if s[result] in newlines: break
+    dec(result)
+  inc(result)
+
+proc afterPattern(s: string, last: int): int = 
+  result = last+1
+  while result < s.len:
+    if s[result] in newlines: break
+    inc(result)
+  dec(result)
+
+proc highlight(s, match, repl: string, t: tuple[first, last: int],
+               line: int, showRepl: bool) = 
+  const alignment = 6
+  stdout.write(line.`$`.align(alignment), ": ")
+  var x = beforePattern(s, t.first)
+  var y = afterPattern(s, t.last)
+  for i in x .. t.first-1: stdout.write(s[i])
+  terminal.WriteStyled(match, {styleUnderscore, styleBright})
+  for i in t.last+1 .. y: stdout.write(s[i])
+  stdout.write("\n")
+  if showRepl:
+    stdout.write(repeatChar(alignment-1), "-> ")
+    for i in x .. t.first-1: stdout.write(s[i])
+    terminal.WriteStyled(repl, {styleUnderscore, styleBright})
+    for i in t.last+1 .. y: stdout.write(s[i])
+    stdout.write("\n")
+
 proc processFile(filename: string) = 
   var buffer = system.readFile(filename)
   if isNil(buffer): quit("cannot open file: " & filename)
@@ -92,53 +125,76 @@ proc processFile(filename: string) =
     
   var line = 1
   var i = 0
-  var matches: array[0..re.MaxSubpatterns-1. string]
+  var matches: array[0..re.MaxSubpatterns-1, string]
+  for j in 0..high(matches): matches[j] = ""
   var reallyReplace = true
   while i < buffer.len:
     var t: tuple[first, last: int]
-    if optRegex in options:
-      quit "to implement"
-    else:
+    if optRegex notin options:
       t = findBounds(buffer, pegp, matches, i)
-
+    else:
+      t = findBounds(buffer, rep, matches, i)
     if t.first <= 0: break
     inc(line, countLines(buffer, i, t.first-1))
     
     var wholeMatch = buffer.copy(t.first, t.last)
-    echo "line ", line, ": ", wholeMatch
     
-    if optReplace in options: 
-      var r = replace(wholeMatch, pegp, replacement)
-      
+    if optReplace notin options: 
+      highlight(buffer, wholeMatch, "", t, line, showRepl=false)
+    else:
+      var r: string
+      if optRegex notin options:
+        r = replace(wholeMatch, pegp, replacement % matches)
+      else: 
+        r = replace(wholeMatch, rep, replacement % matches)
       if optConfirm in options: 
+        highlight(buffer, wholeMatch, r, t, line, showRepl=true)
         case Confirm()
-        of ceAbort:
-        of ceYes:
+        of ceAbort: quit(0)
+        of ceYes: reallyReplace = true 
         of ceAll: 
           reallyReplace = true
+          options.excl(optConfirm)
         of ceNo:
           reallyReplace = false
         of ceNone:
           reallyReplace = false
+          options.excl(optConfirm)
+      else:
+        highlight(buffer, wholeMatch, r, t, line, showRepl=reallyReplace)
       if reallyReplace:
-        
+        result.add(buffer.copy(i, t.first-1))
+        result.add(r)
+      else:
+        result.add(buffer.copy(i, t.last))
 
     inc(line, countLines(buffer, t.first, t.last))
-    
     i = t.last+1
-    
+  if optReplace in options:
+    result.add(copy(buffer, i))
+    var f: TFile
+    if open(f, filename, fmWrite):
+      f.write(result)
+      f.close()
+    else:
+      quit "cannot open file for overwriting: " & filename
+
 
 proc walker(dir: string) = 
+  var isDir = false  # becomes true once walkDir yields at least one entry
   for kind, path in walkDir(dir):
+    isDir = true
     case kind
-    of pcFile: processFile(path)
-    of pcDirectory: 
+    of pcFile: 
+      processFile(path)
+    of pcDir: 
       if optRecursive in options:
         walker(path)
     else: nil
+  if not isDir and not existsDir(dir): processFile(dir) # skip empty dirs
 
 proc writeHelp() = quit(Usage)
-proc writeVersion() = quit("1.0")
+proc writeVersion() = quit(Version)
 
 proc checkOptions(subset: TOptions, a, b: string) =
   if subset <= options:
@@ -187,5 +243,17 @@ if pattern.len == 0:
   writeHelp()
 else: 
   if filename.len == 0: filename = os.getCurrentDir()
+  if optRegex notin options: 
+    if optIgnoreStyle in options: 
+      pattern = "\\y " & pattern
+    elif optIgnoreCase in options:
+      pattern = "\\i " & pattern
+    if optWord in options:
+      pattern = r"(&\letter? / ^ )(" & pattern & r") !\letter"
+  else:
+    if optIgnoreStyle in options: 
+      quit "ignorestyle not supported for regular expressions"
+    if optWord in options:
+      pattern = r"\b (?:" & pattern & r") \b" # non-capturing: keeps $1.. intact
   walker(filename)
 
diff --git a/web/news.txt b/web/news.txt
index 0a8f01cad..36deb92df 100755
--- a/web/news.txt
+++ b/web/news.txt
@@ -42,6 +42,7 @@ Additions
 - Pegs support new built-ins: ``\letter``, ``\upper``, ``\lower``,
   ``\title``, ``\white``.
 - Pegs support the new built-in ``\skip`` operation.
+- Pegs support the ``$`` and ``^`` anchors.
 - Source code filters are now documented.
 - Added ``emit`` pragma for direct code generator control.
 - Additional operations were added to the ``complex`` module.