summary refs log tree commit diff stats
path: root/lib/impure/re.nim
diff options
context:
space:
mode:
Diffstat (limited to 'lib/impure/re.nim')
-rwxr-xr-x lib/impure/re.nim 144
1 files changed, 110 insertions, 34 deletions
diff --git a/lib/impure/re.nim b/lib/impure/re.nim
index b74116395..36adf5d1f 100755
--- a/lib/impure/re.nim
+++ b/lib/impure/re.nim
@@ -1,7 +1,7 @@
 #
 #
 #            Nimrod's Runtime Library
-#        (c) Copyright 2010 Andreas Rumpf
+#        (c) Copyright 2011 Andreas Rumpf
 #
 #    See the file "copying.txt", included in this
 #    distribution, for details about the copyright.
@@ -32,46 +32,57 @@ type
     reIgnoreCase = 0,    ## do caseless matching
     reMultiLine = 1,     ## ``^`` and ``$`` match newlines within data 
     reDotAll = 2,        ## ``.`` matches anything including NL
-    reExtended = 3       ## ignore whitespace and ``#`` comments
+    reExtended = 3,      ## ignore whitespace and ``#`` comments
+    reStudy = 4          ## study the expression (may be omitted if the
+                         ## expression will be used only once)
     
   TRegExDesc {.pure, final.}  = object 
     h: PPcre
+    e: ptr TExtra
     
   TRegEx* = ref TRegExDesc ## a compiled regular expression
     
   EInvalidRegEx* = object of EInvalidValue
     ## is raised if the pattern is no valid regular expression.
 
+proc raiseInvalidRegex(msg: string) {.noinline, noreturn.} =
+  ## raises ``EInvalidRegEx`` with the message `msg`.
+  var err: ref EInvalidRegEx
+  new(err); err.msg = msg
+  raise err
+  
 proc rawCompile(pattern: string, flags: cint): PPcre =
   var
     msg: CString
     offset: cint
-    com = pcre.Compile(pattern, flags, addr(msg), addr(offset), nil)
-  if com == nil:
-    var e: ref EInvalidRegEx
-    new(e)
-    e.msg = $msg & "\n" & pattern & "\n" & repeatChar(offset) & "^\n"
-    raise e
-  return com
+  result = pcre.Compile(pattern, flags, addr(msg), addr(offset), nil)
+  if result != nil: return  # else: point at `offset` with a ``^`` marker
+  raiseInvalidRegEx($msg & "\n" & pattern & "\n" & repeatChar(offset) & "^\n")
 
 proc finalizeRegEx(x: TRegEx) = 
   # XXX This is a hack, but PCRE does not export its "free" function properly.
   # Sigh. The hack relies on PCRE's implementation (see ``pcre_get.c``).
   # Fortunately the implementation is unlikely to change. 
   pcre.free_substring(cast[cstring](x.h))
+  # the study data is optional; it is released through the same hack:
+  if x.e != nil: pcre.free_substring(cast[cstring](x.e))
 
-proc re*(s: string, flags = {reExtended}): TRegEx =
+proc re*(s: string, flags = {reExtended, reStudy}): TRegEx =
   ## Constructor of regular expressions. Note that Nimrod's
   ## extended raw string literals support this syntax ``re"[abc]"`` as
   ## a short form for ``re(r"[abc]")``.
   new(result, finalizeRegEx)
-  result.h = rawCompile(s, cast[cint](flags))
-  
+  result.h = rawCompile(s, cast[cint](flags - {reStudy}))  # handled below
+  if reStudy in flags:
+    var msg: cstring  # checked below: non-nil means studying failed
+    result.e = pcre.study(result.h, 0, msg)
+    if not isNil(msg): raiseInvalidRegex($msg)
+
 proc matchOrFind(s: string, pattern: TRegEx, matches: var openarray[string],
                  start, flags: cint): cint =
   var
     rawMatches: array[0..maxSubpatterns * 3 - 1, cint]
-    res = pcre.Exec(pattern.h, nil, s, len(s), start, flags,
+    res = pcre.Exec(pattern.h, pattern.e, s, len(s), start, flags,
       cast[ptr cint](addr(rawMatches)), maxSubpatterns * 3)
   if res < 0'i32: return res
   for i in 1..int(res)-1:
@@ -83,13 +94,13 @@ proc matchOrFind(s: string, pattern: TRegEx, matches: var openarray[string],
   
 proc findBounds*(s: string, pattern: TRegEx, matches: var openarray[string],
                  start = 0): tuple[first, last: int] =
-  ## returns the starting position and end position of ``pattern`` in ``s`` 
+  ## returns the starting position and end position of `pattern` in `s` 
   ## and the captured
-  ## substrings in the array ``matches``. If it does not match, nothing
-  ## is written into ``matches`` and (-1,0) is returned.
+  ## substrings in the array `matches`. If it does not match, nothing
+  ## is written into `matches` and ``(-1,0)`` is returned.
   var
     rawMatches: array[0..maxSubpatterns * 3 - 1, cint]
-    res = pcre.Exec(pattern.h, nil, s, len(s), start, 0'i32,
+    res = pcre.Exec(pattern.h, pattern.e, s, len(s), start, 0'i32,
       cast[ptr cint](addr(rawMatches)), maxSubpatterns * 3)
   if res < 0'i32: return (-1, 0)
   for i in 1..int(res)-1:
@@ -98,10 +109,40 @@ proc findBounds*(s: string, pattern: TRegEx, matches: var openarray[string],
     if a >= 0'i32: matches[i-1] = copy(s, int(a), int(b)-1)
     else: matches[i-1] = ""
   return (rawMatches[0].int, rawMatches[1].int - 1)
+  
+proc findBounds*(s: string, pattern: TRegEx,
+                 matches: var openarray[tuple[first, last: int]],
+                 start = 0): tuple[first, last: int] =
+  ## returns the first and last position of `pattern` in `s` and stores
+  ## the first and last position of each capture in the array `matches`.
+  ## If it does not match, nothing is written into `matches` and
+  ## ``(-1,0)`` is returned.
+  var
+    offsets: array[0..maxSubpatterns * 3 - 1, cint]
+    count = pcre.Exec(pattern.h, pattern.e, s, len(s), start, 0'i32,
+      cast[ptr cint](addr(offsets)), maxSubpatterns * 3)
+  if count < 0'i32: return (-1, 0)
+  for group in 1..int(count)-1:
+    var lo = offsets[group * 2]
+    # captures with a negative start offset are reported as ``(-1,0)``:
+    if lo < 0'i32: matches[group-1] = (-1,0)
+    else: matches[group-1] = (int(lo), int(offsets[group * 2 + 1])-1)
+  return (offsets[0].int, offsets[1].int - 1)
 
+proc findBounds*(s: string, pattern: TRegEx,
+                 start = 0): tuple[first, last: int] =
+  ## returns the first and last position of `pattern` in `s`. If it does
+  ## not match (or PCRE reports an error), ``(-1,0)`` is returned.
+  var
+    rawMatches: array[0..3 - 1, cint]
+    res = pcre.Exec(pattern.h, pattern.e, s, len(s), start, 0'i32,
+      cast[ptr cint](addr(rawMatches)), 3)
+  if res < 0'i32: return (-1, 0)
+  return (int(rawMatches[0]), int(rawMatches[1]-1))
+  
 proc matchOrFind(s: string, pattern: TRegEx, start, flags: cint): cint =
   var rawMatches: array [0..maxSubpatterns * 3 - 1, cint]
-  result = pcre.Exec(pattern.h, nil, s, len(s), start, flags,
+  result = pcre.Exec(pattern.h, pattern.e, s, len(s), start, flags,
                     cast[ptr cint](addr(rawMatches)), maxSubpatterns * 3)
   if result >= 0'i32:
     result = rawMatches[1] - rawMatches[0]
@@ -139,7 +180,7 @@ proc find*(s: string, pattern: TRegEx, matches: var openarray[string],
   ## is written into ``matches`` and -1 is returned.
   var
     rawMatches: array[0..maxSubpatterns * 3 - 1, cint]
-    res = pcre.Exec(pattern.h, nil, s, len(s), start, 0'i32,
+    res = pcre.Exec(pattern.h, pattern.e, s, len(s), start, 0'i32,
       cast[ptr cint](addr(rawMatches)), maxSubpatterns * 3)
   if res < 0'i32: return res
   for i in 1..int(res)-1:
@@ -219,31 +260,64 @@ proc endsWith*(s: string, suffix: TRegEx): bool =
   for i in 0 .. s.len-1:
     if matchLen(s, suffix, i) == s.len - i: return true
 
-proc replace*(s: string, sub: TRegEx, by: string): string =
+proc replace*(s: string, sub: TRegEx, by = ""): string =
+  ## Replaces `sub` in `s` by the string `by`. Captures cannot be
+  ## accessed in `by`. Examples:
+  ##
+  ## .. code-block:: nimrod
+  ##   "var1=key; var2=key2".replace(re"(\w+)=(\w+)")
+  ##
+  ## Results in ``"; "``.
+  result = ""
+  var prev = 0
+  while true:
+    var match = findBounds(s, sub, prev)
+    if match.first < 0: break
+    add(result, copy(s, prev, match.first-1))
+    add(result, by)
+    prev = match.last + 1
+    if match.last < match.first:  # empty match: step over one character
+      if prev >= s.len: break
+      add(result, s[prev])
+      inc(prev)
+  add(result, copy(s, prev))
+  
+proc replacef*(s: string, sub: TRegEx, by: string): string =
   ## Replaces `sub` in `s` by the string `by`. Captures can be accessed in `by`
   ## with the notation ``$i`` and ``$#`` (see strutils.`%`). Examples:
   ##
   ## .. code-block:: nimrod
-  ##   "var1=key; var2=key2".replace(re"(\w+)'='(\w+)", "$1<-$2$2")
+  ##   "var1=key; var2=key2".replacef(re"(\w+)=(\w+)", "$1<-$2$2")
   ##
   ## Results in:
   ##
   ## .. code-block:: nimrod
   ##
-  ##   "var1<-keykey; val2<-key2key2"
+  ##   "var1<-keykey; var2<-key2key2"
+  ##
+  ## Text between two matches is copied unchanged. If `sub` matches the
+  ## empty string, `by` is inserted at that position, the character that
+  ## follows is copied unchanged and the search resumes behind it, so
+  ## patterns like ``re"x*"`` terminate too.
   result = ""
-  var i = 0
   var caps: array[0..maxSubpatterns-1, string]
-  while i < s.len:
-    var x = matchLen(s, sub, caps, i)
-    if x <= 0:
-      add(result, s[i])
-      inc(i)
-    else:
-      addf(result, by, caps)
-      inc(i, x)
-  # copy the rest:
-  add(result, copy(s, i))
+  # `prev` is the index of the first character not yet copied to `result`:
+  var prev = 0
+  while true:
+    var match = findBounds(s, sub, caps, prev)
+    if match.first < 0: break
+    # the text in front of the match, then the expanded replacement:
+    add(result, copy(s, prev, match.first-1))
+    addf(result, by, caps)
+    prev = match.last + 1
+    if match.last < match.first:
+      # An empty match consumes no input. Step over one character so that
+      # the next search starts further right; stop at the end of `s`.
+      if prev >= s.len: break
+      add(result, s[prev])
+      inc(prev)
+  # copy the rest:
+  add(result, copy(s, prev))
   
 proc parallelReplace*(s: string, subs: openArray[
                       tuple[pattern: TRegEx, repl: string]]): string = 
@@ -376,8 +450,10 @@ when isMainModule:
     assert false
     
   assert "var1=key; var2=key2".endsWith(re"\w+=\w+")
-  assert("var1=key; var2=key2".replace(re"(\w+)=(\w+)", "$1<-$2$2") ==
+  assert("var1=key; var2=key2".replacef(re"(\w+)=(\w+)", "$1<-$2$2") ==
          "var1<-keykey; var2<-key2key2")
+  assert("var1=key; var2=key2".replace(re"(\w+)=(\w+)", "$1<-$2$2") ==
+         "$1<-$2$2; $1<-$2$2")
 
   for word in split("00232this02939is39an22example111", re"\d+"):
     writeln(stdout, word)