fixes #5444 - nre.findIter keeps searching when no match is possible (#5453)

author: Florent <florent@napalu.ch> 2017-03-02 11:48:41 +0100
committer: Andreas Rumpf <rumpf_a@web.de> 2017-03-02 11:48:41 +0100
commit: 34a3d40d18ef4ff73c629e38738068fe509e3c6c (patch)
tree: 752560732786fe844d1d08f74c391c610491b0ae
parent: 32159ee827694985a5c1013a46761ad257ea7c75 (diff)
download: Nim-34a3d40d18ef4ff73c629e38738068fe509e3c6c.tar.gz
2 files changed, 25 insertions, 6 deletions
diff --git a/lib/impure/nre.nim b/lib/impure/nre.nim
index dda4b033f..4013182af 100644
--- a/lib/impure/nre.nim
+++ b/lib/impure/nre.nim
@@ -516,23 +516,23 @@ iterator findIter*(str: string, pattern: Regex, start = 0, endpos = int.high): R
   let unicode = uint32(getinfo[culong](pattern, pcre.INFO_OPTIONS) and
     pcre.UTF8) > 0u32
   let strlen = if endpos == int.high: str.len else: endpos+1
-
   var offset = start
   var match: Option[RegexMatch]
+  var neverMatched = true
+
   while true:
     var flags = 0
-
     if match.isSome and
        match.get.matchBounds.a > match.get.matchBounds.b:
       # 0-len match
       flags = pcre.NOTEMPTY_ATSTART
-
     match = str.matchImpl(pattern, offset, endpos, flags)
 
     if match.isNone:
       # either the end of the input or the string
-      # cannot be split here
-      if offset >= strlen:
+      # cannot be split here - we also need to bail if we have
+      # never matched, since then no match is possible at all
+      if offset >= strlen or neverMatched:
         break
 
       if matchesCrLf and offset < (str.len - 1) and
@@ -546,11 +546,11 @@ iterator findIter*(str: string, pattern: Regex, start = 0, endpos = int.high): R
       else:
         offset += 1
     else:
+      neverMatched = false
       offset = match.get.matchBounds.b + 1
 
       yield match.get
 
-
 proc find*(str: string, pattern: Regex, start = 0, endpos = int.high): Option[RegexMatch] =
   ## Finds the given pattern in the string between the end and start
   ## positions.
diff --git a/tests/stdlib/nre/find.nim b/tests/stdlib/nre/find.nim
index 94fdd0bc1..116d2111c 100644
--- a/tests/stdlib/nre/find.nim
+++ b/tests/stdlib/nre/find.nim
@@ -1,6 +1,7 @@
 import unittest, sequtils
 import nre except toSeq
 import optional_nonstrict
+import times, strutils
 
 suite "find":
   test "find text":
@@ -25,3 +26,21 @@ suite "find":
     check("word word".findAll(re"\b") == @["", "", "", ""])
     check("word\r\lword".findAll(re"(*ANYCRLF)(?m)$") == @["", ""])
     check("слово слово".findAll(re"(*U)\b") == @["", "", "", ""])
+
+  test "bail early":
+    ## We expect nothing to be found and the search should bail out early, which means
+    ## that the timing difference between searching small and large data should be well
+    ## within the tolerance.
+    const tolerance = 0.0001
+    var smallData = repeat("url.sequence = \"http://whatever.com/jwhrejrhrjrhrjhrrjhrjrhrjrh\"", 10)
+    var largeData = repeat("url.sequence = \"http://whatever.com/jwhrejrhrjrhrjhrrjhrjrhrjrh\"", 1000000)
+    var start = cpuTime()
+    check(largeData.findAll(re"url.*? = &#39;(.*?)&#39;") == newSeq[string]())
+    var stop = cpuTime()
+    var elapsedLarge = stop - start
+    start = cpuTime()
+    check(smallData.findAll(re"url.*? = &#39;(.*?)&#39;") == newSeq[string]())
+    stop = cpuTime()
+    var elapsedSmall = stop - start
+    var difference =  elapsedLarge - elapsedSmall
+    check(difference < tolerance)
author	Florent <florent@napalu.ch>	2017-03-02 11:48:41 +0100
committer	Andreas Rumpf <rumpf_a@web.de>	2017-03-02 11:48:41 +0100
commit	34a3d40d18ef4ff73c629e38738068fe509e3c6c (patch)
tree	752560732786fe844d1d08f74c391c610491b0ae
parent	32159ee827694985a5c1013a46761ad257ea7c75 (diff)
download	Nim-34a3d40d18ef4ff73c629e38738068fe509e3c6c.tar.gz