Implement match, find, split

author: Flaviu Tamas <tamasflaviu@gmail.com> 2015-01-12 20:09:19 -0500
committer: Flaviu Tamas <tamasflaviu@gmail.com> 2015-01-12 20:09:19 -0500
commit: 2474758ed52bcdd48212ea51f7f05003c2468e4e (patch)
tree: b02eee56e5c91e06dbb98f20d45b1f88ae8ddfa3 /src/nre.nim
parent: 6fe0de0639fe3163376acfb3dad75e2264a88363 (diff)
download: Nim-2474758ed52bcdd48212ea51f7f05003c2468e4e.tar.gz
1 files changed, 119 insertions, 7 deletions
diff --git a/src/nre.nim b/src/nre.nim
index 29313e9eb..c9e9d9ecd 100644
--- a/src/nre.nim
+++ b/src/nre.nim
@@ -6,6 +6,7 @@ from future import lc, `[]`
 from strutils import toLower, `%`
 from math import ceil
 import optional_t
+from unicode import runeLenAt
 
 # Type definitions {{{
 type
@@ -54,6 +55,27 @@ proc captureNameId*(self: Regex): Table[string, int] =
   ## Returns a map from named capture groups to their numerical
   ## identifier
   return self.captureNameToId
+
+proc matchesCrLf(self: Regex): bool =
+  ## Reports whether CR LF counts as a newline for this pattern: first via
+  ## the pattern's own compiled options, then via PCRE's build-time default.
+  let opts = getinfo[cint](self, pcre.INFO_OPTIONS)
+  if (opts and (pcre.NEWLINE_CRLF or
+                pcre.NEWLINE_ANY or
+                pcre.NEWLINE_ANYCRLF)) > 0:
+    return true
+
+  # the pattern carries no such option, so consult the build config
+  var buildDefault: cint
+  if pcre.config(pcre.CONFIG_NEWLINE, addr buildDefault) != 0:
+    assert(false, "CONFIG_NEWLINE apparently got screwed up")
+
+  # only the CR LF pair (13, 10) and the codes -2 / -1 answer true
+  case buildDefault
+  of (13 shl 8) or 10, -2, -1:
+    result = true
+  else:
+    result = false
 # }}}
 
 # Capture accessors {{{
@@ -255,11 +277,7 @@ proc initRegex*(pattern: string, options = "Sx"): Regex =
   result.captureNameToId = result.getNameToNumberTable()
 # }}}
 
-proc match*(self: Regex, str: string, start = 0, endpos = -1): Option[RegexMatch] =
-  ## Returns Some if there is a match between `start` and `endpos`, otherwise
-  ## it returns None.
-  ##
-  ## if `endpos == -1`, then `endpos = str.len`
+proc matchImpl*(self: Regex, str: string, start, endpos: int, flags: int): Option[RegexMatch] =
   var result: RegexMatch
   new(result)
   result.pattern = self
@@ -277,11 +295,105 @@ proc match*(self: Regex, str: string, start = 0, endpos = -1): Option[RegexMatch
                           cstring(str),
                           cint(max(str.len, endpos)),
                           cint(start),
-                          cint(0),
-                          cast[ptr cint](addr result.pcreMatchBounds[0]), cint(vecsize))
+                          cint(flags),
+                          cast[ptr cint](addr result.pcreMatchBounds[0]),
+                          cint(vecsize))
   if execRet >= 0:
     return Some(result)
   elif execRet == pcre.ERROR_NOMATCH:
     return None[RegexMatch]()
   else:
     raise newException(AssertionError, "Internal error: errno " & $execRet)
+
+proc match*(self: Regex, str: string, start = 0, endpos = -1): Option[RegexMatch] =
+  ## Runs the pattern against `str` between `start` and `endpos`.
+  ##
+  ## Gives Some(match) when there is a match and None otherwise; an
+  ## `endpos` of `-1` stands for `str.len`.
+  result = self.matchImpl(str, start, endpos, 0)
+
+iterator findIter*(self: Regex, str: string, start = 0, endpos = -1): RegexMatch =
+  ## Yields each successive non-overlapping match of `self` in `str`,
+  ## scanning from `start` up to `endpos` (`-1` means `str.len`).
+  ## The advance logic follows the pcredemo sample; see it for explanation.
+  let matchesCrLf = self.matchesCrLf()
+  let unicode = (getinfo[cint](self, pcre.INFO_OPTIONS) and pcre.UTF8) != 0
+  let endpos = if endpos == -1: str.len else: endpos
+
+  var offset = start
+  var previousMatch: RegexMatch
+  while offset != endpos:
+    if offset > endpos:
+      # eos occurs in the middle of a unicode char? die.
+      raise newException(AssertionError, "Input string has malformed unicode")
+
+    var flags = 0
+    if previousMatch != nil and
+        previousMatch.matchBounds.a == previousMatch.matchBounds.b:
+      # 0-len match: retry at this offset, anchored and not allowed to be empty
+      flags = pcre.NOTEMPTY_ATSTART or pcre.ANCHORED
+
+    let currentMatch = self.matchImpl(str, offset, endpos, flags)
+    previousMatch = currentMatch.get(nil)
+
+    if currentMatch.isNone:
+      # No match here: step over exactly one character. Its width is
+      # measured at the current offset, before the offset is advanced.
+      if matchesCrLf and offset < (str.len - 1) and
+         str[offset] == '\r' and str[offset + 1] == '\l':
+        # if PCRE treats CrLf as newline, skip both at the same time
+        offset += 2
+      elif unicode:
+        # XXX what about invalid unicode?
+        offset += str.runeLenAt(offset)
+      else:
+        offset += 1
+    else:
+      let currentMatch = currentMatch.get
+      offset = currentMatch.matchBounds.b
+      yield currentMatch
+
+proc find*(self: Regex, str: string, start = 0, endpos = -1): Option[RegexMatch] =
+  ## Returns Some(first match yielded by `findIter`), or None if it yields none.
+  for hit in self.findIter(str, start, endpos): return Some(hit)
+  return None[RegexMatch]()
+
+proc findAll*(self: Regex, str: string, start = 0, endpos = -1): seq[RegexMatch] =
+  ## Collects every match yielded by `findIter`, in order.
+  accumulateResult(self.findIter(str, start, endpos))
+
+proc renderBounds(str: string, bounds: Slice[int]): string =
+  ## Presumably a debugging aid: `str` (indented one space, ended by "⫞")
+  ## on the first line, then "^" under positions `bounds.a .. bounds.b`.
+  result = " " & str & "⫞\n"
+  for pad in -1 .. <bounds.a: result.add(" ")
+  for mark in bounds.a .. bounds.b: result.add("^")
+
+proc split*(self: Regex, str: string): seq[string] =
+  ## Cuts `str` at every match of `self` and returns the text between
+  ## the matches; the matched text itself is left out.
+  result = @[]
+  var pieceStart = 0
+
+  for match in self.findIter(str):
+    # upper bound is exclusive, lower is inclusive:
+    #
+    # 0123456
+    #  ^^^
+    # (1, 4)
+    let span = match.matchBounds
+
+    # An empty match at the very start of the input is skipped:
+    # "12".split("") would otherwise be @["", "1", "2"] rather than
+    # the correct @["1", "2"].
+    let emptyAtStart =
+      pieceStart == 0 and span.a == 0 and span.b == 0
+    if not emptyAtStart:
+      result.add(str.substr(pieceStart, span.a - 1))
+
+    pieceStart = span.b
+
+  # Each match only emits the piece in front of it, so the text after
+  # the last match is still pending here: "1 2".split(/ /) needs to
+  # return @["1", "2"], and this line supplies the "2".
+  result.add(str.substr(pieceStart, str.len - 1))
author	Flaviu Tamas <tamasflaviu@gmail.com>	2015-01-12 20:09:19 -0500
committer	Flaviu Tamas <tamasflaviu@gmail.com>	2015-01-12 20:09:19 -0500
commit	2474758ed52bcdd48212ea51f7f05003c2468e4e (patch)
tree	b02eee56e5c91e06dbb98f20d45b1f88ae8ddfa3 /src/nre.nim
parent	6fe0de0639fe3163376acfb3dad75e2264a88363 (diff)
download	Nim-2474758ed52bcdd48212ea51f7f05003c2468e4e.tar.gz