diff options
author | Araq <rumpf_a@web.de> | 2014-08-31 13:46:27 +0200 |
---|---|---|
committer | Araq <rumpf_a@web.de> | 2014-08-31 13:46:27 +0200 |
commit | 3ba34f1762742682a54dfdc30986818b5c1ecd81 (patch) | |
tree | d996f1d21582a365eefd1f863555d5f5045b2be0 /lib/impure/re.nim | |
parent | 697789a3134b7923edd3fe1bd14faa23da6be35c (diff) | |
download | Nim-3ba34f1762742682a54dfdc30986818b5c1ecd81.tar.gz |
prettified re.nim; make some tests green
Diffstat (limited to 'lib/impure/re.nim')
-rw-r--r-- | lib/impure/re.nim | 115 |
1 files changed, 59 insertions, 56 deletions
diff --git a/lib/impure/re.nim b/lib/impure/re.nim index a129a5eba..fc9ed3ce0 100644 --- a/lib/impure/re.nim +++ b/lib/impure/re.nim @@ -28,7 +28,7 @@ const ## More subpatterns cannot be captured! type - TRegexFlag* = enum ## options for regular expressions + RegexFlag* = enum ## options for regular expressions reIgnoreCase = 0, ## do caseless matching reMultiLine = 1, ## ``^`` and ``$`` match newlines within data reDotAll = 2, ## ``.`` matches anything including NL @@ -36,17 +36,20 @@ type reStudy = 4 ## study the expression (may be omitted if the ## expression will be used only once) - TRegexDesc {.pure, final.} = object + RegexDesc = object h: PPcre e: ptr TExtra - TRegex* = ref TRegexDesc ## a compiled regular expression + Regex* = ref RegexDesc ## a compiled regular expression - EInvalidRegEx* = object of EInvalidValue + RegexError* = object of ValueError ## is raised if the pattern is no valid regular expression. +{.deprecated: [TRegexFlag: RegexFlag, TRegexDesc: RegexDesc, TRegex: Regex, + EInvalidRegEx: RegexError].} + proc raiseInvalidRegex(msg: string) {.noinline, noreturn.} = - var e: ref EInvalidRegEx + var e: ref RegexError new(e) e.msg = msg raise e @@ -55,11 +58,11 @@ proc rawCompile(pattern: string, flags: cint): PPcre = var msg: cstring offset: cint - result = pcre.Compile(pattern, flags, addr(msg), addr(offset), nil) + result = pcre.compile(pattern, flags, addr(msg), addr(offset), nil) if result == nil: raiseInvalidRegex($msg & "\n" & pattern & "\n" & repeatChar(offset) & "^\n") -proc finalizeRegEx(x: TRegex) = +proc finalizeRegEx(x: Regex) = # XXX This is a hack, but PCRE does not export its "free" function properly. # Sigh. The hack relies on PCRE's implementation (see ``pcre_get.c``). # Fortunately the implementation is unlikely to change. @@ -67,7 +70,7 @@ proc finalizeRegEx(x: TRegex) = if not isNil(x.e): pcre.free_substring(cast[cstring](x.e)) -proc re*(s: string, flags = {reExtended, reStudy}): TRegex = +proc re*(s: string, flags = {reExtended, reStudy}): Regex = ## Constructor of regular expressions. Note that Nim's ## extended raw string literals support this syntax ``re"[abc]"`` as ## a short form for ``re(r"[abc]")``. @@ -78,12 +81,12 @@ proc re*(s: string, flags = {reExtended, reStudy}): TRegex = result.e = pcre.study(result.h, 0, msg) if not isNil(msg): raiseInvalidRegex($msg) -proc matchOrFind(s: string, pattern: TRegex, matches: var openArray[string], +proc matchOrFind(s: string, pattern: Regex, matches: var openArray[string], start, flags: cint): cint = var - rawMatches: array[0..maxSubpatterns * 3 - 1, cint] - res = pcre.Exec(pattern.h, pattern.e, s, len(s).cint, start, flags, - cast[ptr cint](addr(rawMatches)), maxSubpatterns * 3) + rawMatches: array[0..MaxSubpatterns * 3 - 1, cint] + res = pcre.exec(pattern.h, pattern.e, s, len(s).cint, start, flags, + cast[ptr cint](addr(rawMatches)), MaxSubpatterns * 3) if res < 0'i32: return res for i in 1..int(res)-1: var a = rawMatches[i * 2] @@ -92,16 +95,16 @@ proc matchOrFind(s: string, pattern: TRegex, matches: var openArray[string], else: matches[i-1] = "" return rawMatches[1] - rawMatches[0] -proc findBounds*(s: string, pattern: TRegex, matches: var openArray[string], +proc findBounds*(s: string, pattern: Regex, matches: var openArray[string], start = 0): tuple[first, last: int] = ## returns the starting position and end position of `pattern` in `s` ## and the captured ## substrings in the array `matches`. If it does not match, nothing ## is written into `matches` and ``(-1,0)`` is returned. var - rawMatches: array[0..maxSubpatterns * 3 - 1, cint] - res = pcre.Exec(pattern.h, pattern.e, s, len(s).cint, start.cint, 0'i32, - cast[ptr cint](addr(rawMatches)), maxSubpatterns * 3) + rawMatches: array[0..MaxSubpatterns * 3 - 1, cint] + res = pcre.exec(pattern.h, pattern.e, s, len(s).cint, start.cint, 0'i32, + cast[ptr cint](addr(rawMatches)), MaxSubpatterns * 3) if res < 0'i32: return (-1, 0) for i in 1..int(res)-1: var a = rawMatches[i * 2] @@ -110,7 +113,7 @@ proc findBounds*(s: string, pattern: TRegex, matches: var openArray[string], else: matches[i-1] = "" return (rawMatches[0].int, rawMatches[1].int - 1) -proc findBounds*(s: string, pattern: TRegex, +proc findBounds*(s: string, pattern: Regex, matches: var openArray[tuple[first, last: int]], start = 0): tuple[first, last: int] = ## returns the starting position and end position of ``pattern`` in ``s`` @@ -118,9 +121,9 @@ proc findBounds*(s: string, pattern: TRegex, ## If it does not match, nothing is written into `matches` and ## ``(-1,0)`` is returned. var - rawMatches: array[0..maxSubpatterns * 3 - 1, cint] - res = pcre.Exec(pattern.h, pattern.e, s, len(s).cint, start.cint, 0'i32, - cast[ptr cint](addr(rawMatches)), maxSubpatterns * 3) + rawMatches: array[0..MaxSubpatterns * 3 - 1, cint] + res = pcre.exec(pattern.h, pattern.e, s, len(s).cint, start.cint, 0'i32, + cast[ptr cint](addr(rawMatches)), MaxSubpatterns * 3) if res < 0'i32: return (-1, 0) for i in 1..int(res)-1: var a = rawMatches[i * 2] @@ -129,25 +132,25 @@ proc findBounds*(s: string, pattern: TRegex, else: matches[i-1] = (-1,0) return (rawMatches[0].int, rawMatches[1].int - 1) -proc findBounds*(s: string, pattern: TRegex, +proc findBounds*(s: string, pattern: Regex, start = 0): tuple[first, last: int] = ## returns the starting position of `pattern` in `s`. If it does not ## match, ``(-1,0)`` is returned. var rawMatches: array[0..3 - 1, cint] - res = pcre.Exec(pattern.h, nil, s, len(s).cint, start.cint, 0'i32, + res = pcre.exec(pattern.h, nil, s, len(s).cint, start.cint, 0'i32, cast[ptr cint](addr(rawMatches)), 3) if res < 0'i32: return (int(res), 0) return (int(rawMatches[0]), int(rawMatches[1]-1)) -proc matchOrFind(s: string, pattern: TRegex, start, flags: cint): cint = - var rawMatches: array [0..maxSubpatterns * 3 - 1, cint] - result = pcre.Exec(pattern.h, pattern.e, s, len(s).cint, start, flags, - cast[ptr cint](addr(rawMatches)), maxSubpatterns * 3) +proc matchOrFind(s: string, pattern: Regex, start, flags: cint): cint = + var rawMatches: array [0..MaxSubpatterns * 3 - 1, cint] + result = pcre.exec(pattern.h, pattern.e, s, len(s).cint, start, flags, + cast[ptr cint](addr(rawMatches)), MaxSubpatterns * 3) if result >= 0'i32: result = rawMatches[1] - rawMatches[0] -proc match*(s: string, pattern: TRegex, matches: var openArray[string], +proc match*(s: string, pattern: Regex, matches: var openArray[string], start = 0): bool = ## returns ``true`` if ``s[start..]`` matches the ``pattern`` and ## the captured substrings in the array ``matches``. If it does not @@ -156,32 +159,32 @@ proc match*(s: string, pattern: TRegex, matches: var openArray[string], return matchOrFind(s, pattern, matches, start.cint, pcre.ANCHORED) == cint(s.len - start) -proc match*(s: string, pattern: TRegex, start = 0): bool = +proc match*(s: string, pattern: Regex, start = 0): bool = ## returns ``true`` if ``s[start..]`` matches the ``pattern``. return matchOrFind(s, pattern, start.cint, pcre.ANCHORED) == cint(s.len-start) -proc matchLen*(s: string, pattern: TRegex, matches: var openArray[string], +proc matchLen*(s: string, pattern: Regex, matches: var openArray[string], start = 0): int = ## the same as ``match``, but it returns the length of the match, ## if there is no match, -1 is returned. Note that a match length ## of zero can happen. return matchOrFind(s, pattern, matches, start.cint, pcre.ANCHORED) -proc matchLen*(s: string, pattern: TRegex, start = 0): int = +proc matchLen*(s: string, pattern: Regex, start = 0): int = ## the same as ``match``, but it returns the length of the match, ## if there is no match, -1 is returned. Note that a match length ## of zero can happen. return matchOrFind(s, pattern, start.cint, pcre.ANCHORED) -proc find*(s: string, pattern: TRegex, matches: var openArray[string], +proc find*(s: string, pattern: Regex, matches: var openArray[string], start = 0): int = ## returns the starting position of ``pattern`` in ``s`` and the captured ## substrings in the array ``matches``. If it does not match, nothing ## is written into ``matches`` and -1 is returned. var - rawMatches: array[0..maxSubpatterns * 3 - 1, cint] - res = pcre.Exec(pattern.h, pattern.e, s, len(s).cint, start.cint, 0'i32, - cast[ptr cint](addr(rawMatches)), maxSubpatterns * 3) + rawMatches: array[0..MaxSubpatterns * 3 - 1, cint] + res = pcre.exec(pattern.h, pattern.e, s, len(s).cint, start.cint, 0'i32, + cast[ptr cint](addr(rawMatches)), MaxSubpatterns * 3) if res < 0'i32: return res for i in 1..int(res)-1: var a = rawMatches[i * 2] @@ -190,33 +193,33 @@ proc find*(s: string, pattern: TRegex, matches: var openArray[string], else: matches[i-1] = "" return rawMatches[0] -proc find*(s: string, pattern: TRegex, start = 0): int = +proc find*(s: string, pattern: Regex, start = 0): int = ## returns the starting position of ``pattern`` in ``s``. If it does not ## match, -1 is returned. var rawMatches: array[0..3 - 1, cint] - res = pcre.Exec(pattern.h, nil, s, len(s).cint, start.cint, 0'i32, + res = pcre.exec(pattern.h, nil, s, len(s).cint, start.cint, 0'i32, cast[ptr cint](addr(rawMatches)), 3) if res < 0'i32: return res return rawMatches[0] -iterator findAll*(s: string, pattern: TRegex, start = 0): string = +iterator findAll*(s: string, pattern: Regex, start = 0): string = ## Yields all matching *substrings* of `s` that match `pattern`. ## ## Note that since this is an iterator you should not modify the string you ## are iterating over: bad things could happen. var i = int32(start) - var rawMatches: array[0..maxSubpatterns * 3 - 1, cint] + var rawMatches: array[0..MaxSubpatterns * 3 - 1, cint] while true: - let res = pcre.Exec(pattern.h, pattern.e, s, len(s).cint, i, 0'i32, - cast[ptr cint](addr(rawMatches)), maxSubpatterns * 3) + let res = pcre.exec(pattern.h, pattern.e, s, len(s).cint, i, 0'i32, + cast[ptr cint](addr(rawMatches)), MaxSubpatterns * 3) if res < 0'i32: break let a = rawMatches[0] let b = rawMatches[1] yield substr(s, int(a), int(b)-1) i = b -proc findAll*(s: string, pattern: TRegex, start = 0): seq[string] = +proc findAll*(s: string, pattern: Regex, start = 0): seq[string] = ## returns all matching *substrings* of `s` that match `pattern`. ## If it does not match, @[] is returned. accumulateResult(findAll(s, pattern, start)) @@ -224,7 +227,7 @@ proc findAll*(s: string, pattern: TRegex, start = 0): seq[string] = when not defined(nimhygiene): {.pragma: inject.} -template `=~` *(s: string, pattern: TRegex): expr = +template `=~` *(s: string, pattern: Regex): expr = ## This calls ``match`` with an implicit declared ``matches`` array that ## can be used in the scope of the ``=~`` call: ## @@ -242,32 +245,32 @@ template `=~` *(s: string, pattern: TRegex): expr = ## else: ## echo("syntax error") ## - bind maxSubPatterns + bind MaxSubPatterns when not declaredInScope(matches): var matches {.inject.}: array[0..MaxSubpatterns-1, string] match(s, pattern, matches) # ------------------------- more string handling ------------------------------ -proc contains*(s: string, pattern: TRegex, start = 0): bool = +proc contains*(s: string, pattern: Regex, start = 0): bool = ## same as ``find(s, pattern, start) >= 0`` return find(s, pattern, start) >= 0 -proc contains*(s: string, pattern: TRegex, matches: var openArray[string], +proc contains*(s: string, pattern: Regex, matches: var openArray[string], start = 0): bool = ## same as ``find(s, pattern, matches, start) >= 0`` return find(s, pattern, matches, start) >= 0 -proc startsWith*(s: string, prefix: TRegex): bool = +proc startsWith*(s: string, prefix: Regex): bool = ## returns true if `s` starts with the pattern `prefix` result = matchLen(s, prefix) >= 0 -proc endsWith*(s: string, suffix: TRegex): bool = +proc endsWith*(s: string, suffix: Regex): bool = ## returns true if `s` ends with the pattern `prefix` for i in 0 .. s.len-1: if matchLen(s, suffix, i) == s.len - i: return true -proc replace*(s: string, sub: TRegex, by = ""): string = +proc replace*(s: string, sub: Regex, by = ""): string = ## Replaces `sub` in `s` by the string `by`. Captures cannot be ## accessed in `by`. Examples: ## @@ -289,7 +292,7 @@ proc replace*(s: string, sub: TRegex, by = ""): string = prev = match.last + 1 add(result, substr(s, prev)) -proc replacef*(s: string, sub: TRegex, by: string): string = +proc replacef*(s: string, sub: Regex, by: string): string = ## Replaces `sub` in `s` by the string `by`. Captures can be accessed in `by` ## with the notation ``$i`` and ``$#`` (see strutils.`%`). Examples: ## @@ -302,7 +305,7 @@ proc replacef*(s: string, sub: TRegex, by: string): string = ## ## "var1<-keykey; val2<-key2key2" result = "" - var caps: array[0..maxSubpatterns-1, string] + var caps: array[0..MaxSubpatterns-1, string] var prev = 0 while true: var match = findBounds(s, sub, caps, prev) @@ -314,7 +317,7 @@ proc replacef*(s: string, sub: TRegex, by: string): string = when false: result = "" var i = 0 - var caps: array[0..maxSubpatterns-1, string] + var caps: array[0..MaxSubpatterns-1, string] while i < s.len: var x = matchLen(s, sub, caps, i) if x <= 0: @@ -327,12 +330,12 @@ proc replacef*(s: string, sub: TRegex, by: string): string = add(result, substr(s, i)) proc parallelReplace*(s: string, subs: openArray[ - tuple[pattern: TRegex, repl: string]]): string = + tuple[pattern: Regex, repl: string]]): string = ## Returns a modified copy of `s` with the substitutions in `subs` ## applied in parallel. result = "" var i = 0 - var caps: array[0..maxSubpatterns-1, string] + var caps: array[0..MaxSubpatterns-1, string] while i < s.len: block searchSubs: for j in 0..high(subs): @@ -347,14 +350,14 @@ proc parallelReplace*(s: string, subs: openArray[ add(result, substr(s, i)) proc transformFile*(infile, outfile: string, - subs: openArray[tuple[pattern: TRegex, repl: string]]) = + subs: openArray[tuple[pattern: Regex, repl: string]]) = ## reads in the file `infile`, performs a parallel replacement (calls ## `parallelReplace`) and writes back to `outfile`. Raises ``EIO`` if an ## error occurs. This is supposed to be used for quick scripting. var x = readFile(infile).string writeFile(outfile, x.parallelReplace(subs)) -iterator split*(s: string, sep: TRegex): string = +iterator split*(s: string, sep: Regex): string = ## Splits the string `s` into substrings. ## ## Substrings are separated by the regular expression `sep`. @@ -386,7 +389,7 @@ iterator split*(s: string, sep: TRegex): string = if first < last: yield substr(s, first, last-1) -proc split*(s: string, sep: TRegex): seq[string] = +proc split*(s: string, sep: Regex): seq[string] = ## Splits the string `s` into substrings. accumulateResult(split(s, sep)) |