diff options
Diffstat (limited to 'lib')
-rw-r--r-- | lib/impure/nre.nim | 701 | ||||
-rw-r--r-- | lib/impure/nre/.gitignore | 9 | ||||
-rw-r--r-- | lib/impure/nre/private/pcre.nim | 442 | ||||
-rw-r--r-- | lib/impure/nre/private/util.nim | 62 |
4 files changed, 1214 insertions, 0 deletions
diff --git a/lib/impure/nre.nim b/lib/impure/nre.nim new file mode 100644 index 000000000..a351fd8d3 --- /dev/null +++ b/lib/impure/nre.nim @@ -0,0 +1,701 @@
#
#            Nim's Runtime Library
#        (c) Copyright 2015 Nim Contributors
#
#    See the file "copying.txt", included in this
#    distribution, for details about the copyright.
#


import nre.private.pcre as pcre
import nre.private.util
import tables
import unsigned
from future import lc, `[]`
from strutils import toLower, `%`
from math import ceil
import options
from unicode import runeLenAt


## What is NRE?
## ============
##
## A regular expression library for Nim using PCRE to do the hard work.
##
## Why?
## ----
##
## The `re.nim <http://nim-lang.org/re.html>`__ module that
## `Nim <http://nim-lang.org/>`__ provides in its standard library is
## inadequate:
##
## - It provides only a limited number of captures, while the underlying
##   library (PCRE) allows an unlimited number.
##
## - Instead of having one proc that returns both the bounds and
##   substring, it has one for the bounds and another for the substring.
##
## - If the splitting regex is empty (``""``), then it returns the input
##   string instead of following `Perl <https://ideone.com/dDMjmz>`__,
##   `Javascript <http://jsfiddle.net/xtcbxurg/>`__, and
##   `Java <https://ideone.com/hYJuJ5>`__'s precedent of returning a list
##   of each character (``"123".split(re"") == @["1", "2", "3"]``).
##
## Licensing
## ---------
##
## PCRE has some additional terms that you must comply with if you use this module.::
##
## > Copyright (c) 1997-2001 University of Cambridge
## >
## > Permission is granted to anyone to use this software for any purpose on any
## > computer system, and to redistribute it freely, subject to the following
## > restrictions:
## >
## > 1. This software is distributed in the hope that it will be useful,
## >    but WITHOUT ANY WARRANTY; without even the implied warranty of
## >    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
## >
## > 2. The origin of this software must not be misrepresented, either by
## >    explicit claim or by omission. In practice, this means that if you use
## >    PCRE in software that you distribute to others, commercially or
## >    otherwise, you must put a sentence like this
## >
## >      Regular expression support is provided by the PCRE library package,
## >      which is open source software, written by Philip Hazel, and copyright
## >      by the University of Cambridge, England.
## >
## >    somewhere reasonably visible in your documentation and in any relevant
## >    files or online help data or similar. A reference to the ftp site for
## >    the source, that is, to
## >
## >      ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre/
## >
## >    should also be given in the documentation. However, this condition is not
## >    intended to apply to whole chains of software. If package A includes PCRE,
## >    it must acknowledge it, but if package B is software that includes package
## >    A, the condition is not imposed on package B (unless it uses PCRE
## >    independently).
## >
## > 3. Altered versions must be plainly marked as such, and must not be
## >    misrepresented as being the original software.
## >
## > 4. If PCRE is embedded in any software that is released under the GNU
## >    General Purpose Licence (GPL), or Lesser General Purpose Licence (LGPL),
## >    then the terms of that licence shall supersede any condition above with
## >    which it is incompatible.


# Type definitions {{{
type
  Regex* = ref object
    ## Represents the pattern that things are matched against, constructed with
    ## ``re(string)``. Examples: ``re"foo"``, ``re(r"(*ANYCRLF)(?x)foo #
    ## comment")``.
    ##
    ## ``pattern: string``
    ##     the string that was used to create the pattern.
    ##
    ## ``captureCount: int``
    ##     the number of captures that the pattern has.
    ##
    ## ``captureNameId: Table[string, int]``
    ##     a table from the capture names to their numeric id.
    ##
    ##
    ## Options
    ## .......
    ##
    ## The following options may appear anywhere in the pattern, and they affect
    ## the rest of it.
    ##
    ## -  ``(?i)`` - case insensitive
    ## -  ``(?m)`` - multi-line: ``^`` and ``$`` match the beginning and end of
    ##    lines, not of the subject string
    ## -  ``(?s)`` - ``.`` also matches newline (*dotall*)
    ## -  ``(?U)`` - expressions are not greedy by default. ``?`` can be added
    ##    to a qualifier to make it greedy
    ## -  ``(?x)`` - whitespace and comments (``#``) are ignored (*extended*)
    ## -  ``(?X)`` - character escapes without special meaning (``\w`` vs.
    ##    ``\a``) are errors (*extra*)
    ##
    ## One or a combination of these options may appear only at the beginning
    ## of the pattern:
    ##
    ## -  ``(*UTF8)`` - treat both the pattern and subject as UTF-8
    ## -  ``(*UCP)`` - Unicode character properties; ``\w`` matches ``я``
    ## -  ``(*U)`` - a combination of the two options above
    ## -  ``(*FIRSTLINE)`` - fails if there is not a match on the first line
    ## -  ``(*NO_AUTO_CAPTURE)`` - turn off auto-capture for groups;
    ##    ``(?<name>...)`` can be used to capture
    ## -  ``(*CR)`` - newlines are separated by ``\r``
    ## -  ``(*LF)`` - newlines are separated by ``\n`` (UNIX default)
    ## -  ``(*CRLF)`` - newlines are separated by ``\r\n`` (Windows default)
    ## -  ``(*ANYCRLF)`` - newlines are separated by any of the above
    ## -  ``(*ANY)`` - newlines are separated by any of the above and Unicode
    ##    newlines:
    ##
    ##     single characters VT (vertical tab, U+000B), FF (form feed, U+000C),
    ##     NEL (next line, U+0085), LS (line separator, U+2028), and PS
    ##     (paragraph separator, U+2029). For the 8-bit library, the last two
    ##     are recognized only in UTF-8 mode.
    ##     — man pcre
    ##
    ## -  ``(*JAVASCRIPT_COMPAT)`` - JavaScript compatibility
    ## -  ``(*NO_STUDY)`` - turn off studying; study is enabled by default
    ##
    ## For more details on the leading option groups, see the `Option
    ## Setting <http://man7.org/linux/man-pages/man3/pcresyntax.3.html#OPTION_SETTING>`__
    ## and the `Newline
    ## Convention <http://man7.org/linux/man-pages/man3/pcresyntax.3.html#NEWLINE_CONVENTION>`__
    ## sections of the `PCRE syntax
    ## manual <http://man7.org/linux/man-pages/man3/pcresyntax.3.html>`__.
    pattern*: string  ## not nil
    pcreObj: ptr pcre.Pcre  ## not nil
    pcreExtra: ptr pcre.ExtraData  ## nil

    captureNameToId: Table[string, int]

  RegexMatch* = object
    ## Usually seen as Option[RegexMatch], it represents the result of an
    ## execution. On failure, it is none, on success, it is some.
    ##
    ## ``pattern: Regex``
    ##     the pattern that is being matched
    ##
    ## ``str: string``
    ##     the string that was matched against
    ##
    ## ``captures[]: string``
    ##     the string value of whatever was captured at that id. If the value
    ##     is invalid, then behavior is undefined. If the id is ``-1``, then
    ##     the whole match is returned. If the given capture was not matched,
    ##     ``nil`` is returned.
    ##
    ##     -  ``"abc".match(re"(\w)").get.captures[0] == "a"``
    ##     -  ``"abc".match(re"(?<letter>\w)").get.captures["letter"] == "a"``
    ##     -  ``"abc".match(re"(\w)\w").get.captures[-1] == "ab"``
    ##
    ## ``captureBounds[]: Option[Slice[int]]``
    ##     gets the bounds of the given capture according to the same rules as
    ##     the above. If the capture is not filled, then ``None`` is returned.
    ##     The bounds are both inclusive.
    ##
    ##     -  ``"abc".match(re"(\w)").get.captureBounds[0] == some(0 .. 0)``
    ##     -  ``"abc".match(re"").get.captureBounds[-1] == some(0 .. -1)``
    ##     -  ``"abc".match(re"abc").get.captureBounds[-1] == some(0 .. 2)``
    ##
    ## ``match: string``
    ##     the full text of the match.
    ##
    ## ``matchBounds: Slice[int]``
    ##     the bounds of the match, as in ``captureBounds[]``
    ##
    ## ``(captureBounds|captures).toTable``
    ##     returns a table with each named capture as a key.
    ##
    ## ``(captureBounds|captures).toSeq``
    ##     returns all the captures by their number.
    ##
    ## ``$: string``
    ##     same as ``match``
    pattern*: Regex  ## The regex doing the matching.
                     ## Not nil.
    str*: string  ## The string that was matched against.
                  ## Not nil.
    pcreMatchBounds: seq[Slice[cint]] ## First item is the bounds of the match
                                      ## Other items are the captures
                                      ## `a` is inclusive start, `b` is exclusive end

  Captures* = distinct RegexMatch
  CaptureBounds* = distinct RegexMatch

  RegexException* = ref object of Exception

  RegexInternalError* = ref object of RegexException
    ## Internal error in the module, this probably means that there is a bug

  InvalidUnicodeError* = ref object of RegexException
    ## Thrown when matching fails due to invalid unicode in strings
    pos*: int  ## the location of the invalid unicode in bytes

  SyntaxError* = ref object of RegexException
    ## Thrown when there is a syntax error in the
    ## regular expression string passed in
    pos*: int  ## the location of the syntax error in bytes
    pattern*: string  ## the pattern that caused the problem

  StudyError* = ref object of RegexException
    ## Thrown when studying the regular expression fails
    ## for whatever reason. The message contains the error
    ## code.
# }}}

proc getinfo[T](pattern: Regex, opt: cint): T =
  ## Asks ``pcre.fullinfo`` for property ``opt`` of the compiled ``pattern``
  ## and returns the value PCRE wrote, typed as ``T``. The caller picks ``T``
  ## so that it matches what PCRE stores for ``opt``.
  ##
  ## Raises ``FieldError`` when ``pcre.fullinfo`` returns a negative code.
  let retcode = pcre.fullinfo(pattern.pcreObj, pattern.pcreExtra, opt, addr result)

  if retcode < 0:
    # XXX Error message that doesn't expose implementation details
    raise newException(FieldError, "Invalid getinfo for $1, errno $2" % [$opt, $retcode])

# Regex accessors {{{
proc captureCount*(pattern: Regex): int =
  ## The number of capture groups in ``pattern`` (``INFO_CAPTURECOUNT``).
  return getinfo[cint](pattern, pcre.INFO_CAPTURECOUNT)

proc captureNameId*(pattern: Regex): Table[string, int] =
  ## The table from capture names to their zero-based numeric id, as built
  ## when the regex was created.
  return pattern.captureNameToId

proc matchesCrLf(pattern: Regex): bool =
  ## Returns true when the newline convention in effect for ``pattern``
  ## accepts ``\r\n`` as one newline (CRLF, ANYCRLF or ANY).
  ##
  ## The convention compiled into the pattern wins; when the pattern does not
  ## set one, the default from the PCRE build configuration is used.
  let flags = uint32(getinfo[culong](pattern, pcre.INFO_OPTIONS))
  # The NEWLINE_* options are values of a single multi-bit field, not
  # independent flags (NEWLINE_CRLF == NEWLINE_CR or NEWLINE_LF). The field
  # therefore has to be isolated and compared as a whole; merely testing for
  # a non-zero intersection would also accept plain CR and plain LF.
  let newlineFlags = flags and uint32(pcre.NEWLINE_CRLF or
                                      pcre.NEWLINE_ANY or
                                      pcre.NEWLINE_ANYCRLF)
  if newlineFlags != 0u32:
    return newlineFlags == uint32(pcre.NEWLINE_CRLF) or
           newlineFlags == uint32(pcre.NEWLINE_ANY) or
           newlineFlags == uint32(pcre.NEWLINE_ANYCRLF)

  # get flags from build config
  var confFlags: cint
  if pcre.config(pcre.CONFIG_NEWLINE, addr confFlags) != 0:
    assert(false, "CONFIG_NEWLINE apparently got screwed up")

  case confFlags
  of 13: return false                # CR
  of 10: return false                # LF
  of (13 shl 8) or 10: return true   # CRLF
  of -2: return true                 # ANYCRLF
  of -1: return true                 # ANY
  else: return false
# }}}

# Capture accessors {{{
proc captureBounds*(pattern: RegexMatch): CaptureBounds = return CaptureBounds(pattern)
  ## A view of the match whose ``[]`` yields capture bounds.

proc captures*(pattern: RegexMatch): Captures = return Captures(pattern)
  ## A view of the match whose ``[]`` yields captured strings.

proc `[]`*(pattern: CaptureBounds, i: int): Option[Slice[int]] =
  ## Inclusive bounds of capture ``i`` (``-1`` is the whole match), or
  ## ``none`` when that capture did not participate in the match.
  let pattern = RegexMatch(pattern)
  # slot 0 holds the whole match, so capture ids are shifted by one
  if pattern.pcreMatchBounds[i + 1].a != -1:
    let bounds = pattern.pcreMatchBounds[i + 1]
    # stored end is exclusive; the public bounds are inclusive
    return some(int(bounds.a) .. int(bounds.b-1))
  else:
    return none(Slice[int])

proc `[]`*(pattern: Captures, i: int): string =
  ## Text of capture ``i`` (``-1`` is the whole match), or ``nil`` when that
  ## capture did not participate in the match.
  let pattern = RegexMatch(pattern)
  let bounds = pattern.captureBounds[i]

  if bounds.isSome:
    let bounds = bounds.get
    return pattern.str.substr(bounds.a, bounds.b)
  else:
    return nil

proc match*(pattern: RegexMatch): string =
  ## The full text of the match.
  return pattern.captures[-1]

proc matchBounds*(pattern: RegexMatch): Slice[int] =
  ## The inclusive bounds of the full match.
  return pattern.captureBounds[-1].get

proc `[]`*(pattern: CaptureBounds, name: string): Option[Slice[int]] =
  ## Bounds of the named capture ``name``; see the ``int`` overload.
  let pattern = RegexMatch(pattern)
  return pattern.captureBounds[pattern.pattern.captureNameToId.fget(name)]

proc `[]`*(pattern: Captures, name: string): string =
  ## Text of the named capture ``name``; see the ``int`` overload.
  let pattern = RegexMatch(pattern)
  return pattern.captures[pattern.pattern.captureNameToId.fget(name)]

template toTableImpl(cond: bool): stmt {.immediate, dirty.} =
  # Shared body of both ``toTable`` overloads: one entry per named capture,
  # substituting ``default`` when ``cond`` says the capture is unset.
  for key in RegexMatch(pattern).pattern.captureNameId.keys:
    let nextVal = pattern[key]
    if cond:
      result[key] = default
    else:
      result[key] = nextVal

proc toTable*(pattern: Captures, default: string = nil): Table[string, string] =
  ## Maps every named capture to its text, or to ``default`` if unset.
  result = initTable[string, string]()
  toTableImpl(nextVal == nil)

proc toTable*(pattern: CaptureBounds, default = none(Slice[int])):
    Table[string, Option[Slice[int]]] =
  ## Maps every named capture to its bounds, or to ``default`` if unset.
  result = initTable[string, Option[Slice[int]]]()
  toTableImpl(nextVal.isNone)

template itemsImpl(cond: bool): stmt {.immediate, dirty.} =
  # Shared body of both ``items`` iterators: walks the numbered captures
  # (excluding the whole match), substituting ``default`` for unset ones.
  for i in 0 .. <RegexMatch(pattern).pattern.captureCount:
    let nextVal = pattern[i]
    if cond:
      yield default
    else:
      yield nextVal

iterator items*(pattern: CaptureBounds, default = none(Slice[int])): Option[Slice[int]] =
  ## Yields the bounds of each numbered capture in order.
  itemsImpl(nextVal.isNone)

iterator items*(pattern: Captures, default: string = nil): string =
  ## Yields the text of each numbered capture in order.
  itemsImpl(nextVal == nil)

proc toSeq*(pattern: CaptureBounds, default = none(Slice[int])): seq[Option[Slice[int]]] =
  ## All numbered capture bounds as a seq.
  accumulateResult(pattern.items(default))

proc toSeq*(pattern: Captures, default: string = nil): seq[string] =
  ## All numbered capture texts as a seq.
  accumulateResult(pattern.items(default))

proc `$`*(pattern: RegexMatch): string =
  ## Same as ``match``: the full text of the match.
  return pattern.captures[-1]

proc `==`*(a, b: Regex): bool =
  ## Two non-nil regexes are equal when they share the pattern string and
  ## the same underlying PCRE objects; otherwise falls back to reference
  ## comparison (so ``nil == nil`` holds).
  if not a.isNil and not b.isNil:
    return a.pattern == b.pattern and
           a.pcreObj == b.pcreObj and
           a.pcreExtra == b.pcreExtra
  else:
    return system.`==`(a, b)

proc `==`*(a, b: RegexMatch): bool =
  ## Compares the regex and the subject string only.
  # NOTE(review): the match bounds are not compared, so two different
  # matches of one regex on one string compare equal -- confirm intended.
  return a.pattern == b.pattern and
         a.str == b.str
# }}}

# Creation & Destruction {{{
# PCRE Options {{{
const PcreOptions = {
  "NEVER_UTF": pcre.NEVER_UTF,
  "ANCHORED": pcre.ANCHORED,
  "DOLLAR_ENDONLY": pcre.DOLLAR_ENDONLY,
  "FIRSTLINE": pcre.FIRSTLINE,
  "NO_AUTO_CAPTURE": pcre.NO_AUTO_CAPTURE,
  "JAVASCRIPT_COMPAT": pcre.JAVASCRIPT_COMPAT,
  "U": pcre.UTF8 or pcre.UCP
}.toTable

# Options that are supported inside regular expressions themselves
const SkipOptions = [
  "LIMIT_MATCH=", "LIMIT_RECURSION=", "NO_AUTO_POSSESS", "NO_START_OPT",
  "UTF8", "UTF16", "UTF32", "UTF", "UCP",
  "CR", "LF", "CRLF", "ANYCRLF", "ANY", "BSR_ANYCRLF", "BSR_UNICODE"
]

proc extractOptions(pattern: string): tuple[pattern: string, flags: int, study: bool] =
  ## Strips the leading ``(*NAME)`` option groups off ``pattern``.
  ##
  ## Groups listed in ``SkipOptions`` are copied through unchanged, groups
  ## listed in ``PcreOptions`` are turned into compile flags, and
  ## ``(*NO_STUDY)`` clears ``study``. Scanning stops at the first thing that
  ## is not a recognised option group; everything from there on is returned
  ## as the remaining pattern.
  result = ("", 0, true)

  var optionStart = 0   # index of the '(' of the group being scanned
  var equals = false    # true once a '=' was seen inside the current group
  for i, c in pattern:
    if optionStart == i:
      if c != '(':
        break
      optionStart = i

    elif optionStart == i-1:
      if c != '*':
        break

    elif c == ')':
      let name = pattern[optionStart+2 .. i-1]
      if equals or name in SkipOptions:
        result.pattern.add pattern[optionStart .. i]
      elif PcreOptions.hasKey name:
        result.flags = result.flags or PcreOptions[name]
      elif name == "NO_STUDY":
        result.study = false
      else:
        break
      optionStart = i+1
      equals = false

    elif not equals:
      if c == '=':
        equals = true
        if pattern[optionStart+2 .. i] notin SkipOptions:
          break
      elif c notin {'A'..'Z', '0'..'9', '_'}:
        break

  result.pattern.add pattern[optionStart .. pattern.high]

# }}}

type UncheckedArray {.unchecked.}[T] = array[0 .. 0, T]

proc destroyRegex(pattern: Regex) =
  ## Finalizer registered by ``initRegex``: releases the compiled pattern and,
  ## if present, the study data.
  # NOTE(review): the compiled pattern is released through
  # ``pcre.free_substring`` rather than ``pcre.free`` -- presumably a
  # workaround for how ``pcre_free`` is exported; confirm.
  pcre.free_substring(cast[cstring](pattern.pcreObj))
  pattern.pcreObj = nil
  if pattern.pcreExtra != nil:
    pcre.free_study(pattern.pcreExtra)
    # do not keep a dangling pointer around, mirroring pcreObj above
    pattern.pcreExtra = nil

proc getNameToNumberTable(pattern: Regex): Table[string, int] =
  ## Builds the capture-name -> zero-based capture id table from PCRE's name
  ## table. As read here, each entry is ``entrySize`` bytes: two bytes holding
  ## the capture number (most significant byte first) followed by the
  ## zero-terminated name.
  let entryCount = getinfo[cint](pattern, pcre.INFO_NAMECOUNT)
  let entrySize = getinfo[cint](pattern, pcre.INFO_NAMEENTRYSIZE)
  let table = cast[ptr UncheckedArray[uint8]](
                getinfo[int](pattern, pcre.INFO_NAMETABLE))

  result = initTable[string, int]()

  for i in 0 .. <entryCount:
    let pos = i * entrySize
    # Parenthesised on purpose: ``-`` binds tighter than ``or``, so without
    # the parentheses only the low byte would be decremented.
    let num = ((int(table[pos]) shl 8) or int(table[pos + 1])) - 1
    var name = ""

    var idx = 2
    while table[pos + idx] != 0:
      name.add(char(table[pos + idx]))
      idx += 1

    result[name] = num

proc initRegex(pattern: string, flags: int, study = true): Regex =
  ## Compiles ``pattern`` with the PCRE compile ``flags`` and, when ``study``
  ## is true, studies it.
  ##
  ## Raises ``SyntaxError`` if compilation fails and ``StudyError`` if
  ## studying reports an error.
  new(result, destroyRegex)
  result.pattern = pattern

  var errorMsg: cstring
  var errOffset: cint

  result.pcreObj = pcre.compile(cstring(pattern),
                                # better hope int is at least 4 bytes..
                                cint(flags), addr errorMsg,
                                addr errOffset, nil)
  if result.pcreObj == nil:
    # failed to compile
    raise SyntaxError(msg: $errorMsg, pos: errOffset, pattern: pattern)

  if study:
    # XXX investigate JIT
    result.pcreExtra = pcre.study(result.pcreObj, 0x0, addr errorMsg)
    if errorMsg != nil:
      raise StudyError(msg: $errorMsg)

  result.captureNameToId = result.getNameToNumberTable()

proc re*(pattern: string): Regex =
  ## Creates a ``Regex`` from ``pattern``, honouring the leading ``(*NAME)``
  ## option groups handled by ``extractOptions``.
  let (pattern, flags, study) = extractOptions(pattern)
  initRegex(pattern, flags, study)
# }}}

# Operations {{{
proc matchImpl(str: string, pattern: Regex, start, endpos: int, flags: int): Option[RegexMatch] =
  ## Runs ``pcre.exec`` on ``str`` from byte offset ``start`` up to and
  ## including ``endpos`` (``int.high`` meaning the end of the string) with
  ## the given exec ``flags``.
  ##
  ## Returns ``none`` when nothing matches. Raises ``InvalidUnicodeError`` for
  ## malformed UTF-8, ``AccessViolationError`` for ``ERROR_NULL`` and
  ## ``RegexInternalError`` for every other PCRE error.
  var myResult = RegexMatch(pattern : pattern, str : str)
  # See PCRE man pages.
  # 2x capture count to make room for start-end pairs
  # 1x capture count as slack space for PCRE
  let vecsize = (pattern.captureCount() + 1) * 3
  # div 2 because each element is 2 cints long
  myResult.pcreMatchBounds = newSeq[Slice[cint]](ceil(vecsize / 2).int)
  # shrink the visible length to the real pairs; the slack stays allocated
  myResult.pcreMatchBounds.setLen(vecsize div 3)

  let strlen = if endpos == int.high: str.len else: endpos+1
  doAssert(strlen <= str.len)  # don't want buffer overflows

  let execRet = pcre.exec(pattern.pcreObj,
                          pattern.pcreExtra,
                          cstring(str),
                          cint(strlen),
                          cint(start),
                          cint(flags),
                          cast[ptr cint](addr myResult.pcreMatchBounds[0]),
                          cint(vecsize))
  if execRet >= 0:
    return some(myResult)

  case execRet:
    of pcre.ERROR_NOMATCH:
      return none(RegexMatch)
    of pcre.ERROR_NULL:
      raise newException(AccessViolationError, "Expected non-null parameters")
    of pcre.ERROR_BADOPTION:
      raise RegexInternalError(msg : "Unknown pattern flag. Either a bug or " &
        "outdated PCRE.")
    of pcre.ERROR_BADUTF8, pcre.ERROR_SHORTUTF8, pcre.ERROR_BADUTF8_OFFSET:
      raise InvalidUnicodeError(msg : "Invalid unicode byte sequence",
        pos : myResult.pcreMatchBounds[0].a)
    else:
      raise RegexInternalError(msg : "Unknown internal error: " & $execRet)

proc match*(str: string, pattern: Regex, start = 0, endpos = int.high): Option[RegexMatch] =
  ## Like ```find(...)`` <#proc-find>`__, but anchored to the start of the
  ## string. This means that ``"foo".match(re"f")`` succeeds, but
  ## ``"foo".match(re"o")`` returns ``none``.
  return str.matchImpl(pattern, start, endpos, pcre.ANCHORED)

iterator findIter*(str: string, pattern: Regex, start = 0, endpos = int.high): RegexMatch =
  ## Works the same as ```find(...)`` <#proc-find>`__, but finds every
  ## non-overlapping match. ``"2222".findIter(re"22")`` is ``"22", "22"``, not
  ## ``"22", "22", "22"``.
  ##
  ## Arguments are the same as ```find(...)`` <#proc-find>`__
  ##
  ## Variants:
  ##
  ## -  ``proc findAll(...)`` returns a ``seq[string]``
  # see pcredemo for explanation
  let matchesCrLf = pattern.matchesCrLf()
  let unicode = uint32(getinfo[culong](pattern, pcre.INFO_OPTIONS) and
    pcre.UTF8) > 0u32
  let strlen = if endpos == int.high: str.len else: endpos+1

  var offset = start
  var match: Option[RegexMatch]
  while true:
    var flags = 0

    if match.isSome and
       match.get.matchBounds.a > match.get.matchBounds.b:
      # 0-len match: forbid another empty match at the same position so the
      # search makes progress
      flags = pcre.NOTEMPTY_ATSTART

    match = str.matchImpl(pattern, offset, endpos, flags)

    if match.isNone:
      # either the end of the input or the string
      # cannot be split here
      if offset >= strlen:
        break

      if matchesCrLf and offset < (str.len - 1) and
         str[offset] == '\r' and str[offset + 1] == '\l':
        # if PCRE treats CrLf as newline, skip both at the same time
        offset += 2
      elif unicode:
        # XXX what about invalid unicode?
        offset += str.runeLenAt(offset)
        assert(offset <= strlen)
      else:
        offset += 1
    else:
      offset = match.get.matchBounds.b + 1

      yield match.get


proc find*(str: string, pattern: Regex, start = 0, endpos = int.high): Option[RegexMatch] =
  ## Finds the given pattern in the string between the end and start
  ## positions.
  ##
  ## ``start``
  ##     The start point at which to start matching. ``|abc`` is ``0``;
  ##     ``a|bc`` is ``1``
  ##
  ## ``endpos``
  ##     The maximum index for a match; ``int.high`` means the end of the
  ##     string, otherwise it’s an inclusive upper bound.
  return str.matchImpl(pattern, start, endpos, 0)

proc findAll*(str: string, pattern: Regex, start = 0, endpos = int.high): seq[string] =
  ## Returns the full text of every non-overlapping match, in order; see
  ## ``findIter``.
  result = @[]
  for match in str.findIter(pattern, start, endpos):
    result.add(match.match)

proc split*(str: string, pattern: Regex, maxSplit = -1, start = 0): seq[string] =
  ## Splits the string with the given regex. This works according to the
  ## rules that Perl and Javascript use:
  ##
  ## -  If the match is zero-width, then the string is still split:
  ##    ``"123".split(re"") == @["1", "2", "3"]``.
  ##
  ## -  If the pattern has a capture in it, it is added after the string
  ##    split: ``"12".split(re"(\d)") == @["", "1", "", "2", ""]``.
  ##
  ## -  If ``maxsplit != -1``, then the string will only be split
  ##    ``maxsplit - 1`` times. This means that there will be ``maxsplit``
  ##    strings in the output seq.
  ##    ``"1.2.3".split(re"\.", maxsplit = 2) == @["1", "2.3"]``
  ##
  ## -  If the pattern never matches, the result is the single unsplit piece
  ##    ``@[str.substr(start)]``.
  ##
  ## ``start`` behaves the same as in ```find(...)`` <#proc-find>`__.
  result = @[]
  var lastIdx = start
  var splits = 0
  var bounds = 0 .. 0
  var matched = false   # becomes true once findIter yielded at least once

  for match in str.findIter(pattern, start = start):
    matched = true
    # bounds are inclusive:
    #
    # 0123456
    #  ^^^
    # (1, 3)
    bounds = match.matchBounds

    # "12".split("") would be @["", "1", "2"], but
    # if we skip an empty first match, it's the correct
    # @["1", "2"]
    if bounds.a <= bounds.b or bounds.a > start:
      result.add(str.substr(lastIdx, bounds.a - 1))
      splits += 1

    lastIdx = bounds.b + 1

    for cap in match.captures:
      # if there are captures, include them in the result
      result.add(cap)

    if splits == maxSplit - 1:
      break

  # "12".split("\b") would be @["1", "2", ""], but
  # if we skip an empty last match, it's the correct
  # @["1", "2"]
  #
  # Without any match ``bounds`` is still its placeholder value and must not
  # be consulted: the whole remainder from ``start`` is the only piece.
  if not matched or bounds.a <= bounds.b or bounds.b < str.high:
    # last match: Each match takes the previous substring,
    # but "1 2".split(/ /) needs to return @["1", "2"].
    # This handles "2"
    #
    # After at least one match lastIdx == bounds.b + 1; with no match it is
    # still ``start``.
    result.add(str.substr(lastIdx, str.high))

template replaceImpl(str: string, pattern: Regex,
                     replacement: expr): stmt {.immediate, dirty.} =
  # Shared body of the ``replace`` overloads. ``replacement`` is evaluated
  # once per match with the current match injected as ``match``.
  # XXX seems very similar to split, maybe I can reduce code duplication
  # somehow?
  result = ""
  var lastIdx = 0
  for match {.inject.} in str.findIter(pattern):
    let bounds = match.matchBounds
    result.add(str.substr(lastIdx, bounds.a - 1))
    let nextVal = replacement
    assert(nextVal != nil)
    result.add(nextVal)

    lastIdx = bounds.b + 1

  result.add(str.substr(lastIdx, str.len - 1))
  return result

proc replace*(str: string, pattern: Regex,
              subproc: proc (match: RegexMatch): string): string =
  ## Replaces each match of Regex in the string with ``sub``, which should
  ## never be or return ``nil``.
  ##
  ## If ``sub`` is a ``proc (RegexMatch): string``, then it is executed with
  ## each match and the return value is the replacement value.
  ##
  ## If ``sub`` is a ``proc (string): string``, then it is executed with the
  ## full text of the match and the return value is the replacement
  ## value.
  ##
  ## If ``sub`` is a string, the syntax is as follows:
  ##
  ## -  ``$$`` - literal ``$``
  ## -  ``$123`` - capture number ``123``
  ## -  ``$foo`` - named capture ``foo``
  ## -  ``${foo}`` - same as above
  ## -  ``$1$#`` - first and second captures
  ## -  ``$#`` - first capture
  ## -  ``$0`` - full match
  ##
  ## If a given capture is missing, a ``ValueError`` exception is thrown.
  replaceImpl(str, pattern, subproc(match))

proc replace*(str: string, pattern: Regex,
              subproc: proc (match: string): string): string =
  ## Overload whose callback receives the full text of each match.
  replaceImpl(str, pattern, subproc(match.match))

proc replace*(str: string, pattern: Regex, sub: string): string =
  ## Overload taking a replacement template string; see the first overload
  ## for the ``$`` syntax.
  # - 1 because the string numbers are 0-indexed
  replaceImpl(str, pattern,
    formatStr(sub, match.captures[name], match.captures[id - 1]))

# }}}

let SpecialCharMatcher = re"([\\+*?[^\]$(){}=!<>|:-])"
proc escapeRe*(str: string): string =
  ## Escapes the string so it doesn’t match any special characters.
  ## Incompatible with the Extra flag (``X``).
  str.replace(SpecialCharMatcher, "\\$1")
diff --git a/lib/impure/nre/.gitignore b/lib/impure/nre/.gitignore new file mode 100644 index 000000000..3d647a25e --- /dev/null +++ b/lib/impure/nre/.gitignore @@ -0,0 +1,9 @@ +# all executables +* +!*/ +!*.* +*.exe + +# Wildcard patterns.
+*.swp +nimcache diff --git a/lib/impure/nre/private/pcre.nim b/lib/impure/nre/private/pcre.nim new file mode 100644 index 000000000..83b0d5f79 --- /dev/null +++ b/lib/impure/nre/private/pcre.nim @@ -0,0 +1,442 @@ +const pcreHeader = "<pcre.h>" +when not defined(pcreDll): + when hostOS == "windows": + const pcreDll = "pcre.dll" + elif hostOS == "macosx": + const pcreDll = "libpcre(.3|.1|).dylib" + else: + const pcreDll = "libpcre.so(.3|.1|)" + {.pragma: pcreImport, dynlib: pcreDll.} +else: + {.pragma: pcreImport, header: pcreHeader.} +{.deadCodeElim: on.} # Don't error unless unsupported features are used + +#************************************************ +# Perl-Compatible Regular Expressions * +#*********************************************** +# This is the public header file for the Pcre library, to be #included by +#applications that call the Pcre functions. +# +# Copyright (c) 1997-2014 University of Cambridge +# +#----------------------------------------------------------------------------- +#Redistribution and use in source and binary forms, with or without +#modification, are permitted provided that the following conditions are met: +# +# Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# Neither the name of the University of Cambridge nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +#THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +#AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +#IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +#ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +#LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +#CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +#SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +#INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +#CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +#ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +#POSSIBILITY OF SUCH DAMAGE. +#----------------------------------------------------------------------------- +# +# The current Pcre version information. + + +const + MAJOR* = 8 + MINOR* = 36 + PRERELEASE* = true + DATE* = 2014 - 9 - 26 + +# When an application links to a Pcre DLL in Windows, the symbols that are +#imported have to be identified as such. When building PCRE, the appropriate +#export setting is defined in pcre_internal.h, which includes this file. So we +#don't change existing definitions of PCRE_EXP_DECL and PCRECPP_EXP_DECL. +# By default, we use the standard "extern" declarations. +# Have to include stdlib.h in order to ensure that size_t is defined; +#it is needed here for malloc. + +# Allow for C++ users +# Public options. Some are compile-time only, some are run-time only, and some +#are both. Most of the compile-time options are saved with the compiled regex so +#that they can be inspected during studying (and therefore JIT compiling). Note +#that pcre_study() has its own set of options. Originally, all the options +#defined here used distinct bits. However, almost all the bits in a 32-bit word +#are now used, so in order to conserve them, option bits that were previously +#only recognized at matching time (i.e. by pcre_exec() or pcre_dfa_exec()) may +#also be used for compile-time options that affect only compiling and are not +#relevant for studying or JIT compiling. 
+# +#Some options for pcre_compile() change its behaviour but do not affect the +#behaviour of the execution functions. Other options are passed through to the +#execution functions and affect their behaviour, with or without affecting the +#behaviour of pcre_compile(). +# +#Options that can be passed to pcre_compile() are tagged Cx below, with these +#variants: +# +#C1 Affects compile only +#C2 Does not affect compile; affects exec, dfa_exec +#C3 Affects compile, exec, dfa_exec +#C4 Affects compile, exec, dfa_exec, study +#C5 Affects compile, exec, study +# +#Options that can be set for pcre_exec() and/or pcre_dfa_exec() are flagged with +#E and D, respectively. They take precedence over C3, C4, and C5 settings passed +#from pcre_compile(). Those that are compatible with JIT execution are flagged +#with J. + +const + CASELESS* = 0x00000001 + MULTILINE* = 0x00000002 + DOTALL* = 0x00000004 + EXTENDED* = 0x00000008 + ANCHORED* = 0x00000010 + DOLLAR_ENDONLY* = 0x00000020 + EXTRA* = 0x00000040 + NOTBOL* = 0x00000080 + NOTEOL* = 0x00000100 + UNGREEDY* = 0x00000200 + NOTEMPTY* = 0x00000400 + UTF8* = 0x00000800 + UTF16* = 0x00000800 + UTF32* = 0x00000800 + NO_AUTO_CAPTURE* = 0x00001000 + NO_UTF8_CHECK* = 0x00002000 + NO_UTF16_CHECK* = 0x00002000 + NO_UTF32_CHECK* = 0x00002000 + AUTO_CALLOUT* = 0x00004000 + PARTIAL_SOFT* = 0x00008000 + PARTIAL* = 0x00008000 + +# This pair use the same bit. + +const + NEVER_UTF* = 0x00010000 + DFA_SHORTEST* = 0x00010000 + +# This pair use the same bit. 
+ +const + NO_AUTO_POSSESS* = 0x00020000 + DFA_RESTART* = 0x00020000 + FIRSTLINE* = 0x00040000 + DUPNAMES* = 0x00080000 + NEWLINE_CR* = 0x00100000 + NEWLINE_LF* = 0x00200000 + NEWLINE_CRLF* = 0x00300000 + NEWLINE_ANY* = 0x00400000 + NEWLINE_ANYCRLF* = 0x00500000 + BSR_ANYCRLF* = 0x00800000 + BSR_UNICODE* = 0x01000000 + JAVASCRIPT_COMPAT* = 0x02000000 + NO_START_OPTIMIZE* = 0x04000000 + NO_START_OPTIMISE* = 0x04000000 + PARTIAL_HARD* = 0x08000000 + NOTEMPTY_ATSTART* = 0x10000000 + UCP* = 0x20000000 + +# Exec-time and get/set-time error codes + +const + ERROR_NOMATCH* = (- 1) + ERROR_NULL* = (- 2) + ERROR_BADOPTION* = (- 3) + ERROR_BADMAGIC* = (- 4) + ERROR_UNKNOWN_OPCODE* = (- 5) + ERROR_UNKNOWN_NODE* = (- 5) # For backward compatibility + ERROR_NOMEMORY* = (- 6) + ERROR_NOSUBSTRING* = (- 7) + ERROR_MATCHLIMIT* = (- 8) + ERROR_CALLOUT* = (- 9) # Never used by Pcre itself + ERROR_BADUTF8* = (- 10) # Same for 8/16/32 + ERROR_BADUTF16* = (- 10) # Same for 8/16/32 + ERROR_BADUTF32* = (- 10) # Same for 8/16/32 + ERROR_BADUTF8_OFFSET* = (- 11) # Same for 8/16 + ERROR_BADUTF16_OFFSET* = (- 11) # Same for 8/16 + ERROR_PARTIAL* = (- 12) + ERROR_BADPARTIAL* = (- 13) + ERROR_INTERNAL* = (- 14) + ERROR_BADCOUNT* = (- 15) + ERROR_DFA_UITEM* = (- 16) + ERROR_DFA_UCOND* = (- 17) + ERROR_DFA_UMLIMIT* = (- 18) + ERROR_DFA_WSSIZE* = (- 19) + ERROR_DFA_RECURSE* = (- 20) + ERROR_RECURSIONLIMIT* = (- 21) + ERROR_NULLWSLIMIT* = (- 22) # No longer actually used + ERROR_BADNEWLINE* = (- 23) + ERROR_BADOFFSET* = (- 24) + ERROR_SHORTUTF8* = (- 25) + ERROR_SHORTUTF16* = (- 25) # Same for 8/16 + ERROR_RECURSELOOP* = (- 26) + ERROR_JIT_STACKLIMIT* = (- 27) + ERROR_BADMODE* = (- 28) + ERROR_BADENDIANNESS* = (- 29) + ERROR_DFA_BADRESTART* = (- 30) + ERROR_JIT_BADOPTION* = (- 31) + ERROR_BADLENGTH* = (- 32) + ERROR_UNSET* = (- 33) + +# Specific error codes for UTF-8 validity checks + +const + UTF8_ERR0* = 0 + UTF8_ERR1* = 1 + UTF8_ERR2* = 2 + UTF8_ERR3* = 3 + UTF8_ERR4* = 4 + UTF8_ERR5* = 5 + 
UTF8_ERR6* = 6 + UTF8_ERR7* = 7 + UTF8_ERR8* = 8 + UTF8_ERR9* = 9 + UTF8_ERR10* = 10 + UTF8_ERR11* = 11 + UTF8_ERR12* = 12 + UTF8_ERR13* = 13 + UTF8_ERR14* = 14 + UTF8_ERR15* = 15 + UTF8_ERR16* = 16 + UTF8_ERR17* = 17 + UTF8_ERR18* = 18 + UTF8_ERR19* = 19 + UTF8_ERR20* = 20 + UTF8_ERR21* = 21 + UTF8_ERR22* = 22 + +# Specific error codes for UTF-16 validity checks + +const + UTF16_ERR0* = 0 + UTF16_ERR1* = 1 + UTF16_ERR2* = 2 + UTF16_ERR3* = 3 + UTF16_ERR4* = 4 + +# Specific error codes for UTF-32 validity checks + +const + UTF32_ERR0* = 0 + UTF32_ERR1* = 1 + UTF32_ERR2* = 2 + UTF32_ERR3* = 3 + +# Request types for pcre_fullinfo() + +const + INFO_OPTIONS* = 0 + INFO_SIZE* = 1 + INFO_CAPTURECOUNT* = 2 + INFO_BACKREFMAX* = 3 + INFO_FIRSTBYTE* = 4 + INFO_FIRSTCHAR* = 4 + INFO_FIRSTTABLE* = 5 + INFO_LASTLITERAL* = 6 + INFO_NAMEENTRYSIZE* = 7 + INFO_NAMECOUNT* = 8 + INFO_NAMETABLE* = 9 + INFO_STUDYSIZE* = 10 + INFO_DEFAULT_TABLES* = 11 + INFO_OKPARTIAL* = 12 + INFO_JCHANGED* = 13 + INFO_HASCRORLF* = 14 + INFO_MINLENGTH* = 15 + INFO_JIT* = 16 + INFO_JITSIZE* = 17 + INFO_MAXLOOKBEHIND* = 18 + INFO_FIRSTCHARACTER* = 19 + INFO_FIRSTCHARACTERFLAGS* = 20 + INFO_REQUIREDCHAR* = 21 + INFO_REQUIREDCHARFLAGS* = 22 + INFO_MATCHLIMIT* = 23 + INFO_RECURSIONLIMIT* = 24 + INFO_MATCH_EMPTY* = 25 + +# Request types for pcre_config(). Do not re-arrange, in order to remain +#compatible. + +const + CONFIG_UTF8* = 0 + CONFIG_NEWLINE* = 1 + CONFIG_LINK_SIZE* = 2 + CONFIG_POSIX_MALLOC_THRESHOLD* = 3 + CONFIG_MATCH_LIMIT* = 4 + CONFIG_STACKRECURSE* = 5 + CONFIG_UNICODE_PROPERTIES* = 6 + CONFIG_MATCH_LIMIT_RECURSION* = 7 + CONFIG_BSR* = 8 + CONFIG_JIT* = 9 + CONFIG_UTF16* = 10 + CONFIG_JITTARGET* = 11 + CONFIG_UTF32* = 12 + CONFIG_PARENS_LIMIT* = 13 + +# Request types for pcre_study(). Do not re-arrange, in order to remain +#compatible. 
const
  STUDY_JIT_COMPILE*              = 0x01
  STUDY_JIT_PARTIAL_SOFT_COMPILE* = 0x02
  STUDY_JIT_PARTIAL_HARD_COMPILE* = 0x04
  STUDY_EXTRA_NEEDED*             = 0x08

# Flag bits for the `flags` field of the pcre[16|32]_extra structure. These
# bits must not be re-arranged or redefined; new ones may only be appended, in
# order to remain compatible.

const
  EXTRA_STUDY_DATA*            = 0x01
  EXTRA_MATCH_LIMIT*           = 0x02
  EXTRA_CALLOUT_DATA*          = 0x04
  EXTRA_TABLES*                = 0x08
  EXTRA_MATCH_LIMIT_RECURSION* = 0x10
  EXTRA_MARK*                  = 0x20
  EXTRA_EXECUTABLE_JIT*        = 0x40

# Types
#
# The six objects below are declared without fields; this module only ever
# refers to them through `ptr`.

type
  Pcre* = object
  Pcre16* = object
  Pcre32* = object
  jit_stack* = object
  jit_stack16* = object
  jit_stack32* = object

# Extra data block handed to pcre_exec(). It is defined in such a way as to be
# extensible: new fields are always added at the end, in order to remain
# compatible. Field order and types therefore must not be changed here.

type
  ExtraData* = object
    flags*: culong                  # bits saying which fields are set
    study_data*: pointer            # opaque data from pcre_study()
    match_limit*: culong            # maximum number of calls to match()
    callout_data*: pointer          # data passed back in callouts
    tables*: ptr cuchar             # pointer to character tables
    match_limit_recursion*: culong  # max recursive calls to match()
    mark*: ptr ptr cuchar           # for passing back a mark pointer
    executable_jit*: pointer        # contains a pointer to a compiled jit code

# The structure for passing out data via the pcre_callout_function. We use a
# structure so that new fields can be added on the end in future versions,
# without changing the API of the function, thereby allowing old clients to work
# without modification.
type
  callout_block* = object
    ## Data block handed to the callout function. Fields are grouped by the
    ## block version that introduced them; the order must be kept as is.
    version*: cint            # Identifies version of block
    # ------------------------ Version 0 -------------------------------
    callout_number*: cint     # Number compiled into pattern
    offset_vector*: ptr cint  # The offset vector
    subject*: cstring         # The subject being matched
    subject_length*: cint     # The length of the subject
    start_match*: cint        # Offset to start of this match attempt
    current_position*: cint   # Where we currently are in the subject
    capture_top*: cint        # Max current capture
    capture_last*: cint       # Most recently closed capture
    callout_data*: pointer    # Data passed in with the call
    # ------------------- Added for Version 1 --------------------------
    pattern_position*: cint   # Offset to next item in the pattern
    next_item_length*: cint   # Length of next item in the pattern
    # ------------------- Added for Version 2 --------------------------
    mark*: ptr cuchar         # Pointer to current mark or NULL
    # ------------------------------------------------------------------

# Indirection for store get and free functions. These can be set to
# alternative malloc/free functions if required. Special ones are used in the
# non-recursive case for "frames". There is also an optional callout function
# that is triggered by the (?C) regex item. For Virtual Pascal, these
# definitions have to take another form.
#
# NOTE(review): the comment above describes these as settable indirections,
# i.e. presumably function-pointer variables on the C side rather than plain
# functions -- confirm that importing them as procs behaves as intended with
# the import mode selected by `pcreImport`.

proc malloc*(a2: csize): pointer {.cdecl, importc: "pcre_malloc", pcreImport.}
proc free*(a2: pointer) {.cdecl, importc: "pcre_free", pcreImport.}
proc stack_malloc*(a2: csize): pointer {.cdecl, importc: "pcre_stack_malloc", pcreImport.}
# Bound to pcre_stack_free, the counterpart of pcre_stack_malloc. It was
# previously bound to pcre_free, which made it a duplicate of `free` above.
proc stack_free*(a2: pointer) {.cdecl, importc: "pcre_stack_free", pcreImport.}
proc callout*(a2: ptr callout_block): cint {.cdecl, importc: "pcre_callout", pcreImport.}
proc stack_guard*(): cint {.cdecl, importc: "pcre_stack_guard", pcreImport.}

# User defined callback which provides a stack just before the match starts.
type
  # Callback signature: receives one untyped pointer and returns a
  # `ptr jit_stack`, using the C calling convention.
  jit_callback* = proc (a2: pointer): ptr jit_stack {.cdecl.}

# Exported Pcre functions
#
# Every proc below is a body-less FFI declaration: `importc` names the C symbol
# it is bound to and `pcreImport` (defined elsewhere in this file) selects how
# that symbol is located. The parameter names a2, a3, ... carry no meaning of
# their own; consult the PCRE C API documentation for what each argument is.

# Pattern compilation and build-time configuration queries.
proc compile*(a2: cstring; a3: cint; a4: ptr cstring; a5: ptr cint;
              a6: ptr cuchar): ptr Pcre {.cdecl, importc: "pcre_compile",
    pcreImport.}
proc compile2*(a2: cstring; a3: cint; a4: ptr cint; a5: ptr cstring;
               a6: ptr cint; a7: ptr cuchar): ptr Pcre {.cdecl,
    importc: "pcre_compile2", pcreImport.}
proc config*(a2: cint; a3: pointer): cint {.cdecl, importc: "pcre_config",
    pcreImport.}
# Copying captured substrings into a caller-supplied buffer.
# NOTE(review): presumably the cstring before the final cint is the output
# buffer and the final cint its size -- confirm against the PCRE API docs.
proc copy_named_substring*(a2: ptr Pcre; a3: cstring; a4: ptr cint; a5: cint;
                           a6: cstring; a7: cstring; a8: cint): cint {.cdecl,
    importc: "pcre_copy_named_substring", pcreImport.}
proc copy_substring*(a2: cstring; a3: ptr cint; a4: cint; a5: cint; a6: cstring;
                     a7: cint): cint {.cdecl, importc: "pcre_copy_substring",
    pcreImport.}
# Matching entry points; all three return a cint.
proc dfa_exec*(a2: ptr Pcre; a3: ptr ExtraData; a4: cstring; a5: cint; a6: cint;
               a7: cint; a8: ptr cint; a9: cint; a10: ptr cint; a11: cint): cint {.
    cdecl, importc: "pcre_dfa_exec", pcreImport.}
proc exec*(a2: ptr Pcre; a3: ptr ExtraData; a4: cstring; a5: cint; a6: cint; a7: cint;
           a8: ptr cint; a9: cint): cint {.cdecl, importc: "pcre_exec",
                                           pcreImport.}
proc jit_exec*(a2: ptr Pcre; a3: ptr ExtraData; a4: cstring; a5: cint; a6: cint;
               a7: cint; a8: ptr cint; a9: cint; a10: ptr jit_stack): cint {.
    cdecl, importc: "pcre_jit_exec", pcreImport.}
# Release functions for substrings obtained through the get_* procs below.
proc free_substring*(a2: cstring) {.cdecl, importc: "pcre_free_substring",
                                    pcreImport.}
proc free_substring_list*(a2: ptr cstring) {.cdecl,
    importc: "pcre_free_substring_list", pcreImport.}
# Pattern introspection and substring extraction.
proc fullinfo*(a2: ptr Pcre; a3: ptr ExtraData; a4: cint; a5: pointer): cint {.
    cdecl, importc: "pcre_fullinfo", pcreImport.}
proc get_named_substring*(a2: ptr Pcre; a3: cstring; a4: ptr cint; a5: cint;
                          a6: cstring; a7: cstringArray): cint {.cdecl,
    importc: "pcre_get_named_substring", pcreImport.}
proc get_stringnumber*(a2: ptr Pcre; a3: cstring): cint {.cdecl,
    importc: "pcre_get_stringnumber", pcreImport.}
proc get_stringtable_entries*(a2: ptr Pcre; a3: cstring; a4: cstringArray;
                              a5: cstringArray): cint {.cdecl,
    importc: "pcre_get_stringtable_entries", pcreImport.}
proc get_substring*(a2: cstring; a3: ptr cint; a4: cint; a5: cint;
                    a6: cstringArray): cint {.cdecl,
    importc: "pcre_get_substring", pcreImport.}
proc get_substring_list*(a2: cstring; a3: ptr cint; a4: cint;
                         a5: ptr cstringArray): cint {.cdecl,
    importc: "pcre_get_substring_list", pcreImport.}
# Character tables, reference counting, studying and version query.
proc maketables*(): ptr cuchar {.cdecl, importc: "pcre_maketables",
                                 pcreImport.}
proc refcount*(a2: ptr Pcre; a3: cint): cint {.cdecl, importc: "pcre_refcount",
    pcreImport.}
proc study*(a2: ptr Pcre; a3: cint; a4: ptr cstring): ptr ExtraData {.cdecl,
    importc: "pcre_study", pcreImport.}
proc free_study*(a2: ptr ExtraData) {.cdecl, importc: "pcre_free_study",
                                      pcreImport.}
proc version*(): cstring {.cdecl, importc: "pcre_version", pcreImport.}
# Utility functions for byte order swaps.

proc pattern_to_host_byte_order*(a2: ptr Pcre; a3: ptr ExtraData; a4: ptr cuchar): cint {.
    cdecl, importc: "pcre_pattern_to_host_byte_order", pcreImport.}
# JIT compiler related functions.
# Body-less FFI declarations for the JIT stack API; `importc` names the C
# symbol each one is bound to.
proc jit_stack_alloc*(a2: cint; a3: cint): ptr jit_stack {.cdecl,
    importc: "pcre_jit_stack_alloc", pcreImport.}
proc jit_stack_free*(a2: ptr jit_stack) {.cdecl, importc: "pcre_jit_stack_free",
    pcreImport.}
proc assign_jit_stack*(a2: ptr ExtraData; a3: jit_callback; a4: pointer) {.cdecl,
    importc: "pcre_assign_jit_stack", pcreImport.}
proc jit_free_unused_memory*() {.cdecl, importc: "pcre_jit_free_unused_memory",
    pcreImport.}

# diff --git a/lib/impure/nre/private/util.nim b/lib/impure/nre/private/util.nim
# new file mode 100644
# index 000000000..00fd40fac
# --- /dev/null
# +++ b/lib/impure/nre/private/util.nim
# @@ -0,0 +1,62 @@
import tables

proc fget*[K, V](self: Table[K, V], key: K): V =
  ## Returns the value stored under `key`.
  ## Raises `KeyError` with a message naming the key when it is absent.
  if self.hasKey(key):
    return self[key]
  else:
    raise newException(KeyError, "Key does not exist in table: " & $key)

# Characters allowed inside a `$name` reference, and the subset allowed as its
# first character (everything in Ident except the decimal digits).
const Ident = {'a'..'z', 'A'..'Z', '0'..'9', '_', '\128'..'\255'}
const StartIdent = Ident - {'0'..'9'}

proc checkNil(arg: string): string =
  ## Passes `arg` through unchanged; raises `ValueError` when it is nil.
  if arg == nil:
    raise newException(ValueError, "Cannot use nil capture")
  else:
    return arg

template formatStr*(howExpr, namegetter, idgetter: expr): expr =
  ## Expands the format string `howExpr` and yields the resulting string.
  ##
  ## Recognised sequences:
  ## - ``$$``      a literal ``$``
  ## - ``$#``      the next positional value; the counter starts at 1 and is
  ##               reset to N + 1 after every explicit ``$N``
  ## - ``$N``      the value with decimal id N
  ## - ``$name``   the value called `name` (first char in StartIdent, rest in
  ##               Ident)
  ## - ``${name}`` the value called `name`, everything up to the closing ``}``
  ##
  ## `idgetter` is evaluated with an injected variable `id` in scope and
  ## `namegetter` with an injected variable `name`; each result is passed
  ## through `checkNil`, so a nil value raises `ValueError`.
  ## Any other character after ``$`` -- including a ``$`` at the very end of
  ## the string -- raises `Exception` ("Syntax error in format string at ...").
  let how = howExpr
  var val = newStringOfCap(how.len)
  var i = 0
  var lastNum = 1

  while i < how.len:
    if how[i] != '$':
      val.add(how[i])
      i += 1
    else:
      # Character following the '$'. A trailing '$' is mapped to '\0' so it
      # reaches the syntax-error branch instead of indexing past the end.
      let next = if i + 1 < how.len: how[i + 1] else: '\0'
      if next == '$':
        val.add('$')
        i += 2
      elif next == '#':
        var id {.inject.} = lastNum
        val.add(checkNil(idgetter))
        lastNum += 1
        i += 2
      elif next in {'0'..'9'}:
        i += 1
        var id {.inject.} = 0
        while i < how.len and how[i] in {'0'..'9'}:
          # Plain decimal accumulation. The former `id += id * 10 + digit`
          # counted the running value twice, so "$12" resolved to 13.
          id = (id * 10) + (ord(how[i]) - ord('0'))
          i += 1
        val.add(checkNil(idgetter))
        lastNum = id + 1
      elif next in StartIdent:
        i += 1
        var name {.inject.} = ""
        while i < how.len and how[i] in Ident:
          name.add(how[i])
          i += 1
        val.add(checkNil(namegetter))
      elif next == '{':
        i += 2
        var name {.inject.} = ""
        # An unterminated "${" takes the remainder of the string as the name.
        while i < how.len and how[i] != '}':
          name.add(how[i])
          i += 1
        i += 1  # step over the closing '}'
        val.add(checkNil(namegetter))
      else:
        raise newException(Exception, "Syntax error in format string at " & $i)
  val