import private.pcre as pcre
import private.util
import tables
import unsigned
from future import lc, `[]`
from strutils import toLower, `%`
from math import ceil
import optional_t
from unicode import runeLenAt

# Type definitions {{{
type
  Regex* = ref object
    ## A compiled pattern.
    pattern*: string
      ## The pattern text. Not nil.
    pcreObj: ptr pcre.Pcre
      ## Handle to the compiled PCRE object. Not nil.
    pcreExtra: ptr pcre.ExtraData
      ## Extra data attached to the pattern. May be nil.

    captureNameToId: Table[string, int]
      ## Maps each named capture group to its numerical identifier.

  RegexMatch* = ref object
    ## The value handed back when a match succeeds.
    pattern*: Regex
      ## The regex that produced this match. Not nil.
    str*: string
      ## The string that was matched against. Not nil.
    pcreMatchBounds: seq[Slice[cint]]
      ## Item 0 holds the bounds of the whole match, the remaining items
      ## hold the bounds of the captures.
      ## `a` is the inclusive start, `b` is the exclusive end.

  Captures* = distinct RegexMatch
    ## View of a `RegexMatch` that is indexed to get captured text.
  CaptureBounds* = distinct RegexMatch
    ## View of a `RegexMatch` that is indexed to get capture bounds.

  SyntaxError* = ref object of Exception
    ## Thrown when the regular expression string passed in contains a
    ## syntax error.
    pos*: int
      ## Location of the syntax error, in bytes.
    pattern*: string
      ## The pattern that caused the problem.

  StudyError* = ref object of Exception
    ## Thrown when studying the regular expression fails for whatever
    ## reason. The message contains the error code.
# }}}

proc getinfo[T](pattern: Regex, opt: cint): T =
  ## Asks PCRE for the piece of information selected by `opt` and returns it.
  ##
  ## The address of `result` is handed directly to `pcre.fullinfo`, so `T`
  ## has to match the C type PCRE writes for that particular option.
  ##
  ## Raises `FieldError` when `pcre.fullinfo` returns a negative code.
  let retcode = pcre.fullinfo(pattern.pcreObj, pattern.pcreExtra, opt,
                              addr result)

  if retcode < 0:
    # XXX Error message that doesn't expose implementation details
    raise newException(FieldError,
                       "Invalid getinfo for $1, errno $2" % [$opt, $retcode])

# Regex accessors {{{
proc captureCount*(pattern: Regex): int =
  ## Get the maximum number of captures
  ##
  ## Does not return the number of captured captures
  # PCRE writes INFO_CAPTURECOUNT into a C `int`. Reading it through a Nim
  # `int` (wider than `cint` on 64-bit targets) would leave part of the
  # value unwritten, so fetch it as `cint` and widen afterwards.
  return int(getinfo[cint](pattern, pcre.INFO_CAPTURECOUNT))

proc captureNameId*(pattern: Regex): Table[string, int] =
  ## Returns a map from named capture groups to their numerical
  ## identifier
  return pattern.captureNameToId

proc matchesCrLf(pattern: Regex): bool =
  ## Reports whether `pattern` treats the two-character sequence CR LF as a
  ## newline. The pattern's own option flags are checked first; when none of
  ## the relevant flags is set, the newline setting of the PCRE build
  ## configuration decides.
  let flags = getinfo[cint](pattern, pcre.INFO_OPTIONS)
  let newlineFlags = flags and (pcre.NEWLINE_CRLF or
                                pcre.NEWLINE_ANY or
                                pcre.NEWLINE_ANYCRLF)
  if newlineFlags > 0:
    return true

  # get flags from build config
  var confFlags: cint
  if pcre.config(pcre.CONFIG_NEWLINE, addr confFlags) != 0:
    assert(false, "CONFIG_NEWLINE apparently got screwed up")

  # Per the PCRE API documentation the value is 13 for CR, 10 for LF,
  # (13 shl 8) or 10 for CRLF, -2 for ANYCRLF and -1 for ANY.
  case confFlags
  of 13, 10: return false
  of (13 shl 8) or 10, -2, -1: return true
  else: return false
# }}}

# Capture accessors {{{
proc captureBounds*(pattern: RegexMatch): CaptureBounds =
  ## Views the match as a `CaptureBounds`, whose `[]` yields capture bounds.
  return CaptureBounds(pattern)

proc captures*(pattern: RegexMatch): Captures =
  ## Views the match as a `Captures`, whose `[]` yields captured text.
  return Captures(pattern)

proc `[]`*(pattern: CaptureBounds, i: int): Option[Slice[int]] =
  ## Gets the bounds of the `i`th capture.
  ## Undefined behavior if `i` is out of bounds
  ## If `i` is a failed optional capture, returns None
  ## If `i == -1`, returns the whole match
  let pattern = RegexMatch(pattern)
  # Slot 0 of `pcreMatchBounds` is the whole match, hence the `+ 1`.
  if pattern.pcreMatchBounds[i + 1].a != -1:
    let bounds = pattern.pcreMatchBounds[i + 1]
    return Some(int(bounds.a) .. int(bounds.b))
  else:
    return None[Slice[int]]()

proc `[]`*(pattern: Captures, i: int): string =
  ## gets the `i`th capture
  ## Undefined behavior if `i` is out of bounds
  ## If `i` is a failed optional capture, returns nil
  ## If `i == -1`, returns the whole match
  let pattern = RegexMatch(pattern)
  let bounds = pattern.captureBounds[i]

  if bounds:
    let bounds = bounds.get
    # `b` is an exclusive end while `substr` takes an inclusive one.
    return pattern.str.substr(bounds.a, bounds.b-1)
  else:
    return nil

proc match*(pattern: RegexMatch): string =
  ## Returns the text of the whole match.
  return pattern.captures[-1]

proc matchBounds*(pattern: RegexMatch): Slice[int] =
  ## Returns the bounds of the whole match.
  return pattern.captureBounds[-1].get

proc `[]`*(pattern: CaptureBounds, name: string): Option[Slice[int]] =
  ## Will fail with KeyError if `name` is not a real named capture
  let pattern = RegexMatch(pattern)
  return pattern.captureBounds[pattern.pattern.captureNameToId.fget(name)]

proc `[]`*(pattern: Captures, name: string): string =
  ## Will fail with KeyError if `name` is not a real named capture
  let pattern = RegexMatch(pattern)
  return pattern.captures[pattern.pattern.captureNameToId.fget(name)]

template toTableImpl(cond: bool): stmt {.immediate, dirty.} =
  ## Shared body of the `toTable` overloads. Being a dirty template it uses
  ## `pattern`, `default` and `result` from the calling proc; `cond` is
  ## evaluated per key and may refer to `nextVal`. When `cond` holds the
  ## entry is set to `default`, otherwise to the looked-up value.
  for key in RegexMatch(pattern).pattern.captureNameId.keys:
    let nextVal = pattern[key]
    if cond:
      result[key] = default
    else:
      result[key] = nextVal

proc toTable*(pattern: Captures, default: string = nil): Table[string, string] =
  ## Gets all the named captures and returns them
  ##
  ## Captures whose value is nil are stored as `default`.
  result = initTable[string, string]()
  toTableImpl(nextVal == nil)

proc toTable*(pattern: CaptureBounds, default = None[Slice[int]]()): Table[string, Option[Slice[int]]] =
  ## Gets all the named captures and returns them
  ##
  ## Captures whose bounds are None are stored as `default`.
  result = initTable[string, Option[Slice[int]]]()
  toTableImpl(nextVal.isNone)

# NOTE(review): the remainder of `itemsImpl` is missing from this view of the
# file; the fragment below is kept as found and must be restored from the
# upstream source.
template itemsImpl(cond: bool): stmt {.immediate, dirty.} =
  for i in 0 ..
': if longOpt == "no_study": result.study = false else: result.flags = result.flags or Options.fget(longOpt) longOpt = nil else: longOpt.add(c.toLower) continue # }}} result.flags = result.flags or Options.fget($c) # }}}
# NOTE(review): the line above is the tail of an option-parsing routine whose
# beginning is missing from this view; it is kept verbatim.

type UncheckedArray {.unchecked.}[T] = array[0 .. 0, T]
  ## Array type declared with the `unchecked` pragma; a raw pointer is cast
  ## to it in `getNameToNumberTable`.

proc destroyRegex(pattern: Regex) =
  ## Releases the PCRE resources held by `pattern`: the compiled object is
  ## freed and its field cleared, and the extra data is freed when present.
  pcre.free_substring(cast[cstring](pattern.pcreObj))
  pattern.pcreObj = nil
  if pattern.pcreExtra != nil:
    pcre.free_study(pattern.pcreExtra)

proc getNameToNumberTable(pattern: Regex): Table[string, int] =
  let entryCount = getinfo[cint](pattern, pcre.INFO_NAMECOUNT)
  let entrySize = getinfo[cint](pattern, pcre.INFO_NAMEENTRYSIZE)
  let table = cast[ptr UncheckedArray[uint8]](
                getinfo[int](pattern, pcre.INFO_NAMETABLE))

  result = initTable[string, int]()

  # NOTE(review): text is missing here. The next line fuses the start of the
  # loop of `getNameToNumberTable` with the tail of another routine
  # (presumably `matchImpl` -- confirm against upstream). Kept verbatim.
  for i in 0 .. = 0: return result elif execRet == pcre.ERROR_NOMATCH: return nil else: raise newException(AssertionError, "Internal error: errno " & $execRet)

proc match*(str: string, pattern: Regex, start = 0, endpos = -1): RegexMatch =
  ## Runs `matchImpl` on `str` with the `pcre.ANCHORED` flag, passing `start`
  ## and `endpos` through unchanged.
  return str.matchImpl(pattern, start, endpos, pcre.ANCHORED)

iterator findIter*(str: string, pattern: Regex, start = 0, endpos = -1): RegexMatch =
  ## Yields the successive matches of `pattern` in `str`, beginning the
  ## search at `start`.
  ##
  ## if `endpos == -1`, then `endpos = str.len`
  # see pcredemo for explanation
  let matchesCrLf = pattern.matchesCrLf()
  let unicode = (getinfo[cint](pattern, pcre.INFO_OPTIONS) and pcre.UTF8) > 0
  let endpos = if endpos == -1: str.len else: endpos

  var offset = start
  var previousMatch: RegexMatch
  while true:
    var flags = 0

    if previousMatch != nil and
        previousMatch.matchBounds.a == previousMatch.matchBounds.b:
      # 0-len match: retry at the same offset, but anchored and forbidding
      # another empty match there
      flags = pcre.NOTEMPTY_ATSTART or pcre.ANCHORED

    let currentMatch = str.matchImpl(pattern, offset, endpos, flags)
    previousMatch = currentMatch

    if currentMatch == nil:
      # either the end of the input or the string
      # cannot be split here
      #
      # Advance by exactly one character, measured from the *current*
      # offset. Bumping the offset before these checks (as was done
      # previously) tested the wrong bytes for CrLf and, in UTF-8 mode,
      # skipped an extra character.
      if matchesCrLf and offset < (str.len - 1) and
          str[offset] == '\r' and str[offset + 1] == '\l':
        # if PCRE treats CrLf as newline, skip both at the same time
        offset += 2
      elif unicode:
        # XXX what about invalid unicode?
        offset += str.runeLenAt(offset)
        assert(offset <= endpos)
      else:
        offset += 1
    else:
      offset = currentMatch.matchBounds.b

      yield currentMatch

    if offset >= endpos:
      # do while
      break

proc find*(str: string, pattern: Regex, start = 0, endpos = -1): RegexMatch =
  ## Returns a `RegexMatch` if there is a match between `start` and `endpos`, otherwise
  ## it returns nil.
  ##
  ## if `endpos == -1`, then `endpos = str.len`
  return str.matchImpl(pattern, start, endpos, 0)

proc findAll*(str: string, pattern: Regex, start = 0, endpos = -1): seq[string] =
  ## Collects the matched text of every match produced by `findIter`.
  result = @[]
  for match in str.findIter(pattern, start, endpos):
    result.add(match.match)

proc renderBounds(str: string, bounds: Slice[int]): string =
  result = " " & str & "⫞\n"
  # NOTE(review): text is missing here. The next line fuses the loop of
  # `renderBounds` with the tail of another routine (presumably `split` --
  # confirm against upstream). Kept verbatim.
  for i in -1 .. start: result.add(str.substr(lastIdx, bounds.a - 1)) splits += 1 lastIdx = bounds.b for cap in match.captures: # if there are captures, include them in the result result.add(cap) if splits == maxSplit - 1: break # "12".split("\b") would be @["1", "2", ""], but # if we skip an empty last match, it's the correct # @["1", "2"] if bounds.a < bounds.b or bounds.b < str.len: # last match: Each match takes the previous substring, # but "1 2".split(/ /) needs to return @["1", "2"]. # This handles "2" result.add(str.substr(bounds.b, str.len - 1))

template replaceImpl(str: string, pattern: Regex,
                     replacement: expr): stmt {.immediate, dirty.} =
  ## Shared body of the `replace` overloads. Walks every match from
  ## `findIter`, copies the text between matches into `result`, and appends
  ## `replacement`, which is evaluated once per match with `match` injected
  ## into its scope. The replacement must not evaluate to nil.
  # XXX seems very similar to split, maybe I can reduce code duplication
  # somehow?
  result = ""
  var lastIdx = 0
  for match {.inject.} in str.findIter(pattern):
    let bounds = match.matchBounds
    # text between the end of the previous match and the start of this one
    result.add(str.substr(lastIdx, bounds.a - 1))
    let nextVal = replacement
    assert(nextVal != nil)
    result.add(nextVal)

    lastIdx = bounds.b

  # whatever follows the last match
  result.add(str.substr(lastIdx, str.len - 1))
  return result

proc replace*(str: string, pattern: Regex,
              subproc: proc (match: RegexMatch): string): string =
  ## Replaces every match of `pattern` in `str` with the string returned by
  ## `subproc`, which is given the `RegexMatch`.
  replaceImpl(str, pattern, subproc(match))

proc replace*(str: string, pattern: Regex,
              subproc: proc (match: string): string): string =
  ## Replaces every match of `pattern` in `str` with the string returned by
  ## `subproc`, which is given the matched text.
  replaceImpl(str, pattern, subproc(match.match))

proc replace*(str: string, pattern: Regex, sub: string): string =
  ## Replaces every match of `pattern` in `str` with `sub` after it has been
  ## expanded by `formatStr`, which is handed name-based and number-based
  ## capture lookups.
  # - 1 because the string numbers are 0-indexed
  replaceImpl(str, pattern,
    formatStr(sub, match.captures[name], match.captures[id - 1]))

# }}}

let SpecialCharMatcher = re"([\\+*?[^\]$(){}=!<>|:-])"
proc escapeRe*(str: string): string =
  ## Returns `str` with every character matched by `SpecialCharMatcher`
  ## replaced using the substitution string `"\\$1"`.
  str.replace(SpecialCharMatcher, "\\$1")