summary refs log tree commit diff stats
path: root/lib/impure/nre.nim
blob: 94dd89db5e59622b98e74e8fa4c9108cb86c663a (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
pre { line-height: 125%; }
td.linenos .normal { color: inherit; background-color: transparent; padding-left: 5px; padding-right: 5px; }
span.linenos { color: inherit; background-color: transparent; padding-left: 5px; padding-right: 5px; }
td.linenos .special { color: #000000; background-color: #ffffc0; padding-left: 5px; padding-right: 5px; }
span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: 5px; padding-right: 5px; }
.highlight .hll { background-color: #ffffcc }
.highlight .c { color: #888888 } /* Comment */
.highlight .err { color: #a61717; background-color: #e3d2d2 } /* Error */
.highlight .k { color: #008800; font-weight: bold } /* Keyword */
.highlight .ch { color: #888888 } /* Comment.Hashbang */
.highlight .cm { color: #888888 } /* Comment.Multiline */
.highlight .cp { color: #cc0000; font-weight: bold } /* Comment.Preproc */
.highlight .cpf { color: #888888 } /* Comment.PreprocFile */
.highlight .c1 { color: #888888 } /* Comment.Single */
.highlight .cs { color: #cc0000; font-weight: bold; background-color: #fff0f0 } /* Comment.Special */
.highlight .gd { color: #000000; background-color: #ffdddd } /* Generic.Deleted */
.highlight .ge { font-style: italic } /* Generic.Emph */
.highlight .ges { font-weight: bold; font-style: italic } /* Generic.EmphStrong */
.highlight .gr { color: #aa0000 } /* Generic.Error */
.highlight .gh { color: #333333 } /* Generic.Heading */
.highlight .gi { color: #000000; background-color: #ddffdd } /* Generic.Inserted */
.highlight .go { color: #888888 } /* Generic.Output */
.highlight .gp { color: #555555 } /* Generic.Prompt */
.highlight .gs { font-weight: bold } /* Generic.Strong */
.highlight .gu { color: #666666 } /* Generic.Subheading */
.highlight .gt { color: #aa0000 } /* Generic.Traceback */
.highlight .kc { color: #008800; font-weight: bold } /* Keyword.Constant */
.highlight .kd { color: #008800; font-weight: bold } /* Keyword.Declaration */
.highlight .kn { color: #008800; font-weight: bold } /* Keyword.Namespace */
.highlight .kp { color: #008800 } /* Keyword.Pseudo */
.highlight .kr { color: #008800; font-weight: bold } /* Keyword.Reserved */
.highlight .kt { color: #888888; font-weight: bold } /* Keyword.Type */
.highlight .m { color: #0000DD; font-weight: bold } /* Literal.Number */
.highlight .s { color: #dd2200; background-color: #fff0f0 } /* Literal.String */
.highlight .na { color: #336699 } /* Name.Attribute */
.highlight .nb { color: #003388 } /* Name.Builtin */
.highlight .nc { color: #bb0066; font-weight: bold } /* Name.Class */
.highlight .no { color: #003366; font-weight: bold } /* Name.Constant */
.highlight .nd { color: #555555 } /* Name.Decorator */
.highlight .ne { color: #bb0066; font-weight: bold } /* Name.Exception */
.highlight .nf { color: #0066bb; font-weight: bold } /* Name.Function */
.highlight .nl { color: #336699; font-style: italic } /* Name.Label */
.highlight .nn { color: #bb0066; font-weight: bold } /* Name.Namespace */
.highlight .py { color: #336699; font-weight: bold } /* Name.Property */
.highlight .nt { color: #bb0066; font-weight: bold } /* Name.Tag */
.highlight .nv { color: #336699 } /* Name.Variable */
.highlight .ow { color: #008800 } /* Operator.Word */
.highlight .w { color: #bbbbbb } /* Text.Whitespace */
.highlight .mb { color: #0000DD; font-weight: bold } /* Literal.Number.Bin */
.highlight .mf { color: #0000DD; font-weight: bold } /* Literal.Number.Float */
.highlight .mh { color: #0000DD; font-weight: bold } /* Literal.Number.Hex */
.highlight .mi { color: #0000DD; font-weight: bold } /* Literal.Number.Integer */
.highlight .mo { color: #0000DD; font-weight: bold } /* Literal.Number.Oct */
.highlight .sa { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Affix */
.highlight .sb { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Backtick */
.highlight .sc { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Char */
.highlight .dl { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Delimiter */
.highlight .sd { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Doc */
.highlight .s2 { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Double */
.highlight .se { color: #0044dd; background-color: #fff0f0 } /* Literal.String.Escape */
.highlight .sh { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Heredoc */
.highlight .si { color: #3333bb; background-color: #fff0f0 } /* Literal.String.Interpol */
.highlight .sx { color: #22bb22; background-color: #f0fff0 } /* Literal.String.Other */
.highlight .sr { color: #008800; background-color: #fff0ff } /* Literal.String.Regex */
.highlight .s1 { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Single */
.highlight .ss { color: #aa6600; background-color: #fff0f0 } /* Literal.String.Symbol */
.highlight .bp { color: #003388 } /* Name.Builtin.Pseudo */
.highlight .fm { color: #0066bb; font-weight: bold } /* Name.Function.Magic */
.highlight .vc { color: #336699 } /* Name.Variable.Class */
.highlight .vg { color: #dd7700 } /* Name.Variable.Global */
.highlight .vi { color: #3333bb } /* Name.Variable.Instance */
.highlight .vm { color: #336699 } /* Name.Variable.Magic */
.highlight .il { color: #0000DD; font-weight: bold } /* Literal.Number.Integer.Long */
#
#
#            Nim's Runtime Library
#        (c) Copyright 2012 Andreas Rumpf
#
#    See the file "copying.txt", included in this
#    distribution, for details about the copyright.
#

## Timer support for the realtime GC. Based on
## `<https://github.com/jckarter/clay/blob/master/compiler/src/hirestimer.cpp>`_

type
  Ticks = distinct int64
  Nanos = int64

when defined(windows):

  proc QueryPerformanceCounter(res: var Ticks) {.
    importc: "QueryPerformanceCounter", stdcall, dynlib: "kernel32".}
  proc QueryPerformanceFrequency(res: var int64) {.
    importc: "QueryPerformanceFrequency", stdcall, dynlib: "kernel32".}

  proc getTicks(): Ticks {.inline.} =
    QueryPerformanceCounter(result)

  proc `-`(a, b: Ticks): Nanos =
    var frequency: int64
    QueryPerformanceFrequency(frequency)
    var performanceCounterRate = 1e+9'f64 / float64(frequency)

    result = Nanos(float64(a.int64 - b.int64) * performanceCounterRate)

elif defined(macosx):
  type
    MachTimebaseInfoData {.pure, final,
        importc: "mach_timebase_info_data_t",
        header: "<mach/mach_time.h>".} = object
      numer, denom: int32

  proc mach_absolute_time(): int64 {.importc, header: "<mach/mach.h>".}
  proc mach_timebase_info(info: var MachTimebaseInfoData) {.importc,
    header: "<mach/mach_time.h>".}

  proc getTicks(): Ticks {.inline.} =
    result = Ticks(mach_absolute_time())

  var timeBaseInfo: MachTimebaseInfoData
  mach_timebase_info(timeBaseInfo)

  proc `-`(a, b: Ticks): Nanos =
    result = (a.int64 - b.int64)  * timeBaseInfo.numer div timeBaseInfo.denom

elif defined(posixRealtime):
  type
    Clockid {.importc: "clockid_t", header: "<time.h>", final.} = object

    TimeSpec {.importc: "struct timespec", header: "<time.h>",
               final, pure.} = object ## struct timespec
      tv_sec: int  ## Seconds.
      tv_nsec: int ## Nanoseconds.

  var
    CLOCK_REALTIME {.importc: "CLOCK_REALTIME", header: "<time.h>".}: Clockid

  proc clock_gettime(clkId: Clockid, tp: var Timespec) {.
    importc: "clock_gettime", header: "<time.h>".}

  proc getTicks(): Ticks =
    var t: Timespec
    clock_gettime(CLOCK_REALTIME, t)
    result = Ticks(int64(t.tv_sec) * 1000000000'i64 + int64(t.tv_nsec))

  proc `-`(a, b: Ticks): Nanos {.borrow.}

else:
  # fallback Posix implementation:
  when not declared(Time):
    when defined(linux):
      type Time = clong
    else:
      type Time = int

  type
    Timeval {.importc: "struct timeval", header: "<sys/select.h>",
               final, pure.} = object ## struct timeval
      tv_sec: Time  ## Seconds.
      tv_usec: clong ## Microseconds.

  proc posix_gettimeofday(tp: var Timeval, unused: pointer = nil) {.
    importc: "gettimeofday", header: "<sys/time.h>".}

  proc getTicks(): Ticks =
    var t: Timeval
    posix_gettimeofday(t)
    result = Ticks(int64(t.tv_sec) * 1000_000_000'i64 +
                    int64(t.tv_usec) * 1000'i64)

  proc `-`(a, b: Ticks): Nanos {.borrow.}
'#n712'>712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752
#
#            Nim's Runtime Library
#        (c) Copyright 2015 Nim Contributors
#
#    See the file "copying.txt", included in this
#    distribution, for details about the copyright.
#


from pcre import nil
import nre/private/util
import tables
from strutils import `%`
from math import ceil
import options
from unicode import runeLenAt

export options


## What is NRE?
## ============
##
## A regular expression library for Nim using PCRE to do the hard work.
##
## For documentation on how to write patterns, there exists `the official PCRE
## pattern documentation
## <https://www.pcre.org/original/doc/html/pcrepattern.html>`_. You can also
## search the internet for a wide variety of third-party documentation and
## tools.
##
## **Note**: If you love ``sequtils.toSeq`` we have bad news for you. This
## library doesn't work with it due to documented compiler limitations. As
## a workaround, use this:
##
## .. code-block:: nim
##
##    import nre except toSeq
##
##
## Licencing
## ---------
##
## PCRE has `some additional terms`_ that you must agree to in order to use
## this module.
##
## .. _`some additional terms`: http://pcre.sourceforge.net/license.txt
##
runnableExamples:
  let vowels = re"[aeoui]"

  let expectedResults = [
    1 .. 1,
    2 .. 2,
    4 .. 4,
    6 .. 6,
    7 .. 7,
  ]
  var i = 0
  for match in "moigagoo".findIter(vowels):
    doAssert match.matchBounds == expectedResults[i]
    inc i

  let firstVowel = "foo".find(vowels)
  let hasVowel = firstVowel.isSome()
  if hasVowel:
    let matchBounds = firstVowel.get().captureBounds[-1]
    doAssert matchBounds.a == 1


# Type definitions {{{
type
  Regex* = ref object
    ## Represents the pattern that things are matched against, constructed with
    ## ``re(string)``. Examples: ``re"foo"``, ``re(r"(*ANYCRLF)(?x)foo #
    ## comment".``
    ##
    ## ``pattern: string``
    ##     the string that was used to create the pattern. For details on how
    ##     to write a pattern, please see `the official PCRE pattern
    ##     documentation.
    ##     <https://www.pcre.org/original/doc/html/pcrepattern.html>`_
    ##
    ## ``captureCount: int``
    ##     the number of captures that the pattern has.
    ##
    ## ``captureNameId: Table[string, int]``
    ##     a table from the capture names to their numeric id.
    ##
    ##
    ## Options
    ## .......
    ##
    ## The following options may appear anywhere in the pattern, and they affect
    ## the rest of it.
    ##
    ## -  ``(?i)`` - case insensitive
    ## -  ``(?m)`` - multi-line: ``^`` and ``$`` match the beginning and end of
    ##    lines, not of the subject string
    ## -  ``(?s)`` - ``.`` also matches newline (*dotall*)
    ## -  ``(?U)`` - expressions are not greedy by default. ``?`` can be added
    ##    to a qualifier to make it greedy
    ## -  ``(?x)`` - whitespace and comments (``#``) are ignored (*extended*)
    ## -  ``(?X)`` - character escapes without special meaning (``\w`` vs.
    ##    ``\a``) are errors (*extra*)
    ##
    ## One or a combination of these options may appear only at the beginning
    ## of the pattern:
    ##
    ## -  ``(*UTF8)`` - treat both the pattern and subject as UTF-8
    ## -  ``(*UCP)`` - Unicode character properties; ``\w`` matches ``я``
    ## -  ``(*U)`` - a combination of the two options above
    ## -  ``(*FIRSTLINE*)`` - fails if there is not a match on the first line
    ## -  ``(*NO_AUTO_CAPTURE)`` - turn off auto-capture for groups;
    ##    ``(?<name>...)`` can be used to capture
    ## -  ``(*CR)`` - newlines are separated by ``\r``
    ## -  ``(*LF)`` - newlines are separated by ``\n`` (UNIX default)
    ## -  ``(*CRLF)`` - newlines are separated by ``\r\n`` (Windows default)
    ## -  ``(*ANYCRLF)`` - newlines are separated by any of the above
    ## -  ``(*ANY)`` - newlines are separated by any of the above and Unicode
    ##    newlines:
    ##
    ##     single characters VT (vertical tab, U+000B), FF (form feed, U+000C),
    ##     NEL (next line, U+0085), LS (line separator, U+2028), and PS
    ##     (paragraph separator, U+2029). For the 8-bit library, the last two
    ##     are recognized only in UTF-8 mode.
    ##     —  man pcre
    ##
    ## -  ``(*JAVASCRIPT_COMPAT)`` - JavaScript compatibility
    ## -  ``(*NO_STUDY)`` - turn off studying; study is enabled by default
    ##
    ## For more details on the leading option groups, see the `Option
    ## Setting <http://man7.org/linux/man-pages/man3/pcresyntax.3.html#OPTION_SETTING>`_
    ## and the `Newline
    ## Convention <http://man7.org/linux/man-pages/man3/pcresyntax.3.html#NEWLINE_CONVENTION>`_
    ## sections of the `PCRE syntax
    ## manual <http://man7.org/linux/man-pages/man3/pcresyntax.3.html>`_.
    ##
    ## Some of these options are not part of PCRE and are converted by nre
    ## into PCRE flags. These include ``NEVER_UTF``, ``ANCHORED``,
    ## ``DOLLAR_ENDONLY``, ``FIRSTLINE``, ``NO_AUTO_CAPTURE``,
    ## ``JAVASCRIPT_COMPAT``, ``U``, ``NO_STUDY``. In other PCRE wrappers, you
    ## will need to pass these as seperate flags to PCRE.
    pattern*: string  ## not nil
    pcreObj: ptr pcre.Pcre  ## not nil
    pcreExtra: ptr pcre.ExtraData  ## nil

    captureNameToId: Table[string, int]

  RegexMatch* = object
    ## Usually seen as Option[RegexMatch], it represents the result of an
    ## execution. On failure, it is none, on success, it is some.
    ##
    ## ``pattern: Regex``
    ##     the pattern that is being matched
    ##
    ## ``str: string``
    ##     the string that was matched against
    ##
    ## ``captures[]: string``
    ##     the string value of whatever was captured at that id. If the value
    ##     is invalid, then behavior is undefined. If the id is ``-1``, then
    ##     the whole match is returned. If the given capture was not matched,
    ##     ``nil`` is returned.
    ##
    ##     -  ``"abc".match(re"(\w)").get.captures[0] == "a"``
    ##     -  ``"abc".match(re"(?<letter>\w)").get.captures["letter"] == "a"``
    ##     -  ``"abc".match(re"(\w)\w").get.captures[-1] == "ab"``
    ##
    ## ``captureBounds[]: HSlice[int, int]``
    ##     gets the bounds of the given capture according to the same rules as
    ##     the above. If the capture is not filled, then ``None`` is returned.
    ##     The bounds are both inclusive.
    ##
    ##     -  ``"abc".match(re"(\w)").get.captureBounds[0] == 0 .. 0``
    ##     -  ``0 in "abc".match(re"(\w)").get.captureBounds == true``
    ##     -  ``"abc".match(re"").get.captureBounds[-1] == 0 .. -1``
    ##     -  ``"abc".match(re"abc").get.captureBounds[-1] == 0 .. 2``
    ##
    ## ``match: string``
    ##     the full text of the match.
    ##
    ## ``matchBounds: HSlice[int, int]``
    ##     the bounds of the match, as in ``captureBounds[]``
    ##
    ## ``(captureBounds|captures).toTable``
    ##     returns a table with each named capture as a key.
    ##
    ## ``(captureBounds|captures).toSeq``
    ##     returns all the captures by their number.
    ##
    ## ``$: string``
    ##     same as ``match``
    pattern*: Regex  ## The regex doing the matching.
                     ## Not nil.
    str*: string  ## The string that was matched against.
                  ## Not nil.
    pcreMatchBounds: seq[HSlice[cint, cint]] ## First item is the bounds of the match
                                            ## Other items are the captures
                                            ## `a` is inclusive start, `b` is exclusive end

  Captures* = distinct RegexMatch
  CaptureBounds* = distinct RegexMatch

  RegexError* = ref object of Exception

  RegexInternalError* = ref object of RegexError
    ## Internal error in the module, this probably means that there is a bug

  InvalidUnicodeError* = ref object of RegexError
    ## Thrown when matching fails due to invalid unicode in strings
    pos*: int  ## the location of the invalid unicode in bytes

  SyntaxError* = ref object of RegexError
    ## Thrown when there is a syntax error in the
    ## regular expression string passed in
    pos*: int  ## the location of the syntax error in bytes
    pattern*: string  ## the pattern that caused the problem

  StudyError* = ref object of RegexError
    ## Thrown when studying the regular expression failes
    ## for whatever reason. The message contains the error
    ## code.

runnableExamples:
    # This MUST be kept in sync with the examples in RegexMatch
    doAssert "abc".match(re"(\w)").get.captures[0] == "a"
    doAssert "abc".match(re"(?<letter>\w)").get.captures["letter"] == "a"
    doAssert "abc".match(re"(\w)\w").get.captures[-1] == "ab"

    doAssert "abc".match(re"(\w)").get.captureBounds[0] == 0 .. 0
    doAssert 0 in "abc".match(re"(\w)").get.captureBounds == true
    doAssert "abc".match(re"").get.captureBounds[-1] == 0 .. -1
    doAssert "abc".match(re"abc").get.captureBounds[-1] == 0 .. 2
# }}}

proc getinfo[T](pattern: Regex, opt: cint): T =
  let retcode = pcre.fullinfo(pattern.pcreObj, pattern.pcreExtra, opt, addr result)

  if retcode < 0:
    # XXX Error message that doesn't expose implementation details
    raise newException(FieldError, "Invalid getinfo for $1, errno $2" % [$opt, $retcode])

# Regex accessors {{{
proc captureCount*(pattern: Regex): int =
  return getinfo[cint](pattern, pcre.INFO_CAPTURECOUNT)

proc captureNameId*(pattern: Regex): Table[string, int] =
  return pattern.captureNameToId

proc matchesCrLf(pattern: Regex): bool =
  let flags = uint32(getinfo[culong](pattern, pcre.INFO_OPTIONS))
  let newlineFlags = flags and (pcre.NEWLINE_CRLF or
                                pcre.NEWLINE_ANY or
                                pcre.NEWLINE_ANYCRLF)
  if newLineFlags > 0u32:
    return true

  # get flags from build config
  var confFlags: cint
  if pcre.config(pcre.CONFIG_NEWLINE, addr confFlags) != 0:
    assert(false, "CONFIG_NEWLINE apparently got screwed up")

  case confFlags
  of 13: return false
  of 10: return false
  of (13 shl 8) or 10: return true
  of -2: return true
  of -1: return true
  else: return false
# }}}

# Capture accessors {{{
func captureBounds*(pattern: RegexMatch): CaptureBounds = return CaptureBounds(pattern)

func captures*(pattern: RegexMatch): Captures = return Captures(pattern)

func contains*(pattern: CaptureBounds, i: int): bool =
  let pattern = RegexMatch(pattern)
  pattern.pcreMatchBounds[i + 1].a != -1

func contains*(pattern: Captures, i: int): bool =
  i in CaptureBounds(pattern)

func `[]`*(pattern: CaptureBounds, i: int): HSlice[int, int] =
  let pattern = RegexMatch(pattern)
  if not (i in pattern.captureBounds):
    raise newException(IndexError, "Group '" & $i & "' was not captured")

  let bounds = pattern.pcreMatchBounds[i + 1]
  int(bounds.a)..int(bounds.b-1)

func `[]`*(pattern: Captures, i: int): string =
  let pattern = RegexMatch(pattern)
  let bounds = pattern.captureBounds[i]

  pattern.str.substr(bounds.a, bounds.b)

func match*(pattern: RegexMatch): string =
  return pattern.captures[-1]

func matchBounds*(pattern: RegexMatch): HSlice[int, int] =
  return pattern.captureBounds[-1]

func contains*(pattern: CaptureBounds, name: string): bool =
  let pattern = RegexMatch(pattern)
  let nameToId = pattern.pattern.captureNameToId
  if not (name in nameToId):
      return false
  nameToId[name] in pattern.captureBounds

func contains*(pattern: Captures, name: string): bool =
  name in CaptureBounds(pattern)

func checkNamedCaptured(pattern: RegexMatch, name: string): void =
  if not (name in pattern.captureBounds):
    raise newException(KeyError, "Group '" & name & "' was not captured")

func `[]`*(pattern: CaptureBounds, name: string): HSlice[int, int] =
  let pattern = RegexMatch(pattern)
  checkNamedCaptured(pattern, name)
  pattern.captureBounds[pattern.pattern.captureNameToId[name]]

func `[]`*(pattern: Captures, name: string): string =
  let pattern = RegexMatch(pattern)
  checkNamedCaptured(pattern, name)
  return pattern.captures[pattern.pattern.captureNameToId[name]]

template toTableImpl() {.dirty.} =
  for key in RegexMatch(pattern).pattern.captureNameId.keys:
    if key in pattern:
        result[key] = pattern[key]

func toTable*(pattern: Captures): Table[string, string] =
  result = initTable[string, string]()
  toTableImpl()

func toTable*(pattern: CaptureBounds): Table[string, HSlice[int, int]] =
  result = initTable[string, HSlice[int, int]]()
  toTableImpl()

template itemsImpl() {.dirty.} =
  for i in 0 ..< RegexMatch(pattern).pattern.captureCount:
    # done in this roundabout way to avoid multiple yields (potential code
    # bloat)
    let nextYieldVal = if i in pattern:
      some(pattern[i])
    else:
      default

    yield nextYieldVal

iterator items*(pattern: CaptureBounds,
                default = none(HSlice[int, int])): Option[HSlice[int, int]] =
  itemsImpl()

iterator items*(pattern: Captures,
                default: Option[string] = none(string)): Option[string] =
  itemsImpl()

proc toSeq*(pattern: CaptureBounds,
            default = none(HSlice[int, int])): seq[Option[HSlice[int, int]]] =
  accumulateResult(pattern.items(default))

proc toSeq*(pattern: Captures,
            default: Option[string] = none(string)): seq[Option[string]] =
  accumulateResult(pattern.items(default))

proc `$`*(pattern: RegexMatch): string =
  return pattern.captures[-1]

proc `==`*(a, b: Regex): bool =
  if not a.isNil and not b.isNil:
    return a.pattern == b.pattern and
           a.pcreObj == b.pcreObj and
           a.pcreExtra == b.pcreExtra
  else:
    return system.`==`(a, b)

proc `==`*(a, b: RegexMatch): bool =
  return a.pattern == b.pattern and
         a.str == b.str
# }}}

# Creation & Destruction {{{
# PCRE Options {{{
const PcreOptions = {
  "NEVER_UTF": pcre.NEVER_UTF,
  "ANCHORED": pcre.ANCHORED,
  "DOLLAR_ENDONLY": pcre.DOLLAR_ENDONLY,
  "FIRSTLINE": pcre.FIRSTLINE,
  "NO_AUTO_CAPTURE": pcre.NO_AUTO_CAPTURE,
  "JAVASCRIPT_COMPAT": pcre.JAVASCRIPT_COMPAT,
  "U": pcre.UTF8 or pcre.UCP
}.toTable

# Options that are supported inside regular expressions themselves
const SkipOptions = [
  "LIMIT_MATCH=", "LIMIT_RECURSION=", "NO_AUTO_POSSESS", "NO_START_OPT",
  "UTF8", "UTF16", "UTF32", "UTF", "UCP",
  "CR", "LF", "CRLF", "ANYCRLF", "ANY", "BSR_ANYCRLF", "BSR_UNICODE"
]

proc extractOptions(pattern: string): tuple[pattern: string, flags: int, study: bool] =
  result = ("", 0, true)

  var optionStart = 0
  var equals = false
  for i, c in pattern:
    if optionStart == i:
      if c != '(':
        break
      optionStart = i

    elif optionStart == i-1:
      if c != '*':
        break

    elif c == ')':
      let name = pattern[optionStart+2 .. i-1]
      if equals or name in SkipOptions:
        result.pattern.add pattern[optionStart .. i]
      elif PcreOptions.hasKey name:
        result.flags = result.flags or PcreOptions[name]
      elif name == "NO_STUDY":
        result.study = false
      else:
        break
      optionStart = i+1
      equals = false

    elif not equals:
      if c == '=':
        equals = true
        if pattern[optionStart+2 .. i] notin SkipOptions:
          break
      elif c notin {'A'..'Z', '0'..'9', '_'}:
        break

  result.pattern.add pattern[optionStart .. pattern.high]

# }}}

proc destroyRegex(pattern: Regex) =
  pcre.free_substring(cast[cstring](pattern.pcreObj))
  pattern.pcreObj = nil
  if pattern.pcreExtra != nil:
    pcre.free_study(pattern.pcreExtra)

proc getNameToNumberTable(pattern: Regex): Table[string, int] =
  let entryCount = getinfo[cint](pattern, pcre.INFO_NAMECOUNT)
  let entrySize = getinfo[cint](pattern, pcre.INFO_NAMEENTRYSIZE)
  let table = cast[ptr UncheckedArray[uint8]](
                getinfo[int](pattern, pcre.INFO_NAMETABLE))

  result = initTable[string, int]()

  for i in 0 ..< entryCount:
    let pos = i * entrySize
    let num = (int(table[pos]) shl 8) or int(table[pos + 1]) - 1
    var name = ""

    var idx = 2
    while table[pos + idx] != 0:
      name.add(char(table[pos + idx]))
      idx += 1

    result[name] = num

proc initRegex(pattern: string, flags: int, study = true): Regex =
  new(result, destroyRegex)
  result.pattern = pattern

  var errorMsg: cstring
  var errOffset: cint

  result.pcreObj = pcre.compile(cstring(pattern),
                                # better hope int is at least 4 bytes..
                                cint(flags), addr errorMsg,
                                addr errOffset, nil)
  if result.pcreObj == nil:
    # failed to compile
    raise SyntaxError(msg: $errorMsg, pos: errOffset, pattern: pattern)

  if study:
    var options: cint = 0
    var hasJit: cint
    if pcre.config(pcre.CONFIG_JIT, addr hasJit) == 0:
      if hasJit == 1'i32:
        options = pcre.STUDY_JIT_COMPILE
    result.pcreExtra = pcre.study(result.pcreObj, options, addr errorMsg)
    if errorMsg != nil:
      raise StudyError(msg: $errorMsg)

  result.captureNameToId = result.getNameToNumberTable()

proc re*(pattern: string): Regex =
  let (pattern, flags, study) = extractOptions(pattern)
  initRegex(pattern, flags, study)
# }}}

# Operations {{{
proc matchImpl(str: string, pattern: Regex, start, endpos: int, flags: int): Option[RegexMatch] =
  var myResult = RegexMatch(pattern : pattern, str : str)
  # See PCRE man pages.
  # 2x capture count to make room for start-end pairs
  # 1x capture count as slack space for PCRE
  let vecsize = (pattern.captureCount() + 1) * 3
  # div 2 because each element is 2 cints long
  # plus 1 because we need the ceiling, not the floor
  myResult.pcreMatchBounds = newSeq[HSlice[cint, cint]]((vecsize + 1) div 2)
  myResult.pcreMatchBounds.setLen(vecsize div 3)

  let strlen = if endpos == int.high: str.len else: endpos+1
  doAssert(strlen <= str.len)  # don't want buffer overflows

  let execRet = pcre.exec(pattern.pcreObj,
                          pattern.pcreExtra,
                          cstring(str),
                          cint(strlen),
                          cint(start),
                          cint(flags),
                          cast[ptr cint](addr myResult.pcreMatchBounds[0]),
                          cint(vecsize))
  if execRet >= 0:
    return some(myResult)

  case execRet:
    of pcre.ERROR_NOMATCH:
      return none(RegexMatch)
    of pcre.ERROR_NULL:
      raise newException(AccessViolationError, "Expected non-null parameters")
    of pcre.ERROR_BADOPTION:
      raise RegexInternalError(msg : "Unknown pattern flag. Either a bug or " &
        "outdated PCRE.")
    of pcre.ERROR_BADUTF8, pcre.ERROR_SHORTUTF8, pcre.ERROR_BADUTF8_OFFSET:
      raise InvalidUnicodeError(msg : "Invalid unicode byte sequence",
        pos : myResult.pcreMatchBounds[0].a)
    else:
      raise RegexInternalError(msg : "Unknown internal error: " & $execRet)

proc match*(str: string, pattern: Regex, start = 0, endpos = int.high): Option[RegexMatch] =
  ## Like ```find(...)`` <#proc-find>`_, but anchored to the start of the
  ## string.
  ##
  runnableExamples:
    doAssert "foo".match(re"f").isSome
    doAssert "foo".match(re"o").isNone

  return str.matchImpl(pattern, start, endpos, pcre.ANCHORED)

iterator findIter*(str: string, pattern: Regex, start = 0, endpos = int.high): RegexMatch =
  ## Works the same as ```find(...)`` <#proc-find>`_, but finds every
  ## non-overlapping match. ``"2222".find(re"22")`` is ``"22", "22"``, not
  ## ``"22", "22", "22"``.
  ##
  ## Arguments are the same as ```find(...)`` <#proc-find>`_
  ##
  ## Variants:
  ##
  ## -  ``proc findAll(...)`` returns a ``seq[string]``
  # see pcredemo for explanation
  let matchesCrLf = pattern.matchesCrLf()
  let unicode = uint32(getinfo[culong](pattern, pcre.INFO_OPTIONS) and
    pcre.UTF8) > 0u32
  let strlen = if endpos == int.high: str.len else: endpos+1
  var offset = start
  var match: Option[RegexMatch]
  var neverMatched = true

  while true:
    var flags = 0
    if match.isSome and
       match.get.matchBounds.a > match.get.matchBounds.b:
      # 0-len match
      flags = pcre.NOTEMPTY_ATSTART
    match = str.matchImpl(pattern, offset, endpos, flags)

    if match.isNone:
      # either the end of the input or the string
      # cannot be split here - we also need to bail
      # if we've never matched and we've already tried to...
      if offset >= strlen or neverMatched:
        break

      if matchesCrLf and offset < (str.len - 1) and
         str[offset] == '\r' and str[offset + 1] == '\L':
        # if PCRE treats CrLf as newline, skip both at the same time
        offset += 2
      elif unicode:
        # XXX what about invalid unicode?
        offset += str.runeLenAt(offset)
        assert(offset <= strlen)
      else:
        offset += 1
    else:
      neverMatched = false
      offset = match.get.matchBounds.b + 1

      yield match.get

proc find*(str: string, pattern: Regex, start = 0, endpos = int.high): Option[RegexMatch] =
  ## Finds the given pattern in the string between the end and start
  ## positions.
  ##
  ## ``start``
  ##     The start point at which to start matching. ``|abc`` is ``0``;
  ##     ``a|bc`` is ``1``
  ##
  ## ``endpos``
  ##     The maximum index for a match; ``int.high`` means the end of the
  ##     string, otherwise it’s an inclusive upper bound.
  return str.matchImpl(pattern, start, endpos, 0)

proc findAll*(str: string, pattern: Regex, start = 0, endpos = int.high): seq[string] =
  result = @[]
  for match in str.findIter(pattern, start, endpos):
    result.add(match.match)

proc contains*(str: string, pattern: Regex, start = 0, endpos = int.high): bool =
  ## Determine if the string contains the given pattern between the end and
  ## start positions:
  ## This function is equivalent to ``isSome(str.find(pattern, start, endpos))``.
  ##
  runnableExamples:
    doAssert "abc".contains(re"bc") == true
    doAssert "abc".contains(re"cd") == false
    doAssert "abc".contains(re"a", start = 1) == false

  return isSome(str.find(pattern, start, endpos))

proc split*(str: string, pattern: Regex, maxSplit = -1, start = 0): seq[string] =
  ## Splits the string with the given regex. This works according to the
  ## rules that Perl and Javascript use.
  ##
  ## ``start`` behaves the same as in ```find(...)`` <#proc-find>`_.
  ##
  runnableExamples:
    # -  If the match is zero-width, then the string is still split:
    doAssert "123".split(re"") == @["1", "2", "3"]

    # -  If the pattern has a capture in it, it is added after the string
    #    split:
    doAssert "12".split(re"(\d)") == @["", "1", "", "2", ""]

    # -  If ``maxsplit != -1``, then the string will only be split
    #    ``maxsplit - 1`` times. This means that there will be ``maxsplit``
    #    strings in the output seq.
    doAssert "1.2.3".split(re"\.", maxsplit = 2) == @["1", "2.3"]

  result = @[]
  var lastIdx = start
  var splits = 0
  var bounds = 0 .. -1
  var never_ran = true

  for match in str.findIter(pattern, start = start):
    never_ran = false

    # bounds are inclusive:
    #
    # 0123456
    #  ^^^
    # (1, 3)
    bounds = match.matchBounds

    # "12".split("") would be @["", "1", "2"], but
    # if we skip an empty first match, it's the correct
    # @["1", "2"]
    if bounds.a <= bounds.b or bounds.a > start:
      result.add(str.substr(lastIdx, bounds.a - 1))
      splits += 1

    lastIdx = bounds.b + 1

    for cap in match.captures:
      # if there are captures, include them in the result
      if cap.isSome:
        result.add(cap.get)

    if splits == maxSplit - 1:
      break

  # "12".split("\b") would be @["1", "2", ""], but
  # if we skip an empty last match, it's the correct
  # @["1", "2"]
  # If matches were never found, then the input string is the result
  if bounds.a <= bounds.b or bounds.b < str.high or never_ran:
    # last match: Each match takes the previous substring,
    # but "1 2".split(/ /) needs to return @["1", "2"].
    # This handles "2"
    result.add(str.substr(bounds.b + 1, str.high))

template replaceImpl(str: string, pattern: Regex,
                     replacement: untyped) {.dirty.} =
  # XXX seems very similar to split, maybe I can reduce code duplication
  # somehow?
  result = ""
  var lastIdx = 0
  for match {.inject.} in str.findIter(pattern):
    let bounds = match.matchBounds
    result.add(str.substr(lastIdx, bounds.a - 1))
    let nextVal = replacement
    result.add(nextVal)

    lastIdx = bounds.b + 1

  result.add(str.substr(lastIdx, str.len - 1))
  return result

proc replace*(str: string, pattern: Regex,
              subproc: proc (match: RegexMatch): string): string =
  ## Replaces each match of Regex in the string with ``subproc``, which should
  ## never be or return ``nil``.
  ##
  ## If ``subproc`` is a ``proc (RegexMatch): string``, then it is executed with
  ## each match and the return value is the replacement value.
  ##
  ## If ``subproc`` is a ``proc (string): string``, then it is executed with the
  ## full text of the match and and the return value is the replacement
  ## value.
  ##
  ## If ``subproc`` is a string, the syntax is as follows:
  ##
  ## -  ``$$`` - literal ``$``
  ## -  ``$123`` - capture number ``123``
  ## -  ``$foo`` - named capture ``foo``
  ## -  ``${foo}`` - same as above
  ## -  ``$1$#`` - first and second captures
  ## -  ``$#`` - first capture
  ## -  ``$0`` - full match
  ##
  ## If a given capture is missing, ``IndexError`` thrown for un-named captures
  ## and ``KeyError`` for named captures.
  replaceImpl(str, pattern, subproc(match))

proc replace*(str: string, pattern: Regex,
              subproc: proc (match: string): string): string =
  replaceImpl(str, pattern, subproc(match.match))

proc replace*(str: string, pattern: Regex, sub: string): string =
  # - 1 because the string numbers are 0-indexed
  replaceImpl(str, pattern,
    formatStr(sub, match.captures[name], match.captures[id - 1]))

# }}}

let SpecialCharMatcher = re"([\\+*?[^\]$(){}=!<>|:-])"
proc escapeRe*(str: string): string =
  ## Escapes the string so it doesn’t match any special characters.
  ## Incompatible with the Extra flag (``X``).
  str.replace(SpecialCharMatcher, "\\$1")