lib/impure/nre.nim


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748

#
#            Nim's Runtime Library
#        (c) Copyright 2015 Nim Contributors
#
#    See the file "copying.txt", included in this
#    distribution, for details about the copyright.
#

## What is NRE?
## ============
##
## A regular expression library for Nim using PCRE to do the hard work.
##
## For documentation on how to write patterns, there exists `the official PCRE
## pattern documentation
## <https://www.pcre.org/original/doc/html/pcrepattern.html>`_. You can also
## search the internet for a wide variety of third-party documentation and
## tools.
##
## **Note**: If you love ``sequtils.toSeq`` we have bad news for you. This
## library doesn't work with it due to documented compiler limitations. As
## a workaround, use this:
##
## .. code-block:: nim
##
##    import nre except toSeq
##
##
## Licencing
## ---------
##
## PCRE has `some additional terms`_ that you must agree to in order to use
## this module.
##
## .. _`some additional terms`: http://pcre.sourceforge.net/license.txt
##
runnableExamples:
  let vowels = re"[aeoui]"

  let expectedResults = [
    1 .. 1,
    2 .. 2,
    4 .. 4,
    6 .. 6,
    7 .. 7,
  ]
  var i = 0
  for match in "moigagoo".findIter(vowels):
    doAssert match.matchBounds == expectedResults[i]
    inc i

  let firstVowel = "foo".find(vowels)
  let hasVowel = firstVowel.isSome()
  if hasVowel:
    let matchBounds = firstVowel.get().captureBounds[-1]
    doAssert matchBounds.a == 1

from pcre import nil
import nre/private/util
import tables
from strutils import `%`
from math import ceil
import options
from unicode import runeLenAt

export options

# Type definitions {{{
type
  Regex* = ref object
    ## Represents the pattern that things are matched against, constructed with
    ## ``re(string)``. Examples: ``re"foo"``, ``re(r"(*ANYCRLF)(?x)foo #
    ## comment".``
    ##
    ## ``pattern: string``
    ##     the string that was used to create the pattern. For details on how
    ##     to write a pattern, please see `the official PCRE pattern
    ##     documentation.
    ##     <https://www.pcre.org/original/doc/html/pcrepattern.html>`_
    ##
    ## ``captureCount: int``
    ##     the number of captures that the pattern has.
    ##
    ## ``captureNameId: Table[string, int]``
    ##     a table from the capture names to their numeric id.
    ##
    ##
    ## Options
    ## .......
    ##
    ## The following options may appear anywhere in the pattern, and they affect
    ## the rest of it.
    ##
    ## -  ``(?i)`` - case insensitive
    ## -  ``(?m)`` - multi-line: ``^`` and ``$`` match the beginning and end of
    ##    lines, not of the subject string
    ## -  ``(?s)`` - ``.`` also matches newline (*dotall*)
    ## -  ``(?U)`` - expressions are not greedy by default. ``?`` can be added
    ##    to a qualifier to make it greedy
    ## -  ``(?x)`` - whitespace and comments (``#``) are ignored (*extended*)
    ## -  ``(?X)`` - character escapes without special meaning (``\w`` vs.
    ##    ``\a``) are errors (*extra*)
    ##
    ## One or a combination of these options may appear only at the beginning
    ## of the pattern:
    ##
    ## -  ``(*UTF8)`` - treat both the pattern and subject as UTF-8
    ## -  ``(*UCP)`` - Unicode character properties; ``\w`` matches ``я``
    ## -  ``(*U)`` - a combination of the two options above
    ## -  ``(*FIRSTLINE*)`` - fails if there is not a match on the first line
    ## -  ``(*NO_AUTO_CAPTURE)`` - turn off auto-capture for groups;
    ##    ``(?<name>...)`` can be used to capture
    ## -  ``(*CR)`` - newlines are separated by ``\r``
    ## -  ``(*LF)`` - newlines are separated by ``\n`` (UNIX default)
    ## -  ``(*CRLF)`` - newlines are separated by ``\r\n`` (Windows default)
    ## -  ``(*ANYCRLF)`` - newlines are separated by any of the above
    ## -  ``(*ANY)`` - newlines are separated by any of the above and Unicode
    ##    newlines:
    ##
    ##     single characters VT (vertical tab, U+000B), FF (form feed, U+000C),
    ##     NEL (next line, U+0085), LS (line separator, U+2028), and PS
    ##     (paragraph separator, U+2029). For the 8-bit library, the last two
    ##     are recognized only in UTF-8 mode.
    ##     —  man pcre
    ##
    ## -  ``(*JAVASCRIPT_COMPAT)`` - JavaScript compatibility
    ## -  ``(*NO_STUDY)`` - turn off studying; study is enabled by default
    ##
    ## For more details on the leading option groups, see the `Option
    ## Setting <http://man7.org/linux/man-pages/man3/pcresyntax.3.html#OPTION_SETTING>`_
    ## and the `Newline
    ## Convention <http://man7.org/linux/man-pages/man3/pcresyntax.3.html#NEWLINE_CONVENTION>`_
    ## sections of the `PCRE syntax
    ## manual <http://man7.org/linux/man-pages/man3/pcresyntax.3.html>`_.
    ##
    ## Some of these options are not part of PCRE and are converted by nre
    ## into PCRE flags. These include ``NEVER_UTF``, ``ANCHORED``,
    ## ``DOLLAR_ENDONLY``, ``FIRSTLINE``, ``NO_AUTO_CAPTURE``,
    ## ``JAVASCRIPT_COMPAT``, ``U``, ``NO_STUDY``. In other PCRE wrappers, you
    ## will need to pass these as separate flags to PCRE.
    pattern*: string  ## not nil
    pcreObj: ptr pcre.Pcre  ## not nil
    pcreExtra: ptr pcre.ExtraData  ## nil

    captureNameToId: Table[string, int]

  RegexMatch* = object
    ## Usually seen as Option[RegexMatch], it represents the result of an
    ## execution. On failure, it is none, on success, it is some.
    ##
    ## ``pattern: Regex``
    ##     the pattern that is being matched
    ##
    ## ``str: string``
    ##     the string that was matched against
    ##
    ## ``captures[]: string``
    ##     the string value of whatever was captured at that id. If the value
    ##     is invalid, then behavior is undefined. If the id is ``-1``, then
    ##     the whole match is returned. If the given capture was not matched,
    ##     ``nil`` is returned.
    ##
    ##     -  ``"abc".match(re"(\w)").get.captures[0] == "a"``
    ##     -  ``"abc".match(re"(?<letter>\w)").get.captures["letter"] == "a"``
    ##     -  ``"abc".match(re"(\w)\w").get.captures[-1] == "ab"``
    ##
    ## ``captureBounds[]: HSlice[int, int]``
    ##     gets the bounds of the given capture according to the same rules as
    ##     the above. If the capture is not filled, then ``None`` is returned.
    ##     The bounds are both inclusive.
    ##
    ##     -  ``"abc".match(re"(\w)").get.captureBounds[0] == 0 .. 0``
    ##     -  ``0 in "abc".match(re"(\w)").get.captureBounds == true``
    ##     -  ``"abc".match(re"").get.captureBounds[-1] == 0 .. -1``
    ##     -  ``"abc".match(re"abc").get.captureBounds[-1] == 0 .. 2``
    ##
    ## ``match: string``
    ##     the full text of the match.
    ##
    ## ``matchBounds: HSlice[int, int]``
    ##     the bounds of the match, as in ``captureBounds[]``
    ##
    ## ``(captureBounds|captures).toTable``
    ##     returns a table with each named capture as a key.
    ##
    ## ``(captureBounds|captures).toSeq``
    ##     returns all the captures by their number.
    ##
    ## ``$: string``
    ##     same as ``match``
    pattern*: Regex  ## The regex doing the matching.
                     ## Not nil.
    str*: string  ## The string that was matched against.
                  ## Not nil.
    pcreMatchBounds: seq[HSlice[cint, cint]] ## First item is the bounds of the match
                                            ## Other items are the captures
                                            ## `a` is inclusive start, `b` is exclusive end

  Captures* = distinct RegexMatch
  CaptureBounds* = distinct RegexMatch

  RegexError* = ref object of Exception

  RegexInternalError* = ref object of RegexError
    ## Internal error in the module, this probably means that there is a bug

  InvalidUnicodeError* = ref object of RegexError
    ## Thrown when matching fails due to invalid unicode in strings
    pos*: int  ## the location of the invalid unicode in bytes

  SyntaxError* = ref object of RegexError
    ## Thrown when there is a syntax error in the
    ## regular expression string passed in
    pos*: int  ## the location of the syntax error in bytes
    pattern*: string  ## the pattern that caused the problem

  StudyError* = ref object of RegexError
    ## Thrown when studying the regular expression failes
    ## for whatever reason. The message contains the error
    ## code.

runnableExamples:
    # This MUST be kept in sync with the examples in RegexMatch
    doAssert "abc".match(re"(\w)").get.captures[0] == "a"
    doAssert "abc".match(re"(?<letter>\w)").get.captures["letter"] == "a"
    doAssert "abc".match(re"(\w)\w").get.captures[-1] == "ab"

    doAssert "abc".match(re"(\w)").get.captureBounds[0] == 0 .. 0
    doAssert 0 in "abc".match(re"(\w)").get.captureBounds == true
    doAssert "abc".match(re"").get.captureBounds[-1] == 0 .. -1
    doAssert "abc".match(re"abc").get.captureBounds[-1] == 0 .. 2
# }}}

proc getinfo[T](pattern: Regex, opt: cint): T =
  let retcode = pcre.fullinfo(pattern.pcreObj, pattern.pcreExtra, opt, addr result)

  if retcode < 0:
    # XXX Error message that doesn't expose implementation details
    raise newException(FieldError, "Invalid getinfo for $1, errno $2" % [$opt, $retcode])

# Regex accessors {{{
proc captureCount*(pattern: Regex): int =
  return getinfo[cint](pattern, pcre.INFO_CAPTURECOUNT)

proc captureNameId*(pattern: Regex): Table[string, int] =
  return pattern.captureNameToId

proc matchesCrLf(pattern: Regex): bool =
  let flags = uint32(getinfo[culong](pattern, pcre.INFO_OPTIONS))
  let newlineFlags = flags and (pcre.NEWLINE_CRLF or
                                pcre.NEWLINE_ANY or
                                pcre.NEWLINE_ANYCRLF)
  if newLineFlags > 0u32:
    return true

  # get flags from build config
  var confFlags: cint
  if pcre.config(pcre.CONFIG_NEWLINE, addr confFlags) != 0:
    assert(false, "CONFIG_NEWLINE apparently got screwed up")

  case confFlags
  of 13: return false
  of 10: return false
  of (13 shl 8) or 10: return true
  of -2: return true
  of -1: return true
  else: return false
# }}}

# Capture accessors {{{
func captureBounds*(pattern: RegexMatch): CaptureBounds = return CaptureBounds(pattern)

func captures*(pattern: RegexMatch): Captures = return Captures(pattern)

func contains*(pattern: CaptureBounds, i: int): bool =
  let pattern = RegexMatch(pattern)
  pattern.pcreMatchBounds[i + 1].a != -1

func contains*(pattern: Captures, i: int): bool =
  i in CaptureBounds(pattern)

func `[]`*(pattern: CaptureBounds, i: int): HSlice[int, int] =
  let pattern = RegexMatch(pattern)
  if not (i in pattern.captureBounds):
    raise newException(IndexError, "Group '" & $i & "' was not captured")

  let bounds = pattern.pcreMatchBounds[i + 1]
  int(bounds.a)..int(bounds.b-1)

func `[]`*(pattern: Captures, i: int): string =
  let pattern = RegexMatch(pattern)
  let bounds = pattern.captureBounds[i]

  pattern.str.substr(bounds.a, bounds.b)

func match*(pattern: RegexMatch): string =
  return pattern.captures[-1]

func matchBounds*(pattern: RegexMatch): HSlice[int, int] =
  return pattern.captureBounds[-1]

func contains*(pattern: CaptureBounds, name: string): bool =
  let pattern = RegexMatch(pattern)
  let nameToId = pattern.pattern.captureNameToId
  if not (name in nameToId):
      return false
  nameToId[name] in pattern.captureBounds

func contains*(pattern: Captures, name: string): bool =
  name in CaptureBounds(pattern)

func checkNamedCaptured(pattern: RegexMatch, name: string): void =
  if not (name in pattern.captureBounds):
    raise newException(KeyError, "Group '" & name & "' was not captured")

func `[]`*(pattern: CaptureBounds, name: string): HSlice[int, int] =
  let pattern = RegexMatch(pattern)
  checkNamedCaptured(pattern, name)
  pattern.captureBounds[pattern.pattern.captureNameToId[name]]

func `[]`*(pattern: Captures, name: string): string =
  let pattern = RegexMatch(pattern)
  checkNamedCaptured(pattern, name)
  return pattern.captures[pattern.pattern.captureNameToId[name]]

template toTableImpl() {.dirty.} =
  for key in RegexMatch(pattern).pattern.captureNameId.keys:
    if key in pattern:
        result[key] = pattern[key]

func toTable*(pattern: Captures): Table[string, string] =
  result = initTable[string, string]()
  toTableImpl()

func toTable*(pattern: CaptureBounds): Table[string, HSlice[int, int]] =
  result = initTable[string, HSlice[int, int]]()
  toTableImpl()

template itemsImpl() {.dirty.} =
  for i in 0 ..< RegexMatch(pattern).pattern.captureCount:
    # done in this roundabout way to avoid multiple yields (potential code
    # bloat)
    let nextYieldVal = if i in pattern:
      some(pattern[i])
    else:
      default

    yield nextYieldVal

iterator items*(pattern: CaptureBounds,
                default = none(HSlice[int, int])): Option[HSlice[int, int]] =
  itemsImpl()

iterator items*(pattern: Captures,
                default: Option[string] = none(string)): Option[string] =
  itemsImpl()

proc toSeq*(pattern: CaptureBounds,
            default = none(HSlice[int, int])): seq[Option[HSlice[int, int]]] =
  accumulateResult(pattern.items(default))

proc toSeq*(pattern: Captures,
            default: Option[string] = none(string)): seq[Option[string]] =
  accumulateResult(pattern.items(default))

proc `$`*(pattern: RegexMatch): string =
  return pattern.captures[-1]

proc `==`*(a, b: Regex): bool =
  if not a.isNil and not b.isNil:
    return a.pattern == b.pattern and
           a.pcreObj == b.pcreObj and
           a.pcreExtra == b.pcreExtra
  else:
    return system.`==`(a, b)

proc `==`*(a, b: RegexMatch): bool =
  return a.pattern == b.pattern and
         a.str == b.str
# }}}

# Creation & Destruction {{{
# PCRE Options {{{
const PcreOptions = {
  "NEVER_UTF": pcre.NEVER_UTF,
  "ANCHORED": pcre.ANCHORED,
  "DOLLAR_ENDONLY": pcre.DOLLAR_ENDONLY,
  "FIRSTLINE": pcre.FIRSTLINE,
  "NO_AUTO_CAPTURE": pcre.NO_AUTO_CAPTURE,
  "JAVASCRIPT_COMPAT": pcre.JAVASCRIPT_COMPAT,
  "U": pcre.UTF8 or pcre.UCP
}.toTable

# Options that are supported inside regular expressions themselves
const SkipOptions = [
  "LIMIT_MATCH=", "LIMIT_RECURSION=", "NO_AUTO_POSSESS", "NO_START_OPT",
  "UTF8", "UTF16", "UTF32", "UTF", "UCP",
  "CR", "LF", "CRLF", "ANYCRLF", "ANY", "BSR_ANYCRLF", "BSR_UNICODE"
]

proc extractOptions(pattern: string): tuple[pattern: string, flags: int, study: bool] =
  result = ("", 0, true)

  var optionStart = 0
  var equals = false
  for i, c in pattern:
    if optionStart == i:
      if c != '(':
        break
      optionStart = i

    elif optionStart == i-1:
      if c != '*':
        break

    elif c == ')':
      let name = pattern[optionStart+2 .. i-1]
      if equals or name in SkipOptions:
        result.pattern.add pattern[optionStart .. i]
      elif PcreOptions.hasKey name:
        result.flags = result.flags or PcreOptions[name]
      elif name == "NO_STUDY":
        result.study = false
      else:
        break
      optionStart = i+1
      equals = false

    elif not equals:
      if c == '=':
        equals = true
        if pattern[optionStart+2 .. i] notin SkipOptions:
          break
      elif c notin {'A'..'Z', '0'..'9', '_'}:
        break

  result.pattern.add pattern[optionStart .. pattern.high]

# }}}

proc destroyRegex(pattern: Regex) =
  pcre.free_substring(cast[cstring](pattern.pcreObj))
  if pattern.pcreExtra != nil:
    pcre.free_study(pattern.pcreExtra)

proc getNameToNumberTable(pattern: Regex): Table[string, int] =
  let entryCount = getinfo[cint](pattern, pcre.INFO_NAMECOUNT)
  let entrySize = getinfo[cint](pattern, pcre.INFO_NAMEENTRYSIZE)
  let table = cast[ptr UncheckedArray[uint8]](
                getinfo[int](pattern, pcre.INFO_NAMETABLE))

  result = initTable[string, int]()

  for i in 0 ..< entryCount:
    let pos = i * entrySize
    let num = (int(table[pos]) shl 8) or int(table[pos + 1]) - 1
    var name = ""

    var idx = 2
    while table[pos + idx] != 0:
      name.add(char(table[pos + idx]))
      idx += 1

    result[name] = num

proc initRegex(pattern: string, flags: int, study = true): Regex =
  new(result, destroyRegex)
  result.pattern = pattern

  var errorMsg: cstring
  var errOffset: cint

  result.pcreObj = pcre.compile(cstring(pattern),
                                # better hope int is at least 4 bytes..
                                cint(flags), addr errorMsg,
                                addr errOffset, nil)
  if result.pcreObj == nil:
    # failed to compile
    raise SyntaxError(msg: $errorMsg, pos: errOffset, pattern: pattern)

  if study:
    var options: cint = 0
    var hasJit: cint
    if pcre.config(pcre.CONFIG_JIT, addr hasJit) == 0:
      if hasJit == 1'i32:
        options = pcre.STUDY_JIT_COMPILE
    result.pcreExtra = pcre.study(result.pcreObj, options, addr errorMsg)
    if errorMsg != nil:
      raise StudyError(msg: $errorMsg)

  result.captureNameToId = result.getNameToNumberTable()

proc re*(pattern: string): Regex =
  let (pattern, flags, study) = extractOptions(pattern)
  initRegex(pattern, flags, study)
# }}}

# Operations {{{
proc matchImpl(str: string, pattern: Regex, start, endpos: int, flags: int): Option[RegexMatch] =
  var myResult = RegexMatch(pattern : pattern, str : str)
  # See PCRE man pages.
  # 2x capture count to make room for start-end pairs
  # 1x capture count as slack space for PCRE
  let vecsize = (pattern.captureCount() + 1) * 3
  # div 2 because each element is 2 cints long
  # plus 1 because we need the ceiling, not the floor
  myResult.pcreMatchBounds = newSeq[HSlice[cint, cint]]((vecsize + 1) div 2)
  myResult.pcreMatchBounds.setLen(vecsize div 3)

  let strlen = if endpos == int.high: str.len else: endpos+1
  doAssert(strlen <= str.len)  # don't want buffer overflows

  let execRet = pcre.exec(pattern.pcreObj,
                          pattern.pcreExtra,
                          cstring(str),
                          cint(strlen),
                          cint(start),
                          cint(flags),
                          cast[ptr cint](addr myResult.pcreMatchBounds[0]),
                          cint(vecsize))
  if execRet >= 0:
    return some(myResult)

  case execRet:
    of pcre.ERROR_NOMATCH:
      return none(RegexMatch)
    of pcre.ERROR_NULL:
      raise newException(AccessViolationError, "Expected non-null parameters")
    of pcre.ERROR_BADOPTION:
      raise RegexInternalError(msg : "Unknown pattern flag. Either a bug or " &
        "outdated PCRE.")
    of pcre.ERROR_BADUTF8, pcre.ERROR_SHORTUTF8, pcre.ERROR_BADUTF8_OFFSET:
      raise InvalidUnicodeError(msg : "Invalid unicode byte sequence",
        pos : myResult.pcreMatchBounds[0].a)
    else:
      raise RegexInternalError(msg : "Unknown internal error: " & $execRet)

proc match*(str: string, pattern: Regex, start = 0, endpos = int.high): Option[RegexMatch] =
  ## Like ` ``find(...)`` <#proc-find>`_, but anchored to the start of the
  ## string.
  ##
  runnableExamples:
    doAssert "foo".match(re"f").isSome
    doAssert "foo".match(re"o").isNone

  return str.matchImpl(pattern, start, endpos, pcre.ANCHORED)

iterator findIter*(str: string, pattern: Regex, start = 0, endpos = int.high): RegexMatch =
  ## Works the same as ` ``find(...)`` <#proc-find>`_, but finds every
  ## non-overlapping match. ``"2222".find(re"22")`` is ``"22", "22"``, not
  ## ``"22", "22", "22"``.
  ##
  ## Arguments are the same as ` ``find(...)`` <#proc-find>`_
  ##
  ## Variants:
  ##
  ## -  ``proc findAll(...)`` returns a ``seq[string]``
  # see pcredemo for explanation
  let matchesCrLf = pattern.matchesCrLf()
  let unicode = uint32(getinfo[culong](pattern, pcre.INFO_OPTIONS) and
    pcre.UTF8) > 0u32
  let strlen = if endpos == int.high: str.len else: endpos+1
  var offset = start
  var match: Option[RegexMatch]
  var neverMatched = true

  while true:
    var flags = 0
    if match.isSome and
       match.get.matchBounds.a > match.get.matchBounds.b:
      # 0-len match
      flags = pcre.NOTEMPTY_ATSTART
    match = str.matchImpl(pattern, offset, endpos, flags)

    if match.isNone:
      # either the end of the input or the string
      # cannot be split here - we also need to bail
      # if we've never matched and we've already tried to...
      if offset >= strlen or neverMatched:
        break

      if matchesCrLf and offset < (str.len - 1) and
         str[offset] == '\r' and str[offset + 1] == '\L':
        # if PCRE treats CrLf as newline, skip both at the same time
        offset += 2
      elif unicode:
        # XXX what about invalid unicode?
        offset += str.runeLenAt(offset)
        assert(offset <= strlen)
      else:
        offset += 1
    else:
      neverMatched = false
      offset = match.get.matchBounds.b + 1

      yield match.get

proc find*(str: string, pattern: Regex, start = 0, endpos = int.high): Option[RegexMatch] =
  ## Finds the given pattern in the string between the end and start
  ## positions.
  ##
  ## ``start``
  ##     The start point at which to start matching. ``|abc`` is ``0``;
  ##     ``a|bc`` is ``1``
  ##
  ## ``endpos``
  ##     The maximum index for a match; ``int.high`` means the end of the
  ##     string, otherwise it’s an inclusive upper bound.
  return str.matchImpl(pattern, start, endpos, 0)

proc findAll*(str: string, pattern: Regex, start = 0, endpos = int.high): seq[string] =
  result = @[]
  for match in str.findIter(pattern, start, endpos):
    result.add(match.match)

proc contains*(str: string, pattern: Regex, start = 0, endpos = int.high): bool =
  ## Determine if the string contains the given pattern between the end and
  ## start positions:
  ## This function is equivalent to ``isSome(str.find(pattern, start, endpos))``.
  ##
  runnableExamples:
    doAssert "abc".contains(re"bc") == true
    doAssert "abc".contains(re"cd") == false
    doAssert "abc".contains(re"a", start = 1) == false

  return isSome(str.find(pattern, start, endpos))

proc split*(str: string, pattern: Regex, maxSplit = -1, start = 0): seq[string] =
  ## Splits the string with the given regex. This works according to the
  ## rules that Perl and Javascript use.
  ##
  ## ``start`` behaves the same as in ` ``find(...)`` <#proc-find>`_.
  ##
  runnableExamples:
    # -  If the match is zero-width, then the string is still split:
    doAssert "123".split(re"") == @["1", "2", "3"]

    # -  If the pattern has a capture in it, it is added after the string
    #    split:
    doAssert "12".split(re"(\d)") == @["", "1", "", "2", ""]

    # -  If ``maxsplit != -1``, then the string will only be split
    #    ``maxsplit - 1`` times. This means that there will be ``maxsplit``
    #    strings in the output seq.
    doAssert "1.2.3".split(re"\.", maxsplit = 2) == @["1", "2.3"]

  result = @[]
  var lastIdx = start
  var splits = 0
  var bounds = 0 .. -1
  var never_ran = true

  for match in str.findIter(pattern, start = start):
    never_ran = false

    # bounds are inclusive:
    #
    # 0123456
    #  ^^^
    # (1, 3)
    bounds = match.matchBounds

    # "12".split("") would be @["", "1", "2"], but
    # if we skip an empty first match, it's the correct
    # @["1", "2"]
    if bounds.a <= bounds.b or bounds.a > start:
      result.add(str.substr(lastIdx, bounds.a - 1))
      splits += 1

    lastIdx = bounds.b + 1

    for cap in match.captures:
      # if there are captures, include them in the result
      if cap.isSome:
        result.add(cap.get)

    if splits == maxSplit - 1:
      break

  # "12".split("\b") would be @["1", "2", ""], but
  # if we skip an empty last match, it's the correct
  # @["1", "2"]
  # If matches were never found, then the input string is the result
  if bounds.a <= bounds.b or bounds.b < str.high or never_ran:
    # last match: Each match takes the previous substring,
    # but "1 2".split(/ /) needs to return @["1", "2"].
    # This handles "2"
    result.add(str.substr(bounds.b + 1, str.high))

template replaceImpl(str: string, pattern: Regex,
                     replacement: untyped) {.dirty.} =
  # XXX seems very similar to split, maybe I can reduce code duplication
  # somehow?
  result = ""
  var lastIdx = 0
  for match {.inject.} in str.findIter(pattern):
    let bounds = match.matchBounds
    result.add(str.substr(lastIdx, bounds.a - 1))
    let nextVal = replacement
    result.add(nextVal)

    lastIdx = bounds.b + 1

  result.add(str.substr(lastIdx, str.len - 1))
  return result

proc replace*(str: string, pattern: Regex,
              subproc: proc (match: RegexMatch): string): string =
  ## Replaces each match of Regex in the string with ``subproc``, which should
  ## never be or return ``nil``.
  ##
  ## If ``subproc`` is a ``proc (RegexMatch): string``, then it is executed with
  ## each match and the return value is the replacement value.
  ##
  ## If ``subproc`` is a ``proc (string): string``, then it is executed with the
  ## full text of the match and and the return value is the replacement
  ## value.
  ##
  ## If ``subproc`` is a string, the syntax is as follows:
  ##
  ## -  ``$$`` - literal ``$``
  ## -  ``$123`` - capture number ``123``
  ## -  ``$foo`` - named capture ``foo``
  ## -  ``${foo}`` - same as above
  ## -  ``$1$#`` - first and second captures
  ## -  ``$#`` - first capture
  ## -  ``$0`` - full match
  ##
  ## If a given capture is missing, ``IndexError`` thrown for un-named captures
  ## and ``KeyError`` for named captures.
  replaceImpl(str, pattern, subproc(match))

proc replace*(str: string, pattern: Regex,
              subproc: proc (match: string): string): string =
  replaceImpl(str, pattern, subproc(match.match))

proc replace*(str: string, pattern: Regex, sub: string): string =
  # - 1 because the string numbers are 0-indexed
  replaceImpl(str, pattern,
    formatStr(sub, match.captures[name], match.captures[id - 1]))

# }}}

let SpecialCharMatcher = re"([\\+*?[^\]$(){}=!<>|:-])"
proc escapeRe*(str: string): string =
  ## Escapes the string so it doesn’t match any special characters.
  ## Incompatible with the Extra flag (``X``).
  str.replace(SpecialCharMatcher, "\\$1")