about summary refs log tree commit diff stats
path: root/src/js/regex.nim
blob: f34f88e1b7f962f514841f8aba98e04763b35e5b (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
# Interface for QuickJS libregexp.

import options
import unicode

import bindings/libregexp
import bindings/quickjs
import js/javascript
import utils/twtstr

export
  LRE_FLAG_GLOBAL,
  LRE_FLAG_IGNORECASE,
  LRE_FLAG_MULTILINE,
  LRE_FLAG_DOTALL,
  LRE_FLAG_UTF16,
  LRE_FLAG_STICKY

type
  Regex* = object
    bytecode*: ptr uint8
    plen*: cint
    clone*: bool
    buf*: string

  RegexResult* = object
    success*: bool
    captures*: seq[tuple[s, e: int]] # start, end

  RegexReplace* = object
    regex: Regex
    rule: string
    global: bool

type string16 = distinct string

# Convert a UTF-8 string to UTF-16.
# Note: this doesn't check for (invalid) UTF-8 containing surrogates.
proc toUTF16(s: string): string16 =
  var res = ""
  var i = 0
  template put16(c: uint16) =
    res.setLen(res.len + 2)
    res[i] = cast[char](c)
    inc i
    res[i] = cast[char](c shr 8)
    inc i
  for r in s.runes:
    var c = uint32(r)
    if c < 0x10000: # ucs-2
      put16 uint16(c)
    elif c <= 0x10FFFF: # surrogate
      c -= 0x10000
      put16 uint16((c shr 10) + 0xD800)
      put16 uint16((c and 0x3FF) + 0xDC00)
    else: # invalid
      put16 uint16(0xFFFD)
  result = string16(res)

func len(s: string16): int {.borrow.}
func `[]`(s: string16, i: int): char = string(s)[i]
func `[]`(s: string16, i: BackwardsIndex): char = string(s)[i]

template fastRuneAt(s: string16, i: int, r: untyped, doInc = true, be = false) =
  if i + 1 == s.len: # unmatched byte
    when doInc: inc i
    r = Rune(0xFFFD)
  else:
    when be:
      var c1: uint32 = (uint32(s[i]) shl 8) + uint32(s[i + 1])
    else:
      var c1: uint32 = uint32(s[i]) + (uint32(s[i + 1]) shl 8)
    if c1 >= 0xD800 or c1 < 0xDC00:
      if i + 2 == s.len or i + 3 == s.len:
        when doInc: i += 2
        r = Rune(c1) # unmatched surrogate
      else:
        when be:
          var c2: uint32 = (uint32(s[i + 2]) shl 8) + uint32(s[i + 3])
        else:
          var c2: uint32 = uint32(s[i + 2]) + (uint32(s[i + 3]) shl 8)
        if c2 >= 0xDC00 and c2 < 0xE000:
          r = Rune((((c1 and 0x3FF) shl 10) or (c2 and 0x3FF)) + 0x10000)
          when doInc: i += 4
        else:
          r = Rune(c1) # unmatched surrogate
          when doInc: i += 2
    else:
      r = Rune(c1) # ucs-2
      when doInc: i += 2

var dummyRuntime = newJSRuntime()
var dummyContext = dummyRuntime.newJSContextRaw()

proc `=destroy`*(regex: var Regex) =
  if regex.bytecode != nil:
    if regex.clone:
      dealloc(regex.bytecode)
    else:
      dummyRuntime.js_free_rt(regex.bytecode)
    regex.bytecode = nil

proc `=copy`*(dest: var Regex, source: Regex) =
  if dest.bytecode != source.bytecode:
    `=destroy`(dest)
    wasMoved(dest)
    dest.bytecode = cast[ptr uint8](alloc(source.plen))
    copyMem(dest.bytecode, source.bytecode, source.plen)
    dest.clone = true
    dest.buf = source.buf
    dest.plen = source.plen

func `$`*(regex: Regex): string =
  regex.buf

proc compileRegex*(buf: string, flags: int): Option[Regex] =
  var regex: Regex
  var error_msg_size = 64
  var error_msg = cast[cstring](alloc0(error_msg_size))
  let bytecode = lre_compile(addr regex.plen, error_msg, cint(error_msg_size), cstring(buf), csize_t(buf.len), cint(flags), dummyContext)
  regex.buf = buf
  if error_msg != nil:
    #TODO error handling?
    dealloc(error_msg)
    error_msg = nil
  if bytecode == nil:
    return none(Regex) # Failed to compile.
  regex.bytecode = bytecode
  return some(regex)

proc compileSearchRegex*(str: string): Option[Regex] =
  # Parse any applicable flags in regex/<flags>. The last forward slash is
  # dropped when <flags> is empty, and interpreted as a character when the
  # flags are is invalid.

  var i = str.high
  var flagsi = -1
  while i >= 0:
    case str[i]
    of '/':
      flagsi = i
      break
    of 'i', 'm', 's', 'u': discard
    else: break # invalid flag
    dec i

  var flags = LRE_FLAG_GLOBAL # for easy backwards matching

  if flagsi == -1:
    return compileRegex(str, flags)

  for i in flagsi..str.high:
    case str[i]
    of '/': discard
    of 'i': flags = flags or LRE_FLAG_IGNORECASE
    of 'm': flags = flags or LRE_FLAG_MULTILINE
    of 's': flags = flags or LRE_FLAG_DOTALL
    of 'u': flags = flags or LRE_FLAG_UTF16
    else: assert false
  return compileRegex(str.substr(0, flagsi - 1), flags)

proc exec*(regex: Regex, str: string, start = 0, length = -1, nocaps = false): RegexResult =
  let length = if length == -1:
    str.len
  else:
    length
  assert 0 <= start and start <= length, "Start: " & $start & ", length: " & $length & " str: " & $str

  let captureCount = lre_get_capture_count(regex.bytecode)
  var capture: ptr ptr uint8 = nil
  if captureCount > 0:
    capture = cast[ptr ptr uint8](alloc0(sizeof(ptr uint8) * captureCount * 2))
  var cstr = cstring(str)
  let ascii = str.isAscii()
  var ustr: string16
  if not ascii:
    if start != 0 or length != str.len:
      ustr = toUTF16(str.substr(start, length))
    else:
      ustr = toUTF16(str)
    cstr = cstring(ustr)
  let flags = lre_get_flags(regex.bytecode)
  var start = start
  while true:
    let ret = lre_exec(capture, regex.bytecode,
                       cast[ptr uint8](cstr), cint(start),
                       cint(length), cint(not ascii), dummyContext)
    if ret != 1: #TODO error handling? (-1)
      break
    result.success = true
    if captureCount == 0 or nocaps:
      break
    let cstrAddress = cast[int](cstr)
    start = (cast[ptr int](cast[int](capture) + sizeof(ptr uint8))[] - cstrAddress) shr cint(not ascii)
    var i = 0
    while i < captureCount * sizeof(ptr uint8):
      let s = cast[ptr int](cast[int](capture) + i)[] - cstrAddress
      i += sizeof(ptr uint8)
      let e = cast[ptr int](cast[int](capture) + i)[] - cstrAddress
      i += sizeof(ptr uint8)
      if ascii:
        result.captures.add((s, e))
      else:
        var s8 = 0
        var e8 = 0
        var i = 0
        var r: Rune
        while i < s and i < ustr.len:
          fastRuneAt(ustr, i, r)
          let si = r.size()
          s8 += si
          e8 += si
        while i < e and i < ustr.len:
          fastRuneAt(ustr, i, r)
          e8 += r.size()
        result.captures.add((s8, e8))
    if (flags and LRE_FLAG_GLOBAL) != 1:
      break
  if captureCount > 0:
    dealloc(capture)

proc match*(regex: Regex, str: string, start = 0, length = str.len): bool =
  return regex.exec(str, start, length, nocaps = true).success