1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
|
# Interface for QuickJS libregexp.
import options
import unicode
import bindings/libregexp
import bindings/quickjs
import js/javascript
import strings/charset
# Re-export the LRE_FLAG_* constants so that importers of this module can
# build the `flags` argument of compileRegex themselves.
export LRE_FLAG_GLOBAL, LRE_FLAG_IGNORECASE, LRE_FLAG_MULTILINE,
  LRE_FLAG_DOTALL, LRE_FLAG_UTF16, LRE_FLAG_STICKY
type
  Regex* = object
    ## A compiled regular expression.
    ## `bytecode` is the buffer produced by lre_compile (see compileRegex);
    ## it is released by this type's `=destroy` hook.
    bytecode*: ptr uint8

type
  RegexResult* = object
    ## Outcome of a single `exec` call.
    success*: bool ## true when lre_exec returned 1
    captures*: seq[tuple[s, e: int]] ## (start, end) offsets of the captures
# Module-wide runtime and context. The context is handed to lre_compile and
# lre_exec as their last argument; the runtime is used to free compiled
# bytecode in the Regex destructor.
var
  dummyRuntime = newJSRuntime()
  dummyContext = newJSContextRaw(dummyRuntime)
proc `=destroy`(regex: var Regex) =
  ## Destructor hook: releases the compiled bytecode through js_free_rt on
  ## dummyRuntime. The pointer is reset to nil afterwards so that a second
  ## invocation on the same object does nothing.
  if regex.bytecode == nil:
    return
  js_free_rt(dummyRuntime, regex.bytecode)
  regex.bytecode = nil
proc compileRegex*(buf: string, flags: int): Option[Regex] =
  ## Compiles the pattern `buf` with lre_compile, using `flags` as the
  ## regex flag bit mask (a combination of the exported LRE_FLAG_* values).
  ##
  ## Returns some(Regex) owning the bytecode, or none(Regex) when
  ## lre_compile returned nil. The message lre_compile writes into its
  ## error buffer is currently discarded.
  const msgCap = 64
  var bytecodeLen: cint
  var msgBuf = cast[cstring](alloc0(msgCap))
  let bytecode = lre_compile(addr bytecodeLen, msgBuf, cint(msgCap),
    cstring(buf), csize_t(buf.len), cint(flags), dummyContext)
  if msgBuf != nil:
    #TODO error handling?
    #eprint "err", msgBuf
    dealloc(msgBuf)
    msgBuf = nil
  if bytecode != nil:
    var regex: Regex
    regex.bytecode = bytecode
    return some(regex)
  return none(Regex) # Failed to compile.
proc compileSearchRegex*(str: string): Option[Regex] =
  ## Compiles a search pattern of the form `regex/<flags>`.
  ##
  ## Any applicable flags after the last forward slash are parsed
  ## (`i`, `m`, `s`, `u`). The last forward slash is dropped when <flags> is
  ## empty, and is interpreted as an ordinary character when the flags are
  ## invalid. LRE_FLAG_GLOBAL is always set.
  # Scan backwards over trailing flag letters until either the separating
  # slash or a character that cannot be a flag is found.
  var slashPos = -1
  for pos in countdown(str.high, 0):
    let c = str[pos]
    if c == '/':
      slashPos = pos
      break
    if c notin {'i', 'm', 's', 'u'}:
      break # invalid flag
  var flags = LRE_FLAG_GLOBAL # for easy backwards matching
  if slashPos < 0:
    # No usable "/<flags>" suffix: the whole input is the pattern.
    return compileRegex(str, flags)
  # Everything after the slash was already validated by the scan above.
  for pos in slashPos + 1 .. str.high:
    case str[pos]
    of 'i': flags = flags or LRE_FLAG_IGNORECASE
    of 'm': flags = flags or LRE_FLAG_MULTILINE
    of 's': flags = flags or LRE_FLAG_DOTALL
    of 'u': flags = flags or LRE_FLAG_UTF16
    else: assert false
  return compileRegex(str.substr(0, slashPos - 1), flags)
proc exec*(regex: Regex, str: string, start = 0): RegexResult =
  ## Runs `regex` against `str` with lre_exec, starting at index `start`
  ## (0 <= start <= str.len).
  ##
  ## `result.success` is true when lre_exec returns 1. On success,
  ## `result.captures` contains one (start, end) pair for every capture
  ## group counted by lre_get_capture_count, in order. A group whose
  ## pointers were left nil by lre_exec is reported as (-1, -1).
  ##
  ## A `str` made only of bytes below 0x80 is matched in place and the
  ## offsets are byte offsets into `str`. Otherwise `str` is converted with
  ## toUTF16 and the offsets are mapped back by summing the sizes of the
  ## runes that precede them.
  assert 0 <= start and start <= str.len
  let captureCount = int(lre_get_capture_count(regex.bytecode))
  # lre_exec receives two pointers (start, end) per capture group.
  var capture: ptr ptr uint8 = nil
  if captureCount > 0:
    capture = cast[ptr ptr uint8](alloc0(sizeof(ptr uint8) * captureCount * 2))
  defer:
    # Released on every exit path; skipped if nothing was allocated.
    if capture != nil:
      dealloc(capture)
  # Bytes >= 0x80 are not ASCII (the previous `> 0x80` test let 0x80 pass).
  var ascii = true
  for c in str:
    if c >= char(0x80):
      ascii = false
      break
  var cstr = cstring(str)
  var ustr: string16
  if not ascii:
    ustr = toUTF16(str)
    cstr = cstring(ustr)
  # NOTE(review): `start` and `str.len` are measured on the UTF-8 `str`, but
  # in the non-ASCII case the buffer handed over is `ustr`. Confirm against
  # libregexp and string16 that these units agree.
  let ret = lre_exec(capture, regex.bytecode, cast[ptr uint8](cstr),
    cint(start), cint(str.len), cint(not ascii), dummyContext)
  result.success = ret == 1 #TODO error handling? (-1)
  if not result.success:
    return
  let bufAddress = cast[int](cstr)
  let slots = cast[ptr UncheckedArray[ptr uint8]](capture)
  # Visit every group. The old loop stopped after captureCount pointers,
  # i.e. after only half of the (start, end) pairs.
  for group in 0 ..< captureCount:
    let startPtr = slots[2 * group]
    let endPtr = slots[2 * group + 1]
    if startPtr == nil or endPtr == nil:
      # No position recorded for this group; avoid nil-based arithmetic.
      result.captures.add((-1, -1))
      continue
    let s = cast[int](startPtr) - bufAddress
    let e = cast[int](endPtr) - bufAddress
    if ascii:
      result.captures.add((s, e))
    else:
      # Translate positions in `ustr` back to offsets in `str` by adding up
      # the size of each rune in front of them.
      var s8 = 0
      var e8 = 0
      var pos = 0
      var r: Rune
      while pos < s and pos < ustr.len:
        fastRuneAt(ustr, pos, r)
        let runeSize = r.size()
        s8 += runeSize
        e8 += runeSize
      while pos < e and pos < ustr.len:
        fastRuneAt(ustr, pos, r)
        e8 += r.size()
      result.captures.add((s8, e8))
|