# Interface for QuickJS libregexp.

import options
import unicode

import bindings/libregexp
import bindings/quickjs
import js/javascript
import utils/twtstr

export
  LRE_FLAG_GLOBAL,
  LRE_FLAG_IGNORECASE,
  LRE_FLAG_MULTILINE,
  LRE_FLAG_DOTALL,
  LRE_FLAG_UTF16,
  LRE_FLAG_STICKY

type
  Regex* = object
    bytecode*: ptr uint8 # compiled program, as returned by lre_compile
    plen*: cint # size of bytecode in bytes, filled in by lre_compile
    clone*: bool # true when bytecode was alloc()ed by `=copy`
    buf*: string # pattern source passed to compileRegex

  RegexResult* = object
    success*: bool
    captures*: seq[tuple[s, e: int]] # start, end

  RegexReplace* = object
    regex: Regex
    rule: string
    global: bool

type string16 = distinct string

# Convert a UTF-8 string to UTF-16 (little endian, two bytes per code unit).
# Note: this doesn't check for (invalid) UTF-8 containing surrogates.
proc toUTF16(s: string): string16 =
  var res = newStringOfCap(s.len * 2)
  template put16(c: uint16) =
    # Low byte first, then high byte.
    res.add(cast[char](c))
    res.add(cast[char](c shr 8))
  for r in s.runes:
    var c = uint32(r)
    if c < 0x10000: # ucs-2
      put16 uint16(c)
    elif c <= 0x10FFFF: # surrogate pair
      c -= 0x10000
      put16 uint16((c shr 10) + 0xD800)
      put16 uint16((c and 0x3FF) + 0xDC00)
    else: # invalid
      put16 uint16(0xFFFD)
  result = string16(res)

func len(s: string16): int {.borrow.}
func `[]`(s: string16, i: int): char = string(s)[i]
func `[]`(s: string16, i: BackwardsIndex): char = string(s)[i]

template fastRuneAt(s: string16, i: int, r: untyped, doInc = true,
    be = false) =
  ## Decode the code point starting at byte offset `i` of the UTF-16 string
  ## `s` into `r`. When `doInc` is true, `i` is advanced past the bytes that
  ## were consumed. `be` selects big-endian code units (default: little).
  ## A trailing odd byte decodes to U+FFFD; an unpaired surrogate is returned
  ## as-is.
  if i + 1 == s.len: # unmatched byte
    when doInc: inc i
    r = Rune(0xFFFD)
  else:
    when be:
      var c1: uint32 = (uint32(s[i]) shl 8) + uint32(s[i + 1])
    else:
      var c1: uint32 = uint32(s[i]) + (uint32(s[i + 1]) shl 8)
    # Only a *high* surrogate may start a pair. (This used to be `or`, which
    # is always true and let any unit be paired with a following low
    # surrogate.)
    if c1 >= 0xD800 and c1 < 0xDC00:
      if i + 2 == s.len or i + 3 == s.len:
        when doInc: i += 2
        r = Rune(c1) # unmatched surrogate
      else:
        when be:
          var c2: uint32 = (uint32(s[i + 2]) shl 8) + uint32(s[i + 3])
        else:
          var c2: uint32 = uint32(s[i + 2]) + (uint32(s[i + 3]) shl 8)
        if c2 >= 0xDC00 and c2 < 0xE000:
          r = Rune((((c1 and 0x3FF) shl 10) or (c2 and 0x3FF)) + 0x10000)
          when doInc: i += 4
        else:
          r = Rune(c1) # unmatched surrogate
          when doInc: i += 2
    else:
      r = Rune(c1) # ucs-2
      when doInc: i += 2

# Runtime/context handed to libregexp; created once at module load.
var dummyRuntime = newJSRuntime()
var dummyContext = dummyRuntime.newJSContextRaw()

proc `=destroy`*(regex: var Regex) =
  ## Release the bytecode with the allocator that produced it: dealloc() for
  ## clones made by `=copy`, js_free_rt() otherwise.
  # NOTE(review): `buf` is not released explicitly here - confirm that the
  # memory-management mode in use takes care of it.
  if regex.bytecode != nil:
    if regex.clone:
      dealloc(regex.bytecode)
    else:
      dummyRuntime.js_free_rt(regex.bytecode)
    regex.bytecode = nil

proc `=copy`*(dest: var Regex, source: Regex) =
  ## Deep-copy `source` into `dest`. The bytecode is duplicated with alloc()
  ## and the copy is flagged as `clone` so that `=destroy` frees it correctly.
  if dest.bytecode != source.bytecode:
    `=destroy`(dest)
    wasMoved(dest)
    # Only clone when there is something to clone; an uncompiled source must
    # not leave dest with a non-nil, zero-length bytecode pointer.
    if source.bytecode != nil:
      dest.bytecode = cast[ptr uint8](alloc(source.plen))
      copyMem(dest.bytecode, source.bytecode, source.plen)
      dest.clone = true
    dest.buf = source.buf
    dest.plen = source.plen

func `$`*(regex: Regex): string =
  ## The pattern source the regex was compiled from.
  regex.buf

proc compileRegex*(buf: string, flags: int): Option[Regex] =
  ## Compile the pattern `buf` with the given LRE_FLAG_* bit mask.
  ## Returns none(Regex) when lre_compile fails.
  var regex: Regex
  const errorMsgSize = 64
  var errorMsg = cast[cstring](alloc0(errorMsgSize))
  let bytecode = lre_compile(addr regex.plen, errorMsg, cint(errorMsgSize),
    cstring(buf), csize_t(buf.len), cint(flags), dummyContext)
  regex.buf = buf
  if errorMsg != nil: #TODO error handling?
    dealloc(errorMsg)
    errorMsg = nil
  if bytecode == nil:
    return none(Regex) # Failed to compile.
  regex.bytecode = bytecode
  return some(regex)

proc compileSearchRegex*(str: string): Option[Regex] =
  ## Compile a user-supplied search pattern of the form `regex/<flags>`.
  # Parse any applicable flags in regex/<flags>. The last forward slash is
  # dropped when <flags> is empty, and interpreted as a character when the
  # flags are invalid.
  var flagsi = -1
  var i = str.high
  while i >= 0:
    case str[i]
    of '/':
      flagsi = i
      break
    of 'i', 'm', 's', 'u': discard
    else: break # invalid flag
    dec i

  var flags = LRE_FLAG_GLOBAL # for easy backwards matching

  if flagsi == -1:
    return compileRegex(str, flags)

  # Everything after the slash was already validated by the scan above.
  for i in flagsi + 1 .. str.high:
    case str[i]
    of 'i': flags = flags or LRE_FLAG_IGNORECASE
    of 'm': flags = flags or LRE_FLAG_MULTILINE
    of 's': flags = flags or LRE_FLAG_DOTALL
    of 'u': flags = flags or LRE_FLAG_UTF16
    else: assert false
  return compileRegex(str.substr(0, flagsi - 1), flags)

proc exec*(regex: Regex, str: string, start = 0, length = -1,
    nocaps = false): RegexResult =
  ## Run `regex` on `str`, searching from byte offset `start` and treating
  ## `length` as the end of the subject (-1 means str.len).
  ##
  ## `success` is set if at least one match was found. Unless `nocaps` is
  ## set, `captures` receives one (start, end) pair per capture group
  ## (group 0 first), as byte offsets into `str`. For a regex with
  ## LRE_FLAG_GLOBAL the search is repeated after each match and the pairs of
  ## every match are appended one after another.
  let length = if length == -1: str.len else: length
  assert 0 <= start and start <= length,
    "Start: " & $start & ", length: " & $length & " str: " & $str

  let captureCount = lre_get_capture_count(regex.bytecode)
  var capture: ptr ptr uint8 = nil
  if captureCount > 0:
    capture = cast[ptr ptr uint8](
      alloc0(sizeof(ptr uint8) * captureCount * 2))
  defer:
    if capture != nil:
      dealloc(capture)

  let ascii = str.isAscii()
  var cstr = cstring(str)
  var ustr: string16
  var pos = start # current search position, in code units of the buffer
  var bufLen = length # buffer length, in code units
  if not ascii:
    # libregexp only understands 8-bit or UTF-16 buffers. Convert the subject
    # from offset 0 so that capture offsets stay relative to `str`, and
    # express position and length in UTF-16 code units.
    ustr = toUTF16(str.substr(0, length - 1))
    cstr = cstring(ustr)
    pos = toUTF16(str.substr(0, start - 1)).len div 2
    bufLen = ustr.len div 2

  let flags = lre_get_flags(regex.bytecode)
  let isGlobal = (flags and LRE_FLAG_GLOBAL) != 0
  let cstrAddress = cast[int](cstr)
  # The capture array holds a start and an end pointer per group.
  let capsEnd = int(captureCount) * 2 * sizeof(ptr uint8)

  while true:
    let ret = lre_exec(capture, regex.bytecode, cast[ptr uint8](cstr),
      cint(pos), cint(bufLen), cint(not ascii), dummyContext)
    if ret != 1: #TODO error handling? (-1)
      break
    result.success = true
    if captureCount == 0 or nocaps:
      break

    # Continue after the end of group 0 (second pointer in the array).
    let prevPos = pos
    var matchEnd = cast[ptr int](cast[int](capture) +
      sizeof(ptr uint8))[] - cstrAddress
    if not ascii:
      matchEnd = matchEnd shr 1 # bytes -> UTF-16 code units
    pos = matchEnd

    # NOTE(review): groups that did not take part in the match are presumably
    # left as nil pointers by libregexp; their offsets are not meaningful.
    var off = 0
    while off < capsEnd:
      let s = cast[ptr int](cast[int](capture) + off)[] - cstrAddress
      off += sizeof(ptr uint8)
      let e = cast[ptr int](cast[int](capture) + off)[] - cstrAddress
      off += sizeof(ptr uint8)
      if ascii:
        result.captures.add((s, e))
      else:
        # s and e are byte offsets into the UTF-16 buffer; walk it and
        # accumulate the matching UTF-8 byte offsets.
        var s8 = 0
        var e8 = 0
        var j = 0
        var r: Rune
        while j < s and j < ustr.len:
          fastRuneAt(ustr, j, r)
          let si = r.size()
          s8 += si
          e8 += si
        while j < e and j < ustr.len:
          fastRuneAt(ustr, j, r)
          e8 += r.size()
        result.captures.add((s8, e8))

    if not isGlobal:
      break
    if pos == prevPos:
      # Empty match: searching again from the same position would find it
      # forever, so step over one code point (or stop at the end).
      if pos >= bufLen:
        break
      if ascii:
        inc pos
      else:
        var j = pos * 2
        var r: Rune
        fastRuneAt(ustr, j, r)
        pos = j div 2

proc match*(regex: Regex, str: string, start = 0, length = str.len): bool =
  ## True if `regex` matches anywhere in `str` between `start` and `length`.
  return regex.exec(str, start, length, nocaps = true).success