diff options
author | bptato <nincsnevem662@gmail.com> | 2023-07-04 19:09:33 +0200 |
---|---|---|
committer | bptato <nincsnevem662@gmail.com> | 2023-07-04 19:12:07 +0200 |
commit | b15cbe8fc079d34891f7d8553e12ce97e6ecc0c1 (patch) | |
tree | 074343e5dfb254729ffc7211733a2f5bf2676298 /lib/quickjs | |
parent | af1d9de37938a37dc0368be242c44a1ab5b83a56 (diff) | |
download | chawan-b15cbe8fc079d34891f7d8553e12ce97e6ecc0c1.tar.gz |
Add UTF-8 support to libregexp
This allows us to greatly simplify exec(Regex). In particular, we no longer have to convert any line containing non-ASCII characters into UTF-16 (which was a significant inefficiency in regex search until now).
Diffstat (limited to 'lib/quickjs')
-rw-r--r-- | lib/quickjs/libregexp.c | 20 |
1 files changed, 14 insertions, 6 deletions
diff --git a/lib/quickjs/libregexp.c b/lib/quickjs/libregexp.c index 379bfc7a..0dfd873a 100644 --- a/lib/quickjs/libregexp.c +++ b/lib/quickjs/libregexp.c @@ -1924,7 +1924,7 @@ static BOOL is_word_char(uint32_t c) #define GET_CHAR(c, cptr, cbuf_end) \ do { \ if (cbuf_type == 0) { \ - c = *cptr++; \ + c = unicode_from_utf8(cptr, cbuf_end - cptr, &cptr); \ } else { \ uint32_t __c1; \ c = *(uint16_t *)cptr; \ @@ -1943,7 +1943,8 @@ static BOOL is_word_char(uint32_t c) #define PEEK_CHAR(c, cptr, cbuf_end) \ do { \ if (cbuf_type == 0) { \ - c = cptr[0]; \ + const uint8_t *__cpt2; \ + c = unicode_from_utf8(cptr, cbuf_end - cptr, &__cpt2); \ } else { \ uint32_t __c1; \ c = ((uint16_t *)cptr)[0]; \ @@ -1960,7 +1961,11 @@ static BOOL is_word_char(uint32_t c) #define PEEK_PREV_CHAR(c, cptr, cbuf_start) \ do { \ if (cbuf_type == 0) { \ - c = cptr[-1]; \ + const uint8_t *__cpt2 = cptr; \ + int __i = 0; \ + while (__cpt2 > cbuf_start && ((*__cpt2-- >> 6) & 2)) \ + __i++; \ + c = unicode_from_utf8(__cpt2, __i, &__cpt2); \ } else { \ uint32_t __c1; \ c = ((uint16_t *)cptr)[-1]; \ @@ -1977,8 +1982,11 @@ static BOOL is_word_char(uint32_t c) #define GET_PREV_CHAR(c, cptr, cbuf_start) \ do { \ if (cbuf_type == 0) { \ - cptr--; \ - c = cptr[0]; \ + const uint8_t *__cpt2; \ + int __i = 0; \ + while (cptr > cbuf_start && ((*cptr-- >> 6) & 2)) \ + __i++; \ + c = unicode_from_utf8(cptr, __i, &__cpt2); \ } else { \ uint32_t __c1; \ cptr -= 2; \ @@ -1997,7 +2005,7 @@ static BOOL is_word_char(uint32_t c) #define PREV_CHAR(cptr, cbuf_start) \ do { \ if (cbuf_type == 0) { \ - cptr--; \ + while (cptr > cbuf_start && ((*cptr-- >> 6) & 2)); \ } else { \ cptr -= 2; \ if (cbuf_type == 2) { \ |