diff options
author | bptato <nincsnevem662@gmail.com> | 2024-02-05 19:20:16 +0100 |
---|---|---|
committer | bptato <nincsnevem662@gmail.com> | 2024-02-05 19:20:16 +0100 |
commit | d8156669faa76e199bd2a0576163ae58f8ae7185 (patch) | |
tree | 7cfbdf03ae197b0a3179e89c3a1f469a2703b3d1 /lib | |
parent | f127f93f92f0d82ef10f9aed9d9e238e134620ee (diff) | |
download | chawan-d8156669faa76e199bd2a0576163ae58f8ae7185.tar.gz |
regex: fix 8-bit narrow strings in JS
The previous approach to adding UTF-8 support to libregexp was broken. This time, we use a separate flag value (cbuf_type == 3) to indicate UTF-8 input.
Diffstat (limited to 'lib')
-rw-r--r-- | lib/quickjs/libregexp.c | 64 |
1 files changed, 39 insertions, 25 deletions
diff --git a/lib/quickjs/libregexp.c b/lib/quickjs/libregexp.c index 73ef95aa..43a5b2dc 100644 --- a/lib/quickjs/libregexp.c +++ b/lib/quickjs/libregexp.c @@ -1845,8 +1845,8 @@ static BOOL is_word_char(uint32_t c) #define GET_CHAR(c, cptr, cbuf_end) \ do { \ if (cbuf_type == 0) { \ - c = unicode_from_utf8(cptr, cbuf_end - cptr, &cptr); \ - } else { \ + c = *cptr++; \ + } else if (cbuf_type < 3) { \ uint32_t __c1; \ c = *(uint16_t *)cptr; \ cptr += 2; \ @@ -1858,15 +1858,16 @@ static BOOL is_word_char(uint32_t c) cptr += 2; \ } \ } \ + } else { \ + c = unicode_from_utf8(cptr, cbuf_end - cptr, &cptr); \ } \ } while (0) #define PEEK_CHAR(c, cptr, cbuf_end) \ do { \ if (cbuf_type == 0) { \ - const uint8_t *__cpt2; \ - c = unicode_from_utf8(cptr, cbuf_end - cptr, &__cpt2); \ - } else { \ + c = cptr[0]; \ + } else if (cbuf_type < 3) { \ uint32_t __c1; \ c = ((uint16_t *)cptr)[0]; \ if (c >= 0xd800 && c < 0xdc00 && \ @@ -1876,18 +1877,17 @@ static BOOL is_word_char(uint32_t c) c = (((c & 0x3ff) << 10) | (__c1 & 0x3ff)) + 0x10000; \ } \ } \ - } \ + } else { \ + const uint8_t *__cpt2; \ + c = unicode_from_utf8(cptr, cbuf_end - cptr, &__cpt2); \ + } \ } while (0) #define PEEK_PREV_CHAR(c, cptr, cbuf_start) \ do { \ if (cbuf_type == 0) { \ - const uint8_t *__cpt2 = cptr; \ - int __i = 0; \ - while (__cpt2 > cbuf_start && ((*__cpt2-- >> 6) & 2)) \ - __i++; \ - c = unicode_from_utf8(__cpt2, __i, &__cpt2); \ - } else { \ + c = cptr[-1]; \ + } else if (cbuf_type < 3) { \ uint32_t __c1; \ c = ((uint16_t *)cptr)[-1]; \ if (c >= 0xdc00 && c < 0xe000 && \ @@ -1897,18 +1897,21 @@ static BOOL is_word_char(uint32_t c) c = (((__c1 & 0x3ff) << 10) | (c & 0x3ff)) + 0x10000; \ } \ } \ + } else { \ + const uint8_t *__cpt2 = cptr; \ + int __i = 0; \ + while (__cpt2 > cbuf_start && ((*__cpt2-- >> 6) & 2)) \ + __i++; \ + c = unicode_from_utf8(__cpt2, __i, &__cpt2); \ } \ } while (0) #define GET_PREV_CHAR(c, cptr, cbuf_start) \ do { \ if (cbuf_type == 0) { \ - const uint8_t *__cpt2; \ - int __i 
= 0; \ - while (cptr > cbuf_start && ((*cptr-- >> 6) & 2)) \ - __i++; \ - c = unicode_from_utf8(cptr, __i, &__cpt2); \ - } else { \ + cptr--; \ + c = cptr[0]; \ + } else if (cbuf_type < 3) { \ uint32_t __c1; \ cptr -= 2; \ c = ((uint16_t *)cptr)[0]; \ @@ -1920,14 +1923,20 @@ static BOOL is_word_char(uint32_t c) c = (((__c1 & 0x3ff) << 10) | (c & 0x3ff)) + 0x10000; \ } \ } \ - } \ + } else { \ + const uint8_t *__cpt2; \ + int __i = 0; \ + while (cptr > cbuf_start && ((*cptr-- >> 6) & 2)) \ + __i++; \ + c = unicode_from_utf8(cptr, __i, &__cpt2); \ + } \ } while (0) #define PREV_CHAR(cptr, cbuf_start) \ do { \ if (cbuf_type == 0) { \ - while (cptr > cbuf_start && ((*cptr-- >> 6) & 2)); \ - } else { \ + cptr--; \ + } else if (cbuf_type < 3) { \ cptr -= 2; \ if (cbuf_type == 2) { \ c = ((uint16_t *)cptr)[0]; \ @@ -1937,6 +1946,8 @@ static BOOL is_word_char(uint32_t c) cptr -= 2; \ } \ } \ + } else { \ + while (cptr > cbuf_start && ((*cptr-- >> 6) & 2)); \ } \ } while (0) @@ -1961,7 +1972,8 @@ typedef struct REExecState { typedef struct { const uint8_t *cbuf; const uint8_t *cbuf_end; - /* 0 = 8 bit chars, 1 = 16 bit chars, 2 = 16 bit chars, UTF-16 */ + /* 0 = 8 bit chars, 1 = 16 bit chars, 2 = 16 bit chars, UTF-16, + 3 = 8 bit chars, UTF-8 */ int cbuf_type; int capture_count; int stack_size_max; @@ -2428,9 +2440,11 @@ int lre_exec(uint8_t **capture, int cbuf_type, void *opaque) { REExecContext s_s, *s = &s_s; - int re_flags, i, alloca_size, ret; + int re_flags, i, alloca_size, ret, cbuf_width = cbuf_type; StackInt *stack_buf; + if (cbuf_width == 3) /* UTF-8 */ + cbuf_width = 1; re_flags = bc_buf[RE_HEADER_FLAGS]; s->multi_line = (re_flags & LRE_FLAG_MULTILINE) != 0; s->ignore_case = (re_flags & LRE_FLAG_IGNORECASE) != 0; @@ -2438,7 +2452,7 @@ int lre_exec(uint8_t **capture, s->capture_count = bc_buf[RE_HEADER_CAPTURE_COUNT]; s->stack_size_max = bc_buf[RE_HEADER_STACK_SIZE]; s->cbuf = cbuf; - s->cbuf_end = cbuf + (clen << cbuf_type); + s->cbuf_end = cbuf + (clen << 
cbuf_width); s->cbuf_type = cbuf_type; if (s->cbuf_type == 1 && s->is_utf16) s->cbuf_type = 2; @@ -2456,7 +2470,7 @@ int lre_exec(uint8_t **capture, alloca_size = s->stack_size_max * sizeof(stack_buf[0]); stack_buf = alloca(alloca_size); ret = lre_exec_backtrack(s, capture, stack_buf, 0, bc_buf + RE_HEADER_LEN, - cbuf + (cindex << cbuf_type), FALSE); + cbuf + (cindex << cbuf_width), FALSE); lre_realloc(s->opaque, s->state_stack, 0); return ret; } |