Add UTF-8 support to libregexp

This allows us to greatly simplify exec(Regex). In particular, we no longer have to convert any line containing non-ASCII characters into UTF-16 (which was a significant inefficiency in regex search until now).
author: bptato <nincsnevem662@gmail.com> 2023-07-04 19:09:33 +0200
committer: bptato <nincsnevem662@gmail.com> 2023-07-04 19:12:07 +0200
commit: b15cbe8fc079d34891f7d8553e12ce97e6ecc0c1 (patch)
tree: 074343e5dfb254729ffc7211733a2f5bf2676298 /lib/quickjs
parent: af1d9de37938a37dc0368be242c44a1ab5b83a56 (diff)
download: chawan-b15cbe8fc079d34891f7d8553e12ce97e6ecc0c1.tar.gz
1 files changed, 14 insertions, 6 deletions
diff --git a/lib/quickjs/libregexp.c b/lib/quickjs/libregexp.c
index 379bfc7a..0dfd873a 100644
--- a/lib/quickjs/libregexp.c
+++ b/lib/quickjs/libregexp.c
@@ -1924,7 +1924,7 @@ static BOOL is_word_char(uint32_t c)
 #define GET_CHAR(c, cptr, cbuf_end)                                     \
     do {                                                                \
         if (cbuf_type == 0) {                                           \
-            c = *cptr++;                                                \
+            c = unicode_from_utf8(cptr, cbuf_end - cptr, &cptr);        \
         } else {                                                        \
             uint32_t __c1;                                              \
             c = *(uint16_t *)cptr;                                      \
@@ -1943,7 +1943,8 @@ static BOOL is_word_char(uint32_t c)
 #define PEEK_CHAR(c, cptr, cbuf_end)             \
     do {                                         \
         if (cbuf_type == 0) {                    \
-            c = cptr[0];                         \
+            const uint8_t *__cpt2;                                      \
+            c = unicode_from_utf8(cptr, cbuf_end - cptr, &__cpt2);      \
         } else {                                 \
             uint32_t __c1;                                              \
             c = ((uint16_t *)cptr)[0];                                  \
@@ -1960,7 +1961,11 @@ static BOOL is_word_char(uint32_t c)
 #define PEEK_PREV_CHAR(c, cptr, cbuf_start)                 \
     do {                                         \
         if (cbuf_type == 0) {                    \
-            c = cptr[-1];                        \
+            const uint8_t *__cpt2 = cptr;                               \
+            int __i = 0;                                                \
+            while (__cpt2 > cbuf_start && ((*__cpt2-- >> 6) & 2))       \
+                __i++;                                                  \
+            c = unicode_from_utf8(__cpt2, __i, &__cpt2);                \
         } else {                                 \
             uint32_t __c1;                                              \
             c = ((uint16_t *)cptr)[-1];                                 \
@@ -1977,8 +1982,11 @@ static BOOL is_word_char(uint32_t c)
 #define GET_PREV_CHAR(c, cptr, cbuf_start)       \
     do {                                         \
         if (cbuf_type == 0) {                    \
-            cptr--;                              \
-            c = cptr[0];                         \
+            const uint8_t *__cpt2;                                      \
+            int __i = 0;                                                \
+            while (cptr > cbuf_start && ((*cptr-- >> 6) & 2))           \
+                __i++;                                                  \
+            c = unicode_from_utf8(cptr, __i, &__cpt2);                  \
         } else {                                 \
             uint32_t __c1;                                              \
             cptr -= 2;                                                  \
@@ -1997,7 +2005,7 @@ static BOOL is_word_char(uint32_t c)
 #define PREV_CHAR(cptr, cbuf_start)       \
     do {                                  \
         if (cbuf_type == 0) {             \
-            cptr--;                       \
+            while (cptr > cbuf_start && ((*cptr-- >> 6) & 2));          \
         } else {                          \
             cptr -= 2;                          \
             if (cbuf_type == 2) {                                       \
author	bptato <nincsnevem662@gmail.com>	2023-07-04 19:09:33 +0200
committer	bptato <nincsnevem662@gmail.com>	2023-07-04 19:12:07 +0200
commit	b15cbe8fc079d34891f7d8553e12ce97e6ecc0c1 (patch)
tree	074343e5dfb254729ffc7211733a2f5bf2676298 /lib/quickjs
parent	af1d9de37938a37dc0368be242c44a1ab5b83a56 (diff)
download	chawan-b15cbe8fc079d34891f7d8553e12ce97e6ecc0c1.tar.gz