about summary refs log tree commit diff stats
path: root/lib/quickjs
diff options
context:
space:
mode:
author: bptato <nincsnevem662@gmail.com> 2023-07-04 19:09:33 +0200
committer: bptato <nincsnevem662@gmail.com> 2023-07-04 19:12:07 +0200
commit: b15cbe8fc079d34891f7d8553e12ce97e6ecc0c1 (patch)
tree: 074343e5dfb254729ffc7211733a2f5bf2676298 /lib/quickjs
parent: af1d9de37938a37dc0368be242c44a1ab5b83a56 (diff)
download: chawan-b15cbe8fc079d34891f7d8553e12ce97e6ecc0c1.tar.gz
Add UTF-8 support to libregexp
This allows us to greatly simplify exec(Regex). In particular, we
no longer have to convert any line containing non-ASCII characters
into UTF-16 (which was a significant inefficiency in regex search
until now).
Diffstat (limited to 'lib/quickjs')
-rw-r--r-- lib/quickjs/libregexp.c 20
1 file changed, 14 insertions, 6 deletions
diff --git a/lib/quickjs/libregexp.c b/lib/quickjs/libregexp.c
index 379bfc7a..0dfd873a 100644
--- a/lib/quickjs/libregexp.c
+++ b/lib/quickjs/libregexp.c
@@ -1924,7 +1924,7 @@ static BOOL is_word_char(uint32_t c)
 #define GET_CHAR(c, cptr, cbuf_end)                                     \
     do {                                                                \
         if (cbuf_type == 0) {                                           \
-            c = *cptr++;                                                \
+            c = unicode_from_utf8(cptr, cbuf_end - cptr, &cptr);        \
         } else {                                                        \
             uint32_t __c1;                                              \
             c = *(uint16_t *)cptr;                                      \
@@ -1943,7 +1943,8 @@ static BOOL is_word_char(uint32_t c)
 #define PEEK_CHAR(c, cptr, cbuf_end)             \
     do {                                         \
         if (cbuf_type == 0) {                    \
-            c = cptr[0];                         \
+            const uint8_t *__cpt2;                                      \
+            c = unicode_from_utf8(cptr, cbuf_end - cptr, &__cpt2);      \
         } else {                                 \
             uint32_t __c1;                                              \
             c = ((uint16_t *)cptr)[0];                                  \
@@ -1960,7 +1961,11 @@ static BOOL is_word_char(uint32_t c)
 #define PEEK_PREV_CHAR(c, cptr, cbuf_start)                 \
     do {                                         \
         if (cbuf_type == 0) {                    \
-            c = cptr[-1];                        \
+            const uint8_t *__cpt2 = cptr;                               \
+            int __i = 0;                                                \
+            while (__cpt2 > cbuf_start && ((*__cpt2-- >> 6) & 2))       \
+                __i++;                                                  \
+            c = unicode_from_utf8(__cpt2, __i, &__cpt2);                \
         } else {                                 \
             uint32_t __c1;                                              \
             c = ((uint16_t *)cptr)[-1];                                 \
@@ -1977,8 +1982,11 @@ static BOOL is_word_char(uint32_t c)
 #define GET_PREV_CHAR(c, cptr, cbuf_start)       \
     do {                                         \
         if (cbuf_type == 0) {                    \
-            cptr--;                              \
-            c = cptr[0];                         \
+            const uint8_t *__cpt2;                                      \
+            int __i = 0;                                                \
+            while (cptr > cbuf_start && ((*cptr-- >> 6) & 2))           \
+                __i++;                                                  \
+            c = unicode_from_utf8(cptr, __i, &__cpt2);                  \
         } else {                                 \
             uint32_t __c1;                                              \
             cptr -= 2;                                                  \
@@ -1997,7 +2005,7 @@ static BOOL is_word_char(uint32_t c)
 #define PREV_CHAR(cptr, cbuf_start)       \
     do {                                  \
         if (cbuf_type == 0) {             \
-            cptr--;                       \
+            while (cptr > cbuf_start && ((*cptr-- >> 6) & 2));          \
         } else {                          \
             cptr -= 2;                          \
             if (cbuf_type == 2) {                                       \