Improve surrogate handling readability

- add inline function to test and convert surrogates is_surrogate(c), is_hi_surrogate(c), is_lo_surrogate(c), get_hi_surrogate(c), get_lo_surrogate(c), from_surrogate(hi, lo) - use names for BC header offsets and lengths in libregexp.c - remove strict aliasing violations in `lre_exec_backtrack()` - pass all context variables to XXX_CHAR macros in `lre_exec_backtrack()`
author: Charlie Gordon <github@chqrlie.org> 2024-02-20 00:22:32 +0100
committer: bptato <nincsnevem662@gmail.com> 2024-03-02 18:12:24 +0100
commit: d4885cdedbe2d2bb366a178e54622893898a4793 (patch)
tree: bae983ae428d3fd3b927d98927d6a616d39cfd4f /lib
parent: 566a850964945d12e95fa6290f63783977038f53 (diff)
download: chawan-d4885cdedbe2d2bb366a178e54622893898a4793.tar.gz
3 files changed, 150 insertions, 124 deletions
diff --git a/lib/quickjs/cutils.h b/lib/quickjs/cutils.h
index ea7a760b..ff2d3fb6 100644
--- a/lib/quickjs/cutils.h
+++ b/lib/quickjs/cutils.h
@@ -45,9 +45,10 @@
 #ifndef countof
 #define countof(x) (sizeof(x) / sizeof((x)[0]))
 #endif
-
+#ifndef container_of
 /* return the pointer of type 'type *' containing 'ptr' as field 'member' */
 #define container_of(ptr, type, member) ((type *)((uint8_t *)(ptr) - offsetof(type, member)))
+#endif
 
 typedef int BOOL;
 
@@ -207,17 +208,22 @@ static inline void put_u8(uint8_t *tab, uint8_t val)
     *tab = val;
 }
 
+#ifndef bswap16
 static inline uint16_t bswap16(uint16_t x)
 {
     return (x >> 8) | (x << 8);
 }
+#endif
 
+#ifndef bswap32
 static inline uint32_t bswap32(uint32_t v)
 {
     return ((v & 0xff000000) >> 24) | ((v & 0x00ff0000) >>  8) |
         ((v & 0x0000ff00) <<  8) | ((v & 0x000000ff) << 24);
 }
+#endif
 
+#ifndef bswap64
 static inline uint64_t bswap64(uint64_t v)
 {
     return ((v & ((uint64_t)0xff << (7 * 8))) >> (7 * 8)) |
@@ -229,6 +235,7 @@ static inline uint64_t bswap64(uint64_t v)
         ((v & ((uint64_t)0xff << (1 * 8))) << (5 * 8)) |
         ((v & ((uint64_t)0xff << (0 * 8))) << (7 * 8));
 }
+#endif
 
 /* XXX: should take an extra argument to pass slack information to the caller */
 typedef void *DynBufReallocFunc(void *opaque, void *ptr, size_t size);
@@ -278,6 +285,36 @@ static inline void dbuf_set_error(DynBuf *s)
 int unicode_to_utf8(uint8_t *buf, unsigned int c);
 int unicode_from_utf8(const uint8_t *p, int max_len, const uint8_t **pp);
 
+static inline BOOL is_surrogate(uint32_t c)
+{
+    return (c >> 11) == (0xD800 >> 11); // 0xD800-0xDFFF
+}
+
+static inline BOOL is_hi_surrogate(uint32_t c)
+{
+    return (c >> 10) == (0xD800 >> 10); // 0xD800-0xDBFF
+}
+
+static inline BOOL is_lo_surrogate(uint32_t c)
+{
+    return (c >> 10) == (0xDC00 >> 10); // 0xDC00-0xDFFF
+}
+
+static inline uint32_t get_hi_surrogate(uint32_t c)
+{
+    return (c >> 10) - (0x10000 >> 10) + 0xD800;
+}
+
+static inline uint32_t get_lo_surrogate(uint32_t c)
+{
+    return (c & 0x3FF) | 0xDC00;
+}
+
+static inline uint32_t from_surrogate(uint32_t hi, uint32_t lo)
+{
+    return 0x10000 + 0x400 * (hi - 0xD800) + (lo - 0xDC00);
+}
+
 static inline int from_hex(int c)
 {
     if (c >= '0' && c <= '9')
diff --git a/lib/quickjs/libregexp.c b/lib/quickjs/libregexp.c
index 1251d1b1..05fbd4bd 100644
--- a/lib/quickjs/libregexp.c
+++ b/lib/quickjs/libregexp.c
@@ -100,6 +100,7 @@ static const REOpCode reopcode_info[REOP_COUNT] = {
 #define RE_HEADER_FLAGS         0
 #define RE_HEADER_CAPTURE_COUNT 1
 #define RE_HEADER_STACK_SIZE    2
+#define RE_HEADER_BYTECODE_LEN  3
 
 #define RE_HEADER_LEN 7
 
@@ -224,16 +225,16 @@ static __maybe_unused void lre_dump_bytecode(const uint8_t *buf,
 
     assert(buf_len >= RE_HEADER_LEN);
 
-    re_flags = buf[0];
-    bc_len = get_u32(buf + 3);
+    re_flags = lre_get_flags(buf);
+    bc_len = get_u32(buf + RE_HEADER_BYTECODE_LEN);
     assert(bc_len + RE_HEADER_LEN <= buf_len);
     printf("flags: 0x%x capture_count=%d stack_size=%d\n",
-           re_flags, buf[1], buf[2]);
+           re_flags, buf[RE_HEADER_CAPTURE_COUNT], buf[RE_HEADER_STACK_SIZE]);
     if (re_flags & LRE_FLAG_NAMED_GROUPS) {
         const char *p;
         p = (char *)buf + RE_HEADER_LEN + bc_len;
         printf("named groups: ");
-        for(i = 1; i < buf[1]; i++) {
+        for(i = 1; i < buf[RE_HEADER_CAPTURE_COUNT]; i++) {
             if (i != 1)
                 printf(",");
             printf("<%s>", p);
@@ -494,7 +495,7 @@ int lre_parse_escape(const uint8_t **pp, int allow_utf16)
                     }
                     c = (c << 4) | h;
                 }
-                if (c >= 0xd800 && c < 0xdc00 &&
+                if (is_hi_surrogate(c) &&
                     allow_utf16 == 2 && p[0] == '\\' && p[1] == 'u') {
                     /* convert an escaped surrogate pair into a
                        unicode char */
@@ -505,9 +506,9 @@ int lre_parse_escape(const uint8_t **pp, int allow_utf16)
                             break;
                         c1 = (c1 << 4) | h;
                     }
-                    if (i == 4 && c1 >= 0xdc00 && c1 < 0xe000) {
+                    if (i == 4 && is_lo_surrogate(c1)) {
                         p += 6;
-                        c = (((c & 0x3ff) << 10) | (c1 & 0x3ff)) + 0x10000;
+                        c = from_surrogate(c, c1);
                     }
                 }
             }
@@ -936,7 +937,7 @@ static BOOL re_need_check_advance(const uint8_t *bc_buf, int bc_buf_len)
         case REOP_backward_back_reference:
             break;
         default:
-            /* safe behvior: we cannot predict the outcome */
+            /* safe behavior: we cannot predict the outcome */
             return TRUE;
         }
         pos += len;
@@ -1005,10 +1006,10 @@ static int re_parse_group_name(char *buf, int buf_size, const uint8_t **pp)
             break;
         } else if (c >= 128) {
             c = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p);
-            if (c >= 0xD800 && c <= 0xDBFF) {
+            if (is_hi_surrogate(c)) {
                 d = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p1);
-                if (d >= 0xDC00 && d <= 0xDFFF) {
-                    c = 0x10000 + 0x400 * (c - 0xD800) + (d - 0xDC00);
+                if (is_lo_surrogate(d)) {
+                    c = from_surrogate(c, d);
                     p = p1;
                 }
             }
@@ -1116,9 +1117,10 @@ static int find_group_name(REParseState *s, const char *name)
     size_t len, name_len;
     int capture_index;
 
-    name_len = strlen(name);
     p = (char *)s->group_names.buf;
+    if (!p) return -1;
     buf_end = (char *)s->group_names.buf + s->group_names.size;
+    name_len = strlen(name);
     capture_index = 1;
     while (p < buf_end) {
         len = strlen(p);
@@ -1813,7 +1815,8 @@ uint8_t *lre_compile(int *plen, char *error_msg, int error_msg_size,
 
     s->byte_code.buf[RE_HEADER_CAPTURE_COUNT] = s->capture_count;
     s->byte_code.buf[RE_HEADER_STACK_SIZE] = stack_size;
-    put_u32(s->byte_code.buf + 3, s->byte_code.size - RE_HEADER_LEN);
+    put_u32(s->byte_code.buf + RE_HEADER_BYTECODE_LEN,
+            s->byte_code.size - RE_HEADER_LEN);
 
     /* add the named groups if needed */
     if (s->group_names.size > (s->capture_count - 1)) {
@@ -1844,39 +1847,36 @@ static BOOL is_word_char(uint32_t c)
             (c == '_'));
 }
 
-#define GET_CHAR(c, cptr, cbuf_end)                                     \
+#define GET_CHAR(c, cptr, cbuf_end, cbuf_type)                          \
     do {                                                                \
         if (cbuf_type == 0) {                                           \
             c = *cptr++;                                                \
         } else if (cbuf_type < 3) {                                     \
-            uint32_t __c1;                                              \
-            c = *(uint16_t *)cptr;                                      \
-            cptr += 2;                                                  \
-            if (c >= 0xd800 && c < 0xdc00 &&                            \
-                cbuf_type == 2 && cptr < cbuf_end) {                    \
-                __c1 = *(uint16_t *)cptr;                               \
-                if (__c1 >= 0xdc00 && __c1 < 0xe000) {                  \
-                    c = (((c & 0x3ff) << 10) | (__c1 & 0x3ff)) + 0x10000; \
-                    cptr += 2;                                          \
+            const uint16_t *_p = (const uint16_t *)cptr;                \
+            const uint16_t *_end = (const uint16_t *)cbuf_end;          \
+            c = *_p++;                                                  \
+            if (is_hi_surrogate(c) && cbuf_type == 2) {                 \
+                if (_p < _end && is_lo_surrogate(*_p)) {                \
+                    c = from_surrogate(c, *_p++);                       \
                 }                                                       \
             }                                                           \
+            cptr = (const void *)_p;                                    \
         } else {                                                        \
             c = unicode_from_utf8(cptr, cbuf_end - cptr, &cptr);        \
         }                                                               \
     } while (0)
 
-#define PEEK_CHAR(c, cptr, cbuf_end)                                    \
+#define PEEK_CHAR(c, cptr, cbuf_end, cbuf_type)                         \
     do {                                                                \
         if (cbuf_type == 0) {                                           \
             c = cptr[0];                                                \
         } else if (cbuf_type < 3) {                                     \
-            uint32_t __c1;                                              \
-            c = ((uint16_t *)cptr)[0];                                  \
-            if (c >= 0xd800 && c < 0xdc00 &&                            \
-                cbuf_type == 2 && (cptr + 2) < cbuf_end) {              \
-                __c1 = ((uint16_t *)cptr)[1];                           \
-                if (__c1 >= 0xdc00 && __c1 < 0xe000) {                  \
-                    c = (((c & 0x3ff) << 10) | (__c1 & 0x3ff)) + 0x10000; \
+            const uint16_t *_p = (const uint16_t *)cptr;                \
+            const uint16_t *_end = (const uint16_t *)cbuf_end;          \
+            c = *_p++;                                                  \
+            if (is_hi_surrogate(c) && cbuf_type == 2) {                 \
+                if (_p < _end && is_lo_surrogate(*_p)) {                \
+                    c = from_surrogate(c, *_p);                         \
                 }                                                       \
             }                                                           \
         } else {                                                        \
@@ -1885,18 +1885,17 @@ static BOOL is_word_char(uint32_t c)
         }                                                               \
     } while (0)
 
-#define PEEK_PREV_CHAR(c, cptr, cbuf_start)                             \
+#define PEEK_PREV_CHAR(c, cptr, cbuf_start, cbuf_type)                  \
     do {                                                                \
         if (cbuf_type == 0) {                                           \
             c = cptr[-1];                                               \
         } else if (cbuf_type < 3) {                                     \
-            uint32_t __c1;                                              \
-            c = ((uint16_t *)cptr)[-1];                                 \
-            if (c >= 0xdc00 && c < 0xe000 &&                            \
-                cbuf_type == 2 && (cptr - 4) >= cbuf_start) {           \
-                __c1 = ((uint16_t *)cptr)[-2];                          \
-                if (__c1 >= 0xd800 && __c1 < 0xdc00 ) {                 \
-                    c = (((__c1 & 0x3ff) << 10) | (c & 0x3ff)) + 0x10000; \
+            const uint16_t *_p = (const uint16_t *)cptr - 1;            \
+            const uint16_t *_start = (const uint16_t *)cbuf_start;      \
+            c = *_p;                                                    \
+            if (is_lo_surrogate(c) && cbuf_type == 2) {                 \
+                if (_p > _start && is_hi_surrogate(_p[-1])) {           \
+                    c = from_surrogate(*--_p, c);                       \
                 }                                                       \
             }                                                           \
         } else {                                                        \
@@ -1908,23 +1907,21 @@ static BOOL is_word_char(uint32_t c)
         }                                                               \
     } while (0)
 
-#define GET_PREV_CHAR(c, cptr, cbuf_start)                              \
+#define GET_PREV_CHAR(c, cptr, cbuf_start, cbuf_type)                   \
     do {                                                                \
         if (cbuf_type == 0) {                                           \
             cptr--;                                                     \
             c = cptr[0];                                                \
         } else if (cbuf_type < 3) {                                     \
-            uint32_t __c1;                                              \
-            cptr -= 2;                                                  \
-            c = ((uint16_t *)cptr)[0];                                  \
-            if (c >= 0xdc00 && c < 0xe000 &&                            \
-                cbuf_type == 2 && cptr > cbuf_start) {                  \
-                __c1 = ((uint16_t *)cptr)[-1];                          \
-                if (__c1 >= 0xd800 && __c1 < 0xdc00 ) {                 \
-                    cptr -= 2;                                          \
-                    c = (((__c1 & 0x3ff) << 10) | (c & 0x3ff)) + 0x10000; \
+            const uint16_t *_p = (const uint16_t *)cptr - 1;            \
+            const uint16_t *_start = (const uint16_t *)cbuf_start;      \
+            c = *_p;                                                    \
+            if (is_lo_surrogate(c) && cbuf_type == 2) {                 \
+                if (_p > _start && is_hi_surrogate(_p[-1])) {           \
+                    c = from_surrogate(*--_p, c);                       \
                 }                                                       \
             }                                                           \
+            cptr = (const void *)_p;                                    \
         } else {                                                        \
             const uint8_t *__cpt2;                                      \
             int __i = 0;                                                \
@@ -1934,20 +1931,19 @@ static BOOL is_word_char(uint32_t c)
         }                                                               \
     } while (0)
 
-#define PREV_CHAR(cptr, cbuf_start)                                     \
+#define PREV_CHAR(cptr, cbuf_start, cbuf_type)                          \
     do {                                                                \
         if (cbuf_type == 0) {                                           \
             cptr--;                                                     \
         } else if (cbuf_type < 3) {                                     \
-            cptr -= 2;                                                  \
-            if (cbuf_type == 2) {                                       \
-                c = ((uint16_t *)cptr)[0];                              \
-                if (c >= 0xdc00 && c < 0xe000 && cptr > cbuf_start) {   \
-                    c = ((uint16_t *)cptr)[-1];                         \
-                    if (c >= 0xd800 && c < 0xdc00)                      \
-                        cptr -= 2;                                      \
+            const uint16_t *_p = (const uint16_t *)cptr - 1;            \
+            const uint16_t *_start = (const uint16_t *)cbuf_start;      \
+            if (is_lo_surrogate(*_p) && cbuf_type == 2) {               \
+                if (_p > _start && is_hi_surrogate(_p[-1])) {           \
+                    --_p;                                               \
                 }                                                       \
             }                                                           \
+            cptr = (const void *)_p;                                    \
         } else {                                                        \
             while (cptr > cbuf_start && ((*cptr-- >> 6) & 2));          \
         }                                                               \
@@ -2090,7 +2086,7 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
                             /* go backward */
                             char_count = get_u32(pc + 12);
                             for(i = 0; i < char_count; i++) {
-                                PREV_CHAR(cptr, s->cbuf);
+                                PREV_CHAR(cptr, s->cbuf, cbuf_type);
                             }
                             pc = (pc + 16) + (int)get_u32(pc);
                             rs->cptr = cptr;
@@ -2125,7 +2121,7 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
         test_char:
             if (cptr >= cbuf_end)
                 goto no_match;
-            GET_CHAR(c, cptr, cbuf_end);
+            GET_CHAR(c, cptr, cbuf_end, cbuf_type);
             if (s->ignore_case) {
                 c = lre_canonicalize(c, s->is_unicode);
             }
@@ -2172,7 +2168,7 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
                 break;
             if (!s->multi_line)
                 goto no_match;
-            PEEK_PREV_CHAR(c, cptr, s->cbuf);
+            PEEK_PREV_CHAR(c, cptr, s->cbuf, cbuf_type);
             if (!is_line_terminator(c))
                 goto no_match;
             break;
@@ -2181,21 +2177,21 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
                 break;
             if (!s->multi_line)
                 goto no_match;
-            PEEK_CHAR(c, cptr, cbuf_end);
+            PEEK_CHAR(c, cptr, cbuf_end, cbuf_type);
             if (!is_line_terminator(c))
                 goto no_match;
             break;
         case REOP_dot:
             if (cptr == cbuf_end)
                 goto no_match;
-            GET_CHAR(c, cptr, cbuf_end);
+            GET_CHAR(c, cptr, cbuf_end, cbuf_type);
             if (is_line_terminator(c))
                 goto no_match;
             break;
         case REOP_any:
             if (cptr == cbuf_end)
                 goto no_match;
-            GET_CHAR(c, cptr, cbuf_end);
+            GET_CHAR(c, cptr, cbuf_end, cbuf_type);
             break;
         case REOP_save_start:
         case REOP_save_end:
@@ -2247,14 +2243,14 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
                 if (cptr == s->cbuf) {
                     v1 = FALSE;
                 } else {
-                    PEEK_PREV_CHAR(c, cptr, s->cbuf);
+                    PEEK_PREV_CHAR(c, cptr, s->cbuf, cbuf_type);
                     v1 = is_word_char(c);
                 }
                 /* current char */
                 if (cptr >= cbuf_end) {
                     v2 = FALSE;
                 } else {
-                    PEEK_CHAR(c, cptr, cbuf_end);
+                    PEEK_CHAR(c, cptr, cbuf_end, cbuf_type);
                     v2 = is_word_char(c);
                 }
                 if (v1 ^ v2 ^ (REOP_not_word_boundary - opcode))
@@ -2279,8 +2275,8 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
                     while (cptr1 < cptr1_end) {
                         if (cptr >= cbuf_end)
                             goto no_match;
-                        GET_CHAR(c1, cptr1, cptr1_end);
-                        GET_CHAR(c2, cptr, cbuf_end);
+                        GET_CHAR(c1, cptr1, cptr1_end, cbuf_type);
+                        GET_CHAR(c2, cptr, cbuf_end, cbuf_type);
                         if (s->ignore_case) {
                             c1 = lre_canonicalize(c1, s->is_unicode);
                             c2 = lre_canonicalize(c2, s->is_unicode);
@@ -2293,8 +2289,8 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
                     while (cptr1 > cptr1_start) {
                         if (cptr == s->cbuf)
                             goto no_match;
-                        GET_PREV_CHAR(c1, cptr1, cptr1_start);
-                        GET_PREV_CHAR(c2, cptr, s->cbuf);
+                        GET_PREV_CHAR(c1, cptr1, cptr1_start, cbuf_type);
+                        GET_PREV_CHAR(c2, cptr, s->cbuf, cbuf_type);
                         if (s->ignore_case) {
                             c1 = lre_canonicalize(c1, s->is_unicode);
                             c2 = lre_canonicalize(c2, s->is_unicode);
@@ -2314,7 +2310,7 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
                 pc += 2;
                 if (cptr >= cbuf_end)
                     goto no_match;
-                GET_CHAR(c, cptr, cbuf_end);
+                GET_CHAR(c, cptr, cbuf_end, cbuf_type);
                 if (s->ignore_case) {
                     c = lre_canonicalize(c, s->is_unicode);
                 }
@@ -2354,7 +2350,7 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
                 pc += 2;
                 if (cptr >= cbuf_end)
                     goto no_match;
-                GET_CHAR(c, cptr, cbuf_end);
+                GET_CHAR(c, cptr, cbuf_end, cbuf_type);
                 if (s->ignore_case) {
                     c = lre_canonicalize(c, s->is_unicode);
                 }
@@ -2386,7 +2382,7 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
             /* go to the previous char */
             if (cptr == s->cbuf)
                 goto no_match;
-            PREV_CHAR(cptr, s->cbuf);
+            PREV_CHAR(cptr, s->cbuf, cbuf_type);
             break;
         case REOP_simple_greedy_quant:
             {
@@ -2447,7 +2443,7 @@ int lre_exec(uint8_t **capture,
 
     if (cbuf_width == 3) /* UTF-8 */
         cbuf_width = 0;
-    re_flags = bc_buf[RE_HEADER_FLAGS];
+    re_flags = lre_get_flags(bc_buf);
     s->multi_line = (re_flags & LRE_FLAG_MULTILINE) != 0;
     s->ignore_case = (re_flags & LRE_FLAG_IGNORECASE) != 0;
     s->is_unicode = (re_flags & LRE_FLAG_UNICODE) != 0;
@@ -2494,8 +2490,8 @@ const char *lre_get_groupnames(const uint8_t *bc_buf)
     uint32_t re_bytecode_len;
     if ((lre_get_flags(bc_buf) & LRE_FLAG_NAMED_GROUPS) == 0)
         return NULL;
-    re_bytecode_len = get_u32(bc_buf + 3);
-    return (const char *)(bc_buf + 7 + re_bytecode_len);
+    re_bytecode_len = get_u32(bc_buf + RE_HEADER_BYTECODE_LEN);
+    return (const char *)(bc_buf + RE_HEADER_LEN + re_bytecode_len);
 }
 
 #ifdef TEST
@@ -2512,25 +2508,26 @@ void *lre_realloc(void *opaque, void *ptr, size_t size)
 
 int main(int argc, char **argv)
 {
-    int len, ret, i;
+    int len, flags, ret, i;
     uint8_t *bc;
     char error_msg[64];
     uint8_t *capture[CAPTURE_COUNT_MAX * 2];
     const char *input;
     int input_len, capture_count;
 
-    if (argc < 3) {
-        printf("usage: %s regexp input\n", argv[0]);
-        exit(1);
+    if (argc < 4) {
+        printf("usage: %s regexp flags input\n", argv[0]);
+        return 1;
     }
+    flags = atoi(argv[2]);
     bc = lre_compile(&len, error_msg, sizeof(error_msg), argv[1],
-                     strlen(argv[1]), 0, NULL);
+                     strlen(argv[1]), flags, NULL);
     if (!bc) {
         fprintf(stderr, "error: %s\n", error_msg);
         exit(1);
     }
 
-    input = argv[2];
+    input = argv[3];
     input_len = strlen(input);
 
     ret = lre_exec(capture, bc, (uint8_t *)input, 0, input_len, 0, NULL);
diff --git a/lib/quickjs/quickjs.c b/lib/quickjs/quickjs.c
index 63024416..4e5f9fb7 100644
--- a/lib/quickjs/quickjs.c
+++ b/lib/quickjs/quickjs.c
@@ -3690,10 +3690,9 @@ static int string_buffer_putc(StringBuffer *s, uint32_t c)
 {
     if (unlikely(c >= 0x10000)) {
         /* surrogate pair */
-        c -= 0x10000;
-        if (string_buffer_putc16(s, (c >> 10) + 0xd800))
+        if (string_buffer_putc16(s, get_hi_surrogate(c)))
             return -1;
-        c = (c & 0x3ff) + 0xdc00;
+        c = get_lo_surrogate(c);
     }
     return string_buffer_putc16(s, c);
 }
@@ -3704,10 +3703,10 @@ static int string_getc(const JSString *p, int *pidx)
     idx = *pidx;
     if (p->is_wide_char) {
         c = p->u.str16[idx++];
-        if (c >= 0xd800 && c < 0xdc00 && idx < p->len) {
+        if (is_hi_surrogate(c) && idx < p->len) {
             c1 = p->u.str16[idx];
-            if (c1 >= 0xdc00 && c1 < 0xe000) {
-                c = (((c & 0x3ff) << 10) | (c1 & 0x3ff)) + 0x10000;
+            if (is_lo_surrogate(c1)) {
+                c = from_surrogate(c, c1);
                 idx++;
             }
         }
@@ -3905,9 +3904,8 @@ JSValue JS_NewStringLen(JSContext *ctx, const char *buf, size_t buf_len)
                 } else if (c <= 0x10FFFF) {
                     p = p_next;
                     /* surrogate pair */
-                    c -= 0x10000;
-                    string_buffer_putc16(b, (c >> 10) + 0xd800);
-                    c = (c & 0x3ff) + 0xdc00;
+                    string_buffer_putc16(b, get_hi_surrogate(c));
+                    c = get_lo_surrogate(c);
                 } else {
                     /* invalid char */
                     c = 0xfffd;
@@ -4045,13 +4043,12 @@ const char *JS_ToCStringLen2(JSContext *ctx, size_t *plen, JSValueConst val1, BO
             if (c < 0x80) {
                 *q++ = c;
             } else {
-                if (c >= 0xd800 && c < 0xdc00) {
+                if (is_hi_surrogate(c)) {
                     if (pos < len && !cesu8) {
                         c1 = src[pos];
-                        if (c1 >= 0xdc00 && c1 < 0xe000) {
+                        if (is_lo_surrogate(c1)) {
                             pos++;
-                            /* surrogate pair */
-                            c = (((c & 0x3ff) << 10) | (c1 & 0x3ff)) + 0x10000;
+                            c = from_surrogate(c, c1);
                         } else {
                             /* Keep unmatched surrogate code points */
                             /* c = 0xfffd; */ /* error */
@@ -11831,7 +11828,7 @@ static JSValue JS_ToQuotedString(JSContext *ctx, JSValueConst val1)
                 goto fail;
             break;
         default:
-            if (c < 32 || (c >= 0xd800 && c < 0xe000)) {
+            if (c < 32 || is_surrogate(c)) {
                 snprintf(buf, sizeof(buf), "\\u%04x", c);
                 if (string_buffer_puts8(b, buf))
                     goto fail;
@@ -41691,18 +41688,18 @@ static int64_t string_advance_index(JSString *p, int64_t index, BOOL unicode)
    -1 if none */
 static int js_string_find_invalid_codepoint(JSString *p)
 {
-    int i, c;
+    int i;
     if (!p->is_wide_char)
         return -1;
     for(i = 0; i < p->len; i++) {
-        c = p->u.str16[i];
-        if (c >= 0xD800 && c <= 0xDFFF) {
-            if (c >= 0xDC00 || (i + 1) >= p->len)
-                return i;
-            c = p->u.str16[i + 1];
-            if (c < 0xDC00 || c > 0xDFFF)
+        uint32_t c = p->u.str16[i];
+        if (is_surrogate(c)) {
+            if (is_hi_surrogate(c) && (i + 1) < p->len
+            &&  is_lo_surrogate(p->u.str16[i + 1])) {
+                i++;
+            } else {
                 return i;
-            i++;
+            }
         }
     }
     return -1;
@@ -41729,7 +41726,7 @@ static JSValue js_string_toWellFormed(JSContext *ctx, JSValueConst this_val,
 {
     JSValue str, ret;
     JSString *p;
-    int c, i;
+    int i;
 
     str = JS_ToStringCheckObject(ctx, this_val);
     if (JS_IsException(str))
@@ -41748,17 +41745,13 @@ static JSValue js_string_toWellFormed(JSContext *ctx, JSValueConst this_val,
 
     p = JS_VALUE_GET_STRING(ret);
     for (; i < p->len; i++) {
-        c = p->u.str16[i];
-        if (c >= 0xD800 && c <= 0xDFFF) {
-            if (c >= 0xDC00 || (i + 1) >= p->len) {
-                p->u.str16[i] = 0xFFFD;
+        uint32_t c = p->u.str16[i];
+        if (is_surrogate(c)) {
+            if (is_hi_surrogate(c) && (i + 1) < p->len
+            &&  is_lo_surrogate(p->u.str16[i + 1])) {
+                i++;
             } else {
-                c = p->u.str16[i + 1];
-                if (c < 0xDC00 || c > 0xDFFF) {
-                    p->u.str16[i] = 0xFFFD;
-                } else {
-                    i++;
-                }
+                p->u.str16[i] = 0xFFFD;
             }
         }
     }
@@ -42535,10 +42528,10 @@ static int string_prevc(JSString *p, int *pidx)
     idx--;
     if (p->is_wide_char) {
         c = p->u.str16[idx];
-        if (c >= 0xdc00 && c < 0xe000 && idx > 0) {
+        if (is_lo_surrogate(c) && idx > 0) {
             c1 = p->u.str16[idx - 1];
-            if (c1 >= 0xd800 && c1 <= 0xdc00) {
-                c = (((c1 & 0x3ff) << 10) | (c & 0x3ff)) + 0x10000;
+            if (is_hi_surrogate(c1)) {
+                c = from_surrogate(c1, c);
                 idx--;
             }
         }
@@ -49222,8 +49215,7 @@ static JSValue js_global_decodeURI(JSContext *ctx, JSValueConst this_val,
                     }
                     c = (c << 6) | (c1 & 0x3f);
                 }
-                if (c < c_min || c > 0x10FFFF ||
-                    (c >= 0xd800 && c < 0xe000)) {
+                if (c < c_min || c > 0x10FFFF || is_surrogate(c)) {
                     js_throw_URIError(ctx, "malformed UTF-8");
                     goto fail;
                 }
@@ -49298,21 +49290,21 @@ static JSValue js_global_encodeURI(JSContext *ctx, JSValueConst this_val,
         if (isURIUnescaped(c, isComponent)) {
             string_buffer_putc16(b, c);
         } else {
-            if (c >= 0xdc00 && c <= 0xdfff) {
+            if (is_lo_surrogate(c)) {
                 js_throw_URIError(ctx, "invalid character");
                 goto fail;
-            } else if (c >= 0xd800 && c <= 0xdbff) {
+            } else if (is_hi_surrogate(c)) {
                 if (k >= p->len) {
                     js_throw_URIError(ctx, "expecting surrogate pair");
                     goto fail;
                 }
                 c1 = string_get(p, k);
                 k++;
-                if (c1 < 0xdc00 || c1 > 0xdfff) {
+                if (!is_lo_surrogate(c1)) {
                     js_throw_URIError(ctx, "expecting surrogate pair");
                     goto fail;
                 }
-                c = (((c & 0x3ff) << 10) | (c1 & 0x3ff)) + 0x10000;
+                c = from_surrogate(c, c1);
             }
             if (c < 0x80) {
                 encodeURI_hex(b, c);
author	Charlie Gordon <github@chqrlie.org>	2024-02-20 00:22:32 +0100
committer	bptato <nincsnevem662@gmail.com>	2024-03-02 18:12:24 +0100
commit	d4885cdedbe2d2bb366a178e54622893898a4793 (patch)
tree	bae983ae428d3fd3b927d98927d6a616d39cfd4f /lib
parent	566a850964945d12e95fa6290f63783977038f53 (diff)
download	chawan-d4885cdedbe2d2bb366a178e54622893898a4793.tar.gz