about summary refs log tree commit diff stats
path: root/lib
diff options
context:
space:
mode:
authorFabrice Bellard <fabrice@bellard.org>2024-01-08 18:42:29 +0100
committerbptato <nincsnevem662@gmail.com>2024-01-11 18:51:43 +0100
commit3853d2be3916b98d449e785071e370fe5b9a36ee (patch)
treed291ca65a9151a30d9c31b101e06e942a59ebaa0 /lib
parent0ecfe8945a5b064db2d2bf3984ece31bcea5e351 (diff)
downloadchawan-3853d2be3916b98d449e785071e370fe5b9a36ee.tar.gz
fixed regexp case insensitive flag
Diffstat (limited to 'lib')
-rw-r--r--lib/quickjs/libregexp.c57
-rw-r--r--lib/quickjs/libunicode-table.h124
-rw-r--r--lib/quickjs/libunicode.c386
-rw-r--r--lib/quickjs/libunicode.h3
4 files changed, 374 insertions, 196 deletions
diff --git a/lib/quickjs/libregexp.c b/lib/quickjs/libregexp.c
index 05df8653..6c8b57e0 100644
--- a/lib/quickjs/libregexp.c
+++ b/lib/quickjs/libregexp.c
@@ -34,9 +34,6 @@
 /*
   TODO:
 
-  - Add full unicode canonicalize rules for character ranges (not
-    really useful but needed for exact "ignorecase" compatibility).
-
   - Add a lock step execution mode (=linear time execution guaranteed)
     when the regular expression is "simple" i.e. no backreference nor
     complicated lookahead. The opcodes are designed for this execution
@@ -120,33 +117,6 @@ static int dbuf_insert(DynBuf *s, int pos, int len)
     return 0;
 }
 
-/* canonicalize with the specific JS regexp rules */
-static uint32_t lre_canonicalize(uint32_t c, BOOL is_utf16)
-{
-    uint32_t res[LRE_CC_RES_LEN_MAX];
-    int len;
-    if (is_utf16) {
-        if (likely(c < 128)) {
-            if (c >= 'A' && c <= 'Z')
-                c = c - 'A' + 'a';
-        } else {
-            lre_case_conv(res, c, 2);
-            c = res[0];
-        }
-    } else {
-        if (likely(c < 128)) {
-            if (c >= 'a' && c <= 'z')
-                c = c - 'a' + 'A';
-        } else {
-            /* legacy regexp: to upper case if single char >= 128 */
-            len = lre_case_conv(res, c, FALSE);
-            if (len == 1 && res[0] >= 128)
-                c = res[0];
-        }
-    }
-    return c;
-}
-
 static const uint16_t char_range_d[] = {
     1,
     0x0030, 0x0039 + 1,
@@ -245,31 +215,6 @@ static int cr_init_char_range(REParseState *s, CharRange *cr, uint32_t c)
     return -1;
 }
 
-static int cr_canonicalize(CharRange *cr)
-{
-    CharRange a;
-    uint32_t pt[2];
-    int i, ret;
-
-    cr_init(&a, cr->mem_opaque, lre_realloc);
-    pt[0] = 'a';
-    pt[1] = 'z' + 1;
-    ret = cr_op(&a, cr->points, cr->len, pt, 2, CR_OP_INTER);
-    if (ret)
-        goto fail;
-    /* convert to upper case */
-    /* XXX: the generic unicode case would be much more complicated
-       and not really useful */
-    for(i = 0; i < a.len; i++) {
-        a.points[i] += 'A' - 'a';
-    }
-    /* Note: for simplicity we keep the lower case ranges */
-    ret = cr_union1(cr, a.points, a.len);
- fail:
-    cr_free(&a);
-    return ret;
-}
-
 #ifdef DUMP_REOP
 static __maybe_unused void lre_dump_bytecode(const uint8_t *buf,
                                                      int buf_len)
@@ -922,7 +867,7 @@ static int re_parse_char_class(REParseState *s, const uint8_t **pp)
         }
     }
     if (s->ignore_case) {
-        if (cr_canonicalize(cr))
+        if (cr_regexp_canonicalize(cr, s->is_utf16))
             goto memory_error;
     }
     if (invert) {
diff --git a/lib/quickjs/libunicode-table.h b/lib/quickjs/libunicode-table.h
index b64178b4..513ed94b 100644
--- a/lib/quickjs/libunicode-table.h
+++ b/lib/quickjs/libunicode-table.h
@@ -3779,72 +3779,70 @@ static const uint8_t unicode_prop_Changes_When_Titlecased1_table[22] = {
     0x8b, 0x80, 0x8e, 0x80, 0xae, 0x80,
 };
 
-static const uint8_t unicode_prop_Changes_When_Casefolded1_table[33] = {
-    0x40, 0xde, 0x80, 0xcf, 0x80, 0x97, 0x80, 0x44,
-    0x3c, 0x80, 0x59, 0x11, 0x80, 0x40, 0xe4, 0x3f,
-    0x3f, 0x87, 0x89, 0x11, 0x05, 0x02, 0x11, 0x80,
-    0xa9, 0x11, 0x80, 0x60, 0xdb, 0x07, 0x86, 0x8b,
-    0x84,
+static const uint8_t unicode_prop_Changes_When_Casefolded1_table[29] = {
+    0x41, 0xef, 0x80, 0x41, 0x9e, 0x80, 0x9e, 0x80,
+    0x5a, 0xe4, 0x83, 0x40, 0xb5, 0x00, 0x00, 0x00,
+    0x80, 0xde, 0x06, 0x06, 0x80, 0x8a, 0x09, 0x81,
+    0x89, 0x10, 0x81, 0x8d, 0x80,
 };
 
-static const uint8_t unicode_prop_Changes_When_NFKC_Casefolded1_table[451] = {
+static const uint8_t unicode_prop_Changes_When_NFKC_Casefolded1_table[447] = {
     0x40, 0x9f, 0x06, 0x00, 0x01, 0x00, 0x01, 0x12,
-    0x10, 0x82, 0x9f, 0x80, 0xcf, 0x01, 0x80, 0x8b,
-    0x07, 0x80, 0xfb, 0x01, 0x01, 0x80, 0xa5, 0x80,
-    0x40, 0xbb, 0x88, 0x9e, 0x29, 0x84, 0xda, 0x08,
-    0x81, 0x89, 0x80, 0xa3, 0x04, 0x02, 0x04, 0x08,
-    0x80, 0xc9, 0x82, 0x9c, 0x80, 0x41, 0x93, 0x80,
-    0x40, 0x93, 0x80, 0xd7, 0x83, 0x42, 0xde, 0x87,
-    0xfb, 0x08, 0x80, 0xd2, 0x01, 0x80, 0xa1, 0x11,
-    0x80, 0x40, 0xfc, 0x81, 0x42, 0xd4, 0x80, 0xfe,
-    0x80, 0xa7, 0x81, 0xad, 0x80, 0xb5, 0x80, 0x88,
-    0x03, 0x03, 0x03, 0x80, 0x8b, 0x80, 0x88, 0x00,
-    0x26, 0x80, 0x90, 0x80, 0x88, 0x03, 0x03, 0x03,
-    0x80, 0x8b, 0x80, 0x41, 0x41, 0x80, 0xe1, 0x81,
-    0x46, 0x52, 0x81, 0xd4, 0x84, 0x45, 0x1b, 0x10,
-    0x8a, 0x80, 0x91, 0x80, 0x9b, 0x8c, 0x80, 0xa1,
-    0xa4, 0x40, 0xd9, 0x80, 0x40, 0xd5, 0x00, 0x00,
-    0x00, 0x00, 0x00, 0x00, 0x01, 0x3f, 0x3f, 0x87,
-    0x89, 0x11, 0x04, 0x00, 0x29, 0x04, 0x12, 0x80,
-    0x88, 0x12, 0x80, 0x88, 0x11, 0x11, 0x04, 0x08,
-    0x8f, 0x00, 0x20, 0x8b, 0x12, 0x2a, 0x08, 0x0b,
-    0x00, 0x07, 0x82, 0x8c, 0x06, 0x92, 0x81, 0x9a,
-    0x80, 0x8c, 0x8a, 0x80, 0xd6, 0x18, 0x10, 0x8a,
-    0x01, 0x0c, 0x0a, 0x00, 0x10, 0x11, 0x02, 0x06,
-    0x05, 0x1c, 0x85, 0x8f, 0x8f, 0x8f, 0x88, 0x80,
-    0x40, 0xa1, 0x08, 0x81, 0x40, 0xf7, 0x81, 0x41,
-    0x34, 0xd5, 0x99, 0x9a, 0x45, 0x20, 0x80, 0xe6,
-    0x82, 0xe4, 0x80, 0x41, 0x9e, 0x81, 0x40, 0xf0,
-    0x80, 0x41, 0x2e, 0x80, 0xd2, 0x80, 0x8b, 0x40,
-    0xd5, 0xa9, 0x80, 0xb4, 0x00, 0x82, 0xdf, 0x09,
-    0x80, 0xde, 0x80, 0xb0, 0xdd, 0x82, 0x8d, 0xdf,
-    0x9e, 0x80, 0xa7, 0x87, 0xae, 0x80, 0x41, 0x7f,
-    0x60, 0x72, 0x9b, 0x81, 0x40, 0xd1, 0x80, 0x40,
-    0x80, 0x12, 0x81, 0x43, 0x61, 0x83, 0x88, 0x80,
-    0x60, 0x4d, 0x95, 0x41, 0x0d, 0x08, 0x00, 0x81,
-    0x89, 0x00, 0x00, 0x09, 0x82, 0xc3, 0x81, 0xe9,
-    0xa5, 0x86, 0x8b, 0x24, 0x00, 0x97, 0x04, 0x00,
-    0x01, 0x01, 0x80, 0xeb, 0xa0, 0x41, 0x6a, 0x91,
-    0xbf, 0x81, 0xb5, 0xa7, 0x8c, 0x82, 0x99, 0x95,
-    0x94, 0x81, 0x8b, 0x80, 0x92, 0x03, 0x1a, 0x00,
-    0x80, 0x40, 0x86, 0x08, 0x80, 0x9f, 0x99, 0x40,
-    0x83, 0x15, 0x0d, 0x0d, 0x0a, 0x16, 0x06, 0x80,
-    0x88, 0x47, 0x87, 0x20, 0xa9, 0x80, 0x88, 0x60,
-    0xb4, 0xe4, 0x83, 0x54, 0xb9, 0x86, 0x8d, 0x87,
-    0xbf, 0x85, 0x42, 0x3e, 0xd4, 0x80, 0xc6, 0x01,
-    0x08, 0x09, 0x0b, 0x80, 0x8b, 0x00, 0x06, 0x80,
-    0xc0, 0x03, 0x0f, 0x06, 0x80, 0x9b, 0x03, 0x04,
-    0x00, 0x16, 0x80, 0x41, 0x53, 0x81, 0x41, 0x23,
-    0x81, 0xb1, 0x48, 0x2f, 0xbd, 0x4d, 0x91, 0x18,
-    0x9a, 0x01, 0x00, 0x08, 0x80, 0x89, 0x03, 0x00,
-    0x00, 0x28, 0x18, 0x00, 0x00, 0x02, 0x01, 0x00,
-    0x08, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x0b,
-    0x06, 0x03, 0x03, 0x00, 0x80, 0x89, 0x80, 0x90,
-    0x22, 0x04, 0x80, 0x90, 0x42, 0x43, 0x8a, 0x84,
-    0x9e, 0x80, 0x9f, 0x99, 0x82, 0xa2, 0x80, 0xee,
-    0x82, 0x8c, 0xab, 0x83, 0x88, 0x31, 0x49, 0x9d,
-    0x89, 0x60, 0xfc, 0x05, 0x42, 0x1d, 0x6b, 0x05,
-    0xe1, 0x4f, 0xff,
+    0x10, 0x82, 0xf3, 0x80, 0x8b, 0x80, 0x40, 0x84,
+    0x01, 0x01, 0x80, 0xa2, 0x01, 0x80, 0x40, 0xbb,
+    0x88, 0x9e, 0x29, 0x84, 0xda, 0x08, 0x81, 0x89,
+    0x80, 0xa3, 0x04, 0x02, 0x04, 0x08, 0x07, 0x80,
+    0x9e, 0x80, 0xa0, 0x82, 0x9c, 0x80, 0x42, 0x28,
+    0x80, 0xd7, 0x83, 0x42, 0xde, 0x87, 0xfb, 0x08,
+    0x80, 0xd2, 0x01, 0x80, 0xa1, 0x11, 0x80, 0x40,
+    0xfc, 0x81, 0x42, 0xd4, 0x80, 0xfe, 0x80, 0xa7,
+    0x81, 0xad, 0x80, 0xb5, 0x80, 0x88, 0x03, 0x03,
+    0x03, 0x80, 0x8b, 0x80, 0x88, 0x00, 0x26, 0x80,
+    0x90, 0x80, 0x88, 0x03, 0x03, 0x03, 0x80, 0x8b,
+    0x80, 0x41, 0x41, 0x80, 0xe1, 0x81, 0x46, 0x52,
+    0x81, 0xd4, 0x84, 0x45, 0x1b, 0x10, 0x8a, 0x80,
+    0x91, 0x80, 0x9b, 0x8c, 0x80, 0xa1, 0xa4, 0x40,
+    0xd5, 0x83, 0x40, 0xb5, 0x00, 0x00, 0x00, 0x80,
+    0x99, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80,
+    0xb7, 0x05, 0x00, 0x13, 0x05, 0x11, 0x02, 0x0c,
+    0x11, 0x00, 0x00, 0x0c, 0x15, 0x05, 0x08, 0x8f,
+    0x00, 0x20, 0x8b, 0x12, 0x2a, 0x08, 0x0b, 0x00,
+    0x07, 0x82, 0x8c, 0x06, 0x92, 0x81, 0x9a, 0x80,
+    0x8c, 0x8a, 0x80, 0xd6, 0x18, 0x10, 0x8a, 0x01,
+    0x0c, 0x0a, 0x00, 0x10, 0x11, 0x02, 0x06, 0x05,
+    0x1c, 0x85, 0x8f, 0x8f, 0x8f, 0x88, 0x80, 0x40,
+    0xa1, 0x08, 0x81, 0x40, 0xf7, 0x81, 0x41, 0x34,
+    0xd5, 0x99, 0x9a, 0x45, 0x20, 0x80, 0xe6, 0x82,
+    0xe4, 0x80, 0x41, 0x9e, 0x81, 0x40, 0xf0, 0x80,
+    0x41, 0x2e, 0x80, 0xd2, 0x80, 0x8b, 0x40, 0xd5,
+    0xa9, 0x80, 0xb4, 0x00, 0x82, 0xdf, 0x09, 0x80,
+    0xde, 0x80, 0xb0, 0xdd, 0x82, 0x8d, 0xdf, 0x9e,
+    0x80, 0xa7, 0x87, 0xae, 0x80, 0x41, 0x7f, 0x60,
+    0x72, 0x9b, 0x81, 0x40, 0xd1, 0x80, 0x40, 0x80,
+    0x12, 0x81, 0x43, 0x61, 0x83, 0x88, 0x80, 0x60,
+    0x4d, 0x95, 0x41, 0x0d, 0x08, 0x00, 0x81, 0x89,
+    0x00, 0x00, 0x09, 0x82, 0xc3, 0x81, 0xe9, 0xc2,
+    0x00, 0x97, 0x04, 0x00, 0x01, 0x01, 0x80, 0xeb,
+    0xa0, 0x41, 0x6a, 0x91, 0xbf, 0x81, 0xb5, 0xa7,
+    0x8c, 0x82, 0x99, 0x95, 0x94, 0x81, 0x8b, 0x80,
+    0x92, 0x03, 0x1a, 0x00, 0x80, 0x40, 0x86, 0x08,
+    0x80, 0x9f, 0x99, 0x40, 0x83, 0x15, 0x0d, 0x0d,
+    0x0a, 0x16, 0x06, 0x80, 0x88, 0x47, 0x87, 0x20,
+    0xa9, 0x80, 0x88, 0x60, 0xb4, 0xe4, 0x83, 0x54,
+    0xb9, 0x86, 0x8d, 0x87, 0xbf, 0x85, 0x42, 0x3e,
+    0xd4, 0x80, 0xc6, 0x01, 0x08, 0x09, 0x0b, 0x80,
+    0x8b, 0x00, 0x06, 0x80, 0xc0, 0x03, 0x0f, 0x06,
+    0x80, 0x9b, 0x03, 0x04, 0x00, 0x16, 0x80, 0x41,
+    0x53, 0x81, 0x41, 0x23, 0x81, 0xb1, 0x48, 0x2f,
+    0xbd, 0x4d, 0x91, 0x18, 0x9a, 0x01, 0x00, 0x08,
+    0x80, 0x89, 0x03, 0x00, 0x00, 0x28, 0x18, 0x00,
+    0x00, 0x02, 0x01, 0x00, 0x08, 0x00, 0x00, 0x00,
+    0x00, 0x01, 0x00, 0x0b, 0x06, 0x03, 0x03, 0x00,
+    0x80, 0x89, 0x80, 0x90, 0x22, 0x04, 0x80, 0x90,
+    0x42, 0x43, 0x8a, 0x84, 0x9e, 0x80, 0x9f, 0x99,
+    0x82, 0xa2, 0x80, 0xee, 0x82, 0x8c, 0xab, 0x83,
+    0x88, 0x31, 0x49, 0x9d, 0x89, 0x60, 0xfc, 0x05,
+    0x42, 0x1d, 0x6b, 0x05, 0xe1, 0x4f, 0xff,
 };
 
 static const uint8_t unicode_prop_ASCII_Hex_Digit_table[5] = {
diff --git a/lib/quickjs/libunicode.c b/lib/quickjs/libunicode.c
index 63c12a07..0712a6c0 100644
--- a/lib/quickjs/libunicode.c
+++ b/lib/quickjs/libunicode.c
@@ -43,11 +43,111 @@ enum {
     RUN_TYPE_UF_D1_EXT,
     RUN_TYPE_U_EXT,
     RUN_TYPE_LF_EXT,
-    RUN_TYPE_U_EXT2,
-    RUN_TYPE_L_EXT2,
-    RUN_TYPE_U_EXT3,
+    RUN_TYPE_UF_EXT2,
+    RUN_TYPE_LF_EXT2,
+    RUN_TYPE_UF_EXT3,
 };
 
+static int lre_case_conv1(uint32_t c, int conv_type)
+{
+    uint32_t res[LRE_CC_RES_LEN_MAX];
+    lre_case_conv(res, c, conv_type);
+    return res[0];
+}
+
+/* case conversion using the table entry 'idx' with value 'v' */
+static int lre_case_conv_entry(uint32_t *res, uint32_t c, int conv_type, uint32_t idx, uint32_t v)
+{
+    uint32_t code, data, type, a, is_lower;
+    is_lower = (conv_type != 0);
+    type = (v >> (32 - 17 - 7 - 4)) & 0xf;
+    data = ((v & 0xf) << 8) | case_conv_table2[idx];
+    code = v >> (32 - 17);
+    switch(type) {
+    case RUN_TYPE_U:
+    case RUN_TYPE_L:
+    case RUN_TYPE_UF:
+    case RUN_TYPE_LF:
+        if (conv_type == (type & 1) ||
+            (type >= RUN_TYPE_UF && conv_type == 2)) {
+            c = c - code + (case_conv_table1[data] >> (32 - 17));
+        }
+        break;
+    case RUN_TYPE_UL:
+        a = c - code;
+        if ((a & 1) != (1 - is_lower))
+            break;
+        c = (a ^ 1) + code;
+        break;
+    case RUN_TYPE_LSU:
+        a = c - code;
+        if (a == 1) {
+            c += 2 * is_lower - 1;
+        } else if (a == (1 - is_lower) * 2) {
+            c += (2 * is_lower - 1) * 2;
+        }
+        break;
+    case RUN_TYPE_U2L_399_EXT2:
+        if (!is_lower) {
+            res[0] = c - code + case_conv_ext[data >> 6];
+            res[1] = 0x399;
+            return 2;
+        } else {
+            c = c - code + case_conv_ext[data & 0x3f];
+        }
+        break;
+    case RUN_TYPE_UF_D20:
+        if (conv_type == 1)
+            break;
+        c = data + (conv_type == 2) * 0x20;
+        break;
+    case RUN_TYPE_UF_D1_EXT:
+        if (conv_type == 1)
+            break;
+        c = case_conv_ext[data] + (conv_type == 2);
+        break;
+    case RUN_TYPE_U_EXT:
+    case RUN_TYPE_LF_EXT:
+        if (is_lower != (type - RUN_TYPE_U_EXT))
+            break;
+        c = case_conv_ext[data];
+        break;
+    case RUN_TYPE_LF_EXT2:
+        if (!is_lower)
+            break;
+        res[0] = c - code + case_conv_ext[data >> 6];
+        res[1] = case_conv_ext[data & 0x3f];
+        return 2;
+    case RUN_TYPE_UF_EXT2:
+        if (conv_type == 1)
+            break;
+        res[0] = c - code + case_conv_ext[data >> 6];
+        res[1] = case_conv_ext[data & 0x3f];
+        if (conv_type == 2) {
+            /* convert to lower */
+            res[0] = lre_case_conv1(res[0], 1);
+            res[1] = lre_case_conv1(res[1], 1);
+        }
+        return 2;
+    default:
+    case RUN_TYPE_UF_EXT3:
+        if (conv_type == 1)
+            break;
+        res[0] = case_conv_ext[data >> 8];
+        res[1] = case_conv_ext[(data >> 4) & 0xf];
+        res[2] = case_conv_ext[data & 0xf];
+        if (conv_type == 2) {
+            /* convert to lower */
+            res[0] = lre_case_conv1(res[0], 1);
+            res[1] = lre_case_conv1(res[1], 1);
+            res[2] = lre_case_conv1(res[2], 1);
+        }
+        return 3;
+    }
+    res[0] = c;
+    return 1;
+}
+
 /* conv_type:
    0 = to upper 
    1 = to lower
@@ -66,10 +166,9 @@ int lre_case_conv(uint32_t *res, uint32_t c, int conv_type)
             }
         }
     } else {
-        uint32_t v, code, data, type, len, a, is_lower;
+        uint32_t v, code, len;
         int idx, idx_min, idx_max;
         
-        is_lower = (conv_type != 0);
         idx_min = 0;
         idx_max = countof(case_conv_table1) - 1;
         while (idx_min <= idx_max) {
@@ -82,74 +181,7 @@ int lre_case_conv(uint32_t *res, uint32_t c, int conv_type)
             } else if (c >= code + len) {
                 idx_min = idx + 1;
             } else {
-                type = (v >> (32 - 17 - 7 - 4)) & 0xf;
-                data = ((v & 0xf) << 8) | case_conv_table2[idx];
-                switch(type) {
-                case RUN_TYPE_U:
-                case RUN_TYPE_L:
-                case RUN_TYPE_UF:
-                case RUN_TYPE_LF:
-                    if (conv_type == (type & 1) ||
-                        (type >= RUN_TYPE_UF && conv_type == 2)) {
-                        c = c - code + (case_conv_table1[data] >> (32 - 17));
-                    }
-                    break;
-                case RUN_TYPE_UL:
-                    a = c - code;
-                    if ((a & 1) != (1 - is_lower))
-                        break;
-                    c = (a ^ 1) + code;
-                    break;
-                case RUN_TYPE_LSU:
-                    a = c - code;
-                    if (a == 1) {
-                        c += 2 * is_lower - 1;
-                    } else if (a == (1 - is_lower) * 2) {
-                        c += (2 * is_lower - 1) * 2;
-                    }
-                    break;
-                case RUN_TYPE_U2L_399_EXT2:
-                    if (!is_lower) {
-                        res[0] = c - code + case_conv_ext[data >> 6];
-                        res[1] = 0x399;
-                        return 2;
-                    } else {
-                        c = c - code + case_conv_ext[data & 0x3f];
-                    }
-                    break;
-                case RUN_TYPE_UF_D20:
-                    if (conv_type == 1)
-                        break;
-                    c = data + (conv_type == 2) * 0x20;
-                    break;
-                case RUN_TYPE_UF_D1_EXT:
-                    if (conv_type == 1)
-                        break;
-                    c = case_conv_ext[data] + (conv_type == 2);
-                    break;
-                case RUN_TYPE_U_EXT:
-                case RUN_TYPE_LF_EXT:
-                    if (is_lower != (type - RUN_TYPE_U_EXT))
-                        break;
-                    c = case_conv_ext[data];
-                    break;
-                case RUN_TYPE_U_EXT2:
-                case RUN_TYPE_L_EXT2:
-                    if (conv_type != (type - RUN_TYPE_U_EXT2))
-                        break;
-                    res[0] = c - code + case_conv_ext[data >> 6];
-                    res[1] = case_conv_ext[data & 0x3f];
-                    return 2;
-                default:
-                case RUN_TYPE_U_EXT3:
-                    if (conv_type != 0)
-                        break;
-                    res[0] = case_conv_ext[data >> 8];
-                    res[1] = case_conv_ext[(data >> 4) & 0xf];
-                    res[2] = case_conv_ext[data & 0xf];
-                    return 3;
-                }
-                break;
+                return lre_case_conv_entry(res, c, conv_type, idx, v);
             }
         }
     }
@@ -157,6 +189,77 @@ int lre_case_conv(uint32_t *res, uint32_t c, int conv_type)
     return 1;
 }
 
+static int lre_case_folding_entry(uint32_t c, uint32_t idx, uint32_t v, BOOL is_unicode)
+{
+    uint32_t res[LRE_CC_RES_LEN_MAX];
+    int len;
+
+    if (is_unicode) {
+        len = lre_case_conv_entry(res, c, 2, idx, v);
+        if (len == 1) {
+            c = res[0];
+        } else {
+            /* handle the few specific multi-character cases (see
+               unicode_gen.c:dump_case_folding_special_cases()) */
+            if (c == 0xfb06) {
+                c = 0xfb05;
+            } else if (c == 0x01fd3) {
+                c = 0x390;
+            } else if (c == 0x01fe3) {
+                c = 0x3b0;
+            }
+        }
+    } else {
+        if (likely(c < 128)) {
+            if (c >= 'a' && c <= 'z')
+                c = c - 'a' + 'A';
+        } else {
+            /* legacy regexp: to upper case if single char >= 128 */
+            len = lre_case_conv_entry(res, c, FALSE, idx, v);
+            if (len == 1 && res[0] >= 128)
+                c = res[0];
+        }
+    }
+    return c;
+}
+
+/* JS regexp specific rules for case folding */
+int lre_canonicalize(uint32_t c, BOOL is_unicode)
+{
+    if (c < 128) {
+        /* fast case */
+        if (is_unicode) {
+            if (c >= 'A' && c <= 'Z') {
+                c = c - 'A' + 'a';
+            }
+        } else {
+            if (c >= 'a' && c <= 'z') {
+                c = c - 'a' + 'A';
+            }
+        }
+    } else {
+        uint32_t v, code, len;
+        int idx, idx_min, idx_max;
+        
+        idx_min = 0;
+        idx_max = countof(case_conv_table1) - 1;
+        while (idx_min <= idx_max) {
+            idx = (unsigned)(idx_max + idx_min) / 2;
+            v = case_conv_table1[idx];
+            code = v >> (32 - 17);
+            len = (v >> (32 - 17 - 7)) & 0x7f;
+            if (c < code) {
+                idx_max = idx - 1;
+            } else if (c >= code + len) {
+                idx_min = idx + 1;
+            } else {
+                return lre_case_folding_entry(c, idx, v, is_unicode);
+            }
+        }
+    }
+    return c;
+}
+
 static uint32_t get_le24(const uint8_t *ptr)
 {
 #if defined(__x86__) || defined(__x86_64__)
@@ -1179,11 +1282,11 @@ static int unicode_case1(CharRange *cr, int case_mask)
 #define MR(x) (1 << RUN_TYPE_ ## x)
     const uint32_t tab_run_mask[3] = {
         MR(U) | MR(UF) | MR(UL) | MR(LSU) | MR(U2L_399_EXT2) | MR(UF_D20) |
-        MR(UF_D1_EXT) | MR(U_EXT) | MR(U_EXT2) | MR(U_EXT3),
+        MR(UF_D1_EXT) | MR(U_EXT) | MR(UF_EXT2) | MR(UF_EXT3),
 
-        MR(L) | MR(LF) | MR(UL) | MR(LSU) | MR(U2L_399_EXT2) | MR(LF_EXT) | MR(L_EXT2),
+        MR(L) | MR(LF) | MR(UL) | MR(LSU) | MR(U2L_399_EXT2) | MR(LF_EXT) | MR(LF_EXT2),
 
-        MR(UF) | MR(LF) | MR(UL) | MR(LSU) | MR(U2L_399_EXT2) | MR(LF_EXT) | MR(UF_D20) | MR(UF_D1_EXT) | MR(LF_EXT),
+        MR(UF) | MR(LF) | MR(UL) | MR(LSU) | MR(U2L_399_EXT2) | MR(LF_EXT) | MR(LF_EXT2) | MR(UF_D20) | MR(UF_D1_EXT) | MR(LF_EXT) | MR(UF_EXT2) | MR(UF_EXT3),
     };
 #undef MR
     uint32_t mask, v, code, type, len, i, idx;
@@ -1236,7 +1339,136 @@ static int unicode_case1(CharRange *cr, int case_mask)
     }
     return 0;
 }
-        
+
+static int point_cmp(const void *p1, const void *p2, void *arg)
+{
+    uint32_t v1 = *(uint32_t *)p1;
+    uint32_t v2 = *(uint32_t *)p2;
+    return (v1 > v2) - (v1 < v2);
+}
+
+static void cr_sort_and_remove_overlap(CharRange *cr)
+{
+    uint32_t start, end, start1, end1, i, j;
+    
+    /* the resulting ranges are not necessarily sorted and may overlap */
+    rqsort(cr->points, cr->len / 2, sizeof(cr->points[0]) * 2, point_cmp, NULL);
+    j = 0;
+    for(i = 0; i < cr->len; ) {
+        start = cr->points[i];
+        end = cr->points[i + 1];
+        i += 2;
+        while (i < cr->len) {
+            start1 = cr->points[i];
+            end1 = cr->points[i + 1];
+            if (start1 > end) {
+                /* |------|
+                 *           |-------| */
+                break;
+            } else if (end1 <= end) {
+                /* |------|
+                 *    |--| */
+                i += 2;
+            } else {
+                /* |------|
+                 *     |-------| */
+                end = end1;
+                i += 2;
+            }
+        }
+        cr->points[j] = start;
+        cr->points[j + 1] = end;
+        j += 2;
+    }
+    cr->len = j;
+}
+
+/* canonicalize a character set using the JS regex case folding rules
+   (see lre_canonicalize()) */
+int cr_regexp_canonicalize(CharRange *cr, BOOL is_unicode)
+{
+    CharRange cr_inter, cr_mask, cr_result, cr_sub;
+    uint32_t v, code, len, i, idx, start, end, c, d_start, d_end, d;
+    
+    cr_init(&cr_mask, cr->mem_opaque, cr->realloc_func);
+    cr_init(&cr_inter, cr->mem_opaque, cr->realloc_func);
+    cr_init(&cr_result, cr->mem_opaque, cr->realloc_func);
+    cr_init(&cr_sub, cr->mem_opaque, cr->realloc_func);
+
+    if (unicode_case1(&cr_mask, is_unicode ? CASE_F : CASE_U))
+        goto fail;
+    if (cr_op(&cr_inter, cr_mask.points, cr_mask.len, cr->points, cr->len, CR_OP_INTER))
+        goto fail;
+
+    if (cr_invert(&cr_mask))
+        goto fail;
+    if (cr_op(&cr_sub, cr_mask.points, cr_mask.len, cr->points, cr->len, CR_OP_INTER))
+        goto fail;
+    
+    /* cr_inter = cr & cr_mask */
+    /* cr_sub = cr & ~cr_mask */
+
+    /* use the case conversion table to compute the result */
+    d_start = -1;
+    d_end = -1;
+    idx = 0;
+    v = case_conv_table1[idx];
+    code = v >> (32 - 17);
+    len = (v >> (32 - 17 - 7)) & 0x7f;
+    for(i = 0; i < cr_inter.len; i += 2) {
+        start = cr_inter.points[i];
+        end = cr_inter.points[i + 1];
+
+        for(c = start; c < end; c++) {
+            for(;;) {
+                if (c >= code && c < code + len)
+                    break;
+                idx++;
+                assert(idx < countof(case_conv_table1));
+                v = case_conv_table1[idx];
+                code = v >> (32 - 17);
+                len = (v >> (32 - 17 - 7)) & 0x7f;
+            }
+            d = lre_case_folding_entry(c, idx, v, is_unicode);
+            /* try to merge with the current interval */
+            if (d_start == -1) {
+                d_start = d;
+                d_end = d + 1;
+            } else if (d_end == d) {
+                d_end++;
+            } else {
+                cr_add_interval(&cr_result, d_start, d_end);
+                d_start = d;
+                d_end = d + 1;
+            }
+        }
+    }
+    if (d_start != -1) {
+        if (cr_add_interval(&cr_result, d_start, d_end))
+            goto fail;
+    }
+
+    /* the resulting ranges are not necessarily sorted and may overlap */
+    cr_sort_and_remove_overlap(&cr_result);
+
+    /* or with the character not affected by the case folding */
+    cr->len = 0;
+    if (cr_op(cr, cr_result.points, cr_result.len, cr_sub.points, cr_sub.len, CR_OP_UNION))
+        goto fail;
+
+    cr_free(&cr_inter);
+    cr_free(&cr_mask);
+    cr_free(&cr_result);
+    cr_free(&cr_sub);
+    return 0;
+ fail:
+    cr_free(&cr_inter);
+    cr_free(&cr_mask);
+    cr_free(&cr_result);
+    cr_free(&cr_sub);
+    return -1;
+}
+
 typedef enum {
     POP_GC,
     POP_PROP,
diff --git a/lib/quickjs/libunicode.h b/lib/quickjs/libunicode.h
index cfa600a5..8abacb01 100644
--- a/lib/quickjs/libunicode.h
+++ b/lib/quickjs/libunicode.h
@@ -41,6 +41,7 @@ typedef enum {
 } UnicodeNormalizationEnum;
 
 int lre_case_conv(uint32_t *res, uint32_t c, int conv_type);
+int lre_canonicalize(uint32_t c, BOOL is_unicode);
 LRE_BOOL lre_is_cased(uint32_t c);
 LRE_BOOL lre_is_case_ignorable(uint32_t c);
 
@@ -101,6 +102,8 @@ int cr_op(CharRange *cr, const uint32_t *a_pt, int a_len,
 
 int cr_invert(CharRange *cr);
 
+int cr_regexp_canonicalize(CharRange *cr, BOOL is_unicode);
+
 #ifdef CONFIG_ALL_UNICODE
 
 LRE_BOOL lre_is_id_start(uint32_t c);