diff options
author | Fabrice Bellard <fabrice@bellard.org> | 2024-01-08 18:42:29 +0100 |
---|---|---|
committer | bptato <nincsnevem662@gmail.com> | 2024-01-11 18:51:43 +0100 |
commit | 3853d2be3916b98d449e785071e370fe5b9a36ee (patch) | |
tree | d291ca65a9151a30d9c31b101e06e942a59ebaa0 /lib | |
parent | 0ecfe8945a5b064db2d2bf3984ece31bcea5e351 (diff) | |
download | chawan-3853d2be3916b98d449e785071e370fe5b9a36ee.tar.gz |
fixed regexp case insensitive flag
Diffstat (limited to 'lib')
-rw-r--r-- | lib/quickjs/libregexp.c | 57 | ||||
-rw-r--r-- | lib/quickjs/libunicode-table.h | 124 | ||||
-rw-r--r-- | lib/quickjs/libunicode.c | 386 | ||||
-rw-r--r-- | lib/quickjs/libunicode.h | 3 |
4 files changed, 374 insertions, 196 deletions
diff --git a/lib/quickjs/libregexp.c b/lib/quickjs/libregexp.c index 05df8653..6c8b57e0 100644 --- a/lib/quickjs/libregexp.c +++ b/lib/quickjs/libregexp.c @@ -34,9 +34,6 @@ /* TODO: - - Add full unicode canonicalize rules for character ranges (not - really useful but needed for exact "ignorecase" compatibility). - - Add a lock step execution mode (=linear time execution guaranteed) when the regular expression is "simple" i.e. no backreference nor complicated lookahead. The opcodes are designed for this execution @@ -120,33 +117,6 @@ static int dbuf_insert(DynBuf *s, int pos, int len) return 0; } -/* canonicalize with the specific JS regexp rules */ -static uint32_t lre_canonicalize(uint32_t c, BOOL is_utf16) -{ - uint32_t res[LRE_CC_RES_LEN_MAX]; - int len; - if (is_utf16) { - if (likely(c < 128)) { - if (c >= 'A' && c <= 'Z') - c = c - 'A' + 'a'; - } else { - lre_case_conv(res, c, 2); - c = res[0]; - } - } else { - if (likely(c < 128)) { - if (c >= 'a' && c <= 'z') - c = c - 'a' + 'A'; - } else { - /* legacy regexp: to upper case if single char >= 128 */ - len = lre_case_conv(res, c, FALSE); - if (len == 1 && res[0] >= 128) - c = res[0]; - } - } - return c; -} - static const uint16_t char_range_d[] = { 1, 0x0030, 0x0039 + 1, @@ -245,31 +215,6 @@ static int cr_init_char_range(REParseState *s, CharRange *cr, uint32_t c) return -1; } -static int cr_canonicalize(CharRange *cr) -{ - CharRange a; - uint32_t pt[2]; - int i, ret; - - cr_init(&a, cr->mem_opaque, lre_realloc); - pt[0] = 'a'; - pt[1] = 'z' + 1; - ret = cr_op(&a, cr->points, cr->len, pt, 2, CR_OP_INTER); - if (ret) - goto fail; - /* convert to upper case */ - /* XXX: the generic unicode case would be much more complicated - and not really useful */ - for(i = 0; i < a.len; i++) { - a.points[i] += 'A' - 'a'; - } - /* Note: for simplicity we keep the lower case ranges */ - ret = cr_union1(cr, a.points, a.len); - fail: - cr_free(&a); - return ret; -} - #ifdef DUMP_REOP static __maybe_unused void lre_dump_bytecode(const uint8_t *buf, int buf_len) @@ -922,7 +867,7 @@ static int re_parse_char_class(REParseState *s, const uint8_t **pp) } } if (s->ignore_case) { - if (cr_canonicalize(cr)) + if (cr_regexp_canonicalize(cr, s->is_utf16)) goto memory_error; } if (invert) { diff --git a/lib/quickjs/libunicode-table.h b/lib/quickjs/libunicode-table.h index b64178b4..513ed94b 100644 --- a/lib/quickjs/libunicode-table.h +++ b/lib/quickjs/libunicode-table.h @@ -3779,72 +3779,70 @@ static const uint8_t unicode_prop_Changes_When_Titlecased1_table[22] = { 0x8b, 0x80, 0x8e, 0x80, 0xae, 0x80, }; -static const uint8_t unicode_prop_Changes_When_Casefolded1_table[33] = { - 0x40, 0xde, 0x80, 0xcf, 0x80, 0x97, 0x80, 0x44, - 0x3c, 0x80, 0x59, 0x11, 0x80, 0x40, 0xe4, 0x3f, - 0x3f, 0x87, 0x89, 0x11, 0x05, 0x02, 0x11, 0x80, - 0xa9, 0x11, 0x80, 0x60, 0xdb, 0x07, 0x86, 0x8b, - 0x84, +static const uint8_t unicode_prop_Changes_When_Casefolded1_table[29] = { + 0x41, 0xef, 0x80, 0x41, 0x9e, 0x80, 0x9e, 0x80, + 0x5a, 0xe4, 0x83, 0x40, 0xb5, 0x00, 0x00, 0x00, + 0x80, 0xde, 0x06, 0x06, 0x80, 0x8a, 0x09, 0x81, + 0x89, 0x10, 0x81, 0x8d, 0x80, }; -static const uint8_t unicode_prop_Changes_When_NFKC_Casefolded1_table[451] = { +static const uint8_t unicode_prop_Changes_When_NFKC_Casefolded1_table[447] = { 0x40, 0x9f, 0x06, 0x00, 0x01, 0x00, 0x01, 0x12, - 0x10, 0x82, 0x9f, 0x80, 0xcf, 0x01, 0x80, 0x8b, - 0x07, 0x80, 0xfb, 0x01, 0x01, 0x80, 0xa5, 0x80, - 0x40, 0xbb, 0x88, 0x9e, 0x29, 0x84, 0xda, 0x08, - 0x81, 0x89, 0x80, 0xa3, 0x04, 0x02, 0x04, 0x08, - 0x80, 0xc9, 0x82, 0x9c, 0x80, 0x41, 0x93, 0x80, - 0x40, 0x93, 0x80, 0xd7, 0x83, 0x42, 0xde, 0x87, - 0xfb, 0x08, 0x80, 0xd2, 0x01, 0x80, 0xa1, 0x11, - 0x80, 0x40, 0xfc, 0x81, 0x42, 0xd4, 0x80, 0xfe, - 0x80, 0xa7, 0x81, 0xad, 0x80, 0xb5, 0x80, 0x88, - 0x03, 0x03, 0x03, 0x80, 0x8b, 0x80, 0x88, 0x00, - 0x26, 0x80, 0x90, 0x80, 0x88, 0x03, 0x03, 0x03, - 0x80, 0x8b, 0x80, 0x41, 0x41, 0x80, 0xe1, 0x81, - 0x46, 0x52, 0x81, 0xd4, 0x84, 0x45, 0x1b, 0x10, - 0x8a, 0x80, 0x91, 0x80, 0x9b, 0x8c, 0x80, 0xa1, - 0xa4, 0x40, 0xd9, 0x80, 0x40, 0xd5, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x01, 0x3f, 0x3f, 0x87, - 0x89, 0x11, 0x04, 0x00, 0x29, 0x04, 0x12, 0x80, - 0x88, 0x12, 0x80, 0x88, 0x11, 0x11, 0x04, 0x08, - 0x8f, 0x00, 0x20, 0x8b, 0x12, 0x2a, 0x08, 0x0b, - 0x00, 0x07, 0x82, 0x8c, 0x06, 0x92, 0x81, 0x9a, - 0x80, 0x8c, 0x8a, 0x80, 0xd6, 0x18, 0x10, 0x8a, - 0x01, 0x0c, 0x0a, 0x00, 0x10, 0x11, 0x02, 0x06, - 0x05, 0x1c, 0x85, 0x8f, 0x8f, 0x8f, 0x88, 0x80, - 0x40, 0xa1, 0x08, 0x81, 0x40, 0xf7, 0x81, 0x41, - 0x34, 0xd5, 0x99, 0x9a, 0x45, 0x20, 0x80, 0xe6, - 0x82, 0xe4, 0x80, 0x41, 0x9e, 0x81, 0x40, 0xf0, - 0x80, 0x41, 0x2e, 0x80, 0xd2, 0x80, 0x8b, 0x40, - 0xd5, 0xa9, 0x80, 0xb4, 0x00, 0x82, 0xdf, 0x09, - 0x80, 0xde, 0x80, 0xb0, 0xdd, 0x82, 0x8d, 0xdf, - 0x9e, 0x80, 0xa7, 0x87, 0xae, 0x80, 0x41, 0x7f, - 0x60, 0x72, 0x9b, 0x81, 0x40, 0xd1, 0x80, 0x40, - 0x80, 0x12, 0x81, 0x43, 0x61, 0x83, 0x88, 0x80, - 0x60, 0x4d, 0x95, 0x41, 0x0d, 0x08, 0x00, 0x81, - 0x89, 0x00, 0x00, 0x09, 0x82, 0xc3, 0x81, 0xe9, - 0xa5, 0x86, 0x8b, 0x24, 0x00, 0x97, 0x04, 0x00, - 0x01, 0x01, 0x80, 0xeb, 0xa0, 0x41, 0x6a, 0x91, - 0xbf, 0x81, 0xb5, 0xa7, 0x8c, 0x82, 0x99, 0x95, - 0x94, 0x81, 0x8b, 0x80, 0x92, 0x03, 0x1a, 0x00, - 0x80, 0x40, 0x86, 0x08, 0x80, 0x9f, 0x99, 0x40, - 0x83, 0x15, 0x0d, 0x0d, 0x0a, 0x16, 0x06, 0x80, - 0x88, 0x47, 0x87, 0x20, 0xa9, 0x80, 0x88, 0x60, - 0xb4, 0xe4, 0x83, 0x54, 0xb9, 0x86, 0x8d, 0x87, - 0xbf, 0x85, 0x42, 0x3e, 0xd4, 0x80, 0xc6, 0x01, - 0x08, 0x09, 0x0b, 0x80, 0x8b, 0x00, 0x06, 0x80, - 0xc0, 0x03, 0x0f, 0x06, 0x80, 0x9b, 0x03, 0x04, - 0x00, 0x16, 0x80, 0x41, 0x53, 0x81, 0x41, 0x23, - 0x81, 0xb1, 0x48, 0x2f, 0xbd, 0x4d, 0x91, 0x18, - 0x9a, 0x01, 0x00, 0x08, 0x80, 0x89, 0x03, 0x00, - 0x00, 0x28, 0x18, 0x00, 0x00, 0x02, 0x01, 0x00, - 0x08, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x0b, - 0x06, 0x03, 0x03, 0x00, 0x80, 0x89, 0x80, 0x90, - 0x22, 0x04, 0x80, 0x90, 0x42, 0x43, 0x8a, 0x84, - 0x9e, 0x80, 0x9f, 0x99, 0x82, 0xa2, 0x80, 0xee, - 0x82, 0x8c, 0xab, 0x83, 0x88, 0x31, 0x49, 0x9d, - 0x89, 0x60, 0xfc, 0x05, 0x42, 0x1d, 0x6b, 0x05, - 0xe1, 0x4f, 0xff, + 0x10, 0x82, 0xf3, 0x80, 0x8b, 0x80, 0x40, 0x84, + 0x01, 0x01, 0x80, 0xa2, 0x01, 0x80, 0x40, 0xbb, + 0x88, 0x9e, 0x29, 0x84, 0xda, 0x08, 0x81, 0x89, + 0x80, 0xa3, 0x04, 0x02, 0x04, 0x08, 0x07, 0x80, + 0x9e, 0x80, 0xa0, 0x82, 0x9c, 0x80, 0x42, 0x28, + 0x80, 0xd7, 0x83, 0x42, 0xde, 0x87, 0xfb, 0x08, + 0x80, 0xd2, 0x01, 0x80, 0xa1, 0x11, 0x80, 0x40, + 0xfc, 0x81, 0x42, 0xd4, 0x80, 0xfe, 0x80, 0xa7, + 0x81, 0xad, 0x80, 0xb5, 0x80, 0x88, 0x03, 0x03, + 0x03, 0x80, 0x8b, 0x80, 0x88, 0x00, 0x26, 0x80, + 0x90, 0x80, 0x88, 0x03, 0x03, 0x03, 0x80, 0x8b, + 0x80, 0x41, 0x41, 0x80, 0xe1, 0x81, 0x46, 0x52, + 0x81, 0xd4, 0x84, 0x45, 0x1b, 0x10, 0x8a, 0x80, + 0x91, 0x80, 0x9b, 0x8c, 0x80, 0xa1, 0xa4, 0x40, + 0xd5, 0x83, 0x40, 0xb5, 0x00, 0x00, 0x00, 0x80, + 0x99, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, + 0xb7, 0x05, 0x00, 0x13, 0x05, 0x11, 0x02, 0x0c, + 0x11, 0x00, 0x00, 0x0c, 0x15, 0x05, 0x08, 0x8f, + 0x00, 0x20, 0x8b, 0x12, 0x2a, 0x08, 0x0b, 0x00, + 0x07, 0x82, 0x8c, 0x06, 0x92, 0x81, 0x9a, 0x80, + 0x8c, 0x8a, 0x80, 0xd6, 0x18, 0x10, 0x8a, 0x01, + 0x0c, 0x0a, 0x00, 0x10, 0x11, 0x02, 0x06, 0x05, + 0x1c, 0x85, 0x8f, 0x8f, 0x8f, 0x88, 0x80, 0x40, + 0xa1, 0x08, 0x81, 0x40, 0xf7, 0x81, 0x41, 0x34, + 0xd5, 0x99, 0x9a, 0x45, 0x20, 0x80, 0xe6, 0x82, + 0xe4, 0x80, 0x41, 0x9e, 0x81, 0x40, 0xf0, 0x80, + 0x41, 0x2e, 0x80, 0xd2, 0x80, 0x8b, 0x40, 0xd5, + 0xa9, 0x80, 0xb4, 0x00, 0x82, 0xdf, 0x09, 0x80, + 0xde, 0x80, 0xb0, 0xdd, 0x82, 0x8d, 0xdf, 0x9e, + 0x80, 0xa7, 0x87, 0xae, 0x80, 0x41, 0x7f, 0x60, + 0x72, 0x9b, 0x81, 0x40, 0xd1, 0x80, 0x40, 0x80, + 0x12, 0x81, 0x43, 0x61, 0x83, 0x88, 0x80, 0x60, + 0x4d, 0x95, 0x41, 0x0d, 0x08, 0x00, 0x81, 0x89, + 0x00, 0x00, 0x09, 0x82, 0xc3, 0x81, 0xe9, 0xc2, + 0x00, 0x97, 0x04, 0x00, 0x01, 0x01, 0x80, 0xeb, + 0xa0, 0x41, 0x6a, 0x91, 0xbf, 0x81, 0xb5, 0xa7, + 0x8c, 0x82, 0x99, 0x95, 0x94, 0x81, 0x8b, 0x80, + 0x92, 0x03, 0x1a, 0x00, 0x80, 0x40, 0x86, 0x08, + 0x80, 0x9f, 0x99, 0x40, 0x83, 0x15, 0x0d, 0x0d, + 0x0a, 0x16, 0x06, 0x80, 0x88, 0x47, 0x87, 0x20, + 0xa9, 0x80, 0x88, 0x60, 0xb4, 0xe4, 0x83, 0x54, + 0xb9, 0x86, 0x8d, 0x87, 0xbf, 0x85, 0x42, 0x3e, + 0xd4, 0x80, 0xc6, 0x01, 0x08, 0x09, 0x0b, 0x80, + 0x8b, 0x00, 0x06, 0x80, 0xc0, 0x03, 0x0f, 0x06, + 0x80, 0x9b, 0x03, 0x04, 0x00, 0x16, 0x80, 0x41, + 0x53, 0x81, 0x41, 0x23, 0x81, 0xb1, 0x48, 0x2f, + 0xbd, 0x4d, 0x91, 0x18, 0x9a, 0x01, 0x00, 0x08, + 0x80, 0x89, 0x03, 0x00, 0x00, 0x28, 0x18, 0x00, + 0x00, 0x02, 0x01, 0x00, 0x08, 0x00, 0x00, 0x00, + 0x00, 0x01, 0x00, 0x0b, 0x06, 0x03, 0x03, 0x00, + 0x80, 0x89, 0x80, 0x90, 0x22, 0x04, 0x80, 0x90, + 0x42, 0x43, 0x8a, 0x84, 0x9e, 0x80, 0x9f, 0x99, + 0x82, 0xa2, 0x80, 0xee, 0x82, 0x8c, 0xab, 0x83, + 0x88, 0x31, 0x49, 0x9d, 0x89, 0x60, 0xfc, 0x05, + 0x42, 0x1d, 0x6b, 0x05, 0xe1, 0x4f, 0xff, }; static const uint8_t unicode_prop_ASCII_Hex_Digit_table[5] = { diff --git a/lib/quickjs/libunicode.c b/lib/quickjs/libunicode.c index 63c12a07..0712a6c0 100644 --- a/lib/quickjs/libunicode.c +++ b/lib/quickjs/libunicode.c @@ -43,11 +43,111 @@ enum { RUN_TYPE_UF_D1_EXT, RUN_TYPE_U_EXT, RUN_TYPE_LF_EXT, - RUN_TYPE_U_EXT2, - RUN_TYPE_L_EXT2, - RUN_TYPE_U_EXT3, + RUN_TYPE_UF_EXT2, + RUN_TYPE_LF_EXT2, + RUN_TYPE_UF_EXT3, }; +static int lre_case_conv1(uint32_t c, int conv_type) +{ + uint32_t res[LRE_CC_RES_LEN_MAX]; + lre_case_conv(res, c, conv_type); + return res[0]; +} + +/* case conversion using the table entry 'idx' with value 'v' */ +static int lre_case_conv_entry(uint32_t *res, uint32_t c, int conv_type, uint32_t idx, uint32_t v) +{ + uint32_t code, data, type, a, is_lower; + is_lower = (conv_type != 0); + type = (v >> (32 - 17 - 7 - 4)) & 0xf; + data = ((v & 0xf) << 8) | case_conv_table2[idx]; + code = v >> (32 - 17); + switch(type) { + case RUN_TYPE_U: + case RUN_TYPE_L: + case RUN_TYPE_UF: + case RUN_TYPE_LF: + if (conv_type == (type & 1) || + (type >= RUN_TYPE_UF && conv_type == 2)) { + c = c - code + (case_conv_table1[data] >> (32 - 17)); + } + break; + case RUN_TYPE_UL: + a = c - code; + if ((a & 1) != (1 - is_lower)) + break; + c = (a ^ 1) + code; + break; + case RUN_TYPE_LSU: + a = c - code; + if (a == 1) { + c += 2 * is_lower - 1; + } else if (a == (1 - is_lower) * 2) { + c += (2 * is_lower - 1) * 2; + } + break; + case RUN_TYPE_U2L_399_EXT2: + if (!is_lower) { + res[0] = c - code + case_conv_ext[data >> 6]; + res[1] = 0x399; + return 2; + } else { + c = c - code + case_conv_ext[data & 0x3f]; + } + break; + case RUN_TYPE_UF_D20: + if (conv_type == 1) + break; + c = data + (conv_type == 2) * 0x20; + break; + case RUN_TYPE_UF_D1_EXT: + if (conv_type == 1) + break; + c = case_conv_ext[data] + (conv_type == 2); + break; + case RUN_TYPE_U_EXT: + case RUN_TYPE_LF_EXT: + if (is_lower != (type - RUN_TYPE_U_EXT)) + break; + c = case_conv_ext[data]; + break; + case RUN_TYPE_LF_EXT2: + if (!is_lower) + break; + res[0] = c - code + case_conv_ext[data >> 6]; + res[1] = case_conv_ext[data & 0x3f]; + return 2; + case RUN_TYPE_UF_EXT2: + if (conv_type == 1) + break; + res[0] = c - code + case_conv_ext[data >> 6]; + res[1] = case_conv_ext[data & 0x3f]; + if (conv_type == 2) { + /* convert to lower */ + res[0] = lre_case_conv1(res[0], 1); + res[1] = lre_case_conv1(res[1], 1); + } + return 2; + default: + case RUN_TYPE_UF_EXT3: + if (conv_type == 1) + break; + res[0] = case_conv_ext[data >> 8]; + res[1] = case_conv_ext[(data >> 4) & 0xf]; + res[2] = case_conv_ext[data & 0xf]; + if (conv_type == 2) { + /* convert to lower */ + res[0] = lre_case_conv1(res[0], 1); + res[1] = lre_case_conv1(res[1], 1); + res[2] = lre_case_conv1(res[2], 1); + } + return 3; + } + res[0] = c; + return 1; +} + /* conv_type: 0 = to upper 1 = to lower @@ -66,10 +166,9 @@ int lre_case_conv(uint32_t *res, uint32_t c, int conv_type) } } } else { - uint32_t v, code, data, type, len, a, is_lower; + uint32_t v, code, len; int idx, idx_min, idx_max; - is_lower = (conv_type != 0); idx_min = 0; idx_max = countof(case_conv_table1) - 1; while (idx_min <= idx_max) { @@ -82,74 +181,7 @@ int lre_case_conv(uint32_t *res, uint32_t c, int conv_type) } else if (c >= code + len) { idx_min = idx + 1; } else { - type = (v >> (32 - 17 - 7 - 4)) & 0xf; - data = ((v & 0xf) << 8) | case_conv_table2[idx]; - switch(type) { - case RUN_TYPE_U: - case RUN_TYPE_L: - case RUN_TYPE_UF: - case RUN_TYPE_LF: - if (conv_type == (type & 1) || - (type >= RUN_TYPE_UF && conv_type == 2)) { - c = c - code + (case_conv_table1[data] >> (32 - 17)); - } - break; - case RUN_TYPE_UL: - a = c - code; - if ((a & 1) != (1 - is_lower)) - break; - c = (a ^ 1) + code; - break; - case RUN_TYPE_LSU: - a = c - code; - if (a == 1) { - c += 2 * is_lower - 1; - } else if (a == (1 - is_lower) * 2) { - c += (2 * is_lower - 1) * 2; - } - break; - case RUN_TYPE_U2L_399_EXT2: - if (!is_lower) { - res[0] = c - code + case_conv_ext[data >> 6]; - res[1] = 0x399; - return 2; - } else { - c = c - code + case_conv_ext[data & 0x3f]; - } - break; - case RUN_TYPE_UF_D20: - if (conv_type == 1) - break; - c = data + (conv_type == 2) * 0x20; - break; - case RUN_TYPE_UF_D1_EXT: - if (conv_type == 1) - break; - c = case_conv_ext[data] + (conv_type == 2); - break; - case RUN_TYPE_U_EXT: - case RUN_TYPE_LF_EXT: - if (is_lower != (type - RUN_TYPE_U_EXT)) - break; - c = case_conv_ext[data]; - break; - case RUN_TYPE_U_EXT2: - case RUN_TYPE_L_EXT2: - if (conv_type != (type - RUN_TYPE_U_EXT2)) - break; - res[0] = c - code + case_conv_ext[data >> 6]; - res[1] = case_conv_ext[data & 0x3f]; - return 2; - default: - case RUN_TYPE_U_EXT3: - if (conv_type != 0) - break; - res[0] = case_conv_ext[data >> 8]; - res[1] = case_conv_ext[(data >> 4) & 0xf]; - res[2] = case_conv_ext[data & 0xf]; - return 3; - } - break; + return lre_case_conv_entry(res, c, conv_type, idx, v); } } } @@ -157,6 +189,77 @@ int lre_case_conv(uint32_t *res, uint32_t c, int conv_type) return 1; } +static int lre_case_folding_entry(uint32_t c, uint32_t idx, uint32_t v, BOOL is_unicode) +{ + uint32_t res[LRE_CC_RES_LEN_MAX]; + int len; + + if (is_unicode) { + len = lre_case_conv_entry(res, c, 2, idx, v); + if (len == 1) { + c = res[0]; + } else { + /* handle the few specific multi-character cases (see + unicode_gen.c:dump_case_folding_special_cases()) */ + if (c == 0xfb06) { + c = 0xfb05; + } else if (c == 0x01fd3) { + c = 0x390; + } else if (c == 0x01fe3) { + c = 0x3b0; + } + } + } else { + if (likely(c < 128)) { + if (c >= 'a' && c <= 'z') + c = c - 'a' + 'A'; + } else { + /* legacy regexp: to upper case if single char >= 128 */ + len = lre_case_conv_entry(res, c, FALSE, idx, v); + if (len == 1 && res[0] >= 128) + c = res[0]; + } + } + return c; +} + +/* JS regexp specific rules for case folding */ +int lre_canonicalize(uint32_t c, BOOL is_unicode) +{ + if (c < 128) { + /* fast case */ + if (is_unicode) { + if (c >= 'A' && c <= 'Z') { + c = c - 'A' + 'a'; + } + } else { + if (c >= 'a' && c <= 'z') { + c = c - 'a' + 'A'; + } + } + } else { + uint32_t v, code, len; + int idx, idx_min, idx_max; + + idx_min = 0; + idx_max = countof(case_conv_table1) - 1; + while (idx_min <= idx_max) { + idx = (unsigned)(idx_max + idx_min) / 2; + v = case_conv_table1[idx]; + code = v >> (32 - 17); + len = (v >> (32 - 17 - 7)) & 0x7f; + if (c < code) { + idx_max = idx - 1; + } else if (c >= code + len) { + idx_min = idx + 1; + } else { + return lre_case_folding_entry(c, idx, v, is_unicode); + } + } + } + return c; +} + static uint32_t get_le24(const uint8_t *ptr) { #if defined(__x86__) || defined(__x86_64__) @@ -1179,11 +1282,11 @@ static int unicode_case1(CharRange *cr, int case_mask) #define MR(x) (1 << RUN_TYPE_ ## x) const uint32_t tab_run_mask[3] = { MR(U) | MR(UF) | MR(UL) | MR(LSU) | MR(U2L_399_EXT2) | MR(UF_D20) | - MR(UF_D1_EXT) | MR(U_EXT) | MR(U_EXT2) | MR(U_EXT3), + MR(UF_D1_EXT) | MR(U_EXT) | MR(UF_EXT2) | MR(UF_EXT3), - MR(L) | MR(LF) | MR(UL) | MR(LSU) | MR(U2L_399_EXT2) | MR(LF_EXT) | MR(L_EXT2), + MR(L) | MR(LF) | MR(UL) | MR(LSU) | MR(U2L_399_EXT2) | MR(LF_EXT) | MR(LF_EXT2), - MR(UF) | MR(LF) | MR(UL) | MR(LSU) | MR(U2L_399_EXT2) | MR(LF_EXT) | MR(UF_D20) | MR(UF_D1_EXT) | MR(LF_EXT), + MR(UF) | MR(LF) | MR(UL) | MR(LSU) | MR(U2L_399_EXT2) | MR(LF_EXT) | MR(LF_EXT2) | MR(UF_D20) | MR(UF_D1_EXT) | MR(LF_EXT) | MR(UF_EXT2) | MR(UF_EXT3), }; #undef MR uint32_t mask, v, code, type, len, i, idx; @@ -1236,7 +1339,136 @@ static int unicode_case1(CharRange *cr, int case_mask) } return 0; } - + +static int point_cmp(const void *p1, const void *p2, void *arg) +{ + uint32_t v1 = *(uint32_t *)p1; + uint32_t v2 = *(uint32_t *)p2; + return (v1 > v2) - (v1 < v2); +} + +static void cr_sort_and_remove_overlap(CharRange *cr) +{ + uint32_t start, end, start1, end1, i, j; + + /* the resulting ranges are not necessarily sorted and may overlap */ + rqsort(cr->points, cr->len / 2, sizeof(cr->points[0]) * 2, point_cmp, NULL); + j = 0; + for(i = 0; i < cr->len; ) { + start = cr->points[i]; + end = cr->points[i + 1]; + i += 2; + while (i < cr->len) { + start1 = cr->points[i]; + end1 = cr->points[i + 1]; + if (start1 > end) { + /* |------| + * |-------| */ + break; + } else if (end1 <= end) { + /* |------| + * |--| */ + i += 2; + } else { + /* |------| + * |-------| */ + end = end1; + i += 2; + } + } + cr->points[j] = start; + cr->points[j + 1] = end; + j += 2; + } + cr->len = j; +} + +/* canonicalize a character set using the JS regex case folding rules + (see lre_canonicalize()) */ +int cr_regexp_canonicalize(CharRange *cr, BOOL is_unicode) +{ + CharRange cr_inter, cr_mask, cr_result, cr_sub; + uint32_t v, code, len, i, idx, start, end, c, d_start, d_end, d; + + cr_init(&cr_mask, cr->mem_opaque, cr->realloc_func); + cr_init(&cr_inter, cr->mem_opaque, cr->realloc_func); + cr_init(&cr_result, cr->mem_opaque, cr->realloc_func); + cr_init(&cr_sub, cr->mem_opaque, cr->realloc_func); + + if (unicode_case1(&cr_mask, is_unicode ? CASE_F : CASE_U)) + goto fail; + if (cr_op(&cr_inter, cr_mask.points, cr_mask.len, cr->points, cr->len, CR_OP_INTER)) + goto fail; + + if (cr_invert(&cr_mask)) + goto fail; + if (cr_op(&cr_sub, cr_mask.points, cr_mask.len, cr->points, cr->len, CR_OP_INTER)) + goto fail; + + /* cr_inter = cr & cr_mask */ + /* cr_sub = cr & ~cr_mask */ + + /* use the case conversion table to compute the result */ + d_start = -1; + d_end = -1; + idx = 0; + v = case_conv_table1[idx]; + code = v >> (32 - 17); + len = (v >> (32 - 17 - 7)) & 0x7f; + for(i = 0; i < cr_inter.len; i += 2) { + start = cr_inter.points[i]; + end = cr_inter.points[i + 1]; + + for(c = start; c < end; c++) { + for(;;) { + if (c >= code && c < code + len) + break; + idx++; + assert(idx < countof(case_conv_table1)); + v = case_conv_table1[idx]; + code = v >> (32 - 17); + len = (v >> (32 - 17 - 7)) & 0x7f; + } + d = lre_case_folding_entry(c, idx, v, is_unicode); + /* try to merge with the current interval */ + if (d_start == -1) { + d_start = d; + d_end = d + 1; + } else if (d_end == d) { + d_end++; + } else { + cr_add_interval(&cr_result, d_start, d_end); + d_start = d; + d_end = d + 1; + } + } + } + if (d_start != -1) { + if (cr_add_interval(&cr_result, d_start, d_end)) + goto fail; + } + + /* the resulting ranges are not necessarily sorted and may overlap */ + cr_sort_and_remove_overlap(&cr_result); + + /* or with the character not affected by the case folding */ + cr->len = 0; + if (cr_op(cr, cr_result.points, cr_result.len, cr_sub.points, cr_sub.len, CR_OP_UNION)) + goto fail; + + cr_free(&cr_inter); + cr_free(&cr_mask); + cr_free(&cr_result); + cr_free(&cr_sub); + return 0; + fail: + cr_free(&cr_inter); + cr_free(&cr_mask); + cr_free(&cr_result); + cr_free(&cr_sub); + return -1; +} + typedef enum { POP_GC, POP_PROP, diff --git a/lib/quickjs/libunicode.h b/lib/quickjs/libunicode.h index cfa600a5..8abacb01 100644 --- a/lib/quickjs/libunicode.h +++ b/lib/quickjs/libunicode.h @@ -41,6 +41,7 @@ typedef enum { } UnicodeNormalizationEnum; int lre_case_conv(uint32_t *res, uint32_t c, int conv_type); +int lre_canonicalize(uint32_t c, BOOL is_unicode); LRE_BOOL lre_is_cased(uint32_t c); LRE_BOOL lre_is_case_ignorable(uint32_t c); @@ -101,6 +102,8 @@ int cr_op(CharRange *cr, const uint32_t *a_pt, int a_len, int cr_invert(CharRange *cr); +int cr_regexp_canonicalize(CharRange *cr, BOOL is_unicode); + #ifdef CONFIG_ALL_UNICODE LRE_BOOL lre_is_id_start(uint32_t c); |