diff options
author | Fabrice Bellard <fabrice@bellard.org> | 2024-01-08 18:42:29 +0100 |
---|---|---|
committer | bptato <nincsnevem662@gmail.com> | 2024-01-11 18:51:43 +0100 |
commit | 3853d2be3916b98d449e785071e370fe5b9a36ee (patch) | |
tree | d291ca65a9151a30d9c31b101e06e942a59ebaa0 /lib/quickjs/libunicode.c | |
parent | 0ecfe8945a5b064db2d2bf3984ece31bcea5e351 (diff) | |
download | chawan-3853d2be3916b98d449e785071e370fe5b9a36ee.tar.gz |
fixed regexp case insensitive flag
Diffstat (limited to 'lib/quickjs/libunicode.c')
-rw-r--r-- | lib/quickjs/libunicode.c | 386 |
1 files changed, 309 insertions, 77 deletions
diff --git a/lib/quickjs/libunicode.c b/lib/quickjs/libunicode.c index 63c12a07..0712a6c0 100644 --- a/lib/quickjs/libunicode.c +++ b/lib/quickjs/libunicode.c @@ -43,11 +43,111 @@ enum { RUN_TYPE_UF_D1_EXT, RUN_TYPE_U_EXT, RUN_TYPE_LF_EXT, - RUN_TYPE_U_EXT2, - RUN_TYPE_L_EXT2, - RUN_TYPE_U_EXT3, + RUN_TYPE_UF_EXT2, + RUN_TYPE_LF_EXT2, + RUN_TYPE_UF_EXT3, }; +static int lre_case_conv1(uint32_t c, int conv_type) +{ + uint32_t res[LRE_CC_RES_LEN_MAX]; + lre_case_conv(res, c, conv_type); + return res[0]; +} + +/* case conversion using the table entry 'idx' with value 'v' */ +static int lre_case_conv_entry(uint32_t *res, uint32_t c, int conv_type, uint32_t idx, uint32_t v) +{ + uint32_t code, data, type, a, is_lower; + is_lower = (conv_type != 0); + type = (v >> (32 - 17 - 7 - 4)) & 0xf; + data = ((v & 0xf) << 8) | case_conv_table2[idx]; + code = v >> (32 - 17); + switch(type) { + case RUN_TYPE_U: + case RUN_TYPE_L: + case RUN_TYPE_UF: + case RUN_TYPE_LF: + if (conv_type == (type & 1) || + (type >= RUN_TYPE_UF && conv_type == 2)) { + c = c - code + (case_conv_table1[data] >> (32 - 17)); + } + break; + case RUN_TYPE_UL: + a = c - code; + if ((a & 1) != (1 - is_lower)) + break; + c = (a ^ 1) + code; + break; + case RUN_TYPE_LSU: + a = c - code; + if (a == 1) { + c += 2 * is_lower - 1; + } else if (a == (1 - is_lower) * 2) { + c += (2 * is_lower - 1) * 2; + } + break; + case RUN_TYPE_U2L_399_EXT2: + if (!is_lower) { + res[0] = c - code + case_conv_ext[data >> 6]; + res[1] = 0x399; + return 2; + } else { + c = c - code + case_conv_ext[data & 0x3f]; + } + break; + case RUN_TYPE_UF_D20: + if (conv_type == 1) + break; + c = data + (conv_type == 2) * 0x20; + break; + case RUN_TYPE_UF_D1_EXT: + if (conv_type == 1) + break; + c = case_conv_ext[data] + (conv_type == 2); + break; + case RUN_TYPE_U_EXT: + case RUN_TYPE_LF_EXT: + if (is_lower != (type - RUN_TYPE_U_EXT)) + break; + c = case_conv_ext[data]; + break; + case RUN_TYPE_LF_EXT2: + if (!is_lower) + break; + res[0] = c - code + case_conv_ext[data >> 6]; + res[1] = case_conv_ext[data & 0x3f]; + return 2; + case RUN_TYPE_UF_EXT2: + if (conv_type == 1) + break; + res[0] = c - code + case_conv_ext[data >> 6]; + res[1] = case_conv_ext[data & 0x3f]; + if (conv_type == 2) { + /* convert to lower */ + res[0] = lre_case_conv1(res[0], 1); + res[1] = lre_case_conv1(res[1], 1); + } + return 2; + default: + case RUN_TYPE_UF_EXT3: + if (conv_type == 1) + break; + res[0] = case_conv_ext[data >> 8]; + res[1] = case_conv_ext[(data >> 4) & 0xf]; + res[2] = case_conv_ext[data & 0xf]; + if (conv_type == 2) { + /* convert to lower */ + res[0] = lre_case_conv1(res[0], 1); + res[1] = lre_case_conv1(res[1], 1); + res[2] = lre_case_conv1(res[2], 1); + } + return 3; + } + res[0] = c; + return 1; +} + /* conv_type: 0 = to upper 1 = to lower @@ -66,10 +166,9 @@ int lre_case_conv(uint32_t *res, uint32_t c, int conv_type) } } } else { - uint32_t v, code, data, type, len, a, is_lower; + uint32_t v, code, len; int idx, idx_min, idx_max; - is_lower = (conv_type != 0); idx_min = 0; idx_max = countof(case_conv_table1) - 1; while (idx_min <= idx_max) { @@ -82,74 +181,7 @@ int lre_case_conv(uint32_t *res, uint32_t c, int conv_type) } else if (c >= code + len) { idx_min = idx + 1; } else { - type = (v >> (32 - 17 - 7 - 4)) & 0xf; - data = ((v & 0xf) << 8) | case_conv_table2[idx]; - switch(type) { - case RUN_TYPE_U: - case RUN_TYPE_L: - case RUN_TYPE_UF: - case RUN_TYPE_LF: - if (conv_type == (type & 1) || - (type >= RUN_TYPE_UF && conv_type == 2)) { - c = c - code + (case_conv_table1[data] >> (32 - 17)); - } - break; - case RUN_TYPE_UL: - a = c - code; - if ((a & 1) != (1 - is_lower)) - break; - c = (a ^ 1) + code; - break; - case RUN_TYPE_LSU: - a = c - code; - if (a == 1) { - c += 2 * is_lower - 1; - } else if (a == (1 - is_lower) * 2) { - c += (2 * is_lower - 1) * 2; - } - break; - case RUN_TYPE_U2L_399_EXT2: - if (!is_lower) { - res[0] = c - code + case_conv_ext[data >> 6]; - res[1] = 0x399; - return 2; - } else { - c = c - code + case_conv_ext[data & 0x3f]; - } - break; - case RUN_TYPE_UF_D20: - if (conv_type == 1) - break; - c = data + (conv_type == 2) * 0x20; - break; - case RUN_TYPE_UF_D1_EXT: - if (conv_type == 1) - break; - c = case_conv_ext[data] + (conv_type == 2); - break; - case RUN_TYPE_U_EXT: - case RUN_TYPE_LF_EXT: - if (is_lower != (type - RUN_TYPE_U_EXT)) - break; - c = case_conv_ext[data]; - break; - case RUN_TYPE_U_EXT2: - case RUN_TYPE_L_EXT2: - if (conv_type != (type - RUN_TYPE_U_EXT2)) - break; - res[0] = c - code + case_conv_ext[data >> 6]; - res[1] = case_conv_ext[data & 0x3f]; - return 2; - default: - case RUN_TYPE_U_EXT3: - if (conv_type != 0) - break; - res[0] = case_conv_ext[data >> 8]; - res[1] = case_conv_ext[(data >> 4) & 0xf]; - res[2] = case_conv_ext[data & 0xf]; - return 3; - } - break; + return lre_case_conv_entry(res, c, conv_type, idx, v); } } } @@ -157,6 +189,77 @@ int lre_case_conv(uint32_t *res, uint32_t c, int conv_type) return 1; } +static int lre_case_folding_entry(uint32_t c, uint32_t idx, uint32_t v, BOOL is_unicode) +{ + uint32_t res[LRE_CC_RES_LEN_MAX]; + int len; + + if (is_unicode) { + len = lre_case_conv_entry(res, c, 2, idx, v); + if (len == 1) { + c = res[0]; + } else { + /* handle the few specific multi-character cases (see + unicode_gen.c:dump_case_folding_special_cases()) */ + if (c == 0xfb06) { + c = 0xfb05; + } else if (c == 0x01fd3) { + c = 0x390; + } else if (c == 0x01fe3) { + c = 0x3b0; + } + } + } else { + if (likely(c < 128)) { + if (c >= 'a' && c <= 'z') + c = c - 'a' + 'A'; + } else { + /* legacy regexp: to upper case if single char >= 128 */ + len = lre_case_conv_entry(res, c, FALSE, idx, v); + if (len == 1 && res[0] >= 128) + c = res[0]; + } + } + return c; +} + +/* JS regexp specific rules for case folding */ +int lre_canonicalize(uint32_t c, BOOL is_unicode) +{ + if (c < 128) { + /* fast case */ + if (is_unicode) { + if (c >= 'A' && c <= 'Z') { + c = c - 'A' + 'a'; + } + } else { + if (c >= 'a' && c <= 'z') { + c = c - 'a' + 'A'; + } + } + } else { + uint32_t v, code, len; + int idx, idx_min, idx_max; + + idx_min = 0; + idx_max = countof(case_conv_table1) - 1; + while (idx_min <= idx_max) { + idx = (unsigned)(idx_max + idx_min) / 2; + v = case_conv_table1[idx]; + code = v >> (32 - 17); + len = (v >> (32 - 17 - 7)) & 0x7f; + if (c < code) { + idx_max = idx - 1; + } else if (c >= code + len) { + idx_min = idx + 1; + } else { + return lre_case_folding_entry(c, idx, v, is_unicode); + } + } + } + return c; +} + static uint32_t get_le24(const uint8_t *ptr) { #if defined(__x86__) || defined(__x86_64__) @@ -1179,11 +1282,11 @@ static int unicode_case1(CharRange *cr, int case_mask) #define MR(x) (1 << RUN_TYPE_ ## x) const uint32_t tab_run_mask[3] = { MR(U) | MR(UF) | MR(UL) | MR(LSU) | MR(U2L_399_EXT2) | MR(UF_D20) | - MR(UF_D1_EXT) | MR(U_EXT) | MR(U_EXT2) | MR(U_EXT3), + MR(UF_D1_EXT) | MR(U_EXT) | MR(UF_EXT2) | MR(UF_EXT3), - MR(L) | MR(LF) | MR(UL) | MR(LSU) | MR(U2L_399_EXT2) | MR(LF_EXT) | MR(L_EXT2), + MR(L) | MR(LF) | MR(UL) | MR(LSU) | MR(U2L_399_EXT2) | MR(LF_EXT) | MR(LF_EXT2), - MR(UF) | MR(LF) | MR(UL) | MR(LSU) | MR(U2L_399_EXT2) | MR(LF_EXT) | MR(UF_D20) | MR(UF_D1_EXT) | MR(LF_EXT), + MR(UF) | MR(LF) | MR(UL) | MR(LSU) | MR(U2L_399_EXT2) | MR(LF_EXT) | MR(LF_EXT2) | MR(UF_D20) | MR(UF_D1_EXT) | MR(LF_EXT) | MR(UF_EXT2) | MR(UF_EXT3), }; #undef MR uint32_t mask, v, code, type, len, i, idx; @@ -1236,7 +1339,136 @@ static int unicode_case1(CharRange *cr, int case_mask) } return 0; } - + +static int point_cmp(const void *p1, const void *p2, void *arg) +{ + uint32_t v1 = *(uint32_t *)p1; + uint32_t v2 = *(uint32_t *)p2; + return (v1 > v2) - (v1 < v2); +} + +static void cr_sort_and_remove_overlap(CharRange *cr) +{ + uint32_t start, end, start1, end1, i, j; + + /* the resulting ranges are not necessarily sorted and may overlap */ + rqsort(cr->points, cr->len / 2, sizeof(cr->points[0]) * 2, point_cmp, NULL); + j = 0; + for(i = 0; i < cr->len; ) { + start = cr->points[i]; + end = cr->points[i + 1]; + i += 2; + while (i < cr->len) { + start1 = cr->points[i]; + end1 = cr->points[i + 1]; + if (start1 > end) { + /* |------| + * |-------| */ + break; + } else if (end1 <= end) { + /* |------| + * |--| */ + i += 2; + } else { + /* |------| + * |-------| */ + end = end1; + i += 2; + } + } + cr->points[j] = start; + cr->points[j + 1] = end; + j += 2; + } + cr->len = j; +} + +/* canonicalize a character set using the JS regex case folding rules + (see lre_canonicalize()) */ +int cr_regexp_canonicalize(CharRange *cr, BOOL is_unicode) +{ + CharRange cr_inter, cr_mask, cr_result, cr_sub; + uint32_t v, code, len, i, idx, start, end, c, d_start, d_end, d; + + cr_init(&cr_mask, cr->mem_opaque, cr->realloc_func); + cr_init(&cr_inter, cr->mem_opaque, cr->realloc_func); + cr_init(&cr_result, cr->mem_opaque, cr->realloc_func); + cr_init(&cr_sub, cr->mem_opaque, cr->realloc_func); + + if (unicode_case1(&cr_mask, is_unicode ? CASE_F : CASE_U)) + goto fail; + if (cr_op(&cr_inter, cr_mask.points, cr_mask.len, cr->points, cr->len, CR_OP_INTER)) + goto fail; + + if (cr_invert(&cr_mask)) + goto fail; + if (cr_op(&cr_sub, cr_mask.points, cr_mask.len, cr->points, cr->len, CR_OP_INTER)) + goto fail; + + /* cr_inter = cr & cr_mask */ + /* cr_sub = cr & ~cr_mask */ + + /* use the case conversion table to compute the result */ + d_start = -1; + d_end = -1; + idx = 0; + v = case_conv_table1[idx]; + code = v >> (32 - 17); + len = (v >> (32 - 17 - 7)) & 0x7f; + for(i = 0; i < cr_inter.len; i += 2) { + start = cr_inter.points[i]; + end = cr_inter.points[i + 1]; + + for(c = start; c < end; c++) { + for(;;) { + if (c >= code && c < code + len) + break; + idx++; + assert(idx < countof(case_conv_table1)); + v = case_conv_table1[idx]; + code = v >> (32 - 17); + len = (v >> (32 - 17 - 7)) & 0x7f; + } + d = lre_case_folding_entry(c, idx, v, is_unicode); + /* try to merge with the current interval */ + if (d_start == -1) { + d_start = d; + d_end = d + 1; + } else if (d_end == d) { + d_end++; + } else { + cr_add_interval(&cr_result, d_start, d_end); + d_start = d; + d_end = d + 1; + } + } + } + if (d_start != -1) { + if (cr_add_interval(&cr_result, d_start, d_end)) + goto fail; + } + + /* the resulting ranges are not necessarily sorted and may overlap */ + cr_sort_and_remove_overlap(&cr_result); + + /* or with the character not affected by the case folding */ + cr->len = 0; + if (cr_op(cr, cr_result.points, cr_result.len, cr_sub.points, cr_sub.len, CR_OP_UNION)) + goto fail; + + cr_free(&cr_inter); + cr_free(&cr_mask); + cr_free(&cr_result); + cr_free(&cr_sub); + return 0; + fail: + cr_free(&cr_inter); + cr_free(&cr_mask); + cr_free(&cr_result); + cr_free(&cr_sub); + return -1; +} + typedef enum { POP_GC, POP_PROP, |