diff options
author | Charlie Gordon <github@chqrlie.org> | 2024-05-05 12:10:24 +0200 |
---|---|---|
committer | bptato <nincsnevem662@gmail.com> | 2024-05-07 13:13:02 +0200 |
commit | 2954a12cc8278196cfa52a6256f3507447f3f18d (patch) | |
tree | 9e50c2d9f0c0aed6dea0a3bff47cef3e0849809d | |
parent | 1e2cfd58f0fa5e063d7a36da918ec2523474b66e (diff) | |
download | chawan-2954a12cc8278196cfa52a6256f3507447f3f18d.tar.gz |
Improve unicode table handling (#286)
- Document table and index formats - Add size statistics - Fix UBSAN issue in `get_le24()` Fixes #285
-rw-r--r-- | lib/quickjs/libunicode-table.h | 161 | ||||
-rw-r--r-- | lib/quickjs/libunicode.c | 36 |
2 files changed, 148 insertions, 49 deletions
diff --git a/lib/quickjs/libunicode-table.h b/lib/quickjs/libunicode-table.h index 513ed94b..72d495e7 100644 --- a/lib/quickjs/libunicode-table.h +++ b/lib/quickjs/libunicode-table.h @@ -189,9 +189,13 @@ static const uint8_t unicode_prop_Cased1_table[196] = { }; static const uint8_t unicode_prop_Cased1_index[21] = { - 0xb9, 0x02, 0xe0, 0xc0, 0x1d, 0x20, 0xe5, 0x2c, - 0x20, 0xb1, 0x07, 0x21, 0xc1, 0xd6, 0x21, 0x4a, - 0xf1, 0x01, 0x8a, 0xf1, 0x01, + 0xb9, 0x02, 0xe0, // 002B9 at 39 + 0xc0, 0x1d, 0x20, // 01DC0 at 65 + 0xe5, 0x2c, 0x20, // 02CE5 at 97 + 0xb1, 0x07, 0x21, // 107B1 at 129 + 0xc1, 0xd6, 0x21, // 1D6C1 at 161 + 0x4a, 0xf1, 0x01, // 1F14A at 192 + 0x8a, 0xf1, 0x01, // 1F18A at 224 (upper bound) }; static const uint8_t unicode_prop_Case_Ignorable_table[737] = { @@ -291,15 +295,29 @@ static const uint8_t unicode_prop_Case_Ignorable_table[737] = { }; static const uint8_t unicode_prop_Case_Ignorable_index[69] = { - 0xbe, 0x05, 0x00, 0xfe, 0x07, 0x00, 0x52, 0x0a, - 0xa0, 0xc1, 0x0b, 0x00, 0x82, 0x0d, 0x00, 0x3f, - 0x10, 0x80, 0xd4, 0x17, 0x40, 0xcf, 0x1a, 0x20, - 0xf5, 0x1c, 0x00, 0x80, 0x20, 0x00, 0x16, 0xa0, - 0x00, 0xc6, 0xa8, 0x00, 0xc2, 0xaa, 0x60, 0x56, - 0xfe, 0x20, 0xb1, 0x07, 0x01, 0x75, 0x10, 0x01, - 0xeb, 0x12, 0x21, 0x41, 0x16, 0x01, 0x5c, 0x1a, - 0x01, 0x43, 0x1f, 0x01, 0x2e, 0xcf, 0x41, 0x25, - 0xe0, 0x01, 0xf0, 0x01, 0x0e, + 0xbe, 0x05, 0x00, // 005BE at 32 + 0xfe, 0x07, 0x00, // 007FE at 64 + 0x52, 0x0a, 0xa0, // 00A52 at 101 + 0xc1, 0x0b, 0x00, // 00BC1 at 128 + 0x82, 0x0d, 0x00, // 00D82 at 160 + 0x3f, 0x10, 0x80, // 0103F at 196 + 0xd4, 0x17, 0x40, // 017D4 at 226 + 0xcf, 0x1a, 0x20, // 01ACF at 257 + 0xf5, 0x1c, 0x00, // 01CF5 at 288 + 0x80, 0x20, 0x00, // 02080 at 320 + 0x16, 0xa0, 0x00, // 0A016 at 352 + 0xc6, 0xa8, 0x00, // 0A8C6 at 384 + 0xc2, 0xaa, 0x60, // 0AAC2 at 419 + 0x56, 0xfe, 0x20, // 0FE56 at 449 + 0xb1, 0x07, 0x01, // 107B1 at 480 + 0x75, 0x10, 0x01, // 11075 at 512 + 0xeb, 0x12, 0x21, // 112EB at 545 + 0x41, 0x16, 0x01, // 11641 at 576 + 0x5c, 0x1a, 0x01, // 11A5C at 608 + 0x43, 0x1f, 0x01, // 11F43 at 640 + 0x2e, 0xcf, 0x41, // 1CF2E at 674 + 0x25, 0xe0, 0x01, // 1E025 at 704 + 0xf0, 0x01, 0x0e, // E01F0 at 736 (upper bound) }; static const uint8_t unicode_prop_ID_Start_table[1100] = { @@ -444,20 +462,41 @@ static const uint8_t unicode_prop_ID_Start_table[1100] = { }; static const uint8_t unicode_prop_ID_Start_index[105] = { - 0xf6, 0x03, 0x20, 0xa6, 0x07, 0x00, 0xa9, 0x09, - 0x20, 0xb1, 0x0a, 0x00, 0xba, 0x0b, 0x20, 0x3b, - 0x0d, 0x20, 0xc7, 0x0e, 0x20, 0x49, 0x12, 0x00, - 0x9b, 0x16, 0x00, 0xac, 0x19, 0x00, 0xc0, 0x1d, - 0x80, 0x80, 0x20, 0x20, 0x70, 0x2d, 0x00, 0x00, - 0x32, 0x00, 0xda, 0xa7, 0x00, 0x4c, 0xaa, 0x20, - 0xc7, 0xd7, 0x20, 0xfc, 0xfd, 0x20, 0x9d, 0x02, - 0x21, 0x96, 0x05, 0x01, 0xf3, 0x08, 0x01, 0xb3, - 0x0c, 0x21, 0x73, 0x11, 0x61, 0x34, 0x13, 0x01, - 0x1b, 0x17, 0x21, 0x8a, 0x1a, 0x01, 0x34, 0x1f, - 0x21, 0xbf, 0x6a, 0x01, 0x23, 0xb1, 0xa1, 0xad, - 0xd4, 0x01, 0x6f, 0xd7, 0x01, 0xff, 0xe7, 0x61, - 0x5e, 0xee, 0x01, 0xe1, 0xeb, 0x22, 0xb0, 0x23, - 0x03, + 0xf6, 0x03, 0x20, // 003F6 at 33 + 0xa6, 0x07, 0x00, // 007A6 at 64 + 0xa9, 0x09, 0x20, // 009A9 at 97 + 0xb1, 0x0a, 0x00, // 00AB1 at 128 + 0xba, 0x0b, 0x20, // 00BBA at 161 + 0x3b, 0x0d, 0x20, // 00D3B at 193 + 0xc7, 0x0e, 0x20, // 00EC7 at 225 + 0x49, 0x12, 0x00, // 01249 at 256 + 0x9b, 0x16, 0x00, // 0169B at 288 + 0xac, 0x19, 0x00, // 019AC at 320 + 0xc0, 0x1d, 0x80, // 01DC0 at 356 + 0x80, 0x20, 0x20, // 02080 at 385 + 0x70, 0x2d, 0x00, // 02D70 at 416 + 0x00, 0x32, 0x00, // 03200 at 448 + 0xda, 0xa7, 0x00, // 0A7DA at 480 + 0x4c, 0xaa, 0x20, // 0AA4C at 513 + 0xc7, 0xd7, 0x20, // 0D7C7 at 545 + 0xfc, 0xfd, 0x20, // 0FDFC at 577 + 0x9d, 0x02, 0x21, // 1029D at 609 + 0x96, 0x05, 0x01, // 10596 at 640 + 0xf3, 0x08, 0x01, // 108F3 at 672 + 0xb3, 0x0c, 0x21, // 10CB3 at 705 + 0x73, 0x11, 0x61, // 11173 at 739 + 0x34, 0x13, 0x01, // 11334 at 768 + 0x1b, 0x17, 0x21, // 1171B at 801 + 0x8a, 0x1a, 0x01, // 11A8A at 832 + 0x34, 0x1f, 0x21, // 11F34 at 865 + 0xbf, 0x6a, 0x01, // 16ABF at 896 + 0x23, 0xb1, 0xa1, // 1B123 at 933 + 0xad, 0xd4, 0x01, // 1D4AD at 960 + 0x6f, 0xd7, 0x01, // 1D76F at 992 + 0xff, 0xe7, 0x61, // 1E7FF at 1027 + 0x5e, 0xee, 0x01, // 1EE5E at 1056 + 0xe1, 0xeb, 0x22, // 2EBE1 at 1089 + 0xb0, 0x23, 0x03, // 323B0 at 1120 (upper bound) }; static const uint8_t unicode_prop_ID_Continue1_table[660] = { @@ -547,14 +586,27 @@ static const uint8_t unicode_prop_ID_Continue1_table[660] = { }; static const uint8_t unicode_prop_ID_Continue1_index[63] = { - 0xfa, 0x06, 0x00, 0x70, 0x09, 0x00, 0xf0, 0x0a, - 0x40, 0x57, 0x0c, 0x00, 0xf0, 0x0d, 0x60, 0xc7, - 0x0f, 0x20, 0xea, 0x17, 0x40, 0x05, 0x1b, 0x00, - 0x41, 0x20, 0x00, 0x0c, 0xa8, 0x80, 0x37, 0xaa, - 0x20, 0x50, 0xfe, 0x20, 0x3a, 0x0d, 0x21, 0x74, - 0x11, 0x01, 0x5a, 0x14, 0x21, 0x44, 0x19, 0x81, - 0x5a, 0x1d, 0xa1, 0xf5, 0x6a, 0x21, 0x45, 0xd2, - 0x41, 0xaf, 0xe2, 0x21, 0xf0, 0x01, 0x0e, + 0xfa, 0x06, 0x00, // 006FA at 32 + 0x70, 0x09, 0x00, // 00970 at 64 + 0xf0, 0x0a, 0x40, // 00AF0 at 98 + 0x57, 0x0c, 0x00, // 00C57 at 128 + 0xf0, 0x0d, 0x60, // 00DF0 at 163 + 0xc7, 0x0f, 0x20, // 00FC7 at 193 + 0xea, 0x17, 0x40, // 017EA at 226 + 0x05, 0x1b, 0x00, // 01B05 at 256 + 0x41, 0x20, 0x00, // 02041 at 288 + 0x0c, 0xa8, 0x80, // 0A80C at 324 + 0x37, 0xaa, 0x20, // 0AA37 at 353 + 0x50, 0xfe, 0x20, // 0FE50 at 385 + 0x3a, 0x0d, 0x21, // 10D3A at 417 + 0x74, 0x11, 0x01, // 11174 at 448 + 0x5a, 0x14, 0x21, // 1145A at 481 + 0x44, 0x19, 0x81, // 11944 at 516 + 0x5a, 0x1d, 0xa1, // 11D5A at 549 + 0xf5, 0x6a, 0x21, // 16AF5 at 577 + 0x45, 0xd2, 0x41, // 1D245 at 610 + 0xaf, 0xe2, 0x21, // 1E2AF at 641 + 0xf0, 0x01, 0x0e, // E01F0 at 672 (upper bound) }; #ifdef CONFIG_ALL_UNICODE @@ -676,17 +728,35 @@ static const uint8_t unicode_cc_table[899] = { }; static const uint8_t unicode_cc_index[87] = { - 0x4d, 0x03, 0x00, 0x97, 0x05, 0x20, 0xc6, 0x05, - 0x00, 0xe7, 0x06, 0x00, 0x45, 0x07, 0x00, 0x9c, - 0x08, 0x00, 0x4d, 0x09, 0x00, 0x3c, 0x0b, 0x00, - 0x3d, 0x0d, 0x00, 0x36, 0x0f, 0x00, 0x38, 0x10, - 0x20, 0x3a, 0x19, 0x00, 0xcb, 0x1a, 0x20, 0xd3, - 0x1c, 0x00, 0xcf, 0x1d, 0x00, 0xe2, 0x20, 0x00, - 0x2e, 0x30, 0x20, 0x2b, 0xa9, 0x20, 0xed, 0xab, - 0x00, 0x39, 0x0a, 0x01, 0x51, 0x0f, 0x01, 0x73, - 0x11, 0x01, 0x75, 0x13, 0x01, 0x2b, 0x17, 0x21, - 0x3f, 0x1c, 0x21, 0x9e, 0xbc, 0x21, 0x08, 0xe0, - 0x01, 0x44, 0xe9, 0x01, 0x4b, 0xe9, 0x01, + 0x4d, 0x03, 0x00, // 0034D at 32 + 0x97, 0x05, 0x20, // 00597 at 65 + 0xc6, 0x05, 0x00, // 005C6 at 96 + 0xe7, 0x06, 0x00, // 006E7 at 128 + 0x45, 0x07, 0x00, // 00745 at 160 + 0x9c, 0x08, 0x00, // 0089C at 192 + 0x4d, 0x09, 0x00, // 0094D at 224 + 0x3c, 0x0b, 0x00, // 00B3C at 256 + 0x3d, 0x0d, 0x00, // 00D3D at 288 + 0x36, 0x0f, 0x00, // 00F36 at 320 + 0x38, 0x10, 0x20, // 01038 at 353 + 0x3a, 0x19, 0x00, // 0193A at 384 + 0xcb, 0x1a, 0x20, // 01ACB at 417 + 0xd3, 0x1c, 0x00, // 01CD3 at 448 + 0xcf, 0x1d, 0x00, // 01DCF at 480 + 0xe2, 0x20, 0x00, // 020E2 at 512 + 0x2e, 0x30, 0x20, // 0302E at 545 + 0x2b, 0xa9, 0x20, // 0A92B at 577 + 0xed, 0xab, 0x00, // 0ABED at 608 + 0x39, 0x0a, 0x01, // 10A39 at 640 + 0x51, 0x0f, 0x01, // 10F51 at 672 + 0x73, 0x11, 0x01, // 11173 at 704 + 0x75, 0x13, 0x01, // 11375 at 736 + 0x2b, 0x17, 0x21, // 1172B at 769 + 0x3f, 0x1c, 0x21, // 11C3F at 801 + 0x9e, 0xbc, 0x21, // 1BC9E at 833 + 0x08, 0xe0, 0x01, // 1E008 at 864 + 0x44, 0xe9, 0x01, // 1E944 at 896 + 0x4b, 0xe9, 0x01, // 1E94B at 928 (upper bound) }; static const uint32_t unicode_decomp_table1[699] = { @@ -4484,3 +4554,4 @@ static const uint16_t unicode_prop_len_table[] = { }; #endif /* CONFIG_ALL_UNICODE */ +/* 62 tables / 32261 bytes, 5 index / 345 bytes */ diff --git a/lib/quickjs/libunicode.c b/lib/quickjs/libunicode.c index 4200252d..a631bbd2 100644 --- a/lib/quickjs/libunicode.c +++ b/lib/quickjs/libunicode.c @@ -262,11 +262,7 @@ int lre_canonicalize(uint32_t c, BOOL is_unicode) static uint32_t get_le24(const uint8_t *ptr) { -#if defined(__x86__) || defined(__x86_64__) - return *(uint16_t *)ptr | (ptr[2] << 16); -#else return ptr[0] | (ptr[1] << 8) | (ptr[2] << 16); -#endif } #define UNICODE_INDEX_BLOCK_LEN 32 @@ -317,6 +313,14 @@ static BOOL lre_is_in_table(uint32_t c, const uint8_t *table, return FALSE; /* outside the table */ p = table + pos; bit = 0; + /* Compressed run length encoding: + 00..3F: 2 packed lengths: 3-bit + 3-bit + 40..5F: 5-bits plus extra byte for length + 60..7F: 5-bits plus 2 extra bytes for length + 80..FF: 7-bit length + lengths must be incremented to get character count + Ranges alternate between false and true return value. + */ for(;;) { b = *p++; if (b < 64) { @@ -833,6 +837,13 @@ static int unicode_get_cc(uint32_t c) if (pos < 0) return 0; p = unicode_cc_table + pos; + /* Compressed run length encoding: + - 2 high order bits are combining class type + - 0:0, 1:230, 2:extra byte linear progression, 3:extra byte + - 00..2F: range length (add 1) + - 30..37: 3-bit range-length + 1 extra byte + - 38..3F: 3-bit range-length + 2 extra byte + */ for(;;) { b = *p++; type = b >> 6; @@ -1185,6 +1196,15 @@ static int unicode_general_category1(CharRange *cr, uint32_t gc_mask) p = unicode_gc_table; p_end = unicode_gc_table + countof(unicode_gc_table); c = 0; + /* Compressed range encoding: + initial byte: + bits 0..4: category number (special case 31) + bits 5..7: range length (add 1) + special case bits 5..7 == 7: read an extra byte + - 00..7F: range length (add 7 + 1) + - 80..BF: 6-bits plus extra byte for range length (add 7 + 128) + - C0..FF: 6-bits plus 2 extra bytes for range length (add 7 + 128 + 16384) + */ while (p < p_end) { b = *p++; n = b >> 5; @@ -1238,6 +1258,14 @@ static int unicode_prop1(CharRange *cr, int prop_idx) p_end = p + unicode_prop_len_table[prop_idx]; c = 0; bit = 0; + /* Compressed range encoding: + 00..3F: 2 packed lengths: 3-bit + 3-bit + 40..5F: 5-bits plus extra byte for length + 60..7F: 5-bits plus 2 extra bytes for length + 80..FF: 7-bit length + lengths must be incremented to get character count + Ranges alternate between false and true return value. + */ while (p < p_end) { c0 = c; b = *p++; |