diff options
author | bptato <nincsnevem662@gmail.com> | 2023-01-27 21:30:28 +0100 |
---|---|---|
committer | bptato <nincsnevem662@gmail.com> | 2023-01-27 21:48:21 +0100 |
commit | 5f88adcc8e742dc18230f40c3361801cf012a8c6 (patch) | |
tree | 4c402261a3349ad3fd1959ef58093d459fe2bccd /src/utils | |
parent | 167dd67270d5a432c584b61f5a22281ca47017d9 (diff) | |
download | chawan-5f88adcc8e742dc18230f40c3361801cf012a8c6.tar.gz |
Make width table at compile-time
Diffstat (limited to 'src/utils')
-rw-r--r-- | src/utils/twtstr.nim | 163 |
1 files changed, 100 insertions, 63 deletions
diff --git a/src/utils/twtstr.nim b/src/utils/twtstr.nim index 582d68d4..09422d0f 100644 --- a/src/utils/twtstr.nim +++ b/src/utils/twtstr.nim @@ -820,7 +820,7 @@ iterator split*(s: seq[Rune], sep: Rune): seq[Rune] = # sorted list of non-overlapping intervals of non-spacing characters generated # by "uniset +cat=Me +cat=Mn +cat=Cf -00AD +1160-11FF +200B c" -const combining = [ +const Combining = [ ( 0x0300, 0x036F ), ( 0x0483, 0x0486 ), ( 0x0488, 0x0489 ), ( 0x0591, 0x05BD ), ( 0x05BF, 0x05BF ), ( 0x05C1, 0x05C2 ), ( 0x05C4, 0x05C5 ), ( 0x05C7, 0x05C7 ), ( 0x0600, 0x0603 ), @@ -871,26 +871,27 @@ const combining = [ ( 0xE0100, 0xE01EF ) ] -func is_dwidth(r: Rune): bool = - let ucs = int(r) - return (ucs >= 0x1100 and - (ucs <= 0x115f or # Hangul Jamo init. consonants - ucs == 0x2329 or ucs == 0x232a or - (ucs >= 0x2e80 and ucs <= 0xa4cf and - ucs != 0x303f) or # CJK ... Yi - (ucs >= 0xac00 and ucs <= 0xd7a3) or # Hangul Syllables - (ucs >= 0xf900 and ucs <= 0xfaff) or # CJK Compatibility Ideographs - (ucs >= 0xfe10 and ucs <= 0xfe19) or # Vertical forms - (ucs >= 0xfe30 and ucs <= 0xfe6f) or # CJK Compatibility Forms - (ucs >= 0xff00 and ucs <= 0xff60) or # Fullwidth Forms - (ucs >= 0xffe0 and ucs <= 0xffe6) or - (ucs >= 0x20000 and ucs <= 0x2fffd) or - (ucs >= 0x30000 and ucs <= 0x3fffd))) +func isDoubleWidthLow(r: Rune): bool = + let ucs = cast[uint32](r) + return ucs in 0x1100u32..0x115Fu32 or # Hangul Jamo init. consonants + ucs == 0x2329 or ucs == 0x232A or + ucs in 0x2E80u32..0xA4CFu32 and ucs != 0x303F or # CJK ... Yi + ucs in 0xAC00u32..0xD7A3u32 or # Hangul Syllables + ucs in 0xF900u32..0xFAFFu32 or # CJK Compatibility Ideographs + ucs in 0xFE10u32..0xFE19u32 or # Vertical forms + ucs in 0xFE30u32..0xFE6Fu32 or # CJK Compatibility Forms + ucs in 0xFF00u32..0xFF60u32 or # Fullwidth Forms + ucs in 0xFFE0u32..0xFFE6u32 + +func isDoubleWidthHigh(r: Rune): bool = + let ucs = cast[uint32](r) + return (ucs in 0x20000u32..0x2FFFDu32) or + (ucs in 0x30000u32..0x3FFFDu32) # sorted list of non-overlapping intervals of East Asian Ambiguous characters, # generated by "uniset +WIDTH-A -cat=Me -cat=Mn -cat=Cf c" -const ambiguous = [ +const Ambiguous = [ ( 0x00A1, 0x00A1 ), ( 0x00A4, 0x00A4 ), ( 0x00A7, 0x00A8 ), ( 0x00AA, 0x00AA ), ( 0x00AE, 0x00AE ), ( 0x00B0, 0x00B4 ), ( 0x00B6, 0x00BA ), ( 0x00BC, 0x00BF ), ( 0x00C6, 0x00C6 ), @@ -953,60 +954,96 @@ const ambiguous = [ # to UCS without changing the traditional terminal character-width behaviour. # It is not otherwise recommended for general use. -func is_dwidth_cjk(r: Rune): bool = +func isDoubleWidthAmbiguousHigh(r: Rune): bool = # binary search in table of non-spacing characters - if binarySearch(ambiguous, int32(r), (x, y) => (if x[0] < y: -1 elif x[1] > y: 1 else: 0)) != -1: + if binarySearch(Ambiguous, int32(r), (x, y) => (if x[0] < y: -1 elif x[1] > y: 1 else: 0)) != -1: return true + return r.isDoubleWidthHigh() + +func isCombining(r: Rune): bool = + return binarySearch(Combining, int32(r), (x, y) => (if x[0] < y: -1 elif x[1] > y: 1 else: 0)) != -1 + +# Lookup tables for characters on the BMP. This "only" takes up 8k of space +# per table, as opposed to the 135k that storing all characters would require. +# The downside is obviously that we need a binary search fallback for non-bmp. +type PropertyTable = array[0..(0xFFFF div 8), uint8] + +func makePropertyTable(crit: proc(r: Rune): bool {.noSideEffect.}, + skip: openarray[(int, int)] = @[]): PropertyTable {.noInit.} = + var ucs = 0 + var j = 0 + while ucs <= 0xFFFF: + if j != skip.len and ucs == skip[j][0]: + ucs = skip[j][1] + 1 + continue + if crit(cast[Rune](ucs)): + let i = ucs div 8 + case ucs mod 8 + of 0: result[i] = result[i] or 0x01 + of 1: result[i] = result[i] or 0x02 + of 2: result[i] = result[i] or 0x04 + of 3: result[i] = result[i] or 0x08 + of 4: result[i] = result[i] or 0x10 + of 5: result[i] = result[i] or 0x20 + of 6: result[i] = result[i] or 0x40 + of 7: result[i] = result[i] or 0x80 + else: discard + inc ucs - return r.is_dwidth() - -# compute lookup table on startup -var width_table*: array[0..0x10FFFF, byte] - -# Note: control chars return a width of 2, as we display them as ^{letter}. -func makewidthtable*(cjk: bool): array[0..0x10FFFF, byte] {.noInit.} = - for r in low(char)..high(char): - if r in Controls: - result[int(r)] = 2 - else: - result[int(r)] = 1 - - var i = 0 - var next_combining = combining[i] - if cjk: - for ucs in 256..0x10FFFF: - if ucs >= next_combining[0]: - if ucs <= next_combining[1]: - result[ucs] = 0 - continue - elif i + 1 < combining.len: - inc i - next_combining = combining[i] - - if Rune(ucs).is_dwidth_cjk(): - result[ucs] = 2 - else: - result[ucs] = 1 - else: - for ucs in 256..0x10FFFF: - if ucs >= next_combining[0]: - if ucs <= next_combining[1]: - result[ucs] = 0 - continue - elif i + 1 < combining.len: - inc i - next_combining = combining[i] +# Control chars return a width of 2, and are displayed as ^{letter}. +const DoubleWidthTable = makePropertyTable((func(r: Rune): bool = + r.isAscii() and cast[char](r) in Controls or r.isDoubleWidthLow() +), Combining) +const CombiningTable = makePropertyTable(func(r: Rune): bool = + return binarySearch(Combining, int32(r), (x, y) => (if x[0] < y: -1 elif x[1] > y: 1 else: 0)) != -1 +) - if Rune(ucs).is_dwidth(): - result[ucs] = 2 - else: - result[ucs] = 1 +# One of the few global variables in the code. Honestly, it should not exist. +var is_cjk_ambiguous = false +proc set_cjk_ambiguous*(b: bool) = + is_cjk_ambiguous = b {.push boundChecks:off.} +func contains(props: PropertyTable, r: Rune): bool = + let i = cast[uint32](r) div 8 + case cast[uint32](r) mod 8 + of 0: return (props[i] and 0x01) != 0 + of 1: return (props[i] and 0x02) != 0 + of 2: return (props[i] and 0x04) != 0 + of 3: return (props[i] and 0x08) != 0 + of 4: return (props[i] and 0x10) != 0 + of 5: return (props[i] and 0x20) != 0 + of 6: return (props[i] and 0x40) != 0 + of 7: return (props[i] and 0x80) != 0 + else: discard +{.pop.} + +# Warning: this shouldn't be called without normalization. +# We could make this function more efficient in edge cases, but it's already +# too complex for my taste. func width*(r: Rune): int = {.cast(noSideEffect).}: - return int(width_table[int(r)]) -{.pop.} + if cast[uint32](r) <= 0xFFFF: + if r in CombiningTable: + return 0 + if not is_cjk_ambiguous: + if r in DoubleWidthTable: + return 2 + else: + if r in DoubleWidthTable or + binarySearch(Ambiguous, int32(r), + (x, y) => (if x[0] < y: -1 elif x[1] > y: 1 else: 0)) != -1: + return 2 + else: + if r.isCombining(): + return 0 + if not is_cjk_ambiguous: + if r.isDoubleWidthHigh(): + return 2 + else: + if r.isDoubleWidthAmbiguousHigh(): + return 2 + return 1 # Width, but also works with tabs. # Needs the column width of the text so far. |