diff options
author | bptato <nincsnevem662@gmail.com> | 2021-03-17 12:20:05 +0100 |
---|---|---|
committer | bptato <nincsnevem662@gmail.com> | 2021-03-17 12:20:05 +0100 |
commit | 94a10242dca6181ef8f15a37e7083069ead09559 (patch) | |
tree | ccdb28c82535aa7cda25f2bfe26cbfa451eaecab /src/twtstr.nim | |
parent | 97f19da347b27a4d12f54784fa8bcbf304aa4fea (diff) | |
download | chawan-94a10242dca6181ef8f15a37e7083069ead09559.tar.gz |
...
Diffstat (limited to 'src/twtstr.nim')
-rw-r--r-- | src/twtstr.nim | 146 |
1 files changed, 96 insertions, 50 deletions
diff --git a/src/twtstr.nim b/src/twtstr.nim index 52db36cf..aa2cf2c7 100644 --- a/src/twtstr.nim +++ b/src/twtstr.nim @@ -159,6 +159,11 @@ func decValue*(r: Rune): int = return decValue(char(r)) return -1 +func toAsciiLower*(s: seq[Rune]): string = + for r in s: + if isAscii(r): + result &= lowerChars[int(r)] + func breaksWord*(r: Rune): bool = return r in breakWord @@ -168,12 +173,6 @@ func isAlphaAscii*(r: Rune): bool = func isDigitAscii*(r: Rune): bool = return isAscii(r) and isDigit(char(r)) -func isNameStartCodePoint*(r: Rune): bool = - return not isAscii(r) or r == Rune('_') or isAlphaAscii(r) - -func isNameCodePoint*(r: Rune): bool = - return isNameStartCodePoint(r) or isDigitAscii(r) or r == Rune('-') - func substr*(s: seq[Rune], i: int, j: int): seq[Rune] = if s.len == 0: return @[] @@ -210,33 +209,33 @@ func bisearch(ucs: Rune, table: openarray[(int, int)]): bool = # # - The null character (U+0000) has a column width of 0. # -# - Other C0/C1 control characters and DEL will lead to a return -# value of 2. +# - Other C0/C1 control characters and DEL will lead to a return value of 2 +# (changed from 0 b/c we normally display control chars like ^H - TODO?). # -# - Non-spacing and enclosing combining characters (general -# category code Mn or Me in the Unicode database) have a -# column width of 0. +# - Non-spacing and enclosing combining characters (general category code Mn +# or Me in the Unicode database) have a column width of 0. # # - SOFT HYPHEN (U+00AD) has a column width of 1. # # - Other format characters (general category code Cf in the Unicode # database) and ZERO WIDTH SPACE (U+200B) have a column width of 0. # -# - Hangul Jamo medial vowels and final consonants (U+1160-U+11FF) -# have a column width of 0. +# - Hangul Jamo medial vowels and final consonants (U+1160-U+11FF) have a +# column width of 0. # -# - Spacing characters in the East Asian Wide (W) or East Asian -# Full-width (F) category as defined in Unicode Technical -# Report #11 have a column width of 2. +# - Spacing characters in the East Asian Wide (W) or East Asian Full-width +# (F) category as defined in Unicode Technical Report #11 have a column +# width of 2. # -# - All remaining characters (including all printable -# ISO 8859-1 and WGL4 characters, Unicode control characters, -# etc.) have a column width of 1. +# - All remaining characters (including all printable ISO 8859-1 and WGL4 +# characters, Unicode control characters, etc.) have a column width of 1. # #This implementation assumes that wchar_t characters are encoded #in ISO 10646. # +# sorted list of non-overlapping intervals of non-spacing characters +# generated by "uniset +cat=Me +cat=Mn +cat=Cf -00AD +1160-11FF +200B c" const combining = [ ( 0x0300, 0x036F ), ( 0x0483, 0x0486 ), ( 0x0488, 0x0489 ), ( 0x0591, 0x05BD ), ( 0x05BF, 0x05BF ), ( 0x05C1, 0x05C2 ), @@ -288,21 +287,9 @@ const combining = [ ( 0xE0100, 0xE01EF ) ] -func width*(r: Rune): int = +func is_dwidth(r: Rune): bool = let ucs = int(r) - # sorted list of non-overlapping intervals of non-spacing characters - # generated by "uniset +cat=Me +cat=Mn +cat=Cf -00AD +1160-11FF +200B c" - - # binary search in table of non-spacing characters - if bisearch(r, combining): - return 0 - - if r.isControlChar(): - return 2 - - # if we arrive here, ucs is not a combining or C0/C1 control character - - if (ucs >= 0x1100 and + return (ucs >= 0x1100 and (ucs <= 0x115f or # Hangul Jamo init. consonants ucs == 0x2329 or ucs == 0x232a or (ucs >= 0x2e80 and ucs <= 0xa4cf and @@ -314,9 +301,65 @@ func width*(r: Rune): int = (ucs >= 0xff00 and ucs <= 0xff60) or # Fullwidth Forms (ucs >= 0xffe0 and ucs <= 0xffe6) or (ucs >= 0x20000 and ucs <= 0x2fffd) or - (ucs >= 0x30000 and ucs <= 0x3fffd))): - return 2 - return 1 + (ucs >= 0x30000 and ucs <= 0x3fffd))) + +func makewidthtable(): array[0..0x10FFFF, byte] = + for r in low(char)..high(char): + if r.isControlChar(): + result[int(r)] = 2 + else: + result[int(r)] = 1 + + var i = 0 + var next_combining = combining[i] + for ucs in 256..0x10FFFF: + if ucs >= next_combining[0]: + if ucs <= next_combining[1]: + result[ucs] = 0 + continue + elif i + 1 < combining.len: + inc i + next_combining = combining[i] + + if Rune(ucs).is_dwidth(): + result[ucs] = 2 + else: + result[ucs] = 1 + + for range in combining: + for r in range[0]..range[1]: + result[r] = 0 + + +# lowmem: use slow binary search etc method +when defined(lowmem): + func width*(r: Rune): int = + # binary search in table of non-spacing characters + if bisearch(r, combining): + return 0 + + if r.isControlChar(): + return 2 + + # if we arrive here, ucs is not a combining or C0/C1 control character + + if r.is_dwidth(): + return 2 + return 1 + + func width*(r: Rune): int = + return int(width_table[int(r)]) +# small: store lookup table in memory on startup +elif defined(small): + let width_table = makewidthtable() + func width*(r: Rune): int = + {.cast(noSideEffect).}: + return int(width_table[int(r)]) +# release: store lookup table in executable +else: + const width_table = makewidthtable() + func width*(r: Rune): int = + return int(width_table[int(r)]) func width*(s: string): int = for r in s.runes(): @@ -326,15 +369,9 @@ func width*(s: seq[Rune]): int = for r in s: result += width(r) -# -# The following functions are the same as mk_wcwidth() and -# mk_wcswidth(), except that spacing characters in the East Asian -# Ambiguous (A) category as defined in Unicode Technical Report #11 -# have a column width of 2. This variant might be useful for users of -# CJK legacy encodings who want to migrate to UCS without changing -# the traditional terminal character-width behaviour. It is not -# otherwise recommended for general use. -# +# sorted list of non-overlapping intervals of East Asian Ambiguous +# characters, generated by "uniset +WIDTH-A -cat=Me -cat=Mn -cat=Cf c" + const ambiguous = [ ( 0x00A1, 0x00A1 ), ( 0x00A4, 0x00A4 ), ( 0x00A7, 0x00A8 ), ( 0x00AA, 0x00AA ), ( 0x00AE, 0x00AE ), ( 0x00B0, 0x00B4 ), @@ -390,15 +427,24 @@ const ambiguous = [ ( 0xFFFD, 0xFFFD ), ( 0xF0000, 0xFFFFD ), ( 0x100000, 0x10FFFD ) ] -func mk_wcwidth_cjk(ucs: Rune): int = - # sorted list of non-overlapping intervals of East Asian Ambiguous - # characters, generated by "uniset +WIDTH-A -cat=Me -cat=Mn -cat=Cf c" +# +# The following functions are the same as mk_wcwidth() and +# mk_wcswidth(), except that spacing characters in the East Asian +# Ambiguous (A) category as defined in Unicode Technical Report #11 +# have a column width of 2. This variant might be useful for users of +# CJK legacy encodings who want to migrate to UCS without changing +# the traditional terminal character-width behaviour. It is not +# otherwise recommended for general use. +# +# note: seconded, this should only be used if some option was changed (TODO: +# make such an option available) +func mk_wcwidth_cjk(r: Rune): int = # binary search in table of non-spacing characters - if bisearch(ucs, ambiguous): + if bisearch(r, ambiguous): return 2; - return width(ucs); + return r.width(); func mk_wcswidth_cjk(s: string): int = for r in s.runes: |