...

author: bptato <nincsnevem662@gmail.com> 2021-03-17 12:20:05 +0100
committer: bptato <nincsnevem662@gmail.com> 2021-03-17 12:20:05 +0100
commit: 94a10242dca6181ef8f15a37e7083069ead09559 (patch)
tree: ccdb28c82535aa7cda25f2bfe26cbfa451eaecab /src/twtstr.nim
parent: 97f19da347b27a4d12f54784fa8bcbf304aa4fea (diff)
download: chawan-94a10242dca6181ef8f15a37e7083069ead09559.tar.gz
1 files changed, 96 insertions, 50 deletions
diff --git a/src/twtstr.nim b/src/twtstr.nim
index 52db36cf..aa2cf2c7 100644
--- a/src/twtstr.nim
+++ b/src/twtstr.nim
@@ -159,6 +159,11 @@ func decValue*(r: Rune): int =
     return decValue(char(r))
   return -1
 
+func toAsciiLower*(s: seq[Rune]): string =
+  ## ASCII runes of s mapped through lowerChars; other runes are skipped.
+  for rune in s:
+    if rune.isAscii(): result.add(lowerChars[int(rune)])
+
 func breaksWord*(r: Rune): bool =
   return r in breakWord
 
@@ -168,12 +173,6 @@ func isAlphaAscii*(r: Rune): bool =
 func isDigitAscii*(r: Rune): bool =
   return isAscii(r) and isDigit(char(r))
 
-func isNameStartCodePoint*(r: Rune): bool =
-  return not isAscii(r) or r == Rune('_') or isAlphaAscii(r)
-
-func isNameCodePoint*(r: Rune): bool =
-  return isNameStartCodePoint(r) or isDigitAscii(r) or r == Rune('-')
-
 func substr*(s: seq[Rune], i: int, j: int): seq[Rune] =
   if s.len == 0:
     return @[]
@@ -210,33 +209,33 @@ func bisearch(ucs: Rune, table: openarray[(int, int)]): bool =
 #
 #   - The null character (U+0000) has a column width of 0.
 #
-#   - Other C0/C1 control characters and DEL will lead to a return
-#     value of 2.
+#   - Other C0/C1 control characters and DEL will lead to a return value of 2
+#     (changed from 0 b/c we normally display control chars like ^H - TODO?).
 #
-#   - Non-spacing and enclosing combining characters (general
-#     category code Mn or Me in the Unicode database) have a
-#     column width of 0.
+#   - Non-spacing and enclosing combining characters (general category code Mn
+#     or Me in the Unicode database) have a column width of 0.
 #
 #   - SOFT HYPHEN (U+00AD) has a column width of 1.
 #
 #   - Other format characters (general category code Cf in the Unicode
 #     database) and ZERO WIDTH SPACE (U+200B) have a column width of 0.
 #
-#   - Hangul Jamo medial vowels and final consonants (U+1160-U+11FF)
-#     have a column width of 0.
+#   - Hangul Jamo medial vowels and final consonants (U+1160-U+11FF) have a
+#     column width of 0.
 #
-#   - Spacing characters in the East Asian Wide (W) or East Asian
-#     Full-width (F) category as defined in Unicode Technical
-#     Report #11 have a column width of 2.
+#   - Spacing characters in the East Asian Wide (W) or East Asian Full-width
+#     (F) category as defined in Unicode Technical Report #11 have a column
+#     width of 2.
 #
-#   - All remaining characters (including all printable
-#     ISO 8859-1 and WGL4 characters, Unicode control characters,
-#     etc.) have a column width of 1.
+#   - All remaining characters (including all printable ISO 8859-1 and WGL4
+#     characters, Unicode control characters, etc.) have a column width of 1.
 #
 #This implementation assumes that wchar_t characters are encoded
 #in ISO 10646.
 #
 
+# sorted list of non-overlapping intervals of non-spacing characters
+# generated by "uniset +cat=Me +cat=Mn +cat=Cf -00AD +1160-11FF +200B c"
 const combining = [
   ( 0x0300, 0x036F ), ( 0x0483, 0x0486 ), ( 0x0488, 0x0489 ),
   ( 0x0591, 0x05BD ), ( 0x05BF, 0x05BF ), ( 0x05C1, 0x05C2 ),
@@ -288,21 +287,9 @@ const combining = [
   ( 0xE0100, 0xE01EF )
 ]
 
-func width*(r: Rune): int =
+func is_dwidth(r: Rune): bool =
   let ucs = int(r)
-  # sorted list of non-overlapping intervals of non-spacing characters
-  # generated by "uniset +cat=Me +cat=Mn +cat=Cf -00AD +1160-11FF +200B c"
-
-  # binary search in table of non-spacing characters
-  if bisearch(r, combining):
-    return 0
-
-  if r.isControlChar():
-    return 2
-
-  # if we arrive here, ucs is not a combining or C0/C1 control character
-
-  if (ucs >= 0x1100 and
+  return (ucs >= 0x1100 and
      (ucs <= 0x115f or                    # Hangul Jamo init. consonants
       ucs == 0x2329 or ucs == 0x232a or
       (ucs >= 0x2e80 and ucs <= 0xa4cf and
@@ -314,9 +301,65 @@ func width*(r: Rune): int =
       (ucs >= 0xff00 and ucs <= 0xff60) or # Fullwidth Forms
       (ucs >= 0xffe0 and ucs <= 0xffe6) or
       (ucs >= 0x20000 and ucs <= 0x2fffd) or
-      (ucs >= 0x30000 and ucs <= 0x3fffd))):
-    return 2
-  return 1
+      (ucs >= 0x30000 and ucs <= 0x3fffd)))
+
+func makewidthtable(): array[0..0x10FFFF, byte] =
+  ## Builds the table that maps every code point in 0..0x10FFFF to its column
+  ## width (0, 1 or 2), so that width() can be answered with one array
+  ## lookup.
+  # Single-byte range: 2 columns for whatever isControlChar reports as a
+  # control character, 1 column for everything else.
+  for c in low(char)..high(char):
+    if c.isControlChar():
+      result[int(c)] = 2
+    else:
+      result[int(c)] = 1
+
+  # Remaining code points: 2 columns if is_dwidth says so, otherwise 1.
+  for ucs in 256..0x10FFFF:
+    if Rune(ucs).is_dwidth():
+      result[ucs] = 2
+    else:
+      result[ucs] = 1
+
+  # Non-spacing characters override both rules above, like in the bisearch
+  # based width(). This single pass zeroes every listed interval, so the
+  # loop above does not have to track the current interval itself (and
+  # cannot get out of sync with the list).
+  for interval in combining:
+    for ucs in interval[0]..interval[1]:
+      result[ucs] = 0
+
+
+# lowmem: use slow binary search etc method
+when defined(lowmem):
+  func width*(r: Rune): int =
+    ## Column width of r: 0 for non-spacing characters, 2 for control and
+    ## double width characters, 1 for everything else.
+    # binary search in table of non-spacing characters
+    if bisearch(r, combining):
+      return 0
+
+    if r.isControlChar():
+      return 2
+
+    # if we arrive here, r is not a combining or C0/C1 control character
+    if r.is_dwidth():
+      return 2
+    return 1
+
+# small: store lookup table in memory on startup
+elif defined(small):
+  let width_table = makewidthtable()
+  func width*(r: Rune): int =
+    # width_table is a global let; a func may not read it without the cast
+    {.cast(noSideEffect).}:
+      return int(width_table[int(r)])
+# release: store lookup table in executable
+else:
+  const width_table = makewidthtable()
+  func width*(r: Rune): int =
+    return int(width_table[int(r)])
 
 func width*(s: string): int =
   for r in s.runes():
@@ -326,15 +369,9 @@ func width*(s: seq[Rune]): int =
   for r in s:
     result += width(r)
 
-# 
-# The following functions are the same as mk_wcwidth() and
-# mk_wcswidth(), except that spacing characters in the East Asian
-# Ambiguous (A) category as defined in Unicode Technical Report #11
-# have a column width of 2. This variant might be useful for users of
-# CJK legacy encodings who want to migrate to UCS without changing
-# the traditional terminal character-width behaviour. It is not
-# otherwise recommended for general use.
-# 
+# sorted list of non-overlapping intervals of East Asian Ambiguous
+# characters, generated by "uniset +WIDTH-A -cat=Me -cat=Mn -cat=Cf c"
+
 const ambiguous = [
   ( 0x00A1, 0x00A1 ), ( 0x00A4, 0x00A4 ), ( 0x00A7, 0x00A8 ),
   ( 0x00AA, 0x00AA ), ( 0x00AE, 0x00AE ), ( 0x00B0, 0x00B4 ),
@@ -390,15 +427,24 @@ const ambiguous = [
   ( 0xFFFD, 0xFFFD ), ( 0xF0000, 0xFFFFD ), ( 0x100000, 0x10FFFD )
 ]
 
-func mk_wcwidth_cjk(ucs: Rune): int =
-  # sorted list of non-overlapping intervals of East Asian Ambiguous
-  # characters, generated by "uniset +WIDTH-A -cat=Me -cat=Mn -cat=Cf c"
+# 
+# The following functions are the same as mk_wcwidth() and
+# mk_wcswidth(), except that spacing characters in the East Asian
+# Ambiguous (A) category as defined in Unicode Technical Report #11
+# have a column width of 2. This variant might be useful for users of
+# CJK legacy encodings who want to migrate to UCS without changing
+# the traditional terminal character-width behaviour. It is not
+# otherwise recommended for general use.
+# 
+# note: seconded; this variant should only be used when the user enables it
+# through an option (TODO: make such an option available)
 
+func mk_wcwidth_cjk(r: Rune): int =
   # binary search in table of non-spacing characters
-  if bisearch(ucs, ambiguous):
+  if bisearch(r, ambiguous):
     return 2;
 
-  return width(ucs);
+  return r.width();
 
 func mk_wcswidth_cjk(s: string): int =
   for r in s.runes:
author	bptato <nincsnevem662@gmail.com>	2021-03-17 12:20:05 +0100
committer	bptato <nincsnevem662@gmail.com>	2021-03-17 12:20:05 +0100
commit	94a10242dca6181ef8f15a37e7083069ead09559 (patch)
tree	ccdb28c82535aa7cda25f2bfe26cbfa451eaecab /src/twtstr.nim
parent	97f19da347b27a4d12f54784fa8bcbf304aa4fea (diff)
download	chawan-94a10242dca6181ef8f15a37e7083069ead09559.tar.gz