about summary refs log tree commit diff stats
path: root/src/utils
diff options
context:
space:
mode:
author bptato <nincsnevem662@gmail.com> 2023-01-27 21:30:28 +0100
committer bptato <nincsnevem662@gmail.com> 2023-01-27 21:48:21 +0100
commit 5f88adcc8e742dc18230f40c3361801cf012a8c6 (patch)
tree 4c402261a3349ad3fd1959ef58093d459fe2bccd /src/utils
parent 167dd67270d5a432c584b61f5a22281ca47017d9 (diff)
download chawan-5f88adcc8e742dc18230f40c3361801cf012a8c6.tar.gz
Make width table at compile-time
Diffstat (limited to 'src/utils')
-rw-r--r-- src/utils/twtstr.nim 163
1 files changed, 100 insertions, 63 deletions
diff --git a/src/utils/twtstr.nim b/src/utils/twtstr.nim
index 582d68d4..09422d0f 100644
--- a/src/utils/twtstr.nim
+++ b/src/utils/twtstr.nim
@@ -820,7 +820,7 @@ iterator split*(s: seq[Rune], sep: Rune): seq[Rune] =
 
 # sorted list of non-overlapping intervals of non-spacing characters generated
 # by "uniset +cat=Me +cat=Mn +cat=Cf -00AD +1160-11FF +200B c"
-const combining = [
+const Combining = [
   ( 0x0300, 0x036F ), ( 0x0483, 0x0486 ), ( 0x0488, 0x0489 ),
   ( 0x0591, 0x05BD ), ( 0x05BF, 0x05BF ), ( 0x05C1, 0x05C2 ),
   ( 0x05C4, 0x05C5 ), ( 0x05C7, 0x05C7 ), ( 0x0600, 0x0603 ),
@@ -871,26 +871,27 @@ const combining = [
   ( 0xE0100, 0xE01EF )
 ]
 
-func is_dwidth(r: Rune): bool =
-  let ucs = int(r)
-  return (ucs >= 0x1100 and
-     (ucs <= 0x115f or                    # Hangul Jamo init. consonants
-      ucs == 0x2329 or ucs == 0x232a or
-      (ucs >= 0x2e80 and ucs <= 0xa4cf and
-       ucs != 0x303f) or                  # CJK ... Yi
-      (ucs >= 0xac00 and ucs <= 0xd7a3) or # Hangul Syllables
-      (ucs >= 0xf900 and ucs <= 0xfaff) or # CJK Compatibility Ideographs
-      (ucs >= 0xfe10 and ucs <= 0xfe19) or # Vertical forms
-      (ucs >= 0xfe30 and ucs <= 0xfe6f) or # CJK Compatibility Forms
-      (ucs >= 0xff00 and ucs <= 0xff60) or # Fullwidth Forms
-      (ucs >= 0xffe0 and ucs <= 0xffe6) or
-      (ucs >= 0x20000 and ucs <= 0x2fffd) or
-      (ucs >= 0x30000 and ucs <= 0x3fffd)))
+func isDoubleWidthLow(r: Rune): bool =
+  ## Tests `r` against the fixed BMP ranges below; U+303F is left out.
+  let cp = cast[uint32](r)
+  cp in 0x1100u32..0x115Fu32 or # Hangul Jamo init. consonants
+    cp in 0x2329u32..0x232Au32 or
+    cp in 0x2E80u32..0x303Eu32 or cp in 0x3040u32..0xA4CFu32 or # CJK ... Yi
+    cp in 0xAC00u32..0xD7A3u32 or # Hangul Syllables
+    cp in 0xF900u32..0xFAFFu32 or # CJK Compatibility Ideographs
+    cp in 0xFE10u32..0xFE19u32 or # Vertical forms
+    cp in 0xFE30u32..0xFE6Fu32 or # CJK Compatibility Forms
+    cp in 0xFF00u32..0xFF60u32 or cp in 0xFFE0u32..0xFFE6u32 # Fullwidth Forms
+
+func isDoubleWidthHigh(r: Rune): bool =
+  ## Whether `r` is in U+20000..U+2FFFD or U+30000..U+3FFFD.
+  let cp = cast[uint32](r)
+  cp in 0x20000u32..0x2FFFDu32 or cp in 0x30000u32..0x3FFFDu32
 
 # sorted list of non-overlapping intervals of East Asian Ambiguous characters,
 # generated by "uniset +WIDTH-A -cat=Me -cat=Mn -cat=Cf c"
 
-const ambiguous = [
+const Ambiguous = [
   ( 0x00A1, 0x00A1 ), ( 0x00A4, 0x00A4 ), ( 0x00A7, 0x00A8 ),
   ( 0x00AA, 0x00AA ), ( 0x00AE, 0x00AE ), ( 0x00B0, 0x00B4 ),
   ( 0x00B6, 0x00BA ), ( 0x00BC, 0x00BF ), ( 0x00C6, 0x00C6 ),
@@ -953,60 +954,96 @@ const ambiguous = [
 # to UCS without changing the traditional terminal character-width behaviour.
 # It is not otherwise recommended for general use.
 
-func is_dwidth_cjk(r: Rune): bool =
+func isDoubleWidthAmbiguousHigh(r: Rune): bool =
   # binary search Ambiguous; an interval compares as 0 iff it contains r
-  if binarySearch(ambiguous, int32(r), (x, y) => (if x[0] < y: -1 elif x[1] > y: 1 else: 0)) != -1:
+  if binarySearch(Ambiguous, int32(r), (x, y) => (if x[1] < y: -1 elif x[0] > y: 1 else: 0)) != -1:
     return true
+  return r.isDoubleWidthHigh()
+
+func isCombining(r: Rune): bool = # cmp: 0 iff the interval contains r, else -1/1
+  return binarySearch(Combining, int32(r), (x, y) => (if x[1] < y: -1 elif x[0] > y: 1 else: 0)) != -1
+
+# Lookup tables for characters on the BMP. Each table "only" takes up 8k of
+# space, as opposed to the 135k that storing all characters would require.
+# The downside is that we need a binary search fallback for non-BMP characters.
+type PropertyTable = array[0..(0xFFFF div 8), uint8]
+
+func makePropertyTable(crit: proc(r: Rune): bool {.noSideEffect.},
+    skip: openarray[(int, int)] = @[]): PropertyTable =
+  ## Builds a bitmap with one bit per BMP code point (U+0000..U+FFFF); bit
+  ## `ucs mod 8` of byte `ucs div 8` is set when `crit` accepts that rune.
+  ## Code points inside any `skip` interval are never tested and stay clear;
+  ## `skip` must be sorted and non-overlapping, with inclusive bounds.
+  ## `result` is zero-initialized (no `noInit`): bits are OR-ed into it.
+  var ucs = 0
+  var j = 0 # index of the first skip interval not yet passed
+  while ucs <= 0xFFFF:
+    # Advance past intervals that end before ucs, so that every interval is
+    # honoured rather than only the first one.
+    while j < skip.len and skip[j][1] < ucs:
+      inc j
+    if j < skip.len and skip[j][0] <= ucs:
+      ucs = skip[j][1] + 1 # resume right after the skipped interval
+      continue
+    if crit(Rune(ucs)):
+      # Set the bit for this code point within its byte.
+      result[ucs div 8] = result[ucs div 8] or (1u8 shl (ucs mod 8))
+    inc ucs
 
-  return r.is_dwidth()
-
-# compute lookup table on startup
-var width_table*: array[0..0x10FFFF, byte]
-
-# Note: control chars return a width of 2, as we display them as ^{letter}.
-func makewidthtable*(cjk: bool): array[0..0x10FFFF, byte] {.noInit.} =
-  for r in low(char)..high(char):
-    if r in Controls:
-      result[int(r)] = 2
-    else:
-      result[int(r)] = 1
-
-  var i = 0
-  var next_combining = combining[i]
-  if cjk:
-    for ucs in 256..0x10FFFF:
-      if ucs >= next_combining[0]:
-        if ucs <= next_combining[1]:
-          result[ucs] = 0
-          continue
-        elif i + 1 < combining.len:
-          inc i
-          next_combining = combining[i]
-
-      if Rune(ucs).is_dwidth_cjk():
-        result[ucs] = 2
-      else:
-        result[ucs] = 1
-  else:
-    for ucs in 256..0x10FFFF:
-      if ucs >= next_combining[0]:
-        if ucs <= next_combining[1]:
-          result[ucs] = 0
-          continue
-        elif i + 1 < combining.len:
-          inc i
-          next_combining = combining[i]
+# Control chars return a width of 2, and are displayed as ^{letter}.
+const DoubleWidthTable = makePropertyTable((func(r: Rune): bool =
+  r.isAscii() and cast[char](r) in Controls or r.isDoubleWidthLow()
+), Combining)
+const CombiningTable = makePropertyTable(func(r: Rune): bool =
+  return binarySearch(Combining, int32(r), (x, y) => (if x[1] < y: -1 elif x[0] > y: 1 else: 0)) != -1
+) # bit set iff a Combining interval contains r (cmp is 0 only in that case)
 
-      if Rune(ucs).is_dwidth():
-        result[ucs] = 2
-      else:
-        result[ucs] = 1
+# Process-wide switch read by `width`: when set, East Asian Ambiguous runes
+# count as double width. It is a mutable global; ideally it would not exist.
+var is_cjk_ambiguous = false
+proc set_cjk_ambiguous*(b: bool) = is_cjk_ambiguous = b
 
 {.push boundChecks:off.}
+func contains(props: PropertyTable, r: Rune): bool =
+  ## Tests whether the bit for `r` is set in `props`, so that `r in table`
+  ## works. Bit `u mod 8` of byte `u div 8` belongs to code point `u`.
+  ##
+  ## The table has only 0x2000 bytes; callers must pass a rune no greater
+  ## than U+FFFF. The surrounding `boundChecks:off` push means a larger
+  ## value is not caught here.
+  let
+    u = cast[uint32](r)
+    byteIdx = u div 8
+    mask = 1u8 shl (u mod 8)
+  return (props[byteIdx] and mask) != 0
+{.pop.}
+
+# Warning: this shouldn't be called without normalization.
+# We could make this function more efficient in edge cases, but it's already
+# too complex for my taste.
 func width*(r: Rune): int =
   {.cast(noSideEffect).}:
-    return int(width_table[int(r)])
-{.pop.}
+    # 0 = combining, 2 = double width, 1 = anything else. The BMP uses the
+    # bitmaps; higher planes fall back to interval searches.
+    if cast[uint32](r) <= 0xFFFF:
+      if r in CombiningTable:
+        return 0
+      if r in DoubleWidthTable:
+        return 2
+      # An interval compares as 0 only when it contains r; -1/1 mean it lies
+      # entirely below/above r, which is what binarySearch expects.
+      if is_cjk_ambiguous and binarySearch(Ambiguous, int32(r),
+          (x, y) => (if x[1] < y: -1 elif x[0] > y: 1 else: 0)) != -1:
+        return 2
+    else:
+      if r.isCombining():
+        return 0
+      if is_cjk_ambiguous:
+        if r.isDoubleWidthAmbiguousHigh():
+          return 2
+      elif r.isDoubleWidthHigh():
+        return 2
+    return 1
 
 # Width, but also works with tabs.
 # Needs the column width of the text so far.