about summary refs log tree commit diff stats
path: root/src
diff options
authorbptato <nincsnevem662@gmail.com>2023-12-13 13:29:42 +0100
committerbptato <nincsnevem662@gmail.com>2023-12-13 13:29:42 +0100
commitdf40fcdde896f636a05d1e3fe598feb2a816f2b9 (patch)
treecc8ffa2729d5e3b078782047d87bdb9019e349e9 /src
parentab203acf554993d15e37604773f160c84b4d8252 (diff)
break up twtstr somewhat
Avoid computing e.g. charwidth data for http which does not need it
at all.
Diffstat (limited to 'src')
18 files changed, 397 insertions, 380 deletions
diff --git a/src/display/lineedit.nim b/src/display/lineedit.nim
index 6dccdded..5edb31d0 100644
--- a/src/display/lineedit.nim
+++ b/src/display/lineedit.nim
@@ -7,6 +7,7 @@ import display/winattrs
 import js/javascript
 import types/cell
 import types/opt
+import utils/strwidth
 import utils/twtstr
 import chakasu/charset
diff --git a/src/display/term.nim b/src/display/term.nim
index 3a3a7773..05d54f98 100644
--- a/src/display/term.nim
+++ b/src/display/term.nim
@@ -14,6 +14,7 @@ import display/winattrs
 import types/cell
 import types/color
 import types/opt
+import utils/strwidth
 import utils/twtstr
 import chakasu/charset
diff --git a/src/html/dom.nim b/src/html/dom.nim
index 70dc7a84..aa6a3d0e 100644
--- a/src/html/dom.nim
+++ b/src/html/dom.nim
@@ -35,6 +35,7 @@ import types/referer
 import types/url
 import types/vector
 import utils/mimeguess
+import utils/strwidth
 import utils/twtstr
 import chakasu/charset
diff --git a/src/layout/engine.nim b/src/layout/engine.nim
index 3219f06d..637285a6 100644
--- a/src/layout/engine.nim
+++ b/src/layout/engine.nim
@@ -8,6 +8,7 @@ import css/values
 import display/winattrs
 import layout/box
 import layout/layoutunit
+import utils/strwidth
 import utils/twtstr
diff --git a/src/local/container.nim b/src/local/container.nim
index 355549cd..09ec5a37 100644
--- a/src/local/container.nim
+++ b/src/local/container.nim
@@ -24,6 +24,7 @@ import types/color
 import types/cookie
 import types/url
 import utils/mimeguess
+import utils/strwidth
 import utils/twtstr
 import chakasu/charset
diff --git a/src/local/pager.nim b/src/local/pager.nim
index 6977a4eb..a6a4d77b 100644
--- a/src/local/pager.nim
+++ b/src/local/pager.nim
@@ -39,6 +39,7 @@ import types/cookie
 import types/opt
 import types/urimethodmap
 import types/url
+import utils/strwidth
 import utils/twtstr
 import chakasu/charset
diff --git a/src/local/select.nim b/src/local/select.nim
index ed543638..95c89117 100644
--- a/src/local/select.nim
+++ b/src/local/select.nim
@@ -3,6 +3,7 @@ import unicode
 import js/regex
 import server/buffer
 import types/cell
+import utils/strwidth
 import utils/twtstr
diff --git a/src/main.nim b/src/main.nim
index b3742904..0d04bf3f 100644
--- a/src/main.nim
+++ b/src/main.nim
@@ -16,6 +16,7 @@ import config/config
 import io/serversocket
 import local/client
 import types/opt
+import utils/strwidth
 import utils/twtstr
 import chakasu/charset
diff --git a/src/render/renderdocument.nim b/src/render/renderdocument.nim
index 61eb0d72..f1404c6d 100644
--- a/src/render/renderdocument.nim
+++ b/src/render/renderdocument.nim
@@ -9,7 +9,7 @@ import layout/engine
 import layout/layoutunit
 import types/cell
 import types/color
-import utils/twtstr
+import utils/strwidth
 func toFormat(computed: CSSComputedValues): Format =
   if computed == nil:
diff --git a/src/render/rendertext.nim b/src/render/rendertext.nim
index 012e01e7..39380ee0 100644
--- a/src/render/rendertext.nim
+++ b/src/render/rendertext.nim
@@ -2,6 +2,7 @@ import streams
 import unicode
 import types/cell
+import utils/strwidth
 import utils/twtstr
 import chakasu/charset
diff --git a/src/server/buffer.nim b/src/server/buffer.nim
index 77404c7c..4138f1e3 100644
--- a/src/server/buffer.nim
+++ b/src/server/buffer.nim
@@ -50,6 +50,7 @@ import types/formdata
 import types/opt
 import types/referer
 import types/url
+import utils/strwidth
 import utils/twtstr
 import xhr/formdata as formdata_impl
diff --git a/src/server/forkserver.nim b/src/server/forkserver.nim
index 5563817b..7da7d3f6 100644
--- a/src/server/forkserver.nim
+++ b/src/server/forkserver.nim
@@ -18,7 +18,7 @@ import types/buffersource
 import types/cookie
 import types/urimethodmap
 import types/url
-import utils/twtstr
+import utils/strwidth
   ForkCommand* = enum
diff --git a/src/types/cell.nim b/src/types/cell.nim
index 3a8967ee..a0ba3459 100644
--- a/src/types/cell.nim
+++ b/src/types/cell.nim
@@ -3,6 +3,7 @@ import tables
 import css/stylednode
 import types/color
+import utils/strwidth
 import utils/twtstr
diff --git a/src/types/url.nim b/src/types/url.nim
index e115d54c..d5fbed9e 100644
--- a/src/types/url.nim
+++ b/src/types/url.nim
@@ -1,12 +1,16 @@
 # See https://url.spec.whatwg.org/#url-parsing.
-import strutils
-import tables
-import options
-import unicode
-import math
+import std/algorithm
+import std/math
+import std/options
+import std/strutils
+import std/tables
+import std/unicode
+import bindings/libunicode
+import data/idna
 import js/error
 import js/javascript
+import lib/punycode
 import types/blob
 import utils/twtstr
@@ -240,6 +244,106 @@ func endsInNumber(input: string): bool =
     return true
   return false
+type u32pair {.packed.} = object
+  a: uint32
+  b: uint32
+# Comparator for binarySearch over the intervals of a libunicode CharRange.
+# Follows the system.cmp convention expected by std/algorithm.binarySearch:
+# -1 when interval x lies entirely below y, 1 when it lies entirely above y,
+# and 0 when x contains y.
+# The previous version compared the wrong ends (x.a < y / x.b > y) and so
+# could never return 0 for a non-empty interval.
+# NOTE(review): assumes libunicode stores intervals as [a, b), i.e. with an
+# exclusive end point; if the end is inclusive, change `<=` to `<`.
+func cmpRange(x: u32pair, y: uint32): int =
+  if x.b <= y:
+    return -1
+  elif x.a > y:
+    return 1
+  return 0
+func processIdna(str: string, checkhyphens, checkbidi, checkjoiners,
+    transitionalprocessing: bool): Option[string] =
+  var mapped: seq[Rune]
+  for r in str.runes():
+    let status = getIdnaTableStatus(r)
+    case status
+    of IDNA_DISALLOWED: return none(string) #error
+    of IDNA_IGNORED: discard
+    of IDNA_MAPPED: mapped &= getIdnaMapped(r).toRunes()
+      if transitionalprocessing:
+        mapped &= getDeviationMapped(r).toRunes()
+      else:
+        mapped &= r
+    of IDNA_VALID: mapped &= r
+  if mapped.len == 0: return
+  mapped.mnormalize()
+  var cr: CharRange
+  {.cast(noSideEffect).}:
+    cr_init(addr cr, nil, passRealloc)
+    let r = unicode_general_category(addr cr, "Mark")
+    assert r == 0
+  var labels: seq[string]
+  for label in ($mapped).split('.'):
+    if label.startsWith("xn--"):
+      try:
+        let s = punycode.decode(label.substr("xn--".len))
+        let x0 = s.toRunes()
+        let x1 = normalize(x0)
+        if x0 != x1:
+          return none(string) #error
+        if checkhyphens:
+          if s.len >= 4 and s[2] == '-' and s[3] == '-':
+            return none(string) #error
+          if s.len > 0 and s[0] == '-' and s[^1] == '-':
+            return none(string) #error
+        if x0.len > 0:
+          let cps = cast[ptr UncheckedArray[u32pair]](cr.points)
+          let c = cast[uint32](x0[0])
+          if binarySearch(toOpenArray(cps, 0, cr.len div 2 - 1), c, cmpRange) != -1:
+            return none(string) #error
+        for r in x0:
+          if r == Rune('.'):
+            return none(string) #error
+          let status = getIdnaTableStatus(r)
+          case status
+            return none(string) #error
+          of IDNA_DEVIATION:
+            if transitionalprocessing:
+              return none(string) #error
+          of IDNA_VALID: discard
+          #TODO check joiners
+          #TODO check bidi
+        labels.add(s)
+      except PunyError:
+        return none(string) #error
+    else:
+      labels.add(label)
+  cr_free(addr cr)
+  return some(labels.join('.'))
+# Converts a domain name to its ASCII form.
+# The input is first run through processIdna; every resulting label for which
+# isAscii() is false is then punycode-encoded and prefixed with "xn--".
+# With verifydnslength set, each label must be 1..63 runes long and the label
+# lengths must add up to 1..253.
+# Returns none(string) as soon as any step fails.
+func unicodeToAscii(s: string, checkhyphens, checkbidi, checkjoiners,
+    transitionalprocessing, verifydnslength: bool): Option[string] =
+  let mappedDomain = processIdna(s, checkhyphens, checkbidi, checkjoiners,
+    transitionalprocessing)
+  if mappedDomain.isNone:
+    return none(string) #error
+  var encoded: seq[string]
+  var total = 0
+  for part in mappedDomain.get.split('.'):
+    if part.isAscii():
+      encoded.add(part)
+    else:
+      try:
+        encoded.add("xn--" & punycode.encode(part))
+      except PunyError:
+        return none(string) #error
+    if verifydnslength:
+      let n = encoded[^1].runeLen()
+      if n < 1 or n > 63:
+        return none(string)
+      total += n
+  if verifydnslength and (total < 1 or total > 253):
+    return none(string) #error
+  return some(encoded.join('.'))
 func domainToAscii*(domain: string, bestrict = false): Option[string] =
   var needsprocessing = false
   for s in domain.split('.'):
diff --git a/src/utils/charcategory.nim b/src/utils/charcategory.nim
new file mode 100644
index 00000000..84a63a7e
--- /dev/null
+++ b/src/utils/charcategory.nim
@@ -0,0 +1,16 @@
+import std/unicode
+# Character classes expressed as set[char] constants.
+# C0 control characters, 0x00..0x1F.
+const C0Controls* = {chr(0x00)..chr(0x1F)}
+# C0 controls plus DEL (0x7F).
+const Controls* = (C0Controls + {chr(0x7F)})
+# Every 7-bit character, 0x00..0x7F.
+const Ascii* = {chr(0x00)..chr(0x7F)}
+const AsciiUpperAlpha* = {'A'..'Z'}
+const AsciiLowerAlpha* = {'a'..'z'}
+# Upper- and lower-case ASCII letters.
+const AsciiAlpha* = (AsciiUpperAlpha + AsciiLowerAlpha)
+# Every char value with the high bit set, 0x80..0xFF.
+const NonAscii* = {char(0x80)..char(0xFF)}
+const AsciiDigit* = {'0'..'9'}
+# ASCII letters and digits.
+const AsciiAlphaNumeric* = AsciiAlpha + AsciiDigit
+# ASCII digits plus the letters a-f in either case.
+const AsciiHexDigit* = (AsciiDigit + {'a'..'f', 'A'..'F'})
+# Space, line feed, carriage return, tab and form feed.
+const AsciiWhitespace* = {' ', '\n', '\r', '\t', '\f'}
+# True iff r is a member of AsciiDigit ('0'..'9').
+func isDigitAscii*(r: Rune): bool =
+  # Guard the char conversion: only code points below 256 fit into a char.
+  if uint32(r) >= 256:
+    return false
+  return char(r) in AsciiDigit
diff --git a/src/utils/map.nim b/src/utils/map.nim
index 526b9154..2a3515a1 100644
--- a/src/utils/map.nim
+++ b/src/utils/map.nim
@@ -1,15 +1,15 @@
 import algorithm
-func searchInMap*[U, T](a: openarray[(U, T)], u: U): int =
+func searchInMap*[U, T](a: openArray[(U, T)], u: U): int =
   when not (typeof(u) is U):
     if c > cast[typeof(c)](high(U)):
       return -1
   binarySearch(a, u, proc(x: (U, T), y: U): int = cmp(x[0], y))
-func isInMap*[U, T](a: openarray[(U, T)], u: U): bool =
+func isInMap*[U, T](a: openArray[(U, T)], u: U): bool =
   a.searchInMap(u) != -1
-func isInRange*[U](a: openarray[(U, U)], u: U): bool =
+func isInRange*[U](a: openArray[(U, U)], u: U): bool =
   let res = binarySearch(a, u, proc(x: (U, U), y: U): int =
     if x[0] < y:
diff --git a/src/utils/strwidth.nim b/src/utils/strwidth.nim
new file mode 100644
index 00000000..eb4a99d1
--- /dev/null
+++ b/src/utils/strwidth.nim
@@ -0,0 +1,246 @@
+import std/strutils
+import std/unicode
+import data/charwidth
+import js/error
+import types/opt
+import utils/charcategory
+import utils/map
+# Combining chars from https://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
+# The following two functions define the column width of an ISO 10646
+# character as follows:
+#   - The null character (U+0000) has a column width of 0.
+#   - Other C0/C1 control characters and DEL will lead to a return value of 0
+#   - Non-spacing and enclosing combining characters (general category code Mn
+#     or Me in the Unicode database) have a column width of 0.
+#   - SOFT HYPHEN (U+00AD) has a column width of 1.
+#   - Other format characters (general category code Cf in the Unicode
+#     database) and ZERO WIDTH SPACE (U+200B) have a column width of 0.
+#   - Hangul Jamo medial vowels and final consonants (U+1160-U+11FF) have a
+#     column width of 0.
+#   - Spacing characters in the East Asian Wide (W) or East Asian Full-width
+#     (F) category as defined in Unicode Technical Report #11 have a column
+#     width of 2.
+#   - All remaining characters (including all printable ISO 8859-1 and WGL4
+#     characters, Unicode control characters, etc.) have a column width of 1.
+# sorted list of non-overlapping intervals of non-spacing characters generated
+# by "uniset +cat=Me +cat=Mn +cat=Cf -00AD +1160-11FF +200B c"
+const Combining = [
+  (0x0300u32, 0x036Fu32), (0x0483u32, 0x0486u32), (0x0488u32, 0x0489u32),
+  (0x0591u32, 0x05BDu32), (0x05BFu32, 0x05BFu32), (0x05C1u32, 0x05C2u32),
+  (0x05C4u32, 0x05C5u32), (0x05C7u32, 0x05C7u32), (0x0600u32, 0x0603u32),
+  (0x0610u32, 0x0615u32), (0x064Bu32, 0x065Eu32), (0x0670u32, 0x0670u32),
+  (0x06D6u32, 0x06E4u32), (0x06E7u32, 0x06E8u32), (0x06EAu32, 0x06EDu32),
+  (0x070Fu32, 0x070Fu32), (0x0711u32, 0x0711u32), (0x0730u32, 0x074Au32),
+  (0x07A6u32, 0x07B0u32), (0x07EBu32, 0x07F3u32), (0x0901u32, 0x0902u32),
+  (0x093Cu32, 0x093Cu32), (0x0941u32, 0x0948u32), (0x094Du32, 0x094Du32),
+  (0x0951u32, 0x0954u32), (0x0962u32, 0x0963u32), (0x0981u32, 0x0981u32),
+  (0x09BCu32, 0x09BCu32), (0x09C1u32, 0x09C4u32), (0x09CDu32, 0x09CDu32),
+  (0x09E2u32, 0x09E3u32), (0x0A01u32, 0x0A02u32), (0x0A3Cu32, 0x0A3Cu32),
+  (0x0A41u32, 0x0A42u32), (0x0A47u32, 0x0A48u32), (0x0A4Bu32, 0x0A4Du32),
+  (0x0A70u32, 0x0A71u32), (0x0A81u32, 0x0A82u32), (0x0ABCu32, 0x0ABCu32),
+  (0x0AC1u32, 0x0AC5u32), (0x0AC7u32, 0x0AC8u32), (0x0ACDu32, 0x0ACDu32),
+  (0x0AE2u32, 0x0AE3u32), (0x0B01u32, 0x0B01u32), (0x0B3Cu32, 0x0B3Cu32),
+  (0x0B3Fu32, 0x0B3Fu32), (0x0B41u32, 0x0B43u32), (0x0B4Du32, 0x0B4Du32),
+  (0x0B56u32, 0x0B56u32), (0x0B82u32, 0x0B82u32), (0x0BC0u32, 0x0BC0u32),
+  (0x0BCDu32, 0x0BCDu32), (0x0C3Eu32, 0x0C40u32), (0x0C46u32, 0x0C48u32),
+  (0x0C4Au32, 0x0C4Du32), (0x0C55u32, 0x0C56u32), (0x0CBCu32, 0x0CBCu32),
+  (0x0CBFu32, 0x0CBFu32), (0x0CC6u32, 0x0CC6u32), (0x0CCCu32, 0x0CCDu32),
+  (0x0CE2u32, 0x0CE3u32), (0x0D41u32, 0x0D43u32), (0x0D4Du32, 0x0D4Du32),
+  (0x0DCAu32, 0x0DCAu32), (0x0DD2u32, 0x0DD4u32), (0x0DD6u32, 0x0DD6u32),
+  (0x0E31u32, 0x0E31u32), (0x0E34u32, 0x0E3Au32), (0x0E47u32, 0x0E4Eu32),
+  (0x0EB1u32, 0x0EB1u32), (0x0EB4u32, 0x0EB9u32), (0x0EBBu32, 0x0EBCu32),
+  (0x0EC8u32, 0x0ECDu32), (0x0F18u32, 0x0F19u32), (0x0F35u32, 0x0F35u32),
+  (0x0F37u32, 0x0F37u32), (0x0F39u32, 0x0F39u32), (0x0F71u32, 0x0F7Eu32),
+  (0x0F80u32, 0x0F84u32), (0x0F86u32, 0x0F87u32), (0x0F90u32, 0x0F97u32),
+  (0x0F99u32, 0x0FBCu32), (0x0FC6u32, 0x0FC6u32), (0x102Du32, 0x1030u32),
+  (0x1032u32, 0x1032u32), (0x1036u32, 0x1037u32), (0x1039u32, 0x1039u32),
+  (0x1058u32, 0x1059u32), (0x1160u32, 0x11FFu32), (0x135Fu32, 0x135Fu32),
+  (0x1712u32, 0x1714u32), (0x1732u32, 0x1734u32), (0x1752u32, 0x1753u32),
+  (0x1772u32, 0x1773u32), (0x17B4u32, 0x17B5u32), (0x17B7u32, 0x17BDu32),
+  (0x17C6u32, 0x17C6u32), (0x17C9u32, 0x17D3u32), (0x17DDu32, 0x17DDu32),
+  (0x180Bu32, 0x180Du32), (0x18A9u32, 0x18A9u32), (0x1920u32, 0x1922u32),
+  (0x1927u32, 0x1928u32), (0x1932u32, 0x1932u32), (0x1939u32, 0x193Bu32),
+  (0x1A17u32, 0x1A18u32), (0x1B00u32, 0x1B03u32), (0x1B34u32, 0x1B34u32),
+  (0x1B36u32, 0x1B3Au32), (0x1B3Cu32, 0x1B3Cu32), (0x1B42u32, 0x1B42u32),
+  (0x1B6Bu32, 0x1B73u32), (0x1DC0u32, 0x1DCAu32), (0x1DFEu32, 0x1DFFu32),
+  (0x200Bu32, 0x200Fu32), (0x202Au32, 0x202Eu32), (0x2060u32, 0x2063u32),
+  (0x206Au32, 0x206Fu32), (0x20D0u32, 0x20EFu32), (0x302Au32, 0x302Fu32),
+  (0x3099u32, 0x309Au32), (0xA806u32, 0xA806u32), (0xA80Bu32, 0xA80Bu32),
+  (0xA825u32, 0xA826u32), (0xFB1Eu32, 0xFB1Eu32), (0xFE00u32, 0xFE0Fu32),
+  (0xFE20u32, 0xFE23u32), (0xFEFFu32, 0xFEFFu32), (0xFFF9u32, 0xFFFBu32),
+  (0x10A01u32, 0x10A03u32), (0x10A05u32, 0x10A06u32), (0x10A0Cu32, 0x10A0Fu32),
+  (0x10A38u32, 0x10A3Au32), (0x10A3Fu32, 0x10A3Fu32), (0x1D167u32, 0x1D169u32),
+  (0x1D173u32, 0x1D182u32), (0x1D185u32, 0x1D18Bu32), (0x1D1AAu32, 0x1D1ADu32),
+  (0x1D242u32, 0x1D244u32), (0xE0001u32, 0xE0001u32), (0xE0020u32, 0xE007Fu32),
+  (0xE0100u32, 0xE01EFu32)
+# True if r falls inside one of the intervals of DoubleWidthRanges.
+func isDoubleWidthHigh(r: Rune): bool =
+  let u = uint32(r)
+  return isInRange(DoubleWidthRanges, u)
+# True if r is in DoubleWidthAmbiguousRanges or in DoubleWidthRanges.
+func isDoubleWidthAmbiguousHigh(r: Rune): bool =
+  # look in the ambiguous-width table first, then fall back to the wide table
+  return DoubleWidthAmbiguousRanges.isInRange(uint32(r)) or
+    r.isDoubleWidthHigh()
+# True if r falls inside one of the intervals of the Combining table.
+# This must search Combining, not DoubleWidthAmbiguousRanges: width() relies
+# on it to report a width of 0 for combining characters outside the BMP.
+func isCombining(r: Rune): bool =
+  return Combining.isInRange(uint32(r))
+# Lookup tables for characters on the BMP. This "only" takes up 8k of space
+# per table, as opposed to the 135k that storing all characters would require.
+# The downside is obviously that we need a binary search fallback for non-bmp.
+# We do not store a lookup table of ambiguous ranges, either.
+  # following won't work on 16-bit
+  doAssert sizeof(int) >= sizeof(Rune)
+type PropertyTable = array[0x10000 div (sizeof(int) * 8), int]
+type RangeMap = openarray[(uint32, uint32)]
+# Builds a bitmap with one bit per code point in U+0000..U+FFFF. A bit is set
+# when the code point lies inside one of `ranges` and outside every interval
+# of `skip`.
+# Assumes both `ranges` and `skip` are sorted and non-overlapping, since each
+# of them is walked exactly once with a forward-only cursor.
+func makePropertyTable(ranges: RangeMap, skip: RangeMap = @[]): PropertyTable =
+  var ucs: uint32 = 0
+  var j = 0 # cursor into skip
+  var k = 0 # cursor into ranges
+  while ucs <= 0xFFFF:
+    # `>=`, not `>`: ranges[ranges.len] is already out of bounds.
+    if k >= ranges.len:
+      break
+    if ranges[k][0] > ucs:
+      ucs = ranges[k][0]
+      continue
+    if ranges[k][1] < ucs:
+      inc k
+      continue
+    # Move past skip intervals that end before ucs. The cursor used to stay
+    # at 0 forever, so only skip[0] was ever honoured.
+    while j < skip.len and skip[j][1] < ucs:
+      inc j
+    if j < skip.len and skip[j][0] <= ucs:
+      ucs = skip[j][1] + 1
+      continue
+    let i = ucs div (sizeof(int) * 8)
+    let m = ucs mod (sizeof(int) * 8)
+    result[i] = result[i] or (1 shl m)
+    inc ucs
+const DoubleWidthTable = (func(): PropertyTable =
+  var ptab = makePropertyTable(DoubleWidthRanges, Combining)
+  # Control chars return a width of 2, and are displayed as ^{letter}.
+  for c in Controls:
+    let u = int(c)
+    let i = u div (sizeof(int) * 8)
+    let m = u mod (sizeof(int) * 8)
+    ptab[i] = ptab[i] or (1 shl m)
+  return ptab
+const CombiningTable = makePropertyTable(Combining)
+# One of the few global variables in the code. Honestly, it should not exist.
+var is_cjk_ambiguous = false
+# Sets the module-level is_cjk_ambiguous flag. width() reads it: when the flag
+# is true, runes in DoubleWidthAmbiguousRanges are also reported as 2 columns
+# wide.
+proc set_cjk_ambiguous*(b: bool) =
+  is_cjk_ambiguous = b
+{.push boundChecks:off.}
+# Tests the bit that belongs to rune r in a PropertyTable bitmap; this is what
+# makes `r in table` work.
+# NOTE(review): the table only has bits for U+0000..U+FFFF; width() checks
+# that bound before using `in` - other callers presumably must do the same.
+func contains(props: PropertyTable, r: Rune): bool =
+  const wordBits = sizeof(int) * 8
+  let cp = int(r)
+  let mask = 1 shl (cp mod wordBits)
+  return (props[cp div wordBits] and mask) != 0
+# Column width of a single rune: 0 when it is found in the combining data,
+# 2 when it is found in the double-width data (which also covers the
+# ambiguous-width ranges while is_cjk_ambiguous is set), and 1 otherwise.
+# Runes up to U+FFFF are looked up in the precomputed bitmaps, all others
+# through the binary search helpers.
+# Warning: this shouldn't be called without normalization.
+func width*(r: Rune): int =
+  {.cast(noSideEffect).}:
+    let u = uint32(r)
+    let bmp = u <= 0xFFFF
+    let zeroWidth =
+      if bmp: r in CombiningTable
+      else: r.isCombining()
+    if zeroWidth:
+      return 0
+    let wide =
+      if bmp:
+        r in DoubleWidthTable or
+          (is_cjk_ambiguous and DoubleWidthAmbiguousRanges.isInRange(u))
+      elif is_cjk_ambiguous:
+        r.isDoubleWidthAmbiguousHigh()
+      else:
+        r.isDoubleWidthHigh()
+    return if wide: 2 else: 1
+# Width of r when it is drawn at column w. Unlike width(), this understands
+# tabs: a tab advances to the next multiple of 8 columns.
+func twidth*(r: Rune, w: int): int =
+  if r == Rune('\t'):
+    # distance from w to the next tab stop
+    return (w div 8 + 1) * 8 - w
+  return r.width()
+# Total column width of s when drawn from column 0, with tabs expanded.
+func width*(s: string): int =
+  var col = 0
+  for r in s.runes():
+    col += r.twidth(col)
+  return col
+# Column width of the runes of s between byte offset `start` and byte offset
+# min(len, s.len), with tabs expanded relative to the first rune measured.
+# NOTE(review): despite its name, `len` is used as an end offset, not as a
+# length relative to `start` - confirm against callers.
+func width*(s: string, start, len: int): int =
+  let stop = min(len, s.len)
+  var pos = start
+  while pos < stop:
+    var r: Rune
+    fastRuneAt(s, pos, r)
+    result += r.twidth(result)
+# Column width of s without tab expansion: the sum of width(r) over its runes.
+func notwidth*(s: string): int =
+  var total = 0
+  for r in s.runes:
+    total += r.width()
+  return total
+# Width of s when it is drawn starting at column w, with tabs expanded.
+func twidth*(s: string, w: int): int =
+  var col = w
+  for r in s.runes():
+    # A tab stop depends on the column the tab is actually drawn at, so the
+    # running column has to be passed here, not the starting column w.
+    col += r.twidth(col)
+  return col - w
+# Fits str into `size` columns.
+# A string narrower than `size` is padded on the right with spaces. Any other
+# string is cut down to a prefix of at most size - 1 columns, and schar is
+# appended as a truncation marker.
+func padToWidth*(str: string, size: int, schar = '$'): string =
+  let strWidth = str.width()
+  if strWidth < size:
+    return str & ' '.repeat(size - strWidth)
+  let limit = size - 1
+  result = newStringOfCap(str.len)
+  var w = 0
+  var i = 0
+  while i < str.len:
+    var r: Rune
+    fastRuneAt(str, i, r)
+    let rw = r.width()
+    if w + rw > limit:
+      # Stop at the first rune that does not fit, so the output is always a
+      # prefix of str. Previously the loop kept going and still appended any
+      # later, narrower runes.
+      break
+    result &= r
+    w += rw
+  result &= schar
+# A rune breaks a word unless it is an ASCII digit, has a width of 0, or is
+# alphabetic according to isAlpha.
+func breaksWord*(r: Rune): bool =
+  if r.isDigitAscii():
+    return false
+  if r.width() == 0:
+    return false
+  return not r.isAlpha()
+type BoundaryFunction* = proc(x: Rune): JSResult[bool]
+# Like breaksWord(r), but asks the optional `check` callback first. Its answer
+# is used whenever it yields a value; otherwise the built-in rule applies.
+proc breaksWord*(r: Rune, check: Opt[BoundaryFunction]): bool =
+  if check.isSome:
+    let res = check.get()(r)
+    if res.isSome: #TODO report error?
+      return res.get()
+  return r.breaksWord()
diff --git a/src/utils/twtstr.nim b/src/utils/twtstr.nim
index c5078568..b235651f 100644
--- a/src/utils/twtstr.nim
+++ b/src/utils/twtstr.nim
@@ -8,28 +8,15 @@ import tables
 import unicode
 import bindings/libunicode
-import data/charwidth
-import data/idna
-import js/error
-import utils/map
 import types/opt
-import lib/punycode
+import utils/charcategory
+import utils/map
+export charcategory
 when defined(posix):
   import posix
-const C0Controls* = {chr(0x00)..chr(0x1F)}
-const Controls* = (C0Controls + {chr(0x7F)})
-const Ascii* = {chr(0x00)..chr(0x7F)}
-const AsciiUpperAlpha* = {'A'..'Z'}
-const AsciiLowerAlpha* = {'a'..'z'}
-const AsciiAlpha* = (AsciiUpperAlpha + AsciiLowerAlpha)
-const NonAscii* = (AllChars - Ascii)
-const AsciiDigit* = {'0'..'9'}
-const AsciiAlphaNumeric* = AsciiAlpha + AsciiDigit
-const AsciiHexDigit* = (AsciiDigit + {'a'..'f', 'A'..'F'})
-const AsciiWhitespace* = {' ', '\n', '\r', '\t', '\f'}
 func onlyWhitespace*(s: string): bool =
   for c in s:
     if c notin AsciiWhitespace:
@@ -191,9 +178,6 @@ func endsWithIgnoreCase*(s1, s2: string): bool =
       return false
   return true
-func isDigitAscii*(r: Rune): bool =
-  return int(r) < 256 and isDigit(char(r))
 func stripAndCollapse*(s: string): string =
   var i = 0
   while i < s.len and s[i] in AsciiWhitespace:
@@ -588,15 +572,16 @@ func join*(ss: openarray[string], sep: char): string =
     result &= sep
     result &= ss[i]
-proc passRealloc(opaque: pointer, p: pointer, size: csize_t): pointer {.cdecl.} =
+proc passRealloc*(opaque: pointer, p: pointer, size: csize_t): pointer
+    {.cdecl.} =
   return realloc(p, size)
 proc mnormalize*(rs: var seq[Rune], form = UNICODE_NFC) = {.cast(noSideEffect).}:
   if rs.len == 0: return
   var outbuf: ptr uint32
-  let out_len = unicode_normalize(addr outbuf,
-                                  cast[ptr uint32](unsafeAddr rs[0]),
-                                  cint(rs.len), form, nil, passRealloc)
+  let p = cast[ptr uint32](unsafeAddr rs[0])
+  let out_len = unicode_normalize(addr outbuf, p, cint(rs.len), form, nil,
+    passRealloc)
   if out_len < 0:
     raise newException(Defect, "Unicode normalization failed")
   if out_len == 0:
@@ -627,105 +612,6 @@ func normalize*(rs: seq[Rune], form = UNICODE_NFC): seq[Rune] = {.cast(noSideEff
   copyMem(addr result[0], outbuf, out_len * sizeof(uint32))
-type u32pair {.packed.} = object
-  a: uint32
-  b: uint32
-func cmpRange(x: u32pair, y: uint32): int =
-  if x.a < y:
-    return -1
-  elif x.b > y:
-    return 1
-  return 0
-func processIdna(str: string, checkhyphens, checkbidi, checkjoiners, transitionalprocessing: bool): Option[string] =
-  var mapped: seq[Rune]
-  for r in str.runes():
-    let status = getIdnaTableStatus(r)
-    case status
-    of IDNA_DISALLOWED: return none(string) #error
-    of IDNA_IGNORED: discard
-    of IDNA_MAPPED: mapped &= getIdnaMapped(r).toRunes()
-      if transitionalprocessing:
-        mapped &= getDeviationMapped(r).toRunes()
-      else:
-        mapped &= r
-    of IDNA_VALID: mapped &= r
-  if mapped.len == 0: return
-  mapped.mnormalize()
-  var cr: CharRange
-  {.cast(noSideEffect).}:
-    cr_init(addr cr, nil, passRealloc)
-    let r = unicode_general_category(addr cr, "Mark")
-    assert r == 0
-  var labels: seq[string]
-  for label in ($mapped).split('.'):
-    if label.startsWith("xn--"):
-      try:
-        let s = punycode.decode(label.substr("xn--".len))
-        let x0 = s.toRunes()
-        let x1 = normalize(x0)
-        if x0 != x1:
-          return none(string) #error
-        if checkhyphens:
-          if s.len >= 4 and s[2] == '-' and s[3] == '-':
-            return none(string) #error
-          if s.len > 0 and s[0] == '-' and s[^1] == '-':
-            return none(string) #error
-        if x0.len > 0:
-          let cps = cast[ptr UncheckedArray[u32pair]](cr.points)
-          let c = cast[uint32](x0[0])
-          if binarySearch(toOpenArray(cps, 0, cr.len div 2 - 1), c, cmpRange) != -1:
-            return none(string) #error
-        for r in x0:
-          if r == Rune('.'):
-            return none(string) #error
-          let status = getIdnaTableStatus(r)
-          case status
-            return none(string) #error
-          of IDNA_DEVIATION:
-            if transitionalprocessing:
-              return none(string) #error
-          of IDNA_VALID: discard
-          #TODO check joiners
-          #TODO check bidi
-        labels.add(s)
-      except PunyError:
-        return none(string) #error
-    else:
-      labels.add(label)
-  cr_free(addr cr)
-  return some(labels.join('.'))
-func unicodeToAscii*(s: string, checkhyphens, checkbidi, checkjoiners, transitionalprocessing, verifydnslength: bool): Option[string] =
-  let processed = s.processIdna(checkhyphens, checkbidi, checkjoiners,
-                                transitionalprocessing)
-  if processed.isnone:
-    return none(string) #error
-  var labels: seq[string]
-  var all = 0
-  for label in processed.get.split('.'):
-    if not label.isAscii():
-      try:
-        let converted = "xn--" & punycode.encode(label)
-        labels.add(converted)
-      except PunyError:
-        return none(string) #error
-    else:
-      labels.add(label)
-    if verifydnslength:
-      let rl = labels[^1].runeLen()
-      if rl notin 1..63:
-        return none(string)
-      all += rl
-  if verifydnslength:
-    if all notin 1..253:
-      return none(string) #error
-  return some(labels.join('.'))
 # https://www.w3.org/TR/xml/#NT-Name
 const NameStartCharRanges = [
   (0xC0, 0xD6),
@@ -812,252 +698,6 @@ proc expandPath*(path: string): string =
         return $p.pw_dir / path.substr(usr.len)
     return path
-# Combining chars from https://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
-# The following two functions define the column width of an ISO 10646
-# character as follows:
-#   - The null character (U+0000) has a column width of 0.
-#   - Other C0/C1 control characters and DEL will lead to a return value of 0
-#   - Non-spacing and enclosing combining characters (general category code Mn
-#     or Me in the Unicode database) have a column width of 0.
-#   - SOFT HYPHEN (U+00AD) has a column width of 1.
-#   - Other format characters (general category code Cf in the Unicode
-#     database) and ZERO WIDTH SPACE (U+200B) have a column width of 0.
-#   - Hangul Jamo medial vowels and final consonants (U+1160-U+11FF) have a
-#     column width of 0.
-#   - Spacing characters in the East Asian Wide (W) or East Asian Full-width
-#     (F) category as defined in Unicode Technical Report #11 have a column
-#     width of 2.
-#   - All remaining characters (including all printable ISO 8859-1 and WGL4
-#     characters, Unicode control characters, etc.) have a column width of 1.
-# sorted list of non-overlapping intervals of non-spacing characters generated
-# by "uniset +cat=Me +cat=Mn +cat=Cf -00AD +1160-11FF +200B c"
-const Combining = [
-  (0x0300u32, 0x036Fu32), (0x0483u32, 0x0486u32), (0x0488u32, 0x0489u32),
-  (0x0591u32, 0x05BDu32), (0x05BFu32, 0x05BFu32), (0x05C1u32, 0x05C2u32),
-  (0x05C4u32, 0x05C5u32), (0x05C7u32, 0x05C7u32), (0x0600u32, 0x0603u32),
-  (0x0610u32, 0x0615u32), (0x064Bu32, 0x065Eu32), (0x0670u32, 0x0670u32),
-  (0x06D6u32, 0x06E4u32), (0x06E7u32, 0x06E8u32), (0x06EAu32, 0x06EDu32),
-  (0x070Fu32, 0x070Fu32), (0x0711u32, 0x0711u32), (0x0730u32, 0x074Au32),
-  (0x07A6u32, 0x07B0u32), (0x07EBu32, 0x07F3u32), (0x0901u32, 0x0902u32),
-  (0x093Cu32, 0x093Cu32), (0x0941u32, 0x0948u32), (0x094Du32, 0x094Du32),
-  (0x0951u32, 0x0954u32), (0x0962u32, 0x0963u32), (0x0981u32, 0x0981u32),
-  (0x09BCu32, 0x09BCu32), (0x09C1u32, 0x09C4u32), (0x09CDu32, 0x09CDu32),
-  (0x09E2u32, 0x09E3u32), (0x0A01u32, 0x0A02u32), (0x0A3Cu32, 0x0A3Cu32),
-  (0x0A41u32, 0x0A42u32), (0x0A47u32, 0x0A48u32), (0x0A4Bu32, 0x0A4Du32),
-  (0x0A70u32, 0x0A71u32), (0x0A81u32, 0x0A82u32), (0x0ABCu32, 0x0ABCu32),
-  (0x0AC1u32, 0x0AC5u32), (0x0AC7u32, 0x0AC8u32), (0x0ACDu32, 0x0ACDu32),
-  (0x0AE2u32, 0x0AE3u32), (0x0B01u32, 0x0B01u32), (0x0B3Cu32, 0x0B3Cu32),
-  (0x0B3Fu32, 0x0B3Fu32), (0x0B41u32, 0x0B43u32), (0x0B4Du32, 0x0B4Du32),
-  (0x0B56u32, 0x0B56u32), (0x0B82u32, 0x0B82u32), (0x0BC0u32, 0x0BC0u32),
-  (0x0BCDu32, 0x0BCDu32), (0x0C3Eu32, 0x0C40u32), (0x0C46u32, 0x0C48u32),
-  (0x0C4Au32, 0x0C4Du32), (0x0C55u32, 0x0C56u32), (0x0CBCu32, 0x0CBCu32),
-  (0x0CBFu32, 0x0CBFu32), (0x0CC6u32, 0x0CC6u32), (0x0CCCu32, 0x0CCDu32),
-  (0x0CE2u32, 0x0CE3u32), (0x0D41u32, 0x0D43u32), (0x0D4Du32, 0x0D4Du32),
-  (0x0DCAu32, 0x0DCAu32), (0x0DD2u32, 0x0DD4u32), (0x0DD6u32, 0x0DD6u32),
-  (0x0E31u32, 0x0E31u32), (0x0E34u32, 0x0E3Au32), (0x0E47u32, 0x0E4Eu32),
-  (0x0EB1u32, 0x0EB1u32), (0x0EB4u32, 0x0EB9u32), (0x0EBBu32, 0x0EBCu32),
-  (0x0EC8u32, 0x0ECDu32), (0x0F18u32, 0x0F19u32), (0x0F35u32, 0x0F35u32),
-  (0x0F37u32, 0x0F37u32), (0x0F39u32, 0x0F39u32), (0x0F71u32, 0x0F7Eu32),
-  (0x0F80u32, 0x0F84u32), (0x0F86u32, 0x0F87u32), (0x0F90u32, 0x0F97u32),
-  (0x0F99u32, 0x0FBCu32), (0x0FC6u32, 0x0FC6u32), (0x102Du32, 0x1030u32),
-  (0x1032u32, 0x1032u32), (0x1036u32, 0x1037u32), (0x1039u32, 0x1039u32),
-  (0x1058u32, 0x1059u32), (0x1160u32, 0x11FFu32), (0x135Fu32, 0x135Fu32),
-  (0x1712u32, 0x1714u32), (0x1732u32, 0x1734u32), (0x1752u32, 0x1753u32),
-  (0x1772u32, 0x1773u32), (0x17B4u32, 0x17B5u32), (0x17B7u32, 0x17BDu32),
-  (0x17C6u32, 0x17C6u32), (0x17C9u32, 0x17D3u32), (0x17DDu32, 0x17DDu32),
-  (0x180Bu32, 0x180Du32), (0x18A9u32, 0x18A9u32), (0x1920u32, 0x1922u32),
-  (0x1927u32, 0x1928u32), (0x1932u32, 0x1932u32), (0x1939u32, 0x193Bu32),
-  (0x1A17u32, 0x1A18u32), (0x1B00u32, 0x1B03u32), (0x1B34u32, 0x1B34u32),
-  (0x1B36u32, 0x1B3Au32), (0x1B3Cu32, 0x1B3Cu32), (0x1B42u32, 0x1B42u32),
-  (0x1B6Bu32, 0x1B73u32), (0x1DC0u32, 0x1DCAu32), (0x1DFEu32, 0x1DFFu32),
-  (0x200Bu32, 0x200Fu32), (0x202Au32, 0x202Eu32), (0x2060u32, 0x2063u32),
-  (0x206Au32, 0x206Fu32), (0x20D0u32, 0x20EFu32), (0x302Au32, 0x302Fu32),
-  (0x3099u32, 0x309Au32), (0xA806u32, 0xA806u32), (0xA80Bu32, 0xA80Bu32),
-  (0xA825u32, 0xA826u32), (0xFB1Eu32, 0xFB1Eu32), (0xFE00u32, 0xFE0Fu32),
-  (0xFE20u32, 0xFE23u32), (0xFEFFu32, 0xFEFFu32), (0xFFF9u32, 0xFFFBu32),
-  (0x10A01u32, 0x10A03u32), (0x10A05u32, 0x10A06u32), (0x10A0Cu32, 0x10A0Fu32),
-  (0x10A38u32, 0x10A3Au32), (0x10A3Fu32, 0x10A3Fu32), (0x1D167u32, 0x1D169u32),
-  (0x1D173u32, 0x1D182u32), (0x1D185u32, 0x1D18Bu32), (0x1D1AAu32, 0x1D1ADu32),
-  (0x1D242u32, 0x1D244u32), (0xE0001u32, 0xE0001u32), (0xE0020u32, 0xE007Fu32),
-  (0xE0100u32, 0xE01EFu32)
-func cmp(range: (uint32, uint32), r: Rune): int =
-  if range[1] < cast[uint32](r):
-    return -1
-  elif range[0] > cast[uint32](r):
-    return 1
-  return 0
-func isDoubleWidthHigh(r: Rune): bool =
-  return binarySearch(DoubleWidthRanges, r, twtstr.cmp) != -1
-func isDoubleWidthAmbiguousHigh(r: Rune): bool =
-  # binary search in table of non-spacing characters
-  if binarySearch(DoubleWidthAmbiguousRanges, r, twtstr.cmp) != -1:
-    return true
-  return r.isDoubleWidthHigh()
-func isCombining(r: Rune): bool =
-  return binarySearch(Combining, r, twtstr.cmp) != -1
-# Lookup tables for characters on the BMP. This "only" takes up 8k of space
-# per table, as opposed to the 135k that storing all characters would require.
-# The downside is obviously that we need a binary search fallback for non-bmp.
-# We do not store a lookup table of ambiguous ranges, either.
-  # following won't work on 16-bit
-  doAssert sizeof(int) >= sizeof(Rune)
-type PropertyTable = array[0x10000 div (sizeof(int) * 8), int]
-type RangeMap = openarray[(uint32, uint32)]
-func makePropertyTable(ranges: RangeMap, skip: RangeMap = @[]): PropertyTable =
-  var ucs: uint32 = 0
-  var j = 0
-  var k = 0
-  while ucs <= 0xFFFF:
-    if k > ranges.len:
-      break
-    if ranges[k][0] > ucs:
-      ucs = ranges[k][0]
-      continue
-    if ranges[k][1] < ucs:
-      inc k
-      continue
-    if j != skip.len and ucs == skip[j][0]:
-      ucs = skip[j][1] + 1
-      continue
-    let i = ucs div (sizeof(int) * 8)
-    let m = ucs mod (sizeof(int) * 8)
-    result[i] = result[i] or (1 shl m)
-    inc ucs
-const DoubleWidthTable = (func(): PropertyTable =
-  var ptab = makePropertyTable(DoubleWidthRanges, Combining)
-  # Control chars return a width of 2, and are displayed as ^{letter}.
-  for c in Controls:
-    let u = int(c)
-    let i = u div (sizeof(int) * 8)
-    let m = u mod (sizeof(int) * 8)
-    ptab[i] = ptab[i] or (1 shl m)
-  return ptab
-const CombiningTable = makePropertyTable(Combining)
-# One of the few global variables in the code. Honestly, it should not exist.
-var is_cjk_ambiguous = false
-proc set_cjk_ambiguous*(b: bool) =
-  is_cjk_ambiguous = b
-{.push boundChecks:off.}
-func contains(props: PropertyTable, r: Rune): bool =
-  let u = int(r)
-  let i = u div (sizeof(int) * 8)
-  let m = u mod (sizeof(int) * 8)
-  return (props[i] and (1 shl m)) != 0
-# Warning: this shouldn't be called without normalization.
-# We could make this function more efficient in edge cases, but it's already
-# too complex for my taste.
-func width*(r: Rune): int =
-  {.cast(noSideEffect).}:
-    if cast[uint32](r) <= 0xFFFF:
-      if r in CombiningTable:
-        return 0
-      if not is_cjk_ambiguous:
-        if r in DoubleWidthTable:
-          return 2
-      else:
-        if r in DoubleWidthTable or
-            binarySearch(DoubleWidthAmbiguousRanges, r, twtstr.cmp) != -1:
-          return 2
-    else:
-      if r.isCombining():
-        return 0
-      if not is_cjk_ambiguous:
-        if r.isDoubleWidthHigh():
-          return 2
-      else:
-        if r.isDoubleWidthAmbiguousHigh():
-          return 2
-    return 1
-# Width, but also works with tabs.
-# Needs the column width of the text so far.
-func twidth*(r: Rune, w: int): int =
-  if r != Rune('\t'):
-    return r.width()
-  return ((w div 8) + 1) * 8 - w
-func width*(s: string): int =
-  for r in s.runes():
-    result += r.twidth(result)
-func width*(s: string, start, len: int): int =
-  var i = start
-  var m = len
-  if m > s.len: m = s.len
-  while i < m:
-    var r: Rune
-    fastRuneAt(s, i, r)
-    result += r.twidth(result)
-func notwidth*(s: string): int =
-  for r in s.runes:
-    result += r.width()
-func twidth*(s: string, w: int): int =
-  var i = w
-  for r in s.runes():
-    i += r.twidth(w)
-  return i - w
-func breaksWord*(r: Rune): bool =
-  return not (r.isDigitAscii() or r.width() == 0 or r.isAlpha())
-type BoundaryFunction* = proc(x: Rune): JSResult[bool]
-proc breaksWord*(r: Rune, check: Opt[BoundaryFunction]): bool =
-  if check.isSome:
-    let f = check.get()
-    let v = f(r)
-    if v.isSome: #TODO report error?
-      return v.get()
-  return r.breaksWord()
-func padToWidth*(str: string, size: int, schar = '$'): string =
-  if str.width() < size:
-    return str & ' '.repeat(size - str.width())
-  else:
-    let size = size - 1
-    result = newStringOfCap(str.len)
-    var w = 0
-    var i = 0
-    while i < str.len:
-      var r: Rune
-      fastRuneAt(str, i, r)
-      if w + r.width <= size:
-        result &= r
-        w += r.width
-    result &= schar
 func deleteChars*(s: string, todel: set[char]): string =
   var i = 0
   block earlyret: