about summary refs log tree commit diff stats
path: root/src/utils
diff options
context:
space:
mode:
author bptato <nincsnevem662@gmail.com> 2024-09-08 15:18:45 +0200
committer bptato <nincsnevem662@gmail.com> 2024-09-08 16:06:02 +0200
commit 4124c041ed2e3b497ede72fdae229aa2c6aca249 (patch)
tree e8488449de6f0be54b9c79547352829b998833d3 /src/utils
parent 5a64e3193924c7e503dddb10a99989148b26e922 (diff)
download chawan-4124c041ed2e3b497ede72fdae229aa2c6aca249.tar.gz
utils: add twtuni
std/unicode has the following issues:

* Rune is an int32, which implies overflow checking. Also, it is
  distinct, so you have to convert it manually to do arithmetic.
* QJS libunicode and Chagashi work with uint32, so interfacing with
  them required pointless type conversions.
* fastRuneAt is a template, meaning it's pasted into every call
  site. Also, it decodes to UCS-4, so it generates two branches that
  aren't even used. Overall, this led to quite a bit of code bloat.
* fastRuneAt and lastRune have frustratingly different
  interfaces. Writing code to handle both cases is error-prone.
* On older Nim versions which we still support, std/unicode takes
  strings, not openArray[char]'s.

Replace it with "twtuni", which includes some improved versions of
the few procedures from std/unicode that we actually use.
Diffstat (limited to 'src/utils')
-rw-r--r-- src/utils/luwrap.nim 57
-rw-r--r-- src/utils/strwidth.nim 82
-rw-r--r-- src/utils/twtstr.nim 20
-rw-r--r-- src/utils/twtuni.nim 95
-rw-r--r-- src/utils/widthconv.nim 102
-rw-r--r-- src/utils/wordbreak.nim 36
6 files changed, 226 insertions, 166 deletions
diff --git a/src/utils/luwrap.nim b/src/utils/luwrap.nim
index 6081cdf8..76a5b2e1 100644
--- a/src/utils/luwrap.nim
+++ b/src/utils/luwrap.nim
@@ -1,14 +1,14 @@
 import std/algorithm
 import std/strutils
-import std/unicode
 
 import monoucha/libunicode
 import utils/charcategory
+import utils/twtuni
 
 proc passRealloc(opaque, p: pointer; size: csize_t): pointer {.cdecl.} =
   return realloc(p, size)
 
-proc normalize*(rs: seq[Rune]; form = UNICODE_NFC): seq[Rune] =
+proc normalize*(rs: seq[uint32]; form = UNICODE_NFC): seq[uint32] =
   {.cast(noSideEffect).}:
     if rs.len == 0:
       return @[]
@@ -20,7 +20,7 @@ proc normalize*(rs: seq[Rune]; form = UNICODE_NFC): seq[Rune] =
       raise newException(Defect, "Unicode normalization failed")
     if out_len == 0:
       return
-    var rs = cast[seq[Rune]](newSeqUninitialized[uint32](out_len))
+    var rs = newSeqUninitialized[uint32](out_len)
     copyMem(addr rs[0], outbuf, out_len * sizeof(uint32))
     dealloc(outbuf)
     return rs
@@ -28,17 +28,15 @@ proc normalize*(rs: seq[Rune]; form = UNICODE_NFC): seq[Rune] =
 proc mnormalize*(s: var string) =
   if NonAscii notin s:
     return # no need to normalize ascii
-  s = $s.toRunes().normalize()
+  s = s.toPoints().normalize().toUTF8()
 
 # n == 0: upper, 1: lower, 2: case fold
 proc toUpperLU(s: string; n: cint): string =
   result = newStringOfCap(s.len)
-  for r in s.runes:
+  for u in s.points:
     var outa: array[LRE_CC_RES_LEN_MAX, uint32]
-    let n = lre_case_conv(cast[ptr UncheckedArray[uint32]](addr outa[0]),
-      uint32(r), n)
-    for i in 0 ..< n:
-      result &= $Rune(outa[i])
+    let n = lre_case_conv(cast[ptr UncheckedArray[uint32]](addr outa[0]), u, n)
+    result.addUTF8(outa.toOpenArray(0, n - 1))
 
 proc toUpperLU*(s: string): string =
   return s.toUpperLU(0)
@@ -49,19 +47,18 @@ proc toLowerLU*(s: string): string =
 proc capitalizeLU*(s: string): string =
   result = newStringOfCap(s.len)
   var wordStart = true
-  for r in s.runes:
-    if lre_is_space(uint32(r)) == 1:
+  for u in s.points:
+    if lre_is_space(u) == 1:
       wordStart = true
-      result &= $r
+      result.addUTF8(u)
     elif wordStart:
       var outa: array[LRE_CC_RES_LEN_MAX, uint32]
       let n = lre_case_conv(cast[ptr UncheckedArray[uint32]](addr outa[0]),
-        uint32(r), 0)
-      for i in 0 ..< n:
-        result &= $Rune(outa[i])
+        u, 0)
+      result.addUTF8(outa.toOpenArray(0, n - 1))
       wordStart = false
     else:
-      result &= $r
+      result.addUTF8(u)
 
 type u32pair* {.packed.} = object
   a: uint32
@@ -74,10 +71,10 @@ func cmpRange*(x: u32pair; y: uint32): int =
     return -1
   return 0
 
-func contains(cr: CharRange; r: Rune): bool =
+func contains(cr: CharRange; u: uint32): bool =
   let cps = cast[ptr UncheckedArray[u32pair]](cr.points)
   let L = cr.len div 2 - 1
-  return cps.toOpenArray(0, L).binarySearch(uint32(r), cmpRange) != -1
+  return cps.toOpenArray(0, L).binarySearch(u, cmpRange) != -1
 
 type
   LURangeType = enum
@@ -114,26 +111,26 @@ proc initScript(ctx: LUContext; lur: LURangeType) =
     doAssert unicode_script(p, cstring($lur), 0) == 0
     ctx.inited.incl(lur)
 
-proc isAlphaLU*(ctx: LUContext; r: Rune): bool =
+proc isAlphaLU*(ctx: LUContext; u: uint32): bool =
   ctx.initGeneralCategory(lurLetter)
-  return r in ctx.crs[lurLetter]
+  return u in ctx.crs[lurLetter]
 
-proc isWhiteSpaceLU*(ctx: LUContext; r: Rune): bool =
+proc isWhiteSpaceLU*(ctx: LUContext; u: uint32): bool =
   ctx.initGeneralCategory(lurSeparator)
-  return r in ctx.crs[lurSeparator]
+  return u in ctx.crs[lurSeparator]
 
-proc isHan*(ctx: LUContext; r: Rune): bool =
+proc isHan*(ctx: LUContext; u: uint32): bool =
   ctx.initScript(lurHan)
-  return r in ctx.crs[lurHan]
+  return u in ctx.crs[lurHan]
 
-proc isHiragana*(ctx: LUContext; r: Rune): bool =
+proc isHiragana*(ctx: LUContext; u: uint32): bool =
   ctx.initScript(lurHiragana)
-  return r in ctx.crs[lurHiragana]
+  return u in ctx.crs[lurHiragana]
 
-proc isKatakana*(ctx: LUContext; r: Rune): bool =
+proc isKatakana*(ctx: LUContext; u: uint32): bool =
   ctx.initScript(lurKatakana)
-  return r in ctx.crs[lurKatakana]
+  return u in ctx.crs[lurKatakana]
 
-proc isHangul*(ctx: LUContext; r: Rune): bool =
+proc isHangul*(ctx: LUContext; u: uint32): bool =
   ctx.initScript(lurHangul)
-  return r in ctx.crs[lurHangul]
+  return u in ctx.crs[lurHangul]
diff --git a/src/utils/strwidth.nim b/src/utils/strwidth.nim
index 8c367991..4ce9aa12 100644
--- a/src/utils/strwidth.nim
+++ b/src/utils/strwidth.nim
@@ -1,7 +1,6 @@
-import std/unicode
-
-import utils/proptable
 import utils/map
+import utils/proptable
+import utils/twtuni
 
 include res/map/charwidth_gen
 
@@ -9,8 +8,7 @@ include res/map/charwidth_gen
 var isCJKAmbiguous* = false
 
 # Warning: this shouldn't be called without normalization.
-func width*(r: Rune): int =
-  let u = uint32(r)
+func width*(u: uint32): int =
   if u <= 0xFFFF: # fast path for BMP
     if u in CombiningTable:
       return 0
@@ -31,80 +29,56 @@ func width*(r: Rune): int =
 
 # Width, but also works with tabs.
 # Needs the column width of the text so far.
-func twidth*(r: Rune; w: int): int =
-  if r != Rune('\t'):
-    return r.width()
+func twidth*(u: uint32; w: int): int =
+  if u != uint32('\t'):
+    return u.width()
   return ((w div 8) + 1) * 8 - w
 
-func width*(s: string): int =
-  result = 0
-  for r in s.runes:
-    result += r.twidth(result)
+func width*(s: openArray[char]): int =
+  var w = 0
+  for u in s.points:
+    w += u.twidth(w)
+  return w
 
 func width*(s: string; start, len: int): int =
-  result = 0
+  var w = 0
   var i = start
   var m = len
   if m > s.len:
     m = s.len
   while i < m:
-    var r: Rune
-    fastRuneAt(s, i, r)
-    result += r.twidth(result)
-
-when NimMajor < 2:
-  template ones(n: untyped): untyped = ((1 shl n)-1)
-  template fastRuneAt(s: openArray[char]; i: int; result: untyped) =
-    result = Rune(0xFFFD)
-    if uint32(s[i]) <= 127:
-      result = Rune(uint32(s[i]))
-    elif uint32(s[i]) shr 5 == 0b110:
-      if i <= s.len - 2:
-        result = Rune((uint32(s[i]) and (ones(5))) shl 6 or
-          (uint32(s[i+1]) and ones(6)))
-        i += 1
-    elif uint32(s[i]) shr 4 == 0b1110:
-      if i <= s.len - 3:
-        result = Rune((uint32(s[i]) and ones(4)) shl 12 or
-          (uint32(s[i+1]) and ones(6)) shl 6 or (uint32(s[i+2]) and ones(6)))
-        i += 2
-    elif uint32(s[i]) shr 3 == 0b11110:
-      if i <= s.len - 4:
-        result = Rune((uint32(s[i]) and ones(3)) shl 18 or
-          (uint32(s[i+1]) and ones(6)) shl 12 or
-          (uint32(s[i+2]) and ones(6)) shl 6 or
-          (uint32(s[i+3]) and ones(6)))
-        i += 3
-    inc i
+    let u = s.nextUTF8(i)
+    w += u.twidth(w)
+  return w
 
 func notwidth*(s: openArray[char]): int =
-  result = 0
-  var i = 0
-  while i < s.len:
-    var r: Rune
-    fastRuneAt(s, i, r)
-    result += r.width()
+  var w = 0
+  for u in s.points:
+    w += u.width()
+  return w
 
 func twidth*(s: string; w: int): int =
   var i = w
-  for r in s.runes:
-    i += r.twidth(w)
+  for u in s.points:
+    i += u.twidth(w)
   return i - w
 
 func padToWidth*(s: string; size: int; schar = '$'): string =
   result = newStringOfCap(s.len)
   var w = 0
-  var r: Rune
   var i = 0
+  var pi = 0
   while i < s.len:
-    fastRuneAt(s, i, r)
-    w += r.width()
+    pi = i
+    w += s.nextUTF8(i).width()
     if w > size - 1:
       break
-    result &= r
+    for j in pi ..< i:
+      result &= s[j]
   if w > size - 1:
     if w == size and i == s.len:
-      result &= r
+      for j in pi ..< i:
+        result &= s[j]
     else:
       result &= schar
   while w < size:
diff --git a/src/utils/twtstr.nim b/src/utils/twtstr.nim
index 0d65be50..f08b1131 100644
--- a/src/utils/twtstr.nim
+++ b/src/utils/twtstr.nim
@@ -3,7 +3,6 @@ import std/math
 import std/options
 import std/os
 import std/strutils
-import std/unicode
 
 when defined(posix):
   import std/posix
@@ -11,14 +10,15 @@ when defined(posix):
 import types/opt
 import utils/charcategory
 import utils/map
+import utils/twtuni
 
 export charcategory
 
 func onlyWhitespace*(s: string): bool =
   return AllChars - AsciiWhitespace notin s
 
-func isControlChar*(r: Rune): bool =
-  return int(r) <= 0x1F or int(r) == 0x7F
+func isControlChar*(u: uint32): bool =
+  return u <= 0x1F or u == 0x7F
 
 func getControlChar*(c: char): char =
   if c == '?':
@@ -444,14 +444,13 @@ func matchNameProduction*(s: string): bool =
     return false
   # NameStartChar
   var i = 0
-  var r: Rune
   if s[i] in Ascii:
     if s[i] notin NameStartCharAscii:
       return false
     inc i
   else:
-    fastRuneAt(s, i, r)
-    if not NameStartCharRanges.isInRange(uint32(r)):
+    let u = s.nextUTF8(i)
+    if not NameStartCharRanges.isInRange(u):
       return false
   # NameChar
   while i < s.len:
@@ -460,9 +459,8 @@ func matchNameProduction*(s: string): bool =
         return false
       inc i
     else:
-      fastRuneAt(s, i, r)
-      if not NameStartCharRanges.isInRange(uint32(r)) and
-          not NameCharRanges.isInMap(uint32(r)):
+      let u = s.nextUTF8(i)
+      if not NameStartCharRanges.isInRange(u) and not NameCharRanges.isInMap(u):
         return false
   return true
 
@@ -483,8 +481,8 @@ func matchQNameProduction*(s: string): bool =
 
 func utf16Len*(s: string): int =
   result = 0
-  for r in s.runes:
-    if uint32(r) < 0x10000: # ucs-2
+  for u in s.points:
+    if u < 0x10000: # ucs-2
       result += 1
     else: # surrogate
       result += 2
diff --git a/src/utils/twtuni.nim b/src/utils/twtuni.nim
new file mode 100644
index 00000000..c617ac4e
--- /dev/null
+++ b/src/utils/twtuni.nim
@@ -0,0 +1,95 @@
+func nextUTF8*(s: openArray[char]; i: var int): uint32 =
+  let j = i
+  var u = uint32(s[j])
+  if u <= 0x7F:
+    inc i
+  elif u shr 5 == 0b110:
+    let e = j + 2
+    if likely(e <= s.len):
+      u = (u and 0x1F) shl 6 or (uint32(s[j + 1]) and 0x3F)
+    i = e
+  elif u shr 4 == 0b1110:
+    let e = j + 3
+    if likely(e <= s.len):
+      u = (u and 0xF) shl 12 or
+        (uint32(s[j + 1]) and 0x3F) shl 6 or
+        (uint32(s[j + 2]) and 0x3F)
+    i = e
+  elif u shr 3 == 0b11110:
+    let e = j + 4
+    if likely(e <= s.len):
+      u = (u and 7) shl 18 or
+        (uint32(s[j + 1]) and 0x3F) shl 12 or
+        (uint32(s[j + 2]) and 0x3F) shl 6 or
+        (uint32(s[j + 3]) and 0x3F)
+    i = e
+  else:
+    u = 0xFFFD
+    inc i
+  return u
+
+func prevUTF8*(s: openArray[char]; i: var int): uint32 =
+  var j = i - 1
+  while uint32(s[j]) shr 6 == 2:
+    dec j
+  i = j
+  return s.nextUTF8(j)
+
+func pointLenAt*(s: openArray[char]; i: int): int =
+  let u = uint8(s[i])
+  if u <= 0x7F:
+    return 1
+  elif u shr 5 == 0b110:
+    return 2
+  elif u shr 4 == 0b1110:
+    return 3
+  elif u shr 3 == 0b11110:
+    return 4
+  return 1
+
+iterator points*(s: openArray[char]): uint32 {.inline.} =
+  var i = 0
+  while i < s.len:
+    let u = s.nextUTF8(i)
+    yield u
+
+func toPoints*(s: openArray[char]): seq[uint32] =
+  result = @[]
+  for u in s.points:
+    result.add(u)
+
+proc addUTF8*(res: var string; u: uint32) =
+  if u < 0x80:
+    res &= char(u)
+  elif u < 0x800:
+    res &= char(u shr 6 or 0xC0)
+    res &= char(u and 0x3F or 0x80)
+  elif u < 0x10000:
+    res &= char(u shr 12 or 0xE0)
+    res &= char(u shr 6 and 0x3F or 0x80)
+    res &= char(u and 0x3F or 0x80)
+  else:
+    res &= char(u shr 18 or 0xF0)
+    res &= char(u shr 12 and 0x3F or 0x80)
+    res &= char(u shr 6 and 0x3F or 0x80)
+    res &= char(u and 0x3F or 0x80)
+
+func addUTF8*(res: var string; us: openArray[uint32]) =
+  for u in us:
+    res.addUTF8(u)
+
+func toUTF8*(u: uint32): string =
+  var s = ""
+  s.addUTF8(u)
+  return s
+
+func toUTF8*(us: openArray[uint32]): string =
+  var s = newStringOfCap(us.len shr 2)
+  s.addUTF8(us)
+  return s
+
+func pointLen*(s: openArray[char]): int =
+  var n = 0
+  for u in s.points:
+    inc n
+  return n
diff --git a/src/utils/widthconv.nim b/src/utils/widthconv.nim
index b6495379..32a904f6 100644
--- a/src/utils/widthconv.nim
+++ b/src/utils/widthconv.nim
@@ -1,66 +1,64 @@
 import std/strutils
-import std/unicode
+
 import utils/map
+import utils/twtuni
 
 const CanHaveDakuten = ("かきくけこさしすせそたちつてとはひふへほカキクケコ" &
-  "サシスセソタチツテトハヒフヘホ").toRunes()
+  "サシスセソタチツテトハヒフヘホ").toPoints()
 
-const CanHaveHanDakuten = "はひふへほハヒフヘホ".toRunes()
+const CanHaveHanDakuten = "はひふへほハヒフヘホ".toPoints()
 
 const HasDakuten = ("がぎぐげござじずぜぞだぢづでどばびぶべぼガギグゲゴ" &
-  "ザジズゼゾダヂヅデドバビブベボ").toRunes()
+  "ザジズゼゾダヂヅデドバビブベボ").toPoints()
 
-const HasHanDakuten = "ぱぴぷぺぽパピプペポ".toRunes()
+const HasHanDakuten = "ぱぴぷぺぽパピプペポ".toPoints()
 
 # Halfwidth to fullwidth & vice versa
-const halfFullMap = (func(): seq[tuple[half, full1, full2: Rune]] =
+const halfFullMap = (func(): seq[tuple[half, full1, full2: uint32]] =
   result = @[]
   const map = staticRead"res/widthconvmap.tab"
   for line in map.split('\n'):
     if line == "":
       break
     var i = 0
-    var half: Rune
-    fastRuneAt(line, i, half)
+    let half = line.nextUTF8(i)
     assert line[i] == '\t'
     inc i
-    var full1: Rune
-    fastRuneAt(line, i, full1)
-    var full2 = Rune(0)
+    let full1 = line.nextUTF8(i)
+    var full2 = 0u32
     if i < line.len:
       assert line[i] == '\t'
-      inc i
-      fastRuneAt(line, i, full2)
+      inc i; full2 = line.nextUTF8(i) # skip the tab, then decode full2
     result.add((half, full1, full2))
 )()
 
-func halfwidth(r: Rune): Rune =
-  if r != Rune(0): # special case to avoid comparison with f2
+func halfwidth(u: uint32): uint32 =
+  if u != 0: # special case to avoid comparison with f2
     for (h, f1, f2) in halfFullMap:
-      if f1 == r or f2 == r:
+      if f1 == u or f2 == u:
         return h
-  return r
+  return u
 
-const HalfDakuten = Rune(0xFF9E) # half-width dakuten
-const HalfHanDakuten = Rune(0xFF9F) # half-width handakuten
+const HalfDakuten = 0xFF9Eu32 # half-width dakuten
+const HalfHanDakuten = 0xFF9Fu32 # half-width handakuten
 
 # Note: in unicode, char + 1 is dakuten and char + 2 handakuten
 
 func halfwidth*(s: string): string =
   result = ""
-  for r in s.runes:
-    case r
+  for u in s.points:
+    case u
     of HasDakuten:
-      result &= halfwidth(Rune(uint32(r) - 1))
-      result &= HalfDakuten
+      result.addUTF8(halfwidth(u - 1))
+      result.addUTF8(HalfDakuten)
     of HasHanDakuten:
-      result &= halfwidth(Rune(uint32(r) - 2))
-      result &= HalfHanDakuten
+      result.addUTF8(halfwidth(u - 2))
+      result.addUTF8(HalfHanDakuten)
     else:
-      result &= halfwidth(r)
+      result.addUTF8(halfwidth(u))
 
-func fullwidth(r: Rune): Rune =
-  if r != Rune(0): # special case to avoid comparison with f2
+func fullwidth(r: uint32): uint32 =
+  if r != 0: # special case to avoid comparison with f2
     for (h, f1, f2) in halfFullMap:
       if h == r:
         return f1
@@ -68,45 +66,45 @@ func fullwidth(r: Rune): Rune =
 
 func fullwidth*(s: string): string =
   result = ""
-  var lastr = Rune(0)
-  for r in s.runes:
-    if lastr != Rune(0):
-      if r == HalfDakuten:
+  var lastu = 0u32
+  for u in s.points:
+    if lastu != 0:
+      if u == HalfDakuten:
         # flush with dakuten
-        result &= Rune(uint32(lastr) + 1)
-        lastr = Rune(0)
+        result.addUTF8(lastu + 1)
+        lastu = 0
         continue
-      elif r == HalfHanDakuten and lastr in CanHaveHanDakuten:
+      elif u == HalfHanDakuten and lastu in CanHaveHanDakuten:
         # flush with handakuten
-        result &= Rune(uint32(lastr) + 2)
-        lastr = Rune(0)
+        result.addUTF8(lastu + 2)
+        lastu = 0
         continue
-      result &= lastr
-      lastr = Rune(0)
-    let r = fullwidth(r)
-    if r in CanHaveDakuten:
-      lastr = r
+      result.addUTF8(lastu)
+      lastu = 0
+    let u = fullwidth(u)
+    if u in CanHaveDakuten:
+      lastu = u
     else:
-      result &= r
-  if lastr != Rune(0):
+      result.addUTF8(u)
+  if lastu != 0:
     # flush
-    result &= lastr
+    result.addUTF8(lastu)
 
 const kanamap = staticRead"res/kanamap.tab"
 func genFullSizeMap(): seq[(uint32, uint32)] =
   result = @[]
   for line in kanamap.split('\n'):
     if line.len == 0: break
-    let rs = line.toRunes()
-    assert rs[1] == Rune('\t')
-    result.add((uint32(rs[0]), uint32(rs[2])))
+    let rs = line.toPoints()
+    assert rs[1] == uint32('\t')
+    result.add((rs[0], rs[2]))
 const fullSizeMap = genFullSizeMap()
 
 proc fullsize*(s: string): string =
   result = ""
-  for r in s.runes:
-    let i = searchInMap(fullSizeMap, uint32(r))
+  for u in s.points:
+    let i = searchInMap(fullSizeMap, u)
     if i == -1:
-      result &= r
+      result.addUTF8(u)
     else:
-      result &= $Rune(fullSizeMap[i][1])
+      result.addUTF8(fullSizeMap[i][1])
diff --git a/src/utils/wordbreak.nim b/src/utils/wordbreak.nim
index c93d63ec..ff607fbe 100644
--- a/src/utils/wordbreak.nim
+++ b/src/utils/wordbreak.nim
@@ -1,5 +1,3 @@
-import std/unicode
-
 import utils/charcategory
 import utils/luwrap
 import utils/strwidth
@@ -7,39 +5,39 @@ import utils/strwidth
 type BreakCategory* = enum
   bcAlpha, bcSpace, bcSymbol, bcHan, bcHiragana, bcKatakana, bcHangul
 
-func isDigitAscii(r: Rune): bool =
-  return uint32(r) < 128 and char(r) in AsciiDigit
+func isDigitAscii(u: uint32): bool =
+  return u < 128 and char(u) in AsciiDigit
 
-proc breaksWord*(ctx: LUContext; r: Rune): bool =
-  return not r.isDigitAscii() and r.width() != 0 and not ctx.isAlphaLU(r)
+proc breaksWord*(ctx: LUContext; u: uint32): bool =
+  return not u.isDigitAscii() and u.width() != 0 and not ctx.isAlphaLU(u)
 
-proc breaksViWordCat*(ctx: LUContext; r: Rune): BreakCategory =
-  if int32(r) < 0x80: # ASCII
-    let c = char(r)
+proc breaksViWordCat*(ctx: LUContext; u: uint32): BreakCategory =
+  if u < 0x80: # ASCII
+    let c = char(u)
     if c in AsciiAlphaNumeric + {'_'}:
       return bcAlpha
     elif c in AsciiWhitespace:
       return bcSpace
-  elif ctx.isWhiteSpaceLU(r):
+  elif ctx.isWhiteSpaceLU(u):
     return bcSpace
-  elif ctx.isAlphaLU(r):
-    if ctx.isHiragana(r):
+  elif ctx.isAlphaLU(u):
+    if ctx.isHiragana(u):
       return bcHiragana
-    elif ctx.isKatakana(r):
+    elif ctx.isKatakana(u):
       return bcKatakana
-    elif ctx.isHangul(r):
+    elif ctx.isHangul(u):
       return bcHangul
-    elif ctx.isHan(r):
+    elif ctx.isHan(u):
       return bcHan
     return bcAlpha
   return bcSymbol
 
-proc breaksWordCat*(ctx: LUContext; r: Rune): BreakCategory =
-  if not ctx.breaksWord(r):
+proc breaksWordCat*(ctx: LUContext; u: uint32): BreakCategory =
+  if not ctx.breaksWord(u):
     return bcAlpha
   return bcSpace
 
-proc breaksBigWordCat*(ctx: LUContext; r: Rune): BreakCategory =
-  if not ctx.isWhiteSpaceLU(r):
+proc breaksBigWordCat*(ctx: LUContext; u: uint32): BreakCategory =
+  if not ctx.isWhiteSpaceLU(u):
     return bcAlpha
   return bcSpace