luwrap: replace Nim unicode maps with libunicode

Instead of using the built-in (and outdated, and buggy) tables, we now use libunicode from QJS (QuickJS). This shaves some bytes off the executable, though far fewer than I had imagined it would. Also, a surprising effect of this change: because libunicode's tables aren't glitched out, kanji are now properly classified as alpha. I found this greatly annoying, because `w' in Japanese text would now jump through whole sentences. As a band-aid solution I added an extra Han category, but I wish we had a more robust solution that could differentiate between *all* scripts. TODO: I suspect that separately loading the tables for every rune in breaksViWordCat is rather inefficient. Using some context object (at least per operation) would probably be beneficial.
author: bptato <nincsnevem662@gmail.com> 2024-05-09 21:57:00 +0200
committer: bptato <nincsnevem662@gmail.com> 2024-05-09 22:11:30 +0200
commit: 2453c63b0b12baa9bd78c0a114b58f1c3833e967 (patch)
tree: 34b37fa375f8500669877ec726afea0ba2ed2d99 /src/utils
parent: 200a3784de44b90351f7d9a1da47e85e06ff8e15 (diff)
download: chawan-2453c63b0b12baa9bd78c0a114b58f1c3833e967.tar.gz
3 files changed, 108 insertions, 48 deletions
diff --git a/src/utils/luwrap.nim b/src/utils/luwrap.nim
index 330a5d1e..612982e0 100644
--- a/src/utils/luwrap.nim
+++ b/src/utils/luwrap.nim
@@ -1,3 +1,4 @@
+import std/algorithm
 import std/strutils
 import std/unicode
 
@@ -7,9 +8,10 @@ import utils/charcategory
 proc passRealloc(opaque, p: pointer; size: csize_t): pointer {.cdecl.} =
   return realloc(p, size)
 
-proc mnormalize*(rs: var seq[Rune]; form = UNICODE_NFC) =
+proc normalize*(rs: seq[Rune]; form = UNICODE_NFC): seq[Rune] =
   {.cast(noSideEffect).}:
-    if rs.len == 0: return
+    if rs.len == 0:
+      return @[]
     var outbuf: ptr uint32
     let p = cast[ptr uint32](unsafeAddr rs[0])
     let out_len = unicode_normalize(addr outbuf, p, cint(rs.len), form, nil,
@@ -18,29 +20,81 @@ proc mnormalize*(rs: var seq[Rune]; form = UNICODE_NFC) =
       raise newException(Defect, "Unicode normalization failed")
     if out_len == 0:
       return
-    rs = cast[seq[Rune]](newSeqUninitialized[uint32](out_len))
+    var rs = cast[seq[Rune]](newSeqUninitialized[uint32](out_len))
     copyMem(addr rs[0], outbuf, out_len * sizeof(uint32))
     dealloc(outbuf)
+    return rs
 
-#TODO maybe a utf8 normalization procedure?
 proc mnormalize*(s: var string) =
   if NonAscii notin s:
     return # no need to normalize ascii
-  var rs = s.toRunes()
-  rs.mnormalize()
-  s = $rs
+  s = $s.toRunes().normalize()
 
-func normalize*(rs: seq[Rune]; form = UNICODE_NFC): seq[Rune] =
-  {.cast(noSideEffect).}:
-    if rs.len == 0: return
-    var outbuf: ptr uint32
-    let p = cast[ptr uint32](unsafeAddr rs[0])
-    let out_len = unicode_normalize(addr outbuf, p, cint(rs.len), form,
-      nil, passRealloc)
-    if out_len < 0:
-      raise newException(Defect, "Unicode normalization failed")
-    if out_len == 0:
-      return
-    result = cast[seq[Rune]](newSeqUninitialized[uint32](out_len))
-    copyMem(addr result[0], outbuf, out_len * sizeof(uint32))
-    dealloc(outbuf)
+# n == 0: upper, 1: lower, 2: case fold
+proc toUpperLU(s: string; n: cint): string =
+  result = newStringOfCap(s.len)
+  for r in s.runes:
+    var outa: array[LRE_CC_RES_LEN_MAX, uint32]
+    let n = lre_case_conv(cast[ptr UncheckedArray[uint32]](addr outa[0]),
+      uint32(r), n)
+    for i in 0 ..< n:
+      result &= $Rune(outa[i])
+
+proc toUpperLU*(s: string): string =
+  return s.toUpperLU(0)
+
+proc toLowerLU*(s: string): string =
+  return s.toUpperLU(1)
+
+proc capitalizeLU*(s: string): string =
+  result = newStringOfCap(s.len)
+  var wordStart = true
+  for r in s.runes:
+    if lre_is_space(uint32(r)) == 1:
+      wordStart = true
+      result &= $r
+    elif wordStart:
+      var outa: array[LRE_CC_RES_LEN_MAX, uint32]
+      let n = lre_case_conv(cast[ptr UncheckedArray[uint32]](addr outa[0]),
+        uint32(r), 0)
+      for i in 0 ..< n:
+        result &= $Rune(outa[i])
+      wordStart = false
+    else:
+      result &= $r
+
+type u32pair* {.packed.} = object
+  a: uint32
+  b: uint32
+
+func cmpRange*(x: u32pair; y: uint32): int =
+  if x.a > y:
+    return 1
+  elif x.b < y:
+    return -1
+  return 0
+
+func contains(cr: CharRange; r: Rune): bool =
+  let cps = cast[ptr UncheckedArray[u32pair]](cr.points)
+  let L = cr.len div 2 - 1
+  return cps.toOpenArray(0, L).binarySearch(uint32(r), cmpRange) != -1
+
+proc isGeneralCategoryLU*(r: Rune; s: string): bool =
+  var cr: CharRange
+  cr_init(addr cr, nil, passRealloc)
+  doAssert unicode_general_category(addr cr, s) == 0
+  result = r in cr
+  cr_free(addr cr)
+
+proc isAlphaLU*(r: Rune): bool =
+  return r.isGeneralCategoryLU("Letter")
+
+proc isScriptLU*(r: Rune; s: string): bool =
+  var cr: CharRange
+  cr_init(addr cr, nil, passRealloc)
+  doAssert unicode_script(addr cr, s, 0) == 0
+  result = r in cr
+  cr_free(addr cr)
+
+proc isWhiteSpaceLU*(r: Rune): bool =
+  return r.isGeneralCategoryLU("Separator")
diff --git a/src/utils/strwidth.nim b/src/utils/strwidth.nim
index ba7f60a2..fe089328 100644
--- a/src/utils/strwidth.nim
+++ b/src/utils/strwidth.nim
@@ -2,7 +2,6 @@ import std/strutils
 import std/unicode
 
 import utils/proptable
-import utils/charcategory
 import utils/map
 
 include res/map/charwidth_gen
@@ -78,29 +77,3 @@ func padToWidth*(str: string; size: int; schar = '$'): string =
         result &= r
         w += r.width
     result &= schar
-
-func isDigitAscii(r: Rune): bool =
-  return uint32(r) < 128 and char(r) in AsciiDigit
-
-type BreakCategory* = enum
-  bcAlpha, bcSpace, bcSymbol
-
-func breaksWord*(r: Rune): bool =
-  return not (r.isDigitAscii() or r.width() == 0 or r.isAlpha())
-
-func breaksViWordCat*(r: Rune): BreakCategory =
-  if r.isWhiteSpace():
-    return bcSpace
-  elif r.breaksWord() and r != Rune'_':
-    return bcSymbol
-  return bcAlpha
-
-func breaksWordCat*(r: Rune): BreakCategory =
-  if not r.breaksWord():
-    return bcAlpha
-  return bcSpace
-
-func breaksBigWordCat*(r: Rune): BreakCategory =
-  if not r.isWhiteSpace():
-    return bcAlpha
-  return bcSpace
diff --git a/src/utils/wordbreak.nim b/src/utils/wordbreak.nim
new file mode 100644
index 00000000..80959be7
--- /dev/null
+++ b/src/utils/wordbreak.nim
@@ -0,0 +1,33 @@
+import std/unicode
+
+import utils/charcategory
+import utils/luwrap
+import utils/strwidth
+
+func isDigitAscii(r: Rune): bool =
+  return uint32(r) < 128 and char(r) in AsciiDigit
+
+type BreakCategory* = enum
+  bcAlpha, bcSpace, bcSymbol, bcHan
+
+func breaksWord*(r: Rune): bool =
+  return not r.isDigitAscii() and r.width() != 0 and not r.isAlphaLU()
+
+func breaksViWordCat*(r: Rune): BreakCategory =
+  if r.isWhiteSpaceLU():
+    return bcSpace
+  elif r.breaksWord() and r != Rune'_':
+    return bcSymbol
+  elif r.isScriptLU("Han"):
+    return bcHan
+  return bcAlpha
+
+func breaksWordCat*(r: Rune): BreakCategory =
+  if not r.breaksWord():
+    return bcAlpha
+  return bcSpace
+
+func breaksBigWordCat*(r: Rune): BreakCategory =
+  if not r.isWhiteSpaceLU():
+    return bcAlpha
+  return bcSpace
author	bptato <nincsnevem662@gmail.com>	2024-05-09 21:57:00 +0200
committer	bptato <nincsnevem662@gmail.com>	2024-05-09 22:11:30 +0200
commit	2453c63b0b12baa9bd78c0a114b58f1c3833e967 (patch)
tree	34b37fa375f8500669877ec726afea0ba2ed2d99 /src/utils
parent	200a3784de44b90351f7d9a1da47e85e06ff8e15 (diff)
download	chawan-2453c63b0b12baa9bd78c0a114b58f1c3833e967.tar.gz