luwrap: replace Nim unicode maps with libunicode

Instead of using the built-in (and outdated, and buggy) tables, we now use libunicode from QJS. This shaves some bytes off the executable, though far less than I had imagined it would. Also, a surprising effect of this change: because libunicode's tables aren't glitched out, kanji properly gets classified as alpha. I found this greatly annoying because `w' in Japanese text would now jump through whole sentences. As a band-aid solution I added an extra Han category, but I wish we had a more robust solution that could differentiate between *all* scripts. TODO: I suspect that separately loading the tables for every rune in breaksViWordCat is rather inefficient. Using some context object (at least per operation) would probably be beneficial.
author: bptato <nincsnevem662@gmail.com> 2024-05-09 21:57:00 +0200
committer: bptato <nincsnevem662@gmail.com> 2024-05-09 22:11:30 +0200
commit: 2453c63b0b12baa9bd78c0a114b58f1c3833e967 (patch)
tree: 34b37fa375f8500669877ec726afea0ba2ed2d99
parent: 200a3784de44b90351f7d9a1da47e85e06ff8e15 (diff)
download: chawan-2453c63b0b12baa9bd78c0a114b58f1c3833e967.tar.gz
8 files changed, 136 insertions, 69 deletions
diff --git a/src/bindings/libunicode.nim b/src/bindings/libunicode.nim
index f501ae3f..13a36da4 100644
--- a/src/bindings/libunicode.nim
+++ b/src/bindings/libunicode.nim
@@ -16,14 +16,29 @@ type
 {.push header: "quickjs/libunicode.h", importc.}
 
 proc cr_init*(cr: ptr CharRange; mem_opaque: pointer;
-  realloc_func: DynBufReallocFunc) {.importc.}
+  realloc_func: DynBufReallocFunc)
 
-proc cr_free*(cr: ptr CharRange) {.importc.}
+proc cr_free*(cr: ptr CharRange)
 
 proc unicode_normalize*(pdst: ptr ptr uint32; src: ptr uint32; src_len: cint;
   n_type: UnicodeNormalizationEnum; opaque: pointer;
-  realloc_func: DynBufReallocFunc): cint {.importc.}
+  realloc_func: DynBufReallocFunc): cint
 
+proc unicode_script*(cr: ptr CharRange; script_name: cstring; is_ext: cint):
+  cint {.importc, header: "quickjs/libunicode.h".}
+proc unicode_prop*(cr: ptr CharRange; prop_name: cstring): cint
 proc unicode_general_category*(cr: ptr CharRange; gc_name: cstring): cint
-  {.importc.}
+
+const LRE_CC_RES_LEN_MAX* = 3
+
+# conv_type:
+# 0 = to upper
+# 1 = to lower
+# 2 = case folding
+# res must be an array of LRE_CC_RES_LEN_MAX
+proc lre_case_conv*(res: ptr UncheckedArray[uint32]; c: uint32;
+  conv_type: cint): cint
+
+proc lre_is_space*(c: uint32): cint
+
 {.pop.}
diff --git a/src/layout/engine.nim b/src/layout/engine.nim
index d51c555b..ba073c27 100644
--- a/src/layout/engine.nim
+++ b/src/layout/engine.nim
@@ -881,9 +881,9 @@ iterator transform(text: seq[string]; v: CSSTextTransform): string {.inline.} =
   else:
     for str in text:
       let str = case v
-      of TextTransformCapitalize: str.capitalize()
-      of TextTransformUppercase: str.toUpper()
-      of TextTransformLowercase: str.toLower()
+      of TextTransformCapitalize: str.capitalizeLU()
+      of TextTransformUppercase: str.toUpperLU()
+      of TextTransformLowercase: str.toLowerLU()
       of TextTransformFullWidth: str.fullwidth()
       of TextTransformFullSizeKana: str.fullsize()
       of TextTransformChaHalfWidth: str.halfwidth()
diff --git a/src/local/container.nim b/src/local/container.nim
index e4901902..985e45ef 100644
--- a/src/local/container.nim
+++ b/src/local/container.nim
@@ -12,8 +12,8 @@ import io/promise
 import io/serversocket
 import io/socketstream
 import js/javascript
-import js/jstypes
 import js/jsregex
+import js/jstypes
 import layout/renderdocument
 import loader/headers
 import loader/loader
@@ -26,9 +26,11 @@ import types/cookie
 import types/referrer
 import types/url
 import types/winattrs
+import utils/luwrap
 import utils/mimeguess
 import utils/strwidth
 import utils/twtstr
+import utils/wordbreak
 
 import chagashi/charset
 
@@ -595,7 +597,7 @@ proc cursorLineTextStart(container: Container) {.jsfunc.} =
   if container.numLines == 0: return
   var x = 0
   for r in container.currentLine.runes:
-    if not r.isWhitespace():
+    if not r.isWhiteSpaceLU():
       break
     x += r.twidth(x)
   if x == 0:
diff --git a/src/local/lineedit.nim b/src/local/lineedit.nim
index 2db59e7b..e2b89f89 100644
--- a/src/local/lineedit.nim
+++ b/src/local/lineedit.nim
@@ -8,6 +8,7 @@ import types/opt
 import types/winattrs
 import utils/strwidth
 import utils/twtstr
+import utils/wordbreak
 
 import chagashi/charset
 import chagashi/validator
diff --git a/src/types/url.nim b/src/types/url.nim
index 54d6f8ed..abb76f56 100644
--- a/src/types/url.nim
+++ b/src/types/url.nim
@@ -245,17 +245,6 @@ func endsInNumber(input: string): bool =
       inc i
   return true
 
-type u32pair {.packed.} = object
-  a: uint32
-  b: uint32
-
-func cmpRange(x: u32pair; y: uint32): int =
-  if x.a > y:
-    return 1
-  elif x.b < y:
-    return -1
-  return 0
-
 type
   IDNATableStatus = enum
     itsValid, itsIgnored, itsMapped, itsDeviation, itsDisallowed
@@ -306,7 +295,7 @@ func processIdna(str: string; beStrict: bool): Option[string] =
     of itsDeviation: mapped &= r
     of itsValid: mapped &= r
   if mapped.len == 0: return
-  mapped.mnormalize()
+  mapped = mapped.normalize()
   var cr: CharRange
   {.cast(noSideEffect).}:
     cr_init(addr cr, nil, passRealloc)
diff --git a/src/utils/luwrap.nim b/src/utils/luwrap.nim
index 330a5d1e..612982e0 100644
--- a/src/utils/luwrap.nim
+++ b/src/utils/luwrap.nim
@@ -1,3 +1,4 @@
+import std/algorithm
 import std/strutils
 import std/unicode
 
@@ -7,9 +8,10 @@ import utils/charcategory
 proc passRealloc(opaque, p: pointer; size: csize_t): pointer {.cdecl.} =
   return realloc(p, size)
 
-proc mnormalize*(rs: var seq[Rune]; form = UNICODE_NFC) =
+proc normalize*(rs: seq[Rune]; form = UNICODE_NFC): seq[Rune] =
   {.cast(noSideEffect).}:
-    if rs.len == 0: return
+    if rs.len == 0:
+      return @[]
     var outbuf: ptr uint32
     let p = cast[ptr uint32](unsafeAddr rs[0])
     let out_len = unicode_normalize(addr outbuf, p, cint(rs.len), form, nil,
@@ -18,29 +20,81 @@ proc mnormalize*(rs: var seq[Rune]; form = UNICODE_NFC) =
       raise newException(Defect, "Unicode normalization failed")
     if out_len == 0:
       return
-    rs = cast[seq[Rune]](newSeqUninitialized[uint32](out_len))
+    var rs = cast[seq[Rune]](newSeqUninitialized[uint32](out_len))
     copyMem(addr rs[0], outbuf, out_len * sizeof(uint32))
     dealloc(outbuf)
+    return rs
 
-#TODO maybe a utf8 normalization procedure?
 proc mnormalize*(s: var string) =
   if NonAscii notin s:
     return # no need to normalize ascii
-  var rs = s.toRunes()
-  rs.mnormalize()
-  s = $rs
+  s = $s.toRunes().normalize()
 
-func normalize*(rs: seq[Rune]; form = UNICODE_NFC): seq[Rune] =
-  {.cast(noSideEffect).}:
-    if rs.len == 0: return
-    var outbuf: ptr uint32
-    let p = cast[ptr uint32](unsafeAddr rs[0])
-    let out_len = unicode_normalize(addr outbuf, p, cint(rs.len), form,
-      nil, passRealloc)
-    if out_len < 0:
-      raise newException(Defect, "Unicode normalization failed")
-    if out_len == 0:
-      return
-    result = cast[seq[Rune]](newSeqUninitialized[uint32](out_len))
-    copyMem(addr result[0], outbuf, out_len * sizeof(uint32))
-    dealloc(outbuf)
+# Shared driver for the case conversions below.
+# n is handed to lre_case_conv as conv_type: 0 = upper, 1 = lower,
+# 2 = case fold.
+proc toUpperLU(s: string; n: cint): string =
+  result = newStringOfCap(s.len)
+  # lre_case_conv writes at most LRE_CC_RES_LEN_MAX code points per rune.
+  var buf: array[LRE_CC_RES_LEN_MAX, uint32]
+  let bufp = cast[ptr UncheckedArray[uint32]](addr buf[0])
+  for r in s.runes:
+    let count = lre_case_conv(bufp, uint32(r), n)
+    for i in 0 ..< count:
+      result.add($Rune(buf[i]))
+
+proc toUpperLU*(s: string): string =
+  ## Converts every rune of `s` with conversion type 0 (to upper).
+  s.toUpperLU(0)
+
+proc toLowerLU*(s: string): string =
+  ## Converts every rune of `s` with conversion type 1 (to lower).
+  s.toUpperLU(1)
+
+proc capitalizeLU*(s: string): string =
+  ## Upper-cases the first rune of `s` and every rune that directly follows
+  ## a rune for which `lre_is_space` returns 1; all other runes are copied
+  ## unchanged.
+  result = newStringOfCap(s.len)
+  var atWordStart = true
+  for r in s.runes:
+    let isSpace = lre_is_space(uint32(r)) == 1
+    if not isSpace and atWordStart:
+      # conv_type 0 = to upper; one rune may expand to several code points.
+      var buf: array[LRE_CC_RES_LEN_MAX, uint32]
+      let count = lre_case_conv(
+        cast[ptr UncheckedArray[uint32]](addr buf[0]), uint32(r), 0)
+      for i in 0 ..< count:
+        result.add($Rune(buf[i]))
+    else:
+      result.add($r)
+    # The next rune starts a word only if this one was a space.
+    atWordStart = isSpace
+
+type u32pair* {.packed.} = object
+  ## One interval of a libunicode `CharRange`, i.e. two consecutive entries
+  ## of its `points` array. libunicode stores intervals half-open: [a, b).
+  a: uint32 ## first code point of the interval (inclusive)
+  b: uint32 ## one past the last code point of the interval (exclusive)
+
+func cmpRange*(x: u32pair; y: uint32): int =
+  ## Comparator for `binarySearch` over a sorted array of intervals.
+  ## Returns 1 if `x` lies entirely above `y`, -1 if it lies entirely
+  ## below `y`, and 0 if `y` is inside `x`.
+  if x.a > y:
+    return 1
+  elif x.b <= y:
+    # `b` is exclusive, so y == x.b is already outside the interval.
+    return -1
+  return 0
+
+func contains(cr: CharRange; r: Rune): bool =
+  ## Enables `r in cr`: binary-searches the points of `cr`, viewed as an
+  ## array of `u32pair` intervals, for one that `cmpRange` matches to `r`.
+  let ranges = cast[ptr UncheckedArray[u32pair]](cr.points)
+  # cr.len counts points, so there are cr.len div 2 pairs.
+  let last = cr.len div 2 - 1
+  let idx = ranges.toOpenArray(0, last).binarySearch(uint32(r), cmpRange)
+  return idx != -1
+
+proc isGeneralCategoryLU*(r: Rune; s: string): bool =
+  ## Returns true if `r` is in the range set that
+  ## `unicode_general_category` builds for the category name `s`.
+  ## Fails an assertion if libunicode reports an error for `s`.
+  ## Note: the range set is rebuilt on every call.
+  var cr: CharRange
+  cr_init(addr cr, nil, passRealloc)
+  # Release the range buffer on every exit path, including a failed
+  # assertion below.
+  defer: cr_free(addr cr)
+  doAssert unicode_general_category(addr cr, s) == 0
+  return r in cr
+
+proc isAlphaLU*(r: Rune): bool =
+  ## True if `r` is in the "Letter" general category.
+  r.isGeneralCategoryLU("Letter")
+
+proc isScriptLU*(r: Rune; s: string): bool =
+  ## Returns true if `r` is in the range set that `unicode_script` builds
+  ## for the script name `s`. `is_ext` is passed as 0 (presumably: do not
+  ## include script extensions; confirm against libunicode).
+  ## Fails an assertion if libunicode reports an error for `s`.
+  ## Note: the range set is rebuilt on every call.
+  var cr: CharRange
+  cr_init(addr cr, nil, passRealloc)
+  # Release the range buffer on every exit path, including a failed
+  # assertion below.
+  defer: cr_free(addr cr)
+  doAssert unicode_script(addr cr, s, 0) == 0
+  return r in cr
+
+proc isWhiteSpaceLU*(r: Rune): bool =
+  ## True if `r` is ASCII whitespace or in the "Separator" general
+  ## category.
+  if uint32(r) < 0x80:
+    # Tab, LF, VT, FF and CR are General_Category Cc, not Separator, so
+    # they must be tested directly. This also avoids building the range
+    # table for ASCII input.
+    return char(r) in {' ', '\t', '\v', '\r', '\l', '\f'}
+  return r.isGeneralCategoryLU("Separator")
diff --git a/src/utils/strwidth.nim b/src/utils/strwidth.nim
index ba7f60a2..fe089328 100644
--- a/src/utils/strwidth.nim
+++ b/src/utils/strwidth.nim
@@ -2,7 +2,6 @@ import std/strutils
 import std/unicode
 
 import utils/proptable
-import utils/charcategory
 import utils/map
 
 include res/map/charwidth_gen
@@ -78,29 +77,3 @@ func padToWidth*(str: string; size: int; schar = '$'): string =
         result &= r
         w += r.width
     result &= schar
-
-func isDigitAscii(r: Rune): bool =
-  return uint32(r) < 128 and char(r) in AsciiDigit
-
-type BreakCategory* = enum
-  bcAlpha, bcSpace, bcSymbol
-
-func breaksWord*(r: Rune): bool =
-  return not (r.isDigitAscii() or r.width() == 0 or r.isAlpha())
-
-func breaksViWordCat*(r: Rune): BreakCategory =
-  if r.isWhiteSpace():
-    return bcSpace
-  elif r.breaksWord() and r != Rune'_':
-    return bcSymbol
-  return bcAlpha
-
-func breaksWordCat*(r: Rune): BreakCategory =
-  if not r.breaksWord():
-    return bcAlpha
-  return bcSpace
-
-func breaksBigWordCat*(r: Rune): BreakCategory =
-  if not r.isWhiteSpace():
-    return bcAlpha
-  return bcSpace
diff --git a/src/utils/wordbreak.nim b/src/utils/wordbreak.nim
new file mode 100644
index 00000000..80959be7
--- /dev/null
+++ b/src/utils/wordbreak.nim
@@ -0,0 +1,33 @@
+import std/unicode
+
+import utils/charcategory
+import utils/luwrap
+import utils/strwidth
+
+func isDigitAscii(r: Rune): bool =
+  ## True if `r` is below 128 and its char value is in `AsciiDigit`.
+  let cp = uint32(r)
+  cp < 128 and char(cp) in AsciiDigit
+
+type BreakCategory* = enum
+  ## Classes returned by the breaks*Cat functions in this module.
+  bcAlpha
+  bcSpace
+  bcSymbol
+  bcHan
+
+func breaksWord*(r: Rune): bool =
+  ## True unless `r` is an ASCII digit, has zero width, or is a letter.
+  if r.isDigitAscii() or r.width() == 0:
+    return false
+  not r.isAlphaLU()
+
+func breaksViWordCat*(r: Rune): BreakCategory =
+  ## Classifies `r`, testing in order: whitespace, word-breaking symbol
+  ## (underscore excluded), Han script, then everything else as bcAlpha.
+  result =
+    if r.isWhiteSpaceLU(): bcSpace
+    elif r.breaksWord() and r != Rune'_': bcSymbol
+    elif r.isScriptLU("Han"): bcHan
+    else: bcAlpha
+
+func breaksWordCat*(r: Rune): BreakCategory =
+  ## Two-way classification: bcSpace if `breaksWord` holds for `r`,
+  ## bcAlpha otherwise.
+  result =
+    if r.breaksWord(): bcSpace
+    else: bcAlpha
+
+func breaksBigWordCat*(r: Rune): BreakCategory =
+  ## Two-way classification: bcSpace if `r` is whitespace, bcAlpha
+  ## otherwise.
+  result =
+    if r.isWhiteSpaceLU(): bcSpace
+    else: bcAlpha
author	bptato <nincsnevem662@gmail.com>	2024-05-09 21:57:00 +0200
committer	bptato <nincsnevem662@gmail.com>	2024-05-09 22:11:30 +0200
commit	2453c63b0b12baa9bd78c0a114b58f1c3833e967 (patch)
tree	34b37fa375f8500669877ec726afea0ba2ed2d99
parent	200a3784de44b90351f7d9a1da47e85e06ff8e15 (diff)
download	chawan-2453c63b0b12baa9bd78c0a114b58f1c3833e967.tar.gz