utils: add twtuni

std/unicode has the following issues: * Rune is an int32, which implies overflow checking. Also, it is distinct, so you have to convert it manually to do arithmetic. * QJS libunicode and Chagashi work with uint32, so interfacing with these required pointless type conversions. * fastRuneAt is a template, meaning it's pasted into every call site. Also, it decodes to UCS-4, so it generates two branches that aren't even used. Overall, this led to quite a bit of code bloat. * fastRuneAt and lastRune have frustratingly different interfaces. Writing code to handle both cases is error-prone. * On older Nim versions which we still support, std/unicode takes strings, not openArray[char] parameters. Replace it with "twtuni", which includes some improved versions of the few procedures from std/unicode that we actually use.
author: bptato <nincsnevem662@gmail.com> 2024-09-08 15:18:45 +0200
committer: bptato <nincsnevem662@gmail.com> 2024-09-08 16:06:02 +0200
commit: 4124c041ed2e3b497ede72fdae229aa2c6aca249 (patch)
tree: e8488449de6f0be54b9c79547352829b998833d3 /src/utils/twtstr.nim
parent: 5a64e3193924c7e503dddb10a99989148b26e922 (diff)
download: chawan-4124c041ed2e3b497ede72fdae229aa2c6aca249.tar.gz
1 files changed, 9 insertions, 11 deletions
diff --git a/src/utils/twtstr.nim b/src/utils/twtstr.nim
index 0d65be50..f08b1131 100644
--- a/src/utils/twtstr.nim
+++ b/src/utils/twtstr.nim
@@ -3,7 +3,6 @@ import std/math
 import std/options
 import std/os
 import std/strutils
-import std/unicode
 
 when defined(posix):
   import std/posix
@@ -11,14 +10,15 @@ when defined(posix):
 import types/opt
 import utils/charcategory
 import utils/map
+import utils/twtuni
 
 export charcategory
 
 func onlyWhitespace*(s: string): bool =
   return AllChars - AsciiWhitespace notin s
 
-func isControlChar*(r: Rune): bool =
-  return int(r) <= 0x1F or int(r) == 0x7F
+func isControlChar*(u: uint32): bool =
+  return u <= 0x1F or u == 0x7F
 
 func getControlChar*(c: char): char =
   if c == '?':
@@ -444,14 +444,13 @@ func matchNameProduction*(s: string): bool =
     return false
   # NameStartChar
   var i = 0
-  var r: Rune
   if s[i] in Ascii:
     if s[i] notin NameStartCharAscii:
       return false
     inc i
   else:
-    fastRuneAt(s, i, r)
-    if not NameStartCharRanges.isInRange(uint32(r)):
+    let u = s.nextUTF8(i)
+    if not NameStartCharRanges.isInRange(u):
       return false
   # NameChar
   while i < s.len:
@@ -460,9 +459,8 @@ func matchNameProduction*(s: string): bool =
         return false
       inc i
     else:
-      fastRuneAt(s, i, r)
-      if not NameStartCharRanges.isInRange(uint32(r)) and
-          not NameCharRanges.isInMap(uint32(r)):
+      let u = s.nextUTF8(i)
+      if not NameStartCharRanges.isInRange(u) and not NameCharRanges.isInMap(u):
         return false
   return true
 
@@ -483,8 +481,8 @@ func matchQNameProduction*(s: string): bool =
 
 func utf16Len*(s: string): int =
   result = 0
-  for r in s.runes:
-    if uint32(r) < 0x10000: # ucs-2
+  for u in s.points:
+    if u < 0x10000: # ucs-2
       result += 1
     else: # surrogate
       result += 2
author	bptato <nincsnevem662@gmail.com>	2024-09-08 15:18:45 +0200
committer	bptato <nincsnevem662@gmail.com>	2024-09-08 16:06:02 +0200
commit	4124c041ed2e3b497ede72fdae229aa2c6aca249 (patch)
tree	e8488449de6f0be54b9c79547352829b998833d3 /src/utils/twtstr.nim
parent	5a64e3193924c7e503dddb10a99989148b26e922 (diff)
download	chawan-4124c041ed2e3b497ede72fdae229aa2c6aca249.tar.gz