about summary refs log tree commit diff stats
path: root/src/utils/twtstr.nim
diff options
context:
space:
mode:
author bptato <nincsnevem662@gmail.com> 2024-09-08 15:18:45 +0200
committer bptato <nincsnevem662@gmail.com> 2024-09-08 16:06:02 +0200
commit 4124c041ed2e3b497ede72fdae229aa2c6aca249 (patch)
tree e8488449de6f0be54b9c79547352829b998833d3 /src/utils/twtstr.nim
parent 5a64e3193924c7e503dddb10a99989148b26e922 (diff)
download chawan-4124c041ed2e3b497ede72fdae229aa2c6aca249.tar.gz
utils: add twtuni
std/unicode has the following issues:

* Rune is an int32, which implies overflow checking. Also, it is
  distinct, so you have to convert it manually to do arithmetic.
* QJS libunicode and Chagashi work with uint32, interfacing with these
  required pointless type conversions.
* fastRuneAt is a template, meaning it's pasted into every call
  site. Also, it decodes to UCS-4, so it generates two branches that
  aren't even used. Overall this led to quite some code bloat.
* fastRuneAt and lastRune have frustratingly different
  interfaces. Writing code to handle both cases is error-prone.
* On older Nim versions which we still support, std/unicode takes
  strings, not openArray[char]'s.

Replace it with "twtuni", which includes some improved versions of
the few procedures from std/unicode that we actually use.
Diffstat (limited to 'src/utils/twtstr.nim')
-rw-r--r-- src/utils/twtstr.nim 20
1 files changed, 9 insertions, 11 deletions
diff --git a/src/utils/twtstr.nim b/src/utils/twtstr.nim
index 0d65be50..f08b1131 100644
--- a/src/utils/twtstr.nim
+++ b/src/utils/twtstr.nim
@@ -3,7 +3,6 @@ import std/math
 import std/options
 import std/os
 import std/strutils
-import std/unicode
 
 when defined(posix):
   import std/posix
@@ -11,14 +10,15 @@ when defined(posix):
 import types/opt
 import utils/charcategory
 import utils/map
+import utils/twtuni
 
 export charcategory
 
 func onlyWhitespace*(s: string): bool =
   return AllChars - AsciiWhitespace notin s
 
-func isControlChar*(r: Rune): bool =
-  return int(r) <= 0x1F or int(r) == 0x7F
+func isControlChar*(u: uint32): bool =
+  return u <= 0x1F or u == 0x7F
 
 func getControlChar*(c: char): char =
   if c == '?':
@@ -444,14 +444,13 @@ func matchNameProduction*(s: string): bool =
     return false
   # NameStartChar
   var i = 0
-  var r: Rune
   if s[i] in Ascii:
     if s[i] notin NameStartCharAscii:
       return false
     inc i
   else:
-    fastRuneAt(s, i, r)
-    if not NameStartCharRanges.isInRange(uint32(r)):
+    let u = s.nextUTF8(i)
+    if not NameStartCharRanges.isInRange(u):
       return false
   # NameChar
   while i < s.len:
@@ -460,9 +459,8 @@ func matchNameProduction*(s: string): bool =
         return false
       inc i
     else:
-      fastRuneAt(s, i, r)
-      if not NameStartCharRanges.isInRange(uint32(r)) and
-          not NameCharRanges.isInMap(uint32(r)):
+      let u = s.nextUTF8(i)
+      if not NameStartCharRanges.isInRange(u) and not NameCharRanges.isInMap(u):
         return false
   return true
 
@@ -483,8 +481,8 @@ func matchQNameProduction*(s: string): bool =
 
 func utf16Len*(s: string): int =
   result = 0
-  for r in s.runes:
-    if uint32(r) < 0x10000: # ucs-2
+  for u in s.points:
+    if u < 0x10000: # ucs-2
       result += 1
     else: # surrogate
       result += 2