about summary refs log tree commit diff stats
path: root/src/utils
diff options
context:
space:
mode:
author bptato <nincsnevem662@gmail.com> 2024-05-10 14:56:28 +0200
committer bptato <nincsnevem662@gmail.com> 2024-05-10 15:07:24 +0200
commit 99c6d7cd15a29ffba54836f26151847176a8569c (patch)
tree b9cc9308ba1fd7d845c186f441b72524c0ae453d /src/utils
parent 2453c63b0b12baa9bd78c0a114b58f1c3833e967 (diff)
download chawan-99c6d7cd15a29ffba54836f26151847176a8569c.tar.gz
luwrap: use separate context (+ various cleanups)
Use a LUContext to only load required CharRanges once per pager.

Also, add kana & hangul vi word break categories for convenience.
Diffstat (limited to 'src/utils')
-rw-r--r-- src/utils/luwrap.nim 78
-rw-r--r-- src/utils/strwidth.nim 45
-rw-r--r-- src/utils/twtstr.nim 171
-rw-r--r-- src/utils/wordbreak.nim 44
4 files changed, 147 insertions, 191 deletions
diff --git a/src/utils/luwrap.nim b/src/utils/luwrap.nim
index 612982e0..853d3015 100644
--- a/src/utils/luwrap.nim
+++ b/src/utils/luwrap.nim
@@ -79,22 +79,62 @@ func contains(cr: CharRange; r: Rune): bool =
   let L = cr.len div 2 - 1
   return cps.toOpenArray(0, L).binarySearch(uint32(r), cmpRange) != -1
 
-proc isGeneralCategoryLU*(r: Rune; s: string): bool =
-  var cr: CharRange
-  cr_init(addr cr, nil, passRealloc)
-  doAssert unicode_general_category(addr cr, s) == 0
-  result = r in cr
-  cr_free(addr cr)
-
-proc isAlphaLU*(r: Rune): bool =
-  return r.isGeneralCategoryLU("Letter")
-
-proc isScriptLU*(r: Rune; s: string): bool =
-  var cr: CharRange
-  cr_init(addr cr, nil, passRealloc)
-  doAssert unicode_script(addr cr, s, 0) == 0
-  result = r in cr
-  cr_free(addr cr)
-
-proc isWhiteSpaceLU*(r: Rune): bool =
-  return r.isGeneralCategoryLU("Separator")
+type
+  LURangeType = enum
+    lurLetter = "Letter"
+    lurSeparator = "Separator"
+    lurHan = "Han"
+    lurHiragana = "Hiragana"
+    lurKatakana = "Katakana"
+    lurHangul = "Hangul"
+
+  LUContextObj = object
+    crs: array[LURangeType, CharRange]
+    inited: set[LURangeType]
+
+  LUContext* = ref LUContextObj
+
+{.warning[Deprecated]: off.}:
+  proc `=destroy`*(ctx: var LUContextObj) =
+    for lur, cr in ctx.crs.mpairs:
+      if lur in ctx.inited:
+        cr_free(addr cr)
+    ctx.inited = {}
+
+proc initGeneralCategory(ctx: LUContext; lur: LURangeType) =
+  if lur notin ctx.inited:
+    let p = addr ctx.crs[lur]
+    cr_init(p, nil, passRealloc)
+    doAssert unicode_general_category(p, cstring($lur)) == 0
+    ctx.inited.incl(lur)
+
+proc initScript(ctx: LUContext; lur: LURangeType) =
+  if lur notin ctx.inited:
+    let p = addr ctx.crs[lur]
+    cr_init(p, nil, passRealloc)
+    doAssert unicode_script(p, cstring($lur), 0) == 0
+    ctx.inited.incl(lur)
+
+proc isAlphaLU*(ctx: LUContext; r: Rune): bool =
+  ctx.initGeneralCategory(lurLetter)
+  return r in ctx.crs[lurLetter]
+
+proc isWhiteSpaceLU*(ctx: LUContext; r: Rune): bool =
+  ctx.initGeneralCategory(lurSeparator)
+  return r in ctx.crs[lurSeparator]
+
+proc isHan*(ctx: LUContext; r: Rune): bool =
+  ctx.initScript(lurHan)
+  return r in ctx.crs[lurHan]
+
+proc isHiragana*(ctx: LUContext; r: Rune): bool =
+  ctx.initScript(lurHiragana)
+  return r in ctx.crs[lurHiragana]
+
+proc isKatakana*(ctx: LUContext; r: Rune): bool =
+  ctx.initScript(lurKatakana)
+  return r in ctx.crs[lurKatakana]
+
+proc isHangul*(ctx: LUContext; r: Rune): bool =
+  ctx.initScript(lurHangul)
+  return r in ctx.crs[lurHangul]
diff --git a/src/utils/strwidth.nim b/src/utils/strwidth.nim
index fe089328..a3acbef7 100644
--- a/src/utils/strwidth.nim
+++ b/src/utils/strwidth.nim
@@ -1,4 +1,3 @@
-import std/strutils
 import std/unicode
 
 import utils/proptable
@@ -40,40 +39,48 @@ func twidth*(r: Rune; w: int): int =
   return ((w div 8) + 1) * 8 - w
 
 func width*(s: string): int =
-  for r in s.runes():
+  result = 0
+  for r in s.runes:
     result += r.twidth(result)
 
 func width*(s: string; start, len: int): int =
+  result = 0
   var i = start
   var m = len
-  if m > s.len: m = s.len
+  if m > s.len:
+    m = s.len
   while i < m:
     var r: Rune
     fastRuneAt(s, i, r)
     result += r.twidth(result)
 
 func notwidth*(s: string): int =
+  result = 0
   for r in s.runes:
     result += r.width()
 
 func twidth*(s: string; w: int): int =
   var i = w
-  for r in s.runes():
+  for r in s.runes:
     i += r.twidth(w)
   return i - w
 
-func padToWidth*(str: string; size: int; schar = '$'): string =
-  if str.width() < size:
-    return str & ' '.repeat(size - str.width())
-  else:
-    let size = size - 1
-    result = newStringOfCap(str.len)
-    var w = 0
-    var i = 0
-    while i < str.len:
-      var r: Rune
-      fastRuneAt(str, i, r)
-      if w + r.width <= size:
-        result &= r
-        w += r.width
-    result &= schar
+func padToWidth*(s: string; size: int; schar = '$'): string =
+  result = newStringOfCap(s.len)
+  var w = 0
+  var r: Rune
+  var i = 0
+  while i < s.len:
+    fastRuneAt(s, i, r)
+    w += r.width()
+    if w > size - 1:
+      break
+    result &= r
+  if w > size - 1:
+    if w == size and i == s.len:
+      result &= r
+    else:
+      result &= schar
+  while w < size:
+    result &= ' '
+    inc w
diff --git a/src/utils/twtstr.nim b/src/utils/twtstr.nim
index da234982..c657d15b 100644
--- a/src/utils/twtstr.nim
+++ b/src/utils/twtstr.nim
@@ -30,16 +30,16 @@ func getControlLetter*(c: char): char =
     return '?'
   return char(int(c) or 0x40)
 
-func toHeaderCase*(str: string): string =
-  result = str
+func toHeaderCase*(s: string): string =
+  result = s
   var flip = true
   for c in result.mitems:
     if flip:
       c = c.toUpperAscii()
     flip = c == '-'
 
-func snakeToKebabCase*(str: string): string =
-  result = str
+func snakeToKebabCase*(s: string): string =
+  result = s
   for c in result.mitems:
     if c == '_':
       c = '-'
@@ -61,13 +61,15 @@ func camelToKebabCase*(s: string): string =
     else:
       result &= c
 
-func startsWithNoCase*(str, prefix: string): bool =
-  if str.len < prefix.len: return false
+func startsWithNoCase*(s, prefix: string): bool =
+  if s.len < prefix.len:
+    return false
   # prefix.len is always lower
   var i = 0
   while true:
     if i == prefix.len: return true
-    if str[i].toLowerAscii() != prefix[i].toLowerAscii(): return false
+    if s[i].toLowerAscii() != prefix[i].toLowerAscii():
+      return false
     inc i
 
 func hexValue*(c: char): int =
@@ -126,12 +128,15 @@ func endsWithIgnoreCase*(s1, s2: string): bool =
       return false
   return true
 
+func skipBlanks*(buf: string; at: int): int =
+  result = at
+  while result < buf.len and buf[result] in AsciiWhitespace:
+    inc result
+
 func stripAndCollapse*(s: string): string =
-  var i = 0
-  while i < s.len and s[i] in AsciiWhitespace:
-    inc i
   var space = false
-  while i < s.len:
+  result = ""
+  for i in s.skipBlanks(0) ..< s.len:
     if s[i] notin AsciiWhitespace:
       if space:
         result &= ' '
@@ -141,19 +146,13 @@ func stripAndCollapse*(s: string): string =
       space = true
     else:
       result &= ' '
-    inc i
-
-func skipBlanks*(buf: string; at: int): int =
-  result = at
-  while result < buf.len and buf[result] in AsciiWhitespace:
-    inc result
 
 func until*(s: string; c: set[char]; starti = 0): string =
   result = ""
   for i in starti ..< s.len:
     if s[i] in c:
       break
-    result.add(s[i])
+    result &= s[i]
 
 func untilLower*(s: string; c: set[char]; starti = 0): string =
   result = ""
@@ -163,14 +162,13 @@ func untilLower*(s: string; c: set[char]; starti = 0): string =
     result.add(s[i].toLowerAscii())
 
 func until*(s: string; c: char; starti = 0): string =
-  s.until({c}, starti)
+  return s.until({c}, starti)
 
 func after*(s: string; c: set[char]): string =
-  var i = 0
-  while i < s.len:
-    if s[i] in c:
-      return s.substr(i + 1)
-    inc i
+  let i = s.find(c)
+  if i != -1:
+    return s.substr(i + 1)
+  return ""
 
 func after*(s: string; c: char): string = s.after({c})
 
@@ -215,100 +213,6 @@ func convertSize*(size: int): string =
   discard c_sprintf(cstring(result), cstring("%.3g%s"), f, SizeUnit[sizepos])
   result.setLen(cstring(result).len)
 
-func numberAdditive*(i: int; range: HSlice[int, int];
-    symbols: openArray[(int, string)]): string =
-  if i notin range:
-    return $i
-  var n = i
-  var at = 0
-  while n > 0:
-    if n >= symbols[at][0]:
-      n -= symbols[at][0]
-      result &= symbols[at][1]
-      continue
-    inc at
-  return result
-
-const romanNumbers = [
-  (1000, "M"), (900, "CM"), (500, "D"), (400, "CD"), (100, "C"), (90, "XC"),
-  (50, "L"), (40, "XL"), (10, "X"), (9, "IX"), (5, "V"), (4, "IV"), (1, "I")
-]
-
-const romanNumbersLower = block:
-  var res: seq[(int, string)]
-  for (n, s) in romanNumbers:
-    res.add((n, s.toLowerAscii()))
-  res
-
-func romanNumber*(i: int): string =
-  return numberAdditive(i, 1..3999, romanNumbers)
-
-func romanNumberLower*(i: int): string =
-  return numberAdditive(i, 1..3999, romanNumbersLower)
-
-func japaneseNumber*(i: int): string =
-  if i == 0:
-    return "〇"
-  var n = i
-  if i < 0:
-    result &= "マイナス"
-    n *= -1
-
-  let o = n
-
-  var ss: seq[string]
-  var d = 0
-  while n > 0:
-    let m = n mod 10
-
-    if m != 0:
-      case d
-      of 1: ss.add("十")
-      of 2: ss.add("百")
-      of 3: ss.add("千")
-      of 4:
-        ss.add("万")
-        ss.add("一")
-      of 5:
-        ss.add("万")
-        ss.add("十")
-      of 6:
-        ss.add("万")
-        ss.add("百")
-      of 7:
-        ss.add("万")
-        ss.add("千")
-        ss.add("一")
-      of 8:
-        ss.add("億")
-        ss.add("一")
-      of 9:
-        ss.add("億")
-        ss.add("十")
-      else: discard
-    case m
-    of 0:
-      inc d
-      n = n div 10
-    of 1:
-      if o == n:
-        ss.add("一")
-    of 2: ss.add("二")
-    of 3: ss.add("三")
-    of 4: ss.add("四")
-    of 5: ss.add("五")
-    of 6: ss.add("六")
-    of 7: ss.add("七")
-    of 8: ss.add("八")
-    of 9: ss.add("九")
-    else: discard
-    n -= m
-
-  n = ss.len - 1
-  while n >= 0:
-    result &= ss[n]
-    dec n
-
 # Implements https://html.spec.whatwg.org/multipage/common-microsyntaxes.html#signed-integers
 func parseIntImpl[T: SomeSignedInt](s: string; allowed: set[char]; radix: T):
     Option[T] =
@@ -540,28 +444,28 @@ const NameCharRanges = [ # + NameStartCharRanges
 ]
 const NameStartCharAscii = {':', '_'} + AsciiAlpha
 const NameCharAscii = NameStartCharAscii + {'-', '.'} + AsciiDigit
-func matchNameProduction*(str: string): bool =
-  if str.len == 0:
+func matchNameProduction*(s: string): bool =
+  if s.len == 0:
     return false
   # NameStartChar
   var i = 0
   var r: Rune
-  if str[i] in Ascii:
-    if str[i] notin NameStartCharAscii:
+  if s[i] in Ascii:
+    if s[i] notin NameStartCharAscii:
       return false
     inc i
   else:
-    fastRuneAt(str, i, r)
+    fastRuneAt(s, i, r)
     if not isInRange(NameStartCharRanges, int32(r)):
       return false
   # NameChar
-  while i < str.len:
-    if str[i] in Ascii:
-      if str[i] notin NameCharAscii:
+  while i < s.len:
+    if s[i] in Ascii:
+      if s[i] notin NameCharAscii:
         return false
       inc i
     else:
-      fastRuneAt(str, i, r)
+      fastRuneAt(s, i, r)
       if not isInRange(NameStartCharRanges, int32(r)) and
           not isInMap(NameCharRanges, int32(r)):
         return false
@@ -606,21 +510,14 @@ proc expandPath*(path: string): string =
     return path
 
 func deleteChars*(s: string; todel: set[char]): string =
-  var i = 0
-  block earlyret:
-    for j, c in s:
-      if c in todel:
-        i = j
-        break earlyret
+  let i = s.find(todel)
+  if i == -1:
     return s
-  var rs = newStringOfCap(s.len - 1)
-  for j in 0 ..< i:
-    rs &= s[j]
+  var rs = s.substr(0, i - 1)
   for j in i + 1 ..< s.len:
     if s[j] in todel:
       continue
     rs &= s[j]
-    inc i
   return rs
 
 func replaceControls*(s: string): string =
diff --git a/src/utils/wordbreak.nim b/src/utils/wordbreak.nim
index 80959be7..c93d63ec 100644
--- a/src/utils/wordbreak.nim
+++ b/src/utils/wordbreak.nim
@@ -4,30 +4,42 @@ import utils/charcategory
 import utils/luwrap
 import utils/strwidth
 
+type BreakCategory* = enum
+  bcAlpha, bcSpace, bcSymbol, bcHan, bcHiragana, bcKatakana, bcHangul
+
 func isDigitAscii(r: Rune): bool =
   return uint32(r) < 128 and char(r) in AsciiDigit
 
-type BreakCategory* = enum
-  bcAlpha, bcSpace, bcSymbol, bcHan
+proc breaksWord*(ctx: LUContext; r: Rune): bool =
+  return not r.isDigitAscii() and r.width() != 0 and not ctx.isAlphaLU(r)
 
-func breaksWord*(r: Rune): bool =
-  return not r.isDigitAscii() and r.width() != 0 and not r.isAlphaLU()
-
-func breaksViWordCat*(r: Rune): BreakCategory =
-  if r.isWhiteSpaceLU():
+proc breaksViWordCat*(ctx: LUContext; r: Rune): BreakCategory =
+  if int32(r) < 0x80: # ASCII
+    let c = char(r)
+    if c in AsciiAlphaNumeric + {'_'}:
+      return bcAlpha
+    elif c in AsciiWhitespace:
+      return bcSpace
+  elif ctx.isWhiteSpaceLU(r):
     return bcSpace
-  elif r.breaksWord() and r != Rune'_':
-    return bcSymbol
-  elif r.isScriptLU("Han"):
-    return bcHan
-  return bcAlpha
+  elif ctx.isAlphaLU(r):
+    if ctx.isHiragana(r):
+      return bcHiragana
+    elif ctx.isKatakana(r):
+      return bcKatakana
+    elif ctx.isHangul(r):
+      return bcHangul
+    elif ctx.isHan(r):
+      return bcHan
+    return bcAlpha
+  return bcSymbol
 
-func breaksWordCat*(r: Rune): BreakCategory =
-  if not r.breaksWord():
+proc breaksWordCat*(ctx: LUContext; r: Rune): BreakCategory =
+  if not ctx.breaksWord(r):
     return bcAlpha
   return bcSpace
 
-func breaksBigWordCat*(r: Rune): BreakCategory =
-  if not r.isWhiteSpaceLU():
+proc breaksBigWordCat*(ctx: LUContext; r: Rune): BreakCategory =
+  if not ctx.isWhiteSpaceLU(r):
     return bcAlpha
   return bcSpace