diff options
author | bptato <nincsnevem662@gmail.com> | 2024-05-10 14:56:28 +0200 |
---|---|---|
committer | bptato <nincsnevem662@gmail.com> | 2024-05-10 15:07:24 +0200 |
commit | 99c6d7cd15a29ffba54836f26151847176a8569c (patch) | |
tree | b9cc9308ba1fd7d845c186f441b72524c0ae453d /src/utils | |
parent | 2453c63b0b12baa9bd78c0a114b58f1c3833e967 (diff) | |
download | chawan-99c6d7cd15a29ffba54836f26151847176a8569c.tar.gz |
luwrap: use separate context (+ various cleanups)
Use a LUContext so that each required CharRange is loaded only once per pager. Also, add kana (hiragana, katakana) and hangul break categories for vi-style word motions, for convenience.
Diffstat (limited to 'src/utils')
-rw-r--r-- | src/utils/luwrap.nim | 78 | ||||
-rw-r--r-- | src/utils/strwidth.nim | 45 | ||||
-rw-r--r-- | src/utils/twtstr.nim | 171 | ||||
-rw-r--r-- | src/utils/wordbreak.nim | 44 |
4 files changed, 147 insertions, 191 deletions
diff --git a/src/utils/luwrap.nim b/src/utils/luwrap.nim index 612982e0..853d3015 100644 --- a/src/utils/luwrap.nim +++ b/src/utils/luwrap.nim @@ -79,22 +79,62 @@ func contains(cr: CharRange; r: Rune): bool = let L = cr.len div 2 - 1 return cps.toOpenArray(0, L).binarySearch(uint32(r), cmpRange) != -1 -proc isGeneralCategoryLU*(r: Rune; s: string): bool = - var cr: CharRange - cr_init(addr cr, nil, passRealloc) - doAssert unicode_general_category(addr cr, s) == 0 - result = r in cr - cr_free(addr cr) - -proc isAlphaLU*(r: Rune): bool = - return r.isGeneralCategoryLU("Letter") - -proc isScriptLU*(r: Rune; s: string): bool = - var cr: CharRange - cr_init(addr cr, nil, passRealloc) - doAssert unicode_script(addr cr, s, 0) == 0 - result = r in cr - cr_free(addr cr) - -proc isWhiteSpaceLU*(r: Rune): bool = - return r.isGeneralCategoryLU("Separator") +type + LURangeType = enum + lurLetter = "Letter" + lurSeparator = "Separator" + lurHan = "Han" + lurHiragana = "Hiragana" + lurKatakana = "Katakana" + lurHangul = "Hangul" + + LUContextObj = object + crs: array[LURangeType, CharRange] + inited: set[LURangeType] + + LUContext* = ref LUContextObj + +{.warning[Deprecated]: off.}: + proc `=destroy`*(ctx: var LUContextObj) = + for lur, cr in ctx.crs.mpairs: + if lur in ctx.inited: + cr_free(addr cr) + ctx.inited = {} + +proc initGeneralCategory(ctx: LUContext; lur: LURangeType) = + if lur notin ctx.inited: + let p = addr ctx.crs[lur] + cr_init(p, nil, passRealloc) + doAssert unicode_general_category(p, cstring($lur)) == 0 + ctx.inited.incl(lur) + +proc initScript(ctx: LUContext; lur: LURangeType) = + if lur notin ctx.inited: + let p = addr ctx.crs[lur] + cr_init(p, nil, passRealloc) + doAssert unicode_script(p, cstring($lur), 0) == 0 + ctx.inited.incl(lur) + +proc isAlphaLU*(ctx: LUContext; r: Rune): bool = + ctx.initGeneralCategory(lurLetter) + return r in ctx.crs[lurLetter] + +proc isWhiteSpaceLU*(ctx: LUContext; r: Rune): bool = + ctx.initGeneralCategory(lurSeparator) + 
return r in ctx.crs[lurSeparator] + +proc isHan*(ctx: LUContext; r: Rune): bool = + ctx.initScript(lurHan) + return r in ctx.crs[lurHan] + +proc isHiragana*(ctx: LUContext; r: Rune): bool = + ctx.initScript(lurHiragana) + return r in ctx.crs[lurHiragana] + +proc isKatakana*(ctx: LUContext; r: Rune): bool = + ctx.initScript(lurKatakana) + return r in ctx.crs[lurKatakana] + +proc isHangul*(ctx: LUContext; r: Rune): bool = + ctx.initScript(lurHangul) + return r in ctx.crs[lurHangul] diff --git a/src/utils/strwidth.nim b/src/utils/strwidth.nim index fe089328..a3acbef7 100644 --- a/src/utils/strwidth.nim +++ b/src/utils/strwidth.nim @@ -1,4 +1,3 @@ -import std/strutils import std/unicode import utils/proptable @@ -40,40 +39,48 @@ func twidth*(r: Rune; w: int): int = return ((w div 8) + 1) * 8 - w func width*(s: string): int = - for r in s.runes(): + result = 0 + for r in s.runes: result += r.twidth(result) func width*(s: string; start, len: int): int = + result = 0 var i = start var m = len - if m > s.len: m = s.len + if m > s.len: + m = s.len while i < m: var r: Rune fastRuneAt(s, i, r) result += r.twidth(result) func notwidth*(s: string): int = + result = 0 for r in s.runes: result += r.width() func twidth*(s: string; w: int): int = var i = w - for r in s.runes(): + for r in s.runes: i += r.twidth(w) return i - w -func padToWidth*(str: string; size: int; schar = '$'): string = - if str.width() < size: - return str & ' '.repeat(size - str.width()) - else: - let size = size - 1 - result = newStringOfCap(str.len) - var w = 0 - var i = 0 - while i < str.len: - var r: Rune - fastRuneAt(str, i, r) - if w + r.width <= size: - result &= r - w += r.width - result &= schar +func padToWidth*(s: string; size: int; schar = '$'): string = + result = newStringOfCap(s.len) + var w = 0 + var r: Rune + var i = 0 + while i < s.len: + fastRuneAt(s, i, r) + w += r.width() + if w > size - 1: + break + result &= r + if w > size - 1: + if w == size and i == s.len: + result &= r + else: + 
result &= schar + while w < size: + result &= ' ' + inc w diff --git a/src/utils/twtstr.nim b/src/utils/twtstr.nim index da234982..c657d15b 100644 --- a/src/utils/twtstr.nim +++ b/src/utils/twtstr.nim @@ -30,16 +30,16 @@ func getControlLetter*(c: char): char = return '?' return char(int(c) or 0x40) -func toHeaderCase*(str: string): string = - result = str +func toHeaderCase*(s: string): string = + result = s var flip = true for c in result.mitems: if flip: c = c.toUpperAscii() flip = c == '-' -func snakeToKebabCase*(str: string): string = - result = str +func snakeToKebabCase*(s: string): string = + result = s for c in result.mitems: if c == '_': c = '-' @@ -61,13 +61,15 @@ func camelToKebabCase*(s: string): string = else: result &= c -func startsWithNoCase*(str, prefix: string): bool = - if str.len < prefix.len: return false +func startsWithNoCase*(s, prefix: string): bool = + if s.len < prefix.len: + return false # prefix.len is always lower var i = 0 while true: if i == prefix.len: return true - if str[i].toLowerAscii() != prefix[i].toLowerAscii(): return false + if s[i].toLowerAscii() != prefix[i].toLowerAscii(): + return false inc i func hexValue*(c: char): int = @@ -126,12 +128,15 @@ func endsWithIgnoreCase*(s1, s2: string): bool = return false return true +func skipBlanks*(buf: string; at: int): int = + result = at + while result < buf.len and buf[result] in AsciiWhitespace: + inc result + func stripAndCollapse*(s: string): string = - var i = 0 - while i < s.len and s[i] in AsciiWhitespace: - inc i var space = false - while i < s.len: + result = "" + for i in s.skipBlanks(0) ..< s.len: if s[i] notin AsciiWhitespace: if space: result &= ' ' @@ -141,19 +146,13 @@ func stripAndCollapse*(s: string): string = space = true else: result &= ' ' - inc i - -func skipBlanks*(buf: string; at: int): int = - result = at - while result < buf.len and buf[result] in AsciiWhitespace: - inc result func until*(s: string; c: set[char]; starti = 0): string = result = "" for i in 
starti ..< s.len: if s[i] in c: break - result.add(s[i]) + result &= s[i] func untilLower*(s: string; c: set[char]; starti = 0): string = result = "" @@ -163,14 +162,13 @@ func untilLower*(s: string; c: set[char]; starti = 0): string = result.add(s[i].toLowerAscii()) func until*(s: string; c: char; starti = 0): string = - s.until({c}, starti) + return s.until({c}, starti) func after*(s: string; c: set[char]): string = - var i = 0 - while i < s.len: - if s[i] in c: - return s.substr(i + 1) - inc i + let i = s.find(c) + if i != -1: + return s.substr(i + 1) + return "" func after*(s: string; c: char): string = s.after({c}) @@ -215,100 +213,6 @@ func convertSize*(size: int): string = discard c_sprintf(cstring(result), cstring("%.3g%s"), f, SizeUnit[sizepos]) result.setLen(cstring(result).len) -func numberAdditive*(i: int; range: HSlice[int, int]; - symbols: openArray[(int, string)]): string = - if i notin range: - return $i - var n = i - var at = 0 - while n > 0: - if n >= symbols[at][0]: - n -= symbols[at][0] - result &= symbols[at][1] - continue - inc at - return result - -const romanNumbers = [ - (1000, "M"), (900, "CM"), (500, "D"), (400, "CD"), (100, "C"), (90, "XC"), - (50, "L"), (40, "XL"), (10, "X"), (9, "IX"), (5, "V"), (4, "IV"), (1, "I") -] - -const romanNumbersLower = block: - var res: seq[(int, string)] - for (n, s) in romanNumbers: - res.add((n, s.toLowerAscii())) - res - -func romanNumber*(i: int): string = - return numberAdditive(i, 1..3999, romanNumbers) - -func romanNumberLower*(i: int): string = - return numberAdditive(i, 1..3999, romanNumbersLower) - -func japaneseNumber*(i: int): string = - if i == 0: - return "〇" - var n = i - if i < 0: - result &= "マイナス" - n *= -1 - - let o = n - - var ss: seq[string] - var d = 0 - while n > 0: - let m = n mod 10 - - if m != 0: - case d - of 1: ss.add("十") - of 2: ss.add("百") - of 3: ss.add("千") - of 4: - ss.add("万") - ss.add("一") - of 5: - ss.add("万") - ss.add("十") - of 6: - ss.add("万") - ss.add("百") - of 7: - 
ss.add("万") - ss.add("千") - ss.add("一") - of 8: - ss.add("億") - ss.add("一") - of 9: - ss.add("億") - ss.add("十") - else: discard - case m - of 0: - inc d - n = n div 10 - of 1: - if o == n: - ss.add("一") - of 2: ss.add("二") - of 3: ss.add("三") - of 4: ss.add("四") - of 5: ss.add("五") - of 6: ss.add("六") - of 7: ss.add("七") - of 8: ss.add("八") - of 9: ss.add("九") - else: discard - n -= m - - n = ss.len - 1 - while n >= 0: - result &= ss[n] - dec n - # Implements https://html.spec.whatwg.org/multipage/common-microsyntaxes.html#signed-integers func parseIntImpl[T: SomeSignedInt](s: string; allowed: set[char]; radix: T): Option[T] = @@ -540,28 +444,28 @@ const NameCharRanges = [ # + NameStartCharRanges ] const NameStartCharAscii = {':', '_'} + AsciiAlpha const NameCharAscii = NameStartCharAscii + {'-', '.'} + AsciiDigit -func matchNameProduction*(str: string): bool = - if str.len == 0: +func matchNameProduction*(s: string): bool = + if s.len == 0: return false # NameStartChar var i = 0 var r: Rune - if str[i] in Ascii: - if str[i] notin NameStartCharAscii: + if s[i] in Ascii: + if s[i] notin NameStartCharAscii: return false inc i else: - fastRuneAt(str, i, r) + fastRuneAt(s, i, r) if not isInRange(NameStartCharRanges, int32(r)): return false # NameChar - while i < str.len: - if str[i] in Ascii: - if str[i] notin NameCharAscii: + while i < s.len: + if s[i] in Ascii: + if s[i] notin NameCharAscii: return false inc i else: - fastRuneAt(str, i, r) + fastRuneAt(s, i, r) if not isInRange(NameStartCharRanges, int32(r)) and not isInMap(NameCharRanges, int32(r)): return false @@ -606,21 +510,14 @@ proc expandPath*(path: string): string = return path func deleteChars*(s: string; todel: set[char]): string = - var i = 0 - block earlyret: - for j, c in s: - if c in todel: - i = j - break earlyret + let i = s.find(todel) + if i == -1: return s - var rs = newStringOfCap(s.len - 1) - for j in 0 ..< i: - rs &= s[j] + var rs = s.substr(0, i - 1) for j in i + 1 ..< s.len: if s[j] in todel: 
continue rs &= s[j] - inc i return rs func replaceControls*(s: string): string = diff --git a/src/utils/wordbreak.nim b/src/utils/wordbreak.nim index 80959be7..c93d63ec 100644 --- a/src/utils/wordbreak.nim +++ b/src/utils/wordbreak.nim @@ -4,30 +4,42 @@ import utils/charcategory import utils/luwrap import utils/strwidth +type BreakCategory* = enum + bcAlpha, bcSpace, bcSymbol, bcHan, bcHiragana, bcKatakana, bcHangul + func isDigitAscii(r: Rune): bool = return uint32(r) < 128 and char(r) in AsciiDigit -type BreakCategory* = enum - bcAlpha, bcSpace, bcSymbol, bcHan +proc breaksWord*(ctx: LUContext; r: Rune): bool = + return not r.isDigitAscii() and r.width() != 0 and not ctx.isAlphaLU(r) -func breaksWord*(r: Rune): bool = - return not r.isDigitAscii() and r.width() != 0 and not r.isAlphaLU() - -func breaksViWordCat*(r: Rune): BreakCategory = - if r.isWhiteSpaceLU(): +proc breaksViWordCat*(ctx: LUContext; r: Rune): BreakCategory = + if int32(r) < 0x80: # ASCII + let c = char(r) + if c in AsciiAlphaNumeric + {'_'}: + return bcAlpha + elif c in AsciiWhitespace: + return bcSpace + elif ctx.isWhiteSpaceLU(r): return bcSpace - elif r.breaksWord() and r != Rune'_': - return bcSymbol - elif r.isScriptLU("Han"): - return bcHan - return bcAlpha + elif ctx.isAlphaLU(r): + if ctx.isHiragana(r): + return bcHiragana + elif ctx.isKatakana(r): + return bcKatakana + elif ctx.isHangul(r): + return bcHangul + elif ctx.isHan(r): + return bcHan + return bcAlpha + return bcSymbol -func breaksWordCat*(r: Rune): BreakCategory = - if not r.breaksWord(): +proc breaksWordCat*(ctx: LUContext; r: Rune): BreakCategory = + if not ctx.breaksWord(r): return bcAlpha return bcSpace -func breaksBigWordCat*(r: Rune): BreakCategory = - if not r.isWhiteSpaceLU(): +proc breaksBigWordCat*(ctx: LUContext; r: Rune): BreakCategory = + if not ctx.isWhiteSpaceLU(r): return bcAlpha return bcSpace |