diff options
author | bptato <nincsnevem662@gmail.com> | 2024-09-08 15:18:45 +0200 |
---|---|---|
committer | bptato <nincsnevem662@gmail.com> | 2024-09-08 16:06:02 +0200 |
commit | 4124c041ed2e3b497ede72fdae229aa2c6aca249 (patch) | |
tree | e8488449de6f0be54b9c79547352829b998833d3 /src/utils | |
parent | 5a64e3193924c7e503dddb10a99989148b26e922 (diff) | |
download | chawan-4124c041ed2e3b497ede72fdae229aa2c6aca249.tar.gz |
utils: add twtuni
std/unicode has the following issues: * Rune is an int32, which implies overflow checking. Also, it is distinct, so you have to convert it manually to do arithmetic. * QJS libunicode and Chagashi work with uint32; interfacing with these required pointless type conversions. * fastRuneAt is a template, meaning it's pasted into every call site. Also, it decodes to UCS-4, so it generates two branches that aren't even used. Overall this led to quite some code bloat. * fastRuneAt and lastRune have frustratingly different interfaces. Writing code to handle both cases is error-prone. * On older Nim versions which we still support, std/unicode takes strings, not openArray[char]'s. Replace it with "twtuni", which includes some improved versions of the few procedures from std/unicode that we actually use.
Diffstat (limited to 'src/utils')
-rw-r--r-- | src/utils/luwrap.nim | 57 | ||||
-rw-r--r-- | src/utils/strwidth.nim | 82 | ||||
-rw-r--r-- | src/utils/twtstr.nim | 20 | ||||
-rw-r--r-- | src/utils/twtuni.nim | 95 | ||||
-rw-r--r-- | src/utils/widthconv.nim | 102 | ||||
-rw-r--r-- | src/utils/wordbreak.nim | 36 |
6 files changed, 226 insertions, 166 deletions
diff --git a/src/utils/luwrap.nim b/src/utils/luwrap.nim index 6081cdf8..76a5b2e1 100644 --- a/src/utils/luwrap.nim +++ b/src/utils/luwrap.nim @@ -1,14 +1,14 @@ import std/algorithm import std/strutils -import std/unicode import monoucha/libunicode import utils/charcategory +import utils/twtuni proc passRealloc(opaque, p: pointer; size: csize_t): pointer {.cdecl.} = return realloc(p, size) -proc normalize*(rs: seq[Rune]; form = UNICODE_NFC): seq[Rune] = +proc normalize*(rs: seq[uint32]; form = UNICODE_NFC): seq[uint32] = {.cast(noSideEffect).}: if rs.len == 0: return @[] @@ -20,7 +20,7 @@ proc normalize*(rs: seq[Rune]; form = UNICODE_NFC): seq[Rune] = raise newException(Defect, "Unicode normalization failed") if out_len == 0: return - var rs = cast[seq[Rune]](newSeqUninitialized[uint32](out_len)) + var rs = newSeqUninitialized[uint32](out_len) copyMem(addr rs[0], outbuf, out_len * sizeof(uint32)) dealloc(outbuf) return rs @@ -28,17 +28,15 @@ proc normalize*(rs: seq[Rune]; form = UNICODE_NFC): seq[Rune] = proc mnormalize*(s: var string) = if NonAscii notin s: return # no need to normalize ascii - s = $s.toRunes().normalize() + s = s.toPoints().normalize().toUTF8() # n == 0: upper, 1: lower, 2: case fold proc toUpperLU(s: string; n: cint): string = result = newStringOfCap(s.len) - for r in s.runes: + for u in s.points: var outa: array[LRE_CC_RES_LEN_MAX, uint32] - let n = lre_case_conv(cast[ptr UncheckedArray[uint32]](addr outa[0]), - uint32(r), n) - for i in 0 ..< n: - result &= $Rune(outa[i]) + let n = lre_case_conv(cast[ptr UncheckedArray[uint32]](addr outa[0]), u, n) + result.addUTF8(outa.toOpenArray(0, n - 1)) proc toUpperLU*(s: string): string = return s.toUpperLU(0) @@ -49,19 +47,18 @@ proc toLowerLU*(s: string): string = proc capitalizeLU*(s: string): string = result = newStringOfCap(s.len) var wordStart = true - for r in s.runes: - if lre_is_space(uint32(r)) == 1: + for u in s.points: + if lre_is_space(u) == 1: wordStart = true - result &= $r + 
result.addUTF8(u) elif wordStart: var outa: array[LRE_CC_RES_LEN_MAX, uint32] let n = lre_case_conv(cast[ptr UncheckedArray[uint32]](addr outa[0]), - uint32(r), 0) - for i in 0 ..< n: - result &= $Rune(outa[i]) + u, 0) + result.addUTF8(outa.toOpenArray(0, n - 1)) wordStart = false else: - result &= $r + result.addUTF8(u) type u32pair* {.packed.} = object a: uint32 @@ -74,10 +71,10 @@ func cmpRange*(x: u32pair; y: uint32): int = return -1 return 0 -func contains(cr: CharRange; r: Rune): bool = +func contains(cr: CharRange; u: uint32): bool = let cps = cast[ptr UncheckedArray[u32pair]](cr.points) let L = cr.len div 2 - 1 - return cps.toOpenArray(0, L).binarySearch(uint32(r), cmpRange) != -1 + return cps.toOpenArray(0, L).binarySearch(u, cmpRange) != -1 type LURangeType = enum @@ -114,26 +111,26 @@ proc initScript(ctx: LUContext; lur: LURangeType) = doAssert unicode_script(p, cstring($lur), 0) == 0 ctx.inited.incl(lur) -proc isAlphaLU*(ctx: LUContext; r: Rune): bool = +proc isAlphaLU*(ctx: LUContext; u: uint32): bool = ctx.initGeneralCategory(lurLetter) - return r in ctx.crs[lurLetter] + return u in ctx.crs[lurLetter] -proc isWhiteSpaceLU*(ctx: LUContext; r: Rune): bool = +proc isWhiteSpaceLU*(ctx: LUContext; u: uint32): bool = ctx.initGeneralCategory(lurSeparator) - return r in ctx.crs[lurSeparator] + return u in ctx.crs[lurSeparator] -proc isHan*(ctx: LUContext; r: Rune): bool = +proc isHan*(ctx: LUContext; u: uint32): bool = ctx.initScript(lurHan) - return r in ctx.crs[lurHan] + return u in ctx.crs[lurHan] -proc isHiragana*(ctx: LUContext; r: Rune): bool = +proc isHiragana*(ctx: LUContext; u: uint32): bool = ctx.initScript(lurHiragana) - return r in ctx.crs[lurHiragana] + return u in ctx.crs[lurHiragana] -proc isKatakana*(ctx: LUContext; r: Rune): bool = +proc isKatakana*(ctx: LUContext; u: uint32): bool = ctx.initScript(lurKatakana) - return r in ctx.crs[lurKatakana] + return u in ctx.crs[lurKatakana] -proc isHangul*(ctx: LUContext; r: Rune): bool = +proc 
isHangul*(ctx: LUContext; u: uint32): bool = ctx.initScript(lurHangul) - return r in ctx.crs[lurHangul] + return u in ctx.crs[lurHangul] diff --git a/src/utils/strwidth.nim b/src/utils/strwidth.nim index 8c367991..4ce9aa12 100644 --- a/src/utils/strwidth.nim +++ b/src/utils/strwidth.nim @@ -1,7 +1,6 @@ -import std/unicode - -import utils/proptable import utils/map +import utils/proptable +import utils/twtuni include res/map/charwidth_gen @@ -9,8 +8,7 @@ include res/map/charwidth_gen var isCJKAmbiguous* = false # Warning: this shouldn't be called without normalization. -func width*(r: Rune): int = - let u = uint32(r) +func width*(u: uint32): int = if u <= 0xFFFF: # fast path for BMP if u in CombiningTable: return 0 @@ -31,80 +29,56 @@ func width*(r: Rune): int = # Width, but also works with tabs. # Needs the column width of the text so far. -func twidth*(r: Rune; w: int): int = - if r != Rune('\t'): - return r.width() +func twidth*(u: uint32; w: int): int = + if u != uint32('\t'): + return u.width() return ((w div 8) + 1) * 8 - w -func width*(s: string): int = - result = 0 - for r in s.runes: - result += r.twidth(result) +func width*(s: openArray[char]): int = + var w = 0 + for u in s.points: + w += u.twidth(w) + return w func width*(s: string; start, len: int): int = - result = 0 + var w = 0 var i = start var m = len if m > s.len: m = s.len while i < m: - var r: Rune - fastRuneAt(s, i, r) - result += r.twidth(result) - -when NimMajor < 2: - template ones(n: untyped): untyped = ((1 shl n)-1) - template fastRuneAt(s: openArray[char]; i: int; result: untyped) = - result = Rune(0xFFFD) - if uint32(s[i]) <= 127: - result = Rune(uint32(s[i])) - elif uint32(s[i]) shr 5 == 0b110: - if i <= s.len - 2: - result = Rune((uint32(s[i]) and (ones(5))) shl 6 or - (uint32(s[i+1]) and ones(6))) - i += 1 - elif uint32(s[i]) shr 4 == 0b1110: - if i <= s.len - 3: - result = Rune((uint32(s[i]) and ones(4)) shl 12 or - (uint32(s[i+1]) and ones(6)) shl 6 or (uint32(s[i+2]) and ones(6))) - 
i += 2 - elif uint32(s[i]) shr 3 == 0b11110: - if i <= s.len - 4: - result = Rune((uint32(s[i]) and ones(3)) shl 18 or - (uint32(s[i+1]) and ones(6)) shl 12 or - (uint32(s[i+2]) and ones(6)) shl 6 or - (uint32(s[i+3]) and ones(6))) - i += 3 - inc i + let u = s.nextUTF8(i) + w += u.twidth(w) + return w func notwidth*(s: openArray[char]): int = - result = 0 - var i = 0 - while i < s.len: - var r: Rune - fastRuneAt(s, i, r) - result += r.width() + var w = 0 + for u in s.points: + w += u.width() + return w func twidth*(s: string; w: int): int = var i = w - for r in s.runes: - i += r.twidth(w) + for u in s.points: + i += u.twidth(w) return i - w func padToWidth*(s: string; size: int; schar = '$'): string = result = newStringOfCap(s.len) var w = 0 - var r: Rune var i = 0 + var pi = 0 while i < s.len: - fastRuneAt(s, i, r) - w += r.width() + pi = i + w += s.nextUTF8(i).width() if w > size - 1: break - result &= r + for j in pi ..< i: + result &= s[j] if w > size - 1: if w == size and i == s.len: - result &= r + for j in pi ..< i: + result &= s[j] else: result &= schar while w < size: diff --git a/src/utils/twtstr.nim b/src/utils/twtstr.nim index 0d65be50..f08b1131 100644 --- a/src/utils/twtstr.nim +++ b/src/utils/twtstr.nim @@ -3,7 +3,6 @@ import std/math import std/options import std/os import std/strutils -import std/unicode when defined(posix): import std/posix @@ -11,14 +10,15 @@ when defined(posix): import types/opt import utils/charcategory import utils/map +import utils/twtuni export charcategory func onlyWhitespace*(s: string): bool = return AllChars - AsciiWhitespace notin s -func isControlChar*(r: Rune): bool = - return int(r) <= 0x1F or int(r) == 0x7F +func isControlChar*(u: uint32): bool = + return u <= 0x1F or u == 0x7F func getControlChar*(c: char): char = if c == '?': @@ -444,14 +444,13 @@ func matchNameProduction*(s: string): bool = return false # NameStartChar var i = 0 - var r: Rune if s[i] in Ascii: if s[i] notin NameStartCharAscii: return false inc i 
else: - fastRuneAt(s, i, r) - if not NameStartCharRanges.isInRange(uint32(r)): + let u = s.nextUTF8(i) + if not NameStartCharRanges.isInRange(u): return false # NameChar while i < s.len: @@ -460,9 +459,8 @@ func matchNameProduction*(s: string): bool = return false inc i else: - fastRuneAt(s, i, r) - if not NameStartCharRanges.isInRange(uint32(r)) and - not NameCharRanges.isInMap(uint32(r)): + let u = s.nextUTF8(i) + if not NameStartCharRanges.isInRange(u) and not NameCharRanges.isInMap(u): return false return true @@ -483,8 +481,8 @@ func matchQNameProduction*(s: string): bool = func utf16Len*(s: string): int = result = 0 - for r in s.runes: - if uint32(r) < 0x10000: # ucs-2 + for u in s.points: + if u < 0x10000: # ucs-2 result += 1 else: # surrogate result += 2 diff --git a/src/utils/twtuni.nim b/src/utils/twtuni.nim new file mode 100644 index 00000000..c617ac4e --- /dev/null +++ b/src/utils/twtuni.nim @@ -0,0 +1,95 @@ +func nextUTF8*(s: openArray[char]; i: var int): uint32 = + let j = i + var u = uint32(s[j]) + if u <= 0x7F: + inc i + elif u shr 5 == 0b110: + let e = j + 2 + if likely(e <= s.len): + u = (u and 0x1F) shl 6 or (uint32(s[j + 1]) and 0x3F) + i = e + elif u shr 4 == 0b1110: + let e = j + 3 + if likely(e <= s.len): + u = (u and 0xF) shl 12 or + (uint32(s[j + 1]) and 0x3F) shl 6 or + (uint32(s[j + 2]) and 0x3F) + i = e + elif u shr 3 == 0b11110: + let e = j + 4 + if likely(e <= s.len): + u = (u and 7) shl 18 or + (uint32(s[j + 1]) and 0x3F) shl 12 or + (uint32(s[j + 2]) and 0x3F) shl 6 or + (uint32(s[j + 3]) and 0x3F) + i = e + else: + u = 0xFFFD + inc i + return u + +func prevUTF8*(s: openArray[char]; i: var int): uint32 = + var j = i - 1 + while uint32(s[j]) shr 6 == 2: + dec j + i = j + return s.nextUTF8(j) + +func pointLenAt*(s: openArray[char]; i: int): int = + let u = uint8(s[i]) + if u <= 0x7F: + return 1 + elif u shr 5 == 0b110: + return 2 + elif u shr 4 == 0b1110: + return 3 + elif u shr 3 == 0b11110: + return 4 + return 1 + +iterator 
points*(s: openArray[char]): uint32 {.inline.} = + var i = 0 + while i < s.len: + let u = s.nextUTF8(i) + yield u + +func toPoints*(s: openArray[char]): seq[uint32] = + result = @[] + for u in s.points: + result.add(u) + +proc addUTF8*(res: var string; u: uint32) = + if u < 0x80: + res &= char(u) + elif u < 0x800: + res &= char(u shr 6 or 0xC0) + res &= char(u and 0x3F or 0x80) + elif u < 0x10000: + res &= char(u shr 12 or 0xE0) + res &= char(u shr 6 and 0x3F or 0x80) + res &= char(u and 0x3F or 0x80) + else: + res &= char(u shr 18 or 0xF0) + res &= char(u shr 12 and 0x3F or 0x80) + res &= char(u shr 6 and 0x3F or 0x80) + res &= char(u and 0x3F or 0x80) + +func addUTF8*(res: var string; us: openArray[uint32]) = + for u in us: + res.addUTF8(u) + +func toUTF8*(u: uint32): string = + var s = "" + s.addUTF8(u) + return s + +func toUTF8*(us: openArray[uint32]): string = + var s = newStringOfCap(us.len shr 2) + s.addUTF8(us) + return s + +func pointLen*(s: openArray[char]): int = + var n = 0 + for u in s.points: + inc n + return n diff --git a/src/utils/widthconv.nim b/src/utils/widthconv.nim index b6495379..32a904f6 100644 --- a/src/utils/widthconv.nim +++ b/src/utils/widthconv.nim @@ -1,66 +1,64 @@ import std/strutils -import std/unicode + import utils/map +import utils/twtuni const CanHaveDakuten = ("かきくけこさしすせそたちつてとはひふへほカキクケコ" & - "サシスセソタチツテトハヒフヘホ").toRunes() + "サシスセソタチツテトハヒフヘホ").toPoints() -const CanHaveHanDakuten = "はひふへほハヒフヘホ".toRunes() +const CanHaveHanDakuten = "はひふへほハヒフヘホ".toPoints() const HasDakuten = ("がぎぐげござじずぜぞだぢづでどばびぶべぼガギグゲゴ" & - "ザジズゼゾダヂヅデドバビブベボ").toRunes() + "ザジズゼゾダヂヅデドバビブベボ").toPoints() -const HasHanDakuten = "ぱぴぷぺぽパピプペポ".toRunes() +const HasHanDakuten = "ぱぴぷぺぽパピプペポ".toPoints() # Halfwidth to fullwidth & vice versa -const halfFullMap = (func(): seq[tuple[half, full1, full2: Rune]] = +const halfFullMap = (func(): seq[tuple[half, full1, full2: uint32]] = result = @[] const map = staticRead"res/widthconvmap.tab" for line in map.split('\n'): if line == "": 
break var i = 0 - var half: Rune - fastRuneAt(line, i, half) + let half = line.nextUTF8(i) assert line[i] == '\t' inc i - var full1: Rune - fastRuneAt(line, i, full1) - var full2 = Rune(0) + let full1 = line.nextUTF8(i) + var full2 = 0u32 if i < line.len: assert line[i] == '\t' - inc i - fastRuneAt(line, i, full2) + full2 = line.nextUTF8(i) result.add((half, full1, full2)) )() -func halfwidth(r: Rune): Rune = - if r != Rune(0): # special case to avoid comparison with f2 +func halfwidth(u: uint32): uint32 = + if u != 0: # special case to avoid comparison with f2 for (h, f1, f2) in halfFullMap: - if f1 == r or f2 == r: + if f1 == u or f2 == u: return h - return r + return u -const HalfDakuten = Rune(0xFF9E) # half-width dakuten -const HalfHanDakuten = Rune(0xFF9F) # half-width handakuten +const HalfDakuten = 0xFF9Eu32 # half-width dakuten +const HalfHanDakuten = 0xFF9Fu32 # half-width handakuten # Note: in unicode, char + 1 is dakuten and char + 2 handakuten func halfwidth*(s: string): string = result = "" - for r in s.runes: - case r + for u in s.points: + case u of HasDakuten: - result &= halfwidth(Rune(uint32(r) - 1)) - result &= HalfDakuten + result.addUTF8(halfwidth(u - 1)) + result.addUTF8(HalfDakuten) of HasHanDakuten: - result &= halfwidth(Rune(uint32(r) - 2)) - result &= HalfHanDakuten + result.addUTF8(halfwidth(u - 2)) + result.addUTF8(HalfHanDakuten) else: - result &= halfwidth(r) + result.addUTF8(halfwidth(u)) -func fullwidth(r: Rune): Rune = - if r != Rune(0): # special case to avoid comparison with f2 +func fullwidth(r: uint32): uint32 = + if r != 0: # special case to avoid comparison with f2 for (h, f1, f2) in halfFullMap: if h == r: return f1 @@ -68,45 +66,45 @@ func fullwidth(r: Rune): Rune = func fullwidth*(s: string): string = result = "" - var lastr = Rune(0) - for r in s.runes: - if lastr != Rune(0): - if r == HalfDakuten: + var lastu = 0u32 + for u in s.points: + if lastu != 0: + if u == HalfDakuten: # flush with dakuten - result &= 
Rune(uint32(lastr) + 1) - lastr = Rune(0) + result.addUTF8(lastu + 1) + lastu = 0 continue - elif r == HalfHanDakuten and lastr in CanHaveHanDakuten: + elif u == HalfHanDakuten and lastu in CanHaveHanDakuten: # flush with handakuten - result &= Rune(uint32(lastr) + 2) - lastr = Rune(0) + result.addUTF8(lastu + 2) + lastu = 0 continue - result &= lastr - lastr = Rune(0) - let r = fullwidth(r) - if r in CanHaveDakuten: - lastr = r + result.addUTF8(lastu) + lastu = 0 + let u = fullwidth(u) + if u in CanHaveDakuten: + lastu = u else: - result &= r - if lastr != Rune(0): + result.addUTF8(u) + if lastu != 0: # flush - result &= lastr + result.addUTF8(lastu) const kanamap = staticRead"res/kanamap.tab" func genFullSizeMap(): seq[(uint32, uint32)] = result = @[] for line in kanamap.split('\n'): if line.len == 0: break - let rs = line.toRunes() - assert rs[1] == Rune('\t') - result.add((uint32(rs[0]), uint32(rs[2]))) + let rs = line.toPoints() + assert rs[1] == uint32('\t') + result.add((rs[0], rs[2])) const fullSizeMap = genFullSizeMap() proc fullsize*(s: string): string = result = "" - for r in s.runes: - let i = searchInMap(fullSizeMap, uint32(r)) + for u in s.points: + let i = searchInMap(fullSizeMap, u) if i == -1: - result &= r + result.addUTF8(u) else: - result &= $Rune(fullSizeMap[i][1]) + result.addUTF8(fullSizeMap[i][1]) diff --git a/src/utils/wordbreak.nim b/src/utils/wordbreak.nim index c93d63ec..ff607fbe 100644 --- a/src/utils/wordbreak.nim +++ b/src/utils/wordbreak.nim @@ -1,5 +1,3 @@ -import std/unicode - import utils/charcategory import utils/luwrap import utils/strwidth @@ -7,39 +5,39 @@ import utils/strwidth type BreakCategory* = enum bcAlpha, bcSpace, bcSymbol, bcHan, bcHiragana, bcKatakana, bcHangul -func isDigitAscii(r: Rune): bool = - return uint32(r) < 128 and char(r) in AsciiDigit +func isDigitAscii(u: uint32): bool = + return u < 128 and char(u) in AsciiDigit -proc breaksWord*(ctx: LUContext; r: Rune): bool = - return not r.isDigitAscii() and 
r.width() != 0 and not ctx.isAlphaLU(r) +proc breaksWord*(ctx: LUContext; u: uint32): bool = + return not u.isDigitAscii() and u.width() != 0 and not ctx.isAlphaLU(u) -proc breaksViWordCat*(ctx: LUContext; r: Rune): BreakCategory = - if int32(r) < 0x80: # ASCII - let c = char(r) +proc breaksViWordCat*(ctx: LUContext; u: uint32): BreakCategory = + if u < 0x80: # ASCII + let c = char(u) if c in AsciiAlphaNumeric + {'_'}: return bcAlpha elif c in AsciiWhitespace: return bcSpace - elif ctx.isWhiteSpaceLU(r): + elif ctx.isWhiteSpaceLU(u): return bcSpace - elif ctx.isAlphaLU(r): - if ctx.isHiragana(r): + elif ctx.isAlphaLU(u): + if ctx.isHiragana(u): return bcHiragana - elif ctx.isKatakana(r): + elif ctx.isKatakana(u): return bcKatakana - elif ctx.isHangul(r): + elif ctx.isHangul(u): return bcHangul - elif ctx.isHan(r): + elif ctx.isHan(u): return bcHan return bcAlpha return bcSymbol -proc breaksWordCat*(ctx: LUContext; r: Rune): BreakCategory = - if not ctx.breaksWord(r): +proc breaksWordCat*(ctx: LUContext; u: uint32): BreakCategory = + if not ctx.breaksWord(u): return bcAlpha return bcSpace -proc breaksBigWordCat*(ctx: LUContext; r: Rune): BreakCategory = - if not ctx.isWhiteSpaceLU(r): +proc breaksBigWordCat*(ctx: LUContext; u: uint32): BreakCategory = + if not ctx.isWhiteSpaceLU(u): return bcAlpha return bcSpace |