diff options
author | bptato <nincsnevem662@gmail.com> | 2024-05-09 21:57:00 +0200 |
---|---|---|
committer | bptato <nincsnevem662@gmail.com> | 2024-05-09 22:11:30 +0200 |
commit | 2453c63b0b12baa9bd78c0a114b58f1c3833e967 (patch) | |
tree | 34b37fa375f8500669877ec726afea0ba2ed2d99 /src/utils | |
parent | 200a3784de44b90351f7d9a1da47e85e06ff8e15 (diff) | |
download | chawan-2453c63b0b12baa9bd78c0a114b58f1c3833e967.tar.gz |
luwrap: replace Nim unicode maps with libunicode
Instead of using Nim's built-in (outdated and buggy) Unicode tables, we now use libunicode from QuickJS (QJS). This shaves some bytes off the executable, though far fewer than I had imagined it would. The change also has a surprising side effect: because libunicode's tables are not broken, kanji are now properly classified as alpha. I found this greatly annoying, because the vi-style `w' motion in Japanese text would now jump through whole sentences. As a band-aid solution I added an extra Han category, but I wish we had a more robust solution that could differentiate between *all* scripts. TODO: I suspect that loading the tables separately for every rune in breaksViWordCat is rather inefficient. Using some kind of context object (at least one per operation) would probably be beneficial.
Diffstat (limited to 'src/utils')
-rw-r--r-- | src/utils/luwrap.nim | 96 | ||||
-rw-r--r-- | src/utils/strwidth.nim | 27 | ||||
-rw-r--r-- | src/utils/wordbreak.nim | 33 |
3 files changed, 108 insertions, 48 deletions
diff --git a/src/utils/luwrap.nim b/src/utils/luwrap.nim index 330a5d1e..612982e0 100644 --- a/src/utils/luwrap.nim +++ b/src/utils/luwrap.nim @@ -1,3 +1,4 @@ +import std/algorithm import std/strutils import std/unicode @@ -7,9 +8,10 @@ import utils/charcategory proc passRealloc(opaque, p: pointer; size: csize_t): pointer {.cdecl.} = return realloc(p, size) -proc mnormalize*(rs: var seq[Rune]; form = UNICODE_NFC) = +proc normalize*(rs: seq[Rune]; form = UNICODE_NFC): seq[Rune] = {.cast(noSideEffect).}: - if rs.len == 0: return + if rs.len == 0: + return @[] var outbuf: ptr uint32 let p = cast[ptr uint32](unsafeAddr rs[0]) let out_len = unicode_normalize(addr outbuf, p, cint(rs.len), form, nil, @@ -18,29 +20,81 @@ proc mnormalize*(rs: var seq[Rune]; form = UNICODE_NFC) = raise newException(Defect, "Unicode normalization failed") if out_len == 0: return - rs = cast[seq[Rune]](newSeqUninitialized[uint32](out_len)) + var rs = cast[seq[Rune]](newSeqUninitialized[uint32](out_len)) copyMem(addr rs[0], outbuf, out_len * sizeof(uint32)) dealloc(outbuf) + return rs -#TODO maybe a utf8 normalization procedure? 
proc mnormalize*(s: var string) = if NonAscii notin s: return # no need to normalize ascii - var rs = s.toRunes() - rs.mnormalize() - s = $rs + s = $s.toRunes().normalize() -func normalize*(rs: seq[Rune]; form = UNICODE_NFC): seq[Rune] = - {.cast(noSideEffect).}: - if rs.len == 0: return - var outbuf: ptr uint32 - let p = cast[ptr uint32](unsafeAddr rs[0]) - let out_len = unicode_normalize(addr outbuf, p, cint(rs.len), form, - nil, passRealloc) - if out_len < 0: - raise newException(Defect, "Unicode normalization failed") - if out_len == 0: - return - result = cast[seq[Rune]](newSeqUninitialized[uint32](out_len)) - copyMem(addr result[0], outbuf, out_len * sizeof(uint32)) - dealloc(outbuf) +# n == 0: upper, 1: lower, 2: case fold +proc toUpperLU(s: string; n: cint): string = + result = newStringOfCap(s.len) + for r in s.runes: + var outa: array[LRE_CC_RES_LEN_MAX, uint32] + let n = lre_case_conv(cast[ptr UncheckedArray[uint32]](addr outa[0]), + uint32(r), n) + for i in 0 ..< n: + result &= $Rune(outa[i]) + +proc toUpperLU*(s: string): string = + return s.toUpperLU(0) + +proc toLowerLU*(s: string): string = + return s.toUpperLU(1) + +proc capitalizeLU*(s: string): string = + result = newStringOfCap(s.len) + var wordStart = true + for r in s.runes: + if lre_is_space(uint32(r)) == 1: + wordStart = true + result &= $r + elif wordStart: + var outa: array[LRE_CC_RES_LEN_MAX, uint32] + let n = lre_case_conv(cast[ptr UncheckedArray[uint32]](addr outa[0]), + uint32(r), 0) + for i in 0 ..< n: + result &= $Rune(outa[i]) + wordStart = false + else: + result &= $r + +type u32pair* {.packed.} = object + a: uint32 + b: uint32 + +func cmpRange*(x: u32pair; y: uint32): int = + if x.a > y: + return 1 + elif x.b < y: + return -1 + return 0 + +func contains(cr: CharRange; r: Rune): bool = + let cps = cast[ptr UncheckedArray[u32pair]](cr.points) + let L = cr.len div 2 - 1 + return cps.toOpenArray(0, L).binarySearch(uint32(r), cmpRange) != -1 + +proc isGeneralCategoryLU*(r: Rune; s: 
string): bool = + var cr: CharRange + cr_init(addr cr, nil, passRealloc) + doAssert unicode_general_category(addr cr, s) == 0 + result = r in cr + cr_free(addr cr) + +proc isAlphaLU*(r: Rune): bool = + return r.isGeneralCategoryLU("Letter") + +proc isScriptLU*(r: Rune; s: string): bool = + var cr: CharRange + cr_init(addr cr, nil, passRealloc) + doAssert unicode_script(addr cr, s, 0) == 0 + result = r in cr + cr_free(addr cr) + +proc isWhiteSpaceLU*(r: Rune): bool = + return r.isGeneralCategoryLU("Separator") diff --git a/src/utils/strwidth.nim b/src/utils/strwidth.nim index ba7f60a2..fe089328 100644 --- a/src/utils/strwidth.nim +++ b/src/utils/strwidth.nim @@ -2,7 +2,6 @@ import std/strutils import std/unicode import utils/proptable -import utils/charcategory import utils/map include res/map/charwidth_gen @@ -78,29 +77,3 @@ func padToWidth*(str: string; size: int; schar = '$'): string = result &= r w += r.width result &= schar - -func isDigitAscii(r: Rune): bool = - return uint32(r) < 128 and char(r) in AsciiDigit - -type BreakCategory* = enum - bcAlpha, bcSpace, bcSymbol - -func breaksWord*(r: Rune): bool = - return not (r.isDigitAscii() or r.width() == 0 or r.isAlpha()) - -func breaksViWordCat*(r: Rune): BreakCategory = - if r.isWhiteSpace(): - return bcSpace - elif r.breaksWord() and r != Rune'_': - return bcSymbol - return bcAlpha - -func breaksWordCat*(r: Rune): BreakCategory = - if not r.breaksWord(): - return bcAlpha - return bcSpace - -func breaksBigWordCat*(r: Rune): BreakCategory = - if not r.isWhiteSpace(): - return bcAlpha - return bcSpace diff --git a/src/utils/wordbreak.nim b/src/utils/wordbreak.nim new file mode 100644 index 00000000..80959be7 --- /dev/null +++ b/src/utils/wordbreak.nim @@ -0,0 +1,33 @@ +import std/unicode + +import utils/charcategory +import utils/luwrap +import utils/strwidth + +func isDigitAscii(r: Rune): bool = + return uint32(r) < 128 and char(r) in AsciiDigit + +type BreakCategory* = enum + bcAlpha, bcSpace, bcSymbol, bcHan + 
+func breaksWord*(r: Rune): bool = + return not r.isDigitAscii() and r.width() != 0 and not r.isAlphaLU() + +func breaksViWordCat*(r: Rune): BreakCategory = + if r.isWhiteSpaceLU(): + return bcSpace + elif r.breaksWord() and r != Rune'_': + return bcSymbol + elif r.isScriptLU("Han"): + return bcHan + return bcAlpha + +func breaksWordCat*(r: Rune): BreakCategory = + if not r.breaksWord(): + return bcAlpha + return bcSpace + +func breaksBigWordCat*(r: Rune): BreakCategory = + if not r.isWhiteSpaceLU(): + return bcAlpha + return bcSpace |