diff options
author | bptato <nincsnevem662@gmail.com> | 2024-05-09 21:57:00 +0200 |
---|---|---|
committer | bptato <nincsnevem662@gmail.com> | 2024-05-09 22:11:30 +0200 |
commit | 2453c63b0b12baa9bd78c0a114b58f1c3833e967 (patch) | |
tree | 34b37fa375f8500669877ec726afea0ba2ed2d99 | |
parent | 200a3784de44b90351f7d9a1da47e85e06ff8e15 (diff) | |
download | chawan-2453c63b0b12baa9bd78c0a114b58f1c3833e967.tar.gz |
luwrap: replace Nim unicode maps with libunicode
Instead of using Nim's built-in (and outdated, and buggy) tables, we now use libunicode from QuickJS (QJS). This shaves some bytes off the executable, though far fewer than I had imagined it would. Also, a surprising effect of this change: because libunicode's tables aren't glitched out, kanji now get properly classified as alpha. I found this greatly annoying, because `w` in Japanese text would now jump through whole sentences. As a band-aid solution I added an extra Han category, but I wish we had a more robust solution that could differentiate between *all* scripts. TODO: I suspect that separately loading the tables for every rune in breaksViWordCat is rather inefficient. Using some context object (at least per operation) would probably be beneficial.
-rw-r--r-- | src/bindings/libunicode.nim | 23 | ||||
-rw-r--r-- | src/layout/engine.nim | 6 | ||||
-rw-r--r-- | src/local/container.nim | 6 | ||||
-rw-r--r-- | src/local/lineedit.nim | 1 | ||||
-rw-r--r-- | src/types/url.nim | 13 | ||||
-rw-r--r-- | src/utils/luwrap.nim | 96 | ||||
-rw-r--r-- | src/utils/strwidth.nim | 27 | ||||
-rw-r--r-- | src/utils/wordbreak.nim | 33 |
8 files changed, 136 insertions, 69 deletions
diff --git a/src/bindings/libunicode.nim b/src/bindings/libunicode.nim index f501ae3f..13a36da4 100644 --- a/src/bindings/libunicode.nim +++ b/src/bindings/libunicode.nim @@ -16,14 +16,29 @@ type {.push header: "quickjs/libunicode.h", importc.} proc cr_init*(cr: ptr CharRange; mem_opaque: pointer; - realloc_func: DynBufReallocFunc) {.importc.} + realloc_func: DynBufReallocFunc) -proc cr_free*(cr: ptr CharRange) {.importc.} +proc cr_free*(cr: ptr CharRange) proc unicode_normalize*(pdst: ptr ptr uint32; src: ptr uint32; src_len: cint; n_type: UnicodeNormalizationEnum; opaque: pointer; - realloc_func: DynBufReallocFunc): cint {.importc.} + realloc_func: DynBufReallocFunc): cint +proc unicode_script*(cr: ptr CharRange; script_name: cstring; is_ext: cint): + cint {.importc, header: "quickjs/libunicode.h".} +proc unicode_prop*(cr: ptr CharRange; prop_name: cstring): cint proc unicode_general_category*(cr: ptr CharRange; gc_name: cstring): cint - {.importc.} + +const LRE_CC_RES_LEN_MAX* = 3 + +# conv_type: +# 0 = to upper +# 1 = to lower +# 2 = case folding +# res must be an array of LRE_CC_RES_LEN_MAX +proc lre_case_conv*(res: ptr UncheckedArray[uint32]; c: uint32; + conv_type: cint): cint + +proc lre_is_space*(c: uint32): cint + {.pop.} diff --git a/src/layout/engine.nim b/src/layout/engine.nim index d51c555b..ba073c27 100644 --- a/src/layout/engine.nim +++ b/src/layout/engine.nim @@ -881,9 +881,9 @@ iterator transform(text: seq[string]; v: CSSTextTransform): string {.inline.} = else: for str in text: let str = case v - of TextTransformCapitalize: str.capitalize() - of TextTransformUppercase: str.toUpper() - of TextTransformLowercase: str.toLower() + of TextTransformCapitalize: str.capitalizeLU() + of TextTransformUppercase: str.toUpperLU() + of TextTransformLowercase: str.toLowerLU() of TextTransformFullWidth: str.fullwidth() of TextTransformFullSizeKana: str.fullsize() of TextTransformChaHalfWidth: str.halfwidth() diff --git a/src/local/container.nim 
b/src/local/container.nim index e4901902..985e45ef 100644 --- a/src/local/container.nim +++ b/src/local/container.nim @@ -12,8 +12,8 @@ import io/promise import io/serversocket import io/socketstream import js/javascript -import js/jstypes import js/jsregex +import js/jstypes import layout/renderdocument import loader/headers import loader/loader @@ -26,9 +26,11 @@ import types/cookie import types/referrer import types/url import types/winattrs +import utils/luwrap import utils/mimeguess import utils/strwidth import utils/twtstr +import utils/wordbreak import chagashi/charset @@ -595,7 +597,7 @@ proc cursorLineTextStart(container: Container) {.jsfunc.} = if container.numLines == 0: return var x = 0 for r in container.currentLine.runes: - if not r.isWhitespace(): + if not r.isWhiteSpaceLU(): break x += r.twidth(x) if x == 0: diff --git a/src/local/lineedit.nim b/src/local/lineedit.nim index 2db59e7b..e2b89f89 100644 --- a/src/local/lineedit.nim +++ b/src/local/lineedit.nim @@ -8,6 +8,7 @@ import types/opt import types/winattrs import utils/strwidth import utils/twtstr +import utils/wordbreak import chagashi/charset import chagashi/validator diff --git a/src/types/url.nim b/src/types/url.nim index 54d6f8ed..abb76f56 100644 --- a/src/types/url.nim +++ b/src/types/url.nim @@ -245,17 +245,6 @@ func endsInNumber(input: string): bool = inc i return true -type u32pair {.packed.} = object - a: uint32 - b: uint32 - -func cmpRange(x: u32pair; y: uint32): int = - if x.a > y: - return 1 - elif x.b < y: - return -1 - return 0 - type IDNATableStatus = enum itsValid, itsIgnored, itsMapped, itsDeviation, itsDisallowed @@ -306,7 +295,7 @@ func processIdna(str: string; beStrict: bool): Option[string] = of itsDeviation: mapped &= r of itsValid: mapped &= r if mapped.len == 0: return - mapped.mnormalize() + mapped = mapped.normalize() var cr: CharRange {.cast(noSideEffect).}: cr_init(addr cr, nil, passRealloc) diff --git a/src/utils/luwrap.nim b/src/utils/luwrap.nim index 
330a5d1e..612982e0 100644 --- a/src/utils/luwrap.nim +++ b/src/utils/luwrap.nim @@ -1,3 +1,4 @@ +import std/algorithm import std/strutils import std/unicode @@ -7,9 +8,10 @@ import utils/charcategory proc passRealloc(opaque, p: pointer; size: csize_t): pointer {.cdecl.} = return realloc(p, size) -proc mnormalize*(rs: var seq[Rune]; form = UNICODE_NFC) = +proc normalize*(rs: seq[Rune]; form = UNICODE_NFC): seq[Rune] = {.cast(noSideEffect).}: - if rs.len == 0: return + if rs.len == 0: + return @[] var outbuf: ptr uint32 let p = cast[ptr uint32](unsafeAddr rs[0]) let out_len = unicode_normalize(addr outbuf, p, cint(rs.len), form, nil, @@ -18,29 +20,81 @@ proc mnormalize*(rs: var seq[Rune]; form = UNICODE_NFC) = raise newException(Defect, "Unicode normalization failed") if out_len == 0: return - rs = cast[seq[Rune]](newSeqUninitialized[uint32](out_len)) + var rs = cast[seq[Rune]](newSeqUninitialized[uint32](out_len)) copyMem(addr rs[0], outbuf, out_len * sizeof(uint32)) dealloc(outbuf) + return rs -#TODO maybe a utf8 normalization procedure? 
proc mnormalize*(s: var string) = if NonAscii notin s: return # no need to normalize ascii - var rs = s.toRunes() - rs.mnormalize() - s = $rs + s = $s.toRunes().normalize() -func normalize*(rs: seq[Rune]; form = UNICODE_NFC): seq[Rune] = - {.cast(noSideEffect).}: - if rs.len == 0: return - var outbuf: ptr uint32 - let p = cast[ptr uint32](unsafeAddr rs[0]) - let out_len = unicode_normalize(addr outbuf, p, cint(rs.len), form, - nil, passRealloc) - if out_len < 0: - raise newException(Defect, "Unicode normalization failed") - if out_len == 0: - return - result = cast[seq[Rune]](newSeqUninitialized[uint32](out_len)) - copyMem(addr result[0], outbuf, out_len * sizeof(uint32)) - dealloc(outbuf) +# n == 0: upper, 1: lower, 2: case fold +proc toUpperLU(s: string; n: cint): string = + result = newStringOfCap(s.len) + for r in s.runes: + var outa: array[LRE_CC_RES_LEN_MAX, uint32] + let n = lre_case_conv(cast[ptr UncheckedArray[uint32]](addr outa[0]), + uint32(r), n) + for i in 0 ..< n: + result &= $Rune(outa[i]) + +proc toUpperLU*(s: string): string = + return s.toUpperLU(0) + +proc toLowerLU*(s: string): string = + return s.toUpperLU(1) + +proc capitalizeLU*(s: string): string = + result = newStringOfCap(s.len) + var wordStart = true + for r in s.runes: + if lre_is_space(uint32(r)) == 1: + wordStart = true + result &= $r + elif wordStart: + var outa: array[LRE_CC_RES_LEN_MAX, uint32] + let n = lre_case_conv(cast[ptr UncheckedArray[uint32]](addr outa[0]), + uint32(r), 0) + for i in 0 ..< n: + result &= $Rune(outa[i]) + wordStart = false + else: + result &= $r + +type u32pair* {.packed.} = object + a: uint32 + b: uint32 + +func cmpRange*(x: u32pair; y: uint32): int = + if x.a > y: + return 1 + elif x.b < y: + return -1 + return 0 + +func contains(cr: CharRange; r: Rune): bool = + let cps = cast[ptr UncheckedArray[u32pair]](cr.points) + let L = cr.len div 2 - 1 + return cps.toOpenArray(0, L).binarySearch(uint32(r), cmpRange) != -1 + +proc isGeneralCategoryLU*(r: Rune; s: 
string): bool = + var cr: CharRange + cr_init(addr cr, nil, passRealloc) + doAssert unicode_general_category(addr cr, s) == 0 + result = r in cr + cr_free(addr cr) + +proc isAlphaLU*(r: Rune): bool = + return r.isGeneralCategoryLU("Letter") + +proc isScriptLU*(r: Rune; s: string): bool = + var cr: CharRange + cr_init(addr cr, nil, passRealloc) + doAssert unicode_script(addr cr, s, 0) == 0 + result = r in cr + cr_free(addr cr) + +proc isWhiteSpaceLU*(r: Rune): bool = + return r.isGeneralCategoryLU("Separator") diff --git a/src/utils/strwidth.nim b/src/utils/strwidth.nim index ba7f60a2..fe089328 100644 --- a/src/utils/strwidth.nim +++ b/src/utils/strwidth.nim @@ -2,7 +2,6 @@ import std/strutils import std/unicode import utils/proptable -import utils/charcategory import utils/map include res/map/charwidth_gen @@ -78,29 +77,3 @@ func padToWidth*(str: string; size: int; schar = '$'): string = result &= r w += r.width result &= schar - -func isDigitAscii(r: Rune): bool = - return uint32(r) < 128 and char(r) in AsciiDigit - -type BreakCategory* = enum - bcAlpha, bcSpace, bcSymbol - -func breaksWord*(r: Rune): bool = - return not (r.isDigitAscii() or r.width() == 0 or r.isAlpha()) - -func breaksViWordCat*(r: Rune): BreakCategory = - if r.isWhiteSpace(): - return bcSpace - elif r.breaksWord() and r != Rune'_': - return bcSymbol - return bcAlpha - -func breaksWordCat*(r: Rune): BreakCategory = - if not r.breaksWord(): - return bcAlpha - return bcSpace - -func breaksBigWordCat*(r: Rune): BreakCategory = - if not r.isWhiteSpace(): - return bcAlpha - return bcSpace diff --git a/src/utils/wordbreak.nim b/src/utils/wordbreak.nim new file mode 100644 index 00000000..80959be7 --- /dev/null +++ b/src/utils/wordbreak.nim @@ -0,0 +1,33 @@ +import std/unicode + +import utils/charcategory +import utils/luwrap +import utils/strwidth + +func isDigitAscii(r: Rune): bool = + return uint32(r) < 128 and char(r) in AsciiDigit + +type BreakCategory* = enum + bcAlpha, bcSpace, bcSymbol, bcHan + 
+func breaksWord*(r: Rune): bool = + return not r.isDigitAscii() and r.width() != 0 and not r.isAlphaLU() + +func breaksViWordCat*(r: Rune): BreakCategory = + if r.isWhiteSpaceLU(): + return bcSpace + elif r.breaksWord() and r != Rune'_': + return bcSymbol + elif r.isScriptLU("Han"): + return bcHan + return bcAlpha + +func breaksWordCat*(r: Rune): BreakCategory = + if not r.breaksWord(): + return bcAlpha + return bcSpace + +func breaksBigWordCat*(r: Rune): BreakCategory = + if not r.isWhiteSpaceLU(): + return bcAlpha + return bcSpace |