From 4124c041ed2e3b497ede72fdae229aa2c6aca249 Mon Sep 17 00:00:00 2001 From: bptato Date: Sun, 8 Sep 2024 15:18:45 +0200 Subject: utils: add twtuni std/unicode has the following issues: * Rune is an int32, which implies overflow checking. Also, it is distinct, so you have to convert it manually to do arithmetic. * QJS libunicode and Chagashi work with uint32, interfacing with these required pointless type conversions. * fastRuneAt is a template, meaning it's pasted into every call site. Also, it decodes to UCS-4, so it generates two branches that aren't even used. Overall this led to quite some code bloat. * fastRuneAt and lastRune have frustratingly different interfaces. Writing code to handle both cases is error-prone. * On older Nim versions which we still support, std/unicode takes strings, not openArray[char]'s. Replace it with "twtuni", which includes some improved versions of the few procedures from std/unicode that we actually use. --- Makefile | 2 +- src/config/toml.nim | 8 +- src/css/cssparser.nim | 21 +++-- src/css/cssvalues.nim | 31 +++---- src/img/painter.nim | 6 +- src/layout/engine.nim | 40 ++++----- src/layout/renderdocument.nim | 15 ++-- src/local/container.nim | 205 +++++++++++++++++++++--------------------- src/local/lineedit.nim | 125 +++++++++++++------------- src/local/pager.nim | 21 ++--- src/local/term.nim | 1 - src/server/buffer.nim | 7 +- src/types/url.nim | 42 +++++---- src/utils/luwrap.nim | 57 ++++++------ src/utils/strwidth.nim | 82 ++++++----------- src/utils/twtstr.nim | 20 ++--- src/utils/twtuni.nim | 95 ++++++++++++++++++++ src/utils/widthconv.nim | 102 +++++++++++---------- src/utils/wordbreak.nim | 36 ++++---- 19 files changed, 483 insertions(+), 433 deletions(-) create mode 100644 src/utils/twtuni.nim diff --git a/Makefile b/Makefile index 42c85932..3b3c51c7 100644 --- a/Makefile +++ b/Makefile @@ -91,7 +91,7 @@ $(OUTDIR_CGI_BIN)/gmifetch: adapter/protocol/gmifetch.c @mkdir -p "$(OUTDIR_CGI_BIN)" $(CC) $(GMIFETCH_CFLAGS) 
adapter/protocol/gmifetch.c -o "$(OUTDIR_CGI_BIN)/gmifetch" $(GMIFETCH_LDFLAGS) -twtstr = src/utils/twtstr.nim src/utils/charcategory.nim src/utils/map.nim +twtstr = src/utils/twtstr.nim src/utils/charcategory.nim src/utils/map.nim src/utils/twtuni.nim dynstream = src/io/dynstream.nim src/io/serversocket.nim $(OUTDIR_CGI_BIN)/man: lib/monoucha/monoucha/jsregex.nim \ lib/monoucha/monoucha/libregexp.nim src/types/opt.nim $(twtstr) diff --git a/src/config/toml.nim b/src/config/toml.nim index 992a0cbc..2978585c 100644 --- a/src/config/toml.nim +++ b/src/config/toml.nim @@ -2,10 +2,10 @@ import std/options import std/streams import std/tables import std/times -import std/unicode import types/opt import utils/twtstr +import utils/twtuni type TomlValueType* = enum @@ -144,7 +144,7 @@ proc reconsume(state: var TomlParser) = proc has(state: var TomlParser; i: int = 0): bool = return state.at + i < state.buf.len -proc consumeEscape(state: var TomlParser; c: char): Result[Rune, TomlError] = +proc consumeEscape(state: var TomlParser; c: char): Result[uint32, TomlError] = var len = 4 if c == 'U': len = 8 @@ -166,7 +166,7 @@ proc consumeEscape(state: var TomlParser; c: char): Result[Rune, TomlError] = if num > 0x10FFFF or num in 0xD800..0xDFFF: return state.err("invalid escaped codepoint: " & $num) else: - return ok(Rune(num)) + return ok(uint32(num)) else: return state.err("invalid escaped codepoint: " & $c) @@ -213,7 +213,7 @@ proc consumeString(state: var TomlParser; first: char): Result[string, string] = of 'r': res &= '\r' of '"': res &= '"' of '\\': res &= '\\' - of 'u', 'U': res &= ?state.consumeEscape(c) + of 'u', 'U': res.addUTF8(?state.consumeEscape(c)) of '\n': ml_trim = true of '$': res &= "\\$" # special case for substitution in paths else: return state.err("invalid escape sequence \\" & c) diff --git a/src/css/cssparser.nim b/src/css/cssparser.nim index 1e56d11f..8ed54bf2 100644 --- a/src/css/cssparser.nim +++ b/src/css/cssparser.nim @@ -1,9 +1,9 @@ import 
std/options -import std/unicode import js/domexception import types/opt import utils/twtstr +import utils/twtuni type CSSTokenType* = enum @@ -156,10 +156,9 @@ proc consume(state: var CSSTokenizerState): char = return state.curr proc consumeRChar(state: var CSSTokenizerState): char = - var r: Rune - fastRuneAt(state.buf, state.at, r) - if int32(r) < 0x80: - return char(r) + let u = state.buf.nextUTF8(state.at) + if u < 0x80: + return char(u) return char(128) proc reconsume(state: var CSSTokenizerState) = @@ -230,10 +229,10 @@ proc skipWhitespace(state: var CSSTokenizerState) = proc consumeEscape(state: var CSSTokenizerState): string = if not state.has(): - return $Rune(0xFFFD) + return "\uFFFD" let c = state.consume() if c in AsciiHexDigit: - var num = hexValue(c) + var num = uint32(hexValue(c)) var i = 0 while i <= 5 and state.has(): let c = state.consume() @@ -241,14 +240,14 @@ proc consumeEscape(state: var CSSTokenizerState): string = state.reconsume() break num *= 0x10 - num += hexValue(c) + num += uint32(hexValue(c)) inc i if state.has() and state.peek() in AsciiWhitespace: discard state.consume() - if num == 0 or num > 0x10FFFF or num in 0xD800..0xDFFF: - return $Rune(0xFFFD) + if num == 0 or num > 0x10FFFF or num in 0xD800u32..0xDFFFu32: + return "\uFFFD" else: - return $Rune(num) + return num.toUTF8() else: return $c #NOTE this assumes the caller doesn't care about non-ascii diff --git a/src/css/cssvalues.nim b/src/css/cssvalues.nim index 4b1cc957..a0bb1f1f 100644 --- a/src/css/cssvalues.nim +++ b/src/css/cssvalues.nim @@ -3,7 +3,6 @@ import std/macros import std/options import std/strutils import std/tables -import std/unicode import css/cssparser import css/selectorparser @@ -13,6 +12,7 @@ import types/color import types/opt import types/winattrs import utils/twtstr +import utils/twtuni export selectorparser.PseudoElem @@ -605,34 +605,35 @@ func blockify*(display: CSSDisplay): CSSDisplay = of DisplayInlineFlex: return DisplayFlex -const UpperAlphaMap = 
"ABCDEFGHIJKLMNOPQRSTUVWXYZ".toRunes() -const LowerAlphaMap = "abcdefghijklmnopqrstuvwxyz".toRunes() -const LowerGreekMap = "αβγδεζηθικλμνξοπρστυφχψω".toRunes() +const UpperAlphaMap = "ABCDEFGHIJKLMNOPQRSTUVWXYZ".toPoints() +const LowerAlphaMap = "abcdefghijklmnopqrstuvwxyz".toPoints() +const LowerGreekMap = "αβγδεζηθικλμνξοπρστυφχψω".toPoints() const HiraganaMap = ("あいうえおかきくけこさしすせそたちつてとなにぬねの" & - "はひふへほまみむめもやゆよらりるれろわゐゑをん").toRunes() + "はひふへほまみむめもやゆよらりるれろわゐゑをん").toPoints() const HiraganaIrohaMap = ("いろはにほへとちりぬるをわかよたれそつねならむ" & - "うゐのおくやまけふこえてあさきゆめみしゑひもせす").toRunes() + "うゐのおくやまけふこえてあさきゆめみしゑひもせす").toPoints() const KatakanaMap = ("アイウエオカキクケコサシスセソタチツテトナニヌネノ" & - "ハヒフヘホマミムメモヤユヨラリルレロワヰヱヲン").toRunes() + "ハヒフヘホマミムメモヤユヨラリルレロワヰヱヲン").toPoints() const KatakanaIrohaMap = ("イロハニホヘトチリヌルヲワカヨタレソツネナラム" & - "ウヰノオクヤマケフコエテアサキユメミシヱヒモセス").toRunes() -const EarthlyBranchMap = "子丑寅卯辰巳午未申酉戌亥".toRunes() -const HeavenlyStemMap = "甲乙丙丁戊己庚辛壬癸".toRunes() + "ウヰノオクヤマケフコエテアサキユメミシヱヒモセス").toPoints() +const EarthlyBranchMap = "子丑寅卯辰巳午未申酉戌亥".toPoints() +const HeavenlyStemMap = "甲乙丙丁戊己庚辛壬癸".toPoints() -func numToBase(n: int; map: openArray[Rune]): string = +func numToBase(n: int; map: openArray[uint32]): string = if n <= 0: return $n - var tmp: seq[Rune] + var tmp: seq[uint32] = @[] var n = n while n != 0: n -= 1 tmp &= map[n mod map.len] n = n div map.len - result = "" + var res = "" for i in countdown(tmp.high, 0): - result &= $tmp[i] + res.addUTF8(tmp[i]) + return res -func numToFixed(n: int; map: openArray[Rune]): string = +func numToFixed(n: int; map: openArray[uint32]): string = let n = n - 1 if n notin 0 .. 
map.high: return $n diff --git a/src/img/painter.nim b/src/img/painter.nim index c86ad0a2..40f2c79a 100644 --- a/src/img/painter.nim +++ b/src/img/painter.nim @@ -1,11 +1,11 @@ import std/algorithm -import std/unicode import img/bitmap import img/path import types/color import types/line import types/vector +import utils/twtuni type CanvasFillRule* = enum @@ -191,8 +191,8 @@ proc fillText*(bmp: Bitmap; text: string; x, y: float64; color: ARGBColor; textAlign: CanvasTextAlign) = var w = 0f64 var glyphs: seq[Bitmap] = @[] - for r in text.runes: - let glyph = getCharBmp(uint32(r)) + for u in text.points: + let glyph = getCharBmp(u) glyphs.add(glyph) w += float64(glyph.width) var x = x diff --git a/src/layout/engine.nim b/src/layout/engine.nim index 6943958d..831df62e 100644 --- a/src/layout/engine.nim +++ b/src/layout/engine.nim @@ -1,6 +1,5 @@ import std/algorithm import std/math -import std/unicode import css/cssvalues import css/stylednode @@ -11,6 +10,7 @@ import types/winattrs import utils/luwrap import utils/strwidth import utils/twtstr +import utils/twtuni import utils/widthconv type @@ -714,7 +714,7 @@ proc addWordEOL(ictx: var InlineContext; state: var InlineState): bool = let leftstr = ictx.word.str.substr(ictx.wrappos) ictx.word.str.setLen(ictx.wrappos) if ictx.hasshy: - const shy = $Rune(0xAD) # soft hyphen + const shy = "\u00AD" # soft hyphen ictx.word.str &= shy ictx.hasshy = false result = ictx.addWord(state) @@ -723,34 +723,34 @@ proc addWordEOL(ictx: var InlineContext; state: var InlineState): bool = else: result = ictx.addWord(state) -proc checkWrap(ictx: var InlineContext; state: var InlineState; r: Rune) = +proc checkWrap(ictx: var InlineContext; state: var InlineState; u: uint32; + uw: int) = if state.fragment.computed.nowrap: return let shift = ictx.computeShift(state) - let rw = r.width() - state.prevrw = rw + state.prevrw = uw if ictx.word.str.len == 0: - state.firstrw = rw - if rw >= 2: + state.firstrw = uw + if uw >= 2: # remove wrap 
opportunity, so we wrap properly on the last CJK char (instead # of any dash inside CJK sentences) ictx.wrappos = -1 case state.fragment.computed{"word-break"} of WordBreakNormal: - if rw == 2 or ictx.wrappos != -1: # break on cjk and wrap opportunities - let plusWidth = ictx.word.size.w + shift + rw * ictx.cellWidth + if uw == 2 or ictx.wrappos != -1: # break on cjk and wrap opportunities + let plusWidth = ictx.word.size.w + shift + uw * ictx.cellWidth if ictx.shouldWrap(plusWidth, nil): if not ictx.addWordEOL(state): # no line wrapping occured in addAtom ictx.finishLine(state, wrap = true) ictx.whitespacenum = 0 of WordBreakBreakAll: - let plusWidth = ictx.word.size.w + shift + rw * ictx.cellWidth + let plusWidth = ictx.word.size.w + shift + uw * ictx.cellWidth if ictx.shouldWrap(plusWidth, nil): if not ictx.addWordEOL(state): # no line wrapping occured in addAtom ictx.finishLine(state, wrap = true) ictx.whitespacenum = 0 of WordBreakKeepAll: - let plusWidth = ictx.word.size.w + shift + rw * ictx.cellWidth + let plusWidth = ictx.word.size.w + shift + uw * ictx.cellWidth if ictx.shouldWrap(plusWidth, nil): ictx.finishLine(state, wrap = true) ictx.whitespacenum = 0 @@ -814,10 +814,9 @@ proc layoutTextLoop(ictx: var InlineContext; state: var InlineState; if c in AsciiWhitespace: ictx.processWhitespace(state, c) else: - let r = Rune(c) - ictx.checkWrap(state, r) + let w = uint32(c).width() + ictx.checkWrap(state, uint32(c), w) ictx.word.str &= c - let w = r.width() ictx.word.size.w += w * ictx.cellWidth ictx.lbstate.charwidth += w if c == '-': # ascii dash @@ -825,15 +824,16 @@ proc layoutTextLoop(ictx: var InlineContext; state: var InlineState; ictx.hasshy = false inc i else: - var r: Rune - fastRuneAt(str, i, r) - ictx.checkWrap(state, r) - if r == Rune(0xAD): # soft hyphen + let pi = i + let u = str.nextUTF8(i) + let w = u.width() + ictx.checkWrap(state, u, w) + if u == 0xAD: # soft hyphen ictx.wrappos = ictx.word.str.len ictx.hasshy = true else: - ictx.word.str 
&= r - let w = r.width() + for j in pi ..< i: + ictx.word.str &= str[j] ictx.word.size.w += w * ictx.cellWidth ictx.lbstate.charwidth += w discard ictx.addWord(state) diff --git a/src/layout/renderdocument.nim b/src/layout/renderdocument.nim index 2ff8d484..67ecd311 100644 --- a/src/layout/renderdocument.nim +++ b/src/layout/renderdocument.nim @@ -1,5 +1,4 @@ import std/strutils -import std/unicode import css/cssvalues import css/stylednode @@ -11,6 +10,7 @@ import types/cell import types/color import types/winattrs import utils/strwidth +import utils/twtuni type # A FormatCell *starts* a new terminal formatting context. @@ -77,10 +77,9 @@ proc findFirstX(line: var FlexibleLine; x: int; outi: var int): int = var cx = 0 var i = 0 while cx < x and i < line.str.len: - var r: Rune let pi = i - fastRuneAt(line.str, i, r) - let w = r.twidth(cx) + let u = line.str.nextUTF8(i) + let w = u.twidth(cx) # we must ensure x is max(cx, x), otherwise our assumption of cx <= x # breaks down if cx + w > x: @@ -203,9 +202,7 @@ proc setText(line: var FlexibleLine; linestr: string; x: int; format: Format; var j = i var nx = x # last x of new string while nx < targetX and j < line.str.len: - var r: Rune - fastRuneAt(line.str, j, r) - nx += r.twidth(nx) + nx += line.str.nextUTF8(j).twidth(nx) let ostr = line.str.substr(j) line.setTextStr(linestr, ostr, i, x, cx, nx, targetX) line.setTextFormat(x, cx, nx, ostr, format, node) @@ -214,10 +211,8 @@ proc setText(grid: var FlexibleGrid; linestr: string; x, y: int; format: Format; node: StyledNode) = var x = x var i = 0 - var r: Rune while x < 0 and i < linestr.len: - fastRuneAt(linestr, i, r) - x += r.twidth(x) + x += linestr.nextUTF8(i).twidth(x) if x < 0: # highest x is outside the canvas, no need to draw return diff --git a/src/local/container.nim b/src/local/container.nim index 2c12c4ae..cb7738b8 100644 --- a/src/local/container.nim +++ b/src/local/container.nim @@ -3,7 +3,6 @@ import std/options import std/os import std/posix import 
std/tables -import std/unicode import chagashi/charset import config/config @@ -32,6 +31,7 @@ import utils/luwrap import utils/mimeguess import utils/strwidth import utils/twtstr +import utils/twtuni import utils/wordbreak type @@ -369,12 +369,12 @@ proc popCursorPos(select: Select; nojump = false) = if not nojump: select.queueDraw() -const HorizontalBar = $Rune(0x2500) -const VerticalBar = $Rune(0x2502) -const CornerTopLeft = $Rune(0x250C) -const CornerTopRight = $Rune(0x2510) -const CornerBottomLeft = $Rune(0x2514) -const CornerBottomRight = $Rune(0x2518) +const HorizontalBar = "\u2500" +const VerticalBar = "\u2502" +const CornerTopLeft = "\u250C" +const CornerTopRight = "\u2510" +const CornerBottomLeft = "\u2514" +const CornerBottomRight = "\u2518" proc drawBorders(display: var FixedGrid; sx, ex, sy, ey: int; upmore, downmore: bool) = @@ -446,7 +446,6 @@ proc drawSelect*(select: Select; display: var FixedGrid) = # move inside border inc sy inc sx - var r: Rune var k = 0 var format = Format() while k < select.selected.len and select.selected[k] < si: @@ -462,13 +461,13 @@ proc drawSelect*(select: Select; display: var FixedGrid) = else: format.flags.excl(ffReverse) while j < select.options[i].len: - fastRuneAt(select.options[i], j, r) - let rw = r.twidth(x) + let pj = j + let u = select.options[i].nextUTF8(j) let ox = x - x += rw + x += u.twidth(x) if x > ex: break - display[dls + ox].str = $r + display[dls + ox].str = select.options[i].substr(pj, j - 1) display[dls + ox].format = format while x < ex: display[dls + x].str = " " @@ -578,9 +577,8 @@ func findColBytes(s: string; endx: int; startx = 0; starti = 0): int = var w = startx var i = starti while i < s.len and w < endx: - var r: Rune - fastRuneAt(s, i, r) - w += r.twidth(w) + let u = s.nextUTF8(i) + w += u.twidth(w) return i func cursorBytes(container: Container; y: int; cc = container.cursorx): int = @@ -596,11 +594,10 @@ func cursorFirstX(container: Container): int = let line = container.currentLine var w 
= 0 var i = 0 - var r: Rune let cc = container.cursorx while i < line.len: - fastRuneAt(line, i, r) - let tw = r.twidth(w) + let u = line.nextUTF8(i) + let tw = u.twidth(w) if w + tw > cc: return w w += tw @@ -613,11 +610,10 @@ func cursorLastX(container: Container): int = let line = container.currentLine var w = 0 var i = 0 - var r: Rune let cc = container.cursorx while i < line.len and w <= cc: - fastRuneAt(line, i, r) - w += r.twidth(w) + let u = line.nextUTF8(i) + w += u.twidth(w) return max(w - 1, 0) # Last cell for tab, first cell for everything else (e.g. double width.) @@ -630,16 +626,15 @@ func cursorDispX(container: Container): int = var w = 0 var pw = 0 var i = 0 - var r: Rune + var u = 0u32 let cc = container.cursorx while i < line.len and w <= cc: - fastRuneAt(line, i, r) + u = line.nextUTF8(i) pw = w - w += r.twidth(w) - if r == Rune('\t'): + w += u.twidth(w) + if u == uint32('\t'): return max(w - 1, 0) - else: - return pw + return pw func acursorx*(container: Container): int = max(0, container.cursorDispX() - container.fromx) @@ -911,10 +906,10 @@ proc setCursorXY*(container: Container; x, y: int; refresh = true) {.jsfunc.} = proc cursorLineTextStart(container: Container) {.jsfunc.} = if container.numLines == 0: return var x = 0 - for r in container.currentLine.runes: - if not container.luctx.isWhiteSpaceLU(r): + for u in container.currentLine.points: + if not container.luctx.isWhiteSpaceLU(u): break - x += r.twidth(x) + x += u.twidth(x) if x == 0: dec x container.setCursorX(x) @@ -1020,45 +1015,62 @@ proc cursorLineBegin(container: Container) {.jsfunc.} = proc cursorLineEnd(container: Container) {.jsfunc.} = container.setCursorX(container.currentLineWidth() - 1) -type BreakFunc = proc(ctx: LUContext; r: Rune): BreakCategory {.nimcall.} +type BreakFunc = proc(ctx: LUContext; r: uint32): BreakCategory {.nimcall.} -proc skipSpace(container: Container; b, x: var int; breakFunc: BreakFunc) = +# move to first char that is not in this category +proc 
skipCat(container: Container; b, x: var int; breakFunc: BreakFunc; + cat: BreakCategory) = while b < container.currentLine.len: - var r: Rune let pb = b - fastRuneAt(container.currentLine, b, r) - if container.luctx.breakFunc(r) != bcSpace: + let u = container.currentLine.nextUTF8(b) + if container.luctx.breakFunc(u) != cat: b = pb break - x += r.twidth(x) + x += u.twidth(x) -proc skipSpaceRev(container: Container; b, x: var int; breakFunc: BreakFunc) = - while b >= 0: - let (r, o) = lastRune(container.currentLine, b) - if container.luctx.breakFunc(r) != bcSpace: +proc skipSpace(container: Container; b, x: var int; breakFunc: BreakFunc) = + container.skipCat(b, x, breakFunc, bcSpace) + +# move to last char in category, backwards +proc lastCatRev(container: Container; b, x: var int; breakFunc: BreakFunc; + cat: BreakCategory) = + while b > 0: + let pb = b + let u = container.currentLine.prevUTF8(b) + if container.luctx.breakFunc(u) != cat: + b = pb break - b -= o - x -= r.twidth(x) + x -= u.width() + +# move to first char that is not in this category, backwards +proc skipCatRev(container: Container; b, x: var int; breakFunc: BreakFunc; + cat: BreakCategory): BreakCategory = + while b > 0: + let u = container.currentLine.prevUTF8(b) + x -= u.width() + let it = container.luctx.breakFunc(u) + if it != cat: + return it + b = -1 + return cat + +proc skipSpaceRev(container: Container; b, x: var int; breakFunc: BreakFunc): + BreakCategory = + return container.skipCatRev(b, x, breakFunc, bcSpace) proc cursorNextWord(container: Container; breakFunc: BreakFunc) = if container.numLines == 0: return - var r: Rune var b = container.currentCursorBytes() var x = container.cursorx # meow let currentCat = if b < container.currentLine.len: - container.luctx.breakFunc(container.currentLine.runeAt(b)) + var tmp = b + container.luctx.breakFunc(container.currentLine.nextUTF8(tmp)) else: bcSpace if currentCat != bcSpace: # not in space, skip chars that have the same category - while b < 
container.currentLine.len: - let pb = b - fastRuneAt(container.currentLine, b, r) - if container.luctx.breakFunc(r) != currentCat: - b = pb - break - x += r.twidth(x) + container.skipCat(b, x, breakFunc, currentCat) container.skipSpace(b, x, breakFunc) if b < container.currentLine.len: container.setCursorX(x) @@ -1084,19 +1096,16 @@ proc cursorPrevWord(container: Container; breakFunc: BreakFunc) = var x = container.cursorx if container.currentLine.len > 0: b = min(b, container.currentLine.len - 1) - let currentCat = if b >= 0: - container.luctx.breakFunc(container.currentLine.runeAt(b)) + var currentCat = if b >= 0: + var tmp = b + container.luctx.breakFunc(container.currentLine.nextUTF8(tmp)) else: bcSpace if currentCat != bcSpace: # not in space, skip chars that have the same category - while b >= 0: - let (r, o) = lastRune(container.currentLine, b) - if container.luctx.breakFunc(r) != currentCat: - break - b -= o - x -= r.twidth(x) - container.skipSpaceRev(b, x, breakFunc) + currentCat = container.skipCatRev(b, x, breakFunc, currentCat) + if currentCat == bcSpace: + discard container.skipSpaceRev(b, x, breakFunc) else: b = -1 if b >= 0: @@ -1119,32 +1128,33 @@ proc cursorPrevBigWord(container: Container) {.jsfunc.} = proc cursorWordEnd(container: Container; breakFunc: BreakFunc) = if container.numLines == 0: return - var r: Rune var b = container.currentCursorBytes() var x = container.cursorx var px = x # if not in space, move to the right by one if b < container.currentLine.len: let pb = b - fastRuneAt(container.currentLine, b, r) - if container.luctx.breakFunc(r) == bcSpace: + let u = container.currentLine.nextUTF8(b) + if container.luctx.breakFunc(u) == bcSpace: b = pb else: px = x - x += r.twidth(x) + x += u.twidth(x) container.skipSpace(b, x, breakFunc) # move to the last char in the current category let ob = b if b < container.currentLine.len: - let currentCat = container.luctx.breakFunc(container.currentLine.runeAt(b)) + var tmp = b + let u = 
container.currentLine.nextUTF8(tmp) + let currentCat = container.luctx.breakFunc(u) while b < container.currentLine.len: let pb = b - fastRuneAt(container.currentLine, b, r) - if container.luctx.breakFunc(r) != currentCat: + let u = container.currentLine.nextUTF8(b) + if container.luctx.breakFunc(u) != currentCat: b = pb break px = x - x += r.twidth(x) + x += u.twidth(x) x = px if b < container.currentLine.len or ob != b: container.setCursorX(x) @@ -1168,35 +1178,27 @@ proc cursorWordBegin(container: Container; breakFunc: BreakFunc) = if container.numLines == 0: return var b = container.currentCursorBytes() var x = container.cursorx - var px = x - var ob = b if container.currentLine.len > 0: b = min(b, container.currentLine.len - 1) if b >= 0: - let (r, o) = lastRune(container.currentLine, b) + var tmp = b + var u = container.currentLine.nextUTF8(tmp) + var currentCat = container.luctx.breakFunc(u) # if not in space, move to the left by one - if container.luctx.breakFunc(r) != bcSpace: - b -= o - px = x - x -= r.twidth(x) - container.skipSpaceRev(b, x, breakFunc) - # move to the first char in the current category - ob = b - if b >= 0: - let (r, _) = lastRune(container.currentLine, b) - let currentCat = container.luctx.breakFunc(r) - while b >= 0: - let (r, o) = lastRune(container.currentLine, b) - if container.luctx.breakFunc(r) != currentCat: - break - b -= o - px = x - x -= r.twidth(x) - x = px + if currentCat != bcSpace: + if b > 0: + u = container.currentLine.prevUTF8(b) + x -= u.width() + currentCat = container.luctx.breakFunc(u) + else: + b = -1 + if container.luctx.breakFunc(u) == bcSpace: + currentCat = container.skipSpaceRev(b, x, breakFunc) + # move to the first char in the current category + container.lastCatRev(b, x, breakFunc, currentCat) else: b = -1 - ob = -1 - if b >= 0 or ob != b: + if b >= 0: container.setCursorX(x) else: if container.cursory > 0: @@ -1994,7 +1996,6 @@ proc drawLines*(container: Container; display: var FixedGrid; hlcolor: 
CellColor cell.format = cf.format if bgcolor != defaultColor and cell.format.bgcolor == defaultColor: cell.format.bgcolor = bgcolor - var r: Rune var by = 0 let endy = min(container.fromy + display.height, container.numLines) for line in container.ilines(container.fromy ..< endy): @@ -2002,8 +2003,8 @@ proc drawLines*(container: Container; display: var FixedGrid; hlcolor: CellColor var i = 0 # byte in line.str # Skip cells till fromx. while w < container.fromx and i < line.str.len: - fastRuneAt(line.str, i, r) - w += r.twidth(w) + let u = line.str.nextUTF8(i) + w += u.twidth(w) let dls = by * display.width # starting position of row in display # Fill in the gap in case we skipped more cells than fromx mandates (i.e. # we encountered a double-width character.) @@ -2018,25 +2019,27 @@ proc drawLines*(container: Container; display: var FixedGrid; hlcolor: CellColor # Now fill in the visible part of the row. while i < line.str.len: let pw = w - fastRuneAt(line.str, i, r) - let rw = r.twidth(w) - w += rw + let pi = i + let u = line.str.nextUTF8(i) + let uw = u.twidth(w) + w += uw if w > container.fromx + display.width: break # die on exceeding the width limit if nf.pos != -1 and nf.pos <= pw: cf = nf nf = line.findNextFormat(pw) - if r == Rune('\t'): + if u == uint32('\t'): # Needs to be replaced with spaces, otherwise bgcolor isn't displayed. - let tk = k + rw + let tk = k + uw while k < tk: display[dls + k].str &= ' ' set_fmt display[dls + k], cf inc k else: - display[dls + k].str &= r + for j in pi ..< i: + display[dls + k].str &= line.str[j] set_fmt display[dls + k], cf - k += rw + k += uw if bgcolor != defaultColor: # Fill the screen if bgcolor is not default. 
while k < display.width: diff --git a/src/local/lineedit.nim b/src/local/lineedit.nim index ecfc3db8..edb6ee09 100644 --- a/src/local/lineedit.nim +++ b/src/local/lineedit.nim @@ -1,5 +1,4 @@ import std/strutils -import std/unicode import chagashi/charset import chagashi/decoder @@ -11,6 +10,7 @@ import types/winattrs import utils/luwrap import utils/strwidth import utils/twtstr +import utils/twtuni import utils/wordbreak type @@ -37,6 +37,7 @@ type hist: LineHistory histindex: int histtmp: string + luctx: LUContext redraw*: bool jsDestructor(LineEdit) @@ -48,10 +49,8 @@ func newLineHistory*(): LineHistory = func getDisplayWidth(edit: LineEdit): int = var dispw = 0 var i = edit.shifti - var r: Rune while i < edit.news.len and dispw < edit.maxwidth: - fastRuneAt(edit.news, i, r) - dispw += r.width() + dispw += edit.news.nextUTF8(i).width() return dispw proc shiftView(edit: LineEdit) = @@ -69,17 +68,14 @@ proc shiftView(edit: LineEdit) = edit.shifti = 0 else: while edit.shiftx > targetx: - let (r, len) = edit.news.lastRune(edit.shifti - 1) - edit.shiftx -= r.width() - edit.shifti -= len + let u = edit.news.prevUTF8(edit.shifti) + edit.shiftx -= u.width() edit.padding = 0 # Shift view so it contains the cursor. (act 2) if edit.shiftx < edit.cursorx - edit.maxwidth: while edit.shiftx < edit.cursorx - edit.maxwidth and edit.shifti < edit.news.len: - var r: Rune - fastRuneAt(edit.news, edit.shifti, r) - edit.shiftx += r.width() + edit.shiftx += edit.news.nextUTF8(edit.shifti).width() if edit.shiftx > edit.cursorx - edit.maxwidth: # skipped over a cell because of a double-width char edit.padding = 1 @@ -89,9 +85,9 @@ proc generateOutput*(edit: LineEdit): FixedGrid = # Make the output grid +1 cell wide, so it covers the whole input area. 
result = newFixedGrid(edit.promptw + edit.maxwidth + 1) var x = 0 - for r in edit.prompt.runes: - result[x].str &= $r - x += r.width() + for u in edit.prompt.points: + result[x].str.addUTF8(u) + x += u.width() if x >= result.width: break for i in 0 ..< edit.padding: if x < result.width: @@ -99,18 +95,19 @@ proc generateOutput*(edit: LineEdit): FixedGrid = inc x var i = edit.shifti while i < edit.news.len: - var r: Rune - fastRuneAt(edit.news, i, r) + let pi = i + let u = edit.news.nextUTF8(i) if not edit.hide: - let w = r.width() + let w = u.width() if x + w > result.width: break - if r.isControlChar(): + if u.isControlChar(): result[x].str &= '^' inc x - result[x].str &= char(r).getControlLetter() + result[x].str &= char(u).getControlLetter() inc x else: - result[x].str &= $r + for j in pi ..< i: + result[x].str &= edit.news[j] x += w else: if x + 1 > result.width: break @@ -143,10 +140,10 @@ proc submit(edit: LineEdit) {.jsfunc.} = proc backspace(edit: LineEdit) {.jsfunc.} = if edit.cursori > 0: - let (r, len) = edit.news.lastRune(edit.cursori - 1) - edit.news.delete(edit.cursori - len .. 
edit.cursori - 1) - edit.cursori -= len - edit.cursorx -= r.width() + let pi = edit.cursori + let u = edit.news.prevUTF8(edit.cursori) + edit.news.delete(edit.cursori ..< pi) + edit.cursorx -= u.width() edit.redraw = true proc write*(edit: LineEdit; s: string; cs: Charset): bool = @@ -171,7 +168,7 @@ proc write(edit: LineEdit; s: string): bool {.jsfunc.} = proc delete(edit: LineEdit) {.jsfunc.} = if edit.cursori < edit.news.len: - let len = edit.news.runeLenAt(edit.cursori) + let len = edit.news.pointLenAt(edit.cursori) edit.news.delete(edit.cursori ..< edit.cursori + len) edit.redraw = true @@ -192,55 +189,53 @@ proc kill(edit: LineEdit) {.jsfunc.} = proc backward(edit: LineEdit) {.jsfunc.} = if edit.cursori > 0: - let (r, len) = edit.news.lastRune(edit.cursori - 1) - edit.cursori -= len - edit.cursorx -= r.width() + let u = edit.news.prevUTF8(edit.cursori) + edit.cursorx -= u.width() if edit.cursorx < edit.shiftx: edit.redraw = true proc forward(edit: LineEdit) {.jsfunc.} = if edit.cursori < edit.news.len: - var r: Rune - fastRuneAt(edit.news, edit.cursori, r) - edit.cursorx += r.width() + let u = edit.news.nextUTF8(edit.cursori) + edit.cursorx += u.width() if edit.cursorx >= edit.shiftx + edit.maxwidth: edit.redraw = true proc prevWord(edit: LineEdit) {.jsfunc.} = if edit.cursori == 0: return - let ctx = LUContext() - let (r, len) = edit.news.lastRune(edit.cursori - 1) - if ctx.breaksWord(r): - edit.cursori -= len - edit.cursorx -= r.width() + let pi = edit.cursori + let u = edit.news.prevUTF8(edit.cursori) + if edit.luctx.breaksWord(u): + edit.cursorx -= u.width() + else: + edit.cursori = pi while edit.cursori > 0: - let (r, len) = edit.news.lastRune(edit.cursori - 1) - if ctx.breaksWord(r): + let pi = edit.cursori + let u = edit.news.prevUTF8(edit.cursori) + if edit.luctx.breaksWord(u): + edit.cursori = pi break - edit.cursori -= len - edit.cursorx -= r.width() + edit.cursorx -= u.width() if edit.cursorx < edit.shiftx: edit.redraw = true proc nextWord(edit: 
LineEdit) {.jsfunc.} = if edit.cursori >= edit.news.len: return - let ctx = LUContext() - let oc = edit.cursori - var r: Rune - fastRuneAt(edit.news, edit.cursori, r) - if ctx.breaksWord(r): - edit.cursorx += r.width() + let pi = edit.cursori + let u = edit.news.nextUTF8(edit.cursori) + if edit.luctx.breaksWord(u): + edit.cursorx += u.width() else: - edit.cursori = oc + edit.cursori = pi while edit.cursori < edit.news.len: - let pc = edit.cursori - fastRuneAt(edit.news, edit.cursori, r) - if ctx.breaksWord(r): - edit.cursori = pc + let pi = edit.cursori + let u = edit.news.nextUTF8(edit.cursori) + if edit.luctx.breaksWord(u): + edit.cursori = pi break - edit.cursorx += r.width() + edit.cursorx += u.width() if edit.cursorx >= edit.shiftx + edit.maxwidth: edit.redraw = true @@ -254,18 +249,17 @@ proc clearWord(edit: LineEdit) {.jsfunc.} = proc killWord(edit: LineEdit) {.jsfunc.} = if edit.cursori >= edit.news.len: return - let oc = edit.cursori - let ox = edit.cursorx - edit.nextWord() - if edit.cursori != oc: - if edit.cursori < edit.news.len: - let len = edit.news.runeLenAt(edit.cursori) - edit.news.delete(oc ..< edit.cursori + len) - else: - edit.news.delete(oc ..< edit.cursori) - edit.cursori = oc - edit.cursorx = ox - edit.redraw = true + var i = edit.cursori + var u = edit.news.nextUTF8(i) + if not edit.luctx.breaksWord(u): + while i < edit.news.len: + let pi = i + let u = edit.news.nextUTF8(i) + if edit.luctx.breaksWord(u): + i = pi + break + edit.news.delete(edit.cursori ..< i) + edit.redraw = true proc begin(edit: LineEdit) {.jsfunc.} = edit.cursori = 0 @@ -310,7 +304,7 @@ proc windowChange*(edit: LineEdit; attrs: WindowAttributes) = edit.maxwidth = attrs.width - edit.promptw - 1 proc readLine*(prompt, current: string; termwidth: int; disallowed: set[char]; - hide: bool; hist: LineHistory): LineEdit = + hide: bool; hist: LineHistory; luctx: LUContext): LineEdit = let promptw = prompt.width() return LineEdit( prompt: prompt, @@ -324,7 +318,8 @@ proc 
readLine*(prompt, current: string; termwidth: int; disallowed: set[char]; # - 1, so that the cursor always has place maxwidth: termwidth - promptw - 1, hist: hist, - histindex: hist.lines.len + histindex: hist.lines.len, + luctx: luctx ) proc addLineEditModule*(ctx: JSContext) = diff --git a/src/local/pager.nim b/src/local/pager.nim index 035ec2d7..3822eb77 100644 --- a/src/local/pager.nim +++ b/src/local/pager.nim @@ -7,7 +7,6 @@ import std/posix import std/selectors import std/sets import std/tables -import std/unicode import chagashi/charset import config/chapath @@ -52,6 +51,7 @@ import utils/mimeguess import utils/regexutils import utils/strwidth import utils/twtstr +import utils/twtuni type LineMode* = enum @@ -284,7 +284,7 @@ proc setLineEdit(pager: Pager; mode: LineMode; current = ""; hide = false; if pager.term.isatty() and pager.config.input.use_mouse: pager.term.disableMouse() pager.lineedit = readLine($mode & extraPrompt, current, pager.attrs.width, - {}, hide, hist) + {}, hide, hist, pager.luctx) pager.linemode = mode proc clearLineEdit(pager: Pager) = @@ -387,19 +387,19 @@ proc writeStatusMessage(pager: Pager; str: string; format = Format(); if i >= e: return i pager.status.redraw = true - for r in str.runes: - let w = r.width() + for u in str.points: + let w = u.width() if i + w >= e: pager.status.grid[i].format = format pager.status.grid[i].str = $clip inc i # Note: we assume `clip' is 1 cell wide break - if r.isControlChar(): + if u.isControlChar(): pager.status.grid[i].str = "^" - pager.status.grid[i + 1].str = $getControlLetter(char(r)) + pager.status.grid[i + 1].str = $getControlLetter(char(u)) pager.status.grid[i + 1].format = format else: - pager.status.grid[i].str = $r + pager.status.grid[i].str = u.toUTF8() pager.status.grid[i].format = format i += w result = i @@ -461,9 +461,8 @@ proc drawBuffer*(pager: Pager; container: Container; ofile: File) = for f in line.formats: let si = i while x < f.pos: - var r: Rune - fastRuneAt(line.str, i, r) - 
x += r.width() + let u = line.str.nextUTF8(i) + x += u.width() let outstr = line.str.substr(si, i - 1) s &= pager.term.processOutputString(outstr, w) s &= pager.term.processFormat(format, f.format) @@ -576,6 +575,8 @@ proc initImages(pager: Pager; container: Container) = dispw = min(width + xpx, maxwpx) - xpx let ypx = (image.y - container.fromy) * pager.attrs.ppl erry = -min(ypx, 0) mod 6 + if dispw <= offx: + continue let cached = container.findCachedImage(image, offx, erry, dispw) let imageId = image.bmp.imageId if cached == nil: diff --git a/src/local/term.nim b/src/local/term.nim index 263e6363..2f31104a 100644 --- a/src/local/term.nim +++ b/src/local/term.nim @@ -4,7 +4,6 @@ import std/posix import std/strutils import std/tables import std/termios -import std/unicode import bindings/termcap import chagashi/charset diff --git a/src/server/buffer.nim b/src/server/buffer.nim index 7af3cae0..4324944e 100644 --- a/src/server/buffer.nim +++ b/src/server/buffer.nim @@ -8,7 +8,6 @@ import std/os import std/posix import std/selectors import std/tables -import std/unicode import chagashi/charset import chagashi/decoder @@ -51,6 +50,7 @@ import types/url import types/winattrs import utils/strwidth import utils/twtstr +import utils/twtuni type BufferCommand* = enum @@ -412,9 +412,8 @@ func cursorBytes(buffer: Buffer; y, cc: int): int = var w = 0 var i = 0 while i < line.len and w < cc: - var r: Rune - fastRuneAt(line, i, r) - w += r.twidth(w) + let u = line.nextUTF8(i) + w += u.twidth(w) return i proc navigate(buffer: Buffer; url: URL) = diff --git a/src/types/url.nim b/src/types/url.nim index b676a2b5..ab2aa9c9 100644 --- a/src/types/url.nim +++ b/src/types/url.nim @@ -3,7 +3,6 @@ import std/algorithm import std/options import std/strutils import std/tables -import std/unicode import io/bufreader import io/bufwriter @@ -18,6 +17,7 @@ import types/opt import utils/luwrap import utils/map import utils/twtstr +import utils/twtuni include res/map/idna_gen @@ -291,10 +291,9 
@@ type IDNATableStatus = enum itsValid, itsIgnored, itsMapped, itsDeviation, itsDisallowed -func getIdnaTableStatus(r: Rune): IDNATableStatus = - let i = uint32(r) - if i <= high(uint16): - let u = uint16(i) +func getIdnaTableStatus(u: uint32): IDNATableStatus = + if u <= high(uint16): + let u = uint16(u) if u in IgnoredLow: return itsIgnored if u in DisallowedLow or DisallowedRangesLow.isInRange(u): @@ -302,16 +301,15 @@ func getIdnaTableStatus(r: Rune): IDNATableStatus = if MappedMapLow.isInMap(u): return itsMapped else: - if i in IgnoredHigh: + if u in IgnoredHigh: return itsIgnored - if i in DisallowedHigh or DisallowedRangesHigh.isInRange(i): + if u in DisallowedHigh or DisallowedRangesHigh.isInRange(u): return itsDisallowed - if MappedMapHigh.isInMap(uint32(i)): + if MappedMapHigh.isInMap(u): return itsMapped return itsValid -func getIdnaMapped(r: Rune): string = - let u = uint32(r) +func getIdnaMapped(u: uint32): string = if u <= high(uint16): let u = uint16(u) let n = MappedMapLow.searchInMap(u) @@ -330,15 +328,15 @@ func processIdna(str: string; beStrict: bool): string = # UseSTD3ASCIIRules = beStrict (but STD3 is not implemented) # Transitional_Processing = false # VerifyDnsLength = beStrict - var mapped: seq[Rune] = @[] - for r in str.runes(): - let status = getIdnaTableStatus(r) + var mapped: seq[uint32] = @[] + for u in str.points: + let status = getIdnaTableStatus(u) case status of itsDisallowed: return "" #error of itsIgnored: discard - of itsMapped: mapped &= getIdnaMapped(r).toRunes() - of itsDeviation: mapped &= r - of itsValid: mapped &= r + of itsMapped: mapped &= getIdnaMapped(u).toPoints() + of itsDeviation: mapped &= u + of itsValid: mapped &= u if mapped.len == 0: return mapped = mapped.normalize() var cr: CharRange @@ -351,8 +349,8 @@ func processIdna(str: string; beStrict: bool): string = if label.startsWith("xn--"): try: let s = punycode.decode(label.substr("xn--".len)) - let x0 = s.toRunes() - let x1 = normalize(x0) + let x0 = 
s.toPoints() + let x1 = x0.normalize() if x0 != x1: return "" #error # CheckHyphens is false @@ -362,10 +360,10 @@ func processIdna(str: string; beStrict: bool): string = let L = cr.len div 2 - 1 if cps.toOpenArray(0, L).binarySearch(c, cmpRange) != -1: return "" #error - for r in x0: - if r == Rune('.'): + for u in x0: + if u == uint32('.'): return "" #error - let status = getIdnaTableStatus(r) + let status = getIdnaTableStatus(u) if status in {itsDisallowed, itsIgnored, itsMapped}: return "" #error #TODO check joiners @@ -396,7 +394,7 @@ func unicodeToAscii(s: string; beStrict: bool): string = else: s = label if beStrict: # VerifyDnsLength - let rl = s.runeLen() + let rl = s.pointLen() if rl notin 1..63: return "" all += rl diff --git a/src/utils/luwrap.nim b/src/utils/luwrap.nim index 6081cdf8..76a5b2e1 100644 --- a/src/utils/luwrap.nim +++ b/src/utils/luwrap.nim @@ -1,14 +1,14 @@ import std/algorithm import std/strutils -import std/unicode import monoucha/libunicode import utils/charcategory +import utils/twtuni proc passRealloc(opaque, p: pointer; size: csize_t): pointer {.cdecl.} = return realloc(p, size) -proc normalize*(rs: seq[Rune]; form = UNICODE_NFC): seq[Rune] = +proc normalize*(rs: seq[uint32]; form = UNICODE_NFC): seq[uint32] = {.cast(noSideEffect).}: if rs.len == 0: return @[] @@ -20,7 +20,7 @@ proc normalize*(rs: seq[Rune]; form = UNICODE_NFC): seq[Rune] = raise newException(Defect, "Unicode normalization failed") if out_len == 0: return - var rs = cast[seq[Rune]](newSeqUninitialized[uint32](out_len)) + var rs = newSeqUninitialized[uint32](out_len) copyMem(addr rs[0], outbuf, out_len * sizeof(uint32)) dealloc(outbuf) return rs @@ -28,17 +28,15 @@ proc normalize*(rs: seq[Rune]; form = UNICODE_NFC): seq[Rune] = proc mnormalize*(s: var string) = if NonAscii notin s: return # no need to normalize ascii - s = $s.toRunes().normalize() + s = s.toPoints().normalize().toUTF8() # n == 0: upper, 1: lower, 2: case fold proc toUpperLU(s: string; n: cint): 
string = result = newStringOfCap(s.len) - for r in s.runes: + for u in s.points: var outa: array[LRE_CC_RES_LEN_MAX, uint32] - let n = lre_case_conv(cast[ptr UncheckedArray[uint32]](addr outa[0]), - uint32(r), n) - for i in 0 ..< n: - result &= $Rune(outa[i]) + let n = lre_case_conv(cast[ptr UncheckedArray[uint32]](addr outa[0]), u, n) + result.addUTF8(outa.toOpenArray(0, n - 1)) proc toUpperLU*(s: string): string = return s.toUpperLU(0) @@ -49,19 +47,18 @@ proc toLowerLU*(s: string): string = proc capitalizeLU*(s: string): string = result = newStringOfCap(s.len) var wordStart = true - for r in s.runes: - if lre_is_space(uint32(r)) == 1: + for u in s.points: + if lre_is_space(u) == 1: wordStart = true - result &= $r + result.addUTF8(u) elif wordStart: var outa: array[LRE_CC_RES_LEN_MAX, uint32] let n = lre_case_conv(cast[ptr UncheckedArray[uint32]](addr outa[0]), - uint32(r), 0) - for i in 0 ..< n: - result &= $Rune(outa[i]) + u, 0) + result.addUTF8(outa.toOpenArray(0, n - 1)) wordStart = false else: - result &= $r + result.addUTF8(u) type u32pair* {.packed.} = object a: uint32 @@ -74,10 +71,10 @@ func cmpRange*(x: u32pair; y: uint32): int = return -1 return 0 -func contains(cr: CharRange; r: Rune): bool = +func contains(cr: CharRange; u: uint32): bool = let cps = cast[ptr UncheckedArray[u32pair]](cr.points) let L = cr.len div 2 - 1 - return cps.toOpenArray(0, L).binarySearch(uint32(r), cmpRange) != -1 + return cps.toOpenArray(0, L).binarySearch(u, cmpRange) != -1 type LURangeType = enum @@ -114,26 +111,26 @@ proc initScript(ctx: LUContext; lur: LURangeType) = doAssert unicode_script(p, cstring($lur), 0) == 0 ctx.inited.incl(lur) -proc isAlphaLU*(ctx: LUContext; r: Rune): bool = +proc isAlphaLU*(ctx: LUContext; u: uint32): bool = ctx.initGeneralCategory(lurLetter) - return r in ctx.crs[lurLetter] + return u in ctx.crs[lurLetter] -proc isWhiteSpaceLU*(ctx: LUContext; r: Rune): bool = +proc isWhiteSpaceLU*(ctx: LUContext; u: uint32): bool = 
ctx.initGeneralCategory(lurSeparator) - return r in ctx.crs[lurSeparator] + return u in ctx.crs[lurSeparator] -proc isHan*(ctx: LUContext; r: Rune): bool = +proc isHan*(ctx: LUContext; u: uint32): bool = ctx.initScript(lurHan) - return r in ctx.crs[lurHan] + return u in ctx.crs[lurHan] -proc isHiragana*(ctx: LUContext; r: Rune): bool = +proc isHiragana*(ctx: LUContext; u: uint32): bool = ctx.initScript(lurHiragana) - return r in ctx.crs[lurHiragana] + return u in ctx.crs[lurHiragana] -proc isKatakana*(ctx: LUContext; r: Rune): bool = +proc isKatakana*(ctx: LUContext; u: uint32): bool = ctx.initScript(lurKatakana) - return r in ctx.crs[lurKatakana] + return u in ctx.crs[lurKatakana] -proc isHangul*(ctx: LUContext; r: Rune): bool = +proc isHangul*(ctx: LUContext; u: uint32): bool = ctx.initScript(lurHangul) - return r in ctx.crs[lurHangul] + return u in ctx.crs[lurHangul] diff --git a/src/utils/strwidth.nim b/src/utils/strwidth.nim index 8c367991..4ce9aa12 100644 --- a/src/utils/strwidth.nim +++ b/src/utils/strwidth.nim @@ -1,7 +1,6 @@ -import std/unicode - -import utils/proptable import utils/map +import utils/proptable +import utils/twtuni include res/map/charwidth_gen @@ -9,8 +8,7 @@ include res/map/charwidth_gen var isCJKAmbiguous* = false # Warning: this shouldn't be called without normalization. -func width*(r: Rune): int = - let u = uint32(r) +func width*(u: uint32): int = if u <= 0xFFFF: # fast path for BMP if u in CombiningTable: return 0 @@ -31,80 +29,56 @@ func width*(r: Rune): int = # Width, but also works with tabs. # Needs the column width of the text so far. 
-func twidth*(r: Rune; w: int): int = - if r != Rune('\t'): - return r.width() +func twidth*(u: uint32; w: int): int = + if u != uint32('\t'): + return u.width() return ((w div 8) + 1) * 8 - w -func width*(s: string): int = - result = 0 - for r in s.runes: - result += r.twidth(result) +func width*(s: openArray[char]): int = + var w = 0 + for u in s.points: + w += u.twidth(w) + return w func width*(s: string; start, len: int): int = - result = 0 + var w = 0 var i = start var m = len if m > s.len: m = s.len while i < m: - var r: Rune - fastRuneAt(s, i, r) - result += r.twidth(result) - -when NimMajor < 2: - template ones(n: untyped): untyped = ((1 shl n)-1) - template fastRuneAt(s: openArray[char]; i: int; result: untyped) = - result = Rune(0xFFFD) - if uint32(s[i]) <= 127: - result = Rune(uint32(s[i])) - elif uint32(s[i]) shr 5 == 0b110: - if i <= s.len - 2: - result = Rune((uint32(s[i]) and (ones(5))) shl 6 or - (uint32(s[i+1]) and ones(6))) - i += 1 - elif uint32(s[i]) shr 4 == 0b1110: - if i <= s.len - 3: - result = Rune((uint32(s[i]) and ones(4)) shl 12 or - (uint32(s[i+1]) and ones(6)) shl 6 or (uint32(s[i+2]) and ones(6))) - i += 2 - elif uint32(s[i]) shr 3 == 0b11110: - if i <= s.len - 4: - result = Rune((uint32(s[i]) and ones(3)) shl 18 or - (uint32(s[i+1]) and ones(6)) shl 12 or - (uint32(s[i+2]) and ones(6)) shl 6 or - (uint32(s[i+3]) and ones(6))) - i += 3 - inc i + let u = s.nextUTF8(i) + w += u.twidth(w) + return w func notwidth*(s: openArray[char]): int = - result = 0 - var i = 0 - while i < s.len: - var r: Rune - fastRuneAt(s, i, r) - result += r.width() + var w = 0 + for u in s.points: + w += u.width() + return w func twidth*(s: string; w: int): int = var i = w - for r in s.runes: - i += r.twidth(w) + for u in s.points: + i += u.twidth(w) return i - w func padToWidth*(s: string; size: int; schar = '$'): string = result = newStringOfCap(s.len) var w = 0 - var r: Rune var i = 0 + var pi = 0 while i < s.len: - fastRuneAt(s, i, r) - w += r.width() + pi = 
i + w += s.nextUTF8(i).width() if w > size - 1: break - result &= r + for j in pi ..< i: + result &= s[j] if w > size - 1: if w == size and i == s.len: - result &= r + for j in pi ..< i: + result &= s[j] else: result &= schar while w < size: diff --git a/src/utils/twtstr.nim b/src/utils/twtstr.nim index 0d65be50..f08b1131 100644 --- a/src/utils/twtstr.nim +++ b/src/utils/twtstr.nim @@ -3,7 +3,6 @@ import std/math import std/options import std/os import std/strutils -import std/unicode when defined(posix): import std/posix @@ -11,14 +10,15 @@ when defined(posix): import types/opt import utils/charcategory import utils/map +import utils/twtuni export charcategory func onlyWhitespace*(s: string): bool = return AllChars - AsciiWhitespace notin s -func isControlChar*(r: Rune): bool = - return int(r) <= 0x1F or int(r) == 0x7F +func isControlChar*(u: uint32): bool = + return u <= 0x1F or u == 0x7F func getControlChar*(c: char): char = if c == '?': @@ -444,14 +444,13 @@ func matchNameProduction*(s: string): bool = return false # NameStartChar var i = 0 - var r: Rune if s[i] in Ascii: if s[i] notin NameStartCharAscii: return false inc i else: - fastRuneAt(s, i, r) - if not NameStartCharRanges.isInRange(uint32(r)): + let u = s.nextUTF8(i) + if not NameStartCharRanges.isInRange(u): return false # NameChar while i < s.len: @@ -460,9 +459,8 @@ func matchNameProduction*(s: string): bool = return false inc i else: - fastRuneAt(s, i, r) - if not NameStartCharRanges.isInRange(uint32(r)) and - not NameCharRanges.isInMap(uint32(r)): + let u = s.nextUTF8(i) + if not NameStartCharRanges.isInRange(u) and not NameCharRanges.isInMap(u): return false return true @@ -483,8 +481,8 @@ func matchQNameProduction*(s: string): bool = func utf16Len*(s: string): int = result = 0 - for r in s.runes: - if uint32(r) < 0x10000: # ucs-2 + for u in s.points: + if u < 0x10000: # ucs-2 result += 1 else: # surrogate result += 2 diff --git a/src/utils/twtuni.nim b/src/utils/twtuni.nim new file mode 100644 
index 00000000..c617ac4e --- /dev/null +++ b/src/utils/twtuni.nim @@ -0,0 +1,95 @@ +func nextUTF8*(s: openArray[char]; i: var int): uint32 = + let j = i + var u = uint32(s[j]) + if u <= 0x7F: + inc i + elif u shr 5 == 0b110: + let e = j + 2 + if likely(e <= s.len): + u = (u and 0x1F) shl 6 or (uint32(s[j + 1]) and 0x3F) + i = e + elif u shr 4 == 0b1110: + let e = j + 3 + if likely(e <= s.len): + u = (u and 0xF) shl 12 or + (uint32(s[j + 1]) and 0x3F) shl 6 or + (uint32(s[j + 2]) and 0x3F) + i = e + elif u shr 3 == 0b11110: + let e = j + 4 + if likely(e <= s.len): + u = (u and 7) shl 18 or + (uint32(s[j + 1]) and 0x3F) shl 12 or + (uint32(s[j + 2]) and 0x3F) shl 6 or + (uint32(s[j + 3]) and 0x3F) + i = e + else: + u = 0xFFFD + inc i + return u + +func prevUTF8*(s: openArray[char]; i: var int): uint32 = + var j = i - 1 + while uint32(s[j]) shr 6 == 2: + dec j + i = j + return s.nextUTF8(j) + +func pointLenAt*(s: openArray[char]; i: int): int = + let u = uint8(s[i]) + if u <= 0x7F: + return 1 + elif u shr 5 == 0b110: + return 2 + elif u shr 4 == 0b1110: + return 3 + elif u shr 3 == 0b11110: + return 4 + return 1 + +iterator points*(s: openArray[char]): uint32 {.inline.} = + var i = 0 + while i < s.len: + let u = s.nextUTF8(i) + yield u + +func toPoints*(s: openArray[char]): seq[uint32] = + result = @[] + for u in s.points: + result.add(u) + +proc addUTF8*(res: var string; u: uint32) = + if u < 0x80: + res &= char(u) + elif u < 0x800: + res &= char(u shr 6 or 0xC0) + res &= char(u and 0x3F or 0x80) + elif u < 0x10000: + res &= char(u shr 12 or 0xE0) + res &= char(u shr 6 and 0x3F or 0x80) + res &= char(u and 0x3F or 0x80) + else: + res &= char(u shr 18 or 0xF0) + res &= char(u shr 12 and 0x3F or 0x80) + res &= char(u shr 6 and 0x3F or 0x80) + res &= char(u and 0x3F or 0x80) + +func addUTF8*(res: var string; us: openArray[uint32]) = + for u in us: + res.addUTF8(u) + +func toUTF8*(u: uint32): string = + var s = "" + s.addUTF8(u) + return s + +func toUTF8*(us: 
openArray[uint32]): string = + var s = newStringOfCap(us.len shr 2) + s.addUTF8(us) + return s + +func pointLen*(s: openArray[char]): int = + var n = 0 + for u in s.points: + inc n + return n diff --git a/src/utils/widthconv.nim b/src/utils/widthconv.nim index b6495379..32a904f6 100644 --- a/src/utils/widthconv.nim +++ b/src/utils/widthconv.nim @@ -1,66 +1,64 @@ import std/strutils -import std/unicode + import utils/map +import utils/twtuni const CanHaveDakuten = ("かきくけこさしすせそたちつてとはひふへほカキクケコ" & - "サシスセソタチツテトハヒフヘホ").toRunes() + "サシスセソタチツテトハヒフヘホ").toPoints() -const CanHaveHanDakuten = "はひふへほハヒフヘホ".toRunes() +const CanHaveHanDakuten = "はひふへほハヒフヘホ".toPoints() const HasDakuten = ("がぎぐげござじずぜぞだぢづでどばびぶべぼガギグゲゴ" & - "ザジズゼゾダヂヅデドバビブベボ").toRunes() + "ザジズゼゾダヂヅデドバビブベボ").toPoints() -const HasHanDakuten = "ぱぴぷぺぽパピプペポ".toRunes() +const HasHanDakuten = "ぱぴぷぺぽパピプペポ".toPoints() # Halfwidth to fullwidth & vice versa -const halfFullMap = (func(): seq[tuple[half, full1, full2: Rune]] = +const halfFullMap = (func(): seq[tuple[half, full1, full2: uint32]] = result = @[] const map = staticRead"res/widthconvmap.tab" for line in map.split('\n'): if line == "": break var i = 0 - var half: Rune - fastRuneAt(line, i, half) + let half = line.nextUTF8(i) assert line[i] == '\t' inc i - var full1: Rune - fastRuneAt(line, i, full1) - var full2 = Rune(0) + let full1 = line.nextUTF8(i) + var full2 = 0u32 if i < line.len: assert line[i] == '\t' - inc i - fastRuneAt(line, i, full2) + full2 = line.nextUTF8(i) result.add((half, full1, full2)) )() -func halfwidth(r: Rune): Rune = - if r != Rune(0): # special case to avoid comparison with f2 +func halfwidth(u: uint32): uint32 = + if u != 0: # special case to avoid comparison with f2 for (h, f1, f2) in halfFullMap: - if f1 == r or f2 == r: + if f1 == u or f2 == u: return h - return r + return u -const HalfDakuten = Rune(0xFF9E) # half-width dakuten -const HalfHanDakuten = Rune(0xFF9F) # half-width handakuten +const HalfDakuten = 0xFF9Eu32 # half-width dakuten 
+const HalfHanDakuten = 0xFF9Fu32 # half-width handakuten # Note: in unicode, char + 1 is dakuten and char + 2 handakuten func halfwidth*(s: string): string = result = "" - for r in s.runes: - case r + for u in s.points: + case u of HasDakuten: - result &= halfwidth(Rune(uint32(r) - 1)) - result &= HalfDakuten + result.addUTF8(halfwidth(u - 1)) + result.addUTF8(HalfDakuten) of HasHanDakuten: - result &= halfwidth(Rune(uint32(r) - 2)) - result &= HalfHanDakuten + result.addUTF8(halfwidth(u - 2)) + result.addUTF8(HalfHanDakuten) else: - result &= halfwidth(r) + result.addUTF8(halfwidth(u)) -func fullwidth(r: Rune): Rune = - if r != Rune(0): # special case to avoid comparison with f2 +func fullwidth(r: uint32): uint32 = + if r != 0: # special case to avoid comparison with f2 for (h, f1, f2) in halfFullMap: if h == r: return f1 @@ -68,45 +66,45 @@ func fullwidth(r: Rune): Rune = func fullwidth*(s: string): string = result = "" - var lastr = Rune(0) - for r in s.runes: - if lastr != Rune(0): - if r == HalfDakuten: + var lastu = 0u32 + for u in s.points: + if lastu != 0: + if u == HalfDakuten: # flush with dakuten - result &= Rune(uint32(lastr) + 1) - lastr = Rune(0) + result.addUTF8(lastu + 1) + lastu = 0 continue - elif r == HalfHanDakuten and lastr in CanHaveHanDakuten: + elif u == HalfHanDakuten and lastu in CanHaveHanDakuten: # flush with handakuten - result &= Rune(uint32(lastr) + 2) - lastr = Rune(0) + result.addUTF8(lastu + 2) + lastu = 0 continue - result &= lastr - lastr = Rune(0) - let r = fullwidth(r) - if r in CanHaveDakuten: - lastr = r + result.addUTF8(lastu) + lastu = 0 + let u = fullwidth(u) + if u in CanHaveDakuten: + lastu = u else: - result &= r - if lastr != Rune(0): + result.addUTF8(u) + if lastu != 0: # flush - result &= lastr + result.addUTF8(lastu) const kanamap = staticRead"res/kanamap.tab" func genFullSizeMap(): seq[(uint32, uint32)] = result = @[] for line in kanamap.split('\n'): if line.len == 0: break - let rs = line.toRunes() - assert rs[1] 
== Rune('\t') - result.add((uint32(rs[0]), uint32(rs[2]))) + let rs = line.toPoints() + assert rs[1] == uint32('\t') + result.add((rs[0], rs[2])) const fullSizeMap = genFullSizeMap() proc fullsize*(s: string): string = result = "" - for r in s.runes: - let i = searchInMap(fullSizeMap, uint32(r)) + for u in s.points: + let i = searchInMap(fullSizeMap, u) if i == -1: - result &= r + result.addUTF8(u) else: - result &= $Rune(fullSizeMap[i][1]) + result.addUTF8(fullSizeMap[i][1]) diff --git a/src/utils/wordbreak.nim b/src/utils/wordbreak.nim index c93d63ec..ff607fbe 100644 --- a/src/utils/wordbreak.nim +++ b/src/utils/wordbreak.nim @@ -1,5 +1,3 @@ -import std/unicode - import utils/charcategory import utils/luwrap import utils/strwidth @@ -7,39 +5,39 @@ import utils/strwidth type BreakCategory* = enum bcAlpha, bcSpace, bcSymbol, bcHan, bcHiragana, bcKatakana, bcHangul -func isDigitAscii(r: Rune): bool = - return uint32(r) < 128 and char(r) in AsciiDigit +func isDigitAscii(u: uint32): bool = + return u < 128 and char(u) in AsciiDigit -proc breaksWord*(ctx: LUContext; r: Rune): bool = - return not r.isDigitAscii() and r.width() != 0 and not ctx.isAlphaLU(r) +proc breaksWord*(ctx: LUContext; u: uint32): bool = + return not u.isDigitAscii() and u.width() != 0 and not ctx.isAlphaLU(u) -proc breaksViWordCat*(ctx: LUContext; r: Rune): BreakCategory = - if int32(r) < 0x80: # ASCII - let c = char(r) +proc breaksViWordCat*(ctx: LUContext; u: uint32): BreakCategory = + if u < 0x80: # ASCII + let c = char(u) if c in AsciiAlphaNumeric + {'_'}: return bcAlpha elif c in AsciiWhitespace: return bcSpace - elif ctx.isWhiteSpaceLU(r): + elif ctx.isWhiteSpaceLU(u): return bcSpace - elif ctx.isAlphaLU(r): - if ctx.isHiragana(r): + elif ctx.isAlphaLU(u): + if ctx.isHiragana(u): return bcHiragana - elif ctx.isKatakana(r): + elif ctx.isKatakana(u): return bcKatakana - elif ctx.isHangul(r): + elif ctx.isHangul(u): return bcHangul - elif ctx.isHan(r): + elif ctx.isHan(u): return bcHan return 
bcAlpha return bcSymbol -proc breaksWordCat*(ctx: LUContext; r: Rune): BreakCategory = - if not ctx.breaksWord(r): +proc breaksWordCat*(ctx: LUContext; u: uint32): BreakCategory = + if not ctx.breaksWord(u): return bcAlpha return bcSpace -proc breaksBigWordCat*(ctx: LUContext; r: Rune): BreakCategory = - if not ctx.isWhiteSpaceLU(r): +proc breaksBigWordCat*(ctx: LUContext; u: uint32): BreakCategory = + if not ctx.isWhiteSpaceLU(u): return bcAlpha return bcSpace -- cgit 1.4.1-2-gfad0