diff options
-rw-r--r-- | .gitmodules | 6 | ||||
m--------- | lib/chagashi | 0 | ||||
m--------- | lib/chakasu | 0 | ||||
-rw-r--r-- | nim.cfg | 2 | ||||
-rw-r--r-- | res/config.toml | 1 | ||||
-rw-r--r-- | src/config/config.nim | 2 | ||||
-rw-r--r-- | src/config/mailcap.nim | 2 | ||||
-rw-r--r-- | src/display/lineedit.nim | 18 | ||||
-rw-r--r-- | src/display/term.nim | 16 | ||||
-rw-r--r-- | src/html/chadombuilder.nim | 129 | ||||
-rw-r--r-- | src/html/dom.nim | 25 | ||||
-rw-r--r-- | src/io/posixstream.nim | 8 | ||||
-rw-r--r-- | src/js/encoding.nim | 197 | ||||
-rw-r--r-- | src/js/jstypes.nim | 3 | ||||
-rw-r--r-- | src/loader/loader.nim | 2 | ||||
-rw-r--r-- | src/loader/response.nim | 18 | ||||
-rw-r--r-- | src/local/client.nim | 2 | ||||
-rw-r--r-- | src/local/container.nim | 2 | ||||
-rw-r--r-- | src/local/pager.nim | 2 | ||||
-rw-r--r-- | src/main.nim | 2 | ||||
-rw-r--r-- | src/render/rendertext.nim | 79 | ||||
-rw-r--r-- | src/server/buffer.nim | 208 | ||||
-rw-r--r-- | src/version.nim | 4 | ||||
-rw-r--r-- | todo | 3 |
24 files changed, 382 insertions, 349 deletions
diff --git a/.gitmodules b/.gitmodules index 37ea29a2..29bb3d1a 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,6 +1,6 @@ -[submodule "lib/chakasu"] - path = lib/chakasu - url = https://git.sr.ht/~bptato/chakasu [submodule "lib/chame"] path = lib/chame url = https://git.sr.ht/~bptato/chame +[submodule "lib/chagashi"] + path = lib/chagashi + url = https://git.sr.ht/~bptato/chagashi diff --git a/lib/chagashi b/lib/chagashi new file mode 160000 +Subproject 235cf0b2beafa177e7fa74ad6cc099ffbe5ec65 diff --git a/lib/chakasu b/lib/chakasu deleted file mode 160000 -Subproject 451226cb52d242bc65c1696a9c3abb8e89bc183 diff --git a/nim.cfg b/nim.cfg index 4338e32d..8d688ac0 100644 --- a/nim.cfg +++ b/nim.cfg @@ -1,8 +1,8 @@ -p:"." -p:"src/" -p:"lib/" --p:"lib/chakasu" -p:"lib/chame" +-p:"lib/chagashi" --experimental:"overloadableEnums" --warning:Effect:off --mm:refc diff --git a/res/config.toml b/res/config.toml index 54f74729..39650567 100644 --- a/res/config.toml +++ b/res/config.toml @@ -9,7 +9,6 @@ wrap = true [encoding] document-charset = ["utf-8", "sjis", "euc-jp", "latin2"] display-charset = "auto" -#system-charset = "auto" #TODO [external] mailcap = [ diff --git a/src/config/config.nim b/src/config/config.nim index 7be0c4a2..79ef263b 100644 --- a/src/config/config.nim +++ b/src/config/config.nim @@ -24,7 +24,7 @@ import types/url import utils/mimeguess import utils/twtstr -import chakasu/charset +import chagashi/charset type ColorMode* = enum diff --git a/src/config/mailcap.nim b/src/config/mailcap.nim index 0caf7f17..d5d17eae 100644 --- a/src/config/mailcap.nim +++ b/src/config/mailcap.nim @@ -8,7 +8,7 @@ import types/url import types/opt import utils/twtstr -import chakasu/charset +import chagashi/charset type MailcapParser = object diff --git a/src/display/lineedit.nim b/src/display/lineedit.nim index 7a5067eb..6969171e 100644 --- a/src/display/lineedit.nim +++ b/src/display/lineedit.nim @@ -1,4 +1,3 @@ -import std/streams import std/strutils import std/unicode @@ -10,9 +9,9 @@ import types/opt import utils/strwidth import utils/twtstr -import chakasu/charset -import chakasu/decoderstream -import chakasu/encoderstream +import chagashi/charset +import chagashi/validator +import chagashi/decoder type LineEditState* = enum @@ -155,15 +154,14 @@ proc backspace(edit: LineEdit) {.jsfunc.} = proc write*(edit: LineEdit, s: string, cs: Charset): bool = if cs == CHARSET_UTF_8: - if s.validateUtf8() != -1: + if s.validateUTF8Surr() != -1: return false edit.insertCharseq(s) else: - let ss = newStringStream(s) - let ds = newDecoderStream(ss, cs, errormode = DECODER_ERROR_MODE_FATAL) - let es = newEncoderStream(ds, CHARSET_UTF_8) - let s = es.readAll() - if ds.failed or es.failed: + let td = newTextDecoder(cs) + var success = false + let s = td.decodeAll(s, success) + if not success: return false edit.insertCharseq(s) return true diff --git a/src/display/term.nim b/src/display/term.nim index fd6271f3..afe84db6 100644 --- a/src/display/term.nim +++ b/src/display/term.nim @@ -16,9 +16,9 @@ import types/opt import utils/strwidth import utils/twtstr -import chakasu/charset -import chakasu/decoderstream -import chakasu/encoderstream +import chagashi/charset +import chagashi/encoder +import chagashi/validator export isatty @@ -384,7 +384,7 @@ proc setTitle*(term: Terminal, title: string) = term.outfile.write(XTERM_TITLE(title)) proc processOutputString*(term: Terminal, str: string, w: var int): string = - if str.validateUtf8() != -1: + if str.validateUTF8Surr() != -1: return "?" # twidth wouldn't work here, the view may start at the nth character. # pager must ensure tabs are converted beforehand. @@ -397,11 +397,9 @@ proc processOutputString*(term: Terminal, str: string, w: var int): string = # The output encoding matches the internal representation. return str else: - # Output is not utf-8, so we must convert back to utf-32 and then encode. - let ss = newStringStream(str) - let ds = newDecoderStream(ss) - let es = newEncoderStream(ds, term.cs, errormode = ENCODER_ERROR_MODE_FATAL) - return es.readAll() + # Output is not utf-8, so we must encode it first. + var success = false + return newTextEncoder(term.cs).encodeAll(str, success) proc generateFullOutput(term: Terminal, grid: FixedGrid): string = var format = Format() diff --git a/src/html/chadombuilder.nim b/src/html/chadombuilder.nim index 221cf7f8..66e3a518 100644 --- a/src/html/chadombuilder.nim +++ b/src/html/chadombuilder.nim @@ -1,6 +1,5 @@ import std/deques import std/options -import std/streams import html/catom import html/dom @@ -10,37 +9,30 @@ import js/fromjs import js/javascript import types/url -import chakasu/charset -import chakasu/decoderstream -import chakasu/encoderstream +import chagashi/charset import chame/htmlparser import chame/tags +export htmlparser.ParseResult + # DOMBuilder implementation for Chawan. -type CharsetConfidence = enum +type CharsetConfidence* = enum ccTentative, ccCertain, ccIrrelevant type HTML5ParserWrapper* = ref object parser: HTML5Parser[Node, CAtom] - charsetStack: seq[Charset] - seekable: bool builder*: ChaDOMBuilder opts: HTML5ParserOpts[Node, CAtom] - stream: StringStream - encoder: EncoderStream - decoder: DecoderStream # hack so we don't have to worry about leaks or the GC deallocating parser refs: seq[Document] stoppedFromScript: bool - needsBOMSniff: bool - wasICE: bool # inhibitCheckEnd ChaDOMBuilder = ref object of DOMBuilder[Node, CAtom] - charset: Charset - confidence: CharsetConfidence + charset*: Charset + confidence*: CharsetConfidence document*: Document factory: CAtomFactory poppedScript: HTMLScriptElement @@ -80,7 +72,8 @@ proc finish(builder: ChaDOMBuilder) = script.execute() #TODO events -proc restart(builder: ChaDOMBuilder, wrapper: HTML5ParserWrapper) = +proc restart*(wrapper: HTML5ParserWrapper, charset: Charset) = + let builder = wrapper.builder let document = newDocument(builder.factory) document.setActiveParser(wrapper) wrapper.refs.add(document) @@ -92,7 +85,9 @@ proc restart(builder: ChaDOMBuilder, wrapper: HTML5ParserWrapper) = document.window = window window.document = document builder.document = document + builder.charset = charset assert document.factory != nil + wrapper.parser = initHTML5Parser(builder, wrapper.opts) proc setQuirksModeImpl(builder: ChaDOMBuilder, quirksMode: QuirksMode) = if not builder.document.parser_cannot_change_the_mode_flag: @@ -214,7 +209,7 @@ proc elementPoppedImpl(builder: ChaDOMBuilder, element: Node) = builder.poppedScript = HTMLScriptElement(element) proc newChaDOMBuilder(url: URL, window: Window, factory: CAtomFactory, - confidence: CharsetConfidence): ChaDOMBuilder = + confidence: CharsetConfidence, charset = DefaultCharset): ChaDOMBuilder = let document = newDocument(factory) document.contentType = "text/html" document.url = url @@ -224,7 +219,8 @@ proc newChaDOMBuilder(url: URL, window: Window, factory: CAtomFactory, return ChaDOMBuilder( document: document, factory: factory, - confidence: confidence + confidence: confidence, + charset: charset ) # https://html.spec.whatwg.org/multipage/parsing.html#parsing-html-fragments @@ -264,68 +260,22 @@ proc parseHTMLFragment*(element: Element, s: string): seq[Node] = builder.finish() return root.childList -#TODO this should be handled by decoderstream or buffer -proc bomSniff(wrapper: HTML5ParserWrapper): Charset = - let stream = wrapper.stream - let op = stream.getPosition() - if op + 2 >= stream.data.len: - return CHARSET_UNKNOWN - let bom = stream.readStr(2) - if bom == "\xFE\xFF": - return CHARSET_UTF_16_BE - if bom == "\xFF\xFE": - return CHARSET_UTF_16_LE - if bom == "\xEF\xBB": - if op + 3 < stream.data.len and stream.readChar() == '\xBF': - return CHARSET_UTF_8 - wrapper.stream.setPosition(op) - return CHARSET_UNKNOWN - -proc switchCharset(wrapper: HTML5ParserWrapper) = - let builder = wrapper.builder - builder.charset = wrapper.charsetStack.pop() - if wrapper.seekable: - builder.confidence = ccTentative # used in the next iteration - else: - builder.confidence = ccCertain - let em = if wrapper.charsetStack.len == 0 or not wrapper.seekable: - DECODER_ERROR_MODE_REPLACEMENT - else: - DECODER_ERROR_MODE_FATAL - let ice = wrapper.decoder == nil or wrapper.wasICE - wrapper.parser = initHTML5Parser(builder, wrapper.opts) - wrapper.decoder = newDecoderStream(wrapper.stream, builder.charset, - errormode = em) - wrapper.decoder.setInhibitCheckEnd(ice) - wrapper.wasICE = ice - wrapper.encoder = newEncoderStream(wrapper.decoder, CHARSET_UTF_8, - errormode = ENCODER_ERROR_MODE_FATAL) - -proc newHTML5ParserWrapper*(stream: StringStream, window: Window, url: URL, - factory: CAtomFactory, charsets: seq[Charset], seekable: bool): - HTML5ParserWrapper = +proc newHTML5ParserWrapper*(window: Window, url: URL, factory: CAtomFactory, + charset: Charset): HTML5ParserWrapper = let opts = HTML5ParserOpts[Node, CAtom]( isIframeSrcdoc: false, #TODO? scripting: window != nil and window.settings.scripting ) - let builder = newChaDOMBuilder(url, window, factory, ccTentative) + let builder = newChaDOMBuilder(url, window, factory, ccTentative, charset) let wrapper = HTML5ParserWrapper( - seekable: seekable, builder: builder, opts: opts, - stream: stream, - needsBOMSniff: seekable + parser: initHTML5Parser(builder, opts) ) builder.document.setActiveParser(wrapper) - if charsets.len == 0: - wrapper.charsetStack = @[DefaultCharset] # UTF-8 - else: - for i in countdown(charsets.high, 0): - wrapper.charsetStack.add(charsets[i]) - wrapper.switchCharset() return wrapper -proc parseBuffer(wrapper: HTML5ParserWrapper, buffer: openArray[char]): +proc parseBuffer*(wrapper: HTML5ParserWrapper, buffer: openArray[char]): ParseResult = let builder = wrapper.builder let document = builder.document @@ -390,50 +340,7 @@ proc CDB_parseDocumentWriteChunk(wrapper: pointer) {.exportc.} = if res == PRES_STOP: wrapper.stoppedFromScript = true -proc parseAll*(wrapper: HTML5ParserWrapper): bool = - let builder = wrapper.builder - if wrapper.needsBOMSniff: - if wrapper.stream.getPosition() + 3 >= wrapper.stream.data.len: - return true - let scs = wrapper.bomSniff() - if scs != CHARSET_UNKNOWN: - builder.confidence = ccCertain - wrapper.charsetStack = @[scs] - wrapper.seekable = false - wrapper.switchCharset() - wrapper.needsBOMSniff = false - let buffer = wrapper.encoder.readAll() - if wrapper.decoder.failed: - assert wrapper.seekable - # Retry with another charset. - builder.restart(wrapper) - wrapper.switchCharset() - return false - if buffer.len == 0: - return true - let res = wrapper.parseBuffer(buffer) - if res == PRES_STOP: - # A meta tag describing the charset has been found; force use of this - # charset. - builder.restart(wrapper) - wrapper.charsetStack.add(builder.charset) - wrapper.seekable = false - wrapper.switchCharset() - return false - return true - proc finish*(wrapper: HTML5ParserWrapper) = - if wrapper.needsBOMSniff: - let scs = wrapper.bomSniff() - if scs != CHARSET_UNKNOWN: - wrapper.builder.confidence = ccCertain - wrapper.charsetStack = @[scs] - wrapper.seekable = false - wrapper.switchCharset() - wrapper.needsBOMSniff = false - wrapper.decoder.setInhibitCheckEnd(false) - wrapper.wasICE = false - doAssert wrapper.parseAll() wrapper.parser.finish() wrapper.builder.finish() for r in wrapper.refs: diff --git a/src/html/dom.nim b/src/html/dom.nim index d5b21622..e5d871ba 100644 --- a/src/html/dom.nim +++ b/src/html/dom.nim @@ -41,9 +41,9 @@ import utils/mimeguess import utils/strwidth import utils/twtstr -import chakasu/charset -import chakasu/decoderstream -import chakasu/encoderstream +import chagashi/charset +import chagashi/decoder +import chagashi/validator import chame/tags @@ -2744,13 +2744,8 @@ proc loadResource(window: Window, link: HTMLLinkElement) = res.unregisterFun() ).then(proc(s: JSResult[string]) = if s.isOk: - #TODO this is extremely inefficient, and text() should return - # utf8 anyways - let ss = newStringStream(s.get) - #TODO non-utf-8 css - let ds = newDecoderStream(ss, cs = CHARSET_UTF_8) - let source = newEncoderStream(ds, cs = CHARSET_UTF_8) - link.sheet = parseStylesheet(source, window.factory) + #TODO non-utf-8 css? + link.sheet = parseStylesheet(newStringStream(s.get), window.factory) window.document.cachedSheetsInvalid = true ) window.loadingResourcePromises.add(p) @@ -3441,12 +3436,12 @@ proc fetchClassicScript(element: HTMLScriptElement, url: URL, if response.res != 0: element.onComplete(ScriptResult(t: RESULT_NULL)) return - let cs = if cs == CHARSET_UNKNOWN: - CHARSET_UTF_8 + #TODO make this non-blocking somehow + let s = response.body.readAll() + let source = if cs in {CHARSET_UNKNOWN, CHARSET_UTF_8}: + s.toValidUTF8() else: - cs - let decoder = newDecoderStream(response.body, cs = cs) - let source = newEncoderStream(decoder).readAll() + newTextDecoder(cs).decodeAll(s) let script = window.jsctx.createClassicScript(source, url, options, false) element.onComplete(ScriptResult(t: RESULT_SCRIPT, script: script)) diff --git a/src/io/posixstream.nim b/src/io/posixstream.nim index 683d72b8..c9c1c234 100644 --- a/src/io/posixstream.nim +++ b/src/io/posixstream.nim @@ -74,7 +74,13 @@ proc psClose(s: Stream) = proc psReadData(s: Stream, buffer: pointer, len: int): int = let s = PosixStream(s) assert len != 0 and s.blocking - return s.recvData(buffer, len) + result = 0 + while result < len: + let p = addr cast[ptr UncheckedArray[uint8]](buffer)[result] + let n = s.recvData(p, len - result) + if n == 0: + break + result += n proc psWriteData(s: Stream, buffer: pointer, len: int) = let s = PosixStream(s) diff --git a/src/js/encoding.nim b/src/js/encoding.nim index 0e6643fb..b2ef1d1b 100644 --- a/src/js/encoding.nim +++ b/src/js/encoding.nim @@ -1,98 +1,189 @@ import std/streams +import bindings/quickjs import js/error import js/javascript import js/jstypes -import chakasu/charset -import chakasu/decoderstream -import chakasu/encoderstream +import chagashi/charset +import chagashi/decoder +import chagashi/decodercore +import chagashi/validator +import chagashi/validatorcore type - TextEncoder = ref object + JSTextEncoder = ref object - TextDecoder = ref object + JSTextDecoder = ref object encoding: Charset - errorMode: DecoderErrorMode ignoreBOM {.jsget.}: bool + fatal {.jsget.}: bool doNotFlush: bool bomSeen: bool - decoder: DecoderStream - encoder: EncoderStream # to return the string to JS - istream: StringStream + td: TextDecoder + tv: ref TextValidatorUTF8 + validateBuf: seq[uint8] -jsDestructor(TextDecoder) -jsDestructor(TextEncoder) +jsDestructor(JSTextDecoder) +jsDestructor(JSTextEncoder) type TextDecoderOptions = object of JSDict fatal: bool ignoreBOM: bool -func newTextDecoder(label = "utf-8", options = TextDecoderOptions()): - JSResult[TextDecoder] {.jsctor.} = - let errorMode = if options.fatal: - DECODER_ERROR_MODE_FATAL - else: - DECODER_ERROR_MODE_REPLACEMENT +func newJSTextDecoder(label = "utf-8", options = TextDecoderOptions()): + JSResult[JSTextDecoder] {.jsctor.} = let encoding = getCharset(label) if encoding in {CHARSET_UNKNOWN, CHARSET_REPLACEMENT}: return err(newRangeError("Invalid encoding label")) - return ok(TextDecoder( - errorMode: errorMode, + return ok(JSTextDecoder( ignoreBOM: options.ignoreBOM, + fatal: options.fatal, + td: if encoding != CHARSET_UTF_8: newTextDecoder(encoding) else: nil, + tv: if encoding == CHARSET_UTF_8: (ref TextValidatorUTF8)() else: nil, encoding: encoding )) +type Growbuf = object + p: ptr UncheckedArray[uint8] + cap: int + len: int + +{.warning[Deprecated]: off.}: + proc `=destroy`(growbuf: var Growbuf) = + if growbuf.p != nil: + dealloc(growbuf.p) + growbuf.p = nil + +const BufferSize = 128 +proc grow(buf: var Growbuf) = + if buf.cap == 0: + buf.cap = BufferSize + else: + buf.cap *= 2 + buf.p = cast[ptr UncheckedArray[uint8]](buf.p.realloc(buf.cap)) + +proc write(buf: var Growbuf, s: openArray[uint8]) = + if buf.len + s.len > buf.cap: + buf.grow() + if s.len > 0: + copyMem(addr buf.p[buf.len], unsafeAddr s[0], s.len) + buf.len += s.len + +proc write(buf: var Growbuf, s: string) = + if buf.len + s.len > buf.cap: + buf.grow() + if s.len > 0: + copyMem(addr buf.p[buf.len], unsafeAddr s[0], s.len) + buf.len += s.len + +proc decode0(this: JSTextDecoder, ctx: JSContext, input: JSArrayBufferView, + stream: bool): JSResult[JSValue] = + var oq = Growbuf( + p: cast[ptr UncheckedArray[uint8]](alloc(BufferSize)), + len: 0, + cap: BufferSize + ) + var i = 0 + let H = int(input.abuf.len) - 1 + template handle_error = + if this.fatal: + return errTypeError("Failed to decode string") + oq.write("\uFFFD") + i = this.td.i + while true: + case this.td.decode(input.abuf.p.toOpenArray(i, H), + oq.p.toOpenArray(0, oq.cap - 1), oq.len) + of tdrDone: + if not stream: + case this.td.finish() + of tdfrDone: discard + of tdfrError: handle_error + break + of tdrError: + handle_error + of tdrReqOutput: + oq.grow() + return ok(JS_NewStringLen(ctx, cast[cstring](oq.p), csize_t(oq.len))) + +proc validate0(this: JSTextDecoder, ctx: JSContext, input: JSArrayBufferView, + stream: bool): JSResult[JSValue] = + # assume input is valid; do not allocate yet + var oq = Growbuf(p: nil, len: 0, cap: 0) + var i = 0 + let H = int(input.abuf.len) - 1 + var n = 0 + template handle_error = + if this.fatal: + return errTypeError("Failed to decode string") + # write from previous error (or beginning) to the last valid char + oq.write(input.abuf.p.toOpenArray(i, n)) + oq.write("\uFFFD") + this.validateBuf.setLen(0) + i = this.tv.i + while true: + case this.tv[].validate(input.abuf.p.toOpenArray(i, H), n) + of tvrDone: + break + of tvrError: + handle_error + if not stream: + case this.tv[].finish() + of tvrDone: discard + of tvrError: handle_error + if this.validateBuf.len > 0 and n > -1: + oq.write(this.validateBuf) + oq.write(input.abuf.p.toOpenArray(i, n)) + this.validateBuf.setLen(0) + this.validateBuf.add(input.abuf.p.toOpenArray(n + 1, input.abuf.high)) + if oq.len > 0: + assert oq.p != nil + return ok(JS_NewStringLen(ctx, cast[cstring](oq.p), csize_t(oq.len))) + assert oq.p == nil + return ok(JS_NewStringLen(ctx, cast[cstring](input.abuf.p), csize_t(n + 1))) + type TextDecodeOptions = object of JSDict stream: bool #TODO AllowSharedBufferSource -proc decode(this: TextDecoder, input = none(JSArrayBufferView), - options = TextDecodeOptions()): string {.jsfunc.} = +proc decode(ctx: JSContext, this: JSTextDecoder, input = opt(JSArrayBufferView), + options = TextDecodeOptions()): JSResult[JSValue] {.jsfunc.} = if not this.doNotFlush: - if this.istream != nil: - this.istream.close() - if this.decoder != nil: - this.decoder.close() - if this.encoder != nil: - this.encoder.close() - this.istream = newStringStream() - this.decoder = newDecoderStream(this.istream, cs = this.encoding, - errormode = this.errorMode) - this.encoder = newEncoderStream(this.decoder, cs = CHARSET_UTF_8) + if this.td != nil: + this.td = newTextDecoder(this.encoding) + else: + assert this.tv != nil + this.tv = (ref TextValidatorUTF8)() this.bomSeen = false if this.doNotFlush != options.stream: this.doNotFlush = options.stream - this.decoder.setInhibitCheckEnd(options.stream) if input.isSome: - let input = input.get - let pos = this.istream.getPosition() - #TODO input offset? - this.istream.writeData(input.abuf.p, int(input.abuf.len)) - this.istream.setPosition(pos) - #TODO this should return a JSString, so we do not needlessly re-encode - # the output. (Right now we do, implicitly through toJS.) - return this.encoder.readAll() - -func jencoding(this: TextDecoder): string {.jsfget: "encoding".} = + if this.td != nil: + return this.decode0(ctx, input.get, options.stream) + else: + assert this.encoding == CHARSET_UTF_8 + # just validate + return this.validate0(ctx, input.get, options.stream) + return ok(JS_NewString(ctx, "")) + +func jencoding(this: JSTextDecoder): string {.jsfget: "encoding".} = return $this.encoding -func fatal(this: TextDecoder): bool {.jsfget.} = - return this.errorMode == DECODER_ERROR_MODE_FATAL - -func newTextEncoder(): TextEncoder {.jsctor.} = - return TextEncoder() +func newTextEncoder(): JSTextEncoder {.jsctor.} = + return JSTextEncoder() -func jencoding(this: TextEncoder): string {.jsfget: "encoding".} = +func jencoding(this: JSTextEncoder): string {.jsfget: "encoding".} = return "utf-8" proc dealloc_wrap(rt: JSRuntime, opaque, p: pointer) {.cdecl.} = dealloc(p) -proc encode(this: TextEncoder, input = ""): JSUint8Array {.jsfunc.} = - # input is already UTF-8 here :P +proc encode(this: JSTextEncoder, input = ""): JSUint8Array {.jsfunc.} = + # we have to validate input first :/ + #TODO it is possible to do less copies here... + var input = input.toValidUTF8() let buf = cast[ptr UncheckedArray[uint8]](alloc(input.len)) - copyMem(buf, unsafeAddr input[0], input.len) + copyMem(buf, addr input[0], input.len) let abuf = JSArrayBuffer( p: buf, len: csize_t(input.len), @@ -107,5 +198,5 @@ proc encode(this: TextEncoder, input = ""): JSUint8Array {.jsfunc.} = #TODO encodeInto proc addEncodingModule*(ctx: JSContext) = - ctx.registerType(TextDecoder) - ctx.registerType(TextEncoder) + ctx.registerType(JSTextDecoder, name = "TextDecoder") + ctx.registerType(JSTextEncoder, name = "TextEncoder") diff --git a/src/js/jstypes.nim b/src/js/jstypes.nim index b8ef6c55..5336f067 100644 --- a/src/js/jstypes.nim +++ b/src/js/jstypes.nim @@ -30,3 +30,6 @@ type abuf*: JSArrayBuffer offset*: csize_t # offset into the buffer nmemb*: csize_t # number of members + +func high*(abuf: JSArrayBuffer): int = + return int(abuf.len) - 1 diff --git a/src/loader/loader.nim b/src/loader/loader.nim index d859ae34..3c974c95 100644 --- a/src/loader/loader.nim +++ b/src/loader/loader.nim @@ -51,7 +51,7 @@ import types/url import utils/mimeguess import utils/twtstr -import chakasu/charset +import chagashi/charset export request export response diff --git a/src/loader/response.nim b/src/loader/response.nim index 0869a73e..6168aff6 100644 --- a/src/loader/response.nim +++ b/src/loader/response.nim @@ -1,5 +1,4 @@ import std/streams -import std/unicode import bindings/quickjs import io/promise @@ -11,9 +10,9 @@ import loader/request import types/blob import types/url -import chakasu/charset -import chakasu/decoderstream -import chakasu/encoderstream +import chagashi/charset +import chagashi/decoder +import chagashi/validator type ResponseType* = enum @@ -106,13 +105,12 @@ proc text*(response: Response): Promise[JSResult[string]] {.jsfunc.} = CHARSET_UTF_8 else: response.charset - if cs == CHARSET_UTF_8 and s.validateUtf8() == -1: - ok(s) + #TODO this is inefficient + # maybe add a JS type that turns a seq[char] into JS strings + if cs in {CHARSET_UTF_8, CHARSET_UNKNOWN}: + ok(s.toValidUTF8()) else: - let ss = newStringStream(s) - let ds = newDecoderStream(ss, cs) - let es = newEncoderStream(ds, CHARSET_UTF_8) - return ok(es.readAll()) + ok(newTextDecoder(cs).decodeAll(s)) ) proc blob*(response: Response): Promise[JSResult[Blob]] {.jsfunc.} = diff --git a/src/local/client.nim b/src/local/client.nim index 291e1b0d..e0a453db 100644 --- a/src/local/client.nim +++ b/src/local/client.nim @@ -51,7 +51,7 @@ import utils/twtstr import xhr/formdata import xhr/xmlhttprequest -import chakasu/charset +import chagashi/charset type Client* = ref object diff --git a/src/local/container.nim b/src/local/container.nim index 3814f43d..e61d4fbf 100644 --- a/src/local/container.nim +++ b/src/local/container.nim @@ -29,7 +29,7 @@ import utils/mimeguess import utils/strwidth import utils/twtstr -import chakasu/charset +import chagashi/charset type CursorPosition* = object diff --git a/src/local/pager.nim b/src/local/pager.nim index a5b63523..64a11964 100644 --- a/src/local/pager.nim +++ b/src/local/pager.nim @@ -42,7 +42,7 @@ import types/url import utils/strwidth import utils/twtstr -import chakasu/charset +import chagashi/charset type LineMode* = enum diff --git a/src/main.nim b/src/main.nim index d6a9abce..9d782a90 100644 --- a/src/main.nim +++ b/src/main.nim @@ -15,7 +15,7 @@ import types/opt import utils/strwidth import utils/twtstr -import chakasu/charset +import chagashi/charset proc main() = let params = commandLineParams() diff --git a/src/render/rendertext.nim b/src/render/rendertext.nim index 56a0b2ba..27992215 100644 --- a/src/render/rendertext.nim +++ b/src/render/rendertext.nim @@ -5,69 +5,29 @@ import std/unicode import types/cell import utils/strwidth -import chakasu/charset -import chakasu/decoderstream -import chakasu/encoderstream - -type StreamRenderer* = object +type StreamRenderer* = ref object ansiparser: AnsiCodeParser format: Format af: bool stream: Stream - decoder: DecoderStream - encoder: EncoderStream - charsets: seq[Charset] newline: bool w: int j: int # byte in line -#TODO pass bool for whether we can rewind -proc newStreamRenderer*(stream: Stream, charsets0: openArray[Charset]): - ref StreamRenderer = - var charsets = newSeq[Charset](charsets0.len) - for i in 0 ..< charsets.len: - charsets[i] = charsets0[charsets.high - i] - if charsets.len == 0: - charsets.add(DefaultCharset) - let cs = charsets.pop() - let em = if charsets.len > 0: - DECODER_ERROR_MODE_FATAL - else: - DECODER_ERROR_MODE_REPLACEMENT - let decoder = newDecoderStream(stream, cs, errormode = em) - decoder.setInhibitCheckEnd(true) - let encoder = newEncoderStream(decoder) - return (ref StreamRenderer)( - stream: stream, - decoder: decoder, - encoder: encoder, - format: Format(), - charsets: charsets, - ansiparser: AnsiCodeParser( - state: PARSE_DONE - ) - ) +proc newStreamRenderer*(): StreamRenderer = + return StreamRenderer(ansiparser: AnsiCodeParser(state: PARSE_DONE)) -proc rewind(renderer: var StreamRenderer) = - let cs = renderer.charsets.pop() - let em = if renderer.charsets.len > 0: - DECODER_ERROR_MODE_FATAL - else: - DECODER_ERROR_MODE_REPLACEMENT - let decoder = newDecoderStream(renderer.stream, cs, errormode = em) - decoder.setInhibitCheckEnd(true) - renderer.decoder = decoder - renderer.encoder = newEncoderStream(decoder) +proc rewind*(renderer: StreamRenderer) = renderer.format = Format() renderer.ansiparser.state = PARSE_DONE -proc addFormat(grid: var FlexibleGrid, renderer: var StreamRenderer) = +proc addFormat(grid: var FlexibleGrid, renderer: StreamRenderer) = if renderer.af: renderer.af = false if renderer.j == grid[^1].str.len: grid[^1].addFormat(renderer.w, renderer.format) -proc processBackspace(grid: var FlexibleGrid, renderer: var StreamRenderer, +proc processBackspace(grid: var FlexibleGrid, renderer: StreamRenderer, r: Rune): bool = let pj = renderer.j var cr: Rune @@ -105,8 +65,7 @@ proc processBackspace(grid: var FlexibleGrid, renderer: var StreamRenderer, grid[^1].str.setLen(renderer.j) return false -proc processAscii(grid: var FlexibleGrid, renderer: var StreamRenderer, - c: char) = +proc processAscii(grid: var FlexibleGrid, renderer: StreamRenderer, c: char) = case c of '\b': if renderer.j == 0: @@ -129,8 +88,10 @@ proc processAscii(grid: var FlexibleGrid, renderer: var StreamRenderer, renderer.w += Rune(c).twidth(renderer.w) inc renderer.j -proc renderChunk(grid: var FlexibleGrid, renderer: var StreamRenderer, - buf: string) = +proc renderChunk*(grid: var FlexibleGrid, renderer: StreamRenderer, + buf: openArray[char]) = + if grid.len == 0: + grid.addLine() var i = 0 while i < buf.len: if renderer.newline: @@ -158,21 +119,3 @@ proc renderChunk(grid: var FlexibleGrid, renderer: var StreamRenderer, grid[^1].str &= r renderer.w += r.twidth(renderer.w) renderer.j += i - pi - -proc renderStream*(grid: var FlexibleGrid, renderer: var StreamRenderer): bool = - let buf = renderer.encoder.readAll() - if renderer.decoder.failed: - renderer.rewind() - grid.setLen(0) - return false - if grid.len == 0: - grid.addLine() - grid.renderChunk(renderer, buf) - return true - -proc finishRender*(grid: var FlexibleGrid, renderer: var StreamRenderer) = - renderer.decoder.setInhibitCheckEnd(false) - let buf = renderer.decoder.readAll() - if grid.len == 0: - grid.addLine() - grid.renderChunk(renderer, buf) diff --git a/src/server/buffer.nim b/src/server/buffer.nim index 27308ff3..788c0aea 100644 --- a/src/server/buffer.nim +++ b/src/server/buffer.nim @@ -51,7 +51,10 @@ import utils/strwidth import utils/twtstr import xhr/formdata as formdata_impl -import chakasu/charset +from chagashi/decoder import newTextDecoder +import chagashi/charset +import chagashi/decodercore +import chagashi/validatorcore import chame/tags @@ -103,7 +106,6 @@ type prevstyled: StyledNode selector: Selector[int] istream: SocketStream - sstream: StringStream available: int state: BufferState prevnode: StyledNode @@ -118,8 +120,15 @@ type quirkstyle: CSSStylesheet userstyle: CSSStylesheet htmlParser: HTML5ParserWrapper - srenderer: ref StreamRenderer + srenderer: StreamRenderer bgcolor: CellColor + needsBOMSniff: bool + seekable: bool + decoder: TextDecoder + validator: ref TextValidatorUTF8 + validateBuf: seq[char] + charsetStack: seq[Charset] + charset: Charset InterfaceOpaque = ref object stream: Stream @@ -274,11 +283,6 @@ macro task(fun: typed) = pfun.istask = true fun -func charsets(buffer: Buffer): seq[Charset] = - if buffer.source.charset != CHARSET_UNKNOWN: - return @[buffer.source.charset] - return buffer.config.charsets - func getTitleAttr(node: StyledNode): string = if node == nil: return "" @@ -632,13 +636,115 @@ proc do_reshape(buffer: Buffer) = buffer.lines.renderDocument(buffer.bgcolor, styledRoot, buffer.attrs) buffer.prevstyled = styledRoot -proc processData(buffer: Buffer): bool = +proc processData0(buffer: Buffer, data: openArray[char]): bool = if buffer.ishtml: - let res = buffer.htmlParser.parseAll() + if buffer.htmlParser.parseBuffer(data) == PRES_STOP: + buffer.charsetStack = @[buffer.htmlParser.builder.charset] + return false buffer.document = buffer.htmlParser.builder.document - return res else: - return buffer.lines.renderStream(buffer.srenderer[]) + buffer.lines.renderChunk(buffer.srenderer, data) + true + +func canSwitch(buffer: Buffer): bool {.inline.} = + if buffer.ishtml and buffer.htmlParser.builder.confidence != ccTentative: + return false + return buffer.charsetStack.len > 0 + +proc initDecoder(buffer: Buffer) = + if buffer.charset != CHARSET_UTF_8: + buffer.decoder = newTextDecoder(buffer.charset) + else: + buffer.validator = (ref TextValidatorUTF8)() + +proc switchCharset(buffer: Buffer) = + buffer.charset = buffer.charsetStack.pop() + buffer.initDecoder() + if buffer.ishtml: + buffer.htmlParser.restart(buffer.charset) + else: + buffer.srenderer.rewind() + buffer.lines.setLen(0) + +const BufferSize = 16384 + +proc decodeData(buffer: Buffer, iq: openArray[uint8]): bool = + var oq {.noinit.}: array[BufferSize, char] + var n = 0 + while true: + case buffer.decoder.decode(iq, oq.toOpenArrayByte(0, oq.high), n) + of tdrDone: + if not buffer.processData0(oq.toOpenArray(0, n - 1)): + assert buffer.canSwitch + buffer.switchCharset() + return false + break + of tdrReqOutput: + # flush output buffer + if not buffer.processData0(oq.toOpenArray(0, n - 1)): + assert buffer.canSwitch + buffer.switchCharset() + return false + n = 0 + of tdrError: + if buffer.canSwitch: + buffer.switchCharset() + return false + doAssert buffer.processData0("\uFFFD") + true + +proc validateData(buffer: Buffer, iq: openArray[char]): bool = + var pi = 0 + var n = 0 + while true: + case buffer.validator[].validate(iq.toOpenArrayByte(0, iq.high), n) + of tvrDone: + if n == -1: + return true + if buffer.validateBuf.len > 0: + doAssert buffer.processData0(buffer.validateBuf) + buffer.validateBuf.setLen(0) + if not buffer.processData0(iq.toOpenArray(pi, n)): + assert buffer.canSwitch + buffer.switchCharset() + return false + buffer.validateBuf.add(iq.toOpenArray(n + 1, iq.high)) + break + of tvrError: + buffer.validateBuf.setLen(0) + if buffer.canSwitch: + buffer.switchCharset() + return false + if n > pi: + doAssert buffer.processData0(iq.toOpenArray(pi, n - 1)) + doAssert buffer.processData0("\uFFFD") + pi = buffer.validator.i + true + +proc bomSniff(buffer: Buffer, iq: openArray[char]): int = + if iq[0] == '\xFE' and iq[1] == '\xFF': + buffer.charsetStack = @[CHARSET_UTF_16_BE] + buffer.switchCharset() + return 2 + if iq[0] == '\xFF' and iq[1] == '\xFE': + buffer.charsetStack = @[CHARSET_UTF_16_LE] + buffer.switchCharset() + return 2 + if iq[0] == '\xEF' and iq[1] == '\xBB' and iq[2] == '\xBF': + buffer.charsetStack = @[CHARSET_UTF_8] + buffer.switchCharset() + return 3 + return 0 + +proc processData(buffer: Buffer, iq: openArray[char]): bool = + var start = 0 + if buffer.needsBOMSniff: + if iq.len >= 3: # ehm... TODO + start += buffer.bomSniff(iq) + buffer.needsBOMSniff = false + if buffer.decoder != nil: + return buffer.decodeData(iq.toOpenArrayByte(start, iq.high)) + return buffer.validateData(iq.toOpenArray(start, iq.high)) proc windowChange*(buffer: Buffer, attrs: WindowAttributes) {.proxy.} = buffer.attrs = attrs @@ -717,44 +823,38 @@ proc rewind(buffer: Buffer): bool = proc setHTML(buffer: Buffer, ishtml: bool) = buffer.ishtml = ishtml + buffer.charset = buffer.charsetStack.pop() + buffer.initDecoder() if ishtml: let factory = newCAtomFactory() buffer.factory = factory - if buffer.config.scripting: - buffer.window = newWindow( - buffer.config.scripting, - buffer.config.images, - buffer.selector, - buffer.attrs, - factory, - proc(url: URL) = buffer.navigate(url), - some(buffer.loader) - ) + let navigate = if buffer.config.scripting: + proc(url: URL) = buffer.navigate(url) else: - buffer.window = newWindow( - buffer.config.scripting, - buffer.config.images, - buffer.selector, - buffer.attrs, - factory, - nil, - some(buffer.loader) - ) + nil + buffer.window = newWindow( + buffer.config.scripting, + buffer.config.images, + buffer.selector, + buffer.attrs, + factory, + navigate, + some(buffer.loader) + ) buffer.htmlParser = newHTML5ParserWrapper( - buffer.sstream, buffer.window, buffer.url, buffer.factory, - buffer.charsets, - seekable = true + buffer.charset ) + assert buffer.htmlParser.builder.document != nil const css = staticRead"res/ua.css" const quirk = css & staticRead"res/quirk.css" buffer.uastyle = css.parseStylesheet(factory) buffer.quirkstyle = quirk.parseStylesheet(factory) buffer.userstyle = parseStylesheet(buffer.config.userstyle, factory) else: - buffer.srenderer = newStreamRenderer(buffer.sstream, buffer.charsets) + buffer.srenderer = newStreamRenderer() proc connect*(buffer: Buffer): ConnectResult {.proxy.} = if buffer.connected: @@ -1025,13 +1125,14 @@ proc dispatchEvent(buffer: Buffer, ctype: string, elem: Element): tuple[ break return (called, canceled) -const BufferSize = 16384 - proc finishLoad(buffer: Buffer): EmptyPromise = if buffer.state != LOADING_PAGE: let p = EmptyPromise() p.resolve() return p + if buffer.decoder != nil and buffer.decoder.finish() == tdfrError or + buffer.validator != nil and buffer.validator[].finish() == tvrError: + doAssert buffer.processData0("\uFFFD") var p: EmptyPromise if buffer.ishtml: buffer.htmlParser.finish() @@ -1083,20 +1184,16 @@ proc onload(buffer: Buffer) = of LOADING_PAGE: discard var reprocess = false + var iq {.noinit.}: array[BufferSize, char] + var n = 0 while true: - buffer.sstream.setPosition(0) - if not reprocess: - buffer.sstream.data.setLen(BufferSize) try: - var n = 0 if not reprocess: - buffer.sstream.data.prepareMutation() - n = buffer.istream.recvData(addr buffer.sstream.data[0], BufferSize) - if n != buffer.sstream.data.len: - buffer.sstream.data.setLen(n) - if n != 0 or reprocess: + n = buffer.istream.recvData(addr iq[0], iq.len) buffer.available += n - if not buffer.processData(): + res.lines = buffer.lines.len + if n != 0: + if not buffer.processData(iq.toOpenArray(0, n - 1)): if not buffer.firstBufferRead: reprocess = true continue @@ -1105,10 +1202,8 @@ proc onload(buffer: Buffer) = buffer.firstBufferRead = true reprocess = false res.bytes = buffer.available - res.lines = buffer.lines.len - if buffer.istream.atEnd(): - buffer.sstream = nil - # EOF + res.lines = buffer.lines.len + else: # EOF res.atend = true buffer.finishLoad().then(proc() = buffer.do_reshape() @@ -1116,8 +1211,6 @@ proc onload(buffer: Buffer) = buffer.state = LOADED if buffer.document != nil: # may be nil if not buffer.ishtml buffer.document.readyState = READY_STATE_COMPLETE - if not buffer.ishtml: - buffer.lines.finishRender(buffer.srenderer[]) buffer.dispatchLoadEvent() buffer.resolveTask(LOAD, res) ) @@ -1754,13 +1847,18 @@ proc launchBuffer*(config: BufferConfig, source: BufferSource, config: config, loader: loader, source: source, - sstream: newStringStream(), selector: newSelector[int](), estream: newFileStream(stderr), pstream: socks, rfd: socks.fd, - ssock: ssock + ssock: ssock, + needsBOMSniff: true, + seekable: true ) + for i in countdown(buffer.config.charsets.high, 0): + buffer.charsetStack.add(buffer.config.charsets[i]) + if buffer.charsetStack.len == 0: + buffer.charsetStack.add(DefaultCharset) gbuffer = buffer onSignal SIGTERM: discard sig diff --git a/src/version.nim b/src/version.nim index b2862180..1ac9b191 100644 --- a/src/version.nim +++ b/src/version.nim @@ -23,9 +23,9 @@ macro checkVersion(xs: static string, major, minor, patch: int) = gs & ").\n" & "Please run `make submodule` to update.") -tryImport chakasu/version, "chakasu" +tryImport chagashi/version, "chagashi" tryImport chame/version, "chame" static: - checkVersion("chakasu", 0, 3, 2) + checkVersion("chagashi", 0, 4, 0) checkVersion("chame", 0, 14, 3) diff --git a/todo b/todo index 1a668975..fcf640bd 100644 --- a/todo +++ b/todo @@ -4,9 +4,6 @@ compilation: * fbf for unifont * maybe use system wcwidth? charsets: -- rewrite it: - * push API (not based on std/streams) - * directly convert to/from UTF-8, not UTF-32 - set up some fuzzer - use appropriate charsets in forms, urls, etc. display: |