author     bptato <nincsnevem662@gmail.com>   2022-12-10 19:05:38 +0100
committer  bptato <nincsnevem662@gmail.com>   2022-12-10 19:05:38 +0100
commit     1e858c874804444bc4b95b6e89eb96a0deb8473c (patch)
tree       3151b498e19c6d6eed3d90827483eb270314f3da /src
parent     d963385cd9fd77f0a950c5b92be7774bbf76d661 (diff)
download   chawan-1e858c874804444bc4b95b6e89eb96a0deb8473c.tar.gz
Add support for the encoding standard, fix parseLegacyColor
Also, fix a bug in the
Diffstat (limited to 'src')
-rw-r--r--  src/buffer/buffer.nim          |  70
-rw-r--r--  src/data/charset.nim           | 433
-rw-r--r--  src/data/idna.nim              |   2
-rw-r--r--  src/html/htmlparser.nim        | 112
-rw-r--r--  src/html/htmltokenizer.nim     | 241
-rw-r--r--  src/io/posixstream.nim         |  40
-rw-r--r--  src/io/teestream.nim           |  44
-rw-r--r--  src/ips/socketstream.nim       |  10
-rw-r--r--  src/js/regex.nim               |  69
-rw-r--r--  src/render/rendertext.nim      |  75
-rw-r--r--  src/strings/charset.nim        |  69
-rw-r--r--  src/strings/decoderstream.nim  | 844
-rw-r--r--  src/types/color.nim            |  11
-rw-r--r--  src/utils/twtstr.nim           |   2
14 files changed, 1758 insertions(+), 264 deletions(-)
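
The core of the change is the two-pass charset handling visible in buffer.nim and htmlparser.nim below: parseHTML5 now takes a fallback charset and returns the charset it settled on, and when a late <meta charset=...> forces reinterpretation it returns a nil document, so the buffer rewinds its teed copy of the source and parses again, this time with certain confidence. A minimal, self-contained sketch of that control flow — parseOnce is a hypothetical stand-in for parseHTML5, not the real parser:

import streams, strutils

type
  Charset = enum
    csUTF8 = "UTF-8", csShiftJIS = "Shift_JIS"
  Document = ref object
    charset: Charset

# Hypothetical stand-in for parseHTML5: returns a nil Document when a late
# <meta> changes the encoding, plus the charset the parse settled on.
proc parseOnce(s: Stream, cs: Charset, certain: bool): (Document, Charset) =
  let src = s.readAll()
  if not certain and cs != csShiftJIS and "shift_jis" in src:
    return (nil, csShiftJIS)              # needsreinterpret
  (Document(charset: cs), cs)

proc loadHTML(s: Stream, fallback: Charset): Document =
  var res = parseOnce(s, fallback, certain = false)
  if res[0] == nil:          # meta changed the encoding: rewind, parse again
    s.setPosition(0)
    res = parseOnce(s, res[1], certain = true)
  res[0]

let doc = loadHTML(newStringStream("<meta charset=shift_jis>"), csUTF8)
assert doc.charset == csShiftJIS

This is also why the buffer now tees its network stream into sstream (io/teestream.nim below): the second pass needs the raw bytes again without re-downloading.
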
diff --git a/src/buffer/buffer.nim b/src/buffer/buffer.nim index 1048d7d2..0f425ecc 100644 --- a/src/buffer/buffer.nim +++ b/src/buffer/buffer.nim @@ -17,12 +17,15 @@ import css/cssparser import css/mediaquery import css/sheet import css/stylednode +import data/charset import config/config import html/dom import html/tags import html/htmlparser import io/loader import io/request +import io/posixstream +import io/teestream import ips/serialize import ips/serversocket import ips/socketstream @@ -53,6 +56,7 @@ type Buffer* = ref object alive: bool + cs: Charset lasttimeout: int timeout: int readbufsize: int @@ -525,9 +529,6 @@ proc loadResources(buffer: Buffer, document: Document) = for child in elem.children_rev: stack.add(child) -proc c_setvbuf(f: File, buf: pointer, mode: cint, size: csize_t): cint {. - importc: "setvbuf", header: "<stdio.h>", tags: [].} - type ConnectResult* = tuple[code: int, needsAuth: bool, redirect: Option[URL], contentType: string] proc setupSource(buffer: Buffer): ConnectResult = @@ -549,12 +550,7 @@ proc setupSource(buffer: Buffer): ConnectResult = buffer.contenttype = "text/plain" of LOAD_PIPE: discard fcntl(source.fd, F_SETFL, fcntl(source.fd, F_GETFL, 0) or O_NONBLOCK) - var f: File - if not open(f, source.fd, fmRead): - result.code = 1 - return - discard c_setvbuf(f, nil, IONBF, 0) - buffer.istream = newFileStream(f) + buffer.istream = newPosixStream(source.fd) if setct: buffer.contenttype = "text/plain" of LOAD_REQUEST: @@ -569,6 +565,7 @@ proc setupSource(buffer: Buffer): ConnectResult = SocketStream(buffer.istream).source.getFd().setBlocking(false) result.needsAuth = response.status == 401 # Unauthorized result.redirect = response.redirect + buffer.istream = newTeeStream(buffer.istream, buffer.sstream, closedest = false) if setct: result.contentType = buffer.contenttype buffer.loaded = true @@ -585,27 +582,39 @@ proc finishLoad(buffer: Buffer) = of "text/html": buffer.sstream.setPosition(0) buffer.available = 0 - buffer.document = parseHTML5(buffer.sstream) + let (doc, cs) = parseHTML5(buffer.sstream, fallbackcs = buffer.cs) + buffer.document = doc + if buffer.document == nil: # needsreinterpret + buffer.sstream.setPosition(0) + let (doc, _) = parseHTML5(buffer.sstream, cs = some(cs)) + buffer.document = doc buffer.document.location = buffer.location buffer.loadResources(buffer.document) buffer.istream.close() buffer.streamclosed = true -var sequential = 0 proc load*(buffer: Buffer): tuple[atend: bool, lines, bytes: int] {.proxy.} = var bytes = -1 if buffer.streamclosed: return (true, buffer.lines.len, bytes) let op = buffer.sstream.getPosition() - inc sequential var s = newString(buffer.readbufsize) try: - buffer.istream.readStr(buffer.readbufsize, s) - result = (s.len == 0, buffer.lines.len, bytes) + buffer.sstream.setPosition(op + buffer.available) + let n = buffer.istream.readData(addr s[0], buffer.readbufsize) + s.setLen(n) + result = (n == 0, buffer.lines.len, bytes) + buffer.sstream.setPosition(op) if buffer.readbufsize < BufferSize: buffer.readbufsize = min(BufferSize, buffer.readbufsize * 2) - except IOError: - # Presumably EAGAIN, unless the loader process crashed in which case we're screwed. - s = s.until('\0') #TODO this shouldn't be needed here... 
+ buffer.available += s.len + case buffer.contenttype + of "text/html": + bytes = buffer.available + else: + buffer.do_reshape() + if result.atend: + buffer.finishLoad() + except ErrorAgain, ErrorWouldBlock: buffer.timeout = buffer.lasttimeout if buffer.readbufsize == 1: if buffer.lasttimeout == 0: @@ -615,18 +624,6 @@ proc load*(buffer: Buffer): tuple[atend: bool, lines, bytes: int] {.proxy.} = else: buffer.readbufsize = buffer.readbufsize div 2 result = (false, buffer.lines.len, bytes) - if s != "": - buffer.sstream.setPosition(op + buffer.available) - buffer.sstream.write(s) - buffer.sstream.setPosition(op) - buffer.available += s.len - case buffer.contenttype - of "text/html": - bytes = buffer.available - else: - buffer.do_reshape() - if result.atend: - buffer.finishLoad() proc render*(buffer: Buffer): int {.proxy.} = buffer.do_reshape() @@ -640,7 +637,8 @@ proc cancel*(buffer: Buffer): int {.proxy.} = of "text/html": buffer.sstream.setPosition(0) buffer.available = 0 - buffer.document = parseHTML5(buffer.sstream) + let (doc, _) = parseHTML5(buffer.sstream, cs = some(buffer.cs)) # confidence: certain + buffer.document = doc buffer.document.location = buffer.location buffer.do_reshape() return buffer.lines.len @@ -1107,6 +1105,7 @@ proc launchBuffer*(config: BufferConfig, source: BufferSource, mainproc: Pid) = let buffer = Buffer( alive: true, + cs: CHARSET_UTF_8, userstyle: parseStylesheet(config.userstyle), attrs: attrs, config: config, @@ -1119,12 +1118,11 @@ proc launchBuffer*(config: BufferConfig, source: BufferSource, ) buffer.readbufsize = BufferSize buffer.selector = newSelector[int]() - buffer.sstream = newStringStream() buffer.srenderer = newStreamRenderer(buffer.sstream) - let sstream = connectSocketStream(mainproc, false) - sstream.swrite(getpid()) - buffer.pistream = sstream - buffer.postream = sstream - let rfd = int(sstream.source.getFd()) + let socks = connectSocketStream(mainproc, false) + socks.swrite(getpid()) + buffer.pistream = socks + buffer.postream = socks + let rfd = int(socks.source.getFd()) buffer.selector.registerHandle(rfd, {Read}, 0) buffer.runBuffer(rfd) diff --git a/src/data/charset.nim b/src/data/charset.nim new file mode 100644 index 00000000..f93a82b3 --- /dev/null +++ b/src/data/charset.nim @@ -0,0 +1,433 @@ +import algorithm +import os +import strutils +import tables + +type Charset* = enum + CHARSET_UNKNOWN + CHARSET_UTF_8 = "UTF-8" + CHARSET_IBM866 = "IBM866" + CHARSET_ISO_8859_2 = "ISO-8859-2" + CHARSET_ISO_8859_3 = "ISO-8859-3" + CHARSET_ISO_8859_4 = "ISO-8859-4" + CHARSET_ISO_8859_5 = "ISO-8859-5" + CHARSET_ISO_8859_6 = "ISO-8859-6" + CHARSET_ISO_8859_7 = "ISO-8859-7" + CHARSET_ISO_8859_8 = "ISO-8859-8" + CHARSET_ISO_8859_8_I = "ISO-8859-8-I" + CHARSET_ISO_8859_10 = "ISO-8859-10" + CHARSET_ISO_8859_13 = "ISO-8859-13" + CHARSET_ISO_8859_14 = "ISO-8859-14" + CHARSET_ISO_8859_15 = "ISO-8859-15" + CHARSET_ISO_8859_16 = "ISO-8859-16" + CHARSET_KOI8_R = "KOI8-R" + CHARSET_KOI8_U = "KOI8-U" + CHARSET_MACINTOSH = "macintosh" + CHARSET_WINDOWS_874 = "windows-874" + CHARSET_WINDOWS_1250 = "windows-1250" + CHARSET_WINDOWS_1251 = "windows-1251" + CHARSET_WINDOWS_1252 = "windows-1252" + CHARSET_WINDOWS_1253 = "windows-1253" + CHARSET_WINDOWS_1254 = "windows-1254" + CHARSET_WINDOWS_1255 = "windows-1255" + CHARSET_WINDOWS_1256 = "windows-1256" + CHARSET_WINDOWS_1257 = "windows-1257" + CHARSET_WINDOWS_1258 = "windows-1258" + CHARSET_X_MAC_CYRILLIC = "x-mac-cyrillic" + CHARSET_GBK = "GBK" + CHARSET_GB18030 = "gb18030" + CHARSET_BIG5 = "Big5" + CHARSET_EUC_JP = 
"EUC-JP" + CHARSET_ISO_2022_JP = "ISO-2022-JP" + CHARSET_SHIFT_JIS = "Shift_JIS" + CHARSET_EUC_KR = "EUC-KR" + CHARSET_REPLACEMENT = "replacement" + CHARSET_UTF_16_BE = "UTF-16BE" + CHARSET_UTF_16_LE = "UTF-16LE" + CHARSET_X_USER_DEFINED = "x-user-defined" + +const CharsetMap = { + # UTF-8 (The Encoding) + "unicode-1-1-utf-8": CHARSET_UTF_8, + "unicode11utf-8": CHARSET_UTF_8, + "unicode20utf-8": CHARSET_UTF_8, + "utf-8": CHARSET_UTF_8, + "utf8": CHARSET_UTF_8, + "x-unicode20utf8": CHARSET_UTF_8, + # IBM866 + "866": CHARSET_IBM_866, + "cp866": CHARSET_IBM_866, + "csibm866": CHARSET_IBM_866, + "ibm866": CHARSET_IBM_866, + # ISO-8859-2 + "csisolatin2": CHARSET_ISO_8859_2, + "iso-8859-2": CHARSET_ISO_8859_2, + "iso-ir-101": CHARSET_ISO_8859_2, + "iso8859-2": CHARSET_ISO_8859_2, + "iso88592": CHARSET_ISO_8859_2, + "iso_8859-2": CHARSET_ISO_8859_2, + "iso_8859-2:1987": CHARSET_ISO_8859_2, + "l2": CHARSET_ISO_8859_2, + "latin2": CHARSET_ISO_8859_2, + # ISO-8859-3 + "csisolatin3": CHARSET_ISO_8859_3, + "iso-8859-3": CHARSET_ISO_8859_3, + "iso-ir-109": CHARSET_ISO_8859_3, + "iso8859-3": CHARSET_ISO_8859_3, + "iso88593": CHARSET_ISO_8859_3, + "iso_8859-3": CHARSET_ISO_8859_3, + "iso_8859-3:1988": CHARSET_ISO_8859_3, + "l3": CHARSET_ISO_8859_3, + "latin3": CHARSET_ISO_8859_3, + # ISO-8859-4 + "csisolatin4": CHARSET_ISO_8859_4, + "iso-8859-4": CHARSET_ISO_8859_4, + "iso-ir-110": CHARSET_ISO_8859_4, + "iso8859-4": CHARSET_ISO_8859_4, + "iso88594": CHARSET_ISO_8859_4, + "iso_8859-4": CHARSET_ISO_8859_4, + "iso_8859-4:1988": CHARSET_ISO_8859_4, + "l4": CHARSET_ISO_8859_4, + "latin4": CHARSET_ISO_8859_4, + # ISO-8859-5 + "csisolatincyrillic": CHARSET_ISO_8859_5, + "cyrillic": CHARSET_ISO_8859_5, + "iso-8859-5": CHARSET_ISO_8859_5, + "iso-ir-144": CHARSET_ISO_8859_5, + "iso8859-5": CHARSET_ISO_8859_5, + "iso88595": CHARSET_ISO_8859_5, + "iso_8859-5": CHARSET_ISO_8859_5, + "iso_8859-5:1988": CHARSET_ISO_8859_5, + # ISO-8859-6 + "arabic": CHARSET_ISO_8859_6, + "asmo-708": CHARSET_ISO_8859_6, + "csiso88596e": CHARSET_ISO_8859_6, + "csiso88596i": CHARSET_ISO_8859_6, + "csisolatinarabic": CHARSET_ISO_8859_6, + "ecma-114": CHARSET_ISO_8859_6, + "iso-8859-6": CHARSET_ISO_8859_6, + "iso-8859-6-e": CHARSET_ISO_8859_6, + "iso-8859-6-i": CHARSET_ISO_8859_6, + "iso-ir-127": CHARSET_ISO_8859_6, + "iso8859-6": CHARSET_ISO_8859_6, + "iso88596": CHARSET_ISO_8859_6, + "iso_8859-6": CHARSET_ISO_8859_6, + "iso_8859-6:1987": CHARSET_ISO_8859_6, + # ISO-8859-7 + "csisolatingreek": CHARSET_ISO_8859_7, + "ecma-118": CHARSET_ISO_8859_7, + "elot_928": CHARSET_ISO_8859_7, + "greek": CHARSET_ISO_8859_7, + "greek8": CHARSET_ISO_8859_7, + "iso-8859-7": CHARSET_ISO_8859_7, + "iso-ir-126": CHARSET_ISO_8859_7, + "iso8859-7": CHARSET_ISO_8859_7, + "iso88597": CHARSET_ISO_8859_7, + "iso_8859-7": CHARSET_ISO_8859_7, + "iso_8859-7:1987": CHARSET_ISO_8859_7, + "sun_eu_greek": CHARSET_ISO_8859_7, + # ISO-8859-8 + "csiso88598e": CHARSET_ISO_8859_8, + "csisolatinhebrew": CHARSET_ISO_8859_8, + "hebrew": CHARSET_ISO_8859_8, + "iso-8859-8": CHARSET_ISO_8859_8, + "iso-8859-8-e": CHARSET_ISO_8859_8, + "iso-ir-138": CHARSET_ISO_8859_8, + "iso8859-8": CHARSET_ISO_8859_8, + "iso88598": CHARSET_ISO_8859_8, + "iso_8859-8": CHARSET_ISO_8859_8, + "iso_8859-8:1988": CHARSET_ISO_8859_8, + "visual": CHARSET_ISO_8859_8, + # ISO-8859-8-I + "csiso88598i": CHARSET_ISO_8859_8_I, + "iso-8859-8-i": CHARSET_ISO_8859_8_I, + "logical": CHARSET_ISO_8859_8_I, + # ISO-8859-10 + "csisolatin6": CHARSET_ISO_8859_10, + "iso-8859-10": CHARSET_ISO_8859_10, + "iso-ir-157": 
CHARSET_ISO_8859_10, + "iso8859-10": CHARSET_ISO_8859_10, + "iso885910": CHARSET_ISO_8859_10, + "l6": CHARSET_ISO_8859_10, + "latin6": CHARSET_ISO_8859_10, + # ISO-8859-13 + "iso-8859-13": CHARSET_ISO_8859_13, + "iso8859-13": CHARSET_ISO_8859_13, + "iso885913": CHARSET_ISO_8859_13, + # ISO-8859-14 + "iso-8859-14": CHARSET_ISO_8859_14, + "iso8859-14": CHARSET_ISO_8859_14, + "iso885914": CHARSET_ISO_8859_14, + # ISO-8859-15 + "csisolatin9": CHARSET_ISO_8859_15, + "iso-8859-15": CHARSET_ISO_8859_15, + "iso8859-15": CHARSET_ISO_8859_15, + "iso885915": CHARSET_ISO_8859_15, + "iso_8859-15": CHARSET_ISO_8859_15, + "l9": CHARSET_ISO_8859_15, + # ISO-8859-16 + "iso-8859-16": CHARSET_ISO_8859_16, + # KOI8-R + "cskoi8r": CHARSET_KOI8_R, + "koi": CHARSET_KOI8_R, + "koi8": CHARSET_KOI8_R, + "koi8-r": CHARSET_KOI8_R, + "koi8_r": CHARSET_KOI8_R, + # KOI8-U + "koi8-ru": CHARSET_KOI8_U, + "koi8-u": CHARSET_KOI8_U, + # macintosh + "csmacintosh": CHARSET_MACINTOSH, + "mac": CHARSET_MACINTOSH, + "macintosh": CHARSET_MACINTOSH, + "x-mac-roman": CHARSET_MACINTOSH, + # windows-874 + "dos-874": CHARSET_WINDOWS_874, + "iso-8859-11": CHARSET_WINDOWS_874, + "iso8859-11": CHARSET_WINDOWS_874, + "iso885911": CHARSET_WINDOWS_874, + "tis-620": CHARSET_WINDOWS_874, + "windows-874": CHARSET_WINDOWS_874, + # windows-1250 + "cp1250": CHARSET_WINDOWS_1250, + "windows-1250": CHARSET_WINDOWS_1250, + "x-cp1250" : CHARSET_WINDOWS_1250, + # windows-1251 + "cp1251": CHARSET_WINDOWS_1251, + "windows-1251": CHARSET_WINDOWS_1251, + "x-cp1251": CHARSET_WINDOWS_1251, + # windows-1252 + "ansi_x3.4-1968": CHARSET_WINDOWS_1252, + "ascii": CHARSET_WINDOWS_1252, # lol + "cp1252": CHARSET_WINDOWS_1252, + "cp819": CHARSET_WINDOWS_1252, + "csisolatin1": CHARSET_WINDOWS_1252, + "ibm819": CHARSET_WINDOWS_1252, + "iso-8859-1": CHARSET_WINDOWS_1252, + "iso88591": CHARSET_WINDOWS_1252, + "iso_8859-1:1987": CHARSET_WINDOWS_1252, + "l1": CHARSET_WINDOWS_1252, + "latin1": CHARSET_WINDOWS_1252, + "us-ascii": CHARSET_WINDOWS_1252, + "windows-1252": CHARSET_WINDOWS_1252, + "x-cp1252": CHARSET_WINDOWS_1252, + # windows-1253 + "cp1253": CHARSET_WINDOWS_1253, + "windows-1253": CHARSET_WINDOWS_1253, + "x-cp1253": CHARSET_WINDOWS_1253, + # windows-1254 + "cp1254": CHARSET_WINDOWS_1254, + "csisolatin5": CHARSET_WINDOWS_1254, + "iso-8859-9": CHARSET_WINDOWS_1254, + "iso-ir-148": CHARSET_WINDOWS_1254, + "iso8859-9": CHARSET_WINDOWS_1254, + "iso88599": CHARSET_WINDOWS_1254, + "iso_8859-9": CHARSET_WINDOWS_1254, + "iso_8859-9:1989": CHARSET_WINDOWS_1254, + "l5": CHARSET_WINDOWS_1254, + "latin5": CHARSET_WINDOWS_1254, + "windows-1254": CHARSET_WINDOWS_1254, + "x-cp1254": CHARSET_WINDOWS_1254, + # windows-1255 + "cp1255": CHARSET_WINDOWS_1255, + "windows-1255": CHARSET_WINDOWS_1255, + "x-cp1255": CHARSET_WINDOWS_1255, + # windows-1256 + "cp1256": CHARSET_WINDOWS_1256, + "windows-1256": CHARSET_WINDOWS_1256, + "x-cp1256": CHARSET_WINDOWS_1256, + # windows-1257 + "cp1257": CHARSET_WINDOWS_1257, + "windows-1257": CHARSET_WINDOWS_1257, + "x-cp1257": CHARSET_WINDOWS_1257, + # windows-1258 + "cp1258": CHARSET_WINDOWS_1258, + "windows-1258": CHARSET_WINDOWS_1258, + "x-cp1258": CHARSET_WINDOWS_1258, + # x-mac-cyrillic + "x-mac-cyrillic": CHARSET_X_MAC_CYRILLIC, + "x-mac-ukrainian": CHARSET_X_MAC_CYRILLIC, + # GBK + "chinese": CHARSET_GBK, + "csgb2312": CHARSET_GBK, + "csiso58gb231280": CHARSET_GBK, + "gb2312": CHARSET_GBK, + "gb_2312": CHARSET_GBK, + "gb_2312-80": CHARSET_GBK, + "gbk": CHARSET_GBK, + "iso-ir-58": CHARSET_GBK, + "x-gbk": CHARSET_GBK, + # gb18030 + "gb18030": 
CHARSET_GB18030, + # Big5 + "big5": CHARSET_BIG5, + "big5-hkscs": CHARSET_BIG5, + "cn-big5": CHARSET_BIG5, + "csbig5": CHARSET_BIG5, + "x-x-big5": CHARSET_BIG5, + # EUC-JP + "cseucpkdfmtjapanese": CHARSET_EUC_JP, + "euc-jp": CHARSET_EUC_JP, + "x-euc-jp": CHARSET_EUC_JP, + # ISO-2022-JP (ugh) + "csiso2022jp": CHARSET_ISO_2022_JP, + "iso-2022-jp": CHARSET_ISO_2022_JP, + # Shift_JIS + "csshiftjis": CHARSET_SHIFT_JIS, + "ms932": CHARSET_SHIFT_JIS, + "ms_kanji": CHARSET_SHIFT_JIS, + "shift-jis": CHARSET_SHIFT_JIS, + "shift_jis": CHARSET_SHIFT_JIS, + "sjis": CHARSET_SHIFT_JIS, + "windows-31j": CHARSET_SHIFT_JIS, + "x-sjis": CHARSET_SHIFT_JIS, + # EUC-KR + "cseuckr": CHARSET_EUC_KR, + "csksc56011987": CHARSET_EUC_KR, + "euc-kr": CHARSET_EUC_KR, + "iso-ir-149": CHARSET_EUC_KR, + "korean": CHARSET_EUC_KR, + "ks_c_5601-1987": CHARSET_EUC_KR, + "ks_c_5601-1989": CHARSET_EUC_KR, + "ksc5601": CHARSET_EUC_KR, + "ksc_5601": CHARSET_EUC_KR, + "windows-949": CHARSET_EUC_KR, + # replacement + "csiso2022kr": CHARSET_REPLACEMENT, + "hz-gb-2312": CHARSET_REPLACEMENT, + "iso-2022-cn": CHARSET_REPLACEMENT, + "iso-2022-cn-ext": CHARSET_REPLACEMENT, + "iso-2022-kr": CHARSET_REPLACEMENT, + "replacement": CHARSET_REPLACEMENT, + # UTF-16BE + "unicodefffe": CHARSET_UTF_16_BE, + "utf-16be": CHARSET_UTF_16_BE, + # UTF-16LE + "csunicode": CHARSET_UTF_16_LE, + "iso-10646-ucs-2": CHARSET_UTF_16_LE, + "ucs-2": CHARSET_UTF_16_LE, + "unicode": CHARSET_UTF_16_LE, + "unicodefeff": CHARSET_UTF_16_LE, + "utf-16": CHARSET_UTF_16_LE, + "utf-16le": CHARSET_UTF_16_LE, + # x-user-defined + "x-user-defined": CHARSET_X_USER_DEFINED +}.toTable() + +proc getCharset*(s: string): Charset = + return CharsetMap.getOrDefault(s.strip().toLower(), CHARSET_UNKNOWN) + +iterator mappairs(path: string): tuple[a, b: int] = + let s = staticRead(path) + for line in s.split('\n'): + if line.len == 0 or line[0] == '#': continue + var i = 0 + while line[i] == ' ': inc i + var j = i + while i < line.len and line[i] in '0'..'9': inc i + let index = parseInt(line.substr(j, i - 1)) + inc i # tab + j = i + while i < line.len and line[i] in {'0'..'9', 'A'..'F', 'x'}: inc i + let n = parseHexInt(line.substr(j, i - 1)) + yield (index, n) + +# I'm pretty sure single-byte encodings map to ucs-2. +func loadCharsetMap8(path: string): tuple[ + decode: array[char, uint16], + encode: seq[ + tuple[ + ucs: uint16, + val: char + ] + ], + ] = + var m: int + for index, n in mappairs("res/map" / path): + result.decode[char(index)] = uint16(n) + if index > m: m = index + for index in low(char) .. 
char(m): + let val = result.decode[index] + if val != 0u16: + result.encode.add((val, index)) + result.encode.sort() + +func loadCharsetMap8Encode(path: string): seq[tuple[ucs: uint16, val: char]] = + for index, n in mappairs("res/map" / path): + result.add((uint16(n), char(index))) + result.sort() + +func loadGb18030Ranges(path: string): tuple[ + decode: seq[ + tuple[ + p: uint16, + ucs: uint16 ]], + encode: seq[ + tuple[ + ucs: uint16, + p: uint16 ]]] = + for index, n in mappairs("res/map" / path): + if uint32(index) > uint32(high(uint16)): break + result.decode.add((uint16(index), uint16(n))) + result.encode.add((uint16(n), uint16(index))) + result.encode.sort() + +func loadCharsetMap16(path: string, len: static uint16): tuple[ + decode: array[len, uint16], + encode: seq[ + tuple[ + ucs: uint16, + p: uint16 ]]] = + for index, n in mappairs("res/map" / path): + result.decode[uint16(index)] = uint16(n) + result.encode.add((uint16(n), uint16(index))) + result.encode.sort() + +func loadBig5Map(path: string, offset: static uint16): tuple[ + decode: array[19782u16 - offset, uint32], # ouch (+75KB...) + encode: seq[ + tuple[ + ucs: uint32, + p: uint16 ]]] = + for index, n in mappairs("res/map" / path): + result.decode[uint16(index) - offset] = uint32(n) + result.encode.add((uint32(n), uint16(index))) + #for i in result.decode: assert x != 0 # fail + result.encode.sort() + +const (IBM866Decode*, IBM866Encode*) = loadCharsetMap8("index-ibm866.txt") +const (ISO88592Decode*, ISO88592Encode*) = loadCharsetMap8("index-iso-8859-2.txt") +const (ISO88593Decode*, ISO88593Encode*) = loadCharsetMap8("index-iso-8859-3.txt") +const (ISO88594Decode*, ISO88594Encode*) = loadCharsetMap8("index-iso-8859-4.txt") +const (ISO88595Decode*, ISO88595Encode*) = loadCharsetMap8("index-iso-8859-5.txt") +const (ISO88596Decode*, ISO88596Encode*) = loadCharsetMap8("index-iso-8859-6.txt") +const (ISO88597Decode*, ISO88597Encode*) = loadCharsetMap8("index-iso-8859-7.txt") +const (ISO88598Decode*, ISO88598Encode*) = loadCharsetMap8("index-iso-8859-8.txt") +const (ISO885910Decode*, ISO885910Encode*) = loadCharsetMap8("index-iso-8859-10.txt") +const (ISO885913Decode*, ISO885913Encode*) = loadCharsetMap8("index-iso-8859-13.txt") +const (ISO885914Decode*, ISO885914Encode*) = loadCharsetMap8("index-iso-8859-14.txt") +const (ISO885915Decode*, ISO885915Encode*) = loadCharsetMap8("index-iso-8859-15.txt") +const (ISO885916Decode*, ISO885916Encode*) = loadCharsetMap8("index-iso-8859-16.txt") +const (KOI8RDecode*, KOI8REncode*) = loadCharsetMap8("index-koi8-r.txt") +const (KOI8UDecode*, KOI8UEncode*) = loadCharsetMap8("index-koi8-u.txt") +const (MacintoshDecode*, MacintoshEncode*) = loadCharsetMap8("index-macintosh.txt") +const (Windows874Decode*, Windows874Encode*) = loadCharsetMap8("index-windows-874.txt") +const (Windows1250Decode*, Windows1250Encode*) = loadCharsetMap8("index-windows-1250.txt") +const (Windows1251Decode*, Windows1251Encode*) = loadCharsetMap8("index-windows-1251.txt") +const (Windows1252Decode*, Windows1252Encode*) = loadCharsetMap8("index-windows-1252.txt") +const (Windows1253Decode*, Windows1253Encode*) = loadCharsetMap8("index-windows-1253.txt") +const (Windows1254Decode*, Windows1254Encode*) = loadCharsetMap8("index-windows-1254.txt") +const (Windows1255Decode*, Windows1255Encode*) = loadCharsetMap8("index-windows-1255.txt") +const (Windows1256Decode*, Windows1256Encode*) = loadCharsetMap8("index-windows-1256.txt") +const (Windows1257Decode*, Windows1257Encode*) = loadCharsetMap8("index-windows-1257.txt") +const 
(Windows1258Decode*, Windows1258Encode*) = loadCharsetMap8("index-windows-1258.txt") +const (XMacCyrillicDecode*, XMacCyrillicEncode*) = loadCharsetMap8("index-x-mac-cyrillic.txt") +const (Gb18030RangesDecode*, Gb18030RangesEncode*) = loadGb18030Ranges("index-gb18030-ranges.txt") +const (Gb18030Decode*, Gb18030Encode*) = loadCharsetMap16("index-gb18030.txt", len = 23940) +#for x in Gb18030Decode: assert x != 0 # success +const Big5DecodeOffset* = 942 +const (Big5Decode*, Big5Encode*) = loadBig5Map("index-big5.txt", offset = Big5DecodeOffset) +const (Jis0208Decode*, Jis0208Encode*) = loadCharsetMap16("index-jis0208.txt", len = 11104) +const (Jis0212Decode*, Jis0212Encode*) = loadCharsetMap16("index-jis0212.txt", len = 7211) +const ISO2022JPKatakanaEncode* = loadCharsetMap8Encode("index-iso-2022-jp-katakana.txt") +const (EUCKRDecode*, EUCKREncode*) = loadCharsetMap16("index-euc-kr.txt", len = 23750) diff --git a/src/data/idna.nim b/src/data/idna.nim index f204e934..b636aa8c 100644 --- a/src/data/idna.nim +++ b/src/data/idna.nim @@ -15,7 +15,7 @@ type FullRangeList = (seq[(uint16, uint16)], seq[(uint32, uint32)]) FullSet = (set[uint16], HashSet[uint32]) -const IdnaMappingTable = staticRead"res/IdnaMappingTable.txt" +const IdnaMappingTable = staticRead"res/map/IdnaMappingTable.txt" func loadStuff(s: string): (FullMap[cstring], # Map FullRangeList, # Disallowed Ranges diff --git a/src/html/htmlparser.nim b/src/html/htmlparser.nim index 65da5c4e..ce8ce2bb 100644 --- a/src/html/htmlparser.nim +++ b/src/html/htmlparser.nim @@ -3,17 +3,23 @@ import options import sequtils import streams import strformat +import strutils import tables import unicode import css/sheet +import data/charset import html/dom import html/tags import html/htmltokenizer import js/javascript +import strings/decoderstream import utils/twtstr type + CharsetConfidence = enum + CONFIDENCE_TENTATIVE, CONFIDENCE_CERTAIN, CONFIDENCE_IRRELEVANT + DOMParser = ref object # JS interface OpenElements = seq[Element] @@ -22,6 +28,9 @@ type case fragment: bool of true: ctx: Element else: discard + needsreinterpret: bool + charset: Charset + confidence: CharsetConfidence openElements: OpenElements insertionMode: InsertionMode oldInsertionMode: InsertionMode @@ -548,6 +557,54 @@ template pop_current_node = discard parser.popElement() func isHTMLIntegrationPoint(node: Element): bool = return false #TODO SVG (NOTE MathML not implemented) +func extractEncFromMeta(s: string): Charset = + var i = 0 + while true: # Loop: + var j = 0 + while i < s.len: + template check(c: static char) = + if s[i] in {c, c.toUpperAscii()}: inc j + else: j = 0 + case j + of 0: check 'c' + of 1: check 'h' + of 2: check 'a' + of 3: check 'r' + of 4: check 's' + of 5: check 'e' + of 6: check 't' + of 7: + inc i + break + else: discard + inc i + if j < 7: return CHARSET_UNKNOWN + j = 0 + while i < s.len and s[i] in AsciiWhitespace: inc i + if i >= s.len or s[i] != '=': continue + while i < s.len and s[i] in AsciiWhitespace: inc i + break + if i >= s.len: return CHARSET_UNKNOWN + if s[i] in {'"', '\''}: + let s2 = s.substr(i).until(s[i]) + if s2.len == 0 or s2[^1] != s[i]: + return CHARSET_UNKNOWN + return getCharset(s2) + return getCharset(s.substr(i - 1).until({';', ' '})) + +proc changeEncoding(parser: var HTML5Parser, cs: Charset) = + if parser.charset in {CHARSET_UTF_16_LE, CHARSET_UTF_16_BE}: + parser.confidence = CONFIDENCE_CERTAIN + return + parser.confidence = CONFIDENCE_CERTAIN + if cs == parser.charset: + return + if cs == CHARSET_X_USER_DEFINED: + 
parser.charset = CHARSET_WINDOWS_1252 + else: + parser.charset = cs + parser.needsreinterpret = true + # Following is an implementation of the state (?) machine defined in # https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inhtml # It uses the ad-hoc pattern matching macro `match' to apply the following @@ -562,7 +619,7 @@ func isHTMLIntegrationPoint(node: Element): bool = # pseudo-goto by breaking out only when the else statement needn't be # executed. # -# e.g. the following code: +# For example, the following code: # # match token: # TokenType.COMMENT => (block: echo "comment") @@ -644,7 +701,7 @@ macro match(token: Token, body: typed): untyped = ofBranches[CHARACTER_ASCII].painted = true else: error fmt"Unsupported curly of kind {pattern[0].kind}" of nnkStrLit: - var tempTokenizer = newTokenizer(newStringStream(pattern.strVal)) + var tempTokenizer = newTokenizer(pattern.strVal) for token in tempTokenizer.tokenize: let tt = int(token.tagtype) case token.t @@ -811,9 +868,16 @@ proc processInHTMLContent(parser: var HTML5Parser, token: Token, insertionMode = pop_current_node ) "<meta>" => (block: - discard parser.insertHTMLElement(token) + let element = parser.insertHTMLElement(token) pop_current_node - #TODO encodings + if parser.confidence == CONFIDENCE_TENTATIVE: + let cs = getCharset(element.attr("charset")) + if cs != CHARSET_UNKNOWN: + parser.changeEncoding(cs) + elif element.attr("http-equiv").equalsIgnoreCase("Content-Type"): + let cs = extractEncFromMeta(element.attr("content")) + if cs != CHARSET_UNKNOWN: + parser.changeEncoding(cs) ) "<title>" => (block: parser.genericRCDATAElementParsingAlgorithm(token)) "<noscript>" => (block: @@ -2092,17 +2156,48 @@ proc constructTree(parser: var HTML5Parser): Document = parser.processInHTMLContent(token) else: parser.processInForeignContent(token) + if parser.needsreinterpret: + return nil #TODO document.write (?) #TODO etc etc... 
return parser.document -proc parseHTML5*(inputStream: Stream): Document = +proc parseHTML5*(inputStream: Stream, cs = none(Charset), fallbackcs = CHARSET_UTF_8): (Document, Charset) = var parser: HTML5Parser + var bom: string + if cs.isSome: + parser.charset = cs.get + parser.confidence = CONFIDENCE_CERTAIN + else: + # bom sniff + const u8bom = char(0xEF) & char(0xBB) & char(0xBF) + const bebom = char(0xFE) & char(0xFF) + const lebom = char(0xFF) & char(0xFE) + bom = inputStream.readStr(2) + if bom == bebom: + parser.charset = CHARSET_UTF_16_BE + parser.confidence = CONFIDENCE_CERTAIN + bom = "" + elif bom == lebom: + parser.charset = CHARSET_UTF_16_LE + parser.confidence = CONFIDENCE_CERTAIN + bom = "" + else: + bom &= inputStream.readChar() + if bom == u8bom: + parser.charset = CHARSET_UTF_8 + parser.confidence = CONFIDENCE_CERTAIN + bom = "" + else: + parser.charset = fallbackcs + let decoder = newDecoderStream(inputStream, parser.charset) + for c in bom: + decoder.prepend(cast[uint32](c)) parser.document = newDocument() - parser.tokenizer = inputStream.newTokenizer() - return parser.constructTree() + parser.tokenizer = newTokenizer(decoder) + return (parser.constructTree(), parser.charset) proc newDOMParser*(): DOMParser {.jsctor.} = new(result) @@ -2110,7 +2205,8 @@ proc newDOMParser*(): DOMParser {.jsctor.} = proc parseFromString*(parser: DOMParser, str: string, t: string): Document {.jserr, jsfunc.} = case t of "text/html": - return parseHTML5(newStringStream(str)) + let (res, _) = parseHTML5(newStringStream(str)) + return res of "text/xml", "application/xml", "application/xhtml+xml", "image/svg+xml": JS_THROW JS_InternalError, "XML parsing is not supported yet" else: diff --git a/src/html/htmltokenizer.nim b/src/html/htmltokenizer.nim index c8f96144..08779c24 100644 --- a/src/html/htmltokenizer.nim +++ b/src/html/htmltokenizer.nim @@ -1,5 +1,4 @@ import options -import streams import strformat import strutils import macros @@ -8,6 +7,7 @@ import unicode import html/entity import html/tags +import strings/decoderstream import utils/radixtree import utils/twtstr @@ -16,7 +16,6 @@ type Tokenizer* = object state*: TokenizerState rstate: TokenizerState - curr: Rune tmp: string code: int tok: Token @@ -25,10 +24,9 @@ type attrv: string attr: bool - istream: Stream - sbuf: string + decoder: DecoderStream + sbuf: seq[Rune] sbuf_i: int - sbuf_ip: int eof_i: int TokenType* = enum @@ -97,65 +95,67 @@ func `$`*(tok: Token): string = of COMMENT: fmt"{tok.t} {tok.data}" of EOF: fmt"{tok.t}" -const bufSize = 4096 -const copyBufSize = 16 -proc newTokenizer*(s: Stream): Tokenizer = - result.sbuf = newString(bufSize) - result.istream = s - result.eof_i = -1 - if result.istream.atEnd: - result.eof_i = 0 - else: - let n = s.readDataStr(result.sbuf, 0..bufSize-1) - if n != bufSize: - result.eof_i = n +const bufLen = 1024 # * 4096 bytes +const copyBufLen = 16 # * 64 bytes + +proc readn(t: var Tokenizer) = + let l = t.sbuf.len + t.sbuf.setLen(bufLen) + let n = t.decoder.readData(addr t.sbuf[l], (bufLen - l) * sizeof(Rune)) + t.sbuf.setLen(l + n div sizeof(Rune)) + if t.decoder.atEnd: + t.eof_i = t.sbuf.len + +proc newTokenizer*(s: DecoderStream): Tokenizer = + var t = Tokenizer( + decoder: s, + sbuf: newSeqOfCap[Rune](bufLen), + eof_i: -1, + sbuf_i: 0 + ) + t.readn() + return t + +proc newTokenizer*(s: string): Tokenizer = + let rs = s.toRunes() + var t = Tokenizer( + sbuf: rs, + eof_i: rs.len, + sbuf_i: 0 + ) + return t func atEof(t: Tokenizer): bool = t.eof_i != -1 and t.sbuf_i >= t.eof_i -proc 
consume(t: var Tokenizer): char {.inline.} = - if t.eof_i == -1 and t.sbuf_i >= bufSize-copyBufSize: - # Workaround to swap buffer without breaking fastRuneAt. - var sbuf2 = newString(copyBufSize) - var i = 0 - while t.sbuf_i + i < bufSize: - sbuf2[i] = t.sbuf[t.sbuf_i + i] - inc i - let n = t.istream.readDataStr(t.sbuf, i..bufSize-1) - if n != bufSize - i: - t.eof_i = i + n +proc consume(t: var Tokenizer): Rune = + if t.sbuf_i >= min(bufLen - copyBufLen, t.sbuf.len): + for i in t.sbuf_i ..< t.sbuf.len: + t.sbuf[i - t.sbuf_i] = t.sbuf[i] + t.sbuf.setLen(t.sbuf.len - t.sbuf_i) t.sbuf_i = 0 - - var j = 0 - while j < i: - t.sbuf[j] = sbuf2[j] - inc j - - assert t.eof_i == -1 or t.sbuf_i < t.eof_i # not consuming eof... - t.sbuf_ip = t.sbuf_i # save previous pointer for potential reconsume - - # Normalize newlines (\r\n -> \n, single \r -> \n) - if t.sbuf[t.sbuf_i] == '\r': + if t.sbuf.len < bufLen: + t.readn() + ## Normalize newlines (\r\n -> \n, single \r -> \n) + if t.sbuf[t.sbuf_i] == Rune('\r'): inc t.sbuf_i - if t.sbuf[t.sbuf_i] != '\n': + if t.sbuf[t.sbuf_i] != Rune('\n'): # \r - result = '\n' - t.curr = Rune('\n') + result = Rune('\n') return # else, \r\n so just return the \n - result = t.sbuf[t.sbuf_i] - fastRuneAt(t.sbuf, t.sbuf_i, t.curr) + inc t.sbuf_i proc reconsume(t: var Tokenizer) = - t.sbuf_i = t.sbuf_ip + dec t.sbuf_i iterator tokenize*(tokenizer: var Tokenizer): Token = template emit(tok: Token) = if tok.t == START_TAG: tokenizer.laststart = tok if tok.t in {START_TAG, END_TAG}: - tok.tagtype = tagType(tok.tagName) + tok.tagtype = tagType(tok.tagname) yield tok template emit(tok: TokenType) = emit Token(t: tok) template emit(rn: Rune) = emit Token(t: CHARACTER, r: rn) @@ -173,7 +173,7 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = elif c in Ascii: emit c else: - emit tokenizer.curr + emit r template emit_replacement = emit Rune(0xFFFD) template switch_state(s: TokenizerState) = tokenizer.state = s @@ -199,23 +199,40 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = if tokenizer.attr: tokenizer.attrv &= c template peek_str(s: string): bool = - # WARNING: will break on strings with copyBufSize + 4 bytes - assert s.len < copyBufSize - 4 and s.len > 0 - if tokenizer.sbuf_i + s.len > tokenizer.eof_i: + # WARNING: will break on strings with copyBufLen + 4 bytes + # WARNING: only works with ascii + assert s.len < copyBufLen - 4 and s.len > 0 + if tokenizer.eof_i != -1 and tokenizer.sbuf_i + s.len >= tokenizer.eof_i: false else: - let slice = tokenizer.sbuf[tokenizer.sbuf_i..tokenizer.sbuf_i+s.high] - s == slice + var b = true + for i in 0 ..< s.len: + let c = tokenizer.sbuf[tokenizer.sbuf_i + i] + if not c.isAscii() or cast[char](c) != s[i]: + b = false + break + b + template peek_str_nocase(s: string): bool = - # WARNING: will break on strings with copyBufSize + 4 bytes + # WARNING: will break on strings with copyBufLen + 4 bytes # WARNING: only works with UPPER CASE ascii - assert s.len < copyBufSize - 4 and s.len > 0 - if tokenizer.sbuf_i + s.len > tokenizer.eof_i: + assert s.len < copyBufLen - 4 and s.len > 0 + if tokenizer.eof_i != -1 and tokenizer.sbuf_i + s.len >= tokenizer.eof_i: false else: - let slice = tokenizer.sbuf[tokenizer.sbuf_i..tokenizer.sbuf_i+s.high] - s == slice.toUpperAscii() - template peek_char(): char = tokenizer.sbuf[tokenizer.sbuf_i] + var b = true + for i in 0 ..< s.len: + let c = tokenizer.sbuf[tokenizer.sbuf_i + i] + if not c.isAscii() or cast[char](c).toUpperAscii() != s[i]: + b = false + break + b + template peek_char(): char = + 
let r = tokenizer.sbuf[tokenizer.sbuf_i] + if r.isAscii(): + cast[char](r) + else: + char(128) template has_adjusted_current_node(): bool = false #TODO implement this template consume_and_discard(n: int) = #TODO optimize var i = 0 @@ -298,17 +315,17 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = template has_anything_else = discard # does nothing const null = char(0) - const whitespace = {'\t', '\n', '\f', ' '} while true: {.computedGoto.} #eprint tokenizer.state #debug let is_eof = tokenizer.atEof # set eof here, otherwise we would exit at the last character - let c = if not is_eof: + let r = if not is_eof: tokenizer.consume() else: # avoid consuming eof... - null + Rune(null) + let c = if r.isAscii(): cast[char](r) else: char(128) stateMachine: # => case tokenizer.state of DATA: case c @@ -394,19 +411,19 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = of TAG_NAME: case c - of whitespace: switch_state BEFORE_ATTRIBUTE_NAME + of AsciiWhitespace: switch_state BEFORE_ATTRIBUTE_NAME of '/': switch_state SELF_CLOSING_START_TAG of '>': switch_state DATA emit_tok - of AsciiUpperAlpha: tokenizer.tok.tagname &= char(tokenizer.curr).tolower() + of AsciiUpperAlpha: tokenizer.tok.tagname &= c.tolower() of null: parse_error unexpected_null_character tokenizer.tok.tagname &= Rune(0xFFFD) of eof: parse_error eof_in_tag emit_eof - else: tokenizer.tok.tagname &= tokenizer.curr + else: tokenizer.tok.tagname &= r of RCDATA_LESS_THAN_SIGN: case c @@ -430,7 +447,7 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = of RCDATA_END_TAG_NAME: has_anything_else case c - of whitespace: + of AsciiWhitespace: if is_appropriate_end_tag_token: switch_state BEFORE_ATTRIBUTE_NAME else: @@ -447,8 +464,8 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = else: anything_else of AsciiAlpha: # note: merged upper & lower - tokenizer.tok.tagname &= char(tokenizer.curr).tolower() - tokenizer.tmp &= tokenizer.curr + tokenizer.tok.tagname &= c.tolower() + tokenizer.tmp &= c else: new_token nil #TODO emit '<' @@ -478,7 +495,7 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = of RAWTEXT_END_TAG_NAME: has_anything_else case c - of whitespace: + of AsciiWhitespace: if is_appropriate_end_tag_token: switch_state BEFORE_ATTRIBUTE_NAME else: @@ -495,8 +512,8 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = else: anything_else of AsciiAlpha: # note: merged upper & lower - tokenizer.tok.tagname &= char(tokenizer.curr).tolower() - tokenizer.tmp &= tokenizer.curr + tokenizer.tok.tagname &= c.tolower() + tokenizer.tmp &= c else: new_token nil #TODO emit '<' @@ -531,7 +548,7 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = of SCRIPT_DATA_END_TAG_NAME: has_anything_else case c - of whitespace: + of AsciiWhitespace: if is_appropriate_end_tag_token: switch_state BEFORE_ATTRIBUTE_NAME else: @@ -548,8 +565,8 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = else: anything_else of AsciiAlpha: # note: merged upper & lower - tokenizer.tok.tagname &= char(tokenizer.curr).tolower() - tokenizer.tmp &= tokenizer.curr + tokenizer.tok.tagname &= c.tolower() + tokenizer.tmp &= c else: emit '<' emit '/' @@ -650,7 +667,7 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = of SCRIPT_DATA_ESCAPED_END_TAG_NAME: has_anything_else case c - of whitespace: + of AsciiWhitespace: if is_appropriate_end_tag_token: switch_state BEFORE_ATTRIBUTE_NAME else: @@ -666,8 +683,8 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = else: anything_else of AsciiAlpha: - tokenizer.tok.tagname &= 
char(tokenizer.curr).tolower() - tokenizer.tmp &= tokenizer.curr + tokenizer.tok.tagname &= c.tolower() + tokenizer.tmp &= c else: emit '<' emit '/' @@ -676,7 +693,7 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = of SCRIPT_DATA_DOUBLE_ESCAPE_START: case c - of whitespace, '/', '>': + of AsciiWhitespace, '/', '>': if tokenizer.tmp == "script": switch_state SCRIPT_DATA_DOUBLE_ESCAPED else: @@ -750,7 +767,7 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = of SCRIPT_DATA_DOUBLE_ESCAPE_END: case c - of whitespace, '/', '>': + of AsciiWhitespace, '/', '>': if tokenizer.tmp == "script": switch_state SCRIPT_DATA_ESCAPED else: @@ -764,7 +781,7 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = of BEFORE_ATTRIBUTE_NAME: case c - of whitespace: discard + of AsciiWhitespace: discard of '/', '>', eof: reconsume_in AFTER_ATTRIBUTE_NAME of '=': parse_error unexpected_equals_sign_before_attribute_name @@ -777,7 +794,7 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = of ATTRIBUTE_NAME: has_anything_else case c - of whitespace, '/', '>', eof: + of AsciiWhitespace, '/', '>', eof: leave_attribute_name_state reconsume_in AFTER_ATTRIBUTE_NAME of '=': @@ -792,11 +809,11 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = parse_error unexpected_character_in_attribute_name anything_else else: - tokenizer.attrn &= tokenizer.curr + tokenizer.attrn &= r of AFTER_ATTRIBUTE_NAME: case c - of whitespace: discard + of AsciiWhitespace: discard of '/': switch_state SELF_CLOSING_START_TAG of '=': switch_state BEFORE_ATTRIBUTE_VALUE of '>': @@ -811,7 +828,7 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = of BEFORE_ATTRIBUTE_VALUE: case c - of whitespace: discard + of AsciiWhitespace: discard of '"': switch_state ATTRIBUTE_VALUE_DOUBLE_QUOTED of '\'': switch_state ATTRIBUTE_VALUE_SINGLE_QUOTED of '>': @@ -830,7 +847,7 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = of eof: parse_error eof_in_tag emit_eof - else: append_to_current_attr_value tokenizer.curr + else: append_to_current_attr_value r of ATTRIBUTE_VALUE_SINGLE_QUOTED: case c @@ -842,11 +859,11 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = of eof: parse_error eof_in_tag emit_eof - else: append_to_current_attr_value tokenizer.curr + else: append_to_current_attr_value r of ATTRIBUTE_VALUE_UNQUOTED: case c - of whitespace: switch_state BEFORE_ATTRIBUTE_NAME + of AsciiWhitespace: switch_state BEFORE_ATTRIBUTE_NAME of '&': switch_state_return CHARACTER_REFERENCE of '>': switch_state DATA @@ -860,11 +877,11 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = of eof: parse_error eof_in_tag emit_eof - else: append_to_current_attr_value tokenizer.curr + else: append_to_current_attr_value r of AFTER_ATTRIBUTE_VALUE_QUOTED: case c - of whitespace: + of AsciiWhitespace: switch_state BEFORE_ATTRIBUTE_NAME of '/': switch_state SELF_CLOSING_START_TAG @@ -874,7 +891,7 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = of eof: parse_error eof_in_tag emit_eof - else: append_to_current_attr_value tokenizer.curr + else: append_to_current_attr_value r of SELF_CLOSING_START_TAG: case c @@ -899,7 +916,7 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = emit_tok emit_eof of null: parse_error unexpected_null_character - else: tokenizer.tok.data &= tokenizer.curr + else: tokenizer.tok.data &= r of MARKUP_DECLARATION_OPEN: # note: rewritten to fit case model as we consume a char anyway has_anything_else @@ -967,7 +984,7 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = parse_error eof_in_comment emit_tok 
emit_eof - else: tokenizer.tok.data &= tokenizer.curr + else: tokenizer.tok.data &= r of COMMENT_LESS_THAN_SIGN: case c @@ -1037,7 +1054,7 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = of DOCTYPE: case c - of whitespace: switch_state BEFORE_DOCTYPE_NAME + of AsciiWhitespace: switch_state BEFORE_DOCTYPE_NAME of '>': reconsume_in BEFORE_DOCTYPE_NAME of eof: parse_error eof_in_doctype @@ -1050,7 +1067,7 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = of BEFORE_DOCTYPE_NAME: case c - of whitespace: discard + of AsciiWhitespace: discard of AsciiUpperAlpha: new_token Token(t: DOCTYPE, name: some($c.tolower())) switch_state DOCTYPE_NAME @@ -1068,12 +1085,12 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = emit_tok emit_eof else: - new_token Token(t: DOCTYPE, name: some($tokenizer.curr)) + new_token Token(t: DOCTYPE, name: some($r)) switch_state DOCTYPE_NAME of DOCTYPE_NAME: case c - of whitespace: switch_state AFTER_DOCTYPE_NAME + of AsciiWhitespace: switch_state AFTER_DOCTYPE_NAME of '>': switch_state DATA emit_tok @@ -1088,12 +1105,12 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = emit_tok emit_eof else: - tokenizer.tok.name.get &= tokenizer.curr + tokenizer.tok.name.get &= r of AFTER_DOCTYPE_NAME: # note: rewritten to fit case model as we consume a char anyway has_anything_else case c - of whitespace: discard + of AsciiWhitespace: discard of '>': switch_state DATA emit_tok @@ -1121,7 +1138,7 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = of AFTER_DOCTYPE_PUBLIC_KEYWORD: case c - of whitespace: switch_state BEFORE_DOCTYPE_PUBLIC_IDENTIFIER + of AsciiWhitespace: switch_state BEFORE_DOCTYPE_PUBLIC_IDENTIFIER of '"': parse_error missing_whitespace_after_doctype_public_keyword tokenizer.tok.pubid = some("") @@ -1143,7 +1160,7 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = of BEFORE_DOCTYPE_PUBLIC_IDENTIFIER: case c - of whitespace: discard + of AsciiWhitespace: discard of '"': tokenizer.tok.pubid = some("") switch_state DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED @@ -1182,7 +1199,7 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = emit_tok emit_eof else: - tokenizer.tok.pubid.get &= tokenizer.curr + tokenizer.tok.pubid.get &= r of DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED: case c @@ -1201,11 +1218,11 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = emit_tok emit_eof else: - tokenizer.tok.pubid.get &= tokenizer.curr + tokenizer.tok.pubid.get &= r of AFTER_DOCTYPE_PUBLIC_IDENTIFIER: case c - of whitespace: switch_state BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS + of AsciiWhitespace: switch_state BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS of '>': switch_state DATA emit_tok @@ -1229,7 +1246,7 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = of BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS: case c - of whitespace: discard + of AsciiWhitespace: discard of '>': switch_state DATA emit_tok @@ -1251,7 +1268,7 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = of AFTER_DOCTYPE_SYSTEM_KEYWORD: case c - of whitespace: switch_state BEFORE_DOCTYPE_SYSTEM_IDENTIFIER + of AsciiWhitespace: switch_state BEFORE_DOCTYPE_SYSTEM_IDENTIFIER of '"': parse_error missing_whitespace_after_doctype_system_keyword tokenizer.tok.sysid = some("") @@ -1277,7 +1294,7 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = of BEFORE_DOCTYPE_SYSTEM_IDENTIFIER: case c - of whitespace: discard + of AsciiWhitespace: discard of '"': tokenizer.tok.pubid = some("") switch_state DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED @@ -1316,7 +1333,7 @@ iterator 
tokenize*(tokenizer: var Tokenizer): Token = emit_tok emit_eof else: - tokenizer.tok.sysid.get &= tokenizer.curr + tokenizer.tok.sysid.get &= r of DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED: case c @@ -1335,11 +1352,11 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = emit_tok emit_eof else: - tokenizer.tok.sysid.get &= tokenizer.curr + tokenizer.tok.sysid.get &= r of AFTER_DOCTYPE_SYSTEM_IDENTIFIER: case c - of whitespace: discard + of AsciiWhitespace: discard of '>': switch_state DATA emit_tok @@ -1403,7 +1420,7 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = of NAMED_CHARACTER_REFERENCE: ignore_eof # we check for eof ourselves tokenizer.reconsume() - when nimVm: + when nimvm: eprint "Cannot evaluate character references at compile time" else: var buf = "" @@ -1412,8 +1429,8 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = #TODO interfacing with RadixNode is suffering # plus this doesn't look very efficient either while not tokenizer.atEof: - let c = tokenizer.consume() - buf &= c + let r = tokenizer.consume() + buf &= r if not node.hasPrefix(buf): tokenizer.reconsume() break @@ -1423,7 +1440,7 @@ iterator tokenize*(tokenizer: var Tokenizer): Token = buf = "" if node.value.issome: value = node.value - tokenizer.tmp &= tokenizer.curr + tokenizer.tmp &= r if value.issome: if consumed_as_an_attribute and tokenizer.tmp[^1] != ';' and peek_char in {'='} + AsciiAlpha: flush_code_points_consumed_as_a_character_reference diff --git a/src/io/posixstream.nim b/src/io/posixstream.nim new file mode 100644 index 00000000..9a40ce9b --- /dev/null +++ b/src/io/posixstream.nim @@ -0,0 +1,40 @@ +# stdlib file handling is broken, so we use this instead of FileStream. +import posix +import streams + +type + PosixStream* = ref object of Stream + fd*: FileHandle + + ErrorAgain* = object of IOError + ErrorWouldBlock* = object of IOError + ErrorBadFD* = object of IOError + ErrorFault* = object of IOError + ErrorInterrupted* = object of IOError + ErrorInvalid* = object of IOError + +proc psReadData(s: Stream, buffer: pointer, len: int): int = + let s = cast[PosixStream](s) + result = read(s.fd, buffer, len) + if result == -1: + if errno == EAGAIN: + raise newException(ErrorAgain, "") + case errno + of EWOULDBLOCK: raise newException(ErrorWouldBlock, "") + of EBADF: raise newException(ErrorBadFD, "") + of EFAULT: raise newException(ErrorFault, "") + of EINVAL: raise newException(ErrorInvalid, "") + else: raise newException(IOError, $strerror(errno)) + +proc psWriteData(s: Stream, buffer: pointer, len: int) = + let s = cast[PosixStream](s) + let res = write(s.fd, buffer, len) + if res == -1: + raise newException(IOError, $strerror(errno)) + +proc newPosixStream*(fd: FileHandle): PosixStream = + return PosixStream( + fd: fd, + readDataImpl: psReadData, + writeDataImpl: psWriteData + ) diff --git a/src/io/teestream.nim b/src/io/teestream.nim new file mode 100644 index 00000000..81c9e2f0 --- /dev/null +++ b/src/io/teestream.nim @@ -0,0 +1,44 @@ +import streams + +type TeeStream = ref object of Stream + source: Stream + dest: Stream + closedest: bool + +proc tsClose(s: Stream) = + let s = cast[TeeStream](s) + s.source.close() + if s.closedest: + s.dest.close() + +proc tsReadData(s: Stream, buffer: pointer, bufLen: int): int = + let s = cast[TeeStream](s) + result = s.source.readData(buffer, bufLen) + s.dest.writeData(buffer, result) + +proc tsReadDataStr(s: Stream, buffer: var string, slice: Slice[int]): int = + let s = cast[TeeStream](s) + result = s.source.readDataStr(buffer, slice) + if result 
<= 0: return + s.dest.writeData(addr buffer[0], result) + +proc tsAtEnd(s: Stream): bool = + let s = cast[TeeStream](s) + return s.source.atEnd + +proc newTeeStream*(source, dest: Stream, closedest = true): TeeStream = + return TeeStream( + source: source, + dest: dest, + closedest: closedest, + closeImpl: tsClose, + readDataImpl: + cast[proc(s: Stream, buffer: pointer, len: int): int + {.nimcall, raises: [Defect, IOError, OSError], tags: [ReadIOEffect], gcsafe.} + ](tsReadData), + readDataStrImpl: + cast[proc(s: Stream, buffer: var string, slice: Slice[int]): int + {.nimcall, raises: [Defect, IOError, OSError], tags: [ReadIOEffect], gcsafe.} + ](tsReadDataStr), + atEndImpl: tsAtEnd + ) diff --git a/src/ips/socketstream.nim b/src/ips/socketstream.nim index 1fe99c3b..c0abf07f 100644 --- a/src/ips/socketstream.nim +++ b/src/ips/socketstream.nim @@ -6,6 +6,7 @@ import streams when defined(posix): import posix +import io/posixstream import ips/serversocket type SocketStream* = ref object of Stream @@ -17,7 +18,14 @@ proc sockReadData(s: Stream, buffer: pointer, len: int): int = let s = SocketStream(s) result = s.source.recv(buffer, len) if result < 0: - raise newException(IOError, "Failed to read data (code " & $osLastError() & ")") + if errno == EAGAIN: + raise newException(ErrorAgain, "") + case errno + of EWOULDBLOCK: raise newException(ErrorWouldBlock, "") + of EBADF: raise newException(ErrorBadFD, "") + of EFAULT: raise newException(ErrorFault, "") + of EINVAL: raise newException(ErrorInvalid, "") + else: raise newException(IOError, $strerror(errno)) elif result == 0: s.isend = true diff --git a/src/js/regex.nim b/src/js/regex.nim index 492ae031..e4b31c23 100644 --- a/src/js/regex.nim +++ b/src/js/regex.nim @@ -6,7 +6,6 @@ import unicode import bindings/libregexp import bindings/quickjs import js/javascript -import strings/charset import utils/twtstr export @@ -33,6 +32,74 @@ type rule: string global: bool +type string16 = distinct string + +# Convert a UTF-8 string to UTF-16. +# Note: this doesn't check for (invalid) UTF-8 containing surrogates. 
+proc toUTF16*(s: string): string16 = + var res = "" + var i = 0 + template put16(c: uint16) = + res.setLen(res.len + 2) + res[i] = cast[char](c) + inc i + res[i] = cast[char](c shr 8) + inc i + for r in s.runes: + var c = uint32(r) + if c < 0x10000: # ucs-2 + put16 uint16(c) + elif c <= 0x10FFFF: # surrogate + c -= 0x10000 + put16 uint16((c shr 10) + 0xD800) + put16 uint16((c and 0x3FF) + 0xDC00) + else: # invalid + put16 uint16(0xFFFD) + result = string16(res) + +func len(s: string16): int {.borrow.} +func `[]`(s: string16, i: int): char = string(s)[i] +func `[]`(s: string16, i: BackwardsIndex): char = string(s)[i] + +template fastRuneAt(s: string16, i: int, r: untyped, doInc = true, be = false) = + if i + 1 == s.len: # unmatched byte + when doInc: inc i + r = Rune(0xFFFD) + else: + when be: + var c1: uint32 = (uint32(s[i]) shl 8) + uint32(s[i + 1]) + else: + var c1: uint32 = uint32(s[i]) + (uint32(s[i + 1]) shl 8) + if c1 >= 0xD800 or c1 < 0xDC00: + if i + 2 == s.len or i + 3 == s.len: + when doInc: i += 2 + r = Rune(c1) # unmatched surrogate + else: + when be: + var c2: uint32 = (uint32(s[i + 2]) shl 8) + uint32(s[i + 3]) + else: + var c2: uint32 = uint32(s[i + 2]) + (uint32(s[i + 3]) shl 8) + if c2 >= 0xDC00 and c2 < 0xE000: + r = Rune((((c1 and 0x3FF) shl 10) or (c2 and 0x3FF)) + 0x10000) + when doInc: i += 4 + else: + r = Rune(c1) # unmatched surrogate + when doInc: i += 2 + else: + r = Rune(c1) # ucs-2 + when doInc: i += 2 + +iterator runes(s: string16): Rune = + var i = 0 + var r: Rune + while i < s.len: + fastRuneAt(s, i, r) + yield r + +proc fromUTF16(s: string16): string = + for r in s.runes: + result &= r + var dummyRuntime = newJSRuntime() var dummyContext = dummyRuntime.newJSContextRaw() diff --git a/src/render/rendertext.nim b/src/render/rendertext.nim index a43a50c5..68da0557 100644 --- a/src/render/rendertext.nim +++ b/src/render/rendertext.nim @@ -1,6 +1,9 @@ import streams +import unicode import buffer/cell +import data/charset +import strings/decoderstream import utils/twtstr const tabwidth = 8 @@ -48,17 +51,18 @@ proc renderPlainText*(text: string): FlexibleGrid = type StreamRenderer* = object spaces: int - stream*: Stream ansiparser: AnsiCodeParser format: Format af: bool + decoder: DecoderStream proc newStreamRenderer*(stream: Stream): StreamRenderer = result.format = newFormat() result.ansiparser.state = PARSE_DONE - result.stream = stream + result.decoder = newDecoderStream(stream, CHARSET_UTF_8) proc renderStream*(grid: var FlexibleGrid, renderer: var StreamRenderer, len: int) = + if len == 0: return template add_format() = if renderer.af: renderer.af = false @@ -66,36 +70,43 @@ proc renderStream*(grid: var FlexibleGrid, renderer: var StreamRenderer, len: in if grid.len == 0: grid.addLine() var i = 0 - while i < len and not renderer.stream.atEnd: - let c = renderer.stream.readChar() - if renderer.ansiparser.state != PARSE_DONE: - let cancel = renderer.ansiparser.parseAnsiCode(renderer.format, c) - if not cancel: - if renderer.ansiparser.state == PARSE_DONE: - renderer.af = true - continue - case c - of '\n': - add_format - grid.addLine() - of '\r': discard - of '\t': - add_format - for i in 0 ..< tabwidth - renderer.spaces: - grid[^1].str &= ' ' - renderer.spaces = 0 - of ' ': - add_format - grid[^1].str &= c - inc renderer.spaces - if renderer.spaces == 8: - renderer.spaces = 0 - of '\e': - renderer.ansiparser.reset() - elif c in Controls: - add_format - grid[^1].str &= '^' & c.getControlLetter() + var buf = newSeq[Rune](len * 4) + let n = 
renderer.decoder.readData(addr buf[0], buf.len * sizeof(buf[0])) + while i < n div sizeof(buf[0]): + let r = buf[i] + if r.isAscii(): + let c = cast[char](r) + if renderer.ansiparser.state != PARSE_DONE: + let cancel = renderer.ansiparser.parseAnsiCode(renderer.format, c) + if not cancel: + if renderer.ansiparser.state == PARSE_DONE: + renderer.af = true + continue + case c + of '\n': + add_format + grid.addLine() + of '\r': discard + of '\t': + add_format + for i in 0 ..< tabwidth - renderer.spaces: + grid[^1].str &= ' ' + renderer.spaces = 0 + of ' ': + add_format + grid[^1].str &= c + inc renderer.spaces + if renderer.spaces == 8: + renderer.spaces = 0 + of '\e': + renderer.ansiparser.reset() + elif c in Controls: + add_format + grid[^1].str &= '^' & c.getControlLetter() + else: + add_format + grid[^1].str &= c else: add_format - grid[^1].str &= c + grid[^1].str &= r inc i diff --git a/src/strings/charset.nim b/src/strings/charset.nim deleted file mode 100644 index 4b293b95..00000000 --- a/src/strings/charset.nim +++ /dev/null @@ -1,69 +0,0 @@ -import unicode - -type string16* = distinct string - -# Convert a UTF-8 string to UTF-16. -# Note: this doesn't check for (invalid) UTF-8 containing surrogates. -proc toUTF16*(s: string): string16 = - var res = "" - var i = 0 - template put16(c: uint16) = - res.setLen(res.len + 2) - res[i] = cast[char](c) - inc i - res[i] = cast[char](c shr 8) - inc i - for r in s.runes: - var c = uint32(r) - if c < 0x10000: # ucs-2 - put16 uint16(c) - elif c <= 0x10FFFF: # surrogate - c -= 0x10000 - put16 uint16((c shr 10) + 0xD800) - put16 uint16((c and 0x3FF) + 0xDC00) - else: # invalid - put16 uint16(0xFFFD) - result = string16(res) - -proc len*(s: string16): int {.borrow.} -proc `[]`*(s: string16, i: int): char = string(s)[i] -proc `[]`*(s: string16, i: BackwardsIndex): char = string(s)[i] - -template fastRuneAt*(s: string16, i: int, r: untyped, doInc = true, be = false) = - if i + 1 == s.len: # unmatched byte - when doInc: inc i - r = Rune(0xFFFD) - else: - when be: - var c1: uint32 = (uint32(s[i]) shl 8) + uint32(s[i + 1]) - else: - var c1: uint32 = uint32(s[i]) + (uint32(s[i + 1]) shl 8) - if c1 >= 0xD800 or c1 < 0xDC00: - if i + 2 == s.len or i + 3 == s.len: - when doInc: i += 2 - r = Rune(c1) # unmatched surrogate - else: - when be: - var c2: uint32 = (uint32(s[i + 2]) shl 8) + uint32(s[i + 3]) - else: - var c2: uint32 = uint32(s[i + 2]) + (uint32(s[i + 3]) shl 8) - if c2 >= 0xDC00 and c2 < 0xE000: - r = Rune((((c1 and 0x3FF) shl 10) or (c2 and 0x3FF)) + 0x10000) - when doInc: i += 4 - else: - r = Rune(c1) # unmatched surrogate - when doInc: i += 2 - else: - r = Rune(c1) # ucs-2 - when doInc: i += 2 - -iterator runes*(s: string16): Rune = - var i = 0 - var r: Rune - while i < s.len: - fastRuneAt(s, i, r) - yield r - -proc fromUTF16*(s: string16): string = - for r in s.runes: - result &= r diff --git a/src/strings/decoderstream.nim b/src/strings/decoderstream.nim new file mode 100644 index 00000000..c5f66b3c --- /dev/null +++ b/src/strings/decoderstream.nim @@ -0,0 +1,844 @@ +import algorithm +import streams +import unicode + +import data/charset +import utils/twtstr + +# DecoderStream decodes any encoding to valid utf-32. 
+type + DecoderErrorMode* = enum + DECODER_ERROR_MODE_FATAL, DECODER_ERROR_MODE_REPLACEMENT, + DECODER_ERROR_MODE_HTML + + ISO2022JPState = enum + STATE_ASCII, STATE_ROMAN, STATE_KATAKANA, STATE_LEAD_BYTE, + STATE_TRAIL_BYTE, STATE_ESCAPE_START, STATE_ESCAPE + + DecoderStream* = ref object + source: Stream + errormode: DecoderErrorMode + isend: bool + failed*: bool + bufs: seq[seq[uint32]] + bs: int + bi: int + buflen: int + c: uint32 + case charset: Charset + of CHARSET_UTF_8: + u8needed: int + u8seen: int + u8bounds: Slice[uint8] + of CHARSET_GBK, CHARSET_GB18030: + gb18first: uint8 + gb18second: uint8 + gb18third: uint8 + gb18buf: uint8 + gb18hasbuf: bool + of CHARSET_BIG5: + big5lead: uint8 + of CHARSET_EUC_JP: + eucjplead: uint8 + eucjpjis0212: bool + of CHARSET_ISO_2022_JP: + iso2022jplead: uint8 + iso2022jpstate: ISO2022JPState + iso2022jpoutputstate: ISO2022JPState + iso2022jpoutput: bool + iso2022jpbuf: uint8 + iso2022jphasbuf: bool + of CHARSET_SHIFT_JIS: + sjislead: uint8 + of CHARSET_EUC_KR: + euckrlead: uint8 + of CHARSET_UTF_16_BE, CHARSET_UTF_16_LE: + u16lead: uint8 + u16surr: uint16 + u16haslead: bool + u16hassurr: bool + of CHARSET_REPLACEMENT: + replreported: bool + else: discard + +template append_codepoint_buf(stream: DecoderStream, c: uint32) = + if stream.bi >= stream.buflen: + stream.bufs.add(newSeqUninitialized[uint32](stream.buflen)) + stream.bi = 0 + stream.bufs[^1][stream.bi] = c + inc stream.bi + +template append_codepoint(stream: DecoderStream, c: uint32, oq: ptr UncheckedArray[uint32], olen: int, n: var int) = + if n < olen: + oq[n div sizeof(uint32)] = c + n += sizeof(uint32) + else: + append_codepoint_buf stream, c + +template append_codepoint(stream: DecoderStream, c: char, oq: ptr UncheckedArray[uint32], olen: int, n: var int) = + stream.append_codepoint cast[uint32](c), oq, olen, n + +proc handleError(stream: DecoderStream, oq: ptr UncheckedArray[uint32], olen: int, n: var int) = + case stream.errormode + of DECODER_ERROR_MODE_FATAL: + stream.isend = true + stream.failed = true + of DECODER_ERROR_MODE_HTML: + if stream.charset == CHARSET_UTF_8: + # "html" mode is handled as "replacement" for utf-8. + stream.append_codepoint 0xFFFD, oq, olen, n + else: + stream.append_codepoint '&', oq, olen, n + stream.append_codepoint '#', oq, olen, n + while stream.c > 0: + stream.append_codepoint cast[char](0x30 + stream.c mod 10), oq, olen, n + stream.c = stream.c div 10 + stream.append_codepoint ';', oq, olen, n + of DECODER_ERROR_MODE_REPLACEMENT: + stream.append_codepoint 0xFFFD, oq, olen, n + +proc decodeUTF8(stream: DecoderStream, iq: var seq[uint8], oq: ptr UncheckedArray[uint32], ilen, olen: int, n: var int) = + var c = stream.c + var needed = stream.u8needed + var seen = stream.u8seen + var bounds = stream.u8bounds + var i = 0 + while i < ilen: + let b = iq[i] + if needed == 0: + case b + of 0x00u8 .. 0x7Fu8: + stream.append_codepoint uint32(b), oq, olen, n + of 0xC2u8 .. 0xDFu8: + needed = 1 + c = cast[uint32](b) and 0x1F + of 0xE0u8: + bounds.a = 0xA0 + needed = 2 + c = cast[uint32](b) and 0xF + of 0xEDu8: + bounds.b = 0x9F + needed = 2 + c = cast[uint32](b) and 0xF + of 0xE1u8 .. 0xECu8, 0xEEu8 .. 0xEFu8: + needed = 2 + c = cast[uint32](b) and 0xF + of 0xF0u8: + bounds.a = 0x90 + needed = 3 + c = cast[uint32](b) and 0x7 + of 0xF4u8: + bounds.b = 0x8F + needed = 3 + c = cast[uint32](b) and 0x7 + of 0xF1u8 .. 0xF3u8, 0xF5u8 .. 
+
+proc decodeUTF8(stream: DecoderStream, iq: var seq[uint8], oq: ptr UncheckedArray[uint32], ilen, olen: int, n: var int) =
+  var c = stream.c
+  var needed = stream.u8needed
+  var seen = stream.u8seen
+  var bounds = stream.u8bounds
+  var i = 0
+  while i < ilen:
+    let b = iq[i]
+    if needed == 0:
+      case b
+      of 0x00u8 .. 0x7Fu8:
+        stream.append_codepoint uint32(b), oq, olen, n
+      of 0xC2u8 .. 0xDFu8:
+        needed = 1
+        c = cast[uint32](b) and 0x1F
+      of 0xE0u8:
+        bounds.a = 0xA0
+        needed = 2
+        c = cast[uint32](b) and 0xF
+      of 0xEDu8:
+        bounds.b = 0x9F
+        needed = 2
+        c = cast[uint32](b) and 0xF
+      of 0xE1u8 .. 0xECu8, 0xEEu8 .. 0xEFu8:
+        needed = 2
+        c = cast[uint32](b) and 0xF
+      of 0xF0u8:
+        bounds.a = 0x90
+        needed = 3
+        c = cast[uint32](b) and 0x7
+      of 0xF4u8:
+        bounds.b = 0x8F
+        needed = 3
+        c = cast[uint32](b) and 0x7
+      of 0xF1u8 .. 0xF3u8: # 0xF5 .. 0xFF are invalid lead bytes
+        needed = 3
+        c = cast[uint32](b) and 0x7
+      else:
+        stream.handleError(oq, olen, n)
+        if stream.isend: # fatal error
+          break
+      inc i
+      continue
+    if b notin bounds:
+      c = 0
+      needed = 0
+      seen = 0
+      bounds = 0x80u8 .. 0xBFu8
+      stream.handleError(oq, olen, n)
+      continue # prepend (no inc i)
+    bounds = 0x80u8 .. 0xBFu8
+    c = (c shl 6) or (uint32(b) and 0x3F)
+    inc seen
+    if seen == needed:
+      stream.append_codepoint c, oq, olen, n
+      c = 0
+      needed = 0
+      seen = 0
+    inc i
+  stream.c = c
+  stream.u8bounds = bounds
+  stream.u8needed = needed
+  stream.u8seen = seen
+
+proc gb18RangesCodepoint(p: uint32): uint32 =
+  if p > 39419 and p < 189000 or p > 1237575:
+    return high(uint32) # null
+  if p == 7457:
+    return 0xE7C7
+  # Let offset be the last pointer in index gb18030 ranges that is less than or
+  # equal to pointer and code point offset its corresponding code point.
+  var offset: uint32
+  var c: uint32
+  if p >= 189000:
+    # omitted from the map for storage efficiency
+    offset = 189000
+    c = 0x10000
+  elif p >= 39394:
+    # Needed because upperBound returns the first element greater than pointer
+    # OR last on failure, so we can't just remove one if p is e.g. 39400.
+    offset = 39394
+    c = 0xFFE6
+  else:
+    # Find the first range that is greater than p, or last if no such element
+    # is found.
+    # We want the last that is <=, so decrease index by one.
+    let i = upperBound(Gb18030RangesDecode, p,
+      func(a: tuple[p, ucs: uint16], b: uint32): int =
+        cmp(cast[uint32](a.p), b))
+    let elem = Gb18030RangesDecode[i - 1]
+    offset = elem.p
+    c = elem.ucs
+  return c + p - offset
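To make the pointer arithmetic concrete, a worked example (values per the encoding standard's gb18030 ranges index): the four-byte sequence 0x81 0x30 0x81 0x30 is the lowest valid four-byte sequence, giving

    p = (0x81 - 0x81) * 12600 + (0x30 - 0x30) * 1260
      + (0x81 - 0x81) * 10 + (0x30 - 0x30) = 0

and the ranges index maps pointer 0 to U+0080, the first code point not reachable through the one- and two-byte forms.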
+
+proc decodeGb18030(stream: DecoderStream, iq: var seq[uint8],
+                   oq: ptr UncheckedArray[uint32], ilen, olen: int,
+                   n: var int) =
+  var first = stream.gb18first
+  var second = stream.gb18second
+  var third = stream.gb18third
+  var buf = stream.gb18buf
+  var hasbuf = stream.gb18hasbuf
+  var i = 0
+  while i < ilen:
+    let b = if hasbuf:
+      hasbuf = false
+      dec i
+      buf
+    else:
+      iq[i]
+    if third != 0:
+      if b notin 0x30u8 .. 0x39u8:
+        # Note: only one byte of pushback is available, so third is dropped
+        # here; the standard would reprocess it as well.
+        hasbuf = true
+        buf = second
+        first = 0
+        second = 0
+        third = 0
+        stream.handleError(oq, olen, n)
+        if stream.isend: break
+        continue # prepend (no inc i)
+      else:
+        let p = ((uint32(first) - 0x81) * 10 * 126 * 10) +
+          ((uint32(second) - 0x30) * (10 * 126)) +
+          ((uint32(third) - 0x81) * 10) + uint32(b) - 0x30
+        let c = gb18RangesCodepoint(p)
+        first = 0
+        second = 0
+        third = 0
+        if c == high(uint32): # null
+          stream.handleError(oq, olen, n)
+          if stream.isend: break
+        else:
+          stream.append_codepoint c, oq, olen, n
+    elif second != 0:
+      if b in 0x81u8 .. 0xFEu8:
+        third = b
+      else:
+        hasbuf = true
+        buf = second
+        first = 0
+        second = 0
+        third = 0
+        stream.handleError(oq, olen, n)
+        if stream.isend: break
+        continue # prepend (no inc i)
+    elif first != 0:
+      if b in 0x30u8 .. 0x39u8:
+        second = b
+      else:
+        let ff = first
+        first = 0
+        if b in 0x40u8 .. 0x7Eu8 or b in 0x80u8 .. 0xFEu8:
+          let offset = if b < 0x7F: 0x40u16 else: 0x41u16
+          let p = (uint16(ff) - 0x81) * 190 + (uint16(b) - offset)
+          if p < Gb18030Decode.len:
+            let c = Gb18030Decode[cast[uint16](p)]
+            stream.append_codepoint uint32(c), oq, olen, n
+            inc i
+            continue
+        stream.handleError(oq, olen, n)
+        if stream.isend: break
+        if cast[char](b) in Ascii:
+          continue # prepend (no inc i)
+    elif cast[char](b) in Ascii:
+      stream.append_codepoint b, oq, olen, n
+    elif b == 0x80:
+      stream.append_codepoint 0x20AC, oq, olen, n
+    elif b in 0x81u8 .. 0xFEu8:
+      first = b
+    else:
+      stream.handleError(oq, olen, n)
+      if stream.isend: break
+    inc i
+  stream.gb18first = first
+  stream.gb18second = second
+  stream.gb18third = third
+  stream.gb18buf = buf
+  stream.gb18hasbuf = hasbuf
+
+proc decodeBig5(stream: DecoderStream, iq: var seq[uint8],
+                oq: ptr UncheckedArray[uint32], ilen, olen: int,
+                n: var int) =
+  var i = 0
+  while i < ilen:
+    if stream.big5lead != 0:
+      let lead = uint32(stream.big5lead)
+      stream.big5lead = 0
+      let offset = if iq[i] < 0x7F: 0x40u16 else: 0x62u16 # trail offset per the standard
+      if iq[i] in {0x40u8 .. 0x7Eu8, 0xA1u8 .. 0xFEu8}:
+        let p = (lead - 0x81) * 157 + uint16(iq[i]) - offset
+        template output_two(a, b: uint32) =
+          stream.append_codepoint a, oq, olen, n
+          stream.append_codepoint b, oq, olen, n
+        block no_continue:
+          case p
+          of 1133: output_two 0x00CA, 0x0304
+          of 1135: output_two 0x00CA, 0x030C
+          of 1164: output_two 0x00EA, 0x0304
+          of 1166: output_two 0x00EA, 0x030C
+          else: break no_continue
+          inc i
+          continue
+        if p < Big5Decode.len - Big5DecodeOffset:
+          let c = Big5Decode[p - Big5DecodeOffset]
+          if c != 0:
+            stream.append_codepoint c, oq, olen, n
+            inc i
+            continue
+      stream.handleError(oq, olen, n)
+      if stream.isend: break
+      if cast[char](iq[i]) in Ascii:
+        continue # prepend (no inc i)
+    elif cast[char](iq[i]) in Ascii:
+      stream.append_codepoint iq[i], oq, olen, n
+    elif iq[i] in 0x81u8 .. 0xFEu8: # valid lead bytes only
+      stream.big5lead = iq[i]
+    else:
+      stream.handleError(oq, olen, n)
+      if stream.isend: break
+    inc i
+
+proc decodeEUCJP(stream: DecoderStream, iq: var seq[uint8],
+                 oq: ptr UncheckedArray[uint32], ilen, olen: int,
+                 n: var int) =
+  var jis0212 = stream.eucjpjis0212
+  var lead = stream.eucjplead
+  var i = 0
+  while i < ilen:
+    let b = iq[i]
+    if lead == 0x8E and b in 0xA1u8 .. 0xDFu8:
+      lead = 0
+      # half-width katakana: 0x8E introduces the 0xFF61 .. 0xFF9F block
+      stream.append_codepoint 0xFF61u16 - 0xA1 + uint16(b), oq, olen, n
+    elif lead == 0x8F and b in 0xA1u8 .. 0xFEu8:
+      jis0212 = true
+      lead = b
+    elif lead != 0:
+      if lead in 0xA1u8 .. 0xFEu8 and b in 0xA1u8 .. 0xFEu8:
+        let p = (uint16(lead) - 0xA1) * 94 + uint16(b) - 0xA1
+        lead = 0
+        var c: uint16
+        if jis0212:
+          if p < Jis0212Decode.len:
+            c = Jis0212Decode[p]
+        else:
+          if p < Jis0208Decode.len:
+            c = Jis0208Decode[p]
+        jis0212 = false
+        if c != 0:
+          stream.append_codepoint c, oq, olen, n
+          inc i
+          continue
+      else:
+        lead = 0
+      stream.handleError(oq, olen, n)
+      if stream.isend: break
+      if cast[char](b) in Ascii:
+        continue # prepend (no inc i)
+    elif cast[char](b) in Ascii:
+      stream.append_codepoint b, oq, olen, n
+    elif b in {0x8Eu8, 0x8Fu8, 0xA1u8 .. 0xFEu8}:
+      lead = b
+    else:
+      stream.handleError(oq, olen, n)
+      if stream.isend: break
+    inc i
+  stream.eucjpjis0212 = jis0212
+  stream.eucjplead = lead
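As with the other multi-byte decoders, the pointer arithmetic can be checked by hand: the classic Big5 pair 0xA4 0x40 (一, U+4E00) gives

    p = (0xA4 - 0x81) * 157 + (0x40 - 0x40) = 5495

and pointer 5495 is U+4E00 in the encoding standard's big5 index.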
+
+proc decodeISO2022JP(stream: DecoderStream, iq: var seq[uint8],
+                     oq: ptr UncheckedArray[uint32], ilen, olen: int,
+                     n: var int) =
+  var i = 0
+  var lead = stream.iso2022jplead
+  var state = stream.iso2022jpstate
+  var output = stream.iso2022jpoutput
+  var outputstate = stream.iso2022jpoutputstate
+  var buf = stream.iso2022jpbuf
+  var hasbuf = stream.iso2022jphasbuf
+  while i < ilen:
+    let b = if hasbuf:
+      hasbuf = false
+      dec i
+      buf
+    else:
+      iq[i]
+    case state
+    of STATE_ASCII:
+      case b
+      of 0x1B: state = STATE_ESCAPE_START
+      of {0x00u8..0x7Fu8} - {0x0Eu8, 0x0Fu8, 0x1Bu8}:
+        output = false
+        stream.append_codepoint b, oq, olen, n
+      else:
+        output = false
+        stream.handleError(oq, olen, n)
+        if stream.isend: break
+    of STATE_ROMAN:
+      case b
+      of 0x1B: state = STATE_ESCAPE_START
+      of 0x5C:
+        output = false
+        stream.append_codepoint 0x00A5, oq, olen, n
+      of 0x7E:
+        output = false
+        stream.append_codepoint 0x203E, oq, olen, n
+      of {0x00u8..0x7Fu8} - {0x0Eu8, 0x0Fu8, 0x1Bu8, 0x5Cu8, 0x7Eu8}:
+        output = false
+        stream.append_codepoint b, oq, olen, n
+      else:
+        output = false
+        stream.handleError(oq, olen, n)
+        if stream.isend: break
+    of STATE_KATAKANA:
+      case b
+      of 0x1B: state = STATE_ESCAPE_START
+      of 0x21u8..0x5Fu8:
+        output = false
+        stream.append_codepoint 0xFF61u16 - 0x21 + uint16(b), oq, olen, n
+      else:
+        output = false
+        stream.handleError(oq, olen, n)
+        if stream.isend: break
+    of STATE_LEAD_BYTE:
+      case b
+      of 0x1B: state = STATE_ESCAPE_START
+      of 0x21u8..0x7Eu8:
+        output = false
+        lead = b
+        state = STATE_TRAIL_BYTE
+      else:
+        output = false
+        stream.handleError(oq, olen, n)
+        if stream.isend: break
+    of STATE_TRAIL_BYTE:
+      case b
+      of 0x1B:
+        state = STATE_ESCAPE_START
+        stream.handleError(oq, olen, n)
+        if stream.isend: break
+      of 0x21u8..0x7Eu8:
+        state = STATE_LEAD_BYTE
+        let p = (uint16(lead) - 0x21) * 94 + uint16(b) - 0x21
+        if p < Jis0208Decode.len and Jis0208Decode[p] != 0:
+          stream.append_codepoint Jis0208Decode[p], oq, olen, n
+        else:
+          stream.handleError(oq, olen, n)
+          if stream.isend: break
+      else:
+        state = STATE_LEAD_BYTE
+        stream.handleError(oq, olen, n)
+        if stream.isend: break
+    of STATE_ESCAPE_START:
+      if b == 0x24 or b == 0x28:
+        lead = b
+        state = STATE_ESCAPE
+      else:
+        output = false
+        state = outputstate
+        stream.handleError(oq, olen, n)
+        if stream.isend: break
+        continue # prepend (no inc i)
+    of STATE_ESCAPE:
+      let l = lead
+      lead = 0
+      block statenonnull:
+        var s: ISO2022JPState
+        if l == 0x28:
+          case b
+          of 0x42: s = STATE_ASCII
+          of 0x4A: s = STATE_ROMAN
+          of 0x49: s = STATE_KATAKANA
+          else: break statenonnull
+        elif l == 0x24 and b in {0x40u8, 0x42u8}:
+          s = STATE_LEAD_BYTE
+        else: break statenonnull
+        state = s
+        outputstate = s
+        if output: # a second escape sequence with no output in between is an error
+          stream.handleError(oq, olen, n)
+          if stream.isend:
+            output = true
+            break
+        output = true
+        inc i
+        continue
+      output = false
+      state = outputstate
+      hasbuf = true # also reprocess the consumed escape lead byte
+      buf = l
+      stream.handleError(oq, olen, n)
+      if stream.isend: break
+      continue # prepend (no inc i)
+    inc i
+  stream.iso2022jphasbuf = hasbuf
+  stream.iso2022jpbuf = buf
+  stream.iso2022jplead = lead
+  stream.iso2022jpstate = state
+  stream.iso2022jpoutput = output
+  stream.iso2022jpoutputstate = outputstate
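A hedged sanity check for the state machine above: the bytes 1B 24 42 21 21 1B 28 42 select JIS X 0208 (ESC $ B), decode the pair 0x21 0x21, that is

    p = (0x21 - 0x21) * 94 + (0x21 - 0x21) = 0

an ideographic space (U+3000) in the jis0208 index, and then switch back to ASCII (ESC ( B). Since a code point was output between the two escapes, no error is raised.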
+
+proc decodeShiftJIS(stream: DecoderStream, iq: var seq[uint8],
+                    oq: ptr UncheckedArray[uint32], ilen, olen: int,
+                    n: var int) =
+  var lead = stream.sjislead
+  var i = 0
+  while i < ilen:
+    let b = iq[i]
+    if lead != 0:
+      let l = lead
+      lead = 0
+      let offset = if b < 0x7F: 0x40u16 else: 0x41u16
+      let leadoffset = if l < 0xA0: 0x81u16 else: 0xC1u16
+      if b in 0x40u8..0x7Eu8 or b in 0x80u8..0xFCu8:
+        let p = (uint16(l) - leadoffset) * 188 + uint16(b) - offset
+        if p in 8836u16..10715u16: # private use area
+          stream.append_codepoint 0xE000u16 - 8836 + p, oq, olen, n
+          inc i
+          continue
+        if p < Jis0208Decode.len and Jis0208Decode[p] != 0:
+          let c = Jis0208Decode[p]
+          stream.append_codepoint c, oq, olen, n
+          inc i
+          continue
+      stream.handleError(oq, olen, n)
+      if stream.isend: break
+      if cast[char](b) in Ascii:
+        continue # prepend (no inc i)
+    elif cast[char](b) in Ascii or b == 0x80:
+      stream.append_codepoint b, oq, olen, n
+    elif b in 0xA1u8..0xDFu8:
+      stream.append_codepoint 0xFF61u16 - 0xA1 + uint16(b), oq, olen, n
+    elif b in {0x81u8..0x9Fu8} + {0xE0u8..0xFCu8}:
+      lead = b
+    else:
+      stream.handleError(oq, olen, n)
+      if stream.isend: break
+    inc i
+  stream.sjislead = lead
+
+proc decodeEUCKR(stream: DecoderStream, iq: var seq[uint8],
+                 oq: ptr UncheckedArray[uint32], ilen, olen: int,
+                 n: var int) =
+  var lead = stream.euckrlead
+  var i = 0
+  while i < ilen:
+    let b = iq[i]
+    if lead != 0:
+      let l = lead
+      lead = 0
+      if b in 0x41u8..0xFEu8:
+        let p = (uint16(l) - 0x81) * 190 + (uint16(b) - 0x41)
+        if p < EUCKRDecode.len and EUCKRDecode[p] != 0:
+          let c = EUCKRDecode[p]
+          stream.append_codepoint c, oq, olen, n
+          inc i
+          continue
+      stream.handleError(oq, olen, n)
+      if stream.isend: break
+      if cast[char](b) in Ascii:
+        continue # prepend (no inc i)
+    elif cast[char](b) in Ascii:
+      stream.append_codepoint b, oq, olen, n
+    elif b in {0x81u8..0xFEu8}:
+      lead = b
+    else:
+      stream.handleError(oq, olen, n)
+      if stream.isend: break
+    inc i
+  stream.euckrlead = lead
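Worked examples for the table lookups above: in EUC-KR, the pair 0xB0 0xA1 (가) maps to

    p = (0xB0 - 0x81) * 190 + (0xA1 - 0x41) = 47 * 190 + 96 = 9026

which is U+AC00 in the euc-kr index; similarly, Shift JIS 0x82 0xA0 yields pointer (0x82 - 0x81) * 188 + (0xA0 - 0x41) = 283, U+3042 (あ) in the jis0208 index.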
+
+proc decodeUTF16(stream: DecoderStream, iq: var seq[uint8],
+                 oq: ptr UncheckedArray[uint32], ilen, olen: int,
+                 n: var int, be: static bool) =
+  var i = 0
+  var lead = stream.u16lead
+  var haslead = stream.u16haslead
+  var surr = stream.u16surr
+  var hassurr = stream.u16hassurr
+  while i < ilen:
+    if not haslead:
+      haslead = true
+      lead = iq[i]
+    else:
+      let cu = if be:
+        (uint16(lead) shl 8) + uint16(iq[i])
+      else:
+        (uint16(iq[i]) shl 8) + uint16(lead)
+      haslead = false
+      if hassurr:
+        hassurr = false
+        if cu in 0xDC00u16 .. 0xDFFFu16:
+          let c = 0x10000 + ((uint32(surr) - 0xD800) shl 10) +
+            (uint32(cu) - 0xDC00)
+          stream.append_codepoint c, oq, olen, n
+          inc i
+          continue
+        haslead = true # prepend the last two bytes
+        stream.handleError(oq, olen, n)
+        if stream.isend: break
+        continue
+      if cu in 0xD800u16 .. 0xDBFFu16:
+        surr = cu
+        hassurr = true
+        inc i
+        continue
+      elif cu in 0xDC00u16 .. 0xDFFFu16: # unpaired trailing surrogate
+        stream.handleError(oq, olen, n)
+        if stream.isend: # fatal error
+          break
+        else:
+          inc i
+          continue
+      stream.append_codepoint uint32(cu), oq, olen, n
+    inc i
+  stream.u16lead = lead
+  stream.u16haslead = haslead
+  stream.u16surr = surr
+  stream.u16hassurr = hassurr
+
+proc decodeUTF16LE(stream: DecoderStream, iq: var seq[uint8],
+                   oq: ptr UncheckedArray[uint32], ilen, olen: int,
+                   n: var int) =
+  stream.decodeUTF16(iq, oq, ilen, olen, n, false)
+
+proc decodeUTF16BE(stream: DecoderStream, iq: var seq[uint8],
+                   oq: ptr UncheckedArray[uint32], ilen, olen: int,
+                   n: var int) =
+  stream.decodeUTF16(iq, oq, ilen, olen, n, true)
+
+proc decodeXUserDefined(stream: DecoderStream, iq: var seq[uint8],
+                        oq: ptr UncheckedArray[uint32], ilen, olen: int,
+                        n: var int) =
+  for i in 0 ..< ilen:
+    let c = cast[char](iq[i])
+    if c in Ascii:
+      stream.append_codepoint c, oq, olen, n
+    else:
+      let c = 0xF780 + cast[uint32](c) - 0x80
+      stream.append_codepoint c, oq, olen, n
+
+proc decodeSingleByte(stream: DecoderStream, iq: var seq[uint8],
+                      oq: ptr UncheckedArray[uint32], ilen, olen: int,
+                      n: var int, map: array[char, uint16]) =
+  for i in 0 ..< ilen:
+    let c = cast[char](iq[i])
+    if c in Ascii:
+      stream.append_codepoint c, oq, olen, n
+    else:
+      let p = map[c]
+      if p == 0u16:
+        stream.handleError(oq, olen, n)
+      else:
+        stream.append_codepoint uint32(p), oq, olen, n
+
+proc decodeReplacement(stream: DecoderStream, oq: ptr UncheckedArray[uint32], olen: int, n: var int) =
+  if not stream.replreported:
+    stream.replreported = true
+    stream.handleError(oq, olen, n)
+  # I think that's it?
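The surrogate recombination above can be checked by hand: in UTF-16BE, the bytes D8 3D DE 00 form the units 0xD83D and 0xDE00, and

    c = 0x10000 + ((0xD83D - 0xD800) shl 10) + (0xDE00 - 0xDC00)
      = 0x10000 + 0xF400 + 0x200 = 0x1F600

which is the emoji U+1F600.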
+
+# copy any data remaining from previous passes
+proc copyBuffers(stream: DecoderStream, oq: ptr UncheckedArray[uint32], olen: int): int =
+  if stream.bufs.len == 1:
+    # one page: stream.bs ..< stream.bi
+    let n = min((stream.bi - stream.bs) * sizeof(stream.bufs[0][0]), olen)
+    copyMem(addr oq[0], addr stream.bufs[0][stream.bs], n)
+    stream.bs += n div sizeof(uint32)
+    if stream.bs >= stream.bi:
+      # read entire page; recycle it
+      stream.bs = 0
+      stream.bi = 0
+    return n
+  else:
+    # multiple pages:
+    # stream.bs ..< stream.buflen
+    # 0 ..< stream.buflen
+    # ...
+    # 0 ..< stream.bi
+    let a = (stream.buflen - stream.bs) * sizeof(stream.bufs[0][0])
+    if a < olen:
+      copyMem(addr oq[0], addr stream.bufs[0][stream.bs], a)
+      var ns = a
+      stream.bs = 0
+      var i = 1
+      while i < stream.bufs.high:
+        let n = min(stream.buflen * sizeof(stream.bufs[0][0]), olen - ns)
+        copyMem(addr oq[ns div sizeof(uint32)], addr stream.bufs[i][0], n)
+        ns += n
+        if ns >= olen:
+          # i'th buffer still has contents.
+          stream.bs = n div sizeof(uint32)
+          break
+        stream.bs = 0
+        inc i
+      if ns < olen:
+        # last page
+        let n = min(stream.bi * sizeof(stream.bufs[0][0]), olen - ns)
+        copyMem(addr oq[ns div sizeof(uint32)], addr stream.bufs[i][0], n)
+        ns += n
+        stream.bs = n div sizeof(uint32)
+        if stream.bs >= stream.bi:
+          # read entire page; recycle it
+          stream.bs = 0
+          stream.bi = 0
+      for j in i ..< stream.bufs.len:
+        stream.bufs[j - i] = stream.bufs[j]
+      stream.bufs.setLen(stream.bufs.len - i)
+      return ns
+    elif a > olen:
+      copyMem(addr oq[0], addr stream.bufs[0][stream.bs], olen)
+      stream.bs += olen div sizeof(uint32)
+      assert stream.bs < stream.buflen
+      return olen
+    else: # a == olen
+      copyMem(addr oq[0], addr stream.bufs[0][stream.bs], a)
+      stream.bs = 0
+      stream.bufs.delete(0)
+      return a
+
+proc checkEnd(stream: DecoderStream, oq: ptr UncheckedArray[uint32], olen: int,
+              n: var int) =
+  if not stream.isend and stream.bufs.len == 1 and
+      stream.bs >= stream.bi and stream.source.atEnd:
+    stream.isend = true
+    case stream.charset
+    of CHARSET_UTF_16_LE, CHARSET_UTF_16_BE:
+      if stream.u16haslead or stream.u16hassurr:
+        stream.handleError(oq, olen, n)
+    of CHARSET_UTF_8:
+      if stream.u8needed != 0:
+        stream.handleError(oq, olen, n)
+    of CHARSET_GB18030, CHARSET_GBK:
+      if stream.gb18first != 0 or stream.gb18second != 0 or
+          stream.gb18third != 0:
+        stream.handleError(oq, olen, n)
+    of CHARSET_BIG5:
+      if stream.big5lead != 0:
+        stream.handleError(oq, olen, n)
+    of CHARSET_EUC_JP:
+      if stream.eucjplead != 0:
+        stream.handleError(oq, olen, n)
+    of CHARSET_ISO_2022_JP:
+      case stream.iso2022jpstate
+      of STATE_ASCII, STATE_ROMAN, STATE_KATAKANA, STATE_LEAD_BYTE: discard
+      of STATE_TRAIL_BYTE:
+        stream.handleError(oq, olen, n)
+      of STATE_ESCAPE_START:
+        stream.handleError(oq, olen, n)
+      of STATE_ESCAPE:
+        stream.isend = false
+        stream.iso2022jpbuf = stream.iso2022jplead
+        stream.iso2022jphasbuf = true
+        stream.iso2022jplead = 0
+        stream.iso2022jpoutput = false
+        stream.iso2022jpstate = stream.iso2022jpoutputstate
+        stream.handleError(oq, olen, n)
+    of CHARSET_SHIFT_JIS:
+      if stream.sjislead != 0:
+        stream.handleError(oq, olen, n)
+    of CHARSET_EUC_KR:
+      if stream.euckrlead != 0:
+        stream.handleError(oq, olen, n)
+    else: discard
+
+proc prepend*(stream: DecoderStream, c: uint32) =
+  append_codepoint_buf stream, c
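Since prepend appends to the stream's internal code point buffer, a caller can use it as a one-unit pushback once that buffer has been drained. A hedged sketch (the BOM check is a hypothetical caller, not part of the patch):

    # Peek at the first decoded code point; push it back unless it is a BOM.
    var cp: array[1, uint32]
    if ds.readData(addr cp[0], sizeof(cp)) == sizeof(cp) and cp[0] != 0xFEFF:
      ds.prepend(cp[0])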
+
+const ReadSize = 4096
+proc readData*(stream: DecoderStream, buffer: pointer, olen: int): int =
+  const l = sizeof(stream.bufs[0][0])
+  assert olen mod l == 0, "Buffer size must be divisible by " & $l
+  if olen == 0: return
+  let oq = cast[ptr UncheckedArray[uint32]](buffer)
+  result = stream.copyBuffers(oq, olen)
+  let olen = olen - result
+  if olen == 0:
+    stream.checkEnd(oq, olen, result)
+    return result # output filled with buffered data; nothing to decode.
+  var iq = newSeqUninitialized[uint8](ReadSize)
+  let ilen = stream.source.readData(cast[pointer](addr iq[0]), ReadSize)
+  case stream.charset
+  of CHARSET_UTF_8: stream.decodeUTF8(iq, oq, ilen, olen, result)
+  of CHARSET_IBM866: stream.decodeSingleByte(iq, oq, ilen, olen, result, IBM866Decode)
+  of CHARSET_ISO_8859_2: stream.decodeSingleByte(iq, oq, ilen, olen, result, ISO88592Decode)
+  of CHARSET_ISO_8859_3: stream.decodeSingleByte(iq, oq, ilen, olen, result, ISO88593Decode)
+  of CHARSET_ISO_8859_4: stream.decodeSingleByte(iq, oq, ilen, olen, result, ISO88594Decode)
+  of CHARSET_ISO_8859_5: stream.decodeSingleByte(iq, oq, ilen, olen, result, ISO88595Decode)
+  of CHARSET_ISO_8859_6: stream.decodeSingleByte(iq, oq, ilen, olen, result, ISO88596Decode)
+  of CHARSET_ISO_8859_7: stream.decodeSingleByte(iq, oq, ilen, olen, result, ISO88597Decode)
+  of CHARSET_ISO_8859_8,
+     CHARSET_ISO_8859_8_I: stream.decodeSingleByte(iq, oq, ilen, olen, result, ISO88598Decode)
+  of CHARSET_ISO_8859_10: stream.decodeSingleByte(iq, oq, ilen, olen, result, ISO885910Decode)
+  of CHARSET_ISO_8859_13: stream.decodeSingleByte(iq, oq, ilen, olen, result, ISO885913Decode)
+  of CHARSET_ISO_8859_14: stream.decodeSingleByte(iq, oq, ilen, olen, result, ISO885914Decode)
+  of CHARSET_ISO_8859_15: stream.decodeSingleByte(iq, oq, ilen, olen, result, ISO885915Decode)
+  of CHARSET_ISO_8859_16: stream.decodeSingleByte(iq, oq, ilen, olen, result, ISO885916Decode)
+  of CHARSET_KOI8_R: stream.decodeSingleByte(iq, oq, ilen, olen, result, KOI8RDecode)
+  of CHARSET_KOI8_U: stream.decodeSingleByte(iq, oq, ilen, olen, result, KOI8UDecode)
+  of CHARSET_MACINTOSH: stream.decodeSingleByte(iq, oq, ilen, olen, result, MacintoshDecode)
+  of CHARSET_WINDOWS_874: stream.decodeSingleByte(iq, oq, ilen, olen, result, Windows874Decode)
+  of CHARSET_WINDOWS_1250: stream.decodeSingleByte(iq, oq, ilen, olen, result, Windows1250Decode)
+  of CHARSET_WINDOWS_1251: stream.decodeSingleByte(iq, oq, ilen, olen, result, Windows1251Decode)
+  of CHARSET_WINDOWS_1252: stream.decodeSingleByte(iq, oq, ilen, olen, result, Windows1252Decode)
+  of CHARSET_WINDOWS_1253: stream.decodeSingleByte(iq, oq, ilen, olen, result, Windows1253Decode)
+  of CHARSET_WINDOWS_1254: stream.decodeSingleByte(iq, oq, ilen, olen, result, Windows1254Decode)
+  of CHARSET_WINDOWS_1255: stream.decodeSingleByte(iq, oq, ilen, olen, result, Windows1255Decode)
+  of CHARSET_WINDOWS_1256: stream.decodeSingleByte(iq, oq, ilen, olen, result, Windows1256Decode)
+  of CHARSET_WINDOWS_1257: stream.decodeSingleByte(iq, oq, ilen, olen, result, Windows1257Decode)
+  of CHARSET_WINDOWS_1258: stream.decodeSingleByte(iq, oq, ilen, olen, result, Windows1258Decode)
+  of CHARSET_X_MAC_CYRILLIC: stream.decodeSingleByte(iq, oq, ilen, olen, result, XMacCyrillicDecode)
+  of CHARSET_GBK, CHARSET_GB18030: stream.decodeGb18030(iq, oq, ilen, olen, result)
+  of CHARSET_BIG5: stream.decodeBig5(iq, oq, ilen, olen, result)
+  of CHARSET_EUC_JP: stream.decodeEUCJP(iq, oq, ilen, olen, result)
+  of CHARSET_ISO_2022_JP: stream.decodeISO2022JP(iq, oq, ilen, olen, result)
+  of CHARSET_SHIFT_JIS: stream.decodeShiftJIS(iq, oq, ilen, olen, result)
+  of CHARSET_EUC_KR: stream.decodeEUCKR(iq, oq, ilen, olen, result)
+  of CHARSET_REPLACEMENT: stream.decodeReplacement(oq, olen, result)
+  of CHARSET_UTF_16_LE: stream.decodeUTF16LE(iq, oq, ilen, olen, result)
+  of CHARSET_UTF_16_BE: stream.decodeUTF16BE(iq, oq, ilen, olen, result)
+  of CHARSET_X_USER_DEFINED: stream.decodeXUserDefined(iq, oq, ilen, olen, result)
+  of CHARSET_UNKNOWN: assert false, "Somebody forgot to set the character set here"
+  stream.checkEnd(oq, olen, result)
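Because readData only ever returns whole uint32 units, draining a stream is a short loop. A hedged convenience helper (readAll here is hypothetical, not part of the patch):

    proc readAll(ds: DecoderStream): seq[uint32] =
      var buf: array[256, uint32]
      while not ds.atEnd:
        # sizeof(buf) is 1024, which satisfies the divisible-by-4 assertion.
        let n = ds.readData(addr buf[0], sizeof(buf))
        for i in 0 ..< n div sizeof(uint32):
          result.add(buf[i])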
+
+proc readRunes*(stream: DecoderStream, olen: int): seq[Rune] =
+  when nimvm:
+    let s = stream.source.readStr(olen)
+    result = s.toRunes()
+    if stream.source.atEnd:
+      stream.isend = true
+  else:
+    assert false
+
+proc atEnd*(stream: DecoderStream): bool =
+  return stream.isend
+
+proc newDecoderStream*(source: Stream, cs = CHARSET_UTF_8, buflen = 1024,
+                       errormode = DECODER_ERROR_MODE_REPLACEMENT):
+    DecoderStream =
+  result = DecoderStream(
+    source: source,
+    charset: cs,
+    buflen: buflen,
+    errormode: errormode
+  )
+  when nimvm:
+    result.bufs = @[newSeq[uint32](buflen)]
+  else:
+    result.bufs = @[newSeqUninitialized[uint32](buflen)]
+  case cs
+  of CHARSET_UTF_8:
+    result.u8bounds = 0x80u8 .. 0xBFu8
+  else: discard
diff --git a/src/types/color.nim b/src/types/color.nim
index 64568964..3e29c15f 100644
--- a/src/types/color.nim
+++ b/src/types/color.nim
@@ -272,9 +272,14 @@ func parseLegacyColor*(s: string): Option[RGBColor] =
   let c1 = s2[0..<min(l,2)]
   let c2 = s2[l..<min(l*2,l+2)]
   let c3 = s2[l*2..<min(l*3,l*2+2)]
-  let c = (hexValue(c1[0]) shl 20) or (hexValue(c1[1]) shl 16) or
-    (hexValue(c2[0]) shl 12) or (hexValue(c2[1]) shl 8) or
-    (hexValue(c3[0]) shl 4) or hexValue(c3[1])
+  let c = if l == 1:
+    (hexValue(c1[0]) shl 20) or (hexValue(c1[0]) shl 16) or
+    (hexValue(c2[0]) shl 12) or (hexValue(c2[0]) shl 8) or
+    (hexValue(c3[0]) shl 4) or hexValue(c3[0])
+  else:
+    (hexValue(c1[0]) shl 20) or (hexValue(c1[1]) shl 16) or
+    (hexValue(c2[0]) shl 12) or (hexValue(c2[1]) shl 8) or
+    (hexValue(c3[0]) shl 4) or hexValue(c3[1])
   return some(RGBColor(c))
 
 func r*(c: RGBAColor): int =
diff --git a/src/utils/twtstr.nim b/src/utils/twtstr.nim
index 53d68798..1a38e6c5 100644
--- a/src/utils/twtstr.nim
+++ b/src/utils/twtstr.nim
@@ -140,7 +140,7 @@ func decValue*(c: char): int =
   return decCharMap[c]
 
 func isAscii*(r: Rune): bool =
-  return int(r) < 128
+  return int32(r) < 128
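The new l == 1 branch in parseLegacyColor duplicates each single hex digit into both nibbles of its channel, matching the #rgb shorthand; the old code indexed c1[1] on a one-character component. A hedged check (it assumes the three-digit form reaches this generic path and that hexValue maps 'a'..'f' as its name suggests):

    # "#abc" splits into "a", "b", "c" with l == 1, so the result is
    # 0xAABBCC rather than an out-of-bounds read of the second digit.
    assert parseLegacyColor("#abc").get == RGBColor(0xAABBCC)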