diff options
Diffstat (limited to 'src')
-rw-r--r-- | src/buffer/buffer.nim | 5 | ||||
-rw-r--r-- | src/config/config.nim | 3 | ||||
-rw-r--r-- | src/config/mailcap.nim | 3 | ||||
-rw-r--r-- | src/data/charset.nim | 458 | ||||
-rw-r--r-- | src/display/client.nim | 3 | ||||
-rw-r--r-- | src/display/pager.nim | 3 | ||||
-rw-r--r-- | src/display/term.nim | 5 | ||||
-rw-r--r-- | src/encoding/decoderstream.nim | 863 | ||||
-rw-r--r-- | src/encoding/encoderstream.nim | 533 | ||||
-rw-r--r-- | src/html/chadombuilder.nim | 3 | ||||
-rw-r--r-- | src/html/dom.nim | 5 | ||||
-rw-r--r-- | src/html/htmlparser.nim | 5 | ||||
-rw-r--r-- | src/html/htmltokenizer.nim | 3 | ||||
-rw-r--r-- | src/io/lineedit.nim | 7 | ||||
-rw-r--r-- | src/io/loader.nim | 3 | ||||
-rw-r--r-- | src/io/response.nim | 3 | ||||
-rw-r--r-- | src/main.nim | 5 | ||||
-rw-r--r-- | src/render/rendertext.nim | 5 | ||||
-rw-r--r-- | src/types/buffersource.nim | 3 | ||||
-rw-r--r-- | src/version.nim | 29 |
20 files changed, 70 insertions, 1877 deletions
diff --git a/src/buffer/buffer.nim b/src/buffer/buffer.nim index b793360a..1dde438f 100644 --- a/src/buffer/buffer.nim +++ b/src/buffer/buffer.nim @@ -17,8 +17,6 @@ import css/mediaquery import css/sheet import css/stylednode import css/values -import data/charset -import encoding/decoderstream import html/chadombuilder import html/dom import html/env @@ -50,6 +48,9 @@ import utils/opt import utils/twtstr import xhr/formdata as formdata_impl +import chakasu/charset +import chakasu/decoderstream + type LoadInfo* = enum CONNECT, DOWNLOAD, RENDER, DONE diff --git a/src/config/config.nim b/src/config/config.nim index 6b9c7032..e5dbad9c 100644 --- a/src/config/config.nim +++ b/src/config/config.nim @@ -7,7 +7,6 @@ import buffer/cell import config/mailcap import config/mimetypes import config/toml -import data/charset import io/headers import io/urlfilter import js/javascript @@ -20,6 +19,8 @@ import utils/mimeguess import utils/opt import utils/twtstr +import chakasu/charset + type ColorMode* = enum MONOCHROME, ANSI, EIGHT_BIT, TRUE_COLOR diff --git a/src/config/mailcap.nim b/src/config/mailcap.nim index 71e3d6c1..8c2a8853 100644 --- a/src/config/mailcap.nim +++ b/src/config/mailcap.nim @@ -4,11 +4,12 @@ import osproc import streams import strutils -import data/charset import types/url import utils/opt import utils/twtstr +import chakasu/charset + type MailcapParser = object stream: Stream diff --git a/src/data/charset.nim b/src/data/charset.nim deleted file mode 100644 index f8f833d5..00000000 --- a/src/data/charset.nim +++ /dev/null @@ -1,458 +0,0 @@ -import algorithm -import os -import strutils -import tables - -import utils/twtstr - -type Charset* = enum - CHARSET_UNKNOWN - CHARSET_UTF_8 = "UTF-8" - CHARSET_IBM866 = "IBM866" - CHARSET_ISO_8859_2 = "ISO-8859-2" - CHARSET_ISO_8859_3 = "ISO-8859-3" - CHARSET_ISO_8859_4 = "ISO-8859-4" - CHARSET_ISO_8859_5 = "ISO-8859-5" - CHARSET_ISO_8859_6 = "ISO-8859-6" - CHARSET_ISO_8859_7 = "ISO-8859-7" - CHARSET_ISO_8859_8 = "ISO-8859-8" - CHARSET_ISO_8859_8_I = "ISO-8859-8-I" - CHARSET_ISO_8859_10 = "ISO-8859-10" - CHARSET_ISO_8859_13 = "ISO-8859-13" - CHARSET_ISO_8859_14 = "ISO-8859-14" - CHARSET_ISO_8859_15 = "ISO-8859-15" - CHARSET_ISO_8859_16 = "ISO-8859-16" - CHARSET_KOI8_R = "KOI8-R" - CHARSET_KOI8_U = "KOI8-U" - CHARSET_MACINTOSH = "macintosh" - CHARSET_WINDOWS_874 = "windows-874" - CHARSET_WINDOWS_1250 = "windows-1250" - CHARSET_WINDOWS_1251 = "windows-1251" - CHARSET_WINDOWS_1252 = "windows-1252" - CHARSET_WINDOWS_1253 = "windows-1253" - CHARSET_WINDOWS_1254 = "windows-1254" - CHARSET_WINDOWS_1255 = "windows-1255" - CHARSET_WINDOWS_1256 = "windows-1256" - CHARSET_WINDOWS_1257 = "windows-1257" - CHARSET_WINDOWS_1258 = "windows-1258" - CHARSET_X_MAC_CYRILLIC = "x-mac-cyrillic" - CHARSET_GBK = "GBK" - CHARSET_GB18030 = "gb18030" - CHARSET_BIG5 = "Big5" - CHARSET_EUC_JP = "EUC-JP" - CHARSET_ISO_2022_JP = "ISO-2022-JP" - CHARSET_SHIFT_JIS = "Shift_JIS" - CHARSET_EUC_KR = "EUC-KR" - CHARSET_REPLACEMENT = "replacement" - CHARSET_UTF_16_BE = "UTF-16BE" - CHARSET_UTF_16_LE = "UTF-16LE" - CHARSET_X_USER_DEFINED = "x-user-defined" - -const CharsetMap = { - # UTF-8 (The Encoding) - "unicode-1-1-utf-8": CHARSET_UTF_8, - "unicode11utf-8": CHARSET_UTF_8, - "unicode20utf-8": CHARSET_UTF_8, - "utf-8": CHARSET_UTF_8, - "utf8": CHARSET_UTF_8, - "x-unicode20utf8": CHARSET_UTF_8, - # IBM866 - "866": CHARSET_IBM_866, - "cp866": CHARSET_IBM_866, - "csibm866": CHARSET_IBM_866, - "ibm866": CHARSET_IBM_866, - # ISO-8859-2 - "csisolatin2": CHARSET_ISO_8859_2, - "iso-8859-2": CHARSET_ISO_8859_2, - "iso-ir-101": CHARSET_ISO_8859_2, - "iso8859-2": CHARSET_ISO_8859_2, - "iso88592": CHARSET_ISO_8859_2, - "iso_8859-2": CHARSET_ISO_8859_2, - "iso_8859-2:1987": CHARSET_ISO_8859_2, - "l2": CHARSET_ISO_8859_2, - "latin2": CHARSET_ISO_8859_2, - # ISO-8859-3 - "csisolatin3": CHARSET_ISO_8859_3, - "iso-8859-3": CHARSET_ISO_8859_3, - "iso-ir-109": CHARSET_ISO_8859_3, - "iso8859-3": CHARSET_ISO_8859_3, - "iso88593": CHARSET_ISO_8859_3, - "iso_8859-3": CHARSET_ISO_8859_3, - "iso_8859-3:1988": CHARSET_ISO_8859_3, - "l3": CHARSET_ISO_8859_3, - "latin3": CHARSET_ISO_8859_3, - # ISO-8859-4 - "csisolatin4": CHARSET_ISO_8859_4, - "iso-8859-4": CHARSET_ISO_8859_4, - "iso-ir-110": CHARSET_ISO_8859_4, - "iso8859-4": CHARSET_ISO_8859_4, - "iso88594": CHARSET_ISO_8859_4, - "iso_8859-4": CHARSET_ISO_8859_4, - "iso_8859-4:1988": CHARSET_ISO_8859_4, - "l4": CHARSET_ISO_8859_4, - "latin4": CHARSET_ISO_8859_4, - # ISO-8859-5 - "csisolatincyrillic": CHARSET_ISO_8859_5, - "cyrillic": CHARSET_ISO_8859_5, - "iso-8859-5": CHARSET_ISO_8859_5, - "iso-ir-144": CHARSET_ISO_8859_5, - "iso8859-5": CHARSET_ISO_8859_5, - "iso88595": CHARSET_ISO_8859_5, - "iso_8859-5": CHARSET_ISO_8859_5, - "iso_8859-5:1988": CHARSET_ISO_8859_5, - # ISO-8859-6 - "arabic": CHARSET_ISO_8859_6, - "asmo-708": CHARSET_ISO_8859_6, - "csiso88596e": CHARSET_ISO_8859_6, - "csiso88596i": CHARSET_ISO_8859_6, - "csisolatinarabic": CHARSET_ISO_8859_6, - "ecma-114": CHARSET_ISO_8859_6, - "iso-8859-6": CHARSET_ISO_8859_6, - "iso-8859-6-e": CHARSET_ISO_8859_6, - "iso-8859-6-i": CHARSET_ISO_8859_6, - "iso-ir-127": CHARSET_ISO_8859_6, - "iso8859-6": CHARSET_ISO_8859_6, - "iso88596": CHARSET_ISO_8859_6, - "iso_8859-6": CHARSET_ISO_8859_6, - "iso_8859-6:1987": CHARSET_ISO_8859_6, - # ISO-8859-7 - "csisolatingreek": CHARSET_ISO_8859_7, - "ecma-118": CHARSET_ISO_8859_7, - "elot_928": CHARSET_ISO_8859_7, - "greek": CHARSET_ISO_8859_7, - "greek8": CHARSET_ISO_8859_7, - "iso-8859-7": CHARSET_ISO_8859_7, - "iso-ir-126": CHARSET_ISO_8859_7, - "iso8859-7": CHARSET_ISO_8859_7, - "iso88597": CHARSET_ISO_8859_7, - "iso_8859-7": CHARSET_ISO_8859_7, - "iso_8859-7:1987": CHARSET_ISO_8859_7, - "sun_eu_greek": CHARSET_ISO_8859_7, - # ISO-8859-8 - "csiso88598e": CHARSET_ISO_8859_8, - "csisolatinhebrew": CHARSET_ISO_8859_8, - "hebrew": CHARSET_ISO_8859_8, - "iso-8859-8": CHARSET_ISO_8859_8, - "iso-8859-8-e": CHARSET_ISO_8859_8, - "iso-ir-138": CHARSET_ISO_8859_8, - "iso8859-8": CHARSET_ISO_8859_8, - "iso88598": CHARSET_ISO_8859_8, - "iso_8859-8": CHARSET_ISO_8859_8, - "iso_8859-8:1988": CHARSET_ISO_8859_8, - "visual": CHARSET_ISO_8859_8, - # ISO-8859-8-I - "csiso88598i": CHARSET_ISO_8859_8_I, - "iso-8859-8-i": CHARSET_ISO_8859_8_I, - "logical": CHARSET_ISO_8859_8_I, - # ISO-8859-10 - "csisolatin6": CHARSET_ISO_8859_10, - "iso-8859-10": CHARSET_ISO_8859_10, - "iso-ir-157": CHARSET_ISO_8859_10, - "iso8859-10": CHARSET_ISO_8859_10, - "iso885910": CHARSET_ISO_8859_10, - "l6": CHARSET_ISO_8859_10, - "latin6": CHARSET_ISO_8859_10, - # ISO-8859-13 - "iso-8859-13": CHARSET_ISO_8859_13, - "iso8859-13": CHARSET_ISO_8859_13, - "iso885913": CHARSET_ISO_8859_13, - # ISO-8859-14 - "iso-8859-14": CHARSET_ISO_8859_14, - "iso8859-14": CHARSET_ISO_8859_14, - "iso885914": CHARSET_ISO_8859_14, - # ISO-8859-15 - "csisolatin9": CHARSET_ISO_8859_15, - "iso-8859-15": CHARSET_ISO_8859_15, - "iso8859-15": CHARSET_ISO_8859_15, - "iso885915": CHARSET_ISO_8859_15, - "iso_8859-15": CHARSET_ISO_8859_15, - "l9": CHARSET_ISO_8859_15, - # ISO-8859-16 - "iso-8859-16": CHARSET_ISO_8859_16, - # KOI8-R - "cskoi8r": CHARSET_KOI8_R, - "koi": CHARSET_KOI8_R, - "koi8": CHARSET_KOI8_R, - "koi8-r": CHARSET_KOI8_R, - "koi8_r": CHARSET_KOI8_R, - # KOI8-U - "koi8-ru": CHARSET_KOI8_U, - "koi8-u": CHARSET_KOI8_U, - # macintosh - "csmacintosh": CHARSET_MACINTOSH, - "mac": CHARSET_MACINTOSH, - "macintosh": CHARSET_MACINTOSH, - "x-mac-roman": CHARSET_MACINTOSH, - # windows-874 - "dos-874": CHARSET_WINDOWS_874, - "iso-8859-11": CHARSET_WINDOWS_874, - "iso8859-11": CHARSET_WINDOWS_874, - "iso885911": CHARSET_WINDOWS_874, - "tis-620": CHARSET_WINDOWS_874, - "windows-874": CHARSET_WINDOWS_874, - # windows-1250 - "cp1250": CHARSET_WINDOWS_1250, - "windows-1250": CHARSET_WINDOWS_1250, - "x-cp1250" : CHARSET_WINDOWS_1250, - # windows-1251 - "cp1251": CHARSET_WINDOWS_1251, - "windows-1251": CHARSET_WINDOWS_1251, - "x-cp1251": CHARSET_WINDOWS_1251, - # windows-1252 - "ansi_x3.4-1968": CHARSET_WINDOWS_1252, - "ascii": CHARSET_WINDOWS_1252, # lol - "cp1252": CHARSET_WINDOWS_1252, - "cp819": CHARSET_WINDOWS_1252, - "csisolatin1": CHARSET_WINDOWS_1252, - "ibm819": CHARSET_WINDOWS_1252, - "iso-8859-1": CHARSET_WINDOWS_1252, - "iso88591": CHARSET_WINDOWS_1252, - "iso_8859-1:1987": CHARSET_WINDOWS_1252, - "l1": CHARSET_WINDOWS_1252, - "latin1": CHARSET_WINDOWS_1252, - "us-ascii": CHARSET_WINDOWS_1252, - "windows-1252": CHARSET_WINDOWS_1252, - "x-cp1252": CHARSET_WINDOWS_1252, - # windows-1253 - "cp1253": CHARSET_WINDOWS_1253, - "windows-1253": CHARSET_WINDOWS_1253, - "x-cp1253": CHARSET_WINDOWS_1253, - # windows-1254 - "cp1254": CHARSET_WINDOWS_1254, - "csisolatin5": CHARSET_WINDOWS_1254, - "iso-8859-9": CHARSET_WINDOWS_1254, - "iso-ir-148": CHARSET_WINDOWS_1254, - "iso8859-9": CHARSET_WINDOWS_1254, - "iso88599": CHARSET_WINDOWS_1254, - "iso_8859-9": CHARSET_WINDOWS_1254, - "iso_8859-9:1989": CHARSET_WINDOWS_1254, - "l5": CHARSET_WINDOWS_1254, - "latin5": CHARSET_WINDOWS_1254, - "windows-1254": CHARSET_WINDOWS_1254, - "x-cp1254": CHARSET_WINDOWS_1254, - # windows-1255 - "cp1255": CHARSET_WINDOWS_1255, - "windows-1255": CHARSET_WINDOWS_1255, - "x-cp1255": CHARSET_WINDOWS_1255, - # windows-1256 - "cp1256": CHARSET_WINDOWS_1256, - "windows-1256": CHARSET_WINDOWS_1256, - "x-cp1256": CHARSET_WINDOWS_1256, - # windows-1257 - "cp1257": CHARSET_WINDOWS_1257, - "windows-1257": CHARSET_WINDOWS_1257, - "x-cp1257": CHARSET_WINDOWS_1257, - # windows-1258 - "cp1258": CHARSET_WINDOWS_1258, - "windows-1258": CHARSET_WINDOWS_1258, - "x-cp1258": CHARSET_WINDOWS_1258, - # x-mac-cyrillic - "x-mac-cyrillic": CHARSET_X_MAC_CYRILLIC, - "x-mac-ukrainian": CHARSET_X_MAC_CYRILLIC, - # GBK - "chinese": CHARSET_GBK, - "csgb2312": CHARSET_GBK, - "csiso58gb231280": CHARSET_GBK, - "gb2312": CHARSET_GBK, - "gb_2312": CHARSET_GBK, - "gb_2312-80": CHARSET_GBK, - "gbk": CHARSET_GBK, - "iso-ir-58": CHARSET_GBK, - "x-gbk": CHARSET_GBK, - # gb18030 - "gb18030": CHARSET_GB18030, - # Big5 - "big5": CHARSET_BIG5, - "big5-hkscs": CHARSET_BIG5, - "cn-big5": CHARSET_BIG5, - "csbig5": CHARSET_BIG5, - "x-x-big5": CHARSET_BIG5, - # EUC-JP - "cseucpkdfmtjapanese": CHARSET_EUC_JP, - "euc-jp": CHARSET_EUC_JP, - "x-euc-jp": CHARSET_EUC_JP, - # ISO-2022-JP (ugh) - "csiso2022jp": CHARSET_ISO_2022_JP, - "iso-2022-jp": CHARSET_ISO_2022_JP, - # Shift_JIS - "csshiftjis": CHARSET_SHIFT_JIS, - "ms932": CHARSET_SHIFT_JIS, - "ms_kanji": CHARSET_SHIFT_JIS, - "shift-jis": CHARSET_SHIFT_JIS, - "shift_jis": CHARSET_SHIFT_JIS, - "sjis": CHARSET_SHIFT_JIS, - "windows-31j": CHARSET_SHIFT_JIS, - "x-sjis": CHARSET_SHIFT_JIS, - # EUC-KR - "cseuckr": CHARSET_EUC_KR, - "csksc56011987": CHARSET_EUC_KR, - "euc-kr": CHARSET_EUC_KR, - "iso-ir-149": CHARSET_EUC_KR, - "korean": CHARSET_EUC_KR, - "ks_c_5601-1987": CHARSET_EUC_KR, - "ks_c_5601-1989": CHARSET_EUC_KR, - "ksc5601": CHARSET_EUC_KR, - "ksc_5601": CHARSET_EUC_KR, - "windows-949": CHARSET_EUC_KR, - # replacement - "csiso2022kr": CHARSET_REPLACEMENT, - "hz-gb-2312": CHARSET_REPLACEMENT, - "iso-2022-cn": CHARSET_REPLACEMENT, - "iso-2022-cn-ext": CHARSET_REPLACEMENT, - "iso-2022-kr": CHARSET_REPLACEMENT, - "replacement": CHARSET_REPLACEMENT, - # UTF-16BE - "unicodefffe": CHARSET_UTF_16_BE, - "utf-16be": CHARSET_UTF_16_BE, - # UTF-16LE - "csunicode": CHARSET_UTF_16_LE, - "iso-10646-ucs-2": CHARSET_UTF_16_LE, - "ucs-2": CHARSET_UTF_16_LE, - "unicode": CHARSET_UTF_16_LE, - "unicodefeff": CHARSET_UTF_16_LE, - "utf-16": CHARSET_UTF_16_LE, - "utf-16le": CHARSET_UTF_16_LE, - # x-user-defined - "x-user-defined": CHARSET_X_USER_DEFINED -}.toTable() - -const NormalizedCharsetMap = (func(): Table[string, Charset] = - for k, v in CharsetMap: - result[k.normalizeLocale()] = v)() - -const DefaultCharset* = CHARSET_UTF_8 - -proc getCharset*(s: string): Charset = - return CharsetMap.getOrDefault(s.strip().toLower(), CHARSET_UNKNOWN) - -proc getLocaleCharset*(s: string): Charset = - let ss = s.after('.') - if ss != "": - return NormalizedCharsetMap.getOrDefault(ss.normalizeLocale(), - CHARSET_UNKNOWN) - # We could try to guess the charset based on the language here, like w3m - # does. - # However, these days it is more likely for any system to be using UTF-8 - # than any other charset, irrespective of the language. So we just assume - # UTF-8. - return DefaultCharset - -iterator mappairs(path: string): tuple[a, b: int] = - let s = staticRead(path) - for line in s.split('\n'): - if line.len == 0 or line[0] == '#': continue - var i = 0 - while line[i] == ' ': inc i - var j = i - while i < line.len and line[i] in '0'..'9': inc i - let index = parseInt(line.substr(j, i - 1)) - inc i # tab - j = i - while i < line.len and line[i] in {'0'..'9', 'A'..'F', 'x'}: inc i - let n = parseHexInt(line.substr(j, i - 1)) - yield (index, n) - -# I'm pretty sure single-byte encodings map to ucs-2. -func loadCharsetMap8(path: string): tuple[ - decode: array[char, uint16], - encode: seq[ - tuple[ - ucs: uint16, - val: char - ] - ], - ] = - var m: int - for index, n in mappairs("res/map" / path): - result.decode[char(index)] = uint16(n) - if index > m: m = index - for index in low(char) .. char(m): - let val = result.decode[index] - if val != 0u16: - result.encode.add((val, index)) - result.encode.sort() - -func loadCharsetMap8Encode(path: string): seq[tuple[ucs: uint16, val: char]] = - for index, n in mappairs("res/map" / path): - result.add((uint16(n), char(index))) - result.sort() - -func loadGb18030Ranges(path: string): tuple[ - decode: seq[ - tuple[ - p: uint16, - ucs: uint16 ]], - encode: seq[ - tuple[ - ucs: uint16, - p: uint16 ]]] = - for index, n in mappairs("res/map" / path): - if uint32(index) > uint32(high(uint16)): break - result.decode.add((uint16(index), uint16(n))) - result.encode.add((uint16(n), uint16(index))) - result.encode.sort() - -type UCS16x16* = tuple[ucs, p: uint16] - -func loadCharsetMap16(path: string, len: static uint16): tuple[ - decode: array[len, uint16], - encode: seq[UCS16x16]] = - for index, n in mappairs("res/map" / path): - result.decode[uint16(index)] = uint16(n) - result.encode.add((uint16(n), uint16(index))) - result.encode.sort() - -func loadCharsetMapSJIS(path: string): seq[UCS16x16] = - for index, n in mappairs("res/map" / path): - if n notin 8272..8835: - result.add((uint16(n), uint16(index))) - result.sort() - -type UCS32x16* = tuple[ucs: uint32, p: uint16] - -func loadBig5Map(path: string, offset: static uint16): tuple[ - decode: array[19782u16 - offset, uint32], # ouch (+75KB...) - encode: seq[UCS32x16]] = - for index, n in mappairs("res/map" / path): - result.decode[uint16(index) - offset] = uint32(n) - result.encode.add((uint32(n), uint16(index))) - #for i in result.decode: assert x != 0 # fail - result.encode.sort() - -const (IBM866Decode*, IBM866Encode*) = loadCharsetMap8("index-ibm866.txt") -const (ISO88592Decode*, ISO88592Encode*) = loadCharsetMap8("index-iso-8859-2.txt") -const (ISO88593Decode*, ISO88593Encode*) = loadCharsetMap8("index-iso-8859-3.txt") -const (ISO88594Decode*, ISO88594Encode*) = loadCharsetMap8("index-iso-8859-4.txt") -const (ISO88595Decode*, ISO88595Encode*) = loadCharsetMap8("index-iso-8859-5.txt") -const (ISO88596Decode*, ISO88596Encode*) = loadCharsetMap8("index-iso-8859-6.txt") -const (ISO88597Decode*, ISO88597Encode*) = loadCharsetMap8("index-iso-8859-7.txt") -const (ISO88598Decode*, ISO88598Encode*) = loadCharsetMap8("index-iso-8859-8.txt") -const (ISO885910Decode*, ISO885910Encode*) = loadCharsetMap8("index-iso-8859-10.txt") -const (ISO885913Decode*, ISO885913Encode*) = loadCharsetMap8("index-iso-8859-13.txt") -const (ISO885914Decode*, ISO885914Encode*) = loadCharsetMap8("index-iso-8859-14.txt") -const (ISO885915Decode*, ISO885915Encode*) = loadCharsetMap8("index-iso-8859-15.txt") -const (ISO885916Decode*, ISO885916Encode*) = loadCharsetMap8("index-iso-8859-16.txt") -const (KOI8RDecode*, KOI8REncode*) = loadCharsetMap8("index-koi8-r.txt") -const (KOI8UDecode*, KOI8UEncode*) = loadCharsetMap8("index-koi8-u.txt") -const (MacintoshDecode*, MacintoshEncode*) = loadCharsetMap8("index-macintosh.txt") -const (Windows874Decode*, Windows874Encode*) = loadCharsetMap8("index-windows-874.txt") -const (Windows1250Decode*, Windows1250Encode*) = loadCharsetMap8("index-windows-1250.txt") -const (Windows1251Decode*, Windows1251Encode*) = loadCharsetMap8("index-windows-1251.txt") -const (Windows1252Decode*, Windows1252Encode*) = loadCharsetMap8("index-windows-1252.txt") -const (Windows1253Decode*, Windows1253Encode*) = loadCharsetMap8("index-windows-1253.txt") -const (Windows1254Decode*, Windows1254Encode*) = loadCharsetMap8("index-windows-1254.txt") -const (Windows1255Decode*, Windows1255Encode*) = loadCharsetMap8("index-windows-1255.txt") -const (Windows1256Decode*, Windows1256Encode*) = loadCharsetMap8("index-windows-1256.txt") -const (Windows1257Decode*, Windows1257Encode*) = loadCharsetMap8("index-windows-1257.txt") -const (Windows1258Decode*, Windows1258Encode*) = loadCharsetMap8("index-windows-1258.txt") -const (XMacCyrillicDecode*, XMacCyrillicEncode*) = loadCharsetMap8("index-x-mac-cyrillic.txt") -const (Gb18030RangesDecode*, Gb18030RangesEncode*) = loadGb18030Ranges("index-gb18030-ranges.txt") -const (Gb18030Decode*, Gb18030Encode*) = loadCharsetMap16("index-gb18030.txt", len = 23940) -#for x in Gb18030Decode: assert x != 0 # success -const Big5DecodeOffset* = 942 -const (Big5Decode*, Big5Encode*) = loadBig5Map("index-big5.txt", offset = Big5DecodeOffset) -const (Jis0208Decode*, Jis0208Encode*) = loadCharsetMap16("index-jis0208.txt", len = 11104) -const ShiftJISEncode* = loadCharsetMapSJIS("index-jis0208.txt") -const (Jis0212Decode*, Jis0212Encode*) = loadCharsetMap16("index-jis0212.txt", len = 7211) -const ISO2022JPKatakanaEncode* = loadCharsetMap8Encode("index-iso-2022-jp-katakana.txt") -const (EUCKRDecode*, EUCKREncode*) = loadCharsetMap16("index-euc-kr.txt", len = 23750) diff --git a/src/display/client.nim b/src/display/client.nim index 3d40fe63..a993fe9f 100644 --- a/src/display/client.nim +++ b/src/display/client.nim @@ -18,7 +18,6 @@ import bindings/quickjs import buffer/container import config/config import css/sheet -import data/charset import display/pager import display/term import html/chadombuilder @@ -47,6 +46,8 @@ import utils/opt import xhr/formdata import xhr/xmlhttprequest +import chakasu/charset + type Client* = ref ClientObj ClientObj* = object diff --git a/src/display/pager.nim b/src/display/pager.nim index 0b8197fc..ba9a8453 100644 --- a/src/display/pager.nim +++ b/src/display/pager.nim @@ -16,7 +16,6 @@ import buffer/select import config/config import config/mailcap import config/mimetypes -import data/charset import display/term import io/connecterror import io/headers @@ -38,6 +37,8 @@ import types/url import utils/opt import utils/twtstr +import chakasu/charset + type LineMode* = enum NO_LINEMODE, LOCATION, USERNAME, PASSWORD, COMMAND, BUFFER, SEARCH_F, diff --git a/src/display/term.nim b/src/display/term.nim index a2112ee5..c9981f42 100644 --- a/src/display/term.nim +++ b/src/display/term.nim @@ -8,14 +8,15 @@ import unicode import bindings/termcap import buffer/cell import config/config -import data/charset -import encoding/encoderstream import io/runestream import io/window import types/color import utils/opt import utils/twtstr +import chakasu/charset +import chakasu/encoderstream + #TODO switch from termcap... type diff --git a/src/encoding/decoderstream.nim b/src/encoding/decoderstream.nim deleted file mode 100644 index c78eebdc..00000000 --- a/src/encoding/decoderstream.nim +++ /dev/null @@ -1,863 +0,0 @@ -import algorithm -import streams -import unicode - -import data/charset -import utils/twtstr - -# DecoderStream decodes any encoding to valid utf-32. -type - DecoderErrorMode* = enum - DECODER_ERROR_MODE_FATAL, DECODER_ERROR_MODE_REPLACEMENT - - ISO2022JPState = enum - STATE_ASCII, STATE_ROMAN, STATE_KATAKANA, STATE_LEAD_BYTE, - STATE_TRAIL_BYTE, STATE_ESCAPE_START, STATE_ESCAPE - - DecoderStream* = ref object - source: Stream - errormode: DecoderErrorMode - isend: bool - failed*: bool - bufs: seq[seq[uint32]] - bs: int - bi: int - buflen: int - c: uint32 - case charset: Charset - of CHARSET_UTF_8: - u8c: uint32 - u8needed: int - u8seen: int - u8bounds: Slice[uint8] - of CHARSET_GBK, CHARSET_GB18030: - gb18first: uint8 - gb18second: uint8 - gb18third: uint8 - gb18buf: uint8 - gb18hasbuf: bool - of CHARSET_BIG5: - big5lead: uint8 - of CHARSET_EUC_JP: - eucjplead: uint8 - eucjpjis0212: bool - of CHARSET_ISO_2022_JP: - iso2022jplead: uint8 - iso2022jpstate: ISO2022JPState - iso2022jpoutputstate: ISO2022JPState - iso2022jpoutput: bool - iso2022jpbuf: uint8 - iso2022jphasbuf: bool - of CHARSET_SHIFT_JIS: - sjislead: uint8 - of CHARSET_EUC_KR: - euckrlead: uint8 - of CHARSET_UTF_16_BE, CHARSET_UTF_16_LE: - u16lead: uint8 - u16surr: uint16 - u16haslead: bool - u16hassurr: bool - of CHARSET_REPLACEMENT: - replreported: bool - else: discard - -template append_codepoint_buf(stream: DecoderStream, c: uint32) = - if stream.bi >= stream.buflen: - stream.bufs.add(newSeqUninitialized[uint32](stream.buflen)) - stream.bi = 0 - stream.bufs[^1][stream.bi] = c - inc stream.bi - -template append_codepoint(stream: DecoderStream, c: uint32, oq: ptr UncheckedArray[uint32], olen: int, n: var int) = - if n < olen: - oq[n div sizeof(uint32)] = c - n += sizeof(uint32) - else: - append_codepoint_buf stream, c - -template append_codepoint(stream: DecoderStream, c: char, oq: ptr UncheckedArray[uint32], olen: int, n: var int) = - stream.append_codepoint uint32(c), oq, olen, n - -proc handleError(stream: DecoderStream, oq: ptr UncheckedArray[uint32], olen: int, n: var int) = - case stream.errormode - of DECODER_ERROR_MODE_FATAL: - stream.isend = true - stream.failed = true - of DECODER_ERROR_MODE_REPLACEMENT: - stream.append_codepoint 0xFFFD, oq, olen, n - -proc decodeUTF8(stream: DecoderStream, iq: openArray[uint8], - oq: ptr UncheckedArray[uint32], olen: int, n: var int) = - var c = stream.u8c - var needed = stream.u8needed - var seen = stream.u8seen - var bounds = stream.u8bounds - var i = 0 - while i < iq.len: - let b = iq[i] - if needed == 0: - case b - of 0x00u8 .. 0x7Fu8: - stream.append_codepoint uint32(b), oq, olen, n - of 0xC2u8 .. 0xDFu8: - needed = 1 - c = uint32(b) and 0x1F - of 0xE0u8: - bounds.a = 0xA0 - needed = 2 - c = uint32(b) and 0xF - of 0xEDu8: - bounds.b = 0x9F - needed = 2 - c = uint32(b) and 0xF - of 0xE1u8 .. 0xECu8, 0xEEu8 .. 0xEFu8: - needed = 2 - c = uint32(b) and 0xF - of 0xF0u8: - bounds.a = 0x90 - needed = 3 - c = uint32(b) and 0x7 - of 0xF4u8: - bounds.b = 0x8F - needed = 3 - c = uint32(b) and 0x7 - of 0xF1u8 .. 0xF3u8: - needed = 3 - c = uint32(b) and 0x7 - else: - stream.handleError(oq, olen, n) - if stream.isend: # fatal error - break - inc i - continue - if b notin bounds: - c = 0 - needed = 0 - seen = 0 - bounds = 0x80u8 .. 0xBFu8 - stream.handleError(oq, olen, n) - continue # prepend (no inc i) - bounds = 0x80u8 .. 0xBFu8 - c = (c shl 6) or (b and 0x3F) - inc seen - if seen == needed: - stream.append_codepoint c, oq, olen, n - c = 0 - needed = 0 - seen = 0 - inc i - stream.u8c = c - stream.u8bounds = bounds - stream.u8seen = seen - stream.u8needed = needed - -proc gb18RangesCodepoint(p: uint32): uint32 = - if p > 39419 and p < 189000 or p > 1237575: - return high(uint32) # null - if p == 7457: - return 0xE7C7 - # Let offset be the last pointer in index gb18030 ranges that is less than or - # equal to pointer and code point offset its corresponding code point. - var offset: uint32 - var c: uint32 - if p >= 189000: - # omitted from the map for storage efficiency - offset = 189000 - c = 0x10000 - elif p >= 39394: - # Needed because upperBound returns the first element greater than pointer - # OR last on failure, so we can't just remove one if p is e.g. 39400. - offset = 39394 - c = 0xFFE6 - else: - # Find the first range that is greater than p, or last if no such element - # is found. - # We want the last that is <=, so decrease index by one. - let i = upperBound(Gb18030RangesDecode, p, func(a: tuple[p, ucs: uint16], b: uint32): int = - cmp(uint32(a.p), b)) - let elem = Gb18030RangesDecode[i - 1] - offset = elem.p - c = elem.ucs - return c + p - offset - -proc decodeGb18030(stream: DecoderStream, iq: openArray[uint8], - oq: ptr UncheckedArray[uint32], olen: int, n: var int) = - var first = stream.gb18first - var second = stream.gb18second - var third = stream.gb18third - var buf = stream.gb18buf - var hasbuf = stream.gb18hasbuf - var i = 0 - while i < iq.len: - let b = if hasbuf: - hasbuf = false - dec i - buf - else: - iq[i] - if third != 0: - if b notin 0x30u8 .. 0x39u8: - hasbuf = true - buf = second - first = third - first = 0 - second = 0 - third = 0 - stream.handleError(oq, olen, n) - if stream.isend: break - continue # prepend (no inc i) - else: - let p = ((uint32(first) - 0x81) * 10 * 126 * 10) + - ((uint32(second) - 0x30) * (10 * 126)) + - ((uint32(third) - 0x81) * 10) + uint32(b) - 0x30 - let c = gb18RangesCodepoint(p) - first = 0 - second = 0 - third = 0 - if c == high(uint32): # null - stream.handleError(oq, olen, n) - if stream.isend: break - else: - stream.append_codepoint c, oq, olen, n - elif second != 0: - if b in 0x81u8 .. 0xFEu8: - third = b - else: - hasbuf = true - buf = second - first = 0 - second = 0 - third = 0 - stream.handleError(oq, olen, n) - if stream.isend: break - elif first != 0: - if b in 0x30u8 .. 0x39u8: - second = b - else: - let ff = first - first = 0 - if b in 0x40u8 .. 0x7Eu8: - let offset = if b < 0x7F: 0x40u32 else: 0x41u32 - let p = (uint16(ff) - 0x81) * 190 + (uint16(b) - offset) - if p < Gb18030Decode.len: - let c = Gb18030Decode[cast[uint16](p)] - stream.append_codepoint uint32(c), oq, olen, n - inc i - continue - if cast[char](b) in Ascii: - continue # prepend (no inc i) - else: - stream.handleError(oq, olen, n) - if stream.isend: break - elif cast[char](b) in Ascii: - stream.append_codepoint b, oq, olen, n - elif b == 0x80: - stream.append_codepoint 0x20AC, oq, olen, n - elif b in 0x81u8 .. 0xFEu8: - first = b - else: - stream.handleError(oq, olen, n) - if stream.isend: break - inc i - stream.gb18first = first - stream.gb18second = second - stream.gb18third = third - stream.gb18buf = buf - stream.gb18hasbuf = hasbuf - -proc decodeBig5(stream: DecoderStream, iq: openArray[uint8], - oq: ptr UncheckedArray[uint32], olen: int, n: var int) = - for b in iq: - if stream.big5lead != 0: - let lead = uint32(stream.big5lead) - stream.big5lead = 0 - let offset = if b < 0x7F: 0x40u16 else: 0x62u16 - if b in {0x40u8 .. 0x7Eu8, 0xA1u8 .. 0xFEu8}: - let p = (lead - 0x81) * 157 + uint16(b) - offset - template output_two(a, b: uint32) = - stream.append_codepoint a, oq, olen, n - stream.append_codepoint b, oq, olen, n - block no_continue: - case p - of 1133: output_two 0x00CA, 0x0304 - of 1135: output_two 0x00CA, 0x030C - of 1164: output_two 0x00EA, 0x0304 - of 1166: output_two 0x00EA, 0x030C - else: break no_continue - continue - if p < Big5Decode.len + Big5DecodeOffset: - let c = Big5Decode[p - Big5DecodeOffset] - if c != 0: - stream.append_codepoint c, oq, olen, n - continue - if cast[char](b) in Ascii: - stream.append_codepoint b, oq, olen, n - else: - stream.handleError(oq, olen, n) - if stream.isend: break - elif cast[char](b) in Ascii: - stream.append_codepoint b, oq, olen, n - elif b in 0x81u8 .. 0xFEu8: - stream.big5lead = b - else: - stream.handleError(oq, olen, n) - if stream.isend: break - -proc decodeEUCJP(stream: DecoderStream, iq: openArray[uint8], - oq: ptr UncheckedArray[uint32], olen: int, n: var int) = - var jis0212 = stream.eucjpjis0212 - var lead = stream.eucjplead - for b in iq: - if lead == 0x8E and b in 0xA1u8 .. 0xDFu8: - lead = 0 - stream.append_codepoint b, oq, olen, n - elif lead == 0x8F and b in 0xA1u8 .. 0xFEu8: - jis0212 = true - lead = b - elif lead != 0: - if lead in 0xA1u8 .. 0xFEu8 and b in 0xA1u8 .. 0xFEu8: - let p = (uint16(lead) - 0xA1) * 94 + uint16(b) - 0xA1 - lead = 0 - var c: uint16 - if jis0212: - if p < Jis0212Decode.len: - c = Jis0212Decode[p] - else: - if p < Jis0208Decode.len: - c = Jis0208Decode[p] - jis0212 = false - if c != 0: - stream.append_codepoint c, oq, olen, n - continue - else: - lead = 0 - stream.handleError(oq, olen, n) - if stream.isend: break - elif cast[char](b) in Ascii: - stream.append_codepoint b, oq, olen, n - elif b in {0x8Eu8, 0x8Fu8, 0xA1u8 .. 0xFEu8}: - lead = b - else: - stream.handleError(oq, olen, n) - if stream.isend: break - stream.eucjpjis0212 = jis0212 - stream.eucjplead = lead - -proc decodeISO2022JP(stream: DecoderStream, iq: openArray[uint8], - oq: ptr UncheckedArray[uint32], olen: int, n: var int) = - var i = 0 - var lead = stream.iso2022jplead - var state = stream.iso2022jpstate - var output = stream.iso2022jpoutput - var outputstate = stream.iso2022jpoutputstate - var buf = stream.iso2022jpbuf - var hasbuf = stream.iso2022jphasbuf - while i < iq.len: - let b = if hasbuf: - hasbuf = false - dec i - buf - else: - iq[i] - case state - of STATE_ASCII: - case b - of 0x1B: state = STATE_ESCAPE_START - of {0x00u8..0x7Fu8} - {0x0Eu8, 0x0Fu8, 0x1Bu8}: - output = false - stream.append_codepoint b, oq, olen, n - else: - output = false - stream.handleError(oq, olen, n) - if stream.isend: break - of STATE_ROMAN: - case b - of 0x1B: state = STATE_ESCAPE_START - of 0x5C: - output = false - stream.append_codepoint 0x00A5, oq, olen, n - of 0x7E: - output = false - stream.append_codepoint 0x203E, oq, olen, n - of {0x00u8..0x7Fu8} - {0x0Eu8, 0x0Fu8, 0x1Bu8, 0x5Cu8, 0x7Eu8}: - output = false - stream.append_codepoint b, oq, olen, n - else: - output = false - stream.handleError(oq, olen, n) - if stream.isend: break - of STATE_KATAKANA: - case b - of 0x1B: state = STATE_ESCAPE_START - of 0x21u8..0x5Fu8: - output = false - stream.append_codepoint 0xFF61u16 - 0x21 + uint16(b), oq, olen, n - else: - output = false - stream.handleError(oq, olen, n) - if stream.isend: break - of STATE_LEAD_BYTE: - case b - of 0x1B: state = STATE_ESCAPE_START - of 0x21u8..0x7Eu8: - output = false - lead = b - state = STATE_TRAIL_BYTE - else: - output = false - stream.handleError(oq, olen, n) - if stream.isend: break - of STATE_TRAIL_BYTE: - case b - of 0x1B: - state = STATE_ESCAPE_START - stream.handleError(oq, olen, n) - if stream.isend: break - of 0x21u8..0x7Eu8: - state = STATE_LEAD_BYTE - let p = (uint16(lead) - 0x21) * 94 + uint16(b) - 0x21 - if p < Jis0208Decode.len: - let c = Jis0208Decode[p] - if c != 0: - stream.append_codepoint c, oq, olen, n - else: - stream.handleError(oq, olen, n) - if stream.isend: break - else: - state = STATE_LEAD_BYTE - stream.handleError(oq, olen, n) - if stream.isend: break - of STATE_ESCAPE_START: - if b == 0x24 or b == 0x28: - lead = b - state = STATE_ESCAPE - else: - output = false - state = outputstate - stream.handleError(oq, olen, n) - if stream.isend: break - continue # prepend (no inc i) - of STATE_ESCAPE: - let l = lead - lead = 0 - block statenonnull: - var s: ISO2022JPState - if l == 0x28: - case b - of 0x42: s = STATE_ASCII - of 0x4A: s = STATE_ROMAN - of 0x49: s = STATE_KATAKANA - else: break statenonnull - elif l == 0x24 and b in {0x40u8, 0x42u8}: - s = STATE_LEAD_BYTE - else: break statenonnull - state = s - outputstate = s - if output: - stream.handleError(oq, olen, n) - if stream.isend: - break - output = true - inc i - continue - output = false - state = outputstate - stream.handleError(oq, olen, n) - if stream.isend: break - hasbuf = true - buf = l - continue # prepend (no inc i) - inc i - stream.iso2022jphasbuf = hasbuf - stream.iso2022jpbuf = buf - stream.iso2022jplead = lead - stream.iso2022jpstate = state - stream.iso2022jpoutput = output - stream.iso2022jpoutputstate = outputstate - -proc decodeShiftJIS(stream: DecoderStream, iq: openArray[uint8], - oq: ptr UncheckedArray[uint32], olen: int, n: var int) = - var lead = stream.sjislead - var i = 0 - while i < iq.len: - let b = iq[i] - if lead != 0: - var ptrisnull = true; - var p = 0u16 - let offset = if b < 0x7Fu8: 0x40u16 else: 0x41u16 - let leadoffset = if lead < 0xA0: 0x81u16 else: 0xC1u16 - if b in 0x40u8..0x7Eu8 or b in 0x80u8..0xFCu8: - p = (uint16(lead) - leadoffset) * 188 + uint16(b) - offset - ptrisnull = false - lead = 0 - if not ptrisnull and p in 8836u16..10715u16: - stream.append_codepoint 0xE000u16 - 8836 + p, oq, olen, n - inc i - continue - elif not ptrisnull and p < Jis0208Decode.len and Jis0208Decode[p] != 0: - let c = Jis0208Decode[p] - stream.append_codepoint c, oq, olen, n - else: - stream.handleError(oq, olen, n) - if stream.isend: break - if cast[char](b) in Ascii: - continue # prepend (no inc i) - elif cast[char](b) in Ascii or b == 0x80: - stream.append_codepoint b, oq, olen, n - elif b in 0xA1u8..0xDFu8: - stream.append_codepoint 0xFF61u16 - 0xA1 + uint16(b), oq, olen, n - elif b in {0x81..0x9F} + {0xE0..0xFC}: - lead = b - else: - stream.handleError(oq, olen, n) - if stream.isend: break - inc i - stream.sjislead = lead - -proc decodeEUCKR(stream: DecoderStream, iq: openArray[uint8], - oq: ptr UncheckedArray[uint32], olen: int, n: var int) = - var lead = stream.euckrlead - for b in iq: - if lead != 0: - if b in 0x41u8..0xFEu8: - let p = (uint16(lead) - 0x81) * 190 + (uint16(b) - 0x41) - if p < EUCKRDecode.len and EUCKRDecode[p] != 0: - let c = EUCKRDecode[p] - stream.append_codepoint c, oq, olen, n - continue - stream.handleError(oq, olen, n) - if stream.isend: break - elif cast[char](b) in Ascii: - stream.append_codepoint b, oq, olen, n - elif b in {0x81u8..0xFEu8}: - lead = b - else: - stream.handleError(oq, olen, n) - if stream.isend: break - stream.euckrlead = lead - -proc decodeUTF16(stream: DecoderStream, iq: openArray[uint8], - oq: ptr UncheckedArray[uint32], olen: int, n: var int, be: static bool) = - var i = 0 - var lead = stream.u16lead - var haslead = stream.u16haslead - var surr = stream.u16surr - var hassurr = stream.u16hassurr - while i < iq.len: - if not haslead: - haslead = true - lead = iq[i] - else: - let cu = if be: - (uint16(lead) shl 8) + uint16(iq[i]) - else: - (uint16(iq[i]) shl 8) + uint16(lead) - haslead = false - if hassurr: - hassurr = false - if cu in 0xDC00u16 .. 0xDFFFu16: - let c = 0x10000 + ((uint32(surr) - 0xD800) shl 10) + (uint32(cu) - 0xDC00) - stream.append_codepoint c, oq, olen, n - inc i - continue - haslead = true # prepend the last two bytes - stream.handleError(oq, olen, n) - continue - if cu in 0xD800u16 .. 0xDBFFu16: - surr = cu - hassurr = true - inc i - continue - elif cu in 0xDC00u16 .. 0xDFFFu16: - stream.handleError(oq, olen, n) - if stream.isend: # fatal error - break - else: - inc i - continue - stream.append_codepoint uint32(cu), oq, olen, n - inc i - stream.u16lead = lead - stream.u16haslead = haslead - stream.u16surr = surr - stream.u16hassurr = hassurr - -proc decodeUTF16LE(stream: DecoderStream, iq: openArray[uint8], - oq: ptr UncheckedArray[uint32], olen: int, n: var int) = - stream.decodeUTF16(iq, oq, olen, n, false) - -proc decodeUTF16BE(stream: DecoderStream, iq: openArray[uint8], - oq: ptr UncheckedArray[uint32], olen: int, n: var int) = - stream.decodeUTF16(iq, oq, olen, n, true) - -proc decodeXUserDefined(stream: DecoderStream, iq: openArray[uint8], - oq: ptr UncheckedArray[uint32], olen: int, n: var int) = - for b in iq: - let c = cast[char](b) - if c in Ascii: - stream.append_codepoint c, oq, olen, n - else: - let c = 0xF780 + uint32(c) - 0x80 - stream.append_codepoint c, oq, olen, n - -proc decodeSingleByte(stream: DecoderStream, iq: openArray[uint8], - oq: ptr UncheckedArray[uint32], olen: int, n: var int, - map: array[char, uint16]) = - for b in iq: - let c = cast[char](b) - if c in Ascii: - stream.append_codepoint c, oq, olen, n - else: - let p = map[cast[char](b - 0x80)] - if p == 0u16: - stream.handleError(oq, olen, n) - else: - stream.append_codepoint uint32(p), oq, olen, n - -proc decodeReplacement(stream: DecoderStream, oq: ptr UncheckedArray[uint32], - olen: int, n: var int) = - if not stream.replreported: - stream.replreported = true - stream.handleError(oq, olen, n) - # I think that's it? - -# copy any data remaining from previous passes -proc copyBuffers(stream: DecoderStream, oq: ptr UncheckedArray[uint32], olen: int): int = - if stream.bufs.len == 1: - # one page: stream.bs ..< stream.bi - let n = min((stream.bi - stream.bs) * sizeof(stream.bufs[0][0]), olen) - copyMem(addr oq[0], addr stream.bufs[0][stream.bs], n) - stream.bs += n div sizeof(uint32) - if stream.bs >= stream.bi: - # read entire page; recycle it - stream.bs = 0 - stream.bi = 0 - return n - else: - # multiple pages: - # stream.bs ..< stream.buflen - # 0 ..< stream.buflen - # ... - # 0 ..< stream.bi - let a = (stream.buflen - stream.bs) * sizeof(stream.bufs[0][0]) - if a < olen: - copyMem(addr oq[0], addr stream.bufs[0][stream.bs], a) - var ns = a - stream.bs = 0 - var i = 1 - while i < stream.bufs.high: - let n = min(stream.buflen * sizeof(stream.bufs[0][0]), olen - ns) - copyMem(addr oq[ns div sizeof(uint32)], addr stream.bufs[i][0], n) - ns += n - if ns >= olen: - # i'th buffer still has contents. - stream.bs = n div sizeof(uint32) - break - stream.bs = 0 - inc i - if ns < olen: - # last page - let n = min(stream.bi * sizeof(stream.bufs[0][0]), olen - ns) - copyMem(addr oq[ns div sizeof(uint32)], addr stream.bufs[i][0], n) - ns += n - stream.bs = n div sizeof(uint32) - if stream.bs >= stream.bi: - # read entire page; recycle it - stream.bs = 0 - stream.bi = 0 - for j in i ..< stream.bufs.len: - stream.bufs[j - i] = stream.bufs[j] - stream.bufs.setLen(stream.bufs.len - i) - return ns - elif a > olen: - copyMem(addr oq[0], addr stream.bufs[0][stream.bs], olen) - stream.bs += olen div sizeof(uint32) - assert stream.bs < stream.buflen - return olen - else: # a == olen - copyMem(addr oq[0], addr stream.bufs[0][stream.bs], a) - stream.bs = 0 - stream.bufs.delete(0) - return a - -proc checkEnd(stream: DecoderStream, oq: ptr UncheckedArray[uint32], olen: int, - n: var int) = - if not stream.isend and stream.bufs.len == 1 and - stream.bs >= stream.bi and stream.source.atEnd: - stream.isend = true - case stream.charset - of CHARSET_UTF_16_LE, CHARSET_UTF_16_BE: - if stream.u16haslead or stream.u16hassurr: - stream.handleError(oq, olen, n) - of CHARSET_UTF_8: - if stream.u8needed != 0: - stream.handleError(oq, olen, n) - of CHARSET_GB18030, CHARSET_GBK: - if stream.gb18first != 0 or stream.gb18second != 0 or stream.gb18third != 0: - stream.handleError(oq, olen, n) - of CHARSET_BIG5: - if stream.big5lead != 0: - stream.handleError(oq, olen, n) - of CHARSET_EUC_JP: - if stream.eucjplead != 0: - stream.handleError(oq, olen, n) - of CHARSET_ISO_2022_JP: - case stream.iso2022jpstate - of STATE_ASCII, STATE_ROMAN, STATE_KATAKANA, STATE_LEAD_BYTE: discard - of STATE_TRAIL_BYTE: - stream.handleError(oq, olen, n) - of STATE_ESCAPE_START: - stream.handleError(oq, olen, n) - of STATE_ESCAPE: - stream.isend = false - stream.iso2022jpbuf = stream.iso2022jplead - stream.iso2022jphasbuf = true - stream.iso2022jplead = 0 - stream.iso2022jpoutput = false - stream.iso2022jpstate = stream.iso2022jpoutputstate - stream.handleError(oq, olen, n) - of CHARSET_SHIFT_JIS: - if stream.sjislead != 0: - stream.handleError(oq, olen, n) - of CHARSET_EUC_KR: - if stream.euckrlead != 0: - stream.handleError(oq, olen, n) - else: discard - -proc prepend*(stream: DecoderStream, c: uint32) = - append_codepoint_buf stream, c - -const ReadSize = 4096 -proc readData*(stream: DecoderStream, buffer: pointer, olen: int): int = - const l = sizeof(stream.bufs[0][0]) - assert olen mod l == 0, "Buffer size must be divisible by " & $l - if olen == 0: return - let oq = cast[ptr UncheckedArray[uint32]](buffer) - result = stream.copyBuffers(oq, olen) - let olen = olen - result - if olen == 0 or stream.source.atEnd: - # either output filled with buffered data; nothing to decode - # or we're at the end of the source stream - stream.checkEnd(oq, olen, result) - return result - var iq = newSeqUninitialized[uint8](ReadSize) - let ilen = stream.source.readData(cast[pointer](addr iq[0]), ReadSize) - if ilen == 0: - stream.checkEnd(oq, olen, result) - return result - template iqoa: openArray[uint8] = toOpenArray(iq, 0, ilen - 1) - case stream.charset - of CHARSET_UTF_8: - stream.decodeUTF8(iqoa, oq, olen, result) - of CHARSET_IBM866: - stream.decodeSingleByte(iqoa, oq, olen, result, IBM866Decode) - of CHARSET_ISO_8859_2: - stream.decodeSingleByte(iqoa, oq, olen, result, ISO88592Decode) - of CHARSET_ISO_8859_3: - stream.decodeSingleByte(iqoa, oq, olen, result, ISO88593Decode) - of CHARSET_ISO_8859_4: - stream.decodeSingleByte(iqoa, oq, olen, result, ISO88594Decode) - of CHARSET_ISO_8859_5: - stream.decodeSingleByte(iqoa, oq, olen, result, ISO88595Decode) - of CHARSET_ISO_8859_6: - stream.decodeSingleByte(iqoa, oq, olen, result, ISO88596Decode) - of CHARSET_ISO_8859_7: - stream.decodeSingleByte(iqoa, oq, olen, result, ISO88597Decode) - of CHARSET_ISO_8859_8, - CHARSET_ISO_8859_8_I: - stream.decodeSingleByte(iqoa, oq, olen, result, ISO88598Decode) - of CHARSET_ISO_8859_10: - stream.decodeSingleByte(iqoa, oq, olen, result, ISO885910Decode) - of CHARSET_ISO_8859_13: - stream.decodeSingleByte(iqoa, oq, olen, result, ISO885913Decode) - of CHARSET_ISO_8859_14: - stream.decodeSingleByte(iqoa, oq, olen, result, ISO885914Decode) - of CHARSET_ISO_8859_15: - stream.decodeSingleByte(iqoa, oq, olen, result, ISO885915Decode) - of CHARSET_ISO_8859_16: - stream.decodeSingleByte(iqoa, oq, olen, result, ISO885916Decode) - of CHARSET_KOI8_R: - stream.decodeSingleByte(iqoa, oq, olen, result, KOI8RDecode) - of CHARSET_KOI8_U: - stream.decodeSingleByte(iqoa, oq, olen, result, KOI8UDecode) - of CHARSET_MACINTOSH: - stream.decodeSingleByte(iqoa, oq, olen, result, MacintoshDecode) - of CHARSET_WINDOWS_874: - stream.decodeSingleByte(iqoa, oq, olen, result, Windows874Decode) - of CHARSET_WINDOWS_1250: - stream.decodeSingleByte(iqoa, oq, olen, result, Windows1250Decode) - of CHARSET_WINDOWS_1251: - stream.decodeSingleByte(iqoa, oq, olen, result, Windows1251Decode) - of CHARSET_WINDOWS_1252: - stream.decodeSingleByte(iqoa, oq, olen, result, Windows1252Decode) - of CHARSET_WINDOWS_1253: - stream.decodeSingleByte(iqoa, oq, olen, result, Windows1253Decode) - of CHARSET_WINDOWS_1254: - stream.decodeSingleByte(iqoa, oq, olen, result, Windows1254Decode) - of CHARSET_WINDOWS_1255: - stream.decodeSingleByte(iqoa, oq, olen, result, Windows1255Decode) - of CHARSET_WINDOWS_1256: - stream.decodeSingleByte(iqoa, oq, olen, result, Windows1256Decode) - of CHARSET_WINDOWS_1257: - stream.decodeSingleByte(iqoa, oq, olen, result, Windows1257Decode) - of CHARSET_WINDOWS_1258: - stream.decodeSingleByte(iqoa, oq, olen, result, Windows1258Decode) - of CHARSET_X_MAC_CYRILLIC: - stream.decodeSingleByte(iqoa, oq, olen, result, XMacCyrillicDecode) - of CHARSET_GBK, CHARSET_GB18030: - stream.decodeGb18030(iqoa, oq, olen, result) - of CHARSET_BIG5: - stream.decodeBig5(iqoa, oq, olen, result) - of CHARSET_EUC_JP: - stream.decodeEUCJP(iqoa, oq, olen, result) - of CHARSET_ISO_2022_JP: - stream.decodeISO2022JP(iqoa, oq, olen, result) - of CHARSET_SHIFT_JIS: - stream.decodeShiftJIS(iqoa, oq, olen, result) - of CHARSET_EUC_KR: - stream.decodeEUCKR(iqoa, oq, olen, result) - of CHARSET_REPLACEMENT: - stream.decodeReplacement(oq, olen, result) - of CHARSET_UTF_16_LE: - stream.decodeUTF16LE(iqoa, oq, olen, result) - of CHARSET_UTF_16_BE: - stream.decodeUTF16BE(iqoa, oq, olen, result) - of CHARSET_X_USER_DEFINED: - stream.decodeXUserDefined(iqoa, oq, olen, result) - of CHARSET_UNKNOWN: - doAssert false, "Somebody forgot to set the character set here" - stream.checkEnd(oq, olen, result) - -# Returns the number of bytes read. -proc readData*(stream: DecoderStream, buf: var openarray[uint32]): int = - return stream.readData(addr buf[0], buf.len * sizeof(buf[0])) - -proc atEnd*(stream: DecoderStream): bool = - return stream.isend - -# Read all and convert to UTF-8. -# Probably not very efficient. Oh well. -proc readAll*(stream: DecoderStream): string = - var buf = newSeqUninitialized[uint32](stream.buflen) - while not stream.atEnd: - let n = stream.readData(buf) - for i in 0 ..< n div 4: - let r = cast[Rune](buf[i]) - result &= $r - -proc newDecoderStream*(source: Stream, cs = CHARSET_UTF_8, buflen = 1024, - errormode = DECODER_ERROR_MODE_REPLACEMENT): DecoderStream = - result = DecoderStream( - source: source, - charset: cs, - buflen: buflen, - errormode: errormode - ) - when nimvm: - result.bufs = @[newSeq[uint32](buflen)] - else: - result.bufs = @[newSeqUninitialized[uint32](buflen)] - case cs - of CHARSET_UTF_8: - result.u8bounds = 0x80u8 .. 0xBFu8 - else: discard diff --git a/src/encoding/encoderstream.nim b/src/encoding/encoderstream.nim deleted file mode 100644 index 45911579..00000000 --- a/src/encoding/encoderstream.nim +++ /dev/null @@ -1,533 +0,0 @@ -# Heavily based on https://encoding.spec.whatwg.org/ - -import algorithm -import streams -import unicode - -import data/charset -import utils/map - -# EncoderStream encodes utf-32 to the specified encoding. -type - EncoderErrorMode* = enum - ENCODER_ERROR_MODE_FATAL, ENCODER_ERROR_MODE_HTML - - ISO2022JPState = enum - STATE_ASCII, STATE_ROMAN, STATE_JIS0208 - - EncoderStream* = ref object - source: Stream - errormode: EncoderErrorMode - isend: bool - failed*: bool - bufs: seq[seq[uint8]] - bs: int - bi: int - buflen: int - errc: uint32 - case charset: Charset - of CHARSET_ISO_2022_JP: - iso2022jpstate: ISO2022JPState - else: discard - -template append_byte_buf(stream: EncoderStream, c: uint8) = - if stream.bi >= stream.buflen: - stream.bufs.add(newSeqUninitialized[uint8](stream.buflen)) - stream.bi = 0 - stream.bufs[^1][stream.bi] = c - inc stream.bi - -template append_byte(stream: EncoderStream, c: uint8, - oq: ptr UncheckedArray[uint8], olen: int, n: var int) = - if n < olen: - oq[n] = c - inc n - else: - append_byte_buf stream, c - -template append_byte(stream: EncoderStream, c: char, - oq: ptr UncheckedArray[uint8], olen: int, n: var int) = - stream.append_byte cast[uint8](c), oq, olen, n - -template append_byte(stream: EncoderStream, c: uint32, - oq: ptr UncheckedArray[uint8], olen: int, n: var int) = - stream.append_byte cast[uint8](c), oq, olen, n - -template append_byte(stream: EncoderStream, c: int, - oq: ptr UncheckedArray[uint8], olen: int, n: var int) = - stream.append_byte cast[uint8](c), oq, olen, n - -func findPair[U, V](map: seq[(U, V)], c: uint32): int = - return searchInMap(map, cast[U](c)) - -proc handleError(stream: EncoderStream, oq: ptr UncheckedArray[uint8], - olen: int, n: var int, c: uint32) = - case stream.errormode - of ENCODER_ERROR_MODE_FATAL: - stream.isend = true - stream.failed = true - of ENCODER_ERROR_MODE_HTML: - stream.append_byte '&', oq, olen, n - stream.append_byte '#', oq, olen, n - if stream.errc == 0: - stream.append_byte '0', oq, olen, n - else: - while stream.errc > 0: - stream.append_byte cast[char](0x30 + stream.errc mod 10), oq, olen, n - stream.errc = stream.errc div 10 - stream.append_byte ';', oq, olen, n - -proc gb18030RangesPointer(c: uint32): uint32 = - if c == 0xE7C7: - return 7457 - # Let offset be the last pointer in index gb18030 ranges that is less than or - # equal to pointer and code point offset its corresponding code point. - var offset: uint32 - var p: uint32 - if c >= 0x10000: - # omitted from the map for storage efficiency - offset = 0x10000 - p = 189000 - elif c >= 0xFFE6: - # Needed because upperBound returns the first element greater than pointer - # OR last on failure, so we can't just remove one if p is e.g. 39400. - offset = 0xFFE6 - p = 39394 - else: - # Find the first range that is greater than p, or last if no such element - # is found. - # We want the last that is <=, so decrease index by one. - let i = upperBound(Gb18030RangesEncode, c, func(a: tuple[ucs, p: uint16], b: uint32): int = - cmp(uint32(a.ucs), b)) - let elem = Gb18030RangesEncode[i - 1] - offset = elem.ucs - p = elem.p - return p + c - offset - -proc encodeUTF8(stream: EncoderStream, iq: openArray[uint32], - oq: ptr UncheckedArray[uint8], olen: int, n: var int) = - for c in iq: - if c < 0x80: - stream.append_byte c, oq, olen, n - else: - var count: int - var offset: uint8 - case c - of 0x80..0x7FF: - count = 1 - offset = 0xC0 - of 0x800..0xFFFF: - count = 2 - offset = 0xE0 - of 0x10000..0x10FFFF: - count = 3 - offset = 0xF0 - else: - assert false - {.linearScanEnd.} - stream.append_byte (c shr (6 * count)) + offset, oq, olen, n - for j in countdown(count - 1, 0): - let tmp = c shr (6 * j) - stream.append_byte 0x80 or (tmp and 0x3F), oq, olen, n - -proc encodeSingleByte(stream: EncoderStream, iq: openArray[uint32], - oq: ptr UncheckedArray[uint8], olen: int, n: var int, - map: seq[tuple[ucs: uint16, val: char]]) = - for c in iq: - if c < 0x80: - stream.append_byte cast[uint8](c), oq, olen, n - continue - let j = findPair(map, c) - if j != -1: - stream.append_byte cast[uint8](map[j].val) + 0x80, oq, olen, n - else: - stream.handleError(oq, olen, n, c) - -proc encodeXUserDefined(stream: EncoderStream, iq: openArray[uint32], - oq: ptr UncheckedArray[uint8], olen: int, n: var int) = - for c in iq: - if c < 0x80: - stream.append_byte cast[uint8](c), oq, olen, n - continue - if c in 0xF780u32..0xF7FFu32: - let b = cast[uint8](c - 0xF780 + 0x80) - stream.append_byte b, oq, olen, n - continue - stream.handleError(oq, olen, n, c) - -proc encodeGb18030(stream: EncoderStream, iq: openArray[uint32], - oq: ptr UncheckedArray[uint8], olen: int, n: var int, isGBK = false) = - for c in iq: - if isGBK and c == 0x20AC: - stream.append_byte 0x80, oq, olen, n - continue - let i = if c > 0xFFFF: -1 else: findPair(Gb18030Encode, c) - if i != -1: - let p = Gb18030Encode[i].p - let lead = p div 190 + 0x81 - let trail = p mod 190 - let offset: uint8 = if trail < 0x3F: 0x40 else: 0x41 - stream.append_byte lead, oq, olen, n - stream.append_byte cast[uint8](trail) + offset, oq, olen, n - continue - if isGBK: - stream.handleError(oq, olen, n, c) - continue - var p = gb18030RangesPointer(c) - let b1 = p div (10 * 126 * 10) - p = p mod (10 * 126 * 10) - let b2 = p div (10 * 126) - p = p mod (10 * 126) - let b3 = p div 10 - let b4 = p mod 10 - stream.append_byte b1, oq, olen, n - stream.append_byte b2, oq, olen, n - stream.append_byte b3, oq, olen, n - stream.append_byte b4, oq, olen, n - -proc encodeBig5(stream: EncoderStream, iq: openArray[uint32], - oq: ptr UncheckedArray[uint8], olen: int, n: var int) = - for c in iq: - if c < 0x80: - stream.append_byte c, oq, olen, n - continue - let i = findPair(Big5Encode, c) - if i == -1: - stream.handleError(oq, olen, n, c) - continue - let p = Big5Encode[i].p - let lead = p div 157 + 0x81 - let trail = p mod 157 - let offset: uint8 = if trail < 0x3F: 0x40 else: 0x62 - stream.append_byte lead, oq, olen, n - stream.append_byte cast[uint8](trail) + offset, oq, olen, n - -proc encodeEUCJP(stream: EncoderStream, iq: openArray[uint32], - oq: ptr UncheckedArray[uint8], olen: int, n: var int) = - for c in iq: - if c < 0x80: - stream.append_byte c, oq, olen, n - elif c == 0xA5: - stream.append_byte 0x5C, oq, olen, n - elif c == 0x203E: - stream.append_byte 0x5C, oq, olen, n - elif c in 0xFF61u32..0xFF9Fu32: - stream.append_byte 0x8E, oq, olen, n - stream.append_byte c - 0xFF61 + 0xA1, oq, olen, n - else: - let c = if c == 0x2212: - 0xFF0Du32 - else: - c - let i = findPair(Jis0208Encode, c) - if i != -1: - let p = Jis0208Encode[i].p - let lead = p div 94 + 0xA1 - let trail = p mod 94 + 0xA1 - stream.append_byte lead, oq, olen, n - stream.append_byte trail, oq, olen, n - else: - stream.handleError(oq, olen, n, c) - -proc encodeISO2022JP(stream: EncoderStream, iq: openArray[uint32], - oq: ptr UncheckedArray[uint8], olen: int, n: var int) = - var state = stream.iso2022jpstate - var i = 0 - while i < iq.len: - let c = iq[i] - if state in {STATE_ASCII, STATE_ROMAN} and - c in [0x0Eu32, 0x0Fu32, 0x1Bu32]: - stream.handleError(oq, olen, n, 0xFFFD) - elif state == STATE_ASCII and c < 0x80 and c notin [0x5Cu32, 0x7Eu32] or - c == 0xA5 or c == 0x203E: - if c < 0x80: - stream.append_byte c, oq, olen, n - elif c == 0xA5: - stream.append_byte 0xA5, oq, olen, n - elif c == 0x203E: - stream.append_byte 0x7E, oq, olen, n - elif c < 0x80 and state != STATE_ASCII: - state = STATE_ASCII - stream.append_byte 0x1B, oq, olen, n - stream.append_byte 0x28, oq, olen, n - stream.append_byte 0x42, oq, olen, n - # prepend - continue - elif c == 0xA5 or c == 0x203E and state != STATE_ROMAN: - state = STATE_ROMAN - stream.append_byte 0x1B, oq, olen, n - stream.append_byte 0x28, oq, olen, n - stream.append_byte 0x4A, oq, olen, n - # prepend - continue - else: - var c = c - if c == 0x2212: - c = 0xFF0D - if c in 0xFF61u32..0xFF9Fu32: - let j = findPair(ISO2022JPKatakanaEncode, c - 0xFF61) - c = ISO2022JPKatakanaEncode[j].ucs - let j = findPair(Jis0208Encode, c) - if j == -1: - if state == STATE_JIS0208: - state = STATE_ASCII - stream.append_byte 0x1B, oq, olen, n - stream.append_byte 0x28, oq, olen, n - stream.append_byte 0x42, oq, olen, n - # prepend - continue - stream.handleError(oq, olen, n, c) - else: - let p = Jis0208Encode[j].p - if state != STATE_JIS0208: - state = STATE_JIS0208 - stream.append_byte 0x1B, oq, olen, n - stream.append_byte 0x24, oq, olen, n - stream.append_byte 0x42, oq, olen, n - # prepend - continue - let lead = p div 94 + 0x21 - let trail = p mod 94 + 0x21 - stream.append_byte lead, oq, olen, n - stream.append_byte trail, oq, olen, n - inc i - stream.iso2022jpstate = state - -proc encodeShiftJIS(stream: EncoderStream, iq: openArray[uint32], - oq: ptr UncheckedArray[uint8], olen: int, n: var int) = - for c in iq: - if c <= 0x80: - stream.append_byte c, oq, olen, n - elif c == 0xA5: - stream.append_byte 0x5C, oq, olen, n - elif c == 0x203E: - stream.append_byte 0x7E, oq, olen, n - elif c in 0xFF61u32..0xFF9Fu32: - stream.append_byte c - 0xFF61 + 0xA1, oq, olen, n - else: - let c = if c == 0x2212: 0xFF0Du32 else: c - let j = findPair(ShiftJISEncode, c) - if j == -1: - stream.handleError(oq, olen, n, c) - else: - let p = ShiftJISEncode[j].p - let lead = p div 188 - let lead_offset = if lead < 0x1F: 0x81u16 else: 0xC1u16 - let trail = p mod 188 - let offset = if trail < 0x3F: 0x40u16 else: 0x41u16 - stream.append_byte lead + lead_offset, oq, olen, n - stream.append_byte trail + offset, oq, olen, n - -proc encodeEUCKR(stream: EncoderStream, iq: openArray[uint32], - oq: ptr UncheckedArray[uint8], olen: int, n: var int) = - for c in iq: - if c < 0x80: - stream.append_byte c, oq, olen, n - else: - let i = findPair(Jis0208Encode, c) - if i != -1: - let p = Jis0208Encode[i].p - let lead = p div 190 + 0x81 - let trail = p mod 190 + 0x41 - stream.append_byte lead, oq, olen, n - stream.append_byte trail, oq, olen, n - else: - stream.handleError(oq, olen, n, c) - -# copy any data remaining from previous passes -proc copyBuffers(stream: EncoderStream, oq: ptr UncheckedArray[uint8], olen: int): int = - if stream.bufs.len == 1: - # one page: stream.bs ..< stream.bi - let n = min((stream.bi - stream.bs) * sizeof(stream.bufs[0][0]), olen) - copyMem(addr oq[0], addr stream.bufs[0][stream.bs], n) - stream.bs += n - if stream.bs >= stream.bi: - # read entire page; recycle it - stream.bs = 0 - stream.bi = 0 - return n - else: - # multiple pages: - # stream.bs ..< stream.buflen - # 0 ..< stream.buflen - # ... - # 0 ..< stream.bi - let a = (stream.buflen - stream.bs) * sizeof(stream.bufs[0][0]) - if a < olen: - copyMem(addr oq[0], addr stream.bufs[0][stream.bs], a) - var ns = a - stream.bs = 0 - var i = 1 - while i < stream.bufs.high: - let n = min(stream.buflen * sizeof(stream.bufs[0][0]), olen - ns) - copyMem(addr oq[ns], addr stream.bufs[i][0], n) - ns += n - if ns >= olen: - # i'th buffer still has contents. - stream.bs = n - break - stream.bs = 0 - inc i - if ns < olen: - # last page - let n = min(stream.bi * sizeof(stream.bufs[0][0]), olen - ns) - copyMem(addr oq[ns], addr stream.bufs[i][0], n) - ns += n - stream.bs = n - if stream.bs >= stream.bi: - # read entire page; recycle it - stream.bs = 0 - stream.bi = 0 - for j in i ..< stream.bufs.len: - stream.bufs[j - i] = stream.bufs[j] - stream.bufs.setLen(stream.bufs.len - i) - return ns - elif a > olen: - copyMem(addr oq[0], addr stream.bufs[0][stream.bs], olen) - stream.bs += olen - assert stream.bs < stream.buflen - return olen - else: # a == olen - copyMem(addr oq[0], addr stream.bufs[0][stream.bs], a) - stream.bs = 0 - stream.bufs.delete(0) - return a - -proc checkEnd(stream: EncoderStream, oq: ptr UncheckedArray[uint8], olen: int, - n: var int) = - if not stream.isend and stream.bufs.len == 1 and - stream.bs >= stream.bi and stream.source.atEnd: - stream.isend = true - if stream.charset == CHARSET_ISO_2022_JP: - if stream.iso2022jpstate != STATE_ASCII: - stream.append_byte 0x1B, oq, olen, n - stream.append_byte 0x28, oq, olen, n - stream.append_byte 0x42, oq, olen, n - -const ReadSize = 4096 -var iq {.threadVar.}: array[ReadSize div sizeof(uint32), uint32] -proc readData*(stream: EncoderStream, buffer: pointer, olen: int): int = - if olen == 0: return - let oq = cast[ptr UncheckedArray[uint8]](buffer) - result = stream.copyBuffers(oq, olen) - let olen = olen - result - if olen == 0 or stream.source.atEnd: - # either output filled with buffered data; nothing to decode - # or we're at the end of the source stream - stream.checkEnd(oq, olen, result) - return result - let ilen0 = stream.source.readData(cast[pointer](addr iq[0]), ReadSize) - #TODO what if ilen0 is 0? - assert ilen0 != 0 - assert ilen0 mod sizeof(uint32) == 0 #TODO what to do if false? - let ilen = ilen0 div sizeof(uint32) - template iqoa: openArray[uint32] = - toOpenArray(iq, 0, ilen - 1) - case stream.charset - of CHARSET_UTF_8: - stream.encodeUTF8(iqoa, oq, olen, result) - of CHARSET_IBM866: - stream.encodeSingleByte(iqoa, oq, olen, result, IBM866Encode) - of CHARSET_ISO_8859_2: - stream.encodeSingleByte(iqoa, oq, olen, result, ISO88592Encode) - of CHARSET_ISO_8859_3: - stream.encodeSingleByte(iqoa, oq, olen, result, ISO88593Encode) - of CHARSET_ISO_8859_4: - stream.encodeSingleByte(iqoa, oq, olen, result, ISO88594Encode) - of CHARSET_ISO_8859_5: - stream.encodeSingleByte(iqoa, oq, olen, result, ISO88595Encode) - of CHARSET_ISO_8859_6: - stream.encodeSingleByte(iqoa, oq, olen, result, ISO88596Encode) - of CHARSET_ISO_8859_7: - stream.encodeSingleByte(iqoa, oq, olen, result, ISO88597Encode) - of CHARSET_ISO_8859_8, CHARSET_ISO_8859_8_I: - stream.encodeSingleByte(iqoa, oq, olen, result, ISO88598Encode) - of CHARSET_ISO_8859_10: - stream.encodeSingleByte(iqoa, oq, olen, result, ISO885910Encode) - of CHARSET_ISO_8859_13: - stream.encodeSingleByte(iqoa, oq, olen, result, ISO885913Encode) - of CHARSET_ISO_8859_14: - stream.encodeSingleByte(iqoa, oq, olen, result, ISO885914Encode) - of CHARSET_ISO_8859_15: - stream.encodeSingleByte(iqoa, oq, olen, result, ISO885915Encode) - of CHARSET_ISO_8859_16: - stream.encodeSingleByte(iqoa, oq, olen, result, ISO885916Encode) - of CHARSET_KOI8_R: - stream.encodeSingleByte(iqoa, oq, olen, result, KOI8REncode) - of CHARSET_KOI8_U: - stream.encodeSingleByte(iqoa, oq, olen, result, KOI8UEncode) - of CHARSET_MACINTOSH: - stream.encodeSingleByte(iqoa, oq, olen, result, MacintoshEncode) - of CHARSET_WINDOWS_874: - stream.encodeSingleByte(iqoa, oq, olen, result, Windows874Encode) - of CHARSET_WINDOWS_1250: - stream.encodeSingleByte(iqoa, oq, olen, result, Windows1250Encode) - of CHARSET_WINDOWS_1251: - stream.encodeSingleByte(iqoa, oq, olen, result, Windows1251Encode) - of CHARSET_WINDOWS_1252: - stream.encodeSingleByte(iqoa, oq, olen, result, Windows1252Encode) - of CHARSET_WINDOWS_1253: - stream.encodeSingleByte(iqoa, oq, olen, result, Windows1253Encode) - of CHARSET_WINDOWS_1254: - stream.encodeSingleByte(iqoa, oq, olen, result, Windows1254Encode) - of CHARSET_WINDOWS_1255: - stream.encodeSingleByte(iqoa, oq, olen, result, Windows1255Encode) - of CHARSET_WINDOWS_1256: - stream.encodeSingleByte(iqoa, oq, olen, result, Windows1256Encode) - of CHARSET_WINDOWS_1257: - stream.encodeSingleByte(iqoa, oq, olen, result, Windows1257Encode) - of CHARSET_WINDOWS_1258: - stream.encodeSingleByte(iqoa, oq, olen, result, Windows1258Encode) - of CHARSET_X_MAC_CYRILLIC: - stream.encodeSingleByte(iqoa, oq, olen, result, XMacCyrillicEncode) - of CHARSET_GBK: - stream.encodeGb18030(iqoa, oq, olen, result, true) - of CHARSET_GB18030: - stream.encodeGb18030(iqoa, oq, olen, result) - of CHARSET_BIG5: - stream.encodeBig5(iqoa, oq, olen, result) - of CHARSET_EUC_JP: - stream.encodeEUCJP(iqoa, oq, olen, result) - of CHARSET_ISO_2022_JP: - stream.encodeISO2022JP(iqoa, oq, olen, result) - of CHARSET_SHIFT_JIS: - stream.encodeShiftJIS(iqoa, oq, olen, result) - of CHARSET_EUC_KR: - stream.encodeEUCKR(iqoa, oq, olen, result) - of CHARSET_X_USER_DEFINED: - stream.encodeXUserDefined(iqoa, oq, olen, result) - of CHARSET_UNKNOWN: - doAssert false, "Somebody forgot to set the character set here" - else: discard - stream.checkEnd(oq, olen, result) - -# Returns the number of bytes read. -proc readData*(stream: EncoderStream, buf: var seq[uint8]): int = - return stream.readData(addr buf[0], buf.len * sizeof(buf[0])) - -proc atEnd*(stream: EncoderStream): bool = - return stream.isend - -proc readAll*(stream: EncoderStream): string = - var buf = newString(4096) - while not stream.atEnd: - let olen = stream.readData(addr buf[0], buf.len) - if olen < buf.len: - buf.setLen(olen) - result &= buf - break - result &= buf - -proc newEncoderStream*(source: Stream, cs = CHARSET_UTF_8, buflen = 4096, - errormode: EncoderErrorMode = ENCODER_ERROR_MODE_HTML): EncoderStream = - result = EncoderStream( - source: source, - charset: cs, - buflen: buflen, - errormode: errormode - ) - doAssert cs notin {CHARSET_UTF_16_LE, CHARSET_UTF_16_BE, CHARSET_REPLACEMENT} - when nimvm: - result.bufs = @[newSeq[uint8](buflen)] - else: - result.bufs = @[newSeqUninitialized[uint8](buflen)] diff --git a/src/html/chadombuilder.nim b/src/html/chadombuilder.nim index 336543d1..c7a41619 100644 --- a/src/html/chadombuilder.nim +++ b/src/html/chadombuilder.nim @@ -2,13 +2,14 @@ import deques import options import streams -import data/charset import html/dom import html/htmlparser import html/tags import js/javascript import types/url +import chakasu/charset + # DOMBuilder implementation for Chawan. type diff --git a/src/html/dom.nim b/src/html/dom.nim index c0dc066c..00f2a549 100644 --- a/src/html/dom.nim +++ b/src/html/dom.nim @@ -10,8 +10,6 @@ import tables import css/cssparser import css/sheet import css/values -import data/charset -import encoding/decoderstream import html/event import html/tags import img/bitmap @@ -33,6 +31,9 @@ import types/vector import utils/mimeguess import utils/twtstr +import chakasu/charset +import chakasu/decoderstream + type FormMethod* = enum FORM_METHOD_GET, FORM_METHOD_POST, FORM_METHOD_DIALOG diff --git a/src/html/htmlparser.nim b/src/html/htmlparser.nim index 2192ccf8..60a9aad9 100644 --- a/src/html/htmlparser.nim +++ b/src/html/htmlparser.nim @@ -6,13 +6,14 @@ import strutils import tables import unicode -import data/charset -import encoding/decoderstream import html/htmltokenizer import html/parseerror import html/tags import utils/twtstr +import chakasu/charset +import chakasu/decoderstream + # Generics break without exporting macros. Maybe a compiler bug? export macros diff --git a/src/html/htmltokenizer.nim b/src/html/htmltokenizer.nim index d38d427c..f487f31f 100644 --- a/src/html/htmltokenizer.nim +++ b/src/html/htmltokenizer.nim @@ -5,7 +5,6 @@ import macros import tables import unicode -import encoding/decoderstream import html/entity import html/parseerror import html/tags @@ -13,6 +12,8 @@ import utils/opt import utils/radixtree import utils/twtstr +import chakasu/decoderstream + # Tokenizer type Tokenizer* = object diff --git a/src/io/lineedit.nim b/src/io/lineedit.nim index ac10df06..1c1d273c 100644 --- a/src/io/lineedit.nim +++ b/src/io/lineedit.nim @@ -5,15 +5,16 @@ import unicode import bindings/quickjs import buffer/cell -import data/charset import display/term -import encoding/decoderstream -import encoding/encoderstream import js/javascript import types/color import utils/opt import utils/twtstr +import chakasu/charset +import chakasu/decoderstream +import chakasu/encoderstream + type LineEditState* = enum EDIT, FINISH, CANCEL diff --git a/src/io/loader.nim b/src/io/loader.nim index 8e125b31..65ce091f 100644 --- a/src/io/loader.nim +++ b/src/io/loader.nim @@ -20,7 +20,6 @@ import strutils import tables import bindings/curl -import data/charset import io/about import io/connecterror import io/file @@ -42,6 +41,8 @@ import types/url import utils/mimeguess import utils/twtstr +import chakasu/charset + export request export response diff --git a/src/io/response.nim b/src/io/response.nim index dedddbcd..38f6b397 100644 --- a/src/io/response.nim +++ b/src/io/response.nim @@ -1,13 +1,14 @@ import streams import bindings/quickjs -import data/charset import io/headers import io/promise import io/request import js/javascript import types/url +import chakasu/charset + type Response* = ref object res*: int diff --git a/src/main.nim b/src/main.nim index 73ea1eed..be636dcb 100644 --- a/src/main.nim +++ b/src/main.nim @@ -1,3 +1,5 @@ +import version + import ips/forkserver let forks = newForkServer() @@ -10,11 +12,12 @@ when defined(profile): import nimprof import config/config -import data/charset import display/client import utils/opt import utils/twtstr +import chakasu/charset + let conf = readConfig() set_cjk_ambiguous(conf.display.double_width_ambiguous) let params = commandLineParams() diff --git a/src/render/rendertext.nim b/src/render/rendertext.nim index b0538cd8..8d13dcf2 100644 --- a/src/render/rendertext.nim +++ b/src/render/rendertext.nim @@ -2,10 +2,11 @@ import streams import unicode import buffer/cell -import data/charset -import encoding/decoderstream import utils/twtstr +import chakasu/charset +import chakasu/decoderstream + type StreamRenderer* = object ansiparser: AnsiCodeParser format: Format diff --git a/src/types/buffersource.nim b/src/types/buffersource.nim index 9235e07f..a5e3e21f 100644 --- a/src/types/buffersource.nim +++ b/src/types/buffersource.nim @@ -3,10 +3,11 @@ import options when defined(posix): import posix -import data/charset import io/request import types/url +import chakasu/charset + type BufferSourceType* = enum CLONE, LOAD_REQUEST, LOAD_PIPE diff --git a/src/version.nim b/src/version.nim new file mode 100644 index 00000000..407b974a --- /dev/null +++ b/src/version.nim @@ -0,0 +1,29 @@ +{.used.} + +import macros + +template imp(x: untyped) = import x + +macro tryImport(x: untyped, name: static string) = + let vs = ident(name & "Version") + quote do: + when not compiles(imp `x`): + static: + error("Cannot find submodule " & `name` & ".\n" & + "Please run `make submodule` to fetch the required submodules.") + import `x` as `vs` + +macro checkVersion(xs: static string, major, minor, patch: int) = + let x = ident(xs & "Version") + quote do: + when `x`.Major < `major` or `x`.Minor < `minor` or `x`.Patch < `patch`: + var es = $`major` & "." & $`minor` & "." & $`patch` + var gs = $`x`.Major & "." & $`x`.Minor & "." & $`x`.Patch + error("Version of " & `xs` & " too low (expected " & es & ", got " & + gs & ").\n" & + "Please run `make submodule` to update.") + +tryImport chakasu/version, "chakasu" + +static: + checkVersion("chakasu", 0, 1, 2) |