import algorithm
import os
import strutils
import tables

import utils/twtstr

# Character sets known to this module. Every member except CHARSET_UNKNOWN
# carries a string value, which is what `$` yields for it.
type Charset* = enum
  CHARSET_UNKNOWN
  CHARSET_UTF_8 = "UTF-8"
  CHARSET_IBM866 = "IBM866"
  CHARSET_ISO_8859_2 = "ISO-8859-2"
  CHARSET_ISO_8859_3 = "ISO-8859-3"
  CHARSET_ISO_8859_4 = "ISO-8859-4"
  CHARSET_ISO_8859_5 = "ISO-8859-5"
  CHARSET_ISO_8859_6 = "ISO-8859-6"
  CHARSET_ISO_8859_7 = "ISO-8859-7"
  CHARSET_ISO_8859_8 = "ISO-8859-8"
  CHARSET_ISO_8859_8_I = "ISO-8859-8-I"
  CHARSET_ISO_8859_10 = "ISO-8859-10"
  CHARSET_ISO_8859_13 = "ISO-8859-13"
  CHARSET_ISO_8859_14 = "ISO-8859-14"
  CHARSET_ISO_8859_15 = "ISO-8859-15"
  CHARSET_ISO_8859_16 = "ISO-8859-16"
  CHARSET_KOI8_R = "KOI8-R"
  CHARSET_KOI8_U = "KOI8-U"
  CHARSET_MACINTOSH = "macintosh"
  CHARSET_WINDOWS_874 = "windows-874"
  CHARSET_WINDOWS_1250 = "windows-1250"
  CHARSET_WINDOWS_1251 = "windows-1251"
  CHARSET_WINDOWS_1252 = "windows-1252"
  CHARSET_WINDOWS_1253 = "windows-1253"
  CHARSET_WINDOWS_1254 = "windows-1254"
  CHARSET_WINDOWS_1255 = "windows-1255"
  CHARSET_WINDOWS_1256 = "windows-1256"
  CHARSET_WINDOWS_1257 = "windows-1257"
  CHARSET_WINDOWS_1258 = "windows-1258"
  CHARSET_X_MAC_CYRILLIC = "x-mac-cyrillic"
  CHARSET_GBK = "GBK"
  CHARSET_GB18030 = "gb18030"
  CHARSET_BIG5 = "Big5"
  CHARSET_EUC_JP = "EUC-JP"
  CHARSET_ISO_2022_JP = "ISO-2022-JP"
  CHARSET_SHIFT_JIS = "Shift_JIS"
  CHARSET_EUC_KR = "EUC-KR"
  CHARSET_REPLACEMENT = "replacement"
  CHARSET_UTF_16_BE = "UTF-16BE"
  CHARSET_UTF_16_LE = "UTF-16LE"
  CHARSET_X_USER_DEFINED = "x-user-defined"

# Lower-case charset label -> Charset. Keys are matched exactly, so callers
# must lower-case (and trim) their input first; see getCharset.
# Enum members are written exactly as declared above rather than relying on
# Nim's style-insensitive identifier matching.
const CharsetMap = {
  # UTF-8 (The Encoding)
  "unicode-1-1-utf-8": CHARSET_UTF_8,
  "unicode11utf-8": CHARSET_UTF_8,
  "unicode20utf-8": CHARSET_UTF_8,
  "utf-8": CHARSET_UTF_8,
  "utf8": CHARSET_UTF_8,
  "x-unicode20utf8": CHARSET_UTF_8,
  # IBM866
  "866": CHARSET_IBM866,
  "cp866": CHARSET_IBM866,
  "csibm866": CHARSET_IBM866,
  "ibm866": CHARSET_IBM866,
  # ISO-8859-2
  "csisolatin2": CHARSET_ISO_8859_2,
  "iso-8859-2": CHARSET_ISO_8859_2,
  "iso-ir-101": CHARSET_ISO_8859_2,
  "iso8859-2": CHARSET_ISO_8859_2,
  "iso88592": CHARSET_ISO_8859_2,
  "iso_8859-2": CHARSET_ISO_8859_2,
  "iso_8859-2:1987": CHARSET_ISO_8859_2,
  "l2": CHARSET_ISO_8859_2,
  "latin2": CHARSET_ISO_8859_2,
  # ISO-8859-3
  "csisolatin3": CHARSET_ISO_8859_3,
  "iso-8859-3": CHARSET_ISO_8859_3,
  "iso-ir-109": CHARSET_ISO_8859_3,
  "iso8859-3": CHARSET_ISO_8859_3,
  "iso88593": CHARSET_ISO_8859_3,
  "iso_8859-3": CHARSET_ISO_8859_3,
  "iso_8859-3:1988": CHARSET_ISO_8859_3,
  "l3": CHARSET_ISO_8859_3,
  "latin3": CHARSET_ISO_8859_3,
  # ISO-8859-4
  "csisolatin4": CHARSET_ISO_8859_4,
  "iso-8859-4": CHARSET_ISO_8859_4,
  "iso-ir-110": CHARSET_ISO_8859_4,
  "iso8859-4": CHARSET_ISO_8859_4,
  "iso88594": CHARSET_ISO_8859_4,
  "iso_8859-4": CHARSET_ISO_8859_4,
  "iso_8859-4:1988": CHARSET_ISO_8859_4,
  "l4": CHARSET_ISO_8859_4,
  "latin4": CHARSET_ISO_8859_4,
  # ISO-8859-5
  "csisolatincyrillic": CHARSET_ISO_8859_5,
  "cyrillic": CHARSET_ISO_8859_5,
  "iso-8859-5": CHARSET_ISO_8859_5,
  "iso-ir-144": CHARSET_ISO_8859_5,
  "iso8859-5": CHARSET_ISO_8859_5,
  "iso88595": CHARSET_ISO_8859_5,
  "iso_8859-5": CHARSET_ISO_8859_5,
  "iso_8859-5:1988": CHARSET_ISO_8859_5,
  # ISO-8859-6
  "arabic": CHARSET_ISO_8859_6,
  "asmo-708": CHARSET_ISO_8859_6,
  "csiso88596e": CHARSET_ISO_8859_6,
  "csiso88596i": CHARSET_ISO_8859_6,
  "csisolatinarabic": CHARSET_ISO_8859_6,
  "ecma-114": CHARSET_ISO_8859_6,
  "iso-8859-6": CHARSET_ISO_8859_6,
  "iso-8859-6-e": CHARSET_ISO_8859_6,
  "iso-8859-6-i": CHARSET_ISO_8859_6,
  "iso-ir-127": CHARSET_ISO_8859_6,
  "iso8859-6": CHARSET_ISO_8859_6,
  "iso88596": CHARSET_ISO_8859_6,
  "iso_8859-6": CHARSET_ISO_8859_6,
  "iso_8859-6:1987": CHARSET_ISO_8859_6,
  # ISO-8859-7
  "csisolatingreek": CHARSET_ISO_8859_7,
  "ecma-118": CHARSET_ISO_8859_7,
  "elot_928": CHARSET_ISO_8859_7,
  "greek": CHARSET_ISO_8859_7,
  "greek8": CHARSET_ISO_8859_7,
  "iso-8859-7": CHARSET_ISO_8859_7,
  "iso-ir-126": CHARSET_ISO_8859_7,
  "iso8859-7": CHARSET_ISO_8859_7,
  "iso88597": CHARSET_ISO_8859_7,
  "iso_8859-7": CHARSET_ISO_8859_7,
  "iso_8859-7:1987": CHARSET_ISO_8859_7,
  "sun_eu_greek": CHARSET_ISO_8859_7,
  # ISO-8859-8
  "csiso88598e": CHARSET_ISO_8859_8,
  "csisolatinhebrew": CHARSET_ISO_8859_8,
  "hebrew": CHARSET_ISO_8859_8,
  "iso-8859-8": CHARSET_ISO_8859_8,
  "iso-8859-8-e": CHARSET_ISO_8859_8,
  "iso-ir-138": CHARSET_ISO_8859_8,
  "iso8859-8": CHARSET_ISO_8859_8,
  "iso88598": CHARSET_ISO_8859_8,
  "iso_8859-8": CHARSET_ISO_8859_8,
  "iso_8859-8:1988": CHARSET_ISO_8859_8,
  "visual": CHARSET_ISO_8859_8,
  # ISO-8859-8-I
  "csiso88598i": CHARSET_ISO_8859_8_I,
  "iso-8859-8-i": CHARSET_ISO_8859_8_I,
  "logical": CHARSET_ISO_8859_8_I,
  # ISO-8859-10
  "csisolatin6": CHARSET_ISO_8859_10,
  "iso-8859-10": CHARSET_ISO_8859_10,
  "iso-ir-157": CHARSET_ISO_8859_10,
  "iso8859-10": CHARSET_ISO_8859_10,
  "iso885910": CHARSET_ISO_8859_10,
  "l6": CHARSET_ISO_8859_10,
  "latin6": CHARSET_ISO_8859_10,
  # ISO-8859-13
  "iso-8859-13": CHARSET_ISO_8859_13,
  "iso8859-13": CHARSET_ISO_8859_13,
  "iso885913": CHARSET_ISO_8859_13,
  # ISO-8859-14
  "iso-8859-14": CHARSET_ISO_8859_14,
  "iso8859-14": CHARSET_ISO_8859_14,
  "iso885914": CHARSET_ISO_8859_14,
  # ISO-8859-15
  "csisolatin9": CHARSET_ISO_8859_15,
  "iso-8859-15": CHARSET_ISO_8859_15,
  "iso8859-15": CHARSET_ISO_8859_15,
  "iso885915": CHARSET_ISO_8859_15,
  "iso_8859-15": CHARSET_ISO_8859_15,
  "l9": CHARSET_ISO_8859_15,
  # ISO-8859-16
  "iso-8859-16": CHARSET_ISO_8859_16,
  # KOI8-R
  "cskoi8r": CHARSET_KOI8_R,
  "koi": CHARSET_KOI8_R,
  "koi8": CHARSET_KOI8_R,
  "koi8-r": CHARSET_KOI8_R,
  "koi8_r": CHARSET_KOI8_R,
  # KOI8-U
  "koi8-ru": CHARSET_KOI8_U,
  "koi8-u": CHARSET_KOI8_U,
  # macintosh
  "csmacintosh": CHARSET_MACINTOSH,
  "mac": CHARSET_MACINTOSH,
  "macintosh": CHARSET_MACINTOSH,
  "x-mac-roman": CHARSET_MACINTOSH,
  # windows-874
  "dos-874": CHARSET_WINDOWS_874,
  "iso-8859-11": CHARSET_WINDOWS_874,
  "iso8859-11": CHARSET_WINDOWS_874,
  "iso885911": CHARSET_WINDOWS_874,
  "tis-620": CHARSET_WINDOWS_874,
  "windows-874": CHARSET_WINDOWS_874,
  # windows-1250
  "cp1250": CHARSET_WINDOWS_1250,
  "windows-1250": CHARSET_WINDOWS_1250,
  "x-cp1250": CHARSET_WINDOWS_1250,
  # windows-1251
  "cp1251": CHARSET_WINDOWS_1251,
  "windows-1251": CHARSET_WINDOWS_1251,
  "x-cp1251": CHARSET_WINDOWS_1251,
  # windows-1252
  "ansi_x3.4-1968": CHARSET_WINDOWS_1252,
  "ascii": CHARSET_WINDOWS_1252, # lol
  "cp1252": CHARSET_WINDOWS_1252,
  "cp819": CHARSET_WINDOWS_1252,
  "csisolatin1": CHARSET_WINDOWS_1252,
  "ibm819": CHARSET_WINDOWS_1252,
  "iso-8859-1": CHARSET_WINDOWS_1252,
  "iso88591": CHARSET_WINDOWS_1252,
  "iso_8859-1:1987": CHARSET_WINDOWS_1252,
  "l1": CHARSET_WINDOWS_1252,
  "latin1": CHARSET_WINDOWS_1252,
  "us-ascii": CHARSET_WINDOWS_1252,
  "windows-1252": CHARSET_WINDOWS_1252,
  "x-cp1252": CHARSET_WINDOWS_1252,
  # windows-1253
  "cp1253": CHARSET_WINDOWS_1253,
  "windows-1253": CHARSET_WINDOWS_1253,
  "x-cp1253": CHARSET_WINDOWS_1253,
  # windows-1254
  "cp1254": CHARSET_WINDOWS_1254,
  "csisolatin5": CHARSET_WINDOWS_1254,
  "iso-8859-9": CHARSET_WINDOWS_1254,
  "iso-ir-148": CHARSET_WINDOWS_1254,
  "iso8859-9": CHARSET_WINDOWS_1254,
  "iso88599": CHARSET_WINDOWS_1254,
  "iso_8859-9": CHARSET_WINDOWS_1254,
  "iso_8859-9:1989": CHARSET_WINDOWS_1254,
  "l5": CHARSET_WINDOWS_1254,
  "latin5": CHARSET_WINDOWS_1254,
  "windows-1254": CHARSET_WINDOWS_1254,
  "x-cp1254": CHARSET_WINDOWS_1254,
  # windows-1255
  "cp1255": CHARSET_WINDOWS_1255,
  "windows-1255": CHARSET_WINDOWS_1255,
  "x-cp1255": CHARSET_WINDOWS_1255,
  # windows-1256
  "cp1256": CHARSET_WINDOWS_1256,
  "windows-1256": CHARSET_WINDOWS_1256,
  "x-cp1256": CHARSET_WINDOWS_1256,
  # windows-1257
  "cp1257": CHARSET_WINDOWS_1257,
  "windows-1257": CHARSET_WINDOWS_1257,
  "x-cp1257": CHARSET_WINDOWS_1257,
  # windows-1258
  "cp1258": CHARSET_WINDOWS_1258,
  "windows-1258": CHARSET_WINDOWS_1258,
  "x-cp1258": CHARSET_WINDOWS_1258,
  # x-mac-cyrillic
  "x-mac-cyrillic": CHARSET_X_MAC_CYRILLIC,
  "x-mac-ukrainian": CHARSET_X_MAC_CYRILLIC,
  # GBK
  "chinese": CHARSET_GBK,
  "csgb2312": CHARSET_GBK,
  "csiso58gb231280": CHARSET_GBK,
  "gb2312": CHARSET_GBK,
  "gb_2312": CHARSET_GBK,
  "gb_2312-80": CHARSET_GBK,
  "gbk": CHARSET_GBK,
  "iso-ir-58": CHARSET_GBK,
  "x-gbk": CHARSET_GBK,
  # gb18030
  "gb18030": CHARSET_GB18030,
  # Big5
  "big5": CHARSET_BIG5,
  "big5-hkscs": CHARSET_BIG5,
  "cn-big5": CHARSET_BIG5,
  "csbig5": CHARSET_BIG5,
  "x-x-big5": CHARSET_BIG5,
  # EUC-JP
  "cseucpkdfmtjapanese": CHARSET_EUC_JP,
  "euc-jp": CHARSET_EUC_JP,
  "x-euc-jp": CHARSET_EUC_JP,
  # ISO-2022-JP (ugh)
  "csiso2022jp": CHARSET_ISO_2022_JP,
  "iso-2022-jp": CHARSET_ISO_2022_JP,
  # Shift_JIS
  "csshiftjis": CHARSET_SHIFT_JIS,
  "ms932": CHARSET_SHIFT_JIS,
  "ms_kanji": CHARSET_SHIFT_JIS,
  "shift-jis": CHARSET_SHIFT_JIS,
  "shift_jis": CHARSET_SHIFT_JIS,
  "sjis": CHARSET_SHIFT_JIS,
  "windows-31j": CHARSET_SHIFT_JIS,
  "x-sjis": CHARSET_SHIFT_JIS,
  # EUC-KR
  "cseuckr": CHARSET_EUC_KR,
  "csksc56011987": CHARSET_EUC_KR,
  "euc-kr": CHARSET_EUC_KR,
  "iso-ir-149": CHARSET_EUC_KR,
  "korean": CHARSET_EUC_KR,
  "ks_c_5601-1987": CHARSET_EUC_KR,
  "ks_c_5601-1989": CHARSET_EUC_KR,
  "ksc5601": CHARSET_EUC_KR,
  "ksc_5601": CHARSET_EUC_KR,
  "windows-949": CHARSET_EUC_KR,
  # replacement
  "csiso2022kr": CHARSET_REPLACEMENT,
  "hz-gb-2312": CHARSET_REPLACEMENT,
  "iso-2022-cn": CHARSET_REPLACEMENT,
  "iso-2022-cn-ext": CHARSET_REPLACEMENT,
  "iso-2022-kr": CHARSET_REPLACEMENT,
  "replacement": CHARSET_REPLACEMENT,
  # UTF-16BE
  "unicodefffe": CHARSET_UTF_16_BE,
  "utf-16be": CHARSET_UTF_16_BE,
  # UTF-16LE
  "csunicode": CHARSET_UTF_16_LE,
  "iso-10646-ucs-2": CHARSET_UTF_16_LE,
  "ucs-2": CHARSET_UTF_16_LE,
  "unicode": CHARSET_UTF_16_LE,
  "unicodefeff": CHARSET_UTF_16_LE,
  "utf-16": CHARSET_UTF_16_LE,
  "utf-16le": CHARSET_UTF_16_LE,
  # x-user-defined
  "x-user-defined": CHARSET_X_USER_DEFINED
}.toTable()

func normalizeLocale(s: string): string =
  ## Returns `s` lower-cased (ASCII only) with every character at or below
  ## 0x20, every '_' and every '-' removed.
  for c in s:
    if c > ' ' and c notin {'_', '-'}:
      result &= c.toLowerAscii()

# CharsetMap re-keyed through normalizeLocale, built at compile time.
const NormalizedCharsetMap = (func(): Table[string, Charset] =
  for k, v in CharsetMap:
    result[k.normalizeLocale()] = v)()

# Charset returned by getLocaleCharset when the locale string names none.
const DefaultCharset* = CHARSET_UTF_8

proc getCharset*(s: string): Charset =
  ## Looks up the charset label `s` after stripping surrounding whitespace
  ## and lower-casing it. Returns CHARSET_UNKNOWN for unrecognised labels.
  return CharsetMap.getOrDefault(s.strip().toLower(), CHARSET_UNKNOWN)

proc getLocaleCharset*(s: string): Charset =
  ## Derives a charset from a locale string. The text following '.' in `s`
  ## (as returned by `after`) is normalised with normalizeLocale and looked
  ## up; an unrecognised value yields CHARSET_UNKNOWN. When that text is
  ## empty, DefaultCharset is returned.
  let ss = s.after('.')
  if ss != "":
    return NormalizedCharsetMap.getOrDefault(ss.normalizeLocale(),
      CHARSET_UNKNOWN)
  # We could try to guess the charset based on the language here, like w3m
  # does.
  # However, these days it is more likely for any system to be using UTF-8
  # than any other charset, irrespective of the language. So we just assume
  # UTF-8.
  return DefaultCharset

iterator mappairs(path: string): tuple[a, b: int] =
  ## Yields one (index, value) pair per data line of the file at `path`,
  ## which is read with staticRead.
  ##
  ## Lines that are empty or start with '#' are skipped. A data line is:
  ## optional leading spaces, a decimal index, one separator character, then
  ## a hexadecimal value (parsed by parseHexInt). Lines consisting only of
  ## spaces are skipped as well.
  let s = staticRead(path)
  for line in s.split('\n'):
    if line.len == 0 or line[0] == '#':
      continue
    var i = 0
    # Bounds-checked so a line made only of spaces cannot index past its end.
    while i < line.len and line[i] == ' ':
      inc i
    if i == line.len:
      continue # nothing but padding on this line
    var j = i
    while i < line.len and line[i] in {'0'..'9'}:
      inc i
    let index = parseInt(line.substr(j, i - 1))
    inc i # tab
    j = i
    while i < line.len and line[i] in {'0'..'9', 'A'..'F', 'x'}:
      inc i
    let n = parseHexInt(line.substr(j, i - 1))
    yield (index, n)

# I'm pretty sure single-byte encodings map to ucs-2.
func loadCharsetMap8(path: string): tuple[
      decode: array[char, uint16],
      encode: seq[tuple[ucs: uint16, val: char]]
    ] =
  ## Loads "res/map"/`path` into a pair of tables:
  ## * `decode` is indexed by char(index) and holds the 16-bit value read for
  ##   that index (0 where the file has no entry);
  ## * `encode` holds (value, char(index)) for every non-zero `decode` slot
  ##   up to the highest index seen, sorted ascending.
  var m: int # highest index encountered in the file
  for index, n in mappairs("res/map" / path):
    result.decode[char(index)] = uint16(n)
    if index > m: m = index
  for index in low(char) .. char(m):
    let val = result.decode[index]
    if val != 0u16:
      result.encode.add((val, index))
  result.encode.sort()

func loadCharsetMap8Encode(path: string): seq[tuple[ucs: uint16, val: char]] =
  ## Loads "res/map"/`path` as a sorted list of (value, char(index)) pairs,
  ## without building a decode table.
  for index, n in mappairs("res/map" / path):
    result.add((uint16(n), char(index)))
  result.sort()

func loadGb18030Ranges(path: string): tuple[
      decode: seq[tuple[p: uint16, ucs: uint16]],
      encode: seq[tuple[ucs: uint16, p: uint16]]
    ] =
  ## Loads "res/map"/`path` as two parallel lists: `decode` keeps the file's
  ## order as (index, value), `encode` holds (value, index) sorted ascending.
  ## Reading stops at the first index that does not fit in a uint16.
  for index, n in mappairs("res/map" / path):
    if uint32(index) > uint32(high(uint16)):
      break
    result.decode.add((uint16(index), uint16(n)))
    result.encode.add((uint16(n), uint16(index)))
  result.encode.sort()

# (value, index) pair as stored in the 16-bit encode tables.
type UCS16x16* = tuple[ucs, p: uint16]

func loadCharsetMap16(path: string, len: static uint16): tuple[
      decode: array[len, uint16],
      encode: seq[UCS16x16]
    ] =
  ## Loads "res/map"/`path` into a `decode` array of `len` slots indexed by
  ## the file's index, and an `encode` list of (value, index) sorted
  ## ascending. Values are truncated to 16 bits by the uint16 conversion.
  for index, n in mappairs("res/map" / path):
    result.decode[uint16(index)] = uint16(n)
    result.encode.add((uint16(n), uint16(index)))
  result.encode.sort()

# (value, index) pair as stored in the Big5 encode table; values are 32-bit.
type UCS32x16* = tuple[ucs: uint32, p: uint16]

func loadBig5Map(path: string, offset: static uint16): tuple[
      decode: array[19782u16 - offset, uint32], # ouch (+75KB...)
      encode: seq[UCS32x16]
    ] =
  ## Loads "res/map"/`path` into a `decode` array whose slot for a given
  ## index is `index - offset`, and an `encode` list of (value, index) sorted
  ## ascending. Unlike the 16-bit loaders, values are kept as uint32.
  for index, n in mappairs("res/map" / path):
    result.decode[uint16(index) - offset] = uint32(n)
    result.encode.add((uint32(n), uint16(index)))
  #for x in result.decode: assert x != 0 # fail
  result.encode.sort()

# Tables for the single-byte encodings, built at compile time.
const (IBM866Decode*, IBM866Encode*) = loadCharsetMap8("index-ibm866.txt")
const (ISO88592Decode*, ISO88592Encode*) = loadCharsetMap8("index-iso-8859-2.txt")
const (ISO88593Decode*, ISO88593Encode*) = loadCharsetMap8("index-iso-8859-3.txt")
const (ISO88594Decode*, ISO88594Encode*) = loadCharsetMap8("index-iso-8859-4.txt")
const (ISO88595Decode*, ISO88595Encode*) = loadCharsetMap8("index-iso-8859-5.txt")
const (ISO88596Decode*, ISO88596Encode*) = loadCharsetMap8("index-iso-8859-6.txt")
const (ISO88597Decode*, ISO88597Encode*) = loadCharsetMap8("index-iso-8859-7.txt")
const (ISO88598Decode*, ISO88598Encode*) = loadCharsetMap8("index-iso-8859-8.txt")
const (ISO885910Decode*, ISO885910Encode*) = loadCharsetMap8("index-iso-8859-10.txt")
const (ISO885913Decode*, ISO885913Encode*) = loadCharsetMap8("index-iso-8859-13.txt")
const (ISO885914Decode*, ISO885914Encode*) = loadCharsetMap8("index-iso-8859-14.txt")
const (ISO885915Decode*, ISO885915Encode*) = loadCharsetMap8("index-iso-8859-15.txt")
const (ISO885916Decode*, ISO885916Encode*) = loadCharsetMap8("index-iso-8859-16.txt")
const (KOI8RDecode*, KOI8REncode*) = loadCharsetMap8("index-koi8-r.txt")
const (KOI8UDecode*, KOI8UEncode*) = loadCharsetMap8("index-koi8-u.txt")
const (MacintoshDecode*, MacintoshEncode*) = loadCharsetMap8("index-macintosh.txt")
const (Windows874Decode*, Windows874Encode*) = loadCharsetMap8("index-windows-874.txt")
const (Windows1250Decode*, Windows1250Encode*) = loadCharsetMap8("index-windows-1250.txt")
const (Windows1251Decode*, Windows1251Encode*) = loadCharsetMap8("index-windows-1251.txt")
const (Windows1252Decode*, Windows1252Encode*) = loadCharsetMap8("index-windows-1252.txt")
const (Windows1253Decode*, Windows1253Encode*) = loadCharsetMap8("index-windows-1253.txt")
# Remaining compile-time tables. Each (Decode, Encode) pair is unpacked from
# the tuple returned by its loader; declaration order is kept because
# Big5DecodeOffset must precede its use.
const
  (Windows1254Decode*, Windows1254Encode*) =
    loadCharsetMap8("index-windows-1254.txt")
  (Windows1255Decode*, Windows1255Encode*) =
    loadCharsetMap8("index-windows-1255.txt")
  (Windows1256Decode*, Windows1256Encode*) =
    loadCharsetMap8("index-windows-1256.txt")
  (Windows1257Decode*, Windows1257Encode*) =
    loadCharsetMap8("index-windows-1257.txt")
  (Windows1258Decode*, Windows1258Encode*) =
    loadCharsetMap8("index-windows-1258.txt")
  (XMacCyrillicDecode*, XMacCyrillicEncode*) =
    loadCharsetMap8("index-x-mac-cyrillic.txt")
  (Gb18030RangesDecode*, Gb18030RangesEncode*) =
    loadGb18030Ranges("index-gb18030-ranges.txt")
  (Gb18030Decode*, Gb18030Encode*) =
    loadCharsetMap16("index-gb18030.txt", len = 23940)
  #for x in Gb18030Decode: assert x != 0 # success
  # Index subtracted from every Big5 index before it is used as a slot in
  # Big5Decode (see the `offset` argument below).
  Big5DecodeOffset* = 942
  (Big5Decode*, Big5Encode*) =
    loadBig5Map("index-big5.txt", offset = Big5DecodeOffset)
  (Jis0208Decode*, Jis0208Encode*) =
    loadCharsetMap16("index-jis0208.txt", len = 11104)
  (Jis0212Decode*, Jis0212Encode*) =
    loadCharsetMap16("index-jis0212.txt", len = 7211)
  # Encode-only table: no decode array is built for this index.
  ISO2022JPKatakanaEncode* =
    loadCharsetMap8Encode("index-iso-2022-jp-katakana.txt")
  (EUCKRDecode*, EUCKREncode*) =
    loadCharsetMap16("index-euc-kr.txt", len = 23750)