| author | bptato <nincsnevem662@gmail.com> | 2023-05-19 01:50:17 +0200 |
|---|---|---|
| committer | bptato <nincsnevem662@gmail.com> | 2023-05-19 01:50:17 +0200 |
| commit | 26e8968a6499742cf37e00292a7d1c8ed620cad5 (patch) | |
| tree | a3922f02f09b5c025dddcfe0e7a3a719c47ba4da | /src/data |
| parent | dac6a09c14b258ed725dcb265305a6445edc02ad (diff) | |
| download | chawan-26e8968a6499742cf37e00292a7d1c8ed620cad5.tar.gz | |
Add display/output encoding
Some encodings are still missing
Diffstat (limited to 'src/data')
| -rw-r--r-- | src/data/charset.nim | 39 |
|---|---|---|

1 file changed, 31 insertions, 8 deletions
diff --git a/src/data/charset.nim b/src/data/charset.nim
index f93a82b3..45d7786c 100644
--- a/src/data/charset.nim
+++ b/src/data/charset.nim
@@ -3,6 +3,8 @@ import os
 import strutils
 import tables

+import utils/twtstr
+
 type Charset* = enum
   CHARSET_UNKNOWN
   CHARSET_UTF_8 = "UTF-8"
@@ -314,9 +316,32 @@ const CharsetMap = {
   "x-user-defined": CHARSET_X_USER_DEFINED
 }.toTable()

+func normalizeLocale(s: string): string =
+  for i in 0 ..< s.len:
+    if cast[uint8](s[i]) > 0x20 and s[i] != '_' and s[i] != '-':
+      result &= s[i].toLowerAscii()
+
+const NormalizedCharsetMap = (func(): Table[string, Charset] =
+  for k, v in CharsetMap:
+    result[k.normalizeLocale()] = v)()
+
+const DefaultCharset* = CHARSET_UTF_8
+
 proc getCharset*(s: string): Charset =
   return CharsetMap.getOrDefault(s.strip().toLower(), CHARSET_UNKNOWN)

+proc getLocaleCharset*(s: string): Charset =
+  let ss = s.after('.')
+  if ss != "":
+    return NormalizedCharsetMap.getOrDefault(ss.normalizeLocale(),
+      CHARSET_UNKNOWN)
+  # We could try to guess the charset based on the language here, like w3m
+  # does.
+  # However, these days it is more likely for any system to be using UTF-8
+  # than any other charset, irrespective of the language. So we just assume
+  # UTF-8.
+  return DefaultCharset
+
 iterator mappairs(path: string): tuple[a, b: int] =
   let s = staticRead(path)
   for line in s.split('\n'):
@@ -372,23 +397,21 @@ func loadGb18030Ranges(path: string): tuple[
     result.encode.add((uint16(n), uint16(index)))
   result.encode.sort()

+type UCS16x16* = tuple[ucs, p: uint16]
+
 func loadCharsetMap16(path: string, len: static uint16): tuple[
     decode: array[len, uint16],
-    encode: seq[
-      tuple[
-        ucs: uint16,
-        p: uint16 ]]] =
+    encode: seq[UCS16x16]] =
   for index, n in mappairs("res/map" / path):
     result.decode[uint16(index)] = uint16(n)
     result.encode.add((uint16(n), uint16(index)))
   result.encode.sort()

+type UCS32x16* = tuple[ucs: uint32, p: uint16]
+
 func loadBig5Map(path: string, offset: static uint16): tuple[
     decode: array[19782u16 - offset, uint32], # ouch (+75KB...)
-    encode: seq[
-      tuple[
-        ucs: uint32,
-        p: uint16 ]]] =
+    encode: seq[UCS32x16]] =
   for index, n in mappairs("res/map" / path):
     result.decode[uint16(index) - offset] = uint32(n)
     result.encode.add((uint32(n), uint16(index)))
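For context, the following is a minimal sketch of how the new `getLocaleCharset()`/`DefaultCharset` pair might be used to pick a display/output encoding from the environment. It is not part of this commit: the `guessDisplayCharset` name, the module import path, and the environment-variable priority (the usual POSIX `LC_ALL` > `LC_CTYPE` > `LANG` convention) are all assumptions made for the example; only `Charset`, `CHARSET_UNKNOWN`, `DefaultCharset` and `getLocaleCharset` come from the patched module.

```nim
# Hypothetical usage sketch (not part of this commit).
import std/os

import data/charset  # assumed import path for the module patched above

proc guessDisplayCharset(): Charset =
  # Check the locale variables in the usual POSIX priority order.
  for name in ["LC_ALL", "LC_CTYPE", "LANG"]:
    let v = getEnv(name)
    if v != "":
      # getLocaleCharset() looks at the part after '.', e.g. the "UTF-8"
      # in "en_US.UTF-8", normalizes it, and looks it up in the charset map.
      let cs = getLocaleCharset(v)
      if cs != CHARSET_UNKNOWN:
        return cs
  # Nothing usable found: fall back to the commit's default (UTF-8).
  return DefaultCharset

when isMainModule:
  echo guessDisplayCharset()  # prints e.g. "UTF-8" on an en_US.UTF-8 system
```

The fallback mirrors the reasoning in the patch itself: rather than guessing a charset from the language part of the locale (as w3m does), an unrecognized or missing locale simply yields `DefaultCharset`.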