about summary refs log tree commit diff stats
path: root/src/data
diff options
context:
space:
mode:
author: bptato <nincsnevem662@gmail.com> 2023-05-19 01:50:17 +0200
committer: bptato <nincsnevem662@gmail.com> 2023-05-19 01:50:17 +0200
commit: 26e8968a6499742cf37e00292a7d1c8ed620cad5 (patch)
tree: a3922f02f09b5c025dddcfe0e7a3a719c47ba4da /src/data
parent: dac6a09c14b258ed725dcb265305a6445edc02ad (diff)
download: chawan-26e8968a6499742cf37e00292a7d1c8ed620cad5.tar.gz
Add display/output encoding
Some encodings are still missing
Diffstat (limited to 'src/data')
-rw-r--r-- src/data/charset.nim 39
1 files changed, 31 insertions, 8 deletions
diff --git a/src/data/charset.nim b/src/data/charset.nim
index f93a82b3..45d7786c 100644
--- a/src/data/charset.nim
+++ b/src/data/charset.nim
@@ -3,6 +3,8 @@ import os
 import strutils
 import tables
 
+import utils/twtstr
+
 type Charset* = enum
   CHARSET_UNKNOWN
   CHARSET_UTF_8 = "UTF-8"
@@ -314,9 +316,32 @@ const CharsetMap = {
   "x-user-defined": CHARSET_X_USER_DEFINED
 }.toTable()
 
+func normalizeLocale(s: string): string =
+  for i in 0 ..< s.len:
+    if cast[uint8](s[i]) > 0x20 and s[i] != '_' and s[i] != '-':
+      result &= s[i].toLowerAscii()
+
+const NormalizedCharsetMap = (func(): Table[string, Charset] =
+  for k, v in CharsetMap:
+    result[k.normalizeLocale()] = v)()
+
+const DefaultCharset* = CHARSET_UTF_8
+
 proc getCharset*(s: string): Charset =
   return CharsetMap.getOrDefault(s.strip().toLower(), CHARSET_UNKNOWN)
 
+proc getLocaleCharset*(s: string): Charset =
+  let ss = s.after('.')
+  if ss != "":
+    return NormalizedCharsetMap.getOrDefault(ss.normalizeLocale(),
+      CHARSET_UNKNOWN)
+  # We could try to guess the charset based on the language here, like w3m
+  # does.
+  # However, these days it is more likely for any system to be using UTF-8
+  # than any other charset, irrespective of the language. So we just assume
+  # UTF-8.
+  return DefaultCharset
+
 iterator mappairs(path: string): tuple[a, b: int] =
   let s = staticRead(path)
   for line in s.split('\n'):
@@ -372,23 +397,21 @@ func loadGb18030Ranges(path: string): tuple[
     result.encode.add((uint16(n), uint16(index)))
   result.encode.sort()
 
+type UCS16x16* = tuple[ucs, p: uint16]
+
 func loadCharsetMap16(path: string, len: static uint16): tuple[
         decode: array[len, uint16],
-        encode: seq[
-          tuple[
-            ucs: uint16,
-            p: uint16 ]]] =
+        encode: seq[UCS16x16]] =
   for index, n in mappairs("res/map" / path):
     result.decode[uint16(index)] = uint16(n)
     result.encode.add((uint16(n), uint16(index)))
   result.encode.sort()
 
+type UCS32x16* = tuple[ucs: uint32, p: uint16]
+
 func loadBig5Map(path: string, offset: static uint16): tuple[
         decode: array[19782u16 - offset, uint32], # ouch (+75KB...)
-        encode: seq[
-          tuple[
-            ucs: uint32,
-            p: uint16 ]]] =
+        encode: seq[UCS32x16]] =
   for index, n in mappairs("res/map" / path):
     result.decode[uint16(index) - offset] = uint32(n)
     result.encode.add((uint32(n), uint16(index)))