author     bptato <nincsnevem662@gmail.com>   2023-05-19 01:50:17 +0200
committer  bptato <nincsnevem662@gmail.com>   2023-05-19 01:50:17 +0200
commit     26e8968a6499742cf37e00292a7d1c8ed620cad5 (patch)
tree       a3922f02f09b5c025dddcfe0e7a3a719c47ba4da /src
parent     dac6a09c14b258ed725dcb265305a6445edc02ad (diff)
download   chawan-26e8968a6499742cf37e00292a7d1c8ed620cad5.tar.gz
Add display/output encoding
Some encodings are still missing
Diffstat (limited to 'src')
-rw-r--r--  src/config/config.nim            20
-rw-r--r--  src/data/charset.nim             39
-rw-r--r--  src/display/pager.nim             6
-rw-r--r--  src/display/term.nim             55
-rw-r--r--  src/encoding/decoderstream.nim   19
-rw-r--r--  src/encoding/encoderstream.nim  389
-rw-r--r--  src/html/htmlparser.nim          65
-rw-r--r--  src/main.nim                     13
-rw-r--r--  src/render/rendertext.nim         2
9 files changed, 524 insertions, 84 deletions
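
The new display/output encoding is surfaced in two places below: an optional display_charset field in EncodingConfig (src/config/config.nim) and a -O switch in src/main.nim. A hedged usage sketch, assuming the browser binary is invoked as cha; note that the option parser as committed matches the long form --output-charset while the help text advertises --display-charset, so the short form is used here:

    cha -d -O euc-jp https://example.org/ > page.txt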
diff --git a/src/config/config.nim b/src/config/config.nim
index e7bb0c63..45c177ad 100644
--- a/src/config/config.nim
+++ b/src/config/config.nim
@@ -67,6 +67,7 @@ type
     wrap*: bool
 
   EncodingConfig = object
+    display_charset*: Option[Charset]
     document_charset*: seq[Charset]
 
   ExternalConfig = object
@@ -292,7 +293,6 @@ proc parseConfigValue(x: var Option[ColorMode], v: TomlValue, k: string)
 proc parseConfigValue(x: var Option[FormatMode], v: TomlValue, k: string)
 proc parseConfigValue(x: var FormatMode, v: TomlValue, k: string)
 proc parseConfigValue(x: var RGBAColor, v: TomlValue, k: string)
-proc parseConfigValue(x: var Option[bool], v: TomlValue, k: string)
 proc parseConfigValue[T](x: var Option[T], v: TomlValue, k: string)
 proc parseConfigValue(x: var ActionMap, v: TomlValue, k: string)
 proc parseConfigValue(x: var CSSConfig, v: TomlValue, k: string)
@@ -413,24 +413,14 @@ proc parseConfigValue(x: var RGBAColor, v: TomlValue, k: string) =
         "' for key " & k)
   x = c.get
 
-proc parseConfigValue(x: var Option[bool], v: TomlValue, k: string) =
-  typeCheck(v, {VALUE_STRING, VALUE_BOOLEAN}, k)
-  if v.vt == VALUE_STRING:
-    if v.s == "auto":
-      x = none(bool)
-    else:
-      raise newException(ValueError, "invalid value '" & v.s &
-        "' for key " & k)
+proc parseConfigValue[T](x: var Option[T], v: TomlValue, k: string) =
+  if v.vt == VALUE_STRING and v.s == "auto":
+    x = none(T)
   else:
-    var y: bool
+    var y: T
     parseConfigValue(y, v, k)
     x = some(y)
 
-proc parseConfigValue[T](x: var Option[T], v: TomlValue, k: string) =
-  var y: T
-  parseConfigValue(y, v, k)
-  x = some(y)
-
 proc parseConfigValue(x: var ActionMap, v: TomlValue, k: string) =
   typeCheck(v, VALUE_TABLE, k)
   for kk, vv in v:
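
The hunk above folds the bool-only "auto" handling into the generic Option[T] overload: the string "auto" now yields none(T) for any T, and every other value is parsed as T and wrapped in some(). A minimal standalone sketch of that rule; parseOpt and the strutils parsers are hypothetical stand-ins for the real TomlValue machinery:

    import options, strutils

    # Stand-in for the generic parseConfigValue overload: "auto" clears the
    # option, anything else is parsed as T and wrapped in some().
    proc parseOpt[T](x: var Option[T], raw: string) =
      if raw == "auto":
        x = none(T)
      else:
        when T is bool:
          x = some(parseBool(raw))
        elif T is SomeInteger:
          x = some(T(parseInt(raw)))

    var altScreen: Option[bool]
    parseOpt(altScreen, "auto")    # none(bool): leave the setting on auto-detect
    doAssert altScreen.isNone
    parseOpt(altScreen, "true")
    doAssert altScreen == some(true)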
diff --git a/src/data/charset.nim b/src/data/charset.nim
index f93a82b3..45d7786c 100644
--- a/src/data/charset.nim
+++ b/src/data/charset.nim
@@ -3,6 +3,8 @@ import os
 import strutils
 import tables
 
+import utils/twtstr
+
 type Charset* = enum
   CHARSET_UNKNOWN
   CHARSET_UTF_8 = "UTF-8"
@@ -314,9 +316,32 @@ const CharsetMap = {
   "x-user-defined": CHARSET_X_USER_DEFINED
 }.toTable()
 
+func normalizeLocale(s: string): string =
+  for i in 0 ..< s.len:
+    if cast[uint8](s[i]) > 0x20 and s[i] != '_' and s[i] != '-':
+      result &= s[i].toLowerAscii()
+
+const NormalizedCharsetMap = (func(): Table[string, Charset] =
+  for k, v in CharsetMap:
+    result[k.normalizeLocale()] = v)()
+
+const DefaultCharset* = CHARSET_UTF_8
+
 proc getCharset*(s: string): Charset =
   return CharsetMap.getOrDefault(s.strip().toLower(), CHARSET_UNKNOWN)
 
+proc getLocaleCharset*(s: string): Charset =
+  let ss = s.after('.')
+  if ss != "":
+    return NormalizedCharsetMap.getOrDefault(ss.normalizeLocale(),
+      CHARSET_UNKNOWN)
+  # We could try to guess the charset based on the language here, like w3m
+  # does.
+  # However, these days it is more likely for any system to be using UTF-8
+  # than any other charset, irrespective of the language. So we just assume
+  # UTF-8.
+  return DefaultCharset
+
 iterator mappairs(path: string): tuple[a, b: int] =
   let s = staticRead(path)
   for line in s.split('\n'):
@@ -372,23 +397,21 @@ func loadGb18030Ranges(path: string): tuple[
     result.encode.add((uint16(n), uint16(index)))
   result.encode.sort()
 
+type UCS16x16* = tuple[ucs, p: uint16]
+
 func loadCharsetMap16(path: string, len: static uint16): tuple[
         decode: array[len, uint16],
-        encode: seq[
-          tuple[
-            ucs: uint16,
-            p: uint16 ]]] =
+        encode: seq[UCS16x16]] =
   for index, n in mappairs("res/map" / path):
     result.decode[uint16(index)] = uint16(n)
     result.encode.add((uint16(n), uint16(index)))
   result.encode.sort()
 
+type UCS32x16* = tuple[ucs: uint32, p: uint16]
+
 func loadBig5Map(path: string, offset: static uint16): tuple[
         decode: array[19782u16 - offset, uint32], # ouch (+75KB...)
-        encode: seq[
-          tuple[
-            ucs: uint32,
-            p: uint16 ]]] =
+        encode: seq[UCS32x16]] =
   for index, n in mappairs("res/map" / path):
     result.decode[uint16(index) - offset] = uint32(n)
     result.encode.add((uint32(n), uint16(index)))
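
getLocaleCharset() above takes the part of a locale string after the dot ("UTF-8" in "en_US.UTF-8"), normalizes it by folding case and dropping '-' and '_', and looks it up in a normalized copy of CharsetMap; with no encoding suffix it falls back to UTF-8 instead of guessing from the language. A self-contained sketch of the same lookup, where localeCharsets is a tiny illustrative subset rather than the real map:

    import strutils, tables

    const localeCharsets = {"utf8": "UTF-8", "eucjp": "EUC-JP"}.toTable()

    func normalizeLocale(s: string): string =
      # keep printable characters only, fold case, drop '-' and '_'
      for c in s:
        if ord(c) > 0x20 and c notin {'-', '_'}:
          result.add(c.toLowerAscii())

    proc guessCharset(locale: string): string =
      let dot = locale.find('.')
      if dot != -1:
        return localeCharsets.getOrDefault(
          normalizeLocale(locale[dot + 1 .. ^1]), "unknown")
      return "UTF-8"   # no encoding suffix: assume UTF-8, like the patch

    doAssert guessCharset("en_US.UTF-8") == "UTF-8"
    doAssert guessCharset("ja_JP.eucJP") == "EUC-JP"
    doAssert guessCharset("C") == "UTF-8"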
diff --git a/src/display/pager.nim b/src/display/pager.nim
index bd07d52e..1767f09b 100644
--- a/src/display/pager.nim
+++ b/src/display/pager.nim
@@ -318,6 +318,7 @@ proc drawBuffer*(pager: Pager, container: Container, ostream: Stream) =
       ostream.write(line.str & "\n")
     else:
       var x = 0
+      var w = 0
       var i = 0
       var s = ""
       for f in line.formats:
@@ -327,9 +328,10 @@ proc drawBuffer*(pager: Pager, container: Container, ostream: Stream) =
           fastRuneAt(line.str, i, r)
           outstr &= r
           x += r.width()
-        s &= outstr
+        s &= pager.term.processOutputString(outstr, w)
         s &= pager.term.processFormat(format, f.format)
-      s &= line.str.substr(i) & pager.term.processFormat(format, newFormat()) & "\n"
+      s &= pager.term.processOutputString(line.str.substr(i), w)
+      s &= pager.term.processFormat(format, newFormat()) & "\n"
       ostream.write(s))
   ostream.flush()
 
diff --git a/src/display/term.nim b/src/display/term.nim
index ce518428..e665faee 100644
--- a/src/display/term.nim
+++ b/src/display/term.nim
@@ -1,6 +1,7 @@
 import math
 import options
 import os
+import streams
 import tables
 import terminal
 import unicode
@@ -8,6 +9,8 @@ import unicode
 import bindings/termcap
 import buffer/cell
 import config/config
+import data/charset
+import encoding/encoderstream
 import io/window
 import utils/twtstr
 import types/color
@@ -39,6 +42,7 @@ type
 
   Terminal* = ref TerminalObj
   TerminalObj = object
+    cs: Charset
     config: Config
     infile: File
     outfile: File
@@ -320,18 +324,37 @@ proc windowChange*(term: Terminal, attrs: WindowAttributes) =
   term.canvas = newFixedGrid(attrs.width, attrs.height)
   term.cleared = false
 
-proc processOutputString(term: Terminal, str: string, w: var int): string =
+proc processOutputString*(term: Terminal, str: string, w: var int): string =
   if str.validateUtf8() != -1:
     return "?"
-  for r in str.runes():
-    # twidth wouldn't work here, the view may start at the nth character.
-    # pager must ensure tabs are converted beforehand.
-    let tw = r.width()
-    if r.isControlChar():
-      result &= "^" & getControlLetter(char(r))
-    elif tw != 0:
-      result &= r
-    w += tw
+  if term.cs != CHARSET_UTF_8:
+    #TODO: This is incredibly inefficient.
+    var u32buf = ""
+    for r in str.runes():
+      let tw = r.width()
+      if r.isControlChar():
+        u32buf &= char(0) & char(0) & char(0) & "^" &
+          char(0) & char(0) & char(0) & getControlLetter(char(r))
+      elif tw != 0:
+        let ol = u32buf.len
+        u32buf.setLen(ol + sizeof(uint32))
+        var u32 = cast[uint32](r)
+        copyMem(addr u32buf[ol], addr u32, sizeof(u32))
+      w += tw
+    let ss = newStringStream(u32buf)
+    let encoder = newEncoderStream(ss, cs = term.cs,
+      errormode = ENCODER_ERROR_MODE_FATAL)
+    result &= encoder.readAll()
+  else:
+    for r in str.runes():
+      # twidth wouldn't work here, the view may start at the nth character.
+      # pager must ensure tabs are converted beforehand.
+      let tw = r.width()
+      if r.isControlChar():
+        result &= "^" & getControlLetter(char(r))
+      elif tw != 0:
+        result &= r
+      w += tw
 
 proc generateFullOutput(term: Terminal, grid: FixedGrid): string =
   var format = newFormat()
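
When the terminal charset is not UTF-8, the branch above first widens the visible runes into a native-endian uint32 buffer (caret-escaping control characters), then pushes that buffer through an EncoderStream to obtain bytes in the terminal charset. A stdlib-only sketch of the widening step; packUtf32 is a hypothetical helper, the real code does this inline:

    import streams, unicode

    proc packUtf32(s: string): string =
      # append each rune as one machine-word uint32, the input format that
      # EncoderStream consumes
      for r in s.runes():
        let ol = result.len
        result.setLen(ol + sizeof(uint32))
        var u32 = cast[uint32](r)
        copyMem(addr result[ol], addr u32, sizeof(u32))

    let u32buf = packUtf32("héllo")
    doAssert u32buf.len == 5 * sizeof(uint32)
    # the patch wraps this as newStringStream(u32buf) and hands it to
    # newEncoderStream(ss, cs = term.cs, errormode = ENCODER_ERROR_MODE_FATAL)
    let ss = newStringStream(u32buf)
    doAssert not ss.atEnd()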
@@ -427,6 +450,18 @@ proc applyConfig(term: Terminal) =
   if term.isatty() and term.config.display.alt_screen.isSome:
     term.smcup = term.config.display.alt_screen.get
   term.mincontrast = term.config.display.minimum_contrast
+  if term.config.encoding.display_charset.isSome:
+    term.cs = term.config.encoding.display_charset.get
+  else:
+    term.cs = DefaultCharset
+    for s in ["LC_ALL", "LC_CTYPE", "LANG"]:
+      let env = getEnv(s)
+      if env == "":
+        continue
+      let cs = getLocaleCharset(env)
+      if cs != CHARSET_UNKNOWN:
+        term.cs = cs
+        break
 
 proc outputGrid*(term: Terminal) =
   if term.config.display.force_clear:
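
applyConfig() now resolves the terminal charset as follows: an explicit display_charset from the config wins; otherwise the first of LC_ALL, LC_CTYPE and LANG that is set and names a known charset is used; otherwise DefaultCharset (UTF-8) stays. A minimal sketch of that fallback order, with knownCharset as a hypothetical stand-in for getLocaleCharset:

    import os, strutils

    proc knownCharset(locale: string): string =
      # stand-in for getLocaleCharset(); only recognizes a UTF-8 suffix here
      if locale.endsWith(".UTF-8"):
        return "UTF-8"
      return ""

    proc pickDisplayCharset(configured = ""): string =
      if configured != "":
        return configured          # explicit display_charset wins
      result = "UTF-8"             # DefaultCharset
      for name in ["LC_ALL", "LC_CTYPE", "LANG"]:
        let env = getEnv(name)
        if env == "":
          continue
        let cs = knownCharset(env)
        if cs != "":
          return cs

    echo pickDisplayCharset()      # e.g. UTF-8 on an en_US.UTF-8 system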
diff --git a/src/encoding/decoderstream.nim b/src/encoding/decoderstream.nim
index 425f264f..8bfd4d10 100644
--- a/src/encoding/decoderstream.nim
+++ b/src/encoding/decoderstream.nim
@@ -8,8 +8,7 @@ import utils/twtstr
 # DecoderStream decodes any encoding to valid utf-32.
 type
   DecoderErrorMode* = enum
-    DECODER_ERROR_MODE_FATAL, DECODER_ERROR_MODE_REPLACEMENT,
-    DECODER_ERROR_MODE_HTML
+    DECODER_ERROR_MODE_FATAL, DECODER_ERROR_MODE_REPLACEMENT
 
   ISO2022JPState = enum
     STATE_ASCII, STATE_ROMAN, STATE_KATAKANA, STATE_LEAD_BYTE,
@@ -27,6 +26,7 @@ type
     c: uint32
     case charset: Charset
     of CHARSET_UTF_8:
+      u8c: uint32
       u8needed: int
       u8seen: int
       u8bounds: Slice[uint8]
@@ -83,22 +83,11 @@ proc handleError(stream: DecoderStream, oq: ptr UncheckedArray[uint32], olen: in
   of DECODER_ERROR_MODE_FATAL:
     stream.isend = true
     stream.failed = true
-  of DECODER_ERROR_MODE_HTML:
-    if stream.charset == CHARSET_UTF_8:
-      # "html" mode is handled as "replacement" for utf-8.
-      stream.append_codepoint 0xFFFD, oq, olen, n
-    else:
-      stream.append_codepoint '&', oq, olen, n
-      stream.append_codepoint '#', oq, olen, n
-      while stream.c > 0:
-        stream.append_codepoint cast[char](0x30 + stream.c mod 10), oq, olen, n
-        stream.c = stream.c div 10
-      stream.append_codepoint ';', oq, olen, n
   of DECODER_ERROR_MODE_REPLACEMENT:
     stream.append_codepoint 0xFFFD, oq, olen, n
 
 proc decodeUTF8(stream: DecoderStream, iq: var seq[uint8], oq: ptr UncheckedArray[uint32], ilen, olen: int, n: var int) =
-  var c = stream.c
+  var c = stream.u8c
   var needed = stream.u8needed
   var seen = stream.u8seen
   var bounds = stream.u8bounds
@@ -156,7 +145,7 @@ proc decodeUTF8(stream: DecoderStream, iq: var seq[uint8], oq: ptr UncheckedArra
       needed = 0
       seen = 0
     inc i
-  stream.c = c
+  stream.u8c = c
   stream.u8bounds = bounds
   stream.u8seen = seen
   stream.u8needed = needed
diff --git a/src/encoding/encoderstream.nim b/src/encoding/encoderstream.nim
new file mode 100644
index 00000000..155b7d31
--- /dev/null
+++ b/src/encoding/encoderstream.nim
@@ -0,0 +1,389 @@
+# Heavily based on https://encoding.spec.whatwg.org/
+
+import algorithm
+import streams
+import unicode
+
+import data/charset
+
+# EncoderStream encodes utf-32 to the specified encoding.
+type
+  EncoderErrorMode* = enum
+    ENCODER_ERROR_MODE_FATAL, ENCODER_ERROR_MODE_HTML
+
+  ISO2022JPState = enum
+    STATE_ASCII, STATE_ROMAN, STATE_JIS0208
+
+  EncoderStream* = ref object
+    source: Stream
+    errormode: EncoderErrorMode
+    isend: bool
+    failed*: bool
+    bufs: seq[seq[uint8]]
+    bs: int
+    bi: int
+    buflen: int
+    errc: uint32
+    charset: Charset
+
+template append_byte_buf(stream: EncoderStream, c: uint8) =
+  if stream.bi >= stream.buflen:
+    stream.bufs.add(newSeqUninitialized[uint8](stream.buflen))
+    stream.bi = 0
+  stream.bufs[^1][stream.bi] = c
+  inc stream.bi
+
+template append_byte(stream: EncoderStream, c: uint8,
+    oq: ptr UncheckedArray[uint8], olen: int, n: var int) =
+  if n < olen:
+    oq[n] = c
+    inc n
+  else:
+    append_byte_buf stream, c
+
+template append_byte(stream: EncoderStream, c: char,
+    oq: ptr UncheckedArray[uint8], olen: int, n: var int) =
+  stream.append_byte cast[uint8](c), oq, olen, n
+
+template append_byte(stream: EncoderStream, c: uint32,
+    oq: ptr UncheckedArray[uint8], olen: int, n: var int) =
+  stream.append_byte cast[uint8](c), oq, olen, n
+
+template append_byte(stream: EncoderStream, c: int,
+    oq: ptr UncheckedArray[uint8], olen: int, n: var int) =
+  stream.append_byte cast[uint8](c), oq, olen, n
+
+proc handleError(stream: EncoderStream, oq: ptr UncheckedArray[uint8],
+    olen: int, n: var int, c: uint32) =
+  case stream.errormode
+  of ENCODER_ERROR_MODE_FATAL:
+    stream.isend = true
+    stream.failed = true
+  of ENCODER_ERROR_MODE_HTML:
+    stream.append_byte '&', oq, olen, n
+    stream.append_byte '#', oq, olen, n
+    if stream.errc == 0:
+      stream.append_byte '0', oq, olen, n
+    else:
+      while stream.errc > 0:
+        stream.append_byte cast[char](0x30 + stream.errc mod 10), oq, olen, n
+        stream.errc = stream.errc div 10
+    stream.append_byte ';', oq, olen, n
+
+proc gb18030RangesPointer(c: uint32): uint32 =
+  if c == 0xE7C7:
+    return 7457
+  # Let offset be the last pointer in index gb18030 ranges that is less than or
+  # equal to pointer and code point offset its corresponding code point.
+  var offset: uint32
+  var p: uint32
+  if c >= 0x10000:
+    # omitted from the map for storage efficiency
+    offset = 0x10000
+    p = 189000
+  elif c >= 0xFFE6:
+    # Needed because upperBound returns the first element greater than pointer
+    # OR last on failure, so we can't just remove one if p is e.g. 39400.
+    offset = 0xFFE6
+    p = 39394
+  else:
+    # Find the first range that is greater than p, or last if no such element
+    # is found.
+    # We want the last that is <=, so decrease index by one.
+    let i = upperBound(Gb18030RangesEncode, c, func(a: tuple[ucs, p: uint16], b: uint32): int =
+      cmp(cast[uint32](a.ucs), b))
+    let elem = Gb18030RangesEncode[i - 1]
+    offset = elem.ucs
+    p = elem.p
+  return p + c - offset
+
+proc encodeUTF8(stream: EncoderStream, iq: var seq[uint32],
+    oq: ptr UncheckedArray[uint8], ilen, olen: int, n: var int) =
+  var i = 0
+  while i < ilen:
+    let c = iq[i]
+    var count: int
+    var offset: uint8
+    case c
+    of 0x0080..0x07FF:
+      count = 1
+      offset = 0xC0
+    of 0x0800..0xFFFF:
+      count = 2
+      offset = 0xE0
+    of 0x10000..0x10FFFF:
+      count = 3
+      offset = 0xF0
+    else:
+      assert false
+    stream.append_byte (c shr (6 * count)) + offset, oq, olen, n
+    while count > 0:
+      let tmp = c shr (6 * (count - 1))
+      stream.append_byte 0x80 or (tmp and 0x3F), oq, olen, n
+      dec count
+
+proc encodeSingleByte(stream: EncoderStream, iq: var seq[uint32],
+    oq: ptr UncheckedArray[uint8], ilen, olen: int, n: var int,
+    map: seq[tuple[ucs: uint16, val: char]]) =
+  for i in 0 ..< ilen:
+    let c = iq[i]
+    if c < 0x80:
+      stream.append_byte cast[uint8](c), oq, olen, n
+      continue
+    if c <= 0xFFFF:
+      let j = binarySearch(map, cast[uint16](c),
+        proc(a: tuple[ucs: uint16, val: char], b: uint16): int =
+          cmp(a.ucs, b))
+      if j != -1:
+        stream.append_byte cast[uint8](map[j].val) + 0x80, oq, olen, n
+        continue
+    stream.handleError(oq, olen, n, c)
+
+proc encodeXUserDefined(stream: EncoderStream, iq: var seq[uint32],
+    oq: ptr UncheckedArray[uint8], ilen, olen: int, n: var int) =
+  for i in 0 ..< ilen:
+    let c = iq[i]
+    if c < 0x80:
+      stream.append_byte cast[uint8](c), oq, olen, n
+      continue
+    if c in 0xF780u32..0xF7FFu32:
+      let b = cast[uint8](c - 0xF780 + 0x80)
+      stream.append_byte b, oq, olen, n
+      continue
+    stream.handleError(oq, olen, n, c)
+
+proc encodeGb18030(stream: EncoderStream, iq: var seq[uint32],
+    oq: ptr UncheckedArray[uint8], ilen, olen: int, n: var int,
+    isGBK = false) =
+  for c in iq:
+    if isGBK and c == 0x20AC:
+      stream.append_byte 0x80, oq, olen, n
+      continue
+    let i = if c > 0xFFFF: -1 else: binarySearch(Gb18030Encode, cast[uint16](c),
+      proc(a: UCS16x16, b: uint16): int =
+        cmp(a.ucs, b))
+    if i != -1:
+      let p = Gb18030Encode[i].p
+      let lead = p div 190 + 0x81
+      let trail = p mod 190
+      let offset: uint8 = if trail < 0x3F: 0x40 else: 0x41
+      stream.append_byte lead, oq, olen, n
+      stream.append_byte cast[uint8](trail) + offset, oq, olen, n
+      continue
+    if isGBK:
+      stream.handleError(oq, olen, n, c)
+      continue
+    var p = gb18030RangesPointer(c)
+    let b1 = p div (10 * 126 * 10)
+    p = p mod (10 * 126 * 10)
+    let b2 = p div (10 * 126)
+    p = p mod (10 * 126)
+    let b3 = p div 10
+    let b4 = p mod 10
+    stream.append_byte b1, oq, olen, n
+    stream.append_byte b2, oq, olen, n
+    stream.append_byte b3, oq, olen, n
+    stream.append_byte b4, oq, olen, n
+
+proc encodeBig5(stream: EncoderStream, iq: var seq[uint32],
+    oq: ptr UncheckedArray[uint8], ilen, olen: int, n: var int) =
+  for c in iq:
+    if c < 0x80:
+      stream.append_byte c, oq, olen, n
+      continue
+    let i = binarySearch(Big5Encode, cast[uint16](c),
+      proc(a: UCS32x16, b: uint16): int =
+        cmp(a.ucs, b))
+    if i == -1:
+      stream.handleError(oq, olen, n, c)
+      continue
+    let p = Big5Encode[i].p
+    let lead = p div 157 + 0x81
+    let trail = p mod 157
+    let offset: uint8 = if trail < 0x3F: 0x40 else: 0x62
+    stream.append_byte lead, oq, olen, n
+    stream.append_byte cast[uint8](trail) + offset, oq, olen, n
+
+proc encodeEUCJP(stream: EncoderStream, iq: var seq[uint32],
+    oq: ptr UncheckedArray[uint8], ilen, olen: int, n: var int) =
+  for c in iq:
+    if c < 0x80:
+      stream.append_byte c, oq, olen, n
+    elif c == 0xA5:
+      stream.append_byte 0x5C, oq, olen, n
+    elif c == 0x203E:
+      stream.append_byte 0x5C, oq, olen, n
+    elif c in 0xFF61u32..0xFF9Fu32:
+      stream.append_byte 0x8E, oq, olen, n
+      stream.append_byte c - 0xFF61 + 0xA1, oq, olen, n
+    else:
+      let c = if c == 0x2212:
+        0xFF0Du32
+      else:
+        c
+      let i = binarySearch(Jis0208Encode, cast[uint16](c),
+        proc(a: UCS16x16, b: uint16): int =
+          cmp(a.ucs, b))
+      if i != -1:
+        let p = Jis0208Encode[i].p
+        let lead = p div 94 + 0xA1
+        let trail = p mod 94 + 0xA1
+        stream.append_byte lead, oq, olen, n
+        stream.append_byte trail, oq, olen, n
+      else:
+        stream.handleError(oq, olen, n, c)
+
+# copy any data remaining from previous passes
+proc copyBuffers(stream: EncoderStream, oq: ptr UncheckedArray[uint8], olen: int): int =
+  if stream.bufs.len == 1:
+    # one page: stream.bs ..< stream.bi
+    let n = min((stream.bi - stream.bs) * sizeof(stream.bufs[0][0]), olen)
+    copyMem(addr oq[0], addr stream.bufs[0][stream.bs], n)
+    stream.bs += n
+    if stream.bs >= stream.bi:
+      # read entire page; recycle it
+      stream.bs = 0
+      stream.bi = 0
+    return n
+  else:
+    # multiple pages:
+    # stream.bs ..< stream.buflen
+    # 0 ..< stream.buflen
+    # ...
+    # 0 ..< stream.bi
+    let a = (stream.buflen - stream.bs) * sizeof(stream.bufs[0][0])
+    if a < olen:
+      copyMem(addr oq[0], addr stream.bufs[0][stream.bs], a)
+      var ns = a
+      stream.bs = 0
+      var i = 1
+      while i < stream.bufs.high:
+        let n = min(stream.buflen * sizeof(stream.bufs[0][0]), olen - ns)
+        copyMem(addr oq[ns], addr stream.bufs[i][0], n)
+        ns += n
+        if ns >= olen:
+          # i'th buffer still has contents.
+          stream.bs = n
+          break
+        stream.bs = 0
+        inc i
+      if ns < olen:
+        # last page
+        let n = min(stream.bi * sizeof(stream.bufs[0][0]), olen - ns)
+        copyMem(addr oq[ns], addr stream.bufs[i][0], n)
+        ns += n
+        stream.bs = n
+        if stream.bs >= stream.bi:
+          # read entire page; recycle it
+          stream.bs = 0
+          stream.bi = 0
+      for j in i ..< stream.bufs.len:
+        stream.bufs[j - i] = stream.bufs[j]
+      stream.bufs.setLen(stream.bufs.len - i)
+      return ns
+    elif a > olen:
+      copyMem(addr oq[0], addr stream.bufs[0][stream.bs], olen)
+      stream.bs += olen
+      assert stream.bs < stream.buflen
+      return olen
+    else: # a == olen
+      copyMem(addr oq[0], addr stream.bufs[0][stream.bs], a)
+      stream.bs = 0
+      stream.bufs.delete(0)
+      return a
+
+proc checkEnd(stream: EncoderStream, oq: ptr UncheckedArray[uint8], olen: int,
+    n: var int) =
+  if not stream.isend and stream.bufs.len == 1 and
+      stream.bs >= stream.bi and stream.source.atEnd:
+    stream.isend = true
+
+const ReadSize = 4096
+proc readData*(stream: EncoderStream, buffer: pointer, olen: int): int =
+  if olen == 0: return
+  let oq = cast[ptr UncheckedArray[uint8]](buffer)
+  result = stream.copyBuffers(oq, olen)
+  let olen = olen - result
+  if olen == 0 or stream.source.atEnd:
+    # either output filled with buffered data; nothing to decode
+    # or we're at the end of the source stream
+    stream.checkEnd(oq, olen, result)
+    return result
+  var iq = newSeqUninitialized[uint32](ReadSize div sizeof(uint32))
+  let ilen0 = stream.source.readData(cast[pointer](addr iq[0]), ReadSize)
+  assert ilen0 mod sizeof(uint32) == 0 #TODO what to do if false?
+  let ilen = ilen0 div sizeof(uint32)
+  case stream.charset
+  of CHARSET_UTF_8: stream.encodeUTF8(iq, oq, ilen, olen, result)
+  of CHARSET_IBM866: stream.encodeSingleByte(iq, oq, ilen, olen, result, IBM866Encode)
+  of CHARSET_ISO_8859_2: stream.encodeSingleByte(iq, oq, ilen, olen, result, ISO88592Encode)
+  of CHARSET_ISO_8859_3: stream.encodeSingleByte(iq, oq, ilen, olen, result, ISO88593Encode)
+  of CHARSET_ISO_8859_4: stream.encodeSingleByte(iq, oq, ilen, olen, result, ISO88594Encode)
+  of CHARSET_ISO_8859_5: stream.encodeSingleByte(iq, oq, ilen, olen, result, ISO88595Encode)
+  of CHARSET_ISO_8859_6: stream.encodeSingleByte(iq, oq, ilen, olen, result, ISO88596Encode)
+  of CHARSET_ISO_8859_7: stream.encodeSingleByte(iq, oq, ilen, olen, result, ISO88597Encode)
+  of CHARSET_ISO_8859_8,
+     CHARSET_ISO_8859_8_I: stream.encodeSingleByte(iq, oq, ilen, olen, result, ISO88598Encode)
+  of CHARSET_ISO_8859_10: stream.encodeSingleByte(iq, oq, ilen, olen, result, ISO885910Encode)
+  of CHARSET_ISO_8859_13: stream.encodeSingleByte(iq, oq, ilen, olen, result, ISO885913Encode)
+  of CHARSET_ISO_8859_14: stream.encodeSingleByte(iq, oq, ilen, olen, result, ISO885914Encode)
+  of CHARSET_ISO_8859_15: stream.encodeSingleByte(iq, oq, ilen, olen, result, ISO885915Encode)
+  of CHARSET_ISO_8859_16: stream.encodeSingleByte(iq, oq, ilen, olen, result, ISO885916Encode)
+  of CHARSET_KOI8_R: stream.encodeSingleByte(iq, oq, ilen, olen, result, KOI8REncode)
+  of CHARSET_KOI8_U: stream.encodeSingleByte(iq, oq, ilen, olen, result, KOI8UEncode)
+  of CHARSET_MACINTOSH: stream.encodeSingleByte(iq, oq, ilen, olen, result, MacintoshEncode)
+  of CHARSET_WINDOWS_874: stream.encodeSingleByte(iq, oq, ilen, olen, result, Windows874Encode)
+  of CHARSET_WINDOWS_1250: stream.encodeSingleByte(iq, oq, ilen, olen, result, Windows1250Encode)
+  of CHARSET_WINDOWS_1251: stream.encodeSingleByte(iq, oq, ilen, olen, result, Windows1251Encode)
+  of CHARSET_WINDOWS_1252: stream.encodeSingleByte(iq, oq, ilen, olen, result, Windows1252Encode)
+  of CHARSET_WINDOWS_1253: stream.encodeSingleByte(iq, oq, ilen, olen, result, Windows1253Encode)
+  of CHARSET_WINDOWS_1254: stream.encodeSingleByte(iq, oq, ilen, olen, result, Windows1254Encode)
+  of CHARSET_WINDOWS_1255: stream.encodeSingleByte(iq, oq, ilen, olen, result, Windows1255Encode)
+  of CHARSET_WINDOWS_1256: stream.encodeSingleByte(iq, oq, ilen, olen, result, Windows1256Encode)
+  of CHARSET_WINDOWS_1257: stream.encodeSingleByte(iq, oq, ilen, olen, result, Windows1257Encode)
+  of CHARSET_WINDOWS_1258: stream.encodeSingleByte(iq, oq, ilen, olen, result, Windows1258Encode)
+  of CHARSET_X_MAC_CYRILLIC: stream.encodeSingleByte(iq, oq, ilen, olen, result, XMacCyrillicEncode)
+  of CHARSET_GBK: stream.encodeGb18030(iq, oq, ilen, olen, result, true)
+  of CHARSET_GB18030: stream.encodeGb18030(iq, oq, ilen, olen, result)
+  of CHARSET_BIG5: stream.encodeBig5(iq, oq, ilen, olen, result)
+  of CHARSET_EUC_JP: stream.encodeEUCJP(iq, oq, ilen, olen, result)
+#  of CHARSET_ISO_2022_JP: stream.decodeISO2022JP(iq, oq, ilen, olen, result)
+#  of CHARSET_SHIFT_JIS: stream.decodeShiftJIS(iq, oq, ilen, olen, result)
+#  of CHARSET_EUC_KR: stream.decodeEUCKR(iq, oq, ilen, olen, result)
+#  of CHARSET_REPLACEMENT: stream.decodeReplacement(oq, olen, result)
+#  of CHARSET_UTF_16_LE: stream.decodeUTF16LE(iq, oq, ilen, olen, result)
+#  of CHARSET_UTF_16_BE: stream.decodeUTF16BE(iq, oq, ilen, olen, result)
+  of CHARSET_X_USER_DEFINED: stream.encodeXUserDefined(iq, oq, ilen, olen, result)
+  of CHARSET_UNKNOWN: assert false, "Somebody forgot to set the character set here"
+  else: assert false, "TODO"
+  stream.checkEnd(oq, olen, result)
+
+# Returns the number of bytes read.
+proc readData*(stream: EncoderStream, buf: var seq[uint8]): int =
+  return stream.readData(addr buf[0], buf.len * sizeof(buf[0]))
+
+proc atEnd*(stream: EncoderStream): bool =
+  return stream.isend
+
+proc readAll*(stream: EncoderStream): string =
+  var buf = newString(4096)
+  while not stream.atEnd:
+    let olen = stream.readData(addr buf[0], buf.len)
+    if olen < buf.len:
+      buf.setLen(olen)
+      result &= buf
+      break
+    result &= buf
+
+proc newEncoderStream*(source: Stream, cs = CHARSET_UTF_8, buflen = 4096,
+                       errormode: EncoderErrorMode = ENCODER_ERROR_MODE_HTML): EncoderStream =
+  result = EncoderStream(
+    source: source,
+    charset: cs,
+    buflen: buflen,
+    errormode: errormode
+  )
+  when nimvm:
+    result.bufs = @[newSeq[uint8](buflen)]
+  else:
+    result.bufs = @[newSeqUninitialized[uint8](buflen)]
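
The module is driven through readData/readAll after construction. The usage sketch below is grounded in the procs added above, but it only compiles inside the chawan tree since it imports the new encoderstream module and data/charset; the input must already be UTF-32 in machine byte order (here a single code point, U+00E9):

    import streams
    import data/charset
    import encoding/encoderstream

    var u32 = 0x00E9u32                      # é as one UTF-32 code unit
    var u32buf = newString(sizeof(uint32))
    copyMem(addr u32buf[0], addr u32, sizeof(u32))

    let encoder = newEncoderStream(newStringStream(u32buf), cs = CHARSET_EUC_JP,
      errormode = ENCODER_ERROR_MODE_HTML)   # unmappable input is emitted as a
                                             # numeric character reference
    let encoded = encoder.readAll()
    doAssert encoded.len > 0
    doAssert not encoder.failed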
diff --git a/src/html/htmlparser.nim b/src/html/htmlparser.nim
index d03e0d24..f6af8709 100644
--- a/src/html/htmlparser.nim
+++ b/src/html/htmlparser.nim
@@ -2174,50 +2174,49 @@ proc finishParsing(parser: var HTML5Parser) =
     script.execute()
   #TODO events
 
+proc bomSniff(inputStream: Stream): Charset =
+  # bom sniff
+  const u8bom = char(0xEF) & char(0xBB) & char(0xBF)
+  const bebom = char(0xFE) & char(0xFF)
+  const lebom = char(0xFF) & char(0xFE)
+  var bom = inputStream.readStr(2)
+  if bom == bebom:
+    return CHARSET_UTF_16_BE
+  elif bom == lebom:
+    return CHARSET_UTF_16_LE
+  else:
+    bom &= inputStream.readChar()
+    if bom == u8bom:
+      return CHARSET_UTF_8
+    else:
+      inputStream.setPosition(0)
+
 proc parseHTML*(inputStream: Stream, charsets: seq[Charset] = @[],
-    fallbackcs = CHARSET_UTF_8, window: Window = nil,
-    url: URL = nil, canReinterpret = true): Document =
+    fallbackcs = DefaultCharset, window: Window = nil, url: URL = nil,
+    canReinterpret = true): Document =
   var charsetStack: seq[Charset]
   for i in countdown(charsets.high, 0):
     charsetStack.add(charsets[i])
   var canReinterpret = canReinterpret
+  var confidence: CharsetConfidence
+  let scs = inputStream.bomSniff()
+  if scs != CHARSET_UNKNOWN:
+    charsetStack.add(scs)
+    confidence = CONFIDENCE_CERTAIN
+  elif charsetStack.len == 0:
+    charsetStack.add(fallbackcs)
   while true:
     var parser: HTML5Parser
-    var bom: string
-    let islastcs = charsetStack.len == 0
-    if not islastcs:
-      parser.charset = charsetStack.pop()
-      if not canReinterpret:
-        parser.confidence = CONFIDENCE_CERTAIN
-    else:
-      # bom sniff
-      const u8bom = char(0xEF) & char(0xBB) & char(0xBF)
-      const bebom = char(0xFE) & char(0xFF)
-      const lebom = char(0xFF) & char(0xFE)
-      bom = inputStream.readStr(2)
-      if bom == bebom:
-        parser.charset = CHARSET_UTF_16_BE
-        parser.confidence = CONFIDENCE_CERTAIN
-        bom = ""
-      elif bom == lebom:
-        parser.charset = CHARSET_UTF_16_LE
-        parser.confidence = CONFIDENCE_CERTAIN
-        bom = ""
-      else:
-        bom &= inputStream.readChar()
-        if bom == u8bom:
-          parser.charset = CHARSET_UTF_8
-          parser.confidence = CONFIDENCE_CERTAIN
-          bom = ""
-        else:
-          parser.charset = fallbackcs
-    let em = if islastcs or not canReinterpret:
+    parser.confidence = confidence
+    confidence = CONFIDENCE_TENTATIVE
+    parser.charset = charsetStack.pop()
+    if not canReinterpret:
+      parser.confidence = CONFIDENCE_CERTAIN
+    let em = if charsetStack.len == 0 or not canReinterpret:
       DECODER_ERROR_MODE_REPLACEMENT
     else:
       DECODER_ERROR_MODE_FATAL
     let decoder = newDecoderStream(inputStream, parser.charset, errormode = em)
-    for c in bom:
-      decoder.prepend(cast[uint32](c))
     parser.document = newDocument()
     parser.document.contentType = "text/html"
     if window != nil:
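
The refactor above hoists BOM detection out of the parse loop: bomSniff() consumes the byte-order mark when one is present (so the decoder never sees it and the old prepend() workaround goes away) and rewinds the stream to position 0 when there is none. A stdlib-only sketch of the same check, returning an empty string instead of CHARSET_UNKNOWN:

    import streams

    proc sniffBom(s: Stream): string =
      var bom = s.readStr(2)
      if bom == "\xFE\xFF": return "UTF-16BE"
      if bom == "\xFF\xFE": return "UTF-16LE"
      bom &= s.readChar()
      if bom == "\xEF\xBB\xBF": return "UTF-8"
      s.setPosition(0)                 # no BOM: parse from the first byte
      return ""

    doAssert sniffBom(newStringStream("\xEF\xBB\xBFhi")) == "UTF-8"
    let plain = newStringStream("hi")
    doAssert sniffBom(plain) == ""
    doAssert plain.readStr(2) == "hi"  # position was rewound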
diff --git a/src/main.nim b/src/main.nim
index 0371c801..605fba94 100644
--- a/src/main.nim
+++ b/src/main.nim
@@ -35,7 +35,8 @@ Options:
     -c, --css <stylesheet>      Pass stylesheet (e.g. -c 'a{color: blue}')
     -o, --opt <config>          Pass config options (e.g. -o 'page.q="QUIT"')
     -T, --type <type>           Specify content mime type
-    -I, --input-charset <name>  Specify document charset
+    -I, --input-charset <enc>   Specify document charset
+    -O, --display-charset <enc> Specify display charset
     -M, --monochrome            Set color-mode to 'monochrome'
     -V, --visual                Visual startup mode
     -r, --run <script/file>     Run passed script or file
@@ -84,6 +85,16 @@ while i < params.len:
       cs = some(c)
     else:
       help(1)
+  of "-O", "--output-charset":
+    inc i
+    if i < params.len:
+      let c = getCharset(params[i])
+      if c == CHARSET_UNKNOWN:
+        stderr.write("Unknown charset " & params[i] & "\n")
+        quit(1)
+      conf.encoding.display_charset = some(c)
+    else:
+      help(1)
   of "-":
     discard # emulate programs that accept - as stdin
   of "-d", "-dump", "--dump":
diff --git a/src/render/rendertext.nim b/src/render/rendertext.nim
index d0576c75..ed79d60f 100644
--- a/src/render/rendertext.nim
+++ b/src/render/rendertext.nim
@@ -21,6 +21,8 @@ proc newStreamRenderer*(stream: Stream, charsets: seq[Charset]): StreamRenderer
   result.ansiparser.state = PARSE_DONE
   for i in countdown(charsets.high, 0):
     result.charsets.add(charsets[i])
+  if charsets.len == 0:
+    result.charsets = @[DefaultCharset]
   let cs = result.charsets.pop()
   let em = if charsets.len > 0:
     DECODER_ERROR_MODE_FATAL