about summary refs log tree commit diff stats
path: root/src/encoding
diff options
context:
space:
mode:
authorbptato <nincsnevem662@gmail.com>2022-12-10 19:48:11 +0100
committerbptato <nincsnevem662@gmail.com>2022-12-10 19:48:11 +0100
commit5c267ce6c9c1add997767f5eaad248cdd829c6fc (patch)
treee3b8a1cdd7f7f67f2d810ce1747fa197c1387ff1 /src/encoding
parent78cc5143a3a98f2803258d42818cf17727d70fe7 (diff)
downloadchawan-5c267ce6c9c1add997767f5eaad248cdd829c6fc.tar.gz
Fix decoder bugs, move decoder to encoding/
Diffstat (limited to 'src/encoding')
-rw-r--r--src/encoding/decoderstream.nim844
1 files changed, 844 insertions, 0 deletions
diff --git a/src/encoding/decoderstream.nim b/src/encoding/decoderstream.nim
new file mode 100644
index 00000000..15d6311c
--- /dev/null
+++ b/src/encoding/decoderstream.nim
@@ -0,0 +1,844 @@
+import algorithm
+import streams
+import unicode
+
+import data/charset
+import utils/twtstr
+
+# DecoderStream decodes any encoding to valid utf-32.
+type
+  DecoderErrorMode* = enum
+    DECODER_ERROR_MODE_FATAL, DECODER_ERROR_MODE_REPLACEMENT,
+    DECODER_ERROR_MODE_HTML
+
+  ISO2022JPState = enum
+    STATE_ASCII, STATE_ROMAN, STATE_KATAKANA, STATE_LEAD_BYTE,
+    STATE_TRAIL_BYTE, STATE_ESCAPE_START, STATE_ESCAPE
+
+  DecoderStream* = ref object
+    source: Stream
+    errormode: DecoderErrorMode
+    isend: bool
+    failed*: bool
+    bufs: seq[seq[uint32]]
+    bs: int
+    bi: int
+    buflen: int
+    c: uint32
+    case charset: Charset
+    of CHARSET_UTF_8:
+      u8needed: int
+      u8seen: int
+      u8bounds: Slice[uint8]
+    of CHARSET_GBK, CHARSET_GB18030:
+      gb18first: uint8
+      gb18second: uint8
+      gb18third: uint8
+      gb18buf: uint8
+      gb18hasbuf: bool
+    of CHARSET_BIG5:
+      big5lead: uint8
+    of CHARSET_EUC_JP:
+      eucjplead: uint8
+      eucjpjis0212: bool
+    of CHARSET_ISO_2022_JP:
+      iso2022jplead: uint8
+      iso2022jpstate: ISO2022JPState
+      iso2022jpoutputstate: ISO2022JPState
+      iso2022jpoutput: bool
+      iso2022jpbuf: uint8
+      iso2022jphasbuf: bool
+    of CHARSET_SHIFT_JIS:
+      sjislead: uint8
+    of CHARSET_EUC_KR:
+      euckrlead: uint8
+    of CHARSET_UTF_16_BE, CHARSET_UTF_16_LE:
+      u16lead: uint8
+      u16surr: uint16
+      u16haslead: bool
+      u16hassurr: bool
+    of CHARSET_REPLACEMENT:
+      replreported: bool
+    else: discard
+
+template append_codepoint_buf(stream: DecoderStream, c: uint32) =
+  if stream.bi >= stream.buflen:
+    stream.bufs.add(newSeqUninitialized[uint32](stream.buflen))
+    stream.bi = 0
+  stream.bufs[^1][stream.bi] = c
+  inc stream.bi
+
+template append_codepoint(stream: DecoderStream, c: uint32, oq: ptr UncheckedArray[uint32], olen: int, n: var int) =
+  if n < olen:
+    oq[n div sizeof(uint32)] = c
+    n += sizeof(uint32)
+  else:
+    append_codepoint_buf stream, c
+
+template append_codepoint(stream: DecoderStream, c: char, oq: ptr UncheckedArray[uint32], olen: int, n: var int) =
+  stream.append_codepoint cast[uint32](c), oq, olen, n
+
+proc handleError(stream: DecoderStream, oq: ptr UncheckedArray[uint32], olen: int, n: var int) =
+  case stream.errormode
+  of DECODER_ERROR_MODE_FATAL:
+    stream.isend = true
+    stream.failed = true
+  of DECODER_ERROR_MODE_HTML:
+    if stream.charset == CHARSET_UTF_8:
+      # "html" mode is handled as "replacement" for utf-8.
+      stream.append_codepoint 0xFFFD, oq, olen, n
+    else:
+      stream.append_codepoint '&', oq, olen, n
+      stream.append_codepoint '#', oq, olen, n
+      while stream.c > 0:
+        stream.append_codepoint cast[char](0x30 + stream.c mod 10), oq, olen, n
+        stream.c = stream.c div 10
+      stream.append_codepoint ';', oq, olen, n
+  of DECODER_ERROR_MODE_REPLACEMENT:
+    stream.append_codepoint 0xFFFD, oq, olen, n
+
+proc decodeUTF8(stream: DecoderStream, iq: var seq[uint8], oq: ptr UncheckedArray[uint32], ilen, olen: int, n: var int) =
+  var c = stream.c
+  var needed = stream.u8needed
+  var seen = stream.u8seen
+  var bounds = stream.u8bounds
+  var i = 0
+  while i < ilen:
+    let b = iq[i]
+    if needed == 0:
+      case b
+      of 0x00u8 .. 0x7Fu8:
+        stream.append_codepoint uint32(b), oq, olen, n
+      of 0xC2u8 .. 0xDFu8:
+        needed = 1
+        c = cast[uint32](b) and 0x1F
+      of 0xE0u8:
+        bounds.a = 0xA0
+        needed = 2
+        c = cast[uint32](b) and 0xF
+      of 0xEDu8:
+        bounds.b = 0x9F
+        needed = 2
+        c = cast[uint32](b) and 0xF
+      of 0xE1u8 .. 0xECu8, 0xEEu8 .. 0xEFu8:
+        needed = 2
+        c = cast[uint32](b) and 0xF
+      of 0xF0u8:
+        bounds.a = 0x90
+        needed = 3
+        c = cast[uint32](b) and 0x7
+      of 0xF4u8:
+        bounds.b = 0x8F
+        needed = 3
+        c = cast[uint32](b) and 0x7
+      of 0xF1u8 .. 0xF3u8:
+        needed = 3
+        c = cast[uint32](b) and 0x7
+      else:
+        stream.handleError(oq, olen, n)
+        if stream.isend: # fatal error
+          break
+      inc i
+      continue
+    if b notin bounds:
+      c = 0
+      needed = 0
+      seen = 0
+      bounds = 0x80u8 .. 0xBFu8
+      stream.handleError(oq, olen, n)
+      continue # prepend (no inc i)
+    bounds = 0x80u8 .. 0xBFu8
+    c = (c shl 6) or (b and 0x3F)
+    inc seen
+    if seen == needed:
+      stream.append_codepoint c, oq, olen, n
+      c = 0
+      needed = 0
+      seen = 0
+    inc i
+  stream.c = c
+  stream.u8bounds = bounds
+  stream.u8seen = seen
+  stream.u8needed = needed
+
+proc gb18RangesCodepoint(p: uint32): uint32 =
+  if p > 39419 and p < 189000 or p > 1237575:
+    return high(uint32) # null
+  if p == 7457:
+    return 0xE7C7
+  # Let offset be the last pointer in index gb18030 ranges that is less than or
+  # equal to pointer and code point offset its corresponding code point.
+  var offset: uint32
+  var c: uint32
+  if p >= 189000:
+    # omitted from the map for storage efficiency
+    offset = 189000
+    c = 0x10000
+  elif p >= 39394:
+    # Needed because upperBound returns the first element greater than pointer
+    # OR last on failure, so we can't just remove one if p is e.g. 39400.
+    offset = 39394
+    c = 0xFFE6
+  else:
+    # Find the first range that is greater than p, or last if no such element
+    # is found.
+    # We want the last that is <=, so decrease index by one.
+    let i = upperBound(Gb18030RangesDecode, p, func(a: tuple[p, ucs: uint16], b: uint32): int =
+      cmp(cast[uint32](a.p), b))
+    let elem = Gb18030RangesDecode[i - 1]
+    offset = elem.p
+    c = elem.ucs
+  return c + p - offset
+
+proc decodeGb18030(stream: DecoderStream, iq: var seq[uint8],
+                   oq: ptr UncheckedArray[uint32], ilen, olen: int,
+                   n: var int) =
+  var first = stream.gb18first
+  var second = stream.gb18second
+  var third = stream.gb18third
+  var buf = stream.gb18buf
+  var hasbuf = stream.gb18hasbuf
+  var i = 0
+  while i < ilen:
+    let b = if hasbuf:
+      hasbuf = false
+      dec i
+      buf
+    else:
+      iq[i]
+    if third != 0:
+      if b notin 0x30u8 .. 0x39u8:
+        hasbuf = true
+        buf = second
+        first = third
+        first = 0
+        second = 0
+        third = 0
+        stream.handleError(oq, olen, n)
+        if stream.isend: break
+        continue # prepend (no inc i)
+      else:
+        let p = ((uint32(first) - 0x81) * 10 * 126 * 10) +
+                ((uint32(second) - 0x30) * (10 * 126)) +
+                ((uint32(third) - 0x81) * 10) + uint32(b) - 0x30
+        let c = gb18RangesCodepoint(p)
+        first = 0
+        second = 0
+        third = 0
+        if c == high(uint32): # null
+          stream.handleError(oq, olen, n)
+          if stream.isend: break
+        else:
+          stream.append_codepoint c, oq, olen, n
+    elif second != 0:
+      if b in 0x81u8 .. 0xFEu8:
+        third = b
+      else:
+        hasbuf = true
+        buf = second
+        first = 0
+        second = 0
+        third = 0
+        stream.handleError(oq, olen, n)
+        if stream.isend: break
+    elif first != 0:
+      if b in 0x30u8 .. 0x39u8:
+        second = b
+      else:
+        let ff = first
+        first = 0
+        if b in 0x40u8 .. 0x7Eu8:
+          let offset = if b < 0x7F: 0x40u32 else: 0x41u32
+          let p = (uint16(ff) - 0x81) * 190 + (uint16(b) - offset)
+          if p < Gb18030Decode.len:
+            let c = Gb18030Decode[cast[uint16](p)]
+            stream.append_codepoint uint32(c), oq, olen, n
+            inc i
+            continue
+        if cast[char](b) in Ascii:
+          continue # prepend (no inc i)
+        else:
+          stream.handleError(oq, olen, n)
+          if stream.isend: break
+    elif cast[char](b) in Ascii:
+      stream.append_codepoint b, oq, olen, n
+    elif b == 0x80:
+      stream.append_codepoint 0x20AC, oq, olen, n
+    elif b in 0x81u8 .. 0xFEu8:
+      first = b
+    else:
+      stream.handleError(oq, olen, n)
+      if stream.isend: break
+    inc i
+  stream.gb18first = first
+  stream.gb18second = second
+  stream.gb18third = third
+  stream.gb18buf = buf
+  stream.gb18hasbuf = hasbuf
+
+proc decodeBig5(stream: DecoderStream, iq: var seq[uint8],
+                oq: ptr UncheckedArray[uint32], ilen, olen: int,
+                n: var int) =
+  var i = 0
+  while i < ilen:
+    if stream.big5lead != 0:
+      let lead = uint32(stream.big5lead)
+      stream.big5lead = 0
+      let offset = if iq[i] < 0x7F: 0x40u16 else: 0x7E
+      if iq[i] in {0x40u8 .. 0x7Eu8, 0xA1 .. 0xFEu8}:
+        let p = (lead - 0x81) * 157 + uint16(iq[i]) - offset
+        template output_two(a, b: uint32) =
+          stream.append_codepoint a, oq, olen, n
+          stream.append_codepoint b, oq, olen, n
+        block no_continue:
+          case p
+          of 1133: output_two 0x00CA, 0x0304
+          of 1135: output_two 0x00CA, 0x030C
+          of 1164: output_two 0x00EA, 0x0304
+          of 1166: output_two 0x00EA, 0x030C
+          else: break no_continue
+          inc i
+          continue
+        if p < Big5Decode.len - Big5DecodeOffset:
+          let c = Big5Decode[p - Big5DecodeOffset]
+          if c != 0:
+            stream.append_codepoint c, oq, olen, n
+            inc i
+            continue
+      if cast[char](iq[i]) in Ascii:
+        stream.append_codepoint iq[i], oq, olen, n
+      else:
+        stream.handleError(oq, olen, n)
+        if stream.isend: break
+    elif cast[char](iq[i]) in Ascii:
+      stream.append_codepoint iq[i], oq, olen, n
+    elif iq[i] in 0x00u8 .. 0xFEu8:
+      stream.big5lead = iq[i]
+    else:
+      stream.handleError(oq, olen, n)
+      if stream.isend: break
+    inc i
+
+proc decodeEUCJP(stream: DecoderStream, iq: var seq[uint8],
+                 oq: ptr UncheckedArray[uint32], ilen, olen: int,
+                 n: var int) =
+  var jis0212 = stream.eucjpjis0212
+  var lead = stream.eucjplead
+  var i = 0
+  while i < ilen:
+    let b = iq[i]
+    if lead == 0x8E and b in 0xA1u8 .. 0xDFu8:
+      lead = 0
+      stream.append_codepoint iq[i], oq, olen, n
+    elif lead == 0x8F and b in 0xA1u8 .. 0xFEu8:
+      jis0212 = true
+      lead = b
+    elif lead != 0:
+      if lead in 0xA1u8 .. 0xFEu8 and b in 0xA1u8 .. 0xFEu8:
+        let p = (uint16(lead) - 0xA1) * 94 + uint16(b) - 0xA1
+        lead = 0
+        var c: uint16
+        if jis0212:
+          if p < Jis0212Decode.len:
+            c = Jis0212Decode[p]
+        else:
+          if p < Jis0208Decode.len:
+            c = Jis0208Decode[p]
+        jis0212 = false
+        if c != 0:
+          stream.append_codepoint c, oq, olen, n
+          inc i
+          continue
+      else:
+        lead = 0
+      stream.handleError(oq, olen, n)
+      if stream.isend: break
+    elif cast[char](b) in Ascii:
+      stream.append_codepoint b, oq, olen, n
+    elif b in {0x8Eu8, 0x8Fu8, 0xA1u8 .. 0xFEu8}:
+      lead = b
+    else:
+      stream.handleError(oq, olen, n)
+      if stream.isend: break
+    inc i
+  stream.eucjpjis0212 = jis0212
+  stream.eucjplead = lead
+
+proc decodeISO2022JP(stream: DecoderStream, iq: var seq[uint8],
+                     oq: ptr UncheckedArray[uint32], ilen, olen: int,
+                     n: var int) =
+  var i = 0
+  var lead = stream.iso2022jplead
+  var state = stream.iso2022jpstate
+  var output = stream.iso2022jpoutput
+  var outputstate = stream.iso2022jpoutputstate
+  var buf = stream.iso2022jpbuf
+  var hasbuf = stream.iso2022jphasbuf
+  while i < ilen:
+    let b = if hasbuf:
+      hasbuf = false
+      dec i
+      buf
+    else:
+      iq[i]
+    case state
+    of STATE_ASCII:
+      case b
+      of 0x1B: state = STATE_ESCAPE_START
+      of {0x00u8..0x7Fu8} - {0x0Eu8, 0x0Fu8, 0x1Bu8}:
+        output = false
+        stream.append_codepoint b, oq, olen, n
+      else:
+        output = false
+        stream.handleError(oq, olen, n)
+        if stream.isend: break
+    of STATE_ROMAN:
+      case b
+      of 0x1B: state = STATE_ESCAPE_START
+      of 0x5C:
+        output = false
+        stream.append_codepoint 0x00A5, oq, olen, n
+      of 0x7E:
+        output = false
+        stream.append_codepoint 0x203E, oq, olen, n
+      of {0x00u8..0x7Fu8} - {0x0Eu8, 0x0Fu8, 0x1Bu8, 0x5Cu8, 0x7Eu8}:
+        output = false
+        stream.append_codepoint b, oq, olen, n
+      else:
+        output = false
+        stream.handleError(oq, olen, n)
+        if stream.isend: break
+    of STATE_KATAKANA:
+      case b
+      of 0x1B: state = STATE_ESCAPE_START
+      of 0x21u8..0x5Fu8:
+        output = false
+        stream.append_codepoint 0xFF61u16 - 0x21 + uint16(b), oq, olen, n
+      else:
+        output = false
+        stream.handleError(oq, olen, n)
+        if stream.isend: break
+    of STATE_LEAD_BYTE:
+      case b
+      of 0x1B: state = STATE_ESCAPE_START
+      of 0x21u8..0x7Eu8:
+        output = false
+        lead = b
+        state = STATE_TRAIL_BYTE
+      else:
+        output = false
+        stream.handleError(oq, olen, n)
+        if stream.isend: break
+    of STATE_TRAIL_BYTE:
+      case b
+      of 0x1B:
+        state = STATE_ESCAPE_START
+        stream.handleError(oq, olen, n)
+        if stream.isend: break
+      of 0x21u8..0x7Eu8:
+        state = STATE_LEAD_BYTE
+        let p = (uint16(lead) - 0x21) * 94 + uint16(b) - 0x21
+        if p < Jis0208Decode.len:
+          let c = Jis0208Decode[p]
+          if c != 0:
+            stream.append_codepoint c, oq, olen, n
+          else:
+            stream.handleError(oq, olen, n)
+            if stream.isend: break
+      else:
+        state = STATE_LEAD_BYTE
+        stream.handleError(oq, olen, n)
+        if stream.isend: break
+    of STATE_ESCAPE_START:
+      if b == 0x24 or b == 0x28:
+        lead = b
+        state = STATE_ESCAPE
+      else:
+        output = false
+        state = outputstate
+        stream.handleError(oq, olen, n)
+        if stream.isend: break
+        continue # prepend (no inc i)
+    of STATE_ESCAPE:
+      let l = lead
+      lead = 0
+      block statenonnull:
+        var s: ISO2022JPState
+        if l == 0x28:
+          case b
+          of 0x42: s = STATE_ASCII
+          of 0x4A: s = STATE_ROMAN
+          of 0x49: s = STATE_KATAKANA
+          else: break statenonnull
+        elif l == 0x24 and b in {0x40u8, 0x42u8}:
+          s = STATE_LEAD_BYTE
+        else: break statenonnull
+        state = s
+        outputstate = s
+        if not output:
+          output = true
+          stream.handleError(oq, olen, n)
+          if stream.isend:
+            output = true
+            break
+        output = true
+        inc i
+        continue
+      output = false
+      state = outputstate
+      stream.handleError(oq, olen, n)
+      if stream.isend: break
+      continue # prepend (no inc i)
+    inc i
+  stream.iso2022jphasbuf = hasbuf
+  stream.iso2022jpbuf = buf
+  stream.iso2022jplead = lead
+  stream.iso2022jpstate = state
+  stream.iso2022jpoutput = output
+  stream.iso2022jpoutputstate = outputstate
+
+proc decodeShiftJIS(stream: DecoderStream, iq: var seq[uint8],
+                    oq: ptr UncheckedArray[uint32], ilen, olen: int,
+                    n: var int) =
+  var lead = stream.sjislead
+  var i = 0
+  while i < ilen:
+    let b = iq[i]
+    if lead != 0:
+      let l = lead
+      lead = 0
+      let offset = if b < 0x7Fu8: 0x40u16 else: 0x41u16
+      let leadoffset = if l < 0xA0: 0x81u16 else: 0xC1u16
+      if b in 0x40u8..0x7Eu8 or b in 0x80u8..0xFCu8:
+        let p = (uint16(l) - leadoffset) * 188 + uint16(b) - offset
+        if p in 8836u16..10715u16:
+          stream.append_codepoint 0xE000u16 - 8836 + p, oq, olen, n
+          inc i
+          continue
+        if p < Jis0208Decode.len and Jis0208Decode[p] != 0:
+          let c = Jis0208Decode[p]
+          stream.append_codepoint c, oq, olen, n
+          inc i
+          continue
+      if cast[char](b) in Ascii:
+        continue # prepend (no inc i)
+      else:
+        stream.handleError(oq, olen, n)
+        if stream.isend: break
+    elif cast[char](b) in Ascii or b == 0x80:
+      stream.append_codepoint b, oq, olen, n
+    elif b in 0xA1u8..0xDFu8:
+      stream.append_codepoint 0xFF61u16 - 0xA1 + uint16(b), oq, olen, n
+    elif b in {0x81..0x9F} + {0xE0..0xFC}:
+      lead = b
+    else:
+      stream.handleError(oq, olen, n)
+      if stream.isend: break
+    inc i
+  stream.sjislead = lead
+
+proc decodeEUCKR(stream: DecoderStream, iq: var seq[uint8],
+                 oq: ptr UncheckedArray[uint32], ilen, olen: int,
+                 n: var int) =
+  var lead = stream.euckrlead
+  var i = 0
+  while i < ilen:
+    let b = iq[i]
+    if lead != 0:
+      if b in 0x41u8..0xFEu8:
+        let p = (uint16(lead) - 0x81) * 190 + (uint16(b) - 0x41)
+        if p < EUCKRDecode.len and EUCKRDecode[p] != 0:
+          let c = EUCKRDecode[p]
+          stream.append_codepoint c, oq, olen, n
+          inc i
+          continue
+      stream.handleError(oq, olen, n)
+      if stream.isend: break
+    elif cast[char](b) in Ascii:
+      stream.append_codepoint b, oq, olen, n
+    elif b in {0x81u8..0xFEu8}:
+      lead = b
+    else:
+      stream.handleError(oq, olen, n)
+      if stream.isend: break
+    inc i
+  stream.euckrlead = lead
+
+proc decodeUTF16(stream: DecoderStream, iq: var seq[uint8],
+                 oq: ptr UncheckedArray[uint32], ilen, olen: int,
+                 n: var int, be: static bool) =
+  var i = 0
+  var lead = stream.u16lead
+  var haslead = stream.u16haslead
+  var surr = stream.u16surr
+  var hassurr = stream.u16hassurr
+  while i < ilen:
+    if not haslead:
+      haslead = true
+      lead = iq[i]
+    else:
+      let cu = if be:
+        (uint16(lead) shl 8) + uint16(iq[i])
+      else:
+        (uint16(iq[i]) shl 8) + uint16(lead)
+      haslead = false
+      if hassurr:
+        hassurr = false
+        if cu in 0xDC00u16 .. 0xDFFFu16:
+          let c = 0x10000 + ((uint32(surr) - 0xD800) shl 10) + (uint32(cu) - 0xDC00)
+          stream.append_codepoint c, oq, olen, n
+          inc i
+          continue
+        haslead = true # prepend the last two bytes
+        stream.handleError(oq, olen, n)
+        continue
+      if cu in 0xD800u16 .. 0xDBFFu16:
+        surr = cu
+        hassurr = true
+        inc i
+        continue
+      elif cu in 0xDC00u16 .. 0xDFFFu16:
+        stream.handleError(oq, olen, n)
+        if stream.isend: # fatal error
+          break
+        else:
+          inc i
+          continue
+      stream.append_codepoint uint32(cu), oq, olen, n
+    inc i
+  stream.u16lead = lead
+  stream.u16haslead = haslead
+  stream.u16surr = surr
+  stream.u16hassurr = hassurr
+
+proc decodeUTF16LE(stream: DecoderStream, iq: var seq[uint8],
+                   oq: ptr UncheckedArray[uint32], ilen, olen: int,
+                   n: var int) =
+  stream.decodeUTF16(iq, oq, ilen, olen, n, false)
+
+proc decodeUTF16BE(stream: DecoderStream, iq: var seq[uint8],
+                   oq: ptr UncheckedArray[uint32], ilen, olen: int,
+                   n: var int) =
+  stream.decodeUTF16(iq, oq, ilen, olen, n, true)
+
+proc decodeXUserDefined(stream: DecoderStream, iq: var seq[uint8],
+                        oq: ptr UncheckedArray[uint32], ilen, olen: int,
+                        n: var int) =
+  for i in 0 ..< ilen:
+    let c = cast[char](iq[i])
+    if c in Ascii:
+      stream.append_codepoint c, oq, olen, n
+    else:
+      let c = 0xF780 + cast[uint32](c) - 0x80
+      stream.append_codepoint c, oq, olen, n
+
+proc decodeSingleByte(stream: DecoderStream, iq: var seq[uint8],
+                      oq: ptr UncheckedArray[uint32], ilen, olen: int,
+                      n: var int, map: array[char, uint16]) =
+  for i in 0 ..< ilen:
+    let c = cast[char](iq[i])
+    if c in Ascii:
+      stream.append_codepoint c, oq, olen, n
+    else:
+      let p = map[c]
+      if p == 0u16:
+        stream.handleError(oq, olen, n)
+      else:
+        stream.append_codepoint cast[uint32](oq), oq, olen, n
+
+proc decodeReplacement(stream: DecoderStream, oq: ptr UncheckedArray[uint32], olen: int, n: var int) =
+  if not stream.replreported:
+    stream.replreported = true
+    stream.handleError(oq, olen, n)
+  # I think that's it?
+
+# copy any data remaining from previous passes
+proc copyBuffers(stream: DecoderStream, oq: ptr UncheckedArray[uint32], olen: int): int =
+  if stream.bufs.len == 1:
+    # one page: stream.bs ..< stream.bi
+    let n = min((stream.bi - stream.bs) * sizeof(stream.bufs[0][0]), olen)
+    copyMem(addr oq[0], addr stream.bufs[0][stream.bs], n)
+    stream.bs += n div sizeof(uint32)
+    if stream.bs >= stream.bi:
+      # read entire page; recycle it
+      stream.bs = 0
+      stream.bi = 0
+    return n
+  else:
+    # multiple pages:
+    # stream.bs ..< stream.buflen
+    # 0 ..< stream.buflen
+    # ...
+    # 0 ..< stream.bi
+    let a = (stream.buflen - stream.bs) * sizeof(stream.bufs[0][0])
+    if a < olen:
+      copyMem(addr oq[0], addr stream.bufs[0][stream.bs], a)
+      var ns = a
+      stream.bs = 0
+      var i = 1
+      while i < stream.bufs.high:
+        let n = min(stream.buflen * sizeof(stream.bufs[0][0]), olen - ns)
+        copyMem(addr oq[ns div sizeof(uint32)], addr stream.bufs[i][0], n)
+        ns += n
+        if ns >= olen:
+          # i'th buffer still has contents.
+          stream.bs = n div sizeof(uint32)
+          break
+        stream.bs = 0
+        inc i
+      if ns < olen:
+        # last page
+        let n = min(stream.bi * sizeof(stream.bufs[0][0]), olen - ns)
+        copyMem(addr oq[ns div sizeof(uint32)], addr stream.bufs[i][0], n)
+        ns += n
+        stream.bs = n div sizeof(uint32)
+        if stream.bs >= stream.bi:
+          # read entire page; recycle it
+          stream.bs = 0
+          stream.bi = 0
+      for j in i ..< stream.bufs.len:
+        stream.bufs[j - i] = stream.bufs[j]
+      stream.bufs.setLen(stream.bufs.len - i)
+      return ns
+    elif a > olen:
+      copyMem(addr oq[0], addr stream.bufs[0][stream.bs], olen)
+      stream.bs += olen div sizeof(uint32)
+      assert stream.bs < stream.buflen
+      return olen
+    else: # a == olen
+      copyMem(addr oq[0], addr stream.bufs[0][stream.bs], a)
+      stream.bs = 0
+      stream.bufs.delete(0)
+      return a
+
+proc checkEnd(stream: DecoderStream, oq: ptr UncheckedArray[uint32], olen: int,
+              n: var int) =
+  if not stream.isend and stream.bufs.len == 1 and
+      stream.bs >= stream.bi and stream.source.atEnd:
+    stream.isend = true
+    case stream.charset
+    of CHARSET_UTF_16_LE, CHARSET_UTF_16_BE:
+      if stream.u16haslead or stream.u16hassurr:
+        stream.handleError(oq, olen, n)
+    of CHARSET_UTF_8:
+      if stream.u8needed != 0:
+        stream.handleError(oq, olen, n)
+    of CHARSET_GB18030, CHARSET_GBK:
+      if stream.gb18first != 0 or stream.gb18second != 0 or stream.gb18third != 0:
+        stream.handleError(oq, olen, n)
+    of CHARSET_BIG5:
+      if stream.big5lead != 0:
+        stream.handleError(oq, olen, n)
+    of CHARSET_EUC_JP:
+      if stream.eucjplead != 0:
+        stream.handleError(oq, olen, n)
+    of CHARSET_ISO_2022_JP:
+      case stream.iso2022jpstate
+      of STATE_ASCII, STATE_ROMAN, STATE_KATAKANA, STATE_LEAD_BYTE: discard
+      of STATE_TRAIL_BYTE:
+        stream.handleError(oq, olen, n)
+      of STATE_ESCAPE_START:
+        stream.handleError(oq, olen, n)
+      of STATE_ESCAPE:
+        stream.isend = false
+        stream.iso2022jpbuf = stream.iso2022jplead
+        stream.iso2022jphasbuf = true
+        stream.iso2022jplead = 0
+        stream.iso2022jpoutput = false
+        stream.iso2022jpstate = stream.iso2022jpoutputstate
+        stream.handleError(oq, olen, n)
+    of CHARSET_SHIFT_JIS:
+      if stream.sjislead != 0:
+        stream.handleError(oq, olen, n)
+    of CHARSET_EUC_KR:
+      if stream.euckrlead != 0:
+        stream.handleError(oq, olen, n)
+    else: discard
+
+proc prepend*(stream: DecoderStream, c: uint32) =
+  append_codepoint_buf stream, c
+
+const ReadSize = 4096
+proc readData*(stream: DecoderStream, buffer: pointer, olen: int): int =
+  const l = sizeof(stream.bufs[0][0]) 
+  assert olen mod l == 0, "Buffer size must be divisible by " & $l
+  if olen == 0: return
+  let oq = cast[ptr UncheckedArray[uint32]](buffer)
+  result = stream.copyBuffers(oq, olen)
+  let olen = olen - result
+  if olen == 0:
+    stream.checkEnd(oq, olen, result)
+    return result # output filled with buffered data; nothing to decode.
+  var iq = newSeqUninitialized[uint8](ReadSize)
+  let ilen = stream.source.readData(cast[pointer](addr iq[0]), ReadSize)
+  case stream.charset
+  of CHARSET_UTF_8: stream.decodeUTF8(iq, oq, ilen, olen, result)
+  of CHARSET_IBM866: stream.decodeSingleByte(iq, oq, ilen, olen, result, IBM866Decode)
+  of CHARSET_ISO_8859_2: stream.decodeSingleByte(iq, oq, ilen, olen, result, ISO88592Decode)
+  of CHARSET_ISO_8859_3: stream.decodeSingleByte(iq, oq, ilen, olen, result, ISO88593Decode)
+  of CHARSET_ISO_8859_4: stream.decodeSingleByte(iq, oq, ilen, olen, result, ISO88594Decode)
+  of CHARSET_ISO_8859_5: stream.decodeSingleByte(iq, oq, ilen, olen, result, ISO88595Decode)
+  of CHARSET_ISO_8859_6: stream.decodeSingleByte(iq, oq, ilen, olen, result, ISO88596Decode)
+  of CHARSET_ISO_8859_7: stream.decodeSingleByte(iq, oq, ilen, olen, result, ISO88597Decode)
+  of CHARSET_ISO_8859_8,
+     CHARSET_ISO_8859_8_I: stream.decodeSingleByte(iq, oq, ilen, olen, result, ISO88598Decode)
+  of CHARSET_ISO_8859_10: stream.decodeSingleByte(iq, oq, ilen, olen, result, ISO885910Decode)
+  of CHARSET_ISO_8859_13: stream.decodeSingleByte(iq, oq, ilen, olen, result, ISO885913Decode)
+  of CHARSET_ISO_8859_14: stream.decodeSingleByte(iq, oq, ilen, olen, result, ISO885914Decode)
+  of CHARSET_ISO_8859_15: stream.decodeSingleByte(iq, oq, ilen, olen, result, ISO885915Decode)
+  of CHARSET_ISO_8859_16: stream.decodeSingleByte(iq, oq, ilen, olen, result, ISO885916Decode)
+  of CHARSET_KOI8_R: stream.decodeSingleByte(iq, oq, ilen, olen, result, KOI8RDecode)
+  of CHARSET_KOI8_U: stream.decodeSingleByte(iq, oq, ilen, olen, result, KOI8UDecode)
+  of CHARSET_MACINTOSH: stream.decodeSingleByte(iq, oq, ilen, olen, result, MacintoshDecode)
+  of CHARSET_WINDOWS_874: stream.decodeSingleByte(iq, oq, ilen, olen, result, Windows874Decode)
+  of CHARSET_WINDOWS_1250: stream.decodeSingleByte(iq, oq, ilen, olen, result, Windows1250Decode)
+  of CHARSET_WINDOWS_1251: stream.decodeSingleByte(iq, oq, ilen, olen, result, Windows1251Decode)
+  of CHARSET_WINDOWS_1252: stream.decodeSingleByte(iq, oq, ilen, olen, result, Windows1252Decode)
+  of CHARSET_WINDOWS_1253: stream.decodeSingleByte(iq, oq, ilen, olen, result, Windows1253Decode)
+  of CHARSET_WINDOWS_1254: stream.decodeSingleByte(iq, oq, ilen, olen, result, Windows1254Decode)
+  of CHARSET_WINDOWS_1255: stream.decodeSingleByte(iq, oq, ilen, olen, result, Windows1255Decode)
+  of CHARSET_WINDOWS_1256: stream.decodeSingleByte(iq, oq, ilen, olen, result, Windows1256Decode)
+  of CHARSET_WINDOWS_1257: stream.decodeSingleByte(iq, oq, ilen, olen, result, Windows1257Decode)
+  of CHARSET_WINDOWS_1258: stream.decodeSingleByte(iq, oq, ilen, olen, result, Windows1258Decode)
+  of CHARSET_X_MAC_CYRILLIC: stream.decodeSingleByte(iq, oq, ilen, olen, result, XMacCyrillicDecode)
+  of CHARSET_GBK, CHARSET_GB18030: stream.decodeGb18030(iq, oq, ilen, olen, result)
+  of CHARSET_BIG5: stream.decodeBig5(iq, oq, ilen, olen, result)
+  of CHARSET_EUC_JP: stream.decodeEUCJP(iq, oq, ilen, olen, result)
+  of CHARSET_ISO_2022_JP: stream.decodeISO2022JP(iq, oq, ilen, olen, result)
+  of CHARSET_SHIFT_JIS: stream.decodeShiftJIS(iq, oq, ilen, olen, result)
+  of CHARSET_EUC_KR: stream.decodeEUCKR(iq, oq, ilen, olen, result)
+  of CHARSET_REPLACEMENT: stream.decodeReplacement(oq, olen, result)
+  of CHARSET_UTF_16_LE: stream.decodeUTF16LE(iq, oq, ilen, olen, result)
+  of CHARSET_UTF_16_BE: stream.decodeUTF16BE(iq, oq, ilen, olen, result)
+  of CHARSET_X_USER_DEFINED: stream.decodeXUserDefined(iq, oq, ilen, olen, result)
+  of CHARSET_UNKNOWN: assert false, "Somebody forgot to set the character set here"
+  stream.checkEnd(oq, olen, result)
+
+proc readRunes*(stream: DecoderStream, olen: int): seq[Rune] =
+  when nimvm:
+    let s = stream.source.readStr(olen)
+    result = s.toRunes()
+    if stream.source.atEnd:
+      stream.isend = true
+  else:
+    assert false
+
+proc atEnd*(stream: DecoderStream): bool =
+  return stream.isend
+
+proc newDecoderStream*(source: Stream, cs = CHARSET_UTF_8, buflen = 1024,
+                       errormode = DECODER_ERROR_MODE_REPLACEMENT): DecoderStream =
+  result = DecoderStream(
+    source: source,
+    charset: cs,
+    buflen: buflen,
+    errormode: errormode
+  )
+  when nimvm:
+    result.bufs = @[newSeq[uint32](buflen)]
+  else:
+    result.bufs = @[newSeqUninitialized[uint32](buflen)]
+  case cs
+  of CHARSET_UTF_8:
+    result.u8bounds = 0x80u8 .. 0xBFu8
+  else: discard