import algorithm
import streams
import unicode
import data/charset
import utils/twtstr
# DecoderStream decodes any encoding to valid utf-32.
type
DecoderErrorMode* = enum
DECODER_ERROR_MODE_FATAL, DECODER_ERROR_MODE_REPLACEMENT,
DECODER_ERROR_MODE_HTML
ISO2022JPState = enum
STATE_ASCII, STATE_ROMAN, STATE_KATAKANA, STATE_LEAD_BYTE,
STATE_TRAIL_BYTE, STATE_ESCAPE_START, STATE_ESCAPE
DecoderStream* = ref object
source: Stream
errormode: DecoderErrorMode
isend: bool
failed*: bool
bufs: seq[seq[uint32]]
bs: int
bi: int
buflen: int
c: uint32
case charset: Charset
of CHARSET_UTF_8:
u8needed: int
u8seen: int
u8bounds: Slice[uint8]
of CHARSET_GBK, CHARSET_GB18030:
gb18first: uint8
gb18second: uint8
gb18third: uint8
gb18buf: uint8
gb18hasbuf: bool
of CHARSET_BIG5:
big5lead: uint8
of CHARSET_EUC_JP:
eucjplead: uint8
eucjpjis0212: bool
of CHARSET_ISO_2022_JP:
iso2022jplead: uint8
iso2022jpstate: ISO2022JPState
iso2022jpoutputstate: ISO2022JPState
iso2022jpoutput: bool
iso2022jpbuf: uint8
iso2022jphasbuf: bool
of CHARSET_SHIFT_JIS:
sjislead: uint8
of CHARSET_EUC_KR:
euckrlead: uint8
of CHARSET_UTF_16_BE, CHARSET_UTF_16_LE:
u16lead: uint8
u16surr: uint16
u16haslead: bool
u16hassurr: bool
of CHARSET_REPLACEMENT:
replreported: bool
else: discard
template append_codepoint_buf(stream: DecoderStream, c: uint32) =
if stream.bi >= stream.buflen:
stream.bufs.add(newSeqUninitialized[uint32](stream.buflen))
stream.bi = 0
stream.bufs[^1][stream.bi] = c
inc stream.bi
template append_codepoint(stream: DecoderStream, c: uint32, oq: ptr UncheckedArray[uint32], olen: int, n: var int) =
if n < olen:
oq[n div sizeof(uint32)] = c
n += sizeof(uint32)
else:
append_codepoint_buf stream, c
template append_codepoint(stream: DecoderStream, c: char, oq: ptr UncheckedArray[uint32], olen: int, n: var int) =
stream.append_codepoint cast[uint32](c), oq, olen, n
proc handleError(stream: DecoderStream, oq: ptr UncheckedArray[uint32], olen: int, n: var int) =
case stream.errormode
of DECODER_ERROR_MODE_FATAL:
stream.isend = true
stream.failed = true
of DECODER_ERROR_MODE_HTML:
if stream.charset == CHARSET_UTF_8:
# "html" mode is handled as "replacement" for utf-8.
stream.append_codepoint 0xFFFD, oq, olen, n
else:
stream.append_codepoint '&', oq, olen, n
stream.append_codepoint '#', oq, olen, n
while stream.c > 0:
stream.append_codepoint cast[char](0x30 + stream.c mod 10), oq, olen, n
stream.c = stream.c div 10
stream.append_codepoint ';', oq, olen, n
of DECODER_ERROR_MODE_REPLACEMENT:
stream.append_codepoint 0xFFFD, oq, olen, n
proc decodeUTF8(stream: DecoderStream, iq: var seq[uint8], oq: ptr UncheckedArray[uint32], ilen, olen: int, n: var int) =
var c = stream.c
var needed = stream.u8needed
var seen = stream.u8seen
var bounds = stream.u8bounds
var i = 0
while i < ilen:
let b = iq[i]
if needed == 0:
case b
of 0x00u8 .. 0x7Fu8:
stream.append_codepoint uint32(b), oq, olen, n
of 0xC2u8 .. 0xDFu8:
needed = 1
c = cast[uint32](b) and 0x1F
of 0xE0u8:
bounds.a = 0xA0
needed = 2
c = cast[uint32](b) and 0xF
of 0xEDu8:
bounds.b = 0x9F
needed = 2
c = cast[uint32](b) and 0xF
of 0xE1u8 .. 0xECu8, 0xEEu8 .. 0xEFu8:
needed = 2
c = cast[uint32](b) and 0xF
of 0xF0u8:
bounds.a = 0x90
needed = 3
c = cast[uint32](b) and 0x7
of 0xF4u8:
bounds.b = 0x8F
needed = 3
c = cast[uint32](b) and 0x7
of 0xF1u8 .. 0xF3u8:
needed = 3
c = cast[uint32](b) and 0x7
else:
stream.handleError(oq, olen, n)
if stream.isend: # fatal error
break
inc i
continue
if b notin bounds:
c = 0
needed = 0
seen = 0
bounds = 0x80u8 .. 0xBFu8
stream.handleError(oq, olen, n)
continue # prepend (no inc i)
bounds = 0x80u8 .. 0xBFu8
c = (c shl 6) or (b and 0x3F)
inc seen
if seen == needed:
stream.append_codepoint c, oq, olen, n
c = 0
needed = 0
seen = 0
inc i
stream.c = c
stream.u8bounds = bounds
stream.u8seen = seen
stream.u8needed = needed
proc gb18RangesCodepoint(p: uint32): uint32 =
if p > 39419 and p < 189000 or p > 1237575:
return high(uint32) # null
if p == 7457:
return 0xE7C7
# Let offset be the last pointer in index gb18030 ranges that is less than or
# equal to pointer and code point offset its corresponding code point.
var offset: uint32
var c: uint32
if p >= 189000:
# omitted from the map for storage efficiency
offset = 189000
c = 0x10000
elif p >= 39394:
# Needed because upperBound returns the first element greater than pointer
# OR last on failure, so we can't just remove one if p is e.g. 39400.
offset = 39394
c = 0xFFE6
else:
# Find the first range that is greater than p, or last if no such element
# is found.
# We want the last that is <=, so decrease index by one.
let i = upperBound(Gb18030RangesDecode, p, func(a: tuple[p, ucs: uint16], b: uint32): int =
cmp(cast[uint32](a.p), b))
let elem = Gb18030RangesDecode[i - 1]
offset = elem.p
c = elem.ucs
return c + p - offset
proc decodeGb18030(stream: DecoderStream, iq: var seq[uint8],
oq: ptr UncheckedArray[uint32], ilen, olen: int,
n: var int) =
var first = stream.gb18first
var second = stream.gb18second
var third = stream.gb18third
var buf = stream.gb18buf
var hasbuf = stream.gb18hasbuf
var i = 0
while i < ilen:
let b = if hasbuf:
hasbuf = false
dec i
buf
else:
iq[i]
if third != 0:
if b notin 0x30u8 .. 0x39u8:
hasbuf = true
buf = second
first = third
first = 0
second = 0
third = 0
stream.handleError(oq, olen, n)
if stream.isend: break
continue # prepend (no inc i)
else:
let p = ((uint32(first) - 0x81) * 10 * 126 * 10) +
((uint32(second) - 0x30) * (10 * 126)) +
((uint32(third) - 0x81) * 10) + uint32(b) - 0x30
let c = gb18RangesCodepoint(p)
first = 0
second = 0
third = 0
if c == high(uint32): # null
stream.handleError(oq, olen, n)
if stream.isend: break
else:
stream.append_codepoint c, oq, olen, n
elif second != 0:
if b in 0x81u8 .. 0xFEu8:
third = b
else:
hasbuf = true
buf = second
first = 0
second = 0
third = 0
stream.handleError(oq, olen, n)
if stream.isend: break
elif first != 0:
if b in 0x30u8 .. 0x39u8:
second = b
else:
let ff = first
first = 0
if b in 0x40u8 .. 0x7Eu8:
let offset = if b < 0x7F: 0x40u32 else: 0x41u32
let p = (uint16(ff) - 0x81) * 190 + (uint16(b) - offset)
if p < Gb18030Decode.len:
let c = Gb18030Decode[cast[uint16](p)]
stream.append_codepoint uint32(c), oq, olen, n
inc i
continue
if cast[char](b) in Ascii:
continue # prepend (no inc i)
else:
stream.handleError(oq, olen, n)
if stream.isend: break
elif cast[char](b) in Ascii:
stream.append_codepoint b, oq, olen, n
elif b == 0x80:
stream.append_codepoint 0x20AC, oq, olen, n
elif b in 0x81u8 .. 0xFEu8:
first = b
else:
stream.handleError(oq, olen, n)
if stream.isend: break
inc i
stream.gb18first = first
stream.gb18second = second
stream.gb18third = third
stream.gb18buf = buf
stream.gb18hasbuf = hasbuf
proc decodeBig5(stream: DecoderStream, iq: var seq[uint8],
oq: ptr UncheckedArray[uint32], ilen, olen: int,
n: var int) =
var i = 0
while i < ilen:
if stream.big5lead != 0:
let lead = uint32(stream.big5lead)
stream.big5lead = 0
let offset = if iq[i] < 0x7F: 0x40u16 else: 0x7E
if iq[i] in {0x40u8 .. 0x7Eu8, 0xA1 .. 0xFEu8}:
let p = (lead - 0x81) * 157 + uint16(iq[i]) - offset
template output_two(a, b: uint32) =
stream.append_codepoint a, oq, olen, n
stream.append_codepoint b, oq, olen, n
block no_continue:
case p
of 1133: output_two 0x00CA, 0x0304
of 1135: output_two 0x00CA, 0x030C
of 1164: output_two 0x00EA, 0x0304
of 1166: output_two 0x00EA, 0x030C
else: break no_continue
inc i
continue
if p < Big5Decode.len - Big5DecodeOffset:
let c = Big5Decode[p - Big5DecodeOffset]
if c != 0:
stream.append_codepoint c, oq, olen, n
inc i
continue
if cast[char](iq[i]) in Ascii:
stream.append_codepoint iq[i], oq, olen, n
else:
stream.handleError(oq, olen, n)
if stream.isend: break
elif cast[char](iq[i]) in Ascii:
stream.append_codepoint iq[i], oq, olen, n
elif iq[i] in 0x00u8 .. 0xFEu8:
stream.big5lead = iq[i]
else:
stream.handleError(oq, olen, n)
if stream.isend: break
inc i
proc decodeEUCJP(stream: DecoderStream, iq: var seq[uint8],
oq: ptr UncheckedArray[uint32], ilen, olen: int,
n: var int) =
var jis0212 = stream.eucjpjis0212
var lead = stream.eucjplead
var i = 0
while i < ilen:
let b = iq[i]
if lead == 0x8E and b in 0xA1u8 .. 0xDFu8:
lead = 0
stream.append_codepoint iq[i], oq, olen, n
elif lead == 0x8F and b in 0xA1u8 .. 0xFEu8:
jis0212 = true
lead = b
elif lead != 0:
if lead in 0xA1u8 .. 0xFEu8 and b in 0xA1u8 .. 0xFEu8:
let p = (uint16(lead) - 0xA1) * 94 + uint16(b) - 0xA1
lead = 0
var c: uint16
if jis0212:
if p < Jis0212Decode.len:
c = Jis0212Decode[p]
else:
if p < Jis0208Decode.len:
c = Jis0208Decode[p]
jis0212 = false
if c != 0:
stream.append_codepoint c, oq, olen, n
inc i
continue
else:
lead = 0
stream.handleError(oq, olen, n)
if stream.isend: break
elif cast[char](b) in Ascii:
stream.append_codepoint b, oq, olen, n
elif b in {0x8Eu8, 0x8Fu8, 0xA1u8 .. 0xFEu8}:
lead = b
else:
stream.handleError(oq, olen, n)
if stream.isend: break
inc i
stream.eucjpjis0212 = jis0212
stream.eucjplead = lead
proc decodeISO2022JP(stream: DecoderStream, iq: var seq[uint8],
oq: ptr UncheckedArray[uint32], ilen, olen: int,
n: var int) =
var i = 0
var lead = stream.iso2022jplead
var state = stream.iso2022jpstate
var output = stream.iso2022jpoutput
var outputstate = stream.iso2022jpoutputstate
var buf = stream.iso2022jpbuf
var hasbuf = stream.iso2022jphasbuf
while i < ilen:
let b = if hasbuf:
hasbuf = false
dec i
buf
else:
iq[i]
case state
of STATE_ASCII:
case b
of 0x1B: state = STATE_ESCAPE_START
of {0x00u8..0x7Fu8} - {0x0Eu8, 0x0Fu8, 0x1Bu8}:
output = false
stream.append_codepoint b, oq, olen, n
else:
output = false
stream.handleError(oq, olen, n)
if stream.isend: break
of STATE_ROMAN:
case b
of 0x1B: state = STATE_ESCAPE_START
of 0x5C:
output = false
stream.append_codepoint 0x00A5, oq, olen, n
of 0x7E:
output = false
stream.append_codepoint 0x203E, oq, olen, n
of {0x00u8..0x7Fu8} - {0x0Eu8, 0x0Fu8, 0x1Bu8, 0x5Cu8, 0x7Eu8}:
output = false
stream.append_codepoint b, oq, olen, n
else:
output = false
stream.handleError(oq, olen, n)
if stream.isend: break
of STATE_KATAKANA:
case b
of 0x1B: state = STATE_ESCAPE_START
of 0x21u8..0x5Fu8:
output = false
stream.append_codepoint 0xFF61u16 - 0x21 + uint16(b), oq, olen, n
else:
output = false
stream.handleError(oq, olen, n)
if stream.isend: break
of STATE_LEAD_BYTE:
case b
of 0x1B: state = STATE_ESCAPE_START
of 0x21u8..0x7Eu8:
output = false
lead = b
state = STATE_TRAIL_BYTE
else:
output = false
stream.handleError(oq, olen, n)
if stream.isend: break
of STATE_TRAIL_BYTE:
case b
of 0x1B:
state = STATE_ESCAPE_START
stream.handleError(oq, olen, n)
if stream.isend: break
of 0x21u8..0x7Eu8:
state = STATE_LEAD_BYTE
let p = (uint16(lead) - 0x21) * 94 + uint16(b) - 0x21
if p < Jis0208Decode.len:
let c = Jis0208Decode[p]
if c != 0:
stream.append_codepoint c, oq, olen, n
else:
stream.handleError(oq, olen, n)
if stream.isend: break
else:
state = STATE_LEAD_BYTE
stream.handleError(oq, olen, n)
if stream.isend: break
of STATE_ESCAPE_START:
if b == 0x24 or b == 0x28:
lead = b
state = STATE_ESCAPE
else:
output = false
state = outputstate
stream.handleError(oq, olen, n)
if stream.isend: break
continue # prepend (no inc i)
of STATE_ESCAPE:
let l = lead
lead = 0
block statenonnull:
var s: ISO2022JPState
if l == 0x28:
case b
of 0x42: s = STATE_ASCII
of 0x4A: s = STATE_ROMAN
of 0x49: s = STATE_KATAKANA
else: break statenonnull
elif l == 0x24 and b in {0x40u8, 0x42u8}:
s = STATE_LEAD_BYTE
else: break statenonnull
state = s
outputstate = s
if not output:
output = true
stream.handleError(oq, olen, n)
if stream.isend:
output = true
break
output = true
inc i
continue
output = false
state = outputstate
stream.handleError(oq, olen, n)
if stream.isend: break
continue # prepend (no inc i)
inc i
stream.iso2022jphasbuf = hasbuf
stream.iso2022jpbuf = buf
stream.iso2022jplead = lead
stream.iso2022jpstate = state
stream.iso2022jpoutput = output
stream.iso2022jpoutputstate = outputstate
proc decodeShiftJIS(stream: DecoderStream, iq: var seq[uint8],
oq: ptr UncheckedArray[uint32], ilen, olen: int,
n: var int) =
var lead = stream.sjislead
var i = 0
while i < ilen:
let b = iq[i]
if lead != 0:
let l = lead
lead = 0
let offset = if b < 0x7Fu8: 0x40u16 else: 0x41u16
let leadoffset = if l < 0xA0: 0x81u16 else: 0xC1u16
if b in 0x40u8..0x7Eu8 or b in 0x80u8..0xFCu8:
let p = (uint16(l) - leadoffset) * 188 + uint16(b) - offset
if p in 8836u16..10715u16:
stream.append_codepoint 0xE000u16 - 8836 + p, oq, olen, n
inc i
continue
if p < Jis0208Decode.len and Jis0208Decode[p] != 0:
let c = Jis0208Decode[p]
stream.append_codepoint c, oq, olen, n
inc i
continue
if cast[char](b) in Ascii:
continue # prepend (no inc i)
else:
stream.handleError(oq, olen, n)
if stream.isend: break
elif cast[char](b) in Ascii or b == 0x80:
stream.append_codepoint b, oq, olen, n
elif b in 0xA1u8..0xDFu8:
stream.append_codepoint 0xFF61u16 - 0xA1 + uint16(b), oq, olen, n
elif b in {0x81..0x9F} + {0xE0..0xFC}:
lead = b
else:
stream.handleError(oq, olen, n)
if stream.isend: break
inc i
stream.sjislead = lead
proc decodeEUCKR(stream: DecoderStream, iq: var seq[uint8],
oq: ptr UncheckedArray[uint32], ilen, olen: int,
n: var int) =
var lead = stream.euckrlead
var i = 0
while i < ilen:
let b = iq[i]
if lead != 0:
if b in 0x41u8..0xFEu8:
let p = (uint16(lead) - 0x81) * 190 + (uint16(b) - 0x41)
if p < EUCKRDecode.len and EUCKRDecode[p] != 0:
let c = EUCKRDecode[p]
stream.append_codepoint c, oq, olen, n
inc i
continue
stream.handleError(oq, olen, n)
if stream.isend: break
elif cast[char](b) in Ascii:
stream.append_codepoint b, oq, olen, n
elif b in {0x81u8..0xFEu8}:
lead = b
else:
stream.handleError(oq, olen, n)
if stream.isend: break
inc i
stream.euckrlead = lead
proc decodeUTF16(stream: DecoderStream, iq: var seq[uint8],
oq: ptr UncheckedArray[uint32], ilen, olen: int,
n: var int, be: static bool) =
var i = 0
var lead = stream.u16lead
var haslead = stream.u16haslead
var surr = stream.u16surr
var hassurr = stream.u16hassurr
while i < ilen:
if not haslead:
haslead = true
lead = iq[i]
else:
let cu = if be:
(uint16(lead) shl 8) + uint16(iq[i])
else:
(uint16(iq[i]) shl 8) + uint16(lead)
haslead = false
if hassurr:
hassurr = false
if cu in 0xDC00u16 .. 0xDFFFu16:
let c = 0x10000 + ((uint32(surr) - 0xD800) shl 10) + (uint32(cu) - 0xDC00)
stream.append_codepoint c, oq, olen, n
inc i
continue
haslead = true # prepend the last two bytes
stream.handleError(oq, olen, n)
continue
if cu in 0xD800u16 .. 0xDBFFu16:
surr = cu
hassurr = true
inc i
continue
elif cu in 0xDC00u16 .. 0xDFFFu16:
stream.handleError(oq, olen, n)
if stream.isend: # fatal error
break
else:
inc i
continue
stream.append_codepoint uint32(cu), oq, olen, n
inc i
stream.u16lead = lead
stream.u16haslead = haslead
stream.u16surr = surr
stream.u16hassurr = hassurr
proc decodeUTF16LE(stream: DecoderStream, iq: var seq[uint8],
oq: ptr UncheckedArray[uint32], ilen, olen: int,
n: var int) =
stream.decodeUTF16(iq, oq, ilen, olen, n, false)
proc decodeUTF16BE(stream: DecoderStream, iq: var seq[uint8],
oq: ptr UncheckedArray[uint32], ilen, olen: int,
n: var int) =
stream.decodeUTF16(iq, oq, ilen, olen, n, true)
proc decodeXUserDefined(stream: DecoderStream, iq: var seq[uint8],
oq: ptr UncheckedArray[uint32], ilen, olen: int,
n: var int) =
for i in 0 ..< ilen:
let c = cast[char](iq[i])
if c in Ascii:
stream.append_codepoint c, oq, olen, n
else:
let c = 0xF780 + cast[uint32](c) - 0x80
stream.append_codepoint c, oq, olen, n
proc decodeSingleByte(stream: DecoderStream, iq: var seq[uint8],
oq: ptr UncheckedArray[uint32], ilen, olen: int,
n: var int, map: array[char, uint16]) =
for i in 0 ..< ilen:
let c = cast[char](iq[i])
if c in Ascii:
stream.append_codepoint c, oq, olen, n
else:
let p = map[c]
if p == 0u16:
stream.handleError(oq, olen, n)
else:
stream.append_codepoint cast[uint32](oq), oq, olen, n
proc decodeReplacement(stream: DecoderStream, oq: ptr UncheckedArray[uint32], olen: int, n: var int) =
if not stream.replreported:
stream.replreported = true
stream.handleError(oq, olen, n)
# I think that's it?
# copy any data remaining from previous passes
proc copyBuffers(stream: DecoderStream, oq: ptr UncheckedArray[uint32], olen: int): int =
if stream.bufs.len == 1:
# one page: stream.bs ..< stream.bi
let n = min((stream.bi - stream.bs) * sizeof(stream.bufs[0][0]), olen)
copyMem(addr oq[0], addr stream.bufs[0][stream.bs], n)
stream.bs += n div sizeof(uint32)
if stream.bs >= stream.bi:
# read entire page; recycle it
stream.bs = 0
stream.bi = 0
return n
else:
# multiple pages:
# stream.bs ..< stream.buflen
# 0 ..< stream.buflen
# ...
# 0 ..< stream.bi
let a = (stream.buflen - stream.bs) * sizeof(stream.bufs[0][0])
if a < olen:
copyMem(addr oq[0], addr stream.bufs[0][stream.bs], a)
var ns = a
stream.bs = 0
var i = 1
while i < stream.bufs.high:
let n = min(stream.buflen * sizeof(stream.bufs[0][0]), olen - ns)
copyMem(addr oq[ns div sizeof(uint32)], addr stream.bufs[i][0], n)
ns += n
if ns >= olen:
# i'th buffer still has contents.
stream.bs = n div sizeof(uint32)
break
stream.bs = 0
inc i
if ns < olen:
# last page
let n = min(stream.bi * sizeof(stream.bufs[0][0]), olen - ns)
copyMem(addr oq[ns div sizeof(uint32)], addr stream.bufs[i][0], n)
ns += n
stream.bs = n div sizeof(uint32)
if stream.bs >= stream.bi:
# read entire page; recycle it
stream.bs = 0
stream.bi = 0
for j in i ..< stream.bufs.len:
stream.bufs[j - i] = stream.bufs[j]
stream.bufs.setLen(stream.bufs.len - i)
return ns
elif a > olen:
copyMem(addr oq[0], addr stream.bufs[0][stream.bs], olen)
stream.bs += olen div sizeof(uint32)
assert stream.bs < stream.buflen
return olen
else: # a == olen
copyMem(addr oq[0], addr stream.bufs[0][stream.bs], a)
stream.bs = 0
stream.bufs.delete(0)
return a
proc checkEnd(stream: DecoderStream, oq: ptr UncheckedArray[uint32], olen: int,
n: var int) =
if not stream.isend and stream.bufs.len == 1 and
stream.bs >= stream.bi and stream.source.atEnd:
stream.isend = true
case stream.charset
of CHARSET_UTF_16_LE, CHARSET_UTF_16_BE:
if stream.u16haslead or stream.u16hassurr:
stream.handleError(oq, olen, n)
of CHARSET_UTF_8:
if stream.u8needed != 0:
stream.handleError(oq, olen, n)
of CHARSET_GB18030, CHARSET_GBK:
if stream.gb18first != 0 or stream.gb18second != 0 or stream.gb18third != 0:
stream.handleError(oq, olen, n)
of CHARSET_BIG5:
if stream.big5lead != 0:
stream.handleError(oq, olen, n)
of CHARSET_EUC_JP:
if stream.eucjplead != 0:
stream.handleError(oq, olen, n)
of CHARSET_ISO_2022_JP:
case stream.iso2022jpstate
of STATE_ASCII, STATE_ROMAN, STATE_KATAKANA, STATE_LEAD_BYTE: discard
of STATE_TRAIL_BYTE:
stream.handleError(oq, olen, n)
of STATE_ESCAPE_START:
stream.handleError(oq, olen, n)
of STATE_ESCAPE:
stream.isend = false
stream.iso2022jpbuf = stream.iso2022jplead
stream.iso2022jphasbuf = true
stream.iso2022jplead = 0
stream.iso2022jpoutput = false
stream.iso2022jpstate = stream.iso2022jpoutputstate
stream.handleError(oq, olen, n)
of CHARSET_SHIFT_JIS:
if stream.sjislead != 0:
stream.handleError(oq, olen, n)
of CHARSET_EUC_KR:
if stream.euckrlead != 0:
stream.handleError(oq, olen, n)
else: discard
proc prepend*(stream: DecoderStream, c: uint32) =
append_codepoint_buf stream, c
const ReadSize = 4096
proc readData*(stream: DecoderStream, buffer: pointer, olen: int): int =
const l = sizeof(stream.bufs[0][0])
assert olen mod l == 0, "Buffer size must be divisible by " & $l
if olen == 0: return
let oq = cast[ptr UncheckedArray[uint32]](buffer)
result = stream.copyBuffers(oq, olen)
let olen = olen - result
if olen == 0 or stream.source.atEnd:
# either output filled with buffered data; nothing to decode
# or we're at the end of the source stream
stream.checkEnd(oq, olen, result)
return result
var iq = newSeqUninitialized[uint8](ReadSize)
let ilen = stream.source.readData(cast[pointer](addr iq[0]), ReadSize)
case stream.charset
of CHARSET_UTF_8: stream.decodeUTF8(iq, oq, ilen, olen, result)
of CHARSET_IBM866: stream.decodeSingleByte(iq, oq, ilen, olen, result, IBM866Decode)
of CHARSET_ISO_8859_2: stream.decodeSingleByte(iq, oq, ilen, olen, result, ISO88592Decode)
of CHARSET_ISO_8859_3: stream.decodeSingleByte(iq, oq, ilen, olen, result, ISO88593Decode)
of CHARSET_ISO_8859_4: stream.decodeSingleByte(iq, oq, ilen, olen, result, ISO88594Decode)
of CHARSET_ISO_8859_5: stream.decodeSingleByte(iq, oq, ilen, olen, result, ISO88595Decode)
of CHARSET_ISO_8859_6: stream.decodeSingleByte(iq, oq, ilen, olen, result, ISO88596Decode)
of CHARSET_ISO_8859_7: stream.decodeSingleByte(iq, oq, ilen, olen, result, ISO88597Decode)
of CHARSET_ISO_8859_8,
CHARSET_ISO_8859_8_I: stream.decodeSingleByte(iq, oq, ilen, olen, result, ISO88598Decode)
of CHARSET_ISO_8859_10: stream.decodeSingleByte(iq, oq, ilen, olen, result, ISO885910Decode)
of CHARSET_ISO_8859_13: stream.decodeSingleByte(iq, oq, ilen, olen, result, ISO885913Decode)
of CHARSET_ISO_8859_14: stream.decodeSingleByte(iq, oq, ilen, olen, result, ISO885914Decode)
of CHARSET_ISO_8859_15: stream.decodeSingleByte(iq, oq, ilen, olen, result, ISO885915Decode)
of CHARSET_ISO_8859_16: stream.decodeSingleByte(iq, oq, ilen, olen, result, ISO885916Decode)
of CHARSET_KOI8_R: stream.decodeSingleByte(iq, oq, ilen, olen, result, KOI8RDecode)
of CHARSET_KOI8_U: stream.decodeSingleByte(iq, oq, ilen, olen, result, KOI8UDecode)
of CHARSET_MACINTOSH: stream.decodeSingleByte(iq, oq, ilen, olen, result, MacintoshDecode)
of CHARSET_WINDOWS_874: stream.decodeSingleByte(iq, oq, ilen, olen, result, Windows874Decode)
of CHARSET_WINDOWS_1250: stream.decodeSingleByte(iq, oq, ilen, olen, result, Windows1250Decode)
of CHARSET_WINDOWS_1251: stream.decodeSingleByte(iq, oq, ilen, olen, result, Windows1251Decode)
of CHARSET_WINDOWS_1252: stream.decodeSingleByte(iq, oq, ilen, olen, result, Windows1252Decode)
of CHARSET_WINDOWS_1253: stream.decodeSingleByte(iq, oq, ilen, olen, result, Windows1253Decode)
of CHARSET_WINDOWS_1254: stream.decodeSingleByte(iq, oq, ilen, olen, result, Windows1254Decode)
of CHARSET_WINDOWS_1255: stream.decodeSingleByte(iq, oq, ilen, olen, result, Windows1255Decode)
of CHARSET_WINDOWS_1256: stream.decodeSingleByte(iq, oq, ilen, olen, result, Windows1256Decode)
of CHARSET_WINDOWS_1257: stream.decodeSingleByte(iq, oq, ilen, olen, result, Windows1257Decode)
of CHARSET_WINDOWS_1258: stream.decodeSingleByte(iq, oq, ilen, olen, result, Windows1258Decode)
of CHARSET_X_MAC_CYRILLIC: stream.decodeSingleByte(iq, oq, ilen, olen, result, XMacCyrillicDecode)
of CHARSET_GBK, CHARSET_GB18030: stream.decodeGb18030(iq, oq, ilen, olen, result)
of CHARSET_BIG5: stream.decodeBig5(iq, oq, ilen, olen, result)
of CHARSET_EUC_JP: stream.decodeEUCJP(iq, oq, ilen, olen, result)
of CHARSET_ISO_2022_JP: stream.decodeISO2022JP(iq, oq, ilen, olen, result)
of CHARSET_SHIFT_JIS: stream.decodeShiftJIS(iq, oq, ilen, olen, result)
of CHARSET_EUC_KR: stream.decodeEUCKR(iq, oq, ilen, olen, result)
of CHARSET_REPLACEMENT: stream.decodeReplacement(oq, olen, result)
of CHARSET_UTF_16_LE: stream.decodeUTF16LE(iq, oq, ilen, olen, result)
of CHARSET_UTF_16_BE: stream.decodeUTF16BE(iq, oq, ilen, olen, result)
of CHARSET_X_USER_DEFINED: stream.decodeXUserDefined(iq, oq, ilen, olen, result)
of CHARSET_UNKNOWN: assert false, "Somebody forgot to set the character set here"
stream.checkEnd(oq, olen, result)
# Returns the number of bytes read.
proc readData*(stream: DecoderStream, buf: var seq[uint32]): int =
return stream.readData(addr buf[0], buf.len * sizeof(buf[0]))
proc readRunes*(stream: DecoderStream, olen: int): seq[Rune] =
when nimvm:
let s = stream.source.readStr(olen)
result = s.toRunes()
if stream.source.atEnd:
stream.isend = true
else:
assert false
proc atEnd*(stream: DecoderStream): bool =
return stream.isend
# Read all and convert to UTF-8.
# Probably not very efficient. Oh well.
proc readAll*(stream: DecoderStream): string =
var buf = newSeqUninitialized[uint32](stream.buflen)
while not stream.atEnd:
let n = stream.readData(buf)
for i in 0 ..< n div 4:
let r = cast[Rune](buf[i])
result &= $r
proc newDecoderStream*(source: Stream, cs = CHARSET_UTF_8, buflen = 1024,
errormode = DECODER_ERROR_MODE_REPLACEMENT): DecoderStream =
result = DecoderStream(
source: source,
charset: cs,
buflen: buflen,
errormode: errormode
)
when nimvm:
result.bufs = @[newSeq[uint32](buflen)]
else:
result.bufs = @[newSeqUninitialized[uint32](buflen)]
case cs
of CHARSET_UTF_8:
result.u8bounds = 0x80u8 .. 0xBFu8
else: discard