diff options
author | bptato <nincsnevem662@gmail.com> | 2022-12-10 19:48:11 +0100 |
---|---|---|
committer | bptato <nincsnevem662@gmail.com> | 2022-12-10 19:48:11 +0100 |
commit | 5c267ce6c9c1add997767f5eaad248cdd829c6fc (patch) | |
tree | e3b8a1cdd7f7f67f2d810ce1747fa197c1387ff1 /src/encoding | |
parent | 78cc5143a3a98f2803258d42818cf17727d70fe7 (diff) | |
download | chawan-5c267ce6c9c1add997767f5eaad248cdd829c6fc.tar.gz |
Fix decoder bugs, move decoder to encoding/
Diffstat (limited to 'src/encoding')
-rw-r--r-- | src/encoding/decoderstream.nim | 844 |
1 files changed, 844 insertions, 0 deletions
diff --git a/src/encoding/decoderstream.nim b/src/encoding/decoderstream.nim new file mode 100644 index 00000000..15d6311c --- /dev/null +++ b/src/encoding/decoderstream.nim @@ -0,0 +1,844 @@ +import algorithm +import streams +import unicode + +import data/charset +import utils/twtstr + +# DecoderStream decodes any encoding to valid utf-32. +type + DecoderErrorMode* = enum + DECODER_ERROR_MODE_FATAL, DECODER_ERROR_MODE_REPLACEMENT, + DECODER_ERROR_MODE_HTML + + ISO2022JPState = enum + STATE_ASCII, STATE_ROMAN, STATE_KATAKANA, STATE_LEAD_BYTE, + STATE_TRAIL_BYTE, STATE_ESCAPE_START, STATE_ESCAPE + + DecoderStream* = ref object + source: Stream + errormode: DecoderErrorMode + isend: bool + failed*: bool + bufs: seq[seq[uint32]] + bs: int + bi: int + buflen: int + c: uint32 + case charset: Charset + of CHARSET_UTF_8: + u8needed: int + u8seen: int + u8bounds: Slice[uint8] + of CHARSET_GBK, CHARSET_GB18030: + gb18first: uint8 + gb18second: uint8 + gb18third: uint8 + gb18buf: uint8 + gb18hasbuf: bool + of CHARSET_BIG5: + big5lead: uint8 + of CHARSET_EUC_JP: + eucjplead: uint8 + eucjpjis0212: bool + of CHARSET_ISO_2022_JP: + iso2022jplead: uint8 + iso2022jpstate: ISO2022JPState + iso2022jpoutputstate: ISO2022JPState + iso2022jpoutput: bool + iso2022jpbuf: uint8 + iso2022jphasbuf: bool + of CHARSET_SHIFT_JIS: + sjislead: uint8 + of CHARSET_EUC_KR: + euckrlead: uint8 + of CHARSET_UTF_16_BE, CHARSET_UTF_16_LE: + u16lead: uint8 + u16surr: uint16 + u16haslead: bool + u16hassurr: bool + of CHARSET_REPLACEMENT: + replreported: bool + else: discard + +template append_codepoint_buf(stream: DecoderStream, c: uint32) = + if stream.bi >= stream.buflen: + stream.bufs.add(newSeqUninitialized[uint32](stream.buflen)) + stream.bi = 0 + stream.bufs[^1][stream.bi] = c + inc stream.bi + +template append_codepoint(stream: DecoderStream, c: uint32, oq: ptr UncheckedArray[uint32], olen: int, n: var int) = + if n < olen: + oq[n div sizeof(uint32)] = c + n += sizeof(uint32) + else: + append_codepoint_buf stream, c + +template append_codepoint(stream: DecoderStream, c: char, oq: ptr UncheckedArray[uint32], olen: int, n: var int) = + stream.append_codepoint cast[uint32](c), oq, olen, n + +proc handleError(stream: DecoderStream, oq: ptr UncheckedArray[uint32], olen: int, n: var int) = + case stream.errormode + of DECODER_ERROR_MODE_FATAL: + stream.isend = true + stream.failed = true + of DECODER_ERROR_MODE_HTML: + if stream.charset == CHARSET_UTF_8: + # "html" mode is handled as "replacement" for utf-8. + stream.append_codepoint 0xFFFD, oq, olen, n + else: + stream.append_codepoint '&', oq, olen, n + stream.append_codepoint '#', oq, olen, n + while stream.c > 0: + stream.append_codepoint cast[char](0x30 + stream.c mod 10), oq, olen, n + stream.c = stream.c div 10 + stream.append_codepoint ';', oq, olen, n + of DECODER_ERROR_MODE_REPLACEMENT: + stream.append_codepoint 0xFFFD, oq, olen, n + +proc decodeUTF8(stream: DecoderStream, iq: var seq[uint8], oq: ptr UncheckedArray[uint32], ilen, olen: int, n: var int) = + var c = stream.c + var needed = stream.u8needed + var seen = stream.u8seen + var bounds = stream.u8bounds + var i = 0 + while i < ilen: + let b = iq[i] + if needed == 0: + case b + of 0x00u8 .. 0x7Fu8: + stream.append_codepoint uint32(b), oq, olen, n + of 0xC2u8 .. 0xDFu8: + needed = 1 + c = cast[uint32](b) and 0x1F + of 0xE0u8: + bounds.a = 0xA0 + needed = 2 + c = cast[uint32](b) and 0xF + of 0xEDu8: + bounds.b = 0x9F + needed = 2 + c = cast[uint32](b) and 0xF + of 0xE1u8 .. 0xECu8, 0xEEu8 .. 0xEFu8: + needed = 2 + c = cast[uint32](b) and 0xF + of 0xF0u8: + bounds.a = 0x90 + needed = 3 + c = cast[uint32](b) and 0x7 + of 0xF4u8: + bounds.b = 0x8F + needed = 3 + c = cast[uint32](b) and 0x7 + of 0xF1u8 .. 0xF3u8: + needed = 3 + c = cast[uint32](b) and 0x7 + else: + stream.handleError(oq, olen, n) + if stream.isend: # fatal error + break + inc i + continue + if b notin bounds: + c = 0 + needed = 0 + seen = 0 + bounds = 0x80u8 .. 0xBFu8 + stream.handleError(oq, olen, n) + continue # prepend (no inc i) + bounds = 0x80u8 .. 0xBFu8 + c = (c shl 6) or (b and 0x3F) + inc seen + if seen == needed: + stream.append_codepoint c, oq, olen, n + c = 0 + needed = 0 + seen = 0 + inc i + stream.c = c + stream.u8bounds = bounds + stream.u8seen = seen + stream.u8needed = needed + +proc gb18RangesCodepoint(p: uint32): uint32 = + if p > 39419 and p < 189000 or p > 1237575: + return high(uint32) # null + if p == 7457: + return 0xE7C7 + # Let offset be the last pointer in index gb18030 ranges that is less than or + # equal to pointer and code point offset its corresponding code point. + var offset: uint32 + var c: uint32 + if p >= 189000: + # omitted from the map for storage efficiency + offset = 189000 + c = 0x10000 + elif p >= 39394: + # Needed because upperBound returns the first element greater than pointer + # OR last on failure, so we can't just remove one if p is e.g. 39400. + offset = 39394 + c = 0xFFE6 + else: + # Find the first range that is greater than p, or last if no such element + # is found. + # We want the last that is <=, so decrease index by one. + let i = upperBound(Gb18030RangesDecode, p, func(a: tuple[p, ucs: uint16], b: uint32): int = + cmp(cast[uint32](a.p), b)) + let elem = Gb18030RangesDecode[i - 1] + offset = elem.p + c = elem.ucs + return c + p - offset + +proc decodeGb18030(stream: DecoderStream, iq: var seq[uint8], + oq: ptr UncheckedArray[uint32], ilen, olen: int, + n: var int) = + var first = stream.gb18first + var second = stream.gb18second + var third = stream.gb18third + var buf = stream.gb18buf + var hasbuf = stream.gb18hasbuf + var i = 0 + while i < ilen: + let b = if hasbuf: + hasbuf = false + dec i + buf + else: + iq[i] + if third != 0: + if b notin 0x30u8 .. 0x39u8: + hasbuf = true + buf = second + first = third + first = 0 + second = 0 + third = 0 + stream.handleError(oq, olen, n) + if stream.isend: break + continue # prepend (no inc i) + else: + let p = ((uint32(first) - 0x81) * 10 * 126 * 10) + + ((uint32(second) - 0x30) * (10 * 126)) + + ((uint32(third) - 0x81) * 10) + uint32(b) - 0x30 + let c = gb18RangesCodepoint(p) + first = 0 + second = 0 + third = 0 + if c == high(uint32): # null + stream.handleError(oq, olen, n) + if stream.isend: break + else: + stream.append_codepoint c, oq, olen, n + elif second != 0: + if b in 0x81u8 .. 0xFEu8: + third = b + else: + hasbuf = true + buf = second + first = 0 + second = 0 + third = 0 + stream.handleError(oq, olen, n) + if stream.isend: break + elif first != 0: + if b in 0x30u8 .. 0x39u8: + second = b + else: + let ff = first + first = 0 + if b in 0x40u8 .. 0x7Eu8: + let offset = if b < 0x7F: 0x40u32 else: 0x41u32 + let p = (uint16(ff) - 0x81) * 190 + (uint16(b) - offset) + if p < Gb18030Decode.len: + let c = Gb18030Decode[cast[uint16](p)] + stream.append_codepoint uint32(c), oq, olen, n + inc i + continue + if cast[char](b) in Ascii: + continue # prepend (no inc i) + else: + stream.handleError(oq, olen, n) + if stream.isend: break + elif cast[char](b) in Ascii: + stream.append_codepoint b, oq, olen, n + elif b == 0x80: + stream.append_codepoint 0x20AC, oq, olen, n + elif b in 0x81u8 .. 0xFEu8: + first = b + else: + stream.handleError(oq, olen, n) + if stream.isend: break + inc i + stream.gb18first = first + stream.gb18second = second + stream.gb18third = third + stream.gb18buf = buf + stream.gb18hasbuf = hasbuf + +proc decodeBig5(stream: DecoderStream, iq: var seq[uint8], + oq: ptr UncheckedArray[uint32], ilen, olen: int, + n: var int) = + var i = 0 + while i < ilen: + if stream.big5lead != 0: + let lead = uint32(stream.big5lead) + stream.big5lead = 0 + let offset = if iq[i] < 0x7F: 0x40u16 else: 0x7E + if iq[i] in {0x40u8 .. 0x7Eu8, 0xA1 .. 0xFEu8}: + let p = (lead - 0x81) * 157 + uint16(iq[i]) - offset + template output_two(a, b: uint32) = + stream.append_codepoint a, oq, olen, n + stream.append_codepoint b, oq, olen, n + block no_continue: + case p + of 1133: output_two 0x00CA, 0x0304 + of 1135: output_two 0x00CA, 0x030C + of 1164: output_two 0x00EA, 0x0304 + of 1166: output_two 0x00EA, 0x030C + else: break no_continue + inc i + continue + if p < Big5Decode.len - Big5DecodeOffset: + let c = Big5Decode[p - Big5DecodeOffset] + if c != 0: + stream.append_codepoint c, oq, olen, n + inc i + continue + if cast[char](iq[i]) in Ascii: + stream.append_codepoint iq[i], oq, olen, n + else: + stream.handleError(oq, olen, n) + if stream.isend: break + elif cast[char](iq[i]) in Ascii: + stream.append_codepoint iq[i], oq, olen, n + elif iq[i] in 0x00u8 .. 0xFEu8: + stream.big5lead = iq[i] + else: + stream.handleError(oq, olen, n) + if stream.isend: break + inc i + +proc decodeEUCJP(stream: DecoderStream, iq: var seq[uint8], + oq: ptr UncheckedArray[uint32], ilen, olen: int, + n: var int) = + var jis0212 = stream.eucjpjis0212 + var lead = stream.eucjplead + var i = 0 + while i < ilen: + let b = iq[i] + if lead == 0x8E and b in 0xA1u8 .. 0xDFu8: + lead = 0 + stream.append_codepoint iq[i], oq, olen, n + elif lead == 0x8F and b in 0xA1u8 .. 0xFEu8: + jis0212 = true + lead = b + elif lead != 0: + if lead in 0xA1u8 .. 0xFEu8 and b in 0xA1u8 .. 0xFEu8: + let p = (uint16(lead) - 0xA1) * 94 + uint16(b) - 0xA1 + lead = 0 + var c: uint16 + if jis0212: + if p < Jis0212Decode.len: + c = Jis0212Decode[p] + else: + if p < Jis0208Decode.len: + c = Jis0208Decode[p] + jis0212 = false + if c != 0: + stream.append_codepoint c, oq, olen, n + inc i + continue + else: + lead = 0 + stream.handleError(oq, olen, n) + if stream.isend: break + elif cast[char](b) in Ascii: + stream.append_codepoint b, oq, olen, n + elif b in {0x8Eu8, 0x8Fu8, 0xA1u8 .. 0xFEu8}: + lead = b + else: + stream.handleError(oq, olen, n) + if stream.isend: break + inc i + stream.eucjpjis0212 = jis0212 + stream.eucjplead = lead + +proc decodeISO2022JP(stream: DecoderStream, iq: var seq[uint8], + oq: ptr UncheckedArray[uint32], ilen, olen: int, + n: var int) = + var i = 0 + var lead = stream.iso2022jplead + var state = stream.iso2022jpstate + var output = stream.iso2022jpoutput + var outputstate = stream.iso2022jpoutputstate + var buf = stream.iso2022jpbuf + var hasbuf = stream.iso2022jphasbuf + while i < ilen: + let b = if hasbuf: + hasbuf = false + dec i + buf + else: + iq[i] + case state + of STATE_ASCII: + case b + of 0x1B: state = STATE_ESCAPE_START + of {0x00u8..0x7Fu8} - {0x0Eu8, 0x0Fu8, 0x1Bu8}: + output = false + stream.append_codepoint b, oq, olen, n + else: + output = false + stream.handleError(oq, olen, n) + if stream.isend: break + of STATE_ROMAN: + case b + of 0x1B: state = STATE_ESCAPE_START + of 0x5C: + output = false + stream.append_codepoint 0x00A5, oq, olen, n + of 0x7E: + output = false + stream.append_codepoint 0x203E, oq, olen, n + of {0x00u8..0x7Fu8} - {0x0Eu8, 0x0Fu8, 0x1Bu8, 0x5Cu8, 0x7Eu8}: + output = false + stream.append_codepoint b, oq, olen, n + else: + output = false + stream.handleError(oq, olen, n) + if stream.isend: break + of STATE_KATAKANA: + case b + of 0x1B: state = STATE_ESCAPE_START + of 0x21u8..0x5Fu8: + output = false + stream.append_codepoint 0xFF61u16 - 0x21 + uint16(b), oq, olen, n + else: + output = false + stream.handleError(oq, olen, n) + if stream.isend: break + of STATE_LEAD_BYTE: + case b + of 0x1B: state = STATE_ESCAPE_START + of 0x21u8..0x7Eu8: + output = false + lead = b + state = STATE_TRAIL_BYTE + else: + output = false + stream.handleError(oq, olen, n) + if stream.isend: break + of STATE_TRAIL_BYTE: + case b + of 0x1B: + state = STATE_ESCAPE_START + stream.handleError(oq, olen, n) + if stream.isend: break + of 0x21u8..0x7Eu8: + state = STATE_LEAD_BYTE + let p = (uint16(lead) - 0x21) * 94 + uint16(b) - 0x21 + if p < Jis0208Decode.len: + let c = Jis0208Decode[p] + if c != 0: + stream.append_codepoint c, oq, olen, n + else: + stream.handleError(oq, olen, n) + if stream.isend: break + else: + state = STATE_LEAD_BYTE + stream.handleError(oq, olen, n) + if stream.isend: break + of STATE_ESCAPE_START: + if b == 0x24 or b == 0x28: + lead = b + state = STATE_ESCAPE + else: + output = false + state = outputstate + stream.handleError(oq, olen, n) + if stream.isend: break + continue # prepend (no inc i) + of STATE_ESCAPE: + let l = lead + lead = 0 + block statenonnull: + var s: ISO2022JPState + if l == 0x28: + case b + of 0x42: s = STATE_ASCII + of 0x4A: s = STATE_ROMAN + of 0x49: s = STATE_KATAKANA + else: break statenonnull + elif l == 0x24 and b in {0x40u8, 0x42u8}: + s = STATE_LEAD_BYTE + else: break statenonnull + state = s + outputstate = s + if not output: + output = true + stream.handleError(oq, olen, n) + if stream.isend: + output = true + break + output = true + inc i + continue + output = false + state = outputstate + stream.handleError(oq, olen, n) + if stream.isend: break + continue # prepend (no inc i) + inc i + stream.iso2022jphasbuf = hasbuf + stream.iso2022jpbuf = buf + stream.iso2022jplead = lead + stream.iso2022jpstate = state + stream.iso2022jpoutput = output + stream.iso2022jpoutputstate = outputstate + +proc decodeShiftJIS(stream: DecoderStream, iq: var seq[uint8], + oq: ptr UncheckedArray[uint32], ilen, olen: int, + n: var int) = + var lead = stream.sjislead + var i = 0 + while i < ilen: + let b = iq[i] + if lead != 0: + let l = lead + lead = 0 + let offset = if b < 0x7Fu8: 0x40u16 else: 0x41u16 + let leadoffset = if l < 0xA0: 0x81u16 else: 0xC1u16 + if b in 0x40u8..0x7Eu8 or b in 0x80u8..0xFCu8: + let p = (uint16(l) - leadoffset) * 188 + uint16(b) - offset + if p in 8836u16..10715u16: + stream.append_codepoint 0xE000u16 - 8836 + p, oq, olen, n + inc i + continue + if p < Jis0208Decode.len and Jis0208Decode[p] != 0: + let c = Jis0208Decode[p] + stream.append_codepoint c, oq, olen, n + inc i + continue + if cast[char](b) in Ascii: + continue # prepend (no inc i) + else: + stream.handleError(oq, olen, n) + if stream.isend: break + elif cast[char](b) in Ascii or b == 0x80: + stream.append_codepoint b, oq, olen, n + elif b in 0xA1u8..0xDFu8: + stream.append_codepoint 0xFF61u16 - 0xA1 + uint16(b), oq, olen, n + elif b in {0x81..0x9F} + {0xE0..0xFC}: + lead = b + else: + stream.handleError(oq, olen, n) + if stream.isend: break + inc i + stream.sjislead = lead + +proc decodeEUCKR(stream: DecoderStream, iq: var seq[uint8], + oq: ptr UncheckedArray[uint32], ilen, olen: int, + n: var int) = + var lead = stream.euckrlead + var i = 0 + while i < ilen: + let b = iq[i] + if lead != 0: + if b in 0x41u8..0xFEu8: + let p = (uint16(lead) - 0x81) * 190 + (uint16(b) - 0x41) + if p < EUCKRDecode.len and EUCKRDecode[p] != 0: + let c = EUCKRDecode[p] + stream.append_codepoint c, oq, olen, n + inc i + continue + stream.handleError(oq, olen, n) + if stream.isend: break + elif cast[char](b) in Ascii: + stream.append_codepoint b, oq, olen, n + elif b in {0x81u8..0xFEu8}: + lead = b + else: + stream.handleError(oq, olen, n) + if stream.isend: break + inc i + stream.euckrlead = lead + +proc decodeUTF16(stream: DecoderStream, iq: var seq[uint8], + oq: ptr UncheckedArray[uint32], ilen, olen: int, + n: var int, be: static bool) = + var i = 0 + var lead = stream.u16lead + var haslead = stream.u16haslead + var surr = stream.u16surr + var hassurr = stream.u16hassurr + while i < ilen: + if not haslead: + haslead = true + lead = iq[i] + else: + let cu = if be: + (uint16(lead) shl 8) + uint16(iq[i]) + else: + (uint16(iq[i]) shl 8) + uint16(lead) + haslead = false + if hassurr: + hassurr = false + if cu in 0xDC00u16 .. 0xDFFFu16: + let c = 0x10000 + ((uint32(surr) - 0xD800) shl 10) + (uint32(cu) - 0xDC00) + stream.append_codepoint c, oq, olen, n + inc i + continue + haslead = true # prepend the last two bytes + stream.handleError(oq, olen, n) + continue + if cu in 0xD800u16 .. 0xDBFFu16: + surr = cu + hassurr = true + inc i + continue + elif cu in 0xDC00u16 .. 0xDFFFu16: + stream.handleError(oq, olen, n) + if stream.isend: # fatal error + break + else: + inc i + continue + stream.append_codepoint uint32(cu), oq, olen, n + inc i + stream.u16lead = lead + stream.u16haslead = haslead + stream.u16surr = surr + stream.u16hassurr = hassurr + +proc decodeUTF16LE(stream: DecoderStream, iq: var seq[uint8], + oq: ptr UncheckedArray[uint32], ilen, olen: int, + n: var int) = + stream.decodeUTF16(iq, oq, ilen, olen, n, false) + +proc decodeUTF16BE(stream: DecoderStream, iq: var seq[uint8], + oq: ptr UncheckedArray[uint32], ilen, olen: int, + n: var int) = + stream.decodeUTF16(iq, oq, ilen, olen, n, true) + +proc decodeXUserDefined(stream: DecoderStream, iq: var seq[uint8], + oq: ptr UncheckedArray[uint32], ilen, olen: int, + n: var int) = + for i in 0 ..< ilen: + let c = cast[char](iq[i]) + if c in Ascii: + stream.append_codepoint c, oq, olen, n + else: + let c = 0xF780 + cast[uint32](c) - 0x80 + stream.append_codepoint c, oq, olen, n + +proc decodeSingleByte(stream: DecoderStream, iq: var seq[uint8], + oq: ptr UncheckedArray[uint32], ilen, olen: int, + n: var int, map: array[char, uint16]) = + for i in 0 ..< ilen: + let c = cast[char](iq[i]) + if c in Ascii: + stream.append_codepoint c, oq, olen, n + else: + let p = map[c] + if p == 0u16: + stream.handleError(oq, olen, n) + else: + stream.append_codepoint cast[uint32](oq), oq, olen, n + +proc decodeReplacement(stream: DecoderStream, oq: ptr UncheckedArray[uint32], olen: int, n: var int) = + if not stream.replreported: + stream.replreported = true + stream.handleError(oq, olen, n) + # I think that's it? + +# copy any data remaining from previous passes +proc copyBuffers(stream: DecoderStream, oq: ptr UncheckedArray[uint32], olen: int): int = + if stream.bufs.len == 1: + # one page: stream.bs ..< stream.bi + let n = min((stream.bi - stream.bs) * sizeof(stream.bufs[0][0]), olen) + copyMem(addr oq[0], addr stream.bufs[0][stream.bs], n) + stream.bs += n div sizeof(uint32) + if stream.bs >= stream.bi: + # read entire page; recycle it + stream.bs = 0 + stream.bi = 0 + return n + else: + # multiple pages: + # stream.bs ..< stream.buflen + # 0 ..< stream.buflen + # ... + # 0 ..< stream.bi + let a = (stream.buflen - stream.bs) * sizeof(stream.bufs[0][0]) + if a < olen: + copyMem(addr oq[0], addr stream.bufs[0][stream.bs], a) + var ns = a + stream.bs = 0 + var i = 1 + while i < stream.bufs.high: + let n = min(stream.buflen * sizeof(stream.bufs[0][0]), olen - ns) + copyMem(addr oq[ns div sizeof(uint32)], addr stream.bufs[i][0], n) + ns += n + if ns >= olen: + # i'th buffer still has contents. + stream.bs = n div sizeof(uint32) + break + stream.bs = 0 + inc i + if ns < olen: + # last page + let n = min(stream.bi * sizeof(stream.bufs[0][0]), olen - ns) + copyMem(addr oq[ns div sizeof(uint32)], addr stream.bufs[i][0], n) + ns += n + stream.bs = n div sizeof(uint32) + if stream.bs >= stream.bi: + # read entire page; recycle it + stream.bs = 0 + stream.bi = 0 + for j in i ..< stream.bufs.len: + stream.bufs[j - i] = stream.bufs[j] + stream.bufs.setLen(stream.bufs.len - i) + return ns + elif a > olen: + copyMem(addr oq[0], addr stream.bufs[0][stream.bs], olen) + stream.bs += olen div sizeof(uint32) + assert stream.bs < stream.buflen + return olen + else: # a == olen + copyMem(addr oq[0], addr stream.bufs[0][stream.bs], a) + stream.bs = 0 + stream.bufs.delete(0) + return a + +proc checkEnd(stream: DecoderStream, oq: ptr UncheckedArray[uint32], olen: int, + n: var int) = + if not stream.isend and stream.bufs.len == 1 and + stream.bs >= stream.bi and stream.source.atEnd: + stream.isend = true + case stream.charset + of CHARSET_UTF_16_LE, CHARSET_UTF_16_BE: + if stream.u16haslead or stream.u16hassurr: + stream.handleError(oq, olen, n) + of CHARSET_UTF_8: + if stream.u8needed != 0: + stream.handleError(oq, olen, n) + of CHARSET_GB18030, CHARSET_GBK: + if stream.gb18first != 0 or stream.gb18second != 0 or stream.gb18third != 0: + stream.handleError(oq, olen, n) + of CHARSET_BIG5: + if stream.big5lead != 0: + stream.handleError(oq, olen, n) + of CHARSET_EUC_JP: + if stream.eucjplead != 0: + stream.handleError(oq, olen, n) + of CHARSET_ISO_2022_JP: + case stream.iso2022jpstate + of STATE_ASCII, STATE_ROMAN, STATE_KATAKANA, STATE_LEAD_BYTE: discard + of STATE_TRAIL_BYTE: + stream.handleError(oq, olen, n) + of STATE_ESCAPE_START: + stream.handleError(oq, olen, n) + of STATE_ESCAPE: + stream.isend = false + stream.iso2022jpbuf = stream.iso2022jplead + stream.iso2022jphasbuf = true + stream.iso2022jplead = 0 + stream.iso2022jpoutput = false + stream.iso2022jpstate = stream.iso2022jpoutputstate + stream.handleError(oq, olen, n) + of CHARSET_SHIFT_JIS: + if stream.sjislead != 0: + stream.handleError(oq, olen, n) + of CHARSET_EUC_KR: + if stream.euckrlead != 0: + stream.handleError(oq, olen, n) + else: discard + +proc prepend*(stream: DecoderStream, c: uint32) = + append_codepoint_buf stream, c + +const ReadSize = 4096 +proc readData*(stream: DecoderStream, buffer: pointer, olen: int): int = + const l = sizeof(stream.bufs[0][0]) + assert olen mod l == 0, "Buffer size must be divisible by " & $l + if olen == 0: return + let oq = cast[ptr UncheckedArray[uint32]](buffer) + result = stream.copyBuffers(oq, olen) + let olen = olen - result + if olen == 0: + stream.checkEnd(oq, olen, result) + return result # output filled with buffered data; nothing to decode. + var iq = newSeqUninitialized[uint8](ReadSize) + let ilen = stream.source.readData(cast[pointer](addr iq[0]), ReadSize) + case stream.charset + of CHARSET_UTF_8: stream.decodeUTF8(iq, oq, ilen, olen, result) + of CHARSET_IBM866: stream.decodeSingleByte(iq, oq, ilen, olen, result, IBM866Decode) + of CHARSET_ISO_8859_2: stream.decodeSingleByte(iq, oq, ilen, olen, result, ISO88592Decode) + of CHARSET_ISO_8859_3: stream.decodeSingleByte(iq, oq, ilen, olen, result, ISO88593Decode) + of CHARSET_ISO_8859_4: stream.decodeSingleByte(iq, oq, ilen, olen, result, ISO88594Decode) + of CHARSET_ISO_8859_5: stream.decodeSingleByte(iq, oq, ilen, olen, result, ISO88595Decode) + of CHARSET_ISO_8859_6: stream.decodeSingleByte(iq, oq, ilen, olen, result, ISO88596Decode) + of CHARSET_ISO_8859_7: stream.decodeSingleByte(iq, oq, ilen, olen, result, ISO88597Decode) + of CHARSET_ISO_8859_8, + CHARSET_ISO_8859_8_I: stream.decodeSingleByte(iq, oq, ilen, olen, result, ISO88598Decode) + of CHARSET_ISO_8859_10: stream.decodeSingleByte(iq, oq, ilen, olen, result, ISO885910Decode) + of CHARSET_ISO_8859_13: stream.decodeSingleByte(iq, oq, ilen, olen, result, ISO885913Decode) + of CHARSET_ISO_8859_14: stream.decodeSingleByte(iq, oq, ilen, olen, result, ISO885914Decode) + of CHARSET_ISO_8859_15: stream.decodeSingleByte(iq, oq, ilen, olen, result, ISO885915Decode) + of CHARSET_ISO_8859_16: stream.decodeSingleByte(iq, oq, ilen, olen, result, ISO885916Decode) + of CHARSET_KOI8_R: stream.decodeSingleByte(iq, oq, ilen, olen, result, KOI8RDecode) + of CHARSET_KOI8_U: stream.decodeSingleByte(iq, oq, ilen, olen, result, KOI8UDecode) + of CHARSET_MACINTOSH: stream.decodeSingleByte(iq, oq, ilen, olen, result, MacintoshDecode) + of CHARSET_WINDOWS_874: stream.decodeSingleByte(iq, oq, ilen, olen, result, Windows874Decode) + of CHARSET_WINDOWS_1250: stream.decodeSingleByte(iq, oq, ilen, olen, result, Windows1250Decode) + of CHARSET_WINDOWS_1251: stream.decodeSingleByte(iq, oq, ilen, olen, result, Windows1251Decode) + of CHARSET_WINDOWS_1252: stream.decodeSingleByte(iq, oq, ilen, olen, result, Windows1252Decode) + of CHARSET_WINDOWS_1253: stream.decodeSingleByte(iq, oq, ilen, olen, result, Windows1253Decode) + of CHARSET_WINDOWS_1254: stream.decodeSingleByte(iq, oq, ilen, olen, result, Windows1254Decode) + of CHARSET_WINDOWS_1255: stream.decodeSingleByte(iq, oq, ilen, olen, result, Windows1255Decode) + of CHARSET_WINDOWS_1256: stream.decodeSingleByte(iq, oq, ilen, olen, result, Windows1256Decode) + of CHARSET_WINDOWS_1257: stream.decodeSingleByte(iq, oq, ilen, olen, result, Windows1257Decode) + of CHARSET_WINDOWS_1258: stream.decodeSingleByte(iq, oq, ilen, olen, result, Windows1258Decode) + of CHARSET_X_MAC_CYRILLIC: stream.decodeSingleByte(iq, oq, ilen, olen, result, XMacCyrillicDecode) + of CHARSET_GBK, CHARSET_GB18030: stream.decodeGb18030(iq, oq, ilen, olen, result) + of CHARSET_BIG5: stream.decodeBig5(iq, oq, ilen, olen, result) + of CHARSET_EUC_JP: stream.decodeEUCJP(iq, oq, ilen, olen, result) + of CHARSET_ISO_2022_JP: stream.decodeISO2022JP(iq, oq, ilen, olen, result) + of CHARSET_SHIFT_JIS: stream.decodeShiftJIS(iq, oq, ilen, olen, result) + of CHARSET_EUC_KR: stream.decodeEUCKR(iq, oq, ilen, olen, result) + of CHARSET_REPLACEMENT: stream.decodeReplacement(oq, olen, result) + of CHARSET_UTF_16_LE: stream.decodeUTF16LE(iq, oq, ilen, olen, result) + of CHARSET_UTF_16_BE: stream.decodeUTF16BE(iq, oq, ilen, olen, result) + of CHARSET_X_USER_DEFINED: stream.decodeXUserDefined(iq, oq, ilen, olen, result) + of CHARSET_UNKNOWN: assert false, "Somebody forgot to set the character set here" + stream.checkEnd(oq, olen, result) + +proc readRunes*(stream: DecoderStream, olen: int): seq[Rune] = + when nimvm: + let s = stream.source.readStr(olen) + result = s.toRunes() + if stream.source.atEnd: + stream.isend = true + else: + assert false + +proc atEnd*(stream: DecoderStream): bool = + return stream.isend + +proc newDecoderStream*(source: Stream, cs = CHARSET_UTF_8, buflen = 1024, + errormode = DECODER_ERROR_MODE_REPLACEMENT): DecoderStream = + result = DecoderStream( + source: source, + charset: cs, + buflen: buflen, + errormode: errormode + ) + when nimvm: + result.bufs = @[newSeq[uint32](buflen)] + else: + result.bufs = @[newSeqUninitialized[uint32](buflen)] + case cs + of CHARSET_UTF_8: + result.u8bounds = 0x80u8 .. 0xBFu8 + else: discard |