diff options
author | bptato <nincsnevem662@gmail.com> | 2023-05-19 01:50:17 +0200 |
---|---|---|
committer | bptato <nincsnevem662@gmail.com> | 2023-05-19 01:50:17 +0200 |
commit | 26e8968a6499742cf37e00292a7d1c8ed620cad5 (patch) | |
tree | a3922f02f09b5c025dddcfe0e7a3a719c47ba4da /src/encoding | |
parent | dac6a09c14b258ed725dcb265305a6445edc02ad (diff) | |
download | chawan-26e8968a6499742cf37e00292a7d1c8ed620cad5.tar.gz |
Add display/output encoding
Some encodings are still missing
Diffstat (limited to 'src/encoding')
-rw-r--r-- | src/encoding/decoderstream.nim | 19 | ||||
-rw-r--r-- | src/encoding/encoderstream.nim | 389 |
2 files changed, 393 insertions, 15 deletions
diff --git a/src/encoding/decoderstream.nim b/src/encoding/decoderstream.nim index 425f264f..8bfd4d10 100644 --- a/src/encoding/decoderstream.nim +++ b/src/encoding/decoderstream.nim @@ -8,8 +8,7 @@ import utils/twtstr # DecoderStream decodes any encoding to valid utf-32. type DecoderErrorMode* = enum - DECODER_ERROR_MODE_FATAL, DECODER_ERROR_MODE_REPLACEMENT, - DECODER_ERROR_MODE_HTML + DECODER_ERROR_MODE_FATAL, DECODER_ERROR_MODE_REPLACEMENT ISO2022JPState = enum STATE_ASCII, STATE_ROMAN, STATE_KATAKANA, STATE_LEAD_BYTE, @@ -27,6 +26,7 @@ type c: uint32 case charset: Charset of CHARSET_UTF_8: + u8c: uint32 u8needed: int u8seen: int u8bounds: Slice[uint8] @@ -83,22 +83,11 @@ proc handleError(stream: DecoderStream, oq: ptr UncheckedArray[uint32], olen: in of DECODER_ERROR_MODE_FATAL: stream.isend = true stream.failed = true - of DECODER_ERROR_MODE_HTML: - if stream.charset == CHARSET_UTF_8: - # "html" mode is handled as "replacement" for utf-8. - stream.append_codepoint 0xFFFD, oq, olen, n - else: - stream.append_codepoint '&', oq, olen, n - stream.append_codepoint '#', oq, olen, n - while stream.c > 0: - stream.append_codepoint cast[char](0x30 + stream.c mod 10), oq, olen, n - stream.c = stream.c div 10 - stream.append_codepoint ';', oq, olen, n of DECODER_ERROR_MODE_REPLACEMENT: stream.append_codepoint 0xFFFD, oq, olen, n proc decodeUTF8(stream: DecoderStream, iq: var seq[uint8], oq: ptr UncheckedArray[uint32], ilen, olen: int, n: var int) = - var c = stream.c + var c = stream.u8c var needed = stream.u8needed var seen = stream.u8seen var bounds = stream.u8bounds @@ -156,7 +145,7 @@ proc decodeUTF8(stream: DecoderStream, iq: var seq[uint8], oq: ptr UncheckedArra needed = 0 seen = 0 inc i - stream.c = c + stream.u8c = c stream.u8bounds = bounds stream.u8seen = seen stream.u8needed = needed diff --git a/src/encoding/encoderstream.nim b/src/encoding/encoderstream.nim new file mode 100644 index 00000000..155b7d31 --- /dev/null +++ b/src/encoding/encoderstream.nim @@ 
# Heavily based on https://encoding.spec.whatwg.org/

import algorithm
import streams
import unicode

import data/charset

# EncoderStream encodes utf-32 to the specified encoding.
type
  EncoderErrorMode* = enum
    ENCODER_ERROR_MODE_FATAL, ENCODER_ERROR_MODE_HTML

  # Not used yet; no ISO-2022-JP encoder is implemented in this module.
  ISO2022JPState = enum
    STATE_ASCII, STATE_ROMAN, STATE_JIS0208

  EncoderStream* = ref object
    source: Stream # stream the utf-32 input is read from
    errormode: EncoderErrorMode
    isend: bool
    failed*: bool # set when an error occurs in fatal mode
    # Overflow pages: encoded bytes that did not fit in the caller's buffer.
    # Pending data is bs ..< buflen of the first page, then whole pages, then
    # 0 ..< bi of the last page (bs ..< bi when there is only one page).
    bufs: seq[seq[uint8]]
    bs: int # read offset into the first page
    bi: int # write offset into the last page
    buflen: int # size of each page
    charset: Charset

# Append a byte to the overflow pages, adding a new page if the last is full.
template append_byte_buf(stream: EncoderStream, c: uint8) =
  if stream.bi >= stream.buflen:
    stream.bufs.add(newSeqUninitialized[uint8](stream.buflen))
    stream.bi = 0
  stream.bufs[^1][stream.bi] = c
  inc stream.bi

# Write a byte to the output queue at index n if n < olen, otherwise store it
# in the overflow pages for a later readData call.
template append_byte(stream: EncoderStream, c: uint8,
    oq: ptr UncheckedArray[uint8], olen: int, n: var int) =
  if n < olen:
    oq[n] = c
    inc n
  else:
    append_byte_buf stream, c

template append_byte(stream: EncoderStream, c: char,
    oq: ptr UncheckedArray[uint8], olen: int, n: var int) =
  stream.append_byte cast[uint8](c), oq, olen, n

# Note: only the low 8 bits of c are written.
template append_byte(stream: EncoderStream, c: uint32,
    oq: ptr UncheckedArray[uint8], olen: int, n: var int) =
  stream.append_byte cast[uint8](c), oq, olen, n

# Note: only the low 8 bits of c are written.
template append_byte(stream: EncoderStream, c: int,
    oq: ptr UncheckedArray[uint8], olen: int, n: var int) =
  stream.append_byte cast[uint8](c), oq, olen, n

# Handle a code point c that cannot be represented in the target encoding.
# In fatal mode, the stream is marked as failed and ended. In html mode, the
# decimal numeric character reference "&#<c>;" is emitted instead.
proc handleError(stream: EncoderStream, oq: ptr UncheckedArray[uint8],
    olen: int, n: var int, c: uint32) =
  case stream.errormode
  of ENCODER_ERROR_MODE_FATAL:
    stream.isend = true
    stream.failed = true
  of ENCODER_ERROR_MODE_HTML:
    stream.append_byte('&', oq, olen, n)
    stream.append_byte('#', oq, olen, n)
    # $c yields the decimal digits most significant first ("0" for zero).
    for digit in $c:
      stream.append_byte(digit, oq, olen, n)
    stream.append_byte(';', oq, olen, n)

# Returns the index gb18030 ranges pointer for code point c.
# c must not be an ASCII code point.
proc gb18030RangesPointer(c: uint32): uint32 =
  if c == 0xE7C7:
    return 7457
  # Let offset be the last pointer in index gb18030 ranges that is less than or
  # equal to pointer and code point offset its corresponding code point.
  var offset: uint32
  var p: uint32
  if c >= 0x10000:
    # omitted from the map for storage efficiency
    offset = 0x10000
    p = 189000
  elif c >= 0xFFE6:
    # Needed because upperBound returns the first element greater than pointer
    # OR last on failure, so we can't just remove one if p is e.g. 39400.
    offset = 0xFFE6
    p = 39394
  else:
    # Find the first range that is greater than p, or last if no such element
    # is found.
    # We want the last that is <=, so decrease index by one.
    let i = upperBound(Gb18030RangesEncode, c,
      func(a: tuple[ucs, p: uint16], b: uint32): int =
        cmp(cast[uint32](a.ucs), b))
    let elem = Gb18030RangesEncode[i - 1]
    offset = elem.ucs
    p = elem.p
  return p + c - offset

# Encode the first ilen code points of iq as UTF-8.
proc encodeUTF8(stream: EncoderStream, iq: var seq[uint32],
    oq: ptr UncheckedArray[uint8], ilen, olen: int, n: var int) =
  for i in 0 ..< ilen:
    let c = iq[i]
    if c < 0x80:
      # ASCII is encoded as a single byte with the same value.
      stream.append_byte(c, oq, olen, n)
      continue
    var count: int # number of continuation bytes
    var offset: uint32 # marker bits of the lead byte
    if c <= 0x07FF:
      count = 1
      offset = 0xC0
    elif c <= 0xFFFF:
      count = 2
      offset = 0xE0
    elif c <= 0x10FFFF:
      count = 3
      offset = 0xF0
    else:
      # Not a code point; the input is expected to be valid utf-32.
      assert false
      continue
    stream.append_byte((c shr (6 * count)) + offset, oq, olen, n)
    while count > 0:
      let tmp = c shr (6 * (count - 1))
      stream.append_byte(0x80u32 or (tmp and 0x3Fu32), oq, olen, n)
      dec count

# Encode the first ilen code points of iq using a single-byte encoding.
# map is searched by code point with binarySearch; 0x80 is added to the
# matching entry's val to obtain the output byte.
proc encodeSingleByte(stream: EncoderStream, iq: var seq[uint32],
    oq: ptr UncheckedArray[uint8], ilen, olen: int, n: var int,
    map: seq[tuple[ucs: uint16, val: char]]) =
  for i in 0 ..< ilen:
    let c = iq[i]
    if c < 0x80:
      stream.append_byte cast[uint8](c), oq, olen, n
      continue
    if c <= 0xFFFF:
      let j = binarySearch(map, cast[uint16](c),
        proc(a: tuple[ucs: uint16, val: char], b: uint16): int =
          cmp(a.ucs, b))
      if j != -1:
        stream.append_byte cast[uint8](map[j].val) + 0x80, oq, olen, n
        continue
    stream.handleError(oq, olen, n, c)

# Encode the first ilen code points of iq as x-user-defined: ASCII is passed
# through, and U+F780..U+F7FF is mapped to bytes 0x80..0xFF.
proc encodeXUserDefined(stream: EncoderStream, iq: var seq[uint32],
    oq: ptr UncheckedArray[uint8], ilen, olen: int, n: var int) =
  for i in 0 ..< ilen:
    let c = iq[i]
    if c < 0x80:
      stream.append_byte cast[uint8](c), oq, olen, n
      continue
    if c in 0xF780u32..0xF7FFu32:
      let b = cast[uint8](c - 0xF780 + 0x80)
      stream.append_byte b, oq, olen, n
      continue
    stream.handleError(oq, olen, n, c)

# Encode the first ilen code points of iq as gb18030, or as GBK if isGBK is
# set (in which case the four-byte form is never used).
proc encodeGb18030(stream: EncoderStream, iq: var seq[uint32],
    oq: ptr UncheckedArray[uint8], ilen, olen: int, n: var int,
    isGBK = false) =
  for k in 0 ..< ilen:
    let c = iq[k]
    if c < 0x80:
      stream.append_byte(c, oq, olen, n)
      continue
    if c == 0xE5E5:
      # The encoding standard treats U+E5E5 as an error in this encoder.
      stream.handleError(oq, olen, n, c)
      continue
    if isGBK and c == 0x20AC:
      stream.append_byte(0x80, oq, olen, n)
      continue
    let i =
      if c > 0xFFFF: -1
      else: binarySearch(Gb18030Encode, cast[uint16](c),
        proc(a: UCS16x16, b: uint16): int = cmp(a.ucs, b))
    if i != -1:
      let p = uint32(Gb18030Encode[i].p)
      let lead = p div 190 + 0x81
      let trail = p mod 190
      let offset: uint32 = if trail < 0x3F: 0x40 else: 0x41
      stream.append_byte(lead, oq, olen, n)
      stream.append_byte(trail + offset, oq, olen, n)
      continue
    if isGBK:
      stream.handleError(oq, olen, n, c)
      continue
    # Four-byte form, derived from the ranges pointer.
    var p = gb18030RangesPointer(c)
    let b1 = p div (10 * 126 * 10)
    p = p mod (10 * 126 * 10)
    let b2 = p div (10 * 126)
    p = p mod (10 * 126)
    let b3 = p div 10
    let b4 = p mod 10
    stream.append_byte(b1 + 0x81, oq, olen, n)
    stream.append_byte(b2 + 0x30, oq, olen, n)
    stream.append_byte(b3 + 0x81, oq, olen, n)
    stream.append_byte(b4 + 0x30, oq, olen, n)

# Encode the first ilen code points of iq as Big5.
proc encodeBig5(stream: EncoderStream, iq: var seq[uint32],
    oq: ptr UncheckedArray[uint8], ilen, olen: int, n: var int) =
  for k in 0 ..< ilen:
    let c = iq[k]
    if c < 0x80:
      stream.append_byte(c, oq, olen, n)
      continue
    # Compare at full width, so that code points above U+FFFF are not
    # truncated into an unrelated 16-bit value before the lookup.
    let i = binarySearch(Big5Encode, c,
      proc(a: UCS32x16, b: uint32): int = cmp(uint32(a.ucs), b))
    if i == -1:
      stream.handleError(oq, olen, n, c)
      continue
    let p = uint32(Big5Encode[i].p)
    let lead = p div 157 + 0x81
    let trail = p mod 157
    let offset: uint32 = if trail < 0x3F: 0x40 else: 0x62
    stream.append_byte(lead, oq, olen, n)
    stream.append_byte(trail + offset, oq, olen, n)

# Encode the first ilen code points of iq as EUC-JP.
proc encodeEUCJP(stream: EncoderStream, iq: var seq[uint32],
    oq: ptr UncheckedArray[uint8], ilen, olen: int, n: var int) =
  for k in 0 ..< ilen:
    let c0 = iq[k]
    if c0 < 0x80:
      stream.append_byte(c0, oq, olen, n)
    elif c0 == 0xA5:
      # yen sign
      stream.append_byte(0x5C, oq, olen, n)
    elif c0 == 0x203E:
      # overline
      stream.append_byte(0x7E, oq, olen, n)
    elif c0 in 0xFF61u32..0xFF9Fu32:
      # half-width katakana
      stream.append_byte(0x8E, oq, olen, n)
      stream.append_byte(c0 - 0xFF61 + 0xA1, oq, olen, n)
    else:
      # U+2212 (minus sign) is looked up as U+FF0D (fullwidth hyphen-minus).
      let c = if c0 == 0x2212: 0xFF0Du32 else: c0
      # Code points above U+FFFF must not be truncated into the 16-bit key.
      let i =
        if c > 0xFFFF: -1
        else: binarySearch(Jis0208Encode, cast[uint16](c),
          proc(a: UCS16x16, b: uint16): int = cmp(a.ucs, b))
      if i != -1:
        let p = uint32(Jis0208Encode[i].p)
        let lead = p div 94 + 0xA1
        let trail = p mod 94 + 0xA1
        stream.append_byte(lead, oq, olen, n)
        stream.append_byte(trail, oq, olen, n)
      else:
        stream.handleError(oq, olen, n, c)

# copy any data remaining from previous passes
# Returns the number of bytes copied into oq (at most olen).
proc copyBuffers(stream: EncoderStream, oq: ptr UncheckedArray[uint8],
    olen: int): int =
  if stream.bufs.len == 1:
    # one page: stream.bs ..< stream.bi
    let n = min((stream.bi - stream.bs) * sizeof(stream.bufs[0][0]), olen)
    copyMem(addr oq[0], addr stream.bufs[0][stream.bs], n)
    stream.bs += n
    if stream.bs >= stream.bi:
      # read entire page; recycle it
      stream.bs = 0
      stream.bi = 0
    return n
  else:
    # multiple pages:
    # stream.bs ..< stream.buflen
    # 0 ..< stream.buflen
    # ...
    # 0 ..< stream.bi
    let a = (stream.buflen - stream.bs) * sizeof(stream.bufs[0][0])
    if a < olen:
      if a > 0:
        # bs may equal buflen if an earlier call consumed exactly one whole
        # page; indexing the page at bs would then be out of bounds.
        copyMem(addr oq[0], addr stream.bufs[0][stream.bs], a)
      var ns = a
      stream.bs = 0
      var i = 1
      while i < stream.bufs.high:
        let n = min(stream.buflen * sizeof(stream.bufs[0][0]), olen - ns)
        copyMem(addr oq[ns], addr stream.bufs[i][0], n)
        ns += n
        if ns >= olen:
          # i'th buffer still has contents.
          stream.bs = n
          break
        stream.bs = 0
        inc i
      if ns < olen:
        # last page
        let n = min(stream.bi * sizeof(stream.bufs[0][0]), olen - ns)
        copyMem(addr oq[ns], addr stream.bufs[i][0], n)
        ns += n
        stream.bs = n
        if stream.bs >= stream.bi:
          # read entire page; recycle it
          stream.bs = 0
          stream.bi = 0
      # drop the pages that have been read entirely
      for j in i ..< stream.bufs.len:
        stream.bufs[j - i] = stream.bufs[j]
      stream.bufs.setLen(stream.bufs.len - i)
      return ns
    elif a > olen:
      copyMem(addr oq[0], addr stream.bufs[0][stream.bs], olen)
      stream.bs += olen
      assert stream.bs < stream.buflen
      return olen
    else: # a == olen
      copyMem(addr oq[0], addr stream.bufs[0][stream.bs], a)
      stream.bs = 0
      stream.bufs.delete(0)
      return a

# Mark the stream as ended once the source is exhausted and no buffered
# output remains. (oq, olen and n are currently unused.)
proc checkEnd(stream: EncoderStream, oq: ptr UncheckedArray[uint8], olen: int,
    n: var int) =
  if not stream.isend and stream.bufs.len == 1 and
      stream.bs >= stream.bi and stream.source.atEnd:
    stream.isend = true

const ReadSize = 4096

# Fill buffer with up to olen encoded bytes: first any output left over from
# previous calls, then the encoding of at most ReadSize bytes read from the
# source. Output that does not fit is kept for the next call.
# Returns the number of bytes written to buffer.
proc readData*(stream: EncoderStream, buffer: pointer, olen: int): int =
  if olen == 0: return
  let oq = cast[ptr UncheckedArray[uint8]](buffer)
  result = stream.copyBuffers(oq, olen)
  let olen = olen - result
  if olen == 0 or stream.source.atEnd:
    # either output filled with buffered data; nothing to decode
    # or we're at the end of the source stream
    stream.checkEnd(oq, olen, result)
    return result
  var iq = newSeqUninitialized[uint32](ReadSize div sizeof(uint32))
  let ilen0 = stream.source.readData(cast[pointer](addr iq[0]), ReadSize)
  assert ilen0 mod sizeof(uint32) == 0 #TODO what to do if false?
  # Only the first ilen items of iq hold input; the rest is uninitialized.
  let ilen = ilen0 div sizeof(uint32)
  case stream.charset
  of CHARSET_UTF_8: stream.encodeUTF8(iq, oq, ilen, olen, result)
  of CHARSET_IBM866: stream.encodeSingleByte(iq, oq, ilen, olen, result, IBM866Encode)
  of CHARSET_ISO_8859_2: stream.encodeSingleByte(iq, oq, ilen, olen, result, ISO88592Encode)
  of CHARSET_ISO_8859_3: stream.encodeSingleByte(iq, oq, ilen, olen, result, ISO88593Encode)
  of CHARSET_ISO_8859_4: stream.encodeSingleByte(iq, oq, ilen, olen, result, ISO88594Encode)
  of CHARSET_ISO_8859_5: stream.encodeSingleByte(iq, oq, ilen, olen, result, ISO88595Encode)
  of CHARSET_ISO_8859_6: stream.encodeSingleByte(iq, oq, ilen, olen, result, ISO88596Encode)
  of CHARSET_ISO_8859_7: stream.encodeSingleByte(iq, oq, ilen, olen, result, ISO88597Encode)
  of CHARSET_ISO_8859_8,
     CHARSET_ISO_8859_8_I: stream.encodeSingleByte(iq, oq, ilen, olen, result, ISO88598Encode)
  of CHARSET_ISO_8859_10: stream.encodeSingleByte(iq, oq, ilen, olen, result, ISO885910Encode)
  of CHARSET_ISO_8859_13: stream.encodeSingleByte(iq, oq, ilen, olen, result, ISO885913Encode)
  of CHARSET_ISO_8859_14: stream.encodeSingleByte(iq, oq, ilen, olen, result, ISO885914Encode)
  of CHARSET_ISO_8859_15: stream.encodeSingleByte(iq, oq, ilen, olen, result, ISO885915Encode)
  of CHARSET_ISO_8859_16: stream.encodeSingleByte(iq, oq, ilen, olen, result, ISO885916Encode)
  of CHARSET_KOI8_R: stream.encodeSingleByte(iq, oq, ilen, olen, result, KOI8REncode)
  of CHARSET_KOI8_U: stream.encodeSingleByte(iq, oq, ilen, olen, result, KOI8UEncode)
  of CHARSET_MACINTOSH: stream.encodeSingleByte(iq, oq, ilen, olen, result, MacintoshEncode)
  of CHARSET_WINDOWS_874: stream.encodeSingleByte(iq, oq, ilen, olen, result, Windows874Encode)
  of CHARSET_WINDOWS_1250: stream.encodeSingleByte(iq, oq, ilen, olen, result, Windows1250Encode)
  of CHARSET_WINDOWS_1251: stream.encodeSingleByte(iq, oq, ilen, olen, result, Windows1251Encode)
  of CHARSET_WINDOWS_1252: stream.encodeSingleByte(iq, oq, ilen, olen, result, Windows1252Encode)
  of CHARSET_WINDOWS_1253: stream.encodeSingleByte(iq, oq, ilen, olen, result, Windows1253Encode)
  of CHARSET_WINDOWS_1254: stream.encodeSingleByte(iq, oq, ilen, olen, result, Windows1254Encode)
  of CHARSET_WINDOWS_1255: stream.encodeSingleByte(iq, oq, ilen, olen, result, Windows1255Encode)
  of CHARSET_WINDOWS_1256: stream.encodeSingleByte(iq, oq, ilen, olen, result, Windows1256Encode)
  of CHARSET_WINDOWS_1257: stream.encodeSingleByte(iq, oq, ilen, olen, result, Windows1257Encode)
  of CHARSET_WINDOWS_1258: stream.encodeSingleByte(iq, oq, ilen, olen, result, Windows1258Encode)
  of CHARSET_X_MAC_CYRILLIC: stream.encodeSingleByte(iq, oq, ilen, olen, result, XMacCyrillicEncode)
  of CHARSET_GBK: stream.encodeGb18030(iq, oq, ilen, olen, result, true)
  of CHARSET_GB18030: stream.encodeGb18030(iq, oq, ilen, olen, result)
  of CHARSET_BIG5: stream.encodeBig5(iq, oq, ilen, olen, result)
  of CHARSET_EUC_JP: stream.encodeEUCJP(iq, oq, ilen, olen, result)
#  of CHARSET_ISO_2022_JP: stream.decodeISO2022JP(iq, oq, ilen, olen, result)
#  of CHARSET_SHIFT_JIS: stream.decodeShiftJIS(iq, oq, ilen, olen, result)
#  of CHARSET_EUC_KR: stream.decodeEUCKR(iq, oq, ilen, olen, result)
#  of CHARSET_REPLACEMENT: stream.decodeReplacement(oq, olen, result)
#  of CHARSET_UTF_16_LE: stream.decodeUTF16LE(iq, oq, ilen, olen, result)
#  of CHARSET_UTF_16_BE: stream.decodeUTF16BE(iq, oq, ilen, olen, result)
  of CHARSET_X_USER_DEFINED: stream.encodeXUserDefined(iq, oq, ilen, olen, result)
  of CHARSET_UNKNOWN: assert false, "Somebody forgot to set the character set here"
  else: assert false, "TODO"
  stream.checkEnd(oq, olen, result)

# Returns the number of bytes read.
proc readData*(stream: EncoderStream, buf: var seq[uint8]): int =
  if buf.len == 0:
    # nothing to fill, and addr buf[0] would be out of bounds
    return 0
  return stream.readData(addr buf[0], buf.len * sizeof(buf[0]))

proc atEnd*(stream: EncoderStream): bool =
  return stream.isend

# Read and encode the entire source, returning the encoded bytes as a string.
proc readAll*(stream: EncoderStream): string =
  var buf = newString(4096)
  while not stream.atEnd:
    let olen = stream.readData(addr buf[0], buf.len)
    if olen == 0:
      # no progress was made; stop instead of spinning
      break
    # A short read does not mean that the stream has ended: a single readData
    # call encodes at most ReadSize bytes of input. Keep going until atEnd.
    result &= buf.substr(0, olen - 1)

# Create an EncoderStream that reads utf-32 from source and encodes it to cs.
# buflen is the size of each overflow page.
proc newEncoderStream*(source: Stream, cs = CHARSET_UTF_8, buflen = 4096,
    errormode: EncoderErrorMode = ENCODER_ERROR_MODE_HTML): EncoderStream =
  result = EncoderStream(
    source: source,
    charset: cs,
    buflen: buflen,
    errormode: errormode
  )
  when nimvm:
    result.bufs = @[newSeq[uint8](buflen)]
  else:
    result.bufs = @[newSeqUninitialized[uint8](buflen)]