diff options
author | bptato <nincsnevem662@gmail.com> | 2023-05-20 01:06:46 +0200 |
---|---|---|
committer | bptato <nincsnevem662@gmail.com> | 2023-05-20 01:06:56 +0200 |
commit | a3a3563a2cecce3b8091252e37d55bf07c123f48 (patch) | |
tree | 968372d2ff14630e7681eb9f8517ff647fd64e94 /src | |
parent | 26e8968a6499742cf37e00292a7d1c8ed620cad5 (diff) | |
download | chawan-a3a3563a2cecce3b8091252e37d55bf07c123f48.tar.gz |
Support all whatwg encodings
Diffstat (limited to 'src')
-rw-r--r-- | src/data/charset.nim | 7 | ||||
-rw-r--r-- | src/encoding/encoderstream.nim | 161 |
2 files changed, 143 insertions, 25 deletions
diff --git a/src/data/charset.nim b/src/data/charset.nim index 45d7786c..560e8643 100644 --- a/src/data/charset.nim +++ b/src/data/charset.nim @@ -407,6 +407,12 @@ func loadCharsetMap16(path: string, len: static uint16): tuple[ result.encode.add((uint16(n), uint16(index))) result.encode.sort() +func loadCharsetMapSJIS(path: string): seq[UCS16x16] = + for index, n in mappairs("res/map" / path): + if n notin 8272..8835: + result.add((uint16(n), uint16(index))) + result.sort() + type UCS32x16* = tuple[ucs: uint32, p: uint16] func loadBig5Map(path: string, offset: static uint16): tuple[ @@ -451,6 +457,7 @@ const (Gb18030Decode*, Gb18030Encode*) = loadCharsetMap16("index-gb18030.txt", l const Big5DecodeOffset* = 942 const (Big5Decode*, Big5Encode*) = loadBig5Map("index-big5.txt", offset = Big5DecodeOffset) const (Jis0208Decode*, Jis0208Encode*) = loadCharsetMap16("index-jis0208.txt", len = 11104) +const ShiftJISEncode* = loadCharsetMapSJIS("index-jis0208.txt") const (Jis0212Decode*, Jis0212Encode*) = loadCharsetMap16("index-jis0212.txt", len = 7211) const ISO2022JPKatakanaEncode* = loadCharsetMap8Encode("index-iso-2022-jp-katakana.txt") const (EUCKRDecode*, EUCKREncode*) = loadCharsetMap16("index-euc-kr.txt", len = 23750) diff --git a/src/encoding/encoderstream.nim b/src/encoding/encoderstream.nim index 155b7d31..57f27dd1 100644 --- a/src/encoding/encoderstream.nim +++ b/src/encoding/encoderstream.nim @@ -24,7 +24,10 @@ type bi: int buflen: int errc: uint32 - charset: Charset + case charset: Charset + of CHARSET_ISO_2022_JP: + iso2022jpstate: ISO2022JPState + else: discard template append_byte_buf(stream: EncoderStream, c: uint8) = if stream.bi >= stream.buflen: @@ -53,6 +56,14 @@ template append_byte(stream: EncoderStream, c: int, oq: ptr UncheckedArray[uint8], olen: int, n: var int) = stream.append_byte cast[uint8](c), oq, olen, n +func findPair[U, V](map: seq[(U, V)], c: uint32): int = + when not (typeof(c) is U): + if c > cast[typeof(c)](high(U)): + return -1 + return binarySearch(map, cast[U](c), + proc(a: tuple[ucs: U, val: V], b: U): int = + cmp(a.ucs, b)) + proc handleError(stream: EncoderStream, oq: ptr UncheckedArray[uint8], olen: int, n: var int, c: uint32) = case stream.errormode @@ -130,14 +141,11 @@ proc encodeSingleByte(stream: EncoderStream, iq: var seq[uint32], if c < 0x80: stream.append_byte cast[uint8](c), oq, olen, n continue - if c <= 0xFFFF: - let j = binarySearch(map, cast[uint16](c), - proc(a: tuple[ucs: uint16, val: char], b: uint16): int = - cmp(a.ucs, b)) - if j != -1: - stream.append_byte cast[uint8](map[j].val) + 0x80, oq, olen, n - continue - stream.handleError(oq, olen, n, c) + let j = findPair(map, c) + if j != -1: + stream.append_byte cast[uint8](map[j].val) + 0x80, oq, olen, n + else: + stream.handleError(oq, olen, n, c) proc encodeXUserDefined(stream: EncoderStream, iq: var seq[uint32], oq: ptr UncheckedArray[uint8], ilen, olen: int, n: var int) = @@ -159,9 +167,7 @@ proc encodeGb18030(stream: EncoderStream, iq: var seq[uint32], if isGBK and c == 0x20AC: stream.append_byte 0x80, oq, olen, n continue - let i = if c > 0xFFFF: -1 else: binarySearch(Gb18030Encode, cast[uint16](c), - proc(a: UCS16x16, b: uint16): int = - cmp(a.ucs, b)) + let i = if c > 0xFFFF: -1 else: findPair(Gb18030Encode, c) if i != -1: let p = Gb18030Encode[i].p let lead = p div 190 + 0x81 @@ -191,9 +197,7 @@ proc encodeBig5(stream: EncoderStream, iq: var seq[uint32], if c < 0x80: stream.append_byte c, oq, olen, n continue - let i = binarySearch(Big5Encode, cast[uint16](c), - proc(a: UCS32x16, b: uint16): int = - cmp(a.ucs, b)) + let i = findPair(Big5Encode, c) if i == -1: stream.handleError(oq, olen, n, c) continue @@ -221,9 +225,7 @@ proc encodeEUCJP(stream: EncoderStream, iq: var seq[uint32], 0xFF0Du32 else: c - let i = binarySearch(Jis0208Encode, cast[uint16](c), - proc(a: UCS16x16, b: uint16): int = - cmp(a.ucs, b)) + let i = findPair(Jis0208Encode, c) if i != -1: let p = Jis0208Encode[i].p let lead = p div 94 + 0xA1 @@ -233,6 +235,111 @@ proc encodeEUCJP(stream: EncoderStream, iq: var seq[uint32], else: stream.handleError(oq, olen, n, c) +proc encodeISO2022JP(stream: EncoderStream, iq: var seq[uint32], + oq: ptr UncheckedArray[uint8], ilen, olen: int, n: var int) = + var state = stream.iso2022jpstate + var i = 0 + while i < ilen: + let c = iq[i] + if state in {STATE_ASCII, STATE_ROMAN} and + c in [0x0Eu32, 0x0Fu32, 0x1Bu32]: + stream.handleError(oq, olen, n, 0xFFFD) + elif state == STATE_ASCII and c < 0x80 and c notin [0x5Cu32, 0x7Eu32] or + c == 0xA5 or c == 0x203E: + if c < 0x80: + stream.append_byte c, oq, olen, n + elif c == 0xA5: + stream.append_byte 0xA5, oq, olen, n + elif c == 0x203E: + stream.append_byte 0x7E, oq, olen, n + elif c < 0x80 and state != STATE_ASCII: + state = STATE_ASCII + stream.append_byte 0x1B, oq, olen, n + stream.append_byte 0x28, oq, olen, n + stream.append_byte 0x42, oq, olen, n + # prepend + continue + elif c == 0xA5 or c == 0x203E and state != STATE_ROMAN: + state = STATE_ROMAN + stream.append_byte 0x1B, oq, olen, n + stream.append_byte 0x28, oq, olen, n + stream.append_byte 0x4A, oq, olen, n + # prepend + continue + else: + var c = c + if c == 0x2212: + c = 0xFF0D + if c in 0xFF61u32..0xFF9Fu32: + let j = findPair(ISO2022JPKatakanaEncode, c - 0xFF61) + c = ISO2022JPKatakanaEncode[j].ucs + let j = findPair(Jis0208Encode, c) + if j == -1: + if state == STATE_JIS0208: + state = STATE_ASCII + stream.append_byte 0x1B, oq, olen, n + stream.append_byte 0x28, oq, olen, n + stream.append_byte 0x42, oq, olen, n + # prepend + continue + stream.handleError(oq, olen, n, c) + else: + let p = Jis0208Encode[j].p + if state != STATE_JIS0208: + state = STATE_JIS0208 + stream.append_byte 0x1B, oq, olen, n + stream.append_byte 0x24, oq, olen, n + stream.append_byte 0x42, oq, olen, n + # prepend + continue + let lead = p div 94 + 0x21 + let trail = p mod 94 + 0x21 + stream.append_byte lead, oq, olen, n + stream.append_byte trail, oq, olen, n + inc i + stream.iso2022jpstate = state + +proc encodeShiftJIS(stream: EncoderStream, iq: var seq[uint32], + oq: ptr UncheckedArray[uint8], ilen, olen: int, n: var int) = + for c in iq: + if c <= 0x80: + stream.append_byte c, oq, olen, n + elif c == 0xA5: + stream.append_byte 0x5C, oq, olen, n + elif c == 0x203E: + stream.append_byte 0x7E, oq, olen, n + elif c in 0xFF61u32..0xFF9Fu32: + stream.append_byte c - 0xFF61 + 0xA1, oq, olen, n + else: + let c = if c == 0x2212: 0xFF0Du32 else: c + let j = findPair(ShiftJISEncode, c) + if j == -1: + stream.handleError(oq, olen, n, c) + else: + let p = ShiftJISEncode[j].p + let lead = p div 188 + let lead_offset = if lead < 0x1F: 0x81u16 else: 0xC1u16 + let trail = p mod 188 + let offset = if trail < 0x3F: 0x40u16 else: 0x41u16 + stream.append_byte lead + lead_offset, oq, olen, n + stream.append_byte trail + offset, oq, olen, n + +proc encodeEUCKR(stream: EncoderStream, iq: var seq[uint32], + oq: ptr UncheckedArray[uint8], ilen, olen: int, n: var int) = + for c in iq: + if c < 0x80: + stream.append_byte c, oq, olen, n + else: + let i = findPair(Jis0208Encode, c) + if i != -1: + let p = Jis0208Encode[i].p + let lead = p div 190 + 0x81 + let trail = p mod 190 + 0x41 + stream.append_byte lead, oq, olen, n + stream.append_byte trail, oq, olen, n + else: + stream.handleError(oq, olen, n, c) + # copy any data remaining from previous passes proc copyBuffers(stream: EncoderStream, oq: ptr UncheckedArray[uint8], olen: int): int = if stream.bufs.len == 1: @@ -297,6 +404,11 @@ proc checkEnd(stream: EncoderStream, oq: ptr UncheckedArray[uint8], olen: int, if not stream.isend and stream.bufs.len == 1 and stream.bs >= stream.bi and stream.source.atEnd: stream.isend = true + if stream.charset == CHARSET_ISO_2022_JP: + if stream.iso2022jpstate != STATE_ASCII: + stream.append_byte 0x1B, oq, olen, n + stream.append_byte 0x28, oq, olen, n + stream.append_byte 0x42, oq, olen, n const ReadSize = 4096 proc readData*(stream: EncoderStream, buffer: pointer, olen: int): int = @@ -313,6 +425,7 @@ proc readData*(stream: EncoderStream, buffer: pointer, olen: int): int = let ilen0 = stream.source.readData(cast[pointer](addr iq[0]), ReadSize) assert ilen0 mod sizeof(uint32) == 0 #TODO what to do if false? let ilen = ilen0 div sizeof(uint32) + iq.setLen(ilen) case stream.charset of CHARSET_UTF_8: stream.encodeUTF8(iq, oq, ilen, olen, result) of CHARSET_IBM866: stream.encodeSingleByte(iq, oq, ilen, olen, result, IBM866Encode) @@ -347,15 +460,12 @@ proc readData*(stream: EncoderStream, buffer: pointer, olen: int): int = of CHARSET_GB18030: stream.encodeGb18030(iq, oq, ilen, olen, result) of CHARSET_BIG5: stream.encodeBig5(iq, oq, ilen, olen, result) of CHARSET_EUC_JP: stream.encodeEUCJP(iq, oq, ilen, olen, result) -# of CHARSET_ISO_2022_JP: stream.decodeISO2022JP(iq, oq, ilen, olen, result) -# of CHARSET_SHIFT_JIS: stream.decodeShiftJIS(iq, oq, ilen, olen, result) -# of CHARSET_EUC_KR: stream.decodeEUCKR(iq, oq, ilen, olen, result) -# of CHARSET_REPLACEMENT: stream.decodeReplacement(oq, olen, result) -# of CHARSET_UTF_16_LE: stream.decodeUTF16LE(iq, oq, ilen, olen, result) -# of CHARSET_UTF_16_BE: stream.decodeUTF16BE(iq, oq, ilen, olen, result) + of CHARSET_ISO_2022_JP: stream.encodeISO2022JP(iq, oq, ilen, olen, result) + of CHARSET_SHIFT_JIS: stream.encodeShiftJIS(iq, oq, ilen, olen, result) + of CHARSET_EUC_KR: stream.encodeEUCKR(iq, oq, ilen, olen, result) of CHARSET_X_USER_DEFINED: stream.encodeXUserDefined(iq, oq, ilen, olen, result) of CHARSET_UNKNOWN: assert false, "Somebody forgot to set the character set here" - else: assert false, "TODO" + else: discard stream.checkEnd(oq, olen, result) # Returns the number of bytes read. @@ -383,6 +493,7 @@ proc newEncoderStream*(source: Stream, cs = CHARSET_UTF_8, buflen = 4096, buflen: buflen, errormode: errormode ) + doAssert cs notin {CHARSET_UTF_16_LE, CHARSET_UTF_16_BE, CHARSET_REPLACEMENT} when nimvm: result.bufs = @[newSeq[uint8](buflen)] else: |