about summary refs log tree commit diff stats
path: root/src/encoding
diff options
context:
space:
mode:
author bptato <nincsnevem662@gmail.com> 2023-05-19 01:50:17 +0200
committer bptato <nincsnevem662@gmail.com> 2023-05-19 01:50:17 +0200
commit 26e8968a6499742cf37e00292a7d1c8ed620cad5 (patch)
tree a3922f02f09b5c025dddcfe0e7a3a719c47ba4da /src/encoding
parent dac6a09c14b258ed725dcb265305a6445edc02ad (diff)
download chawan-26e8968a6499742cf37e00292a7d1c8ed620cad5.tar.gz
Add display/output encoding
Some encodings are still missing
Diffstat (limited to 'src/encoding')
-rw-r--r-- src/encoding/decoderstream.nim 19
-rw-r--r-- src/encoding/encoderstream.nim 389
2 files changed, 393 insertions, 15 deletions
diff --git a/src/encoding/decoderstream.nim b/src/encoding/decoderstream.nim
index 425f264f..8bfd4d10 100644
--- a/src/encoding/decoderstream.nim
+++ b/src/encoding/decoderstream.nim
@@ -8,8 +8,7 @@ import utils/twtstr
 # DecoderStream decodes any encoding to valid utf-32.
 type
   DecoderErrorMode* = enum
-    DECODER_ERROR_MODE_FATAL, DECODER_ERROR_MODE_REPLACEMENT,
-    DECODER_ERROR_MODE_HTML
+    DECODER_ERROR_MODE_FATAL, DECODER_ERROR_MODE_REPLACEMENT
 
   ISO2022JPState = enum
     STATE_ASCII, STATE_ROMAN, STATE_KATAKANA, STATE_LEAD_BYTE,
@@ -27,6 +26,7 @@ type
     c: uint32
     case charset: Charset
     of CHARSET_UTF_8:
+      u8c: uint32
       u8needed: int
       u8seen: int
       u8bounds: Slice[uint8]
@@ -83,22 +83,11 @@ proc handleError(stream: DecoderStream, oq: ptr UncheckedArray[uint32], olen: in
   of DECODER_ERROR_MODE_FATAL:
     stream.isend = true
     stream.failed = true
-  of DECODER_ERROR_MODE_HTML:
-    if stream.charset == CHARSET_UTF_8:
-      # "html" mode is handled as "replacement" for utf-8.
-      stream.append_codepoint 0xFFFD, oq, olen, n
-    else:
-      stream.append_codepoint '&', oq, olen, n
-      stream.append_codepoint '#', oq, olen, n
-      while stream.c > 0:
-        stream.append_codepoint cast[char](0x30 + stream.c mod 10), oq, olen, n
-        stream.c = stream.c div 10
-      stream.append_codepoint ';', oq, olen, n
   of DECODER_ERROR_MODE_REPLACEMENT:
     stream.append_codepoint 0xFFFD, oq, olen, n
 
 proc decodeUTF8(stream: DecoderStream, iq: var seq[uint8], oq: ptr UncheckedArray[uint32], ilen, olen: int, n: var int) =
-  var c = stream.c
+  var c = stream.u8c
   var needed = stream.u8needed
   var seen = stream.u8seen
   var bounds = stream.u8bounds
@@ -156,7 +145,7 @@ proc decodeUTF8(stream: DecoderStream, iq: var seq[uint8], oq: ptr UncheckedArra
       needed = 0
       seen = 0
     inc i
-  stream.c = c
+  stream.u8c = c
   stream.u8bounds = bounds
   stream.u8seen = seen
   stream.u8needed = needed
diff --git a/src/encoding/encoderstream.nim b/src/encoding/encoderstream.nim
new file mode 100644
index 00000000..155b7d31
--- /dev/null
+++ b/src/encoding/encoderstream.nim
@@ -0,0 +1,389 @@
+# Heavily based on https://encoding.spec.whatwg.org/
+
+import algorithm
+import streams
+import unicode
+
+import data/charset
+
+# EncoderStream encodes utf-32 to the specified encoding.
+type
+  # What handleError does when a code point cannot be encoded: either abort
+  # the stream (FATAL) or emit a decimal character reference (HTML).
+  EncoderErrorMode* = enum
+    ENCODER_ERROR_MODE_FATAL, ENCODER_ERROR_MODE_HTML
+
+  # NOTE(review): not referenced by any encoder visible in this file;
+  # presumably reserved for a future ISO-2022-JP encoder - confirm.
+  ISO2022JPState = enum
+    STATE_ASCII, STATE_ROMAN, STATE_JIS0208
+
+  EncoderStream* = ref object
+    source: Stream # input stream; readData reads raw uint32 values from it
+    errormode: EncoderErrorMode # policy applied by handleError
+    isend: bool # reported by atEnd; set by checkEnd or by a fatal error
+    failed*: bool # set when a fatal encoding error occurred
+    bufs: seq[seq[uint8]] # overflow pages for bytes that did not fit the output
+    bs: int # read offset into the first page (bufs[0])
+    bi: int # write offset into the last page (bufs[^1])
+    buflen: int # length of each overflow page
+    # NOTE(review): read by handleError but never assigned in the visible
+    # code - confirm whether this field is still needed.
+    errc: uint32
+    charset: Charset # target encoding, selects the encoder in readData
+
+# Stores one encoded byte in the overflow pages, opening a fresh page when
+# the last one is full.
+template append_byte_buf(stream: EncoderStream, c: uint8) =
+  if stream.buflen <= stream.bi:
+    # last page is full; continue at the start of a new one
+    stream.bi = 0
+    stream.bufs.add(newSeqUninitialized[uint8](stream.buflen))
+  stream.bufs[stream.bufs.high][stream.bi] = c
+  stream.bi += 1
+
+# Emits one byte: directly into the caller's output buffer while it has room
+# (n is the next free index, olen its length), otherwise into the overflow
+# pages.
+template append_byte(stream: EncoderStream, c: uint8,
+    oq: ptr UncheckedArray[uint8], olen: int, n: var int) =
+  if n >= olen:
+    append_byte_buf stream, c
+  else:
+    oq[n] = c
+    n += 1
+
+# Convenience overload: the char is reinterpreted as a byte.
+template append_byte(stream: EncoderStream, c: char,
+    oq: ptr UncheckedArray[uint8], olen: int, n: var int) =
+  append_byte(stream, cast[uint8](c), oq, olen, n)
+
+# Convenience overload: only the low 8 bits of the value are emitted.
+template append_byte(stream: EncoderStream, c: uint32,
+    oq: ptr UncheckedArray[uint8], olen: int, n: var int) =
+  append_byte(stream, cast[uint8](c), oq, olen, n)
+
+# Convenience overload: only the low 8 bits of the value are emitted.
+template append_byte(stream: EncoderStream, c: int,
+    oq: ptr UncheckedArray[uint8], olen: int, n: var int) =
+  append_byte(stream, cast[uint8](c), oq, olen, n)
+
+# Handles a code point `c` that the target charset cannot represent.
+# In FATAL mode the stream is marked as ended and failed.
+# In HTML mode the decimal character reference "&#<c>;" is emitted through
+# append_byte (oq/olen/n are the output buffer, its length and the next free
+# index).
+proc handleError(stream: EncoderStream, oq: ptr UncheckedArray[uint8],
+    olen: int, n: var int, c: uint32) =
+  case stream.errormode
+  of ENCODER_ERROR_MODE_FATAL:
+    stream.isend = true
+    stream.failed = true
+  of ENCODER_ERROR_MODE_HTML:
+    # Collect the decimal digits of c, least significant first; a uint32 has
+    # at most 10 of them. The loop runs at least once so that 0 yields "0".
+    var digits: array[10, char]
+    var ndigits = 0
+    var rest = c
+    while true:
+      digits[ndigits] = cast[char](0x30 + rest mod 10)
+      inc ndigits
+      rest = rest div 10
+      if rest == 0:
+        break
+    stream.append_byte '&', oq, olen, n
+    stream.append_byte '#', oq, olen, n
+    # Emit most significant digit first.
+    for k in countdown(ndigits - 1, 0):
+      stream.append_byte digits[k], oq, olen, n
+    stream.append_byte ';', oq, olen, n
+
+# Computes the "index gb18030 ranges pointer" for code point c: take the last
+# range entry whose code point is <= c and add c's distance from that entry's
+# code point to the entry's pointer.
+proc gb18030RangesPointer(c: uint32): uint32 =
+  if c == 0xE7C7:
+    return 7457
+  let (base, basePointer) =
+    if c >= 0x10000:
+      # this entry is not stored in the table, to save space
+      (0x10000u32, 189000u32)
+    elif c >= 0xFFE6:
+      # handled separately: upperBound yields the first entry greater than
+      # the key OR the last index on failure, so "index - 1" would be
+      # ambiguous for the final range
+      (0xFFE6u32, 39394u32)
+    else:
+      # upperBound finds the first entry greater than c; the entry we want
+      # (the last one <= c) sits directly before it.
+      let next = upperBound(Gb18030RangesEncode, c,
+        func(a: tuple[ucs, p: uint16], b: uint32): int =
+          cmp(cast[uint32](a.ucs), b))
+      let entry = Gb18030RangesEncode[next - 1]
+      (uint32(entry.ucs), uint32(entry.p))
+  return basePointer + c - base
+
+# Encodes the first ilen code points of iq as UTF-8.
+# Output goes through append_byte: into oq while n < olen, then into the
+# stream's overflow pages.
+proc encodeUTF8(stream: EncoderStream, iq: var seq[uint32],
+    oq: ptr UncheckedArray[uint8], ilen, olen: int, n: var int) =
+  for i in 0 ..< ilen:
+    let c = iq[i]
+    if c < 0x80:
+      # ASCII is encoded as a single identical byte
+      stream.append_byte c, oq, olen, n
+      continue
+    # count: number of continuation bytes; offset: marker bits of lead byte
+    var count: int
+    var offset: uint32
+    case c
+    of 0x0080..0x07FF:
+      count = 1
+      offset = 0xC0
+    of 0x0800..0xFFFF:
+      count = 2
+      offset = 0xE0
+    of 0x10000..0x10FFFF:
+      count = 3
+      offset = 0xF0
+    else:
+      # not a Unicode code point; emit nothing for it
+      assert false, "code point outside of the Unicode range"
+      continue
+    stream.append_byte((c shr (6 * count)) + offset, oq, olen, n)
+    while count > 0:
+      # continuation bytes carry 6 bits each, most significant group first
+      let tmp = c shr (6 * (count - 1))
+      stream.append_byte 0x80 or (tmp and 0x3F), oq, olen, n
+      dec count
+
+# Encodes the first ilen code points of iq with a single-byte charset.
+# ASCII is passed through; anything else is looked up in `map` (binary search
+# on the code point) and emitted as the mapped value plus 0x80. Code points
+# without a mapping are passed to handleError.
+proc encodeSingleByte(stream: EncoderStream, iq: var seq[uint32],
+    oq: ptr UncheckedArray[uint8], ilen, olen: int, n: var int,
+    map: seq[tuple[ucs: uint16, val: char]]) =
+  var i = 0
+  while i < ilen:
+    let cp = iq[i]
+    inc i
+    if cp < 0x80:
+      stream.append_byte cast[uint8](cp), oq, olen, n
+      continue
+    # the table only holds 16-bit code points
+    var found = -1
+    if cp <= 0xFFFF:
+      found = binarySearch(map, cast[uint16](cp),
+        proc(a: tuple[ucs: uint16, val: char], b: uint16): int =
+          cmp(a.ucs, b))
+    if found == -1:
+      stream.handleError(oq, olen, n, cp)
+    else:
+      stream.append_byte cast[uint8](map[found].val) + 0x80, oq, olen, n
+
+# Encodes the first ilen code points of iq as x-user-defined: ASCII is passed
+# through, U+F780..U+F7FF become bytes 0x80..0xFF, and everything else is
+# passed to handleError.
+proc encodeXUserDefined(stream: EncoderStream, iq: var seq[uint32],
+    oq: ptr UncheckedArray[uint8], ilen, olen: int, n: var int) =
+  var i = 0
+  while i < ilen:
+    let cp = iq[i]
+    inc i
+    if cp < 0x80:
+      stream.append_byte cast[uint8](cp), oq, olen, n
+    elif cp >= 0xF780u32 and cp <= 0xF7FFu32:
+      let mapped = cast[uint8](cp - 0xF780 + 0x80)
+      stream.append_byte mapped, oq, olen, n
+    else:
+      stream.handleError(oq, olen, n, cp)
+
+# Encodes the first ilen code points of iq as gb18030, or as GBK when isGBK
+# is set. Follows the gb18030 encoder of the WHATWG Encoding Standard:
+# ASCII -> one byte; code points in the two-byte index -> lead/trail pair;
+# everything else -> four bytes (gb18030) or an error (GBK).
+proc encodeGb18030(stream: EncoderStream, iq: var seq[uint32],
+    oq: ptr UncheckedArray[uint8], ilen, olen: int, n: var int,
+    isGBK = false) =
+  # Only the first ilen entries of iq hold valid input.
+  for k in 0 ..< ilen:
+    let c = iq[k]
+    if c < 0x80:
+      stream.append_byte c, oq, olen, n
+      continue
+    if c == 0xE5E5:
+      # the standard defines this code point as unencodable
+      stream.handleError(oq, olen, n, c)
+      continue
+    if isGBK and c == 0x20AC:
+      stream.append_byte 0x80, oq, olen, n
+      continue
+    # the two-byte index only holds 16-bit code points
+    var i = -1
+    if c <= 0xFFFF:
+      i = binarySearch(Gb18030Encode, cast[uint16](c),
+        proc(a: UCS16x16, b: uint16): int =
+          cmp(a.ucs, b))
+    if i != -1:
+      let p = Gb18030Encode[i].p
+      let lead = p div 190 + 0x81
+      let trail = p mod 190
+      let offset: uint8 = if trail < 0x3F: 0x40 else: 0x41
+      stream.append_byte lead, oq, olen, n
+      stream.append_byte cast[uint8](trail) + offset, oq, olen, n
+      continue
+    if isGBK:
+      # GBK has no four-byte form
+      stream.handleError(oq, olen, n, c)
+      continue
+    var p = gb18030RangesPointer(c)
+    let b1 = p div (10 * 126 * 10)
+    p = p mod (10 * 126 * 10)
+    let b2 = p div (10 * 126)
+    p = p mod (10 * 126)
+    let b3 = p div 10
+    let b4 = p mod 10
+    # bytes 1 and 3 are offset by 0x81, bytes 2 and 4 by 0x30
+    stream.append_byte b1 + 0x81, oq, olen, n
+    stream.append_byte b2 + 0x30, oq, olen, n
+    stream.append_byte b3 + 0x81, oq, olen, n
+    stream.append_byte b4 + 0x30, oq, olen, n
+
+# Encodes the first ilen code points of iq as Big5. ASCII is passed through;
+# other code points are looked up in Big5Encode and emitted as a lead/trail
+# byte pair, or passed to handleError when they have no mapping.
+proc encodeBig5(stream: EncoderStream, iq: var seq[uint32],
+    oq: ptr UncheckedArray[uint8], ilen, olen: int, n: var int) =
+  # Only the first ilen entries of iq hold valid input.
+  for k in 0 ..< ilen:
+    let c = iq[k]
+    if c < 0x80:
+      stream.append_byte c, oq, olen, n
+      continue
+    # Search with the full 32-bit code point; truncating it to 16 bits would
+    # make code points above U+FFFF match unrelated entries.
+    let i = binarySearch(Big5Encode, c,
+      proc(a: UCS32x16, b: uint32): int =
+        cmp(uint32(a.ucs), b))
+    if i == -1:
+      stream.handleError(oq, olen, n, c)
+      continue
+    let p = Big5Encode[i].p
+    let lead = p div 157 + 0x81
+    let trail = p mod 157
+    let offset: uint8 = if trail < 0x3F: 0x40 else: 0x62
+    stream.append_byte lead, oq, olen, n
+    stream.append_byte cast[uint8](trail) + offset, oq, olen, n
+
+# Encodes the first ilen code points of iq as EUC-JP, following the EUC-JP
+# encoder of the WHATWG Encoding Standard. Code points without a mapping are
+# passed to handleError.
+proc encodeEUCJP(stream: EncoderStream, iq: var seq[uint32],
+    oq: ptr UncheckedArray[uint8], ilen, olen: int, n: var int) =
+  # Only the first ilen entries of iq hold valid input.
+  for k in 0 ..< ilen:
+    let c = iq[k]
+    if c < 0x80:
+      stream.append_byte c, oq, olen, n
+    elif c == 0xA5:
+      # YEN SIGN
+      stream.append_byte 0x5C, oq, olen, n
+    elif c == 0x203E:
+      # OVERLINE
+      stream.append_byte 0x7E, oq, olen, n
+    elif c in 0xFF61u32..0xFF9Fu32:
+      # half-width katakana: 0x8E prefix + one byte
+      stream.append_byte 0x8E, oq, olen, n
+      stream.append_byte c - 0xFF61 + 0xA1, oq, olen, n
+    else:
+      # MINUS SIGN is encoded as FULLWIDTH HYPHEN-MINUS
+      let c = if c == 0x2212:
+        0xFF0Du32
+      else:
+        c
+      # the JIS0208 index only holds 16-bit code points
+      var i = -1
+      if c <= 0xFFFF:
+        i = binarySearch(Jis0208Encode, cast[uint16](c),
+          proc(a: UCS16x16, b: uint16): int =
+            cmp(a.ucs, b))
+      if i != -1:
+        let p = Jis0208Encode[i].p
+        let lead = p div 94 + 0xA1
+        let trail = p mod 94 + 0xA1
+        stream.append_byte lead, oq, olen, n
+        stream.append_byte trail, oq, olen, n
+      else:
+        stream.handleError(oq, olen, n, c)
+
+# copy any data remaining from previous passes
+# Moves up to olen bytes from the overflow pages (stream.bufs) to oq and
+# returns how many bytes were copied. stream.bs is the read offset into the
+# first page and stream.bi the write offset into the last page; both are
+# updated, and fully consumed leading pages are dropped.
+proc copyBuffers(stream: EncoderStream, oq: ptr UncheckedArray[uint8], olen: int): int =
+  if stream.bufs.len == 1:
+    # one page: stream.bs ..< stream.bi
+    let n = min((stream.bi - stream.bs) * sizeof(stream.bufs[0][0]), olen)
+    copyMem(addr oq[0], addr stream.bufs[0][stream.bs], n)
+    stream.bs += n
+    if stream.bs >= stream.bi:
+      # read entire page; recycle it
+      stream.bs = 0
+      stream.bi = 0
+    return n
+  else:
+    # multiple pages:
+    # stream.bs ..< stream.buflen
+    # 0 ..< stream.buflen
+    # ...
+    # 0 ..< stream.bi
+    # a: number of unread bytes left in the first page
+    let a = (stream.buflen - stream.bs) * sizeof(stream.bufs[0][0])
+    if a < olen:
+      # the rest of the first page fits; continue with the following pages
+      copyMem(addr oq[0], addr stream.bufs[0][stream.bs], a)
+      # ns: number of bytes written to oq so far
+      var ns = a
+      stream.bs = 0
+      var i = 1
+      # middle pages (every page except the first and the last) are full
+      while i < stream.bufs.high:
+        let n = min(stream.buflen * sizeof(stream.bufs[0][0]), olen - ns)
+        copyMem(addr oq[ns], addr stream.bufs[i][0], n)
+        ns += n
+        if ns >= olen:
+          # i'th buffer still has contents.
+          # NOTE(review): if n == stream.buflen here the page was in fact
+          # fully consumed and bs ends up equal to the page length; the next
+          # call then evaluates stream.bufs[0][stream.bs], which looks out
+          # of range - confirm.
+          stream.bs = n
+          break
+        stream.bs = 0
+        inc i
+      if ns < olen:
+        # last page
+        let n = min(stream.bi * sizeof(stream.bufs[0][0]), olen - ns)
+        copyMem(addr oq[ns], addr stream.bufs[i][0], n)
+        ns += n
+        stream.bs = n
+        if stream.bs >= stream.bi:
+          # read entire page; recycle it
+          stream.bs = 0
+          stream.bi = 0
+      # drop the i fully consumed pages by shifting the rest to the front
+      for j in i ..< stream.bufs.len:
+        stream.bufs[j - i] = stream.bufs[j]
+      stream.bufs.setLen(stream.bufs.len - i)
+      return ns
+    elif a > olen:
+      # output is filled from the first page alone; the page keeps contents
+      copyMem(addr oq[0], addr stream.bufs[0][stream.bs], olen)
+      stream.bs += olen
+      assert stream.bs < stream.buflen
+      return olen
+    else: # a == olen
+      # the rest of the first page fills the output exactly; drop the page
+      copyMem(addr oq[0], addr stream.bufs[0][stream.bs], a)
+      stream.bs = 0
+      stream.bufs.delete(0)
+      return a
+
+# Marks the stream as ended once the overflow pages are drained (a single
+# page with nothing left to read) and the source stream is exhausted.
+# oq, olen and n are not used.
+proc checkEnd(stream: EncoderStream, oq: ptr UncheckedArray[uint8], olen: int,
+    n: var int) =
+  if stream.isend:
+    return
+  if stream.bufs.len != 1 or stream.bs < stream.bi:
+    # buffered output is still pending
+    return
+  if stream.source.atEnd:
+    stream.isend = true
+
+# Number of bytes requested from the source stream per readData call.
+const ReadSize = 4096
+# Fills `buffer` with up to olen encoded bytes and returns the number of
+# bytes written. Bytes left over from earlier calls are delivered first;
+# then at most ReadSize bytes of uint32 code points are read from the source
+# and encoded with the stream's charset. Encoded bytes that do not fit into
+# `buffer` are kept in the overflow pages for the next call.
+proc readData*(stream: EncoderStream, buffer: pointer, olen: int): int =
+  if olen == 0: return
+  let oq = cast[ptr UncheckedArray[uint8]](buffer)
+  result = stream.copyBuffers(oq, olen)
+  if result == olen or stream.source.atEnd:
+    # either output filled with buffered data; nothing to encode
+    # or we're at the end of the source stream
+    stream.checkEnd(oq, olen, result)
+    return result
+  var iq = newSeqUninitialized[uint32](ReadSize div sizeof(uint32))
+  let ilen0 = stream.source.readData(cast[pointer](addr iq[0]), ReadSize)
+  assert ilen0 mod sizeof(uint32) == 0 #TODO what to do if false?
+  let ilen = ilen0 div sizeof(uint32)
+  # `result` is the absolute write index into oq, so the encoders must be
+  # given the full length of the output buffer, not the remaining capacity.
+  case stream.charset
+  of CHARSET_UTF_8: stream.encodeUTF8(iq, oq, ilen, olen, result)
+  of CHARSET_IBM866: stream.encodeSingleByte(iq, oq, ilen, olen, result, IBM866Encode)
+  of CHARSET_ISO_8859_2: stream.encodeSingleByte(iq, oq, ilen, olen, result, ISO88592Encode)
+  of CHARSET_ISO_8859_3: stream.encodeSingleByte(iq, oq, ilen, olen, result, ISO88593Encode)
+  of CHARSET_ISO_8859_4: stream.encodeSingleByte(iq, oq, ilen, olen, result, ISO88594Encode)
+  of CHARSET_ISO_8859_5: stream.encodeSingleByte(iq, oq, ilen, olen, result, ISO88595Encode)
+  of CHARSET_ISO_8859_6: stream.encodeSingleByte(iq, oq, ilen, olen, result, ISO88596Encode)
+  of CHARSET_ISO_8859_7: stream.encodeSingleByte(iq, oq, ilen, olen, result, ISO88597Encode)
+  of CHARSET_ISO_8859_8,
+     CHARSET_ISO_8859_8_I: stream.encodeSingleByte(iq, oq, ilen, olen, result, ISO88598Encode)
+  of CHARSET_ISO_8859_10: stream.encodeSingleByte(iq, oq, ilen, olen, result, ISO885910Encode)
+  of CHARSET_ISO_8859_13: stream.encodeSingleByte(iq, oq, ilen, olen, result, ISO885913Encode)
+  of CHARSET_ISO_8859_14: stream.encodeSingleByte(iq, oq, ilen, olen, result, ISO885914Encode)
+  of CHARSET_ISO_8859_15: stream.encodeSingleByte(iq, oq, ilen, olen, result, ISO885915Encode)
+  of CHARSET_ISO_8859_16: stream.encodeSingleByte(iq, oq, ilen, olen, result, ISO885916Encode)
+  of CHARSET_KOI8_R: stream.encodeSingleByte(iq, oq, ilen, olen, result, KOI8REncode)
+  of CHARSET_KOI8_U: stream.encodeSingleByte(iq, oq, ilen, olen, result, KOI8UEncode)
+  of CHARSET_MACINTOSH: stream.encodeSingleByte(iq, oq, ilen, olen, result, MacintoshEncode)
+  of CHARSET_WINDOWS_874: stream.encodeSingleByte(iq, oq, ilen, olen, result, Windows874Encode)
+  of CHARSET_WINDOWS_1250: stream.encodeSingleByte(iq, oq, ilen, olen, result, Windows1250Encode)
+  of CHARSET_WINDOWS_1251: stream.encodeSingleByte(iq, oq, ilen, olen, result, Windows1251Encode)
+  of CHARSET_WINDOWS_1252: stream.encodeSingleByte(iq, oq, ilen, olen, result, Windows1252Encode)
+  of CHARSET_WINDOWS_1253: stream.encodeSingleByte(iq, oq, ilen, olen, result, Windows1253Encode)
+  of CHARSET_WINDOWS_1254: stream.encodeSingleByte(iq, oq, ilen, olen, result, Windows1254Encode)
+  of CHARSET_WINDOWS_1255: stream.encodeSingleByte(iq, oq, ilen, olen, result, Windows1255Encode)
+  of CHARSET_WINDOWS_1256: stream.encodeSingleByte(iq, oq, ilen, olen, result, Windows1256Encode)
+  of CHARSET_WINDOWS_1257: stream.encodeSingleByte(iq, oq, ilen, olen, result, Windows1257Encode)
+  of CHARSET_WINDOWS_1258: stream.encodeSingleByte(iq, oq, ilen, olen, result, Windows1258Encode)
+  of CHARSET_X_MAC_CYRILLIC: stream.encodeSingleByte(iq, oq, ilen, olen, result, XMacCyrillicEncode)
+  of CHARSET_GBK: stream.encodeGb18030(iq, oq, ilen, olen, result, true)
+  of CHARSET_GB18030: stream.encodeGb18030(iq, oq, ilen, olen, result)
+  of CHARSET_BIG5: stream.encodeBig5(iq, oq, ilen, olen, result)
+  of CHARSET_EUC_JP: stream.encodeEUCJP(iq, oq, ilen, olen, result)
+  # Not handled yet (these charsets end up in the "TODO" branch below):
+  # CHARSET_ISO_2022_JP, CHARSET_SHIFT_JIS, CHARSET_EUC_KR,
+  # CHARSET_REPLACEMENT, CHARSET_UTF_16_LE, CHARSET_UTF_16_BE
+  of CHARSET_X_USER_DEFINED: stream.encodeXUserDefined(iq, oq, ilen, olen, result)
+  of CHARSET_UNKNOWN: assert false, "Somebody forgot to set the character set here"
+  else: assert false, "TODO"
+  stream.checkEnd(oq, olen, result)
+
+# Fills buf with encoded bytes. Returns the number of bytes read.
+# An empty buf yields 0, like the pointer overload does for olen == 0.
+proc readData*(stream: EncoderStream, buf: var seq[uint8]): int =
+  if buf.len == 0:
+    # buf[0] does not exist, so there is no address to hand on
+    return 0
+  return stream.readData(addr buf[0], buf.len * sizeof(buf[0]))
+
+# True once the stream has been marked as ended (see checkEnd and the fatal
+# branch of handleError).
+proc atEnd*(stream: EncoderStream): bool =
+  stream.isend
+
+# Encodes everything that is left in the stream and returns it as a string.
+proc readAll*(stream: EncoderStream): string =
+  var buf = newString(4096)
+  while not stream.atEnd:
+    let olen = stream.readData(addr buf[0], buf.len)
+    if olen <= 0:
+      # no progress; stop instead of spinning
+      break
+    # A short read is not the end of the stream: one readData call consumes
+    # a bounded number of code points, which may encode to fewer bytes than
+    # buf holds. Keep reading until atEnd.
+    result &= buf[0 ..< olen]
+
+# Creates an EncoderStream that reads code points from `source` and encodes
+# them as `cs`. `buflen` is the size of each overflow page; the first page
+# is allocated up front.
+proc newEncoderStream*(source: Stream, cs = CHARSET_UTF_8, buflen = 4096,
+                       errormode: EncoderErrorMode = ENCODER_ERROR_MODE_HTML): EncoderStream =
+  new(result)
+  result.source = source
+  result.charset = cs
+  result.buflen = buflen
+  result.errormode = errormode
+  when nimvm:
+    # compile-time branch uses the zero-initialising constructor
+    result.bufs.add(newSeq[uint8](buflen))
+  else:
+    result.bufs.add(newSeqUninitialized[uint8](buflen))