author | bptato <nincsnevem662@gmail.com> | 2023-05-19 01:50:17 +0200
committer | bptato <nincsnevem662@gmail.com> | 2023-05-19 01:50:17 +0200
commit | 26e8968a6499742cf37e00292a7d1c8ed620cad5 (patch)
tree | a3922f02f09b5c025dddcfe0e7a3a719c47ba4da /src
parent | dac6a09c14b258ed725dcb265305a6445edc02ad (diff)
download | chawan-26e8968a6499742cf37e00292a7d1c8ed620cad5.tar.gz
Add display/output encoding
Some encodings are still missing
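
For reference, the core of this change is the new EncoderStream in src/encoding/encoderstream.nim, which encodes a UTF-32 input stream into the selected output charset. A minimal usage sketch, assuming the API as added below (the sample string and the ISO-8859-2 target are only illustrative; the UTF-32 packing mirrors what processOutputString in src/display/term.nim does):

import streams
import unicode

import data/charset
import encoding/encoderstream

# Pack the code points into a native-endian UTF-32 byte string...
var u32buf = ""
for r in "árvíztűrő tükörfúrógép".runes():
  let ol = u32buf.len
  u32buf.setLen(ol + sizeof(uint32))
  var u32 = cast[uint32](r)
  copyMem(addr u32buf[ol], addr u32, sizeof(u32))
# ...then encode it to ISO-8859-2. ENCODER_ERROR_MODE_FATAL makes an
# unmappable code point stop the stream and set failed, instead of the
# default HTML error mode.
let encoder = newEncoderStream(newStringStream(u32buf),
  cs = CHARSET_ISO_8859_2, errormode = ENCODER_ERROR_MODE_FATAL)
echo encoder.readAll()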
Diffstat (limited to 'src')
-rw-r--r-- | src/config/config.nim | 20
-rw-r--r-- | src/data/charset.nim | 39
-rw-r--r-- | src/display/pager.nim | 6
-rw-r--r-- | src/display/term.nim | 55
-rw-r--r-- | src/encoding/decoderstream.nim | 19
-rw-r--r-- | src/encoding/encoderstream.nim | 389
-rw-r--r-- | src/html/htmlparser.nim | 65
-rw-r--r-- | src/main.nim | 13
-rw-r--r-- | src/render/rendertext.nim | 2
9 files changed, 524 insertions, 84 deletions
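
In src/display/term.nim, applyConfig now picks the terminal output charset from the configuration if set, otherwise from the locale environment, otherwise UTF-8. A condensed sketch of that precedence (resolveDisplayCharset is a hypothetical helper; the real code below assigns term.cs inline):

import options
import os

import data/charset

# 1. encoding.display_charset from the config (e.g. set via the new -O flag),
# 2. the charset suffix of LC_ALL, LC_CTYPE or LANG (e.g. "en_US.UTF-8"),
# 3. DefaultCharset (UTF-8) otherwise.
proc resolveDisplayCharset(configured: Option[Charset]): Charset =
  if configured.isSome:
    return configured.get
  result = DefaultCharset
  for name in ["LC_ALL", "LC_CTYPE", "LANG"]:
    let env = getEnv(name)
    if env == "":
      continue
    let cs = getLocaleCharset(env)
    if cs != CHARSET_UNKNOWN:
      return cs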
diff --git a/src/config/config.nim b/src/config/config.nim index e7bb0c63..45c177ad 100644 --- a/src/config/config.nim +++ b/src/config/config.nim @@ -67,6 +67,7 @@ type wrap*: bool EncodingConfig = object + display_charset*: Option[Charset] document_charset*: seq[Charset] ExternalConfig = object @@ -292,7 +293,6 @@ proc parseConfigValue(x: var Option[ColorMode], v: TomlValue, k: string) proc parseConfigValue(x: var Option[FormatMode], v: TomlValue, k: string) proc parseConfigValue(x: var FormatMode, v: TomlValue, k: string) proc parseConfigValue(x: var RGBAColor, v: TomlValue, k: string) -proc parseConfigValue(x: var Option[bool], v: TomlValue, k: string) proc parseConfigValue[T](x: var Option[T], v: TomlValue, k: string) proc parseConfigValue(x: var ActionMap, v: TomlValue, k: string) proc parseConfigValue(x: var CSSConfig, v: TomlValue, k: string) @@ -413,24 +413,14 @@ proc parseConfigValue(x: var RGBAColor, v: TomlValue, k: string) = "' for key " & k) x = c.get -proc parseConfigValue(x: var Option[bool], v: TomlValue, k: string) = - typeCheck(v, {VALUE_STRING, VALUE_BOOLEAN}, k) - if v.vt == VALUE_STRING: - if v.s == "auto": - x = none(bool) - else: - raise newException(ValueError, "invalid value '" & v.s & - "' for key " & k) +proc parseConfigValue[T](x: var Option[T], v: TomlValue, k: string) = + if v.vt == VALUE_STRING and v.s == "auto": + x = none(T) else: - var y: bool + var y: T parseConfigValue(y, v, k) x = some(y) -proc parseConfigValue[T](x: var Option[T], v: TomlValue, k: string) = - var y: T - parseConfigValue(y, v, k) - x = some(y) - proc parseConfigValue(x: var ActionMap, v: TomlValue, k: string) = typeCheck(v, VALUE_TABLE, k) for kk, vv in v: diff --git a/src/data/charset.nim b/src/data/charset.nim index f93a82b3..45d7786c 100644 --- a/src/data/charset.nim +++ b/src/data/charset.nim @@ -3,6 +3,8 @@ import os import strutils import tables +import utils/twtstr + type Charset* = enum CHARSET_UNKNOWN CHARSET_UTF_8 = "UTF-8" @@ -314,9 +316,32 @@ const CharsetMap = { "x-user-defined": CHARSET_X_USER_DEFINED }.toTable() +func normalizeLocale(s: string): string = + for i in 0 ..< s.len: + if cast[uint8](s[i]) > 0x20 and s[i] != '_' and s[i] != '-': + result &= s[i].toLowerAscii() + +const NormalizedCharsetMap = (func(): Table[string, Charset] = + for k, v in CharsetMap: + result[k.normalizeLocale()] = v)() + +const DefaultCharset* = CHARSET_UTF_8 + proc getCharset*(s: string): Charset = return CharsetMap.getOrDefault(s.strip().toLower(), CHARSET_UNKNOWN) +proc getLocaleCharset*(s: string): Charset = + let ss = s.after('.') + if ss != "": + return NormalizedCharsetMap.getOrDefault(ss.normalizeLocale(), + CHARSET_UNKNOWN) + # We could try to guess the charset based on the language here, like w3m + # does. + # However, these days it is more likely for any system to be using UTF-8 + # than any other charset, irrespective of the language. So we just assume + # UTF-8. 
+ return DefaultCharset + iterator mappairs(path: string): tuple[a, b: int] = let s = staticRead(path) for line in s.split('\n'): @@ -372,23 +397,21 @@ func loadGb18030Ranges(path: string): tuple[ result.encode.add((uint16(n), uint16(index))) result.encode.sort() +type UCS16x16* = tuple[ucs, p: uint16] + func loadCharsetMap16(path: string, len: static uint16): tuple[ decode: array[len, uint16], - encode: seq[ - tuple[ - ucs: uint16, - p: uint16 ]]] = + encode: seq[UCS16x16]] = for index, n in mappairs("res/map" / path): result.decode[uint16(index)] = uint16(n) result.encode.add((uint16(n), uint16(index))) result.encode.sort() +type UCS32x16* = tuple[ucs: uint32, p: uint16] + func loadBig5Map(path: string, offset: static uint16): tuple[ decode: array[19782u16 - offset, uint32], # ouch (+75KB...) - encode: seq[ - tuple[ - ucs: uint32, - p: uint16 ]]] = + encode: seq[UCS32x16]] = for index, n in mappairs("res/map" / path): result.decode[uint16(index) - offset] = uint32(n) result.encode.add((uint32(n), uint16(index))) diff --git a/src/display/pager.nim b/src/display/pager.nim index bd07d52e..1767f09b 100644 --- a/src/display/pager.nim +++ b/src/display/pager.nim @@ -318,6 +318,7 @@ proc drawBuffer*(pager: Pager, container: Container, ostream: Stream) = ostream.write(line.str & "\n") else: var x = 0 + var w = 0 var i = 0 var s = "" for f in line.formats: @@ -327,9 +328,10 @@ proc drawBuffer*(pager: Pager, container: Container, ostream: Stream) = fastRuneAt(line.str, i, r) outstr &= r x += r.width() - s &= outstr + s &= pager.term.processOutputString(outstr, w) s &= pager.term.processFormat(format, f.format) - s &= line.str.substr(i) & pager.term.processFormat(format, newFormat()) & "\n" + s &= pager.term.processOutputString(line.str.substr(i), w) + s &= pager.term.processFormat(format, newFormat()) & "\n" ostream.write(s)) ostream.flush() diff --git a/src/display/term.nim b/src/display/term.nim index ce518428..e665faee 100644 --- a/src/display/term.nim +++ b/src/display/term.nim @@ -1,6 +1,7 @@ import math import options import os +import streams import tables import terminal import unicode @@ -8,6 +9,8 @@ import unicode import bindings/termcap import buffer/cell import config/config +import data/charset +import encoding/encoderstream import io/window import utils/twtstr import types/color @@ -39,6 +42,7 @@ type Terminal* = ref TerminalObj TerminalObj = object + cs: Charset config: Config infile: File outfile: File @@ -320,18 +324,37 @@ proc windowChange*(term: Terminal, attrs: WindowAttributes) = term.canvas = newFixedGrid(attrs.width, attrs.height) term.cleared = false -proc processOutputString(term: Terminal, str: string, w: var int): string = +proc processOutputString*(term: Terminal, str: string, w: var int): string = if str.validateUtf8() != -1: return "?" - for r in str.runes(): - # twidth wouldn't work here, the view may start at the nth character. - # pager must ensure tabs are converted beforehand. - let tw = r.width() - if r.isControlChar(): - result &= "^" & getControlLetter(char(r)) - elif tw != 0: - result &= r - w += tw + if term.cs != CHARSET_UTF_8: + #TODO: This is incredibly inefficient. 
+ var u32buf = "" + for r in str.runes(): + let tw = r.width() + if r.isControlChar(): + u32buf &= char(0) & char(0) & char(0) & "^" & + char(0) & char(0) & char(0) & getControlLetter(char(r)) + elif tw != 0: + let ol = u32buf.len + u32buf.setLen(ol + sizeof(uint32)) + var u32 = cast[uint32](r) + copyMem(addr u32buf[ol], addr u32, sizeof(u32)) + w += tw + let ss = newStringStream(u32buf) + let encoder = newEncoderStream(ss, cs = term.cs, + errormode = ENCODER_ERROR_MODE_FATAL) + result &= encoder.readAll() + else: + for r in str.runes(): + # twidth wouldn't work here, the view may start at the nth character. + # pager must ensure tabs are converted beforehand. + let tw = r.width() + if r.isControlChar(): + result &= "^" & getControlLetter(char(r)) + elif tw != 0: + result &= r + w += tw proc generateFullOutput(term: Terminal, grid: FixedGrid): string = var format = newFormat() @@ -427,6 +450,18 @@ proc applyConfig(term: Terminal) = if term.isatty() and term.config.display.alt_screen.isSome: term.smcup = term.config.display.alt_screen.get term.mincontrast = term.config.display.minimum_contrast + if term.config.encoding.display_charset.isSome: + term.cs = term.config.encoding.display_charset.get + else: + term.cs = DefaultCharset + for s in ["LC_ALL", "LC_CTYPE", "LANG"]: + let env = getEnv(s) + if env == "": + continue + let cs = getLocaleCharset(env) + if cs != CHARSET_UNKNOWN: + term.cs = cs + break proc outputGrid*(term: Terminal) = if term.config.display.force_clear: diff --git a/src/encoding/decoderstream.nim b/src/encoding/decoderstream.nim index 425f264f..8bfd4d10 100644 --- a/src/encoding/decoderstream.nim +++ b/src/encoding/decoderstream.nim @@ -8,8 +8,7 @@ import utils/twtstr # DecoderStream decodes any encoding to valid utf-32. type DecoderErrorMode* = enum - DECODER_ERROR_MODE_FATAL, DECODER_ERROR_MODE_REPLACEMENT, - DECODER_ERROR_MODE_HTML + DECODER_ERROR_MODE_FATAL, DECODER_ERROR_MODE_REPLACEMENT ISO2022JPState = enum STATE_ASCII, STATE_ROMAN, STATE_KATAKANA, STATE_LEAD_BYTE, @@ -27,6 +26,7 @@ type c: uint32 case charset: Charset of CHARSET_UTF_8: + u8c: uint32 u8needed: int u8seen: int u8bounds: Slice[uint8] @@ -83,22 +83,11 @@ proc handleError(stream: DecoderStream, oq: ptr UncheckedArray[uint32], olen: in of DECODER_ERROR_MODE_FATAL: stream.isend = true stream.failed = true - of DECODER_ERROR_MODE_HTML: - if stream.charset == CHARSET_UTF_8: - # "html" mode is handled as "replacement" for utf-8. 
- stream.append_codepoint 0xFFFD, oq, olen, n - else: - stream.append_codepoint '&', oq, olen, n - stream.append_codepoint '#', oq, olen, n - while stream.c > 0: - stream.append_codepoint cast[char](0x30 + stream.c mod 10), oq, olen, n - stream.c = stream.c div 10 - stream.append_codepoint ';', oq, olen, n of DECODER_ERROR_MODE_REPLACEMENT: stream.append_codepoint 0xFFFD, oq, olen, n proc decodeUTF8(stream: DecoderStream, iq: var seq[uint8], oq: ptr UncheckedArray[uint32], ilen, olen: int, n: var int) = - var c = stream.c + var c = stream.u8c var needed = stream.u8needed var seen = stream.u8seen var bounds = stream.u8bounds @@ -156,7 +145,7 @@ proc decodeUTF8(stream: DecoderStream, iq: var seq[uint8], oq: ptr UncheckedArra needed = 0 seen = 0 inc i - stream.c = c + stream.u8c = c stream.u8bounds = bounds stream.u8seen = seen stream.u8needed = needed diff --git a/src/encoding/encoderstream.nim b/src/encoding/encoderstream.nim new file mode 100644 index 00000000..155b7d31 --- /dev/null +++ b/src/encoding/encoderstream.nim @@ -0,0 +1,389 @@ +# Heavily based on https://encoding.spec.whatwg.org/ + +import algorithm +import streams +import unicode + +import data/charset + +# EncoderStream encodes utf-32 to the specified encoding. +type + EncoderErrorMode* = enum + ENCODER_ERROR_MODE_FATAL, ENCODER_ERROR_MODE_HTML + + ISO2022JPState = enum + STATE_ASCII, STATE_ROMAN, STATE_JIS0208 + + EncoderStream* = ref object + source: Stream + errormode: EncoderErrorMode + isend: bool + failed*: bool + bufs: seq[seq[uint8]] + bs: int + bi: int + buflen: int + errc: uint32 + charset: Charset + +template append_byte_buf(stream: EncoderStream, c: uint8) = + if stream.bi >= stream.buflen: + stream.bufs.add(newSeqUninitialized[uint8](stream.buflen)) + stream.bi = 0 + stream.bufs[^1][stream.bi] = c + inc stream.bi + +template append_byte(stream: EncoderStream, c: uint8, + oq: ptr UncheckedArray[uint8], olen: int, n: var int) = + if n < olen: + oq[n] = c + inc n + else: + append_byte_buf stream, c + +template append_byte(stream: EncoderStream, c: char, + oq: ptr UncheckedArray[uint8], olen: int, n: var int) = + stream.append_byte cast[uint8](c), oq, olen, n + +template append_byte(stream: EncoderStream, c: uint32, + oq: ptr UncheckedArray[uint8], olen: int, n: var int) = + stream.append_byte cast[uint8](c), oq, olen, n + +template append_byte(stream: EncoderStream, c: int, + oq: ptr UncheckedArray[uint8], olen: int, n: var int) = + stream.append_byte cast[uint8](c), oq, olen, n + +proc handleError(stream: EncoderStream, oq: ptr UncheckedArray[uint8], + olen: int, n: var int, c: uint32) = + case stream.errormode + of ENCODER_ERROR_MODE_FATAL: + stream.isend = true + stream.failed = true + of ENCODER_ERROR_MODE_HTML: + stream.append_byte '&', oq, olen, n + stream.append_byte '#', oq, olen, n + if stream.errc == 0: + stream.append_byte '0', oq, olen, n + else: + while stream.errc > 0: + stream.append_byte cast[char](0x30 + stream.errc mod 10), oq, olen, n + stream.errc = stream.errc div 10 + stream.append_byte ';', oq, olen, n + +proc gb18030RangesPointer(c: uint32): uint32 = + if c == 0xE7C7: + return 7457 + # Let offset be the last pointer in index gb18030 ranges that is less than or + # equal to pointer and code point offset its corresponding code point. 
+ var offset: uint32 + var p: uint32 + if c >= 0x10000: + # omitted from the map for storage efficiency + offset = 0x10000 + p = 189000 + elif c >= 0xFFE6: + # Needed because upperBound returns the first element greater than pointer + # OR last on failure, so we can't just remove one if p is e.g. 39400. + offset = 0xFFE6 + p = 39394 + else: + # Find the first range that is greater than p, or last if no such element + # is found. + # We want the last that is <=, so decrease index by one. + let i = upperBound(Gb18030RangesEncode, c, func(a: tuple[ucs, p: uint16], b: uint32): int = + cmp(cast[uint32](a.ucs), b)) + let elem = Gb18030RangesEncode[i - 1] + offset = elem.ucs + p = elem.p + return p + c - offset + +proc encodeUTF8(stream: EncoderStream, iq: var seq[uint32], + oq: ptr UncheckedArray[uint8], ilen, olen: int, n: var int) = + var i = 0 + while i < ilen: + let c = iq[i] + var count: int + var offset: uint8 + case c + of 0x0080..0x07FF: + count = 1 + offset = 0xC0 + of 0x0800..0xFFFF: + count = 2 + offset = 0xE0 + of 0x10000..0x10FFFF: + count = 3 + offset = 0xF0 + else: + assert false + stream.append_byte (c shr (6 * count)) + offset, oq, olen, n + while count > 0: + let tmp = c shr (6 * (count - 1)) + stream.append_byte 0x80 or (tmp and 0x3F), oq, olen, n + dec count + +proc encodeSingleByte(stream: EncoderStream, iq: var seq[uint32], + oq: ptr UncheckedArray[uint8], ilen, olen: int, n: var int, + map: seq[tuple[ucs: uint16, val: char]]) = + for i in 0 ..< ilen: + let c = iq[i] + if c < 0x80: + stream.append_byte cast[uint8](c), oq, olen, n + continue + if c <= 0xFFFF: + let j = binarySearch(map, cast[uint16](c), + proc(a: tuple[ucs: uint16, val: char], b: uint16): int = + cmp(a.ucs, b)) + if j != -1: + stream.append_byte cast[uint8](map[j].val) + 0x80, oq, olen, n + continue + stream.handleError(oq, olen, n, c) + +proc encodeXUserDefined(stream: EncoderStream, iq: var seq[uint32], + oq: ptr UncheckedArray[uint8], ilen, olen: int, n: var int) = + for i in 0 ..< ilen: + let c = iq[i] + if c < 0x80: + stream.append_byte cast[uint8](c), oq, olen, n + continue + if c in 0xF780u32..0xF7FFu32: + let b = cast[uint8](c - 0xF780 + 0x80) + stream.append_byte b, oq, olen, n + continue + stream.handleError(oq, olen, n, c) + +proc encodeGb18030(stream: EncoderStream, iq: var seq[uint32], + oq: ptr UncheckedArray[uint8], ilen, olen: int, n: var int, + isGBK = false) = + for c in iq: + if isGBK and c == 0x20AC: + stream.append_byte 0x80, oq, olen, n + continue + let i = if c > 0xFFFF: -1 else: binarySearch(Gb18030Encode, cast[uint16](c), + proc(a: UCS16x16, b: uint16): int = + cmp(a.ucs, b)) + if i != -1: + let p = Gb18030Encode[i].p + let lead = p div 190 + 0x81 + let trail = p mod 190 + let offset: uint8 = if trail < 0x3F: 0x40 else: 0x41 + stream.append_byte lead, oq, olen, n + stream.append_byte cast[uint8](trail) + offset, oq, olen, n + continue + if isGBK: + stream.handleError(oq, olen, n, c) + continue + var p = gb18030RangesPointer(c) + let b1 = p div (10 * 126 * 10) + p = p mod (10 * 126 * 10) + let b2 = p div (10 * 126) + p = p mod (10 * 126) + let b3 = p div 10 + let b4 = p mod 10 + stream.append_byte b1, oq, olen, n + stream.append_byte b2, oq, olen, n + stream.append_byte b3, oq, olen, n + stream.append_byte b4, oq, olen, n + +proc encodeBig5(stream: EncoderStream, iq: var seq[uint32], + oq: ptr UncheckedArray[uint8], ilen, olen: int, n: var int) = + for c in iq: + if c < 0x80: + stream.append_byte c, oq, olen, n + continue + let i = binarySearch(Big5Encode, cast[uint16](c), + proc(a: 
UCS32x16, b: uint16): int = + cmp(a.ucs, b)) + if i == -1: + stream.handleError(oq, olen, n, c) + continue + let p = Big5Encode[i].p + let lead = p div 157 + 0x81 + let trail = p mod 157 + let offset: uint8 = if trail < 0x3F: 0x40 else: 0x62 + stream.append_byte lead, oq, olen, n + stream.append_byte cast[uint8](trail) + offset, oq, olen, n + +proc encodeEUCJP(stream: EncoderStream, iq: var seq[uint32], + oq: ptr UncheckedArray[uint8], ilen, olen: int, n: var int) = + for c in iq: + if c < 0x80: + stream.append_byte c, oq, olen, n + elif c == 0xA5: + stream.append_byte 0x5C, oq, olen, n + elif c == 0x203E: + stream.append_byte 0x5C, oq, olen, n + elif c in 0xFF61u32..0xFF9Fu32: + stream.append_byte 0x8E, oq, olen, n + stream.append_byte c - 0xFF61 + 0xA1, oq, olen, n + else: + let c = if c == 0x2212: + 0xFF0Du32 + else: + c + let i = binarySearch(Jis0208Encode, cast[uint16](c), + proc(a: UCS16x16, b: uint16): int = + cmp(a.ucs, b)) + if i != -1: + let p = Jis0208Encode[i].p + let lead = p div 94 + 0xA1 + let trail = p mod 94 + 0xA1 + stream.append_byte lead, oq, olen, n + stream.append_byte trail, oq, olen, n + else: + stream.handleError(oq, olen, n, c) + +# copy any data remaining from previous passes +proc copyBuffers(stream: EncoderStream, oq: ptr UncheckedArray[uint8], olen: int): int = + if stream.bufs.len == 1: + # one page: stream.bs ..< stream.bi + let n = min((stream.bi - stream.bs) * sizeof(stream.bufs[0][0]), olen) + copyMem(addr oq[0], addr stream.bufs[0][stream.bs], n) + stream.bs += n + if stream.bs >= stream.bi: + # read entire page; recycle it + stream.bs = 0 + stream.bi = 0 + return n + else: + # multiple pages: + # stream.bs ..< stream.buflen + # 0 ..< stream.buflen + # ... + # 0 ..< stream.bi + let a = (stream.buflen - stream.bs) * sizeof(stream.bufs[0][0]) + if a < olen: + copyMem(addr oq[0], addr stream.bufs[0][stream.bs], a) + var ns = a + stream.bs = 0 + var i = 1 + while i < stream.bufs.high: + let n = min(stream.buflen * sizeof(stream.bufs[0][0]), olen - ns) + copyMem(addr oq[ns], addr stream.bufs[i][0], n) + ns += n + if ns >= olen: + # i'th buffer still has contents. 
+ stream.bs = n + break + stream.bs = 0 + inc i + if ns < olen: + # last page + let n = min(stream.bi * sizeof(stream.bufs[0][0]), olen - ns) + copyMem(addr oq[ns], addr stream.bufs[i][0], n) + ns += n + stream.bs = n + if stream.bs >= stream.bi: + # read entire page; recycle it + stream.bs = 0 + stream.bi = 0 + for j in i ..< stream.bufs.len: + stream.bufs[j - i] = stream.bufs[j] + stream.bufs.setLen(stream.bufs.len - i) + return ns + elif a > olen: + copyMem(addr oq[0], addr stream.bufs[0][stream.bs], olen) + stream.bs += olen + assert stream.bs < stream.buflen + return olen + else: # a == olen + copyMem(addr oq[0], addr stream.bufs[0][stream.bs], a) + stream.bs = 0 + stream.bufs.delete(0) + return a + +proc checkEnd(stream: EncoderStream, oq: ptr UncheckedArray[uint8], olen: int, + n: var int) = + if not stream.isend and stream.bufs.len == 1 and + stream.bs >= stream.bi and stream.source.atEnd: + stream.isend = true + +const ReadSize = 4096 +proc readData*(stream: EncoderStream, buffer: pointer, olen: int): int = + if olen == 0: return + let oq = cast[ptr UncheckedArray[uint8]](buffer) + result = stream.copyBuffers(oq, olen) + let olen = olen - result + if olen == 0 or stream.source.atEnd: + # either output filled with buffered data; nothing to decode + # or we're at the end of the source stream + stream.checkEnd(oq, olen, result) + return result + var iq = newSeqUninitialized[uint32](ReadSize div sizeof(uint32)) + let ilen0 = stream.source.readData(cast[pointer](addr iq[0]), ReadSize) + assert ilen0 mod sizeof(uint32) == 0 #TODO what to do if false? + let ilen = ilen0 div sizeof(uint32) + case stream.charset + of CHARSET_UTF_8: stream.encodeUTF8(iq, oq, ilen, olen, result) + of CHARSET_IBM866: stream.encodeSingleByte(iq, oq, ilen, olen, result, IBM866Encode) + of CHARSET_ISO_8859_2: stream.encodeSingleByte(iq, oq, ilen, olen, result, ISO88592Encode) + of CHARSET_ISO_8859_3: stream.encodeSingleByte(iq, oq, ilen, olen, result, ISO88593Encode) + of CHARSET_ISO_8859_4: stream.encodeSingleByte(iq, oq, ilen, olen, result, ISO88594Encode) + of CHARSET_ISO_8859_5: stream.encodeSingleByte(iq, oq, ilen, olen, result, ISO88595Encode) + of CHARSET_ISO_8859_6: stream.encodeSingleByte(iq, oq, ilen, olen, result, ISO88596Encode) + of CHARSET_ISO_8859_7: stream.encodeSingleByte(iq, oq, ilen, olen, result, ISO88597Encode) + of CHARSET_ISO_8859_8, + CHARSET_ISO_8859_8_I: stream.encodeSingleByte(iq, oq, ilen, olen, result, ISO88598Encode) + of CHARSET_ISO_8859_10: stream.encodeSingleByte(iq, oq, ilen, olen, result, ISO885910Encode) + of CHARSET_ISO_8859_13: stream.encodeSingleByte(iq, oq, ilen, olen, result, ISO885913Encode) + of CHARSET_ISO_8859_14: stream.encodeSingleByte(iq, oq, ilen, olen, result, ISO885914Encode) + of CHARSET_ISO_8859_15: stream.encodeSingleByte(iq, oq, ilen, olen, result, ISO885915Encode) + of CHARSET_ISO_8859_16: stream.encodeSingleByte(iq, oq, ilen, olen, result, ISO885916Encode) + of CHARSET_KOI8_R: stream.encodeSingleByte(iq, oq, ilen, olen, result, KOI8REncode) + of CHARSET_KOI8_U: stream.encodeSingleByte(iq, oq, ilen, olen, result, KOI8UEncode) + of CHARSET_MACINTOSH: stream.encodeSingleByte(iq, oq, ilen, olen, result, MacintoshEncode) + of CHARSET_WINDOWS_874: stream.encodeSingleByte(iq, oq, ilen, olen, result, Windows874Encode) + of CHARSET_WINDOWS_1250: stream.encodeSingleByte(iq, oq, ilen, olen, result, Windows1250Encode) + of CHARSET_WINDOWS_1251: stream.encodeSingleByte(iq, oq, ilen, olen, result, Windows1251Encode) + of CHARSET_WINDOWS_1252: stream.encodeSingleByte(iq, 
oq, ilen, olen, result, Windows1252Encode) + of CHARSET_WINDOWS_1253: stream.encodeSingleByte(iq, oq, ilen, olen, result, Windows1253Encode) + of CHARSET_WINDOWS_1254: stream.encodeSingleByte(iq, oq, ilen, olen, result, Windows1254Encode) + of CHARSET_WINDOWS_1255: stream.encodeSingleByte(iq, oq, ilen, olen, result, Windows1255Encode) + of CHARSET_WINDOWS_1256: stream.encodeSingleByte(iq, oq, ilen, olen, result, Windows1256Encode) + of CHARSET_WINDOWS_1257: stream.encodeSingleByte(iq, oq, ilen, olen, result, Windows1257Encode) + of CHARSET_WINDOWS_1258: stream.encodeSingleByte(iq, oq, ilen, olen, result, Windows1258Encode) + of CHARSET_X_MAC_CYRILLIC: stream.encodeSingleByte(iq, oq, ilen, olen, result, XMacCyrillicEncode) + of CHARSET_GBK: stream.encodeGb18030(iq, oq, ilen, olen, result, true) + of CHARSET_GB18030: stream.encodeGb18030(iq, oq, ilen, olen, result) + of CHARSET_BIG5: stream.encodeBig5(iq, oq, ilen, olen, result) + of CHARSET_EUC_JP: stream.encodeEUCJP(iq, oq, ilen, olen, result) +# of CHARSET_ISO_2022_JP: stream.decodeISO2022JP(iq, oq, ilen, olen, result) +# of CHARSET_SHIFT_JIS: stream.decodeShiftJIS(iq, oq, ilen, olen, result) +# of CHARSET_EUC_KR: stream.decodeEUCKR(iq, oq, ilen, olen, result) +# of CHARSET_REPLACEMENT: stream.decodeReplacement(oq, olen, result) +# of CHARSET_UTF_16_LE: stream.decodeUTF16LE(iq, oq, ilen, olen, result) +# of CHARSET_UTF_16_BE: stream.decodeUTF16BE(iq, oq, ilen, olen, result) + of CHARSET_X_USER_DEFINED: stream.encodeXUserDefined(iq, oq, ilen, olen, result) + of CHARSET_UNKNOWN: assert false, "Somebody forgot to set the character set here" + else: assert false, "TODO" + stream.checkEnd(oq, olen, result) + +# Returns the number of bytes read. +proc readData*(stream: EncoderStream, buf: var seq[uint8]): int = + return stream.readData(addr buf[0], buf.len * sizeof(buf[0])) + +proc atEnd*(stream: EncoderStream): bool = + return stream.isend + +proc readAll*(stream: EncoderStream): string = + var buf = newString(4096) + while not stream.atEnd: + let olen = stream.readData(addr buf[0], buf.len) + if olen < buf.len: + buf.setLen(olen) + result &= buf + break + result &= buf + +proc newEncoderStream*(source: Stream, cs = CHARSET_UTF_8, buflen = 4096, + errormode: EncoderErrorMode = ENCODER_ERROR_MODE_HTML): EncoderStream = + result = EncoderStream( + source: source, + charset: cs, + buflen: buflen, + errormode: errormode + ) + when nimvm: + result.bufs = @[newSeq[uint8](buflen)] + else: + result.bufs = @[newSeqUninitialized[uint8](buflen)] diff --git a/src/html/htmlparser.nim b/src/html/htmlparser.nim index d03e0d24..f6af8709 100644 --- a/src/html/htmlparser.nim +++ b/src/html/htmlparser.nim @@ -2174,50 +2174,49 @@ proc finishParsing(parser: var HTML5Parser) = script.execute() #TODO events +proc bomSniff(inputStream: Stream): Charset = + # bom sniff + const u8bom = char(0xEF) & char(0xBB) & char(0xBF) + const bebom = char(0xFE) & char(0xFF) + const lebom = char(0xFF) & char(0xFE) + var bom = inputStream.readStr(2) + if bom == bebom: + return CHARSET_UTF_16_BE + elif bom == lebom: + return CHARSET_UTF_16_LE + else: + bom &= inputStream.readChar() + if bom == u8bom: + return CHARSET_UTF_8 + else: + inputStream.setPosition(0) + proc parseHTML*(inputStream: Stream, charsets: seq[Charset] = @[], - fallbackcs = CHARSET_UTF_8, window: Window = nil, - url: URL = nil, canReinterpret = true): Document = + fallbackcs = DefaultCharset, window: Window = nil, url: URL = nil, + canReinterpret = true): Document = var charsetStack: seq[Charset] for i in 
countdown(charsets.high, 0): charsetStack.add(charsets[i]) var canReinterpret = canReinterpret + var confidence: CharsetConfidence + let scs = inputStream.bomSniff() + if scs != CHARSET_UNKNOWN: + charsetStack.add(scs) + confidence = CONFIDENCE_CERTAIN + elif charsetStack.len == 0: + charsetStack.add(fallbackcs) while true: var parser: HTML5Parser - var bom: string - let islastcs = charsetStack.len == 0 - if not islastcs: - parser.charset = charsetStack.pop() - if not canReinterpret: - parser.confidence = CONFIDENCE_CERTAIN - else: - # bom sniff - const u8bom = char(0xEF) & char(0xBB) & char(0xBF) - const bebom = char(0xFE) & char(0xFF) - const lebom = char(0xFF) & char(0xFE) - bom = inputStream.readStr(2) - if bom == bebom: - parser.charset = CHARSET_UTF_16_BE - parser.confidence = CONFIDENCE_CERTAIN - bom = "" - elif bom == lebom: - parser.charset = CHARSET_UTF_16_LE - parser.confidence = CONFIDENCE_CERTAIN - bom = "" - else: - bom &= inputStream.readChar() - if bom == u8bom: - parser.charset = CHARSET_UTF_8 - parser.confidence = CONFIDENCE_CERTAIN - bom = "" - else: - parser.charset = fallbackcs - let em = if islastcs or not canReinterpret: + parser.confidence = confidence + confidence = CONFIDENCE_TENTATIVE + parser.charset = charsetStack.pop() + if not canReinterpret: + parser.confidence = CONFIDENCE_CERTAIN + let em = if charsetStack.len == 0 or not canReinterpret: DECODER_ERROR_MODE_REPLACEMENT else: DECODER_ERROR_MODE_FATAL let decoder = newDecoderStream(inputStream, parser.charset, errormode = em) - for c in bom: - decoder.prepend(cast[uint32](c)) parser.document = newDocument() parser.document.contentType = "text/html" if window != nil: diff --git a/src/main.nim b/src/main.nim index 0371c801..605fba94 100644 --- a/src/main.nim +++ b/src/main.nim @@ -35,7 +35,8 @@ Options: -c, --css <stylesheet> Pass stylesheet (e.g. -c 'a{color: blue}') -o, --opt <config> Pass config options (e.g. -o 'page.q="QUIT"') -T, --type <type> Specify content mime type - -I, --input-charset <name> Specify document charset + -I, --input-charset <enc> Specify document charset + -O, --display-charset <enc> Specify display charset -M, --monochrome Set color-mode to 'monochrome' -V, --visual Visual startup mode -r, --run <script/file> Run passed script or file @@ -84,6 +85,16 @@ while i < params.len: cs = some(c) else: help(1) + of "-O", "--output-charset": + inc i + if i < params.len: + let c = getCharset(params[i]) + if c == CHARSET_UNKNOWN: + stderr.write("Unknown charset " & params[i] & "\n") + quit(1) + conf.encoding.display_charset = some(c) + else: + help(1) of "-": discard # emulate programs that accept - as stdin of "-d", "-dump", "--dump": diff --git a/src/render/rendertext.nim b/src/render/rendertext.nim index d0576c75..ed79d60f 100644 --- a/src/render/rendertext.nim +++ b/src/render/rendertext.nim @@ -21,6 +21,8 @@ proc newStreamRenderer*(stream: Stream, charsets: seq[Charset]): StreamRenderer result.ansiparser.state = PARSE_DONE for i in countdown(charsets.high, 0): result.charsets.add(charsets[i]) + if charsets.len == 0: + result.charsets = @[DefaultCharset] let cs = result.charsets.pop() let em = if charsets.len > 0: DECODER_ERROR_MODE_FATAL |
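
Going back to the src/data/charset.nim hunk: getLocaleCharset takes the part of the locale after '.', normalizes it (lower-cased, with '-', '_' and control/space characters stripped) and looks it up in the normalized charset map; locales without a charset suffix are assumed to be UTF-8. A couple of worked examples (the exact labels come from the existing CharsetMap, which this diff does not show):

import data/charset

# ".UTF-8" normalizes to "utf8" and resolves through the normalized map:
doAssert getLocaleCharset("en_US.UTF-8") == CHARSET_UTF_8
# "ISO8859-2" normalizes to "iso88592", a standard label for Latin-2:
doAssert getLocaleCharset("hu_HU.ISO8859-2") == CHARSET_ISO_8859_2
# A locale without a charset suffix (e.g. "C") falls back to DefaultCharset.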