about summary refs log tree commit diff stats
path: root/adapter
diff options
context:
space:
mode:
author: bptato <nincsnevem662@gmail.com> 2024-02-13 21:16:12 +0100
committer: bptato <nincsnevem662@gmail.com> 2024-02-25 02:46:21 +0100
commit: 6e98894199442e2213dc89e0c5fe970029f05b65 (patch)
tree: 57bf69a6fa825d72be1654482e8865b5e9b82829 /adapter
parent: d41d4803b5ed15b7e8461394ee07ce5ab1de143a (diff)
download: chawan-6e98894199442e2213dc89e0c5fe970029f05b65.tar.gz
Separate ANSI text decoding from main binary
Handling text/plain as ANSI colored text was problematic for two
reasons:

* You couldn't actually look at the real source of HTML pages or text
  files that used ANSI colors in the source.  In general, I only want
  ANSI colors when piping something into my pager, not when viewing any
  random file.
* More importantly, it introduced a separate rendering mode for
  plaintext documents, which resulted in the problem that only some
  buffers had DOMs.  This made it impossible to add functionality
  that would operate on the buffer's DOM, to e.g. implement w3m's
  MARK_URL.  Also, it locked us into the horribly inefficient line-based
  rendering model of entire documents.

Now we solve the problem in two separate parts:

* text/x-ansi is used automatically for documents received through
  stdin. A text/x-ansi handler ansi2html converts ANSI formatting to
  HTML.  text/x-ansi is also used for .ans, .asc file extensions.
* text/plain is a separate input mode in buffer, which places all text
  in a single <plaintext> tag.  Crucially, this does not invoke the HTML
  parser; that would eat NUL characters, which we should avoid.

One blind spot still remains: copiousoutput used to display ANSI colors,
and now it doesn't. To solve this, users can add the x-ansioutput
extension field to their mailcap entries; it behaves like x-htmloutput,
except that the output is first piped into ansi2html.
Diffstat (limited to 'adapter')
-rw-r--r-- adapter/format/ansi2html.nim 380
1 files changed, 380 insertions, 0 deletions
diff --git a/adapter/format/ansi2html.nim b/adapter/format/ansi2html.nim
new file mode 100644
index 00000000..a7242be7
--- /dev/null
+++ b/adapter/format/ansi2html.nim
@@ -0,0 +1,380 @@
+import std/options
+import std/selectors
+
+import io/posixstream
+import types/color
+import utils/twtstr
+
+type
+  # Text attributes that SGR sequences can switch on and off; the matching
+  # parameter values are listed in FormatCodes.
+  FormatFlag = enum
+    ffBold
+    ffItalic
+    ffUnderline
+    ffReverse
+    ffStrike
+    ffOverline
+    ffBlink
+
+  # Formatting of a run of text: colors plus attribute flags.
+  # The zero value (`Format()`) means "no formatting"; flushFmt emits no
+  # <span> for it.
+  Format = object
+    fgcolor: CellColor # foreground (text) color
+    bgcolor: CellColor # background color
+    flags: set[FormatFlag]
+
+# https://www.ecma-international.org/wp-content/uploads/ECMA-48_5th_edition_june_1991.pdf
+type
+  # States of the state machine driven by parseAnsiCode.
+  # acpsDone (the zero value) means that no sequence is being parsed and
+  # input is handled as ordinary text; the acpsStart..acpsFinal states follow
+  # an ESC byte, and the *Backspace* states follow a backspace byte.
+  AnsiCodeParseState = enum
+    acpsDone, acpsStart, acpsParams, acpsInterm, acpsFinal, acpsBackspace,
+    acpsInBackspaceTransition, acpsInBackspace
+
+  # Parser state for one control sequence.
+  AnsiCodeParser = object
+    state: AnsiCodeParseState
+    params: string # parameter bytes (0x30..0x3F) collected in acpsParams
+
+# Return the parameter that starts at index `i` of parser.params, i.e. all
+# characters up to (but excluding) the next ';' - or the next ':' as well if
+# `colon` is true. On return, `i` points just past the separator, or to the
+# end of the parameter string if no separator was found.
+proc getParam(parser: AnsiCodeParser, i: var int, colon = false): string =
+  let first = i
+  while i < parser.params.len:
+    let ch = parser.params[i]
+    if ch == ';' or (colon and ch == ':'):
+      break
+    inc i
+  result = parser.params[first..<i]
+  if i < parser.params.len:
+    inc i # step over the separator
+
+# Read the parameter starting at `i` as an uint8 and advance `i` past it.
+# If `colon` is true, ':' separates parameters in addition to ';'.
+#
+# Note: this is a template, so the `return false` statements below return
+# from the *calling* proc. It can therefore only be used inside procs that
+# return bool, and it makes them return false when there is no parameter
+# left or when the parameter is not a valid uint8.
+template getParamU8(parser: AnsiCodeParser, i: var int,
+    colon = false): uint8 =
+  if i >= parser.params.len:
+    return false
+  # `colon` must be forwarded here; otherwise ':'-separated sub-parameters
+  # are never split, no matter what the caller asked for.
+  let u = parseUInt8(parser.getParam(i, colon))
+  if u.isNone:
+    return false
+  u.get
+
+# Parse the sub-parameters that follow SGR 38 (isfg = true) or SGR 48
+# (isfg = false) and store the resulting color in `format`.
+#
+# Supported forms (after the 38/48 itself):
+# * 2;r;g;b  direct RGB color
+# * 2;n      (no further parameters) gray level n
+# * 5;n      palette color; 0..15 are the ANSI colors, 16..255 the remaining
+#            eight-bit colors
+#
+# `i` is advanced past every parameter that was consumed.
+# Returns true if a color was set, so that the caller goes on with the next
+# parameter; returns false on an unknown color type or on a missing or
+# malformed parameter (the latter through getParamU8).
+proc parseSGRDefColor(parser: AnsiCodeParser, format: var Format,
+    i: var int, isfg: bool): bool =
+  let u = parser.getParamU8(i, colon = true)
+  template set_color(c: CellColor) =
+    if isfg:
+      format.fgcolor = c
+    else:
+      format.bgcolor = c
+  if u == 2:
+    let param0 = parser.getParamU8(i, colon = true)
+    if i < parser.params.len:
+      let r = param0
+      let g = parser.getParamU8(i, colon = true)
+      let b = parser.getParamU8(i, colon = true)
+      set_color cellColor(rgb(r, g, b))
+    else:
+      set_color cellColor(gray(param0))
+  elif u == 5:
+    let param0 = parser.getParamU8(i, colon = true)
+    if param0 in 0u8..15u8:
+      set_color cellColor(ANSIColor(param0))
+    else: # 16..255
+      set_color cellColor(EightBitColor(param0))
+  else:
+    return false
+  # Report success explicitly. Falling off the end would leave the implicit
+  # result at false, which makes parseSGR drop all parameters that follow
+  # the color (e.g. the bold in "38;5;196;1").
+  return true
+
+# Apply the color-related SGR parameter `u` to `format`.
+# 30..37 / 40..47 select ANSI colors 0..7 for the foreground / background,
+# 90..97 / 100..107 select ANSI colors 8..15, 39 / 49 restore the default
+# color, and 38 / 48 are delegated to parseSGRDefColor, which reads further
+# parameters starting at `i`.
+# Returns false if `u` is not a color parameter.
+proc parseSGRColor(parser: AnsiCodeParser, format: var Format,
+    i: var int, u: uint8): bool =
+  case u
+  of 30u8..37u8:
+    format.fgcolor = cellColor(ANSIColor(u - 30))
+  of 38u8:
+    return parser.parseSGRDefColor(format, i, isfg = true)
+  of 39u8:
+    format.fgcolor = defaultColor
+  of 40u8..47u8:
+    format.bgcolor = cellColor(ANSIColor(u - 40))
+  of 48u8:
+    return parser.parseSGRDefColor(format, i, isfg = false)
+  of 49u8:
+    format.bgcolor = defaultColor
+  of 90u8..97u8:
+    format.fgcolor = cellColor(ANSIColor(u - 82))
+  of 100u8..107u8:
+    format.bgcolor = cellColor(ANSIColor(u - 92))
+  else:
+    return false
+  true
+
+# SGR parameter values for each attribute flag: `s` is the value that
+# switches the attribute on, `e` the value that switches it off again
+# (see parseSGRAspect).
+const FormatCodes: array[FormatFlag, tuple[s, e: uint8]] = [
+  ffBold: (1u8, 22u8),
+  ffItalic: (3u8, 23u8),
+  ffUnderline: (4u8, 24u8),
+  ffReverse: (7u8, 27u8),
+  ffStrike: (9u8, 29u8),
+  ffOverline: (53u8, 55u8),
+  ffBlink: (5u8, 25u8),
+]
+
+# Parse one SGR parameter starting at `i` and apply it to `format`:
+# 0 resets the format, the values in FormatCodes set or clear a flag, and
+# everything else is tried as a color parameter.
+# Returns false if the parameter is missing, malformed or unknown.
+proc parseSGRAspect(parser: AnsiCodeParser, format: var Format,
+    i: var int): bool =
+  let code = parser.getParamU8(i)
+  if code == 0:
+    format = Format()
+    return true
+  for flag, codes in FormatCodes:
+    if code == codes.s:
+      format.flags.incl(flag)
+      return true
+    if code == codes.e:
+      format.flags.excl(flag)
+      return true
+  parser.parseSGRColor(format, i, code)
+
+# Apply a complete SGR sequence to `format`. An empty parameter string
+# resets the format; otherwise the parameters are applied one after another
+# until the end is reached or one of them cannot be parsed.
+proc parseSGR(parser: AnsiCodeParser, format: var Format) =
+  if parser.params.len == 0:
+    format = Format()
+    return
+  var pos = 0
+  while pos < parser.params.len and parser.parseSGRAspect(format, pos):
+    discard
+
+# Execute the control function whose final byte is `f`. Only SGR ('m') is
+# implemented; every other function is ignored.
+proc parseControlFunction(parser: var AnsiCodeParser, format: var Format,
+    f: char) =
+  case f
+  of 'm': parser.parseSGR(format)
+  else: discard # unknown
+
+# Prepare the parser for a new escape sequence: forget the collected
+# parameters and wait for the byte that follows ESC.
+proc reset(parser: var AnsiCodeParser) =
+  parser.params = ""
+  parser.state = acpsStart
+
+# Complete state of the converter.
+type State = object
+  os: PosixStream # stream that flushOutbuf writes to
+  outbufIdx: int # number of bytes currently buffered in outbuf
+  outbuf: array[4096, char] # output buffer, filled by putc
+  parser: AnsiCodeParser
+  currentFmt: Format # format that flushFmt has last written to the output
+  pendingFmt: Format # format requested by the input, not yet written
+  # flags added to pendingFmt only for an overstruck character; they are
+  # removed again in the acpsInBackspace state
+  tmpFlags: set[FormatFlag]
+  af: bool # NOTE(review): not referenced anywhere in the visible code
+  spanOpen: bool # true while a <span> written by flushFmt is still open
+  # checked before a backspace overstrikes the end of outbuf; set by
+  # processData, cleared by flushFmt after it has written a <span>
+  hasPrintingBuf: bool
+  backspaceDecay: int # NOTE(review): not referenced in the visible code
+
+# Hand the buffered output to the output stream and empty the buffer.
+# The result of sendData is discarded.
+proc flushOutbuf(state: var State) =
+  if state.outbufIdx <= 0:
+    return
+  discard state.os.sendData(addr state.outbuf[0], state.outbufIdx)
+  state.outbufIdx = 0
+
+# Append one byte to the output buffer. The buffer is flushed first when
+# four bytes or fewer remain free (four being the maximum length of an
+# UTF-8 character).
+proc putc(state: var State, c: char) {.inline.} =
+  if state.outbuf.len - state.outbufIdx <= 4: # max utf-8 char length
+    state.flushOutbuf()
+  state.outbuf[state.outbufIdx] = c
+  state.outbufIdx += 1
+
+# Append all bytes of `s` to the output buffer.
+proc puts(state: var State, s: string) =
+  #TODO this is slower than it could be
+  for idx in 0 ..< s.len:
+    state.putc(s[idx])
+
+# Append all bytes of `s` to the output buffer.
+proc puts(state: var State, s: openArray[char]) =
+  #TODO this is slower than it could be
+  for idx in s.low .. s.high:
+    state.putc(s[idx])
+
+# Append all bytes of the compile-time constant `s` to the output buffer.
+proc puts(state: var State, s: static string) {.inline.} =
+  for idx in 0 ..< s.len:
+    state.putc(s[idx])
+
+# Write the markup that switches the output from currentFmt to pendingFmt.
+# Nothing is written if the two are equal. Otherwise the open <span> (if
+# any) is closed, and unless pendingFmt is the default format, a new <span>
+# with an inline style describing pendingFmt is opened.
+#
+# Whenever markup is written, hasPrintingBuf is cleared: the output buffer
+# then ends in a tag, which a following backspace must not overstrike.
+proc flushFmt(state: var State) =
+  if state.pendingFmt != state.currentFmt:
+    if state.spanOpen:
+      state.puts("</span>")
+      # also needed on the early return below, or a backspace right after
+      # the formatting was reset would cut the '>' off this tag
+      state.hasPrintingBuf = false
+    if state.pendingFmt == Format():
+      # back to unformatted text; no new span is needed
+      state.currentFmt = state.pendingFmt
+      state.spanOpen = false
+      return
+    state.spanOpen = true
+    state.puts("<span style='")
+    let fmt = state.pendingFmt
+    var buf = ""
+    if fmt.fgcolor.t != ctNone:
+      buf &= "color: "
+      case fmt.fgcolor.t
+      of ctNone: discard
+      of ctANSI: buf &= "-cha-ansi(" & $fmt.fgcolor.color & ")"
+      of ctRGB: buf &= $fmt.fgcolor
+      buf &= ";"
+    if fmt.bgcolor.t != ctNone:
+      buf &= "background-color: "
+      case fmt.bgcolor.t
+      of ctNone: discard
+      of ctANSI: buf &= "-cha-ansi(" & $fmt.bgcolor.color & ")"
+      of ctRGB: buf &= $fmt.bgcolor
+      buf &= ";"
+    # all line decorations go into a single text-decoration declaration
+    if ffOverline in fmt.flags or ffUnderline in fmt.flags or
+        ffStrike in fmt.flags or ffBlink in fmt.flags:
+      buf &= "text-decoration: "
+      if ffOverline in fmt.flags:
+        buf &= "overline "
+      if ffUnderline in fmt.flags:
+        buf &= "underline "
+      if ffStrike in fmt.flags:
+        buf &= "line-through "
+      if ffBlink in fmt.flags:
+        buf &= "blink "
+      buf &= ";"
+    if ffBold in fmt.flags:
+      buf &= "font-weight: bold;"
+    if ffItalic in fmt.flags:
+      buf &= "font-style: italic;"
+    #TODO reverse
+    buf &= "'>"
+    state.puts(buf)
+    state.currentFmt = fmt
+    state.hasPrintingBuf = false
+
+# Result of parseAnsiCode: pacrProcess means that the caller must still
+# handle the character as ordinary input, pacrSkip means that the character
+# was consumed.
+type ParseAnsiCodeResult = enum
+  pacrProcess, pacrSkip
+
+# Feed one input byte into the escape sequence / overstrike state machine.
+# processData calls this for every byte while the parser state is not
+# acpsDone.
+#
+# `format` receives the effect of a completed control function (the caller
+# passes state.pendingFmt); the backspace states modify state.pendingFmt
+# and the output buffer directly.
+#
+# Returns pacrSkip if `c` was consumed as part of a sequence (pending format
+# changes are flushed first), or pacrProcess if the caller must handle `c`
+# as ordinary input.
+proc parseAnsiCode(state: var State, format: var Format, c: char):
+    ParseAnsiCodeResult =
+  case state.parser.state
+  of acpsStart:
+    # first byte after ESC
+    if 0x40 <= int(c) and int(c) <= 0x5F:
+      if c != '[':
+        #C1, TODO?
+        state.parser.state = acpsDone
+      else:
+        # ESC [ starts a control sequence; collect its parameters next
+        state.parser.state = acpsParams
+    else:
+      state.parser.state = acpsDone
+      return pacrProcess
+  of acpsParams:
+    if 0x30 <= int(c) and int(c) <= 0x3F:
+      state.parser.params &= c
+    else:
+      # not a parameter byte; re-examine it in the next state
+      state.parser.state = acpsInterm
+      return state.parseAnsiCode(format, c)
+  of acpsInterm:
+    if 0x20 <= int(c) and int(c) <= 0x2F:
+      # intermediate bytes are skipped
+      discard
+    else:
+      state.parser.state = acpsFinal
+      return state.parseAnsiCode(format, c)
+  of acpsFinal:
+    state.parser.state = acpsDone
+    if 0x40 <= int(c) and int(c) <= 0x7E:
+      state.parser.parseControlFunction(format, c)
+    else:
+      # not a valid final byte; give up on the sequence and treat `c` as text
+      return pacrProcess
+  of acpsDone:
+    discard
+  of acpsBackspace:
+    # We used to emulate less here, but it seems to yield dubious benefits
+    # considering that
+    # a) the only place backspace-based formatting is used in is manpages
+    # b) we have w3mman now, which is superior in all respects, so this is
+    # pretty much never used
+    # c) if we drop generality, the output can be parsed much more efficiently
+    # (without having to buffer the entire line first)
+    #
+    # So we buffer only the last non-formatted UTF-8 char, and override it when
+    # necessary.
+    if not state.hasPrintingBuf:
+      state.parser.state = acpsDone
+      return pacrProcess
+    # find the first byte of the last character in the output buffer by
+    # skipping UTF-8 continuation bytes (10xxxxxx) backwards
+    var i = state.outbufIdx - 1
+    while true:
+      if i < 0:
+        # nothing left in the buffer to overstrike
+        state.parser.state = acpsDone
+        return pacrProcess
+      if (int(state.outbuf[i]) and 0xC0) != 0x80:
+        break
+      dec i
+    if state.outbuf[i] == '_' or c == '_':
+      # underline for underscore overstrike
+      if ffUnderline notin state.pendingFmt.flags:
+        state.tmpFlags.incl(ffUnderline)
+        state.pendingFmt.flags.incl(ffUnderline)
+      elif c == '_' and ffBold notin state.pendingFmt.flags:
+        state.tmpFlags.incl(ffBold)
+        state.pendingFmt.flags.incl(ffBold)
+    else:
+      # represent *any* non-underline overstrike with bold.
+      # it is sloppy, but enough for our purposes.
+      if ffBold notin state.pendingFmt.flags:
+        state.tmpFlags.incl(ffBold)
+        state.pendingFmt.flags.incl(ffBold)
+    state.outbufIdx = i # move back output pointer
+    state.parser.state = acpsInBackspaceTransition
+    state.flushFmt()
+    # `c` replaces the removed character; the caller writes it
+    return pacrProcess
+  of acpsInBackspaceTransition:
+    if (int(c) and 0xC0) != 0x80:
+      # backspace char end, next char begin
+      state.parser.state = acpsInBackspace
+    return pacrProcess
+  of acpsInBackspace:
+    if (int(c) and 0xC0) != 0x80:
+      # second char after backspaced char begin
+      if c == '\b':
+        # got backspace again, overstriking previous char. here we don't have to
+        # override anything
+        state.parser.state = acpsBackspace
+        return pacrProcess
+      # welp. we have to fixup the previous char's formatting
+      var i = state.outbufIdx - 1
+      while true:
+        assert i >= 0
+        if (int(state.outbuf[i]) and 0xC0) != 0x80:
+          break
+        dec i
+      # take the previous character out of the buffer, drop the temporary
+      # flags, write the resulting format change, then put the character back
+      let s = state.outbuf[i..<state.outbufIdx]
+      state.outbufIdx = i
+      for flag in FormatFlag:
+        if flag in state.tmpFlags:
+          state.pendingFmt.flags.excl(flag)
+      state.tmpFlags = {}
+      state.flushFmt()
+      state.puts(s)
+      state.parser.state = acpsDone
+    return pacrProcess
+  # reached by the escape sequence states when `c` was consumed
+  state.flushFmt()
+  pacrSkip
+
+# Convert a chunk of input to HTML and append it to the output buffer.
+#
+# While an escape sequence or an overstrike is in progress, every byte is
+# first offered to parseAnsiCode. Bytes that remain are written as text:
+# characters that are special in HTML are escaped, NUL is replaced with
+# U+FFFD, and ESC / backspace switch the parser into the matching state.
+#
+# hasPrintingBuf is updated according to what was appended: it is set after
+# a literal character, cleared after an entity (so that a backspace cannot
+# cut the entity in half), and left alone for ESC and backspace, which
+# append nothing.
+proc processData(state: var State, buf: openArray[char]) =
+  template putEntity(s: untyped) =
+    state.puts(s)
+    state.hasPrintingBuf = false
+  for c in buf:
+    if state.parser.state != acpsDone:
+      case state.parseAnsiCode(state.pendingFmt, c)
+      of pacrSkip: continue
+      of pacrProcess: discard
+    case c
+    # '&' must be escaped too, or input like "&lt;" is decoded by the HTML
+    # parser instead of being shown literally
+    of '&': putEntity("&amp;")
+    of '<': putEntity("&lt;")
+    of '>': putEntity("&gt;")
+    of '\'': putEntity("&apos;")
+    of '"': putEntity("&quot;")
+    of '\e': state.parser.reset()
+    of '\b': state.parser.state = acpsBackspace
+    of '\0':
+      state.puts("\uFFFD") # HTML eats NUL, so replace it here
+      state.hasPrintingBuf = true
+    else:
+      state.putc(c)
+      state.hasPrintingBuf = true
+
+# Entry point: read text from standard input without blocking, convert it
+# with processData and write the resulting HTML to standard output.
+proc main() =
+  let input = newPosixStream(stdin.getFileHandle())
+  var state = State(os: newPosixStream(stdout.getFileHandle()))
+  state.puts("<!DOCTYPE html>\n<body><pre style='margin: 0'>")
+  input.setBlocking(false)
+  var chunk {.noinit.}: array[4096, char]
+  var poller = newSelector[int]()
+  block mainloop:
+    while true:
+      try:
+        let nread = input.recvData(chunk.toOpenArrayByte(0, chunk.high))
+        if nread == 0:
+          # end of input
+          break mainloop
+        state.processData(chunk.toOpenArray(0, nread - 1))
+      except ErrorAgain:
+        # No input available right now: write out what we have, then wait
+        # until the input becomes readable again.
+        state.flushOutbuf()
+        poller.registerHandle(input.fd, {Read}, 0)
+        let ready = poller.select(-1)
+        for ev in ready:
+          if Error in ev.events:
+            break mainloop
+        poller.unregister(input.fd)
+  state.flushOutbuf()
+
+main()