about summary refs log tree commit diff stats
path: root/baremetal/403unicode.mu
diff options
context:
space:
mode:
authorKartik Agaram <vc@akkartik.com>2021-01-09 18:55:24 -0800
committerKartik Agaram <vc@akkartik.com>2021-01-09 18:55:24 -0800
commit57e4978eac1e5deb544b07db77ac382433d35d03 (patch)
treebe0bd25ed1bbd7bc5a7d1d8cda114d904c13f631 /baremetal/403unicode.mu
parent0518944e379f343542c872a20d3d5a2aee744297 (diff)
downloadmu-57e4978eac1e5deb544b07db77ac382433d35d03.tar.gz
7492
Port some support for unicode to baremetal.
Diffstat (limited to 'baremetal/403unicode.mu')
-rw-r--r--baremetal/403unicode.mu193
1 files changed, 193 insertions, 0 deletions
diff --git a/baremetal/403unicode.mu b/baremetal/403unicode.mu
new file mode 100644
index 00000000..ea45f707
--- /dev/null
+++ b/baremetal/403unicode.mu
@@ -0,0 +1,193 @@
+# Helpers for Unicode.
+#
+# Mu has no characters, only code points and graphemes.
+# Code points are the indivisible atoms of text streams.
+#   https://en.wikipedia.org/wiki/Code_point
+# Graphemes are the smallest self-contained unit of text.
+# Graphemes may consist of multiple code points.
+#
+# Mu graphemes are always represented in utf-8, and they are required to fit
+# in 4 bytes.
+#
+# Mu doesn't currently support combining code points, or graphemes made of
+# multiple code points. One day we will.
+# We also don't currently support code points that translate into multiple
+# or wide graphemes. (In particular, Tab will never be supported.)
+
+# transliterated from tb_utf8_unicode_to_char in https://github.com/nsf/termbox
+# https://wiki.tcl-lang.org/page/UTF%2D8+bit+by+bit explains the algorithm
+#
+# The day we want to support combining characters, this function will need to
+# take multiple code points. Or something.
+fn to-grapheme in: code-point -> _/eax: grapheme {
+  var c/eax: int <- copy in
+  var num-trailers/ecx: int <- copy 0
+  var first/edx: int <- copy 0
+  $to-grapheme:compute-length: {
+    # single byte: just return it
+    compare c, 0x7f
+    {
+      break-if->
+      var g/eax: grapheme <- copy c
+      return g
+    }
+    # 2 bytes
+    compare c, 0x7ff
+    {
+      break-if->
+      num-trailers <- copy 1
+      first <- copy 0xc0
+      break $to-grapheme:compute-length
+    }
+    # 3 bytes
+    compare c, 0xffff
+    {
+      break-if->
+      num-trailers <- copy 2
+      first <- copy 0xe0
+      break $to-grapheme:compute-length
+    }
+    # 4 bytes
+    compare c, 0x1fffff
+    {
+      break-if->
+      num-trailers <- copy 3
+      first <- copy 0xf0
+      break $to-grapheme:compute-length
+    }
+    # more than 4 bytes: unsupported
+    # TODO: print error message to stderr
+    compare c, 0x1fffff
+    {
+      break-if->
+      return 0
+    }
+  }
+  # emit trailer bytes, 6 bits from 'in', first two bits '10'
+  var result/edi: grapheme <- copy 0
+  {
+    compare num-trailers, 0
+    break-if-<=
+    var tmp/esi: int <- copy c
+    tmp <- and 0x3f
+    tmp <- or 0x80
+    result <- shift-left 8
+    result <- or tmp
+    # update loop state
+    c <- shift-right 6
+    num-trailers <- decrement
+    loop
+  }
+  # emit engine
+  result <- shift-left 8
+  result <- or c
+  result <- or first
+  #
+  return result
+}
+
+# TODO: bring in tests once we have check-ints-equal
+
+# read the next grapheme from a stream of bytes
+fn read-grapheme in: (addr stream byte) -> _/eax: grapheme {
+  # if at eof, return EOF
+  {
+    var eof?/eax: boolean <- stream-empty? in
+    compare eof?, 0  # false
+    break-if-=
+    return 0xffffffff
+  }
+  var c/eax: byte <- read-byte in
+  var num-trailers/ecx: int <- copy 0
+  $read-grapheme:compute-length: {
+    # single byte: just return it
+    compare c, 0xc0
+    {
+      break-if->=
+      var g/eax: grapheme <- copy c
+      return g
+    }
+    compare c, 0xfe
+    {
+      break-if-<
+      var g/eax: grapheme <- copy c
+      return g
+    }
+    # 2 bytes
+    compare c, 0xe0
+    {
+      break-if->=
+      num-trailers <- copy 1
+      break $read-grapheme:compute-length
+    }
+    # 3 bytes
+    compare c, 0xf0
+    {
+      break-if->=
+      num-trailers <- copy 2
+      break $read-grapheme:compute-length
+    }
+    # 4 bytes
+    compare c, 0xf8
+    {
+      break-if->=
+      num-trailers <- copy 3
+      break $read-grapheme:compute-length
+    }
+    # TODO: print error message
+    return 0
+  }
+  # prepend trailer bytes
+  var result/edi: grapheme <- copy c
+  var num-byte-shifts/edx: int <- copy 1
+  {
+    compare num-trailers, 0
+    break-if-<=
+    var tmp/eax: byte <- read-byte in
+    var tmp2/eax: int <- copy tmp
+    tmp2 <- shift-left-bytes tmp2, num-byte-shifts
+    result <- or tmp2
+    # update loop state
+    num-byte-shifts <- increment
+    num-trailers <- decrement
+    loop
+  }
+  return result
+}
+
+# needed because available primitives only shift by a literal/constant number of bits
+fn shift-left-bytes n: int, k: int -> _/eax: int {
+  var i/ecx: int <- copy 0
+  var result/eax: int <- copy n
+  {
+    compare i, k
+    break-if->=
+    compare i, 4  # only 4 bytes in 32 bits
+    break-if->=
+    result <- shift-left 8
+    i <- increment
+    loop
+  }
+  return result
+}
+
+# write a grapheme to a stream of bytes
+# this is like write-to-stream, except we skip leading 0 bytes
+fn write-grapheme out: (addr stream byte), g: grapheme {
+$write-grapheme:body: {
+  var c/eax: int <- copy g
+  append-byte out, c  # first byte is always written
+  c <- shift-right 8
+  compare c, 0
+  break-if-= $write-grapheme:body
+  append-byte out, c
+  c <- shift-right 8
+  compare c, 0
+  break-if-= $write-grapheme:body
+  append-byte out, c
+  c <- shift-right 8
+  compare c, 0
+  break-if-= $write-grapheme:body
+  append-byte out, c
+}
+}