about summary refs log tree commit diff stats
path: root/baremetal/403unicode.mu
diff options
context:
space:
mode:
author Kartik K. Agaram <vc@akkartik.com> 2021-03-03 22:09:50 -0800
committer Kartik K. Agaram <vc@akkartik.com> 2021-03-03 22:21:03 -0800
commit 71e4f3812982dba2efb471283d310224e8db363e (patch)
tree ea111a1acb8b8845dbda39c0e1b4bac1d198143b /baremetal/403unicode.mu
parent c6b928be29ac8cdb4e4d6e1eaa20420ff03e5a4c (diff)
download mu-71e4f3812982dba2efb471283d310224e8db363e.tar.gz
7842 - new directory organization
Baremetal is now the default build target and therefore has its sources
at the top-level. Baremetal programs build using the phase-2 Mu toolchain
that requires a Linux kernel. This phase-2 codebase which used to be at
the top-level is now under the linux/ directory. Finally, the phase-2 toolchain,
while self-hosting, has a way to bootstrap from a C implementation, which
is now stored in linux/bootstrap. The bootstrap C implementation uses some
literate programming tools that are now in linux/bootstrap/tools.

So the whole thing has gotten inverted. Each directory should build one
artifact and include the main sources (along with standard library). Tools
used for building it are relegated to sub-directories, even though those
tools are often useful in their own right, and have had lots of interesting
programs written using them.

A couple of things have gotten dropped in this process:
  - I had old ways to run on just a Linux kernel, or with a Soso kernel.
    No more.
  - I had some old tooling for running a single test at the cursor. I haven't
    used that lately. Maybe I'll bring it back one day.

The reorg isn't done yet. Still to do:
  - redo documentation everywhere. All the README files, all other markdown,
    particularly vocabulary.md.
  - clean up how-to-run comments at the start of programs everywhere
  - rethink what to do with the html/ directory. Do we even want to keep
    supporting it?

In spite of these shortcomings, all the scripts at the top-level, linux/
and linux/bootstrap are working. The names of the scripts also feel reasonable.
This is a good milestone to take stock at.
Diffstat (limited to 'baremetal/403unicode.mu')
-rw-r--r--baremetal/403unicode.mu193
1 files changed, 0 insertions, 193 deletions
diff --git a/baremetal/403unicode.mu b/baremetal/403unicode.mu
deleted file mode 100644
index 6ec30c3d..00000000
--- a/baremetal/403unicode.mu
+++ /dev/null
@@ -1,193 +0,0 @@
-# Helpers for Unicode.
-#
-# Mu has no characters, only code points and graphemes.
-# Code points are the indivisible atoms of text streams.
-#   https://en.wikipedia.org/wiki/Code_point
-# Graphemes are the smallest self-contained unit of text.
-# Graphemes may consist of multiple code points.
-#
-# Mu graphemes are always represented in utf-8, and they are required to fit
-# in 4 bytes.
-#
-# Mu doesn't currently support combining code points, or graphemes made of
-# multiple code points. One day we will.
-# We also don't currently support code points that translate into multiple
-# or wide graphemes. (In particular, Tab will never be supported.)
-
-# Encode code point 'in' as utf-8, first byte in the lowest byte of the result.
-# Returns 0 if 'in' would need more than 4 bytes.
-# Transliterated from tb_utf8_unicode_to_char in https://github.com/nsf/termbox;
-# https://wiki.tcl-lang.org/page/UTF%2D8+bit+by+bit explains the algorithm.
-# To support combining characters, this will need to take multiple code points.
-fn to-grapheme in: code-point -> _/eax: grapheme {
-  var c/eax: int <- copy in
-  var num-trailers/ecx: int <- copy 0  # bytes to emit besides the first
-  var first/edx: int <- copy 0  # length-marker bits of the first byte
-  $to-grapheme:compute-length: {
-    # single byte: just return it
-    compare c, 0x7f
-    {
-      break-if->
-      var g/eax: grapheme <- copy c
-      return g
-    }
-    # 2 bytes
-    compare c, 0x7ff
-    {
-      break-if->
-      num-trailers <- copy 1
-      first <- copy 0xc0
-      break $to-grapheme:compute-length
-    }
-    # 3 bytes
-    compare c, 0xffff
-    {
-      break-if->
-      num-trailers <- copy 2
-      first <- copy 0xe0
-      break $to-grapheme:compute-length
-    }
-    # 4 bytes
-    compare c, 0x1fffff
-    {
-      break-if->
-      num-trailers <- copy 3
-      first <- copy 0xf0
-      break $to-grapheme:compute-length
-    }
-    # more than 4 bytes: unsupported
-    # TODO: print error message to stderr
-    # Every code point <= 0x1fffff has already left this block above, so
-    # reaching this line means 'c' is too large. Don't guard the return with
-    # 'break-if->': that skips it for exactly the values it should reject,
-    # and they fall through to be emitted unencoded.
-    return 0
-  }
-  # emit trailer bytes, 6 bits from 'in', first two bits '10'
-  var result/edi: grapheme <- copy 0
-  {
-    compare num-trailers, 0
-    break-if-<=
-    var tmp/esi: int <- copy c
-    tmp <- and 0x3f
-    tmp <- or 0x80
-    result <- shift-left 8
-    result <- or tmp
-    # update loop state
-    c <- shift-right 6
-    num-trailers <- decrement
-    loop
-  }
-  # emit engine: the first byte, i.e. the remaining bits of 'in' plus the length marker
-  result <- shift-left 8
-  result <- or c
-  result <- or first
-  #
-  return result
-}
-
-# TODO: bring in tests once we have check-ints-equal
-
-# Read the next utf-8 grapheme from 'in'. The first byte read ends up in the lowest byte of the result.
-fn read-grapheme in: (addr stream byte) -> _/eax: grapheme {
-  # if at eof, return EOF
-  {
-    var eof?/eax: boolean <- stream-empty? in
-    compare eof?, 0/false
-    break-if-=
-    return 0xffffffff
-  }
-  var c/eax: byte <- read-byte in
-  var num-trailers/ecx: int <- copy 0  # bytes still to read after the first
-  $read-grapheme:compute-length: {
-    # single byte (anything below 0xc0): just return it
-    compare c, 0xc0
-    {
-      break-if->=
-      var g/eax: grapheme <- copy c
-      return g
-    }
-    compare c, 0xfe  # bytes from 0xfe up are also returned as is
-    {
-      break-if-<
-      var g/eax: grapheme <- copy c
-      return g
-    }
-    # 2 bytes
-    compare c, 0xe0
-    {
-      break-if->=
-      num-trailers <- copy 1
-      break $read-grapheme:compute-length
-    }
-    # 3 bytes
-    compare c, 0xf0
-    {
-      break-if->=
-      num-trailers <- copy 2
-      break $read-grapheme:compute-length
-    }
-    # 4 bytes
-    compare c, 0xf8
-    {
-      break-if->=
-      num-trailers <- copy 3
-      break $read-grapheme:compute-length
-    }
-    # first byte in 0xf8..0xfd: unsupported, return 0 (TODO: print error message)
-    return 0
-  }
-  # prepend trailer bytes: the i'th byte read after the first goes into byte i of the result
-  var result/edi: grapheme <- copy c
-  var num-byte-shifts/edx: int <- copy 1
-  {
-    compare num-trailers, 0
-    break-if-<=
-    var tmp/eax: byte <- read-byte in  # NOTE(review): no eof check before this read -- confirm what read-byte does on a truncated sequence
-    var tmp2/eax: int <- copy tmp
-    tmp2 <- shift-left-bytes tmp2, num-byte-shifts
-    result <- or tmp2
-    # update loop state
-    num-byte-shifts <- increment
-    num-trailers <- decrement
-    loop
-  }
-  return result
-}
-
-# Shift 'n' left by 'k' whole bytes (at most 4). Exists because shift-left only takes a literal bit count.
-fn shift-left-bytes n: int, k: int -> _/eax: int {
-  var acc/eax: int <- copy n
-  var bytes-shifted/ecx: int <- copy 0
-  {
-    compare bytes-shifted, 4  # a 32-bit word holds only 4 bytes
-    break-if->=
-    compare bytes-shifted, k
-    break-if->=
-    acc <- shift-left 8
-    bytes-shifted <- increment
-    loop
-  }
-  return acc
-}
-
-# Append the bytes of grapheme 'g' to 'out', lowest byte first.
-# This is like write-to-stream, except that writing stops once all the
-# bytes still unwritten are 0.
-fn write-grapheme out: (addr stream byte), g: grapheme {
-  var rest/eax: int <- copy g
-  var num-written/ecx: int <- copy 0
-  {
-    # the first byte is always written, even if it's 0
-    append-byte out, rest
-    num-written <- increment
-    # a grapheme has at most 4 bytes
-    compare num-written, 4
-    break-if->=
-    # stop as soon as every byte still unwritten is 0
-    rest <- shift-right 8
-    compare rest, 0
-    break-if-=
-    loop
-  }
-}