about summary refs log tree commit diff stats
path: root/403code-point.mu
diff options
context:
space:
mode:
authorKartik Agaram <vc@akkartik.com>2020-08-02 20:51:52 -0700
committerKartik Agaram <vc@akkartik.com>2020-08-02 20:53:36 -0700
commitab445d6e7538538e1dc2e5b419b2d2727d0d096f (patch)
treee5ddb41b2480a841753421babc35377258fd13cb /403code-point.mu
parentea2d44bdefef7ded28cc33dbb6ac85842dc6ca44 (diff)
downloadmu-ab445d6e7538538e1dc2e5b419b2d2727d0d096f.tar.gz
6710 - utf-8 encoding
Example program:

  fn main -> r/ebx: int {
    var x/eax: code-point <- copy 0x2192  # unicode character 'rightwards arrow'
    print-code-point 0, x
    print-string 0, "\n"
    r <- copy 0
  }

Run:
  $ ./translate_mu x.mu  &&  ./a.elf
  →
  $
Diffstat (limited to '403code-point.mu')
-rw-r--r--403code-point.mu88
1 files changed, 34 insertions, 54 deletions
diff --git a/403code-point.mu b/403code-point.mu
index 3b2b7205..da3115bc 100644
--- a/403code-point.mu
+++ b/403code-point.mu
@@ -10,7 +10,7 @@
 # in 4 bytes.
 #
 # Mu doesn't currently support combining code points, or graphemes made of
-# multiple code points.
+# multiple code points. One day we will.
 
 # transliterated from tb_utf8_unicode_to_char in https://github.com/nsf/termbox
 # https://wiki.tcl-lang.org/page/UTF%2D8+bit+by+bit explains the algorithm
@@ -63,7 +63,6 @@ $to-grapheme:body: {
     }
   }
   # emit trailer bytes, 6 bits from 'in', first two bits '10'
-  var byte-shifts/ebx: int <- copy 0
   var result/edi: int <- copy 0
   {
     compare num-trailers, 0
@@ -71,19 +70,17 @@ $to-grapheme:body: {
     var tmp/esi: int <- copy c
     tmp <- and 0x3f
     tmp <- or 0x80
-    tmp <- shift-left-bytes tmp, byte-shifts
+    result <- shift-left 8
     result <- or tmp
     # update loop state
     c <- shift-right 6
-    byte-shifts <- increment
     num-trailers <- decrement
     loop
   }
   # emit engine
-  var tmp/esi: int <- copy c
-  tmp <- or first
-  tmp <- shift-left-bytes tmp, byte-shifts
-  result <- or tmp
+  result <- shift-left 8
+  result <- or c
+  result <- or first
   #
   out <- copy result
 }
@@ -104,75 +101,58 @@ fn test-to-grapheme-single-byte {
   }
 }
 
+                                                              # byte       | byte      | byte      | byte
 # smallest 2-byte utf-8
 fn test-to-grapheme-two-bytes-min {
-  var in/eax: code-point <- copy 0x80       #        10     000000
+  var in/eax: code-point <- copy 0x80                         #                                 10     00-0000
   var out/eax: grapheme <- to-grapheme in
   var out-int/eax: int <- copy out
-  check-ints-equal out-int, 0xc280, "F 2gr" # 110 00010  10 000000
+  check-ints-equal out-int, 0x80c2, "F - to-grapheme/2a"      #                         110 0-0010  10 00-0000
 }
 
 # largest 2-byte utf-8
 fn test-to-grapheme-two-bytes-max {
-  var in/eax: code-point <- copy 0x7ff      #     11111     111111
+  var in/eax: code-point <- copy 0x7ff                        #                             1-1111     11-1111
   var out/eax: grapheme <- to-grapheme in
   var out-int/eax: int <- copy out
-  check-ints-equal out-int, 0xdfbf, "F 2gr" # 110 11111  10 111111
+  check-ints-equal out-int, 0xbfdf, "F - to-grapheme/2b"      #                         110 1-1111  10 11-1111
 }
 
 # smallest 3-byte utf-8
 fn test-to-grapheme-three-bytes-min {
-  var in/eax: code-point <- copy 0x800      #               100000     000000
+  var in/eax: code-point <- copy 0x800                        #                            10-0000     00-0000
   var out/eax: grapheme <- to-grapheme in
   var out-int/eax: int <- copy out
-  check-ints-equal out-int, 0xc280, "F 2gr" # 1110 0000  10 100000  10 000000
-}
-
-# needed because available primitives only shift by a literal/constant number of bits
-fn shift-left-bytes n: int, k: int -> result/esi: int {
-  var i/eax: int <- copy 0
-  result <- copy n
-  {
-    compare i, k
-    break-if->=
-    compare i, 4  # only 4 bytes in 32 bits
-    break-if->=
-    result <- shift-left 8
-    i <- increment
-    loop
-  }
+  check-ints-equal out-int, 0x80a0e0, "F - to-grapheme/3a"    #              1110 0000  10 10-0000  10 00-0000
 }
 
-fn test-shift-left-bytes-0 {
-  var result/esi: int <- shift-left-bytes 1, 0
-  check-ints-equal result, 1, "F - shift-left-bytes 0"
-}
-
-fn test-shift-left-bytes-1 {
-  var result/esi: int <- shift-left-bytes 1, 1
-  check-ints-equal result, 0x100, "F - shift-left-bytes 1"
-}
-
-fn test-shift-left-bytes-2 {
-  var result/esi: int <- shift-left-bytes 1, 2
-  check-ints-equal result, 0x10000, "F - shift-left-bytes 2"
-}
-
-fn test-shift-left-bytes-3 {
-  var result/esi: int <- shift-left-bytes 1, 3
-  check-ints-equal result, 0x1000000, "F - shift-left-bytes 3"
+# largest 3-byte utf-8
+fn test-to-grapheme-three-bytes-max {
+  var in/eax: code-point <- copy 0xffff                       #                   1111     11-1111     11-1111
+  var out/eax: grapheme <- to-grapheme in
+  var out-int/eax: int <- copy out
+  check-ints-equal out-int, 0xbfbfef, "F - to-grapheme/3b"    #              1110 1111  10 11-1111  10 11-1111
 }
 
-fn test-shift-left-bytes-4 {
-  var result/esi: int <- shift-left-bytes 1, 4
-  check-ints-equal result, 0, "F - shift-left-bytes 4"
+# smallest 4-byte utf-8
+fn test-to-grapheme-four-bytes-min {
+  var in/eax: code-point <- copy 0x10000                      #                 1-0000     00-0000     00-0000
+  var out/eax: grapheme <- to-grapheme in
+  var out-int/eax: int <- copy out
+  check-ints-equal out-int, 0x808090f0, "F - to-grapheme/4a"  # 1111-0 000  10 01-0000  10 00-0000  10 00-0000
 }
 
-fn test-shift-left-bytes-5 {
-  var result/esi: int <- shift-left-bytes 1, 5
-  check-ints-equal result, 0, "F - shift-left-bytes >4"
+# largest 4-byte utf-8
+fn test-to-grapheme-four-bytes-max {
+  var in/eax: code-point <- copy 0x1fffff                     #        111     11-1111     11-1111     11-1111
+  var out/eax: grapheme <- to-grapheme in
+  var out-int/eax: int <- copy out
+  check-ints-equal out-int, 0xbfbfbff7, "F - to-grapheme/4b"  # 1111-0 111  10 11-1111  10 11-1111  10 11-1111
 }
 
-#? fn main {
+# To run all tests, uncomment this and run:
+#   $ ./translate_mu ''  &&  ./a.elf
+#? fn main -> r/ebx: int {
 #?   run-tests
+#?   r <- copy 0
 #? }