about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Kartik Agaram <vc@akkartik.com> 2020-08-28 23:24:04 -0700
committer Kartik Agaram <vc@akkartik.com> 2020-08-28 23:24:04 -0700
commit cd94852dbc713ff38f38a30d6e5fb4675606823c (patch)
tree dc2a52048a609937cf0de0d0906336d52419ef65
parent 392ebcce803423631de77fdc85c837be636078bb (diff)
download mu-cd94852dbc713ff38f38a30d6e5fb4675606823c.tar.gz
6733 - read utf-8 'grapheme' from byte stream
No support for combining characters. Graphemes are currently just utf-8
encodings of a single Unicode code-point. No support for code-points that
require more than 32 bits in utf-8.
-rw-r--r-- 112read-byte.subx 46
-rw-r--r-- 400.mu 1
-rw-r--r-- 403unicode.mu 143
-rwxr-xr-x apps/assort bin 44420 -> 44513 bytes
-rwxr-xr-x apps/braces bin 46283 -> 46376 bytes
-rwxr-xr-x apps/calls bin 50930 -> 51023 bytes
-rwxr-xr-x apps/crenshaw2-1 bin 43761 -> 43854 bytes
-rwxr-xr-x apps/crenshaw2-1b bin 44308 -> 44401 bytes
-rwxr-xr-x apps/dquotes bin 48042 -> 48135 bytes
-rwxr-xr-x apps/factorial bin 42864 -> 42957 bytes
-rwxr-xr-x apps/hex bin 46600 -> 46693 bytes
-rwxr-xr-x apps/mu bin 388322 -> 388415 bytes
-rwxr-xr-x apps/pack bin 56999 -> 57092 bytes
-rwxr-xr-x apps/sigils bin 58652 -> 58745 bytes
-rwxr-xr-x apps/survey bin 54352 -> 54445 bytes
-rwxr-xr-x apps/tests bin 43192 -> 43285 bytes
16 files changed, 189 insertions, 1 deletions
diff --git a/112read-byte.subx b/112read-byte.subx
index 387cbb66..32f89647 100644
--- a/112read-byte.subx
+++ b/112read-byte.subx
@@ -33,7 +33,7 @@ $Stdin->buffer:
 # . op          subop               mod             rm32          base        index         scale       r32
 # . 1-3 bytes   3 bits              2 bits          3 bits        3 bits      3 bits        2 bits      2 bits      0/1/2/4 bytes   0/1/2/4 bytes
 
-# return next byte value in eax, with top 3 bytes cleared.
+# Return next byte value in eax, with top 3 bytes cleared.
 # On reaching end of file, return 0xffffffff (Eof).
 read-byte-buffered:  # f: (addr buffered-file) -> byte-or-Eof/eax: byte
     # . prologue
@@ -268,6 +268,50 @@ test-read-byte-buffered-refills-buffer:
     # . end
     c3/return
 
+# Return next byte value in eax, with top 3 bytes cleared.
+# Abort (message to stderr, exit status 1) if the stream has no unread bytes.
+read-byte:  # s: (addr stream byte) -> result/eax: byte
+    # . prologue
+    55/push-ebp
+    89/copy                         3/mod/direct    5/rm32/ebp    .           .             .           4/r32/esp   .               .                 # copy esp to ebp
+    # . save registers
+    51/push-ecx
+    56/push-esi
+    # esi = s
+    8b/copy                         1/mod/*+disp8   5/rm32/ebp    .           .             .           6/r32/esi   8/disp8         .                 # copy *(ebp+8) to esi
+    # ecx = s->read
+    8b/copy                         1/mod/*+disp8   6/rm32/esi    .           .             .           1/r32/ecx   4/disp8         .                 # copy *(esi+4) to ecx
+    # if (s->read >= s->write) abort
+    3b/compare                      0/mod/indirect  6/rm32/esi    .           .             .           1/r32/ecx   .               .                 # compare ecx with *esi (s->write)
+    0f 8d/jump-if->=  $read-byte:abort/disp32
+    # result = s->data[s->read]
+    31/xor                          3/mod/direct    0/rm32/eax    .           .             .           0/r32/eax   .               .                 # clear eax, so the top 3 bytes of the result stay 0
+    8a/copy-byte                    1/mod/*+disp8   4/rm32/sib    6/base/esi  1/index/ecx   .           0/r32/AL    0xc/disp8       .                 # copy byte at *(esi+ecx+12) to AL
+    # ++s->read
+    ff          0/subop/increment   1/mod/*+disp8   6/rm32/esi    .           .             .           .           4/disp8         .                 # increment *(esi+4)
+$read-byte:end:
+    # . restore registers
+    5e/pop-to-esi
+    59/pop-to-ecx
+    # . epilogue
+    89/copy                         3/mod/direct    4/rm32/esp    .           .             .           5/r32/ebp   .               .                 # copy ebp to esp
+    5d/pop-to-ebp
+    c3/return
+
+$read-byte:abort:
+    # . _write(2/stderr, error)
+    # . . push args
+    68/push  "read-byte: empty stream\n"/imm32
+    68/push  2/imm32/stderr
+    # . . call
+    e8/call  _write/disp32
+    # . . discard args
+    81          0/subop/add         3/mod/direct    4/rm32/esp    .           .             .           .           .               8/imm32           # add to esp
+    # . syscall(exit, 1)
+    bb/copy-to-ebx  1/imm32
+    e8/call  syscall_exit/disp32
+    # never gets here
+
 == data
 
 # a test buffered file for _test-stream
diff --git a/400.mu b/400.mu
index 09b84c67..0bcf7e64 100644
--- a/400.mu
+++ b/400.mu
@@ -51,6 +51,7 @@ sig tailor-exit-descriptor ed: (addr exit-descriptor), nbytes: int
 sig stop ed: (addr exit-descriptor), value: int
 #sig read f: fd or (addr stream byte), s: (addr stream byte) -> num-bytes-read/eax: int
 sig read-byte-buffered f: (addr buffered-file) -> byte-or-Eof/eax: byte
+sig read-byte s: (addr stream byte) -> result/eax: byte
 #sig write-stream f: fd or (addr stream byte), s: (addr stream byte)
 #sig error ed: (addr exit-descriptor), out: fd or (addr stream byte), msg: (addr array byte)
 sig write-byte-buffered f: (addr buffered-file), n: int
diff --git a/403unicode.mu b/403unicode.mu
index f10cb5ca..c43bbb57 100644
--- a/403unicode.mu
+++ b/403unicode.mu
@@ -55,6 +55,7 @@ $to-grapheme:body: {
       break $to-grapheme:compute-length
     }
     # more than 4 bytes: unsupported
+    # TODO: print to stderr
     compare c, 0x1fffff
     {
       break-if->
@@ -153,6 +154,148 @@ fn test-to-grapheme-four-bytes-max {
   check-ints-equal out-int, 0xbfbfbff7, "F - to-grapheme/4b"  # 1111-0 111  10 11-1111  10 11-1111  10 11-1111
 }
 
+# Read the next grapheme from a stream of bytes. The first byte read ends up in the lowest-order byte of the result; trailers follow in successively higher bytes.
+fn read-grapheme in: (addr stream byte) -> out/eax: grapheme {
+$read-grapheme:body: {
+  var c/eax: byte <- read-byte in
+  var num-trailers/ecx: int <- copy 0
+  $read-grapheme:compute-length: {
+    # single byte (c < 0xc0): just return it
+    compare c, 0xc0
+    {
+      break-if->=
+      out <- copy c
+      num-trailers <- copy 0
+      break $read-grapheme:body
+    }
+    compare c, 0xfe  # 0xfe and 0xff are also returned as is
+    {
+      break-if-<
+      out <- copy c
+      break $read-grapheme:body
+    }
+    # 2 bytes (c < 0xe0): 1 trailer
+    compare c, 0xe0
+    {
+      break-if->=
+      num-trailers <- copy 1
+      break $read-grapheme:compute-length
+    }
+    # 3 bytes (c < 0xf0): 2 trailers
+    compare c, 0xf0
+    {
+      break-if->=
+      num-trailers <- copy 2
+      break $read-grapheme:compute-length
+    }
+    # 4 bytes (c < 0xf8): 3 trailers
+    compare c, 0xf8
+    {
+      break-if->=
+      num-trailers <- copy 3
+      break $read-grapheme:compute-length
+    }
+$read-grapheme:abort: {
+      # only reached when 0xf8 <= c < 0xfe; report and exit with status 1. TODO: print to stderr
+      print-string-to-real-screen "utf-8 encodings larger than 4 bytes are not supported. First byte seen: "
+      var n/eax: int <- copy c
+      print-int32-hex-to-real-screen n
+      print-string-to-real-screen "\n"
+      var exit-status/ebx: int <- copy 1
+      syscall_exit
+    }
+  }
+  # read each trailer byte and 'or' it into the next higher byte of result
+  var result/edi: int <- copy c
+  var num-byte-shifts/edx: int <- copy 1
+  {
+    compare num-trailers, 0
+    break-if-<=
+    var tmp/eax: byte <- read-byte in
+    var tmp2/eax: int <- copy tmp
+    tmp2 <- shift-left-bytes tmp2, num-byte-shifts
+    result <- or tmp2
+    # update loop state: next trailer goes one byte higher
+    num-byte-shifts <- increment
+    num-trailers <- decrement
+    loop
+  }
+  out <- copy result
+}
+}
+
+fn test-read-grapheme {
+  var s: (stream byte 0x30)
+  var s2/ecx: (addr stream byte) <- address s
+  write s2, "aΒc世d界e"
+  var c/eax: grapheme <- read-grapheme s2
+  var n/eax: int <- copy c
+  check-ints-equal n, 0x61, "F - test grapheme/0"
+  var c/eax: grapheme <- read-grapheme s2
+  var n/eax: int <- copy c
+  check-ints-equal n, 0x92ce, "F - test grapheme/1"  # greek capital letter beta: utf-8 bytes ce 92, first byte lowest
+  var c/eax: grapheme <- read-grapheme s2
+  var n/eax: int <- copy c
+  check-ints-equal n, 0x63, "F - test grapheme/2"
+  var c/eax: grapheme <- read-grapheme s2
+  var n/eax: int <- copy c
+  check-ints-equal n, 0x96b8e4, "F - test grapheme/3"  # 世: utf-8 bytes e4 b8 96, first byte lowest
+  var c/eax: grapheme <- read-grapheme s2
+  var n/eax: int <- copy c
+  check-ints-equal n, 0x64, "F - test grapheme/4"
+  var c/eax: grapheme <- read-grapheme s2
+  var n/eax: int <- copy c
+  check-ints-equal n, 0x8c95e7, "F - test grapheme/5"  # 界: utf-8 bytes e7 95 8c, first byte lowest
+  var c/eax: grapheme <- read-grapheme s2
+  var n/eax: int <- copy c
+  check-ints-equal n, 0x65, "F - test grapheme/6"
+}
+
+# Return n shifted left by k bytes, performing at most 4 shifts. Needed because available primitives only shift by a literal/constant number of bits.
+fn shift-left-bytes n: int, k: int -> result/eax: int {
+  var i/ecx: int <- copy 0
+  result <- copy n
+  {
+    compare i, k
+    break-if->=
+    compare i, 4  # only 4 bytes in 32 bits, so stop after 4 shifts even if k is larger
+    break-if->=
+    result <- shift-left 8
+    i <- increment
+    loop
+  }
+}
+
+fn test-shift-left-bytes-0 {
+  var result/eax: int <- shift-left-bytes 1, 0
+  check-ints-equal result, 1, "F - shift-left-bytes 0"  # k = 0: no shift, n comes back unchanged
+}
+
+fn test-shift-left-bytes-1 {
+  var result/eax: int <- shift-left-bytes 1, 1
+  check-ints-equal result, 0x100, "F - shift-left-bytes 1"  # 1 byte = 8 bits
+}
+
+fn test-shift-left-bytes-2 {
+  var result/eax: int <- shift-left-bytes 1, 2
+  check-ints-equal result, 0x10000, "F - shift-left-bytes 2"  # 2 bytes = 16 bits
+}
+
+fn test-shift-left-bytes-3 {
+  var result/eax: int <- shift-left-bytes 1, 3
+  check-ints-equal result, 0x1000000, "F - shift-left-bytes 3"  # 3 bytes = 24 bits
+}
+
+fn test-shift-left-bytes-4 {
+  var result/eax: int <- shift-left-bytes 1, 4
+  check-ints-equal result, 0, "F - shift-left-bytes 4"  # four 8-bit shifts push every bit out of the 32-bit result
+}
+
+fn test-shift-left-bytes-5 {
+  var result/eax: int <- shift-left-bytes 1, 5
+  check-ints-equal result, 0, "F - shift-left-bytes >4"  # k > 4 is capped at 4 shifts, so same result as k = 4
+}
+
 # To run all tests, uncomment this and run:
 #   $ ./translate_mu  &&  ./a.elf
 #? fn main -> r/ebx: int {
diff --git a/apps/assort b/apps/assort
index 1429467c..42c0c4d5 100755
--- a/apps/assort
+++ b/apps/assort
Binary files differ
diff --git a/apps/braces b/apps/braces
index 0d80ed73..fefabcc8 100755
--- a/apps/braces
+++ b/apps/braces
Binary files differ
diff --git a/apps/calls b/apps/calls
index b0fb2b58..443dc7f3 100755
--- a/apps/calls
+++ b/apps/calls
Binary files differ
diff --git a/apps/crenshaw2-1 b/apps/crenshaw2-1
index ac1d1ddb..f26dedce 100755
--- a/apps/crenshaw2-1
+++ b/apps/crenshaw2-1
Binary files differ
diff --git a/apps/crenshaw2-1b b/apps/crenshaw2-1b
index ff3f940a..139327ca 100755
--- a/apps/crenshaw2-1b
+++ b/apps/crenshaw2-1b
Binary files differ
diff --git a/apps/dquotes b/apps/dquotes
index f5d80786..302c3490 100755
--- a/apps/dquotes
+++ b/apps/dquotes
Binary files differ
diff --git a/apps/factorial b/apps/factorial
index 063a2f79..7e8edb63 100755
--- a/apps/factorial
+++ b/apps/factorial
Binary files differ
diff --git a/apps/hex b/apps/hex
index cf837a45..75edad2d 100755
--- a/apps/hex
+++ b/apps/hex
Binary files differ
diff --git a/apps/mu b/apps/mu
index 0dceb721..f3c88e9c 100755
--- a/apps/mu
+++ b/apps/mu
Binary files differ
diff --git a/apps/pack b/apps/pack
index d831afbb..a6211a7d 100755
--- a/apps/pack
+++ b/apps/pack
Binary files differ
diff --git a/apps/sigils b/apps/sigils
index 7fe34f28..82f4e43a 100755
--- a/apps/sigils
+++ b/apps/sigils
Binary files differ
diff --git a/apps/survey b/apps/survey
index 8989971f..abb0d1f0 100755
--- a/apps/survey
+++ b/apps/survey
Binary files differ
diff --git a/apps/tests b/apps/tests
index 0b826bff..7a4df46d 100755
--- a/apps/tests
+++ b/apps/tests
Binary files differ