diff options
author | Kartik Agaram <vc@akkartik.com> | 2020-08-28 23:24:04 -0700 |
---|---|---|
committer | Kartik Agaram <vc@akkartik.com> | 2020-08-28 23:24:04 -0700 |
commit | cd94852dbc713ff38f38a30d6e5fb4675606823c (patch) | |
tree | dc2a52048a609937cf0de0d0906336d52419ef65 | |
parent | 392ebcce803423631de77fdc85c837be636078bb (diff) | |
download | mu-cd94852dbc713ff38f38a30d6e5fb4675606823c.tar.gz |
6733 - read utf-8 'grapheme' from byte stream
No support for combining characters: a grapheme is currently just the utf-8 encoding of a single Unicode code-point. No support for utf-8 encodings longer than 4 bytes (32 bits).
-rw-r--r-- | 112read-byte.subx | 46 | ||||
-rw-r--r-- | 400.mu | 1 | ||||
-rw-r--r-- | 403unicode.mu | 143 | ||||
-rwxr-xr-x | apps/assort | bin | 44420 -> 44513 bytes | |||
-rwxr-xr-x | apps/braces | bin | 46283 -> 46376 bytes | |||
-rwxr-xr-x | apps/calls | bin | 50930 -> 51023 bytes | |||
-rwxr-xr-x | apps/crenshaw2-1 | bin | 43761 -> 43854 bytes | |||
-rwxr-xr-x | apps/crenshaw2-1b | bin | 44308 -> 44401 bytes | |||
-rwxr-xr-x | apps/dquotes | bin | 48042 -> 48135 bytes | |||
-rwxr-xr-x | apps/factorial | bin | 42864 -> 42957 bytes | |||
-rwxr-xr-x | apps/hex | bin | 46600 -> 46693 bytes | |||
-rwxr-xr-x | apps/mu | bin | 388322 -> 388415 bytes | |||
-rwxr-xr-x | apps/pack | bin | 56999 -> 57092 bytes | |||
-rwxr-xr-x | apps/sigils | bin | 58652 -> 58745 bytes | |||
-rwxr-xr-x | apps/survey | bin | 54352 -> 54445 bytes | |||
-rwxr-xr-x | apps/tests | bin | 43192 -> 43285 bytes |
16 files changed, 189 insertions, 1 deletions
diff --git a/112read-byte.subx b/112read-byte.subx index 387cbb66..32f89647 100644 --- a/112read-byte.subx +++ b/112read-byte.subx @@ -33,7 +33,7 @@ $Stdin->buffer: # . op subop mod rm32 base index scale r32 # . 1-3 bytes 3 bits 2 bits 3 bits 3 bits 3 bits 2 bits 2 bits 0/1/2/4 bytes 0/1/2/4 bytes -# return next byte value in eax, with top 3 bytes cleared. +# Return next byte value in eax, with top 3 bytes cleared. # On reaching end of file, return 0xffffffff (Eof). read-byte-buffered: # f: (addr buffered-file) -> byte-or-Eof/eax: byte # . prologue @@ -268,6 +268,50 @@ test-read-byte-buffered-refills-buffer: # . end c3/return +# Return next byte value in eax, with top 3 bytes cleared. +# Abort on reaching end of file. +read-byte: # s: (addr stream byte) -> result/eax: byte + # . prologue + 55/push-ebp + 89/copy 3/mod/direct 5/rm32/ebp . . . 4/r32/esp . . # copy esp to ebp + # . save registers + 51/push-ecx + 56/push-esi + # esi = s + 8b/copy 1/mod/*+disp8 5/rm32/ebp . . . 6/r32/esi 8/disp8 . # copy *(ebp+8) to esi + # ecx = s->read + 8b/copy 1/mod/*+disp8 6/rm32/esi . . . 1/r32/ecx 4/disp8 . # copy *(esi+4) to ecx + # if (f->read >= f->write) abort + 3b/compare 0/mod/indirect 6/rm32/esi . . . 1/r32/ecx . . # compare ecx with *esi + 0f 8d/jump-if->= $read-byte:abort/disp32 + # result = f->data[f->read] + 31/xor 3/mod/direct 0/rm32/eax . . . 0/r32/eax . . # clear eax + 8a/copy-byte 1/mod/*+disp8 4/rm32/sib 6/base/esi 1/index/ecx . 0/r32/AL 0xc/disp8 . # copy byte at *(esi+ecx+12) to AL + # ++f->read + ff 0/subop/increment 1/mod/*+disp8 6/rm32/esi . . . . 4/disp8 . # increment *(esi+4) +$read-byte:end: + # . restore registers + 5e/pop-to-esi + 59/pop-to-ecx + # . epilogue + 89/copy 3/mod/direct 4/rm32/esp . . . 5/r32/ebp . . # copy ebp to esp + 5d/pop-to-ebp + c3/return + +$read-byte:abort: + # . _write(2/stderr, error) + # . . push args + 68/push "read-byte: empty stream\n"/imm32 + 68/push 2/imm32/stderr + # . . call + e8/call _write/disp32 + # . . 
discard args + 81 0/subop/add 3/mod/direct 4/rm32/esp . . . . . 8/imm32 # add to esp + # . syscall(exit, 1) + bb/copy-to-ebx 1/imm32 + e8/call syscall_exit/disp32 + # never gets here + == data # a test buffered file for _test-stream diff --git a/400.mu b/400.mu index 09b84c67..0bcf7e64 100644 --- a/400.mu +++ b/400.mu @@ -51,6 +51,7 @@ sig tailor-exit-descriptor ed: (addr exit-descriptor), nbytes: int sig stop ed: (addr exit-descriptor), value: int #sig read f: fd or (addr stream byte), s: (addr stream byte) -> num-bytes-read/eax: int sig read-byte-buffered f: (addr buffered-file) -> byte-or-Eof/eax: byte +sig read-byte s: (addr stream byte) -> result/eax: byte #sig write-stream f: fd or (addr stream byte), s: (addr stream byte) #sig error ed: (addr exit-descriptor), out: fd or (addr stream byte), msg: (addr array byte) sig write-byte-buffered f: (addr buffered-file), n: int diff --git a/403unicode.mu b/403unicode.mu index f10cb5ca..c43bbb57 100644 --- a/403unicode.mu +++ b/403unicode.mu @@ -55,6 +55,7 @@ $to-grapheme:body: { break $to-grapheme:compute-length } # more than 4 bytes: unsupported + # TODO: print to stderr compare c, 0x1fffff { break-if-> @@ -153,6 +154,148 @@ fn test-to-grapheme-four-bytes-max { check-ints-equal out-int, 0xbfbfbff7, "F - to-grapheme/4b" # 1111-0 111 10 11-1111 10 11-1111 10 11-1111 } +# read the next grapheme from a stream of bytes +fn read-grapheme in: (addr stream byte) -> out/eax: grapheme { +$read-grapheme:body: { + var c/eax: byte <- read-byte in + var num-trailers/ecx: int <- copy 0 + $read-grapheme:compute-length: { + # single byte: just return it + compare c, 0xc0 + { + break-if->= + out <- copy c + num-trailers <- copy 0 + break $read-grapheme:body + } + compare c, 0xfe + { + break-if-< + out <- copy c + break $read-grapheme:body + } + # 2 bytes + compare c, 0xe0 + { + break-if->= + num-trailers <- copy 1 + break $read-grapheme:compute-length + } + # 3 bytes + compare c, 0xf0 + { + break-if->= + num-trailers <- copy 2 + break 
$read-grapheme:compute-length + } + # 4 bytes + compare c, 0xf8 + { + break-if->= + num-trailers <- copy 3 + break $read-grapheme:compute-length + } +$read-grapheme:abort: { + # TODO: print to stderr + print-string-to-real-screen "utf-8 encodings larger than 4 bytes are not supported. First byte seen: " + var n/eax: int <- copy c + print-int32-hex-to-real-screen n + print-string-to-real-screen "\n" + var exit-status/ebx: int <- copy 1 + syscall_exit + } + } + # prepend trailer bytes + var result/edi: int <- copy c + var num-byte-shifts/edx: int <- copy 1 + { + compare num-trailers, 0 + break-if-<= + var tmp/eax: byte <- read-byte in + var tmp2/eax: int <- copy tmp + tmp2 <- shift-left-bytes tmp2, num-byte-shifts + result <- or tmp2 + # update loop state + num-byte-shifts <- increment + num-trailers <- decrement + loop + } + out <- copy result +} +} + +fn test-read-grapheme { + var s: (stream byte 0x30) + var s2/ecx: (addr stream byte) <- address s + write s2, "aΒc世d界e" + var c/eax: grapheme <- read-grapheme s2 + var n/eax: int <- copy c + check-ints-equal n, 0x61, "F - test grapheme/0" + var c/eax: grapheme <- read-grapheme s2 + var n/eax: int <- copy c + check-ints-equal n, 0x92ce, "F - test grapheme/1" # greek capital letter beta + var c/eax: grapheme <- read-grapheme s2 + var n/eax: int <- copy c + check-ints-equal n, 0x63, "F - test grapheme/2" + var c/eax: grapheme <- read-grapheme s2 + var n/eax: int <- copy c + check-ints-equal n, 0x96b8e4, "F - test grapheme/3" + var c/eax: grapheme <- read-grapheme s2 + var n/eax: int <- copy c + check-ints-equal n, 0x64, "F - test grapheme/4" + var c/eax: grapheme <- read-grapheme s2 + var n/eax: int <- copy c + check-ints-equal n, 0x8c95e7, "F - test grapheme/5" + var c/eax: grapheme <- read-grapheme s2 + var n/eax: int <- copy c + check-ints-equal n, 0x65, "F - test grapheme/6" +} + +# needed because available primitives only shift by a literal/constant number of bits +fn shift-left-bytes n: int, k: int -> result/eax: 
int { + var i/ecx: int <- copy 0 + result <- copy n + { + compare i, k + break-if->= + compare i, 4 # only 4 bytes in 32 bits + break-if->= + result <- shift-left 8 + i <- increment + loop + } +} + +fn test-shift-left-bytes-0 { + var result/eax: int <- shift-left-bytes 1, 0 + check-ints-equal result, 1, "F - shift-left-bytes 0" +} + +fn test-shift-left-bytes-1 { + var result/eax: int <- shift-left-bytes 1, 1 + check-ints-equal result, 0x100, "F - shift-left-bytes 1" +} + +fn test-shift-left-bytes-2 { + var result/eax: int <- shift-left-bytes 1, 2 + check-ints-equal result, 0x10000, "F - shift-left-bytes 2" +} + +fn test-shift-left-bytes-3 { + var result/eax: int <- shift-left-bytes 1, 3 + check-ints-equal result, 0x1000000, "F - shift-left-bytes 3" +} + +fn test-shift-left-bytes-4 { + var result/eax: int <- shift-left-bytes 1, 4 + check-ints-equal result, 0, "F - shift-left-bytes 4" +} + +fn test-shift-left-bytes-5 { + var result/eax: int <- shift-left-bytes 1, 5 + check-ints-equal result, 0, "F - shift-left-bytes >4" +} + # To run all tests, uncomment this and run: # $ ./translate_mu && ./a.elf #? 
fn main -> r/ebx: int { diff --git a/apps/assort b/apps/assort index 1429467c..42c0c4d5 100755 --- a/apps/assort +++ b/apps/assort Binary files differdiff --git a/apps/braces b/apps/braces index 0d80ed73..fefabcc8 100755 --- a/apps/braces +++ b/apps/braces Binary files differdiff --git a/apps/calls b/apps/calls index b0fb2b58..443dc7f3 100755 --- a/apps/calls +++ b/apps/calls Binary files differdiff --git a/apps/crenshaw2-1 b/apps/crenshaw2-1 index ac1d1ddb..f26dedce 100755 --- a/apps/crenshaw2-1 +++ b/apps/crenshaw2-1 Binary files differdiff --git a/apps/crenshaw2-1b b/apps/crenshaw2-1b index ff3f940a..139327ca 100755 --- a/apps/crenshaw2-1b +++ b/apps/crenshaw2-1b Binary files differdiff --git a/apps/dquotes b/apps/dquotes index f5d80786..302c3490 100755 --- a/apps/dquotes +++ b/apps/dquotes Binary files differdiff --git a/apps/factorial b/apps/factorial index 063a2f79..7e8edb63 100755 --- a/apps/factorial +++ b/apps/factorial Binary files differdiff --git a/apps/hex b/apps/hex index cf837a45..75edad2d 100755 --- a/apps/hex +++ b/apps/hex Binary files differdiff --git a/apps/mu b/apps/mu index 0dceb721..f3c88e9c 100755 --- a/apps/mu +++ b/apps/mu Binary files differdiff --git a/apps/pack b/apps/pack index d831afbb..a6211a7d 100755 --- a/apps/pack +++ b/apps/pack Binary files differdiff --git a/apps/sigils b/apps/sigils index 7fe34f28..82f4e43a 100755 --- a/apps/sigils +++ b/apps/sigils Binary files differdiff --git a/apps/survey b/apps/survey index 8989971f..abb0d1f0 100755 --- a/apps/survey +++ b/apps/survey Binary files differdiff --git a/apps/tests b/apps/tests index 0b826bff..7a4df46d 100755 --- a/apps/tests +++ b/apps/tests Binary files differ |