diff options
Diffstat (limited to '403unicode.mu')
-rw-r--r-- | 403unicode.mu | 213 |
1 files changed, 5 insertions, 208 deletions
diff --git a/403unicode.mu b/403unicode.mu index 948e6618..6ec30c3d 100644 --- a/403unicode.mu +++ b/403unicode.mu @@ -56,15 +56,11 @@ fn to-grapheme in: code-point -> _/eax: grapheme { break $to-grapheme:compute-length } # more than 4 bytes: unsupported - # TODO: print to stderr + # TODO: print error message to stderr compare c, 0x1fffff { break-if-> - print-string-to-real-screen "unsupported code point " - print-int32-hex-to-real-screen c - print-string-to-real-screen "\n" - var exit-status/ebx: int <- copy 1 - syscall_exit + return 0 } } # emit trailer bytes, 6 bits from 'in', first two bits '10' @@ -90,69 +86,7 @@ fn to-grapheme in: code-point -> _/eax: grapheme { return result } -# single-byte code point have identical graphemes -fn test-to-grapheme-single-byte { - var in-int/ecx: int <- copy 0 - { - compare in-int, 0x7f - break-if-> - var in/eax: code-point <- copy in-int - var out/eax: grapheme <- to-grapheme in - var out-int/eax: int <- copy out - check-ints-equal out-int, in-int, "F - test-to-grapheme-single-byte" - in-int <- increment - loop - } -} - - # byte | byte | byte | byte -# smallest 2-byte utf-8 -fn test-to-grapheme-two-bytes-min { - var in/eax: code-point <- copy 0x80 # 10 00-0000 - var out/eax: grapheme <- to-grapheme in - var out-int/eax: int <- copy out - check-ints-equal out-int, 0x80c2, "F - to-grapheme/2a" # 110 0-0010 10 00-0000 -} - -# largest 2-byte utf-8 -fn test-to-grapheme-two-bytes-max { - var in/eax: code-point <- copy 0x7ff # 1-1111 11-1111 - var out/eax: grapheme <- to-grapheme in - var out-int/eax: int <- copy out - check-ints-equal out-int, 0xbfdf, "F - to-grapheme/2b" # 110 1-1111 10 11-1111 -} - -# smallest 3-byte utf-8 -fn test-to-grapheme-three-bytes-min { - var in/eax: code-point <- copy 0x800 # 10-0000 00-0000 - var out/eax: grapheme <- to-grapheme in - var out-int/eax: int <- copy out - check-ints-equal out-int, 0x80a0e0, "F - to-grapheme/3a" # 1110 0000 10 10-0000 10 00-0000 -} - -# largest 3-byte utf-8 -fn test-to-grapheme-three-bytes-max { - var in/eax: code-point <- copy 0xffff # 1111 11-1111 11-1111 - var out/eax: grapheme <- to-grapheme in - var out-int/eax: int <- copy out - check-ints-equal out-int, 0xbfbfef, "F - to-grapheme/3b" # 1110 1111 10 11-1111 10 11-1111 -} - -# smallest 4-byte utf-8 -fn test-to-grapheme-four-bytes-min { - var in/eax: code-point <- copy 0x10000 # 1-0000 00-0000 00-0000 - var out/eax: grapheme <- to-grapheme in - var out-int/eax: int <- copy out - check-ints-equal out-int, 0x808090f0, "F - to-grapheme/4a" # 1111-0 000 10 01-0000 10 00-0000 10 00-0000 -} - -# largest 4-byte utf-8 -fn test-to-grapheme-four-bytes-max { - var in/eax: code-point <- copy 0x1fffff # 111 11-1111 11-1111 11-1111 - var out/eax: grapheme <- to-grapheme in - var out-int/eax: int <- copy out - check-ints-equal out-int, 0xbfbfbff7, "F - to-grapheme/4b" # 1111-0 111 10 11-1111 10 11-1111 10 11-1111 -} +# TODO: bring in tests once we have check-ints-equal # read the next grapheme from a stream of bytes fn read-grapheme in: (addr stream byte) -> _/eax: grapheme { @@ -200,15 +134,8 @@ fn read-grapheme in: (addr stream byte) -> _/eax: grapheme { num-trailers <- copy 3 break $read-grapheme:compute-length } -$read-grapheme:abort: { - # TODO: print to stderr - print-string-to-real-screen "utf-8 encodings larger than 4 bytes are not supported. First byte seen: " - var n/eax: int <- copy c - print-int32-hex-to-real-screen n - print-string-to-real-screen "\n" - var exit-status/ebx: int <- copy 1 - syscall_exit - } + # TODO: print error message + return 0 } # prepend trailer bytes var result/edi: grapheme <- copy c @@ -228,99 +155,6 @@ $read-grapheme:abort: { return result } -fn test-read-grapheme { - var s: (stream byte 0x30) - var s2/ecx: (addr stream byte) <- address s - write s2, "aΒc世d界e" - var c/eax: grapheme <- read-grapheme s2 - var n/eax: int <- copy c - check-ints-equal n, 0x61, "F - test grapheme/0" - var c/eax: grapheme <- read-grapheme s2 - var n/eax: int <- copy c - check-ints-equal n, 0x92ce/greek-capital-letter-beta, "F - test grapheme/1" - var c/eax: grapheme <- read-grapheme s2 - var n/eax: int <- copy c - check-ints-equal n, 0x63, "F - test grapheme/2" - var c/eax: grapheme <- read-grapheme s2 - var n/eax: int <- copy c - check-ints-equal n, 0x96b8e4, "F - test grapheme/3" - var c/eax: grapheme <- read-grapheme s2 - var n/eax: int <- copy c - check-ints-equal n, 0x64, "F - test grapheme/4" - var c/eax: grapheme <- read-grapheme s2 - var n/eax: int <- copy c - check-ints-equal n, 0x8c95e7, "F - test grapheme/5" - var c/eax: grapheme <- read-grapheme s2 - var n/eax: int <- copy c - check-ints-equal n, 0x65, "F - test grapheme/6" -} - -fn read-grapheme-buffered in: (addr buffered-file) -> _/eax: grapheme { - var c/eax: byte <- read-byte-buffered in - var num-trailers/ecx: int <- copy 0 - $read-grapheme-buffered:compute-length: { - # single byte: just return it - compare c, 0xc0 - { - break-if->= - var g/eax: grapheme <- copy c - return g - } - compare c, 0xfe - { - break-if-< - var g/eax: grapheme <- copy c - return g - } - # 2 bytes - compare c, 0xe0 - { - break-if->= - num-trailers <- copy 1 - break $read-grapheme-buffered:compute-length - } - # 3 bytes - compare c, 0xf0 - { - break-if->= - num-trailers <- copy 2 - break $read-grapheme-buffered:compute-length - } - # 4 bytes - compare c, 0xf8 - { - break-if->= - num-trailers <- copy 3 - break $read-grapheme-buffered:compute-length - } -$read-grapheme-buffered:abort: { - # TODO: print to stderr - print-string-to-real-screen "utf-8 encodings larger than 4 bytes are not supported. First byte seen: " - var n/eax: int <- copy c - print-int32-hex-to-real-screen n - print-string-to-real-screen "\n" - var exit-status/ebx: int <- copy 1 - syscall_exit - } - } - # prepend trailer bytes - var result/edi: grapheme <- copy c - var num-byte-shifts/edx: int <- copy 1 - { - compare num-trailers, 0 - break-if-<= - var tmp/eax: byte <- read-byte-buffered in - var tmp2/eax: int <- copy tmp - tmp2 <- shift-left-bytes tmp2, num-byte-shifts - result <- or tmp2 - # update loop state - num-byte-shifts <- increment - num-trailers <- decrement - loop - } - return result -} - # needed because available primitives only shift by a literal/constant number of bits fn shift-left-bytes n: int, k: int -> _/eax: int { var i/ecx: int <- copy 0 @@ -337,43 +171,6 @@ fn shift-left-bytes n: int, k: int -> _/eax: int { return result } -fn test-shift-left-bytes-0 { - var result/eax: int <- shift-left-bytes 1, 0 - check-ints-equal result, 1, "F - shift-left-bytes 0" -} - -fn test-shift-left-bytes-1 { - var result/eax: int <- shift-left-bytes 1, 1 - check-ints-equal result, 0x100, "F - shift-left-bytes 1" -} - -fn test-shift-left-bytes-2 { - var result/eax: int <- shift-left-bytes 1, 2 - check-ints-equal result, 0x10000, "F - shift-left-bytes 2" -} - -fn test-shift-left-bytes-3 { - var result/eax: int <- shift-left-bytes 1, 3 - check-ints-equal result, 0x1000000, "F - shift-left-bytes 3" -} - -fn test-shift-left-bytes-4 { - var result/eax: int <- shift-left-bytes 1, 4 - check-ints-equal result, 0, "F - shift-left-bytes 4" -} - -fn test-shift-left-bytes-5 { - var result/eax: int <- shift-left-bytes 1, 5 - check-ints-equal result, 0, "F - shift-left-bytes >4" -} - -# To run all tests, uncomment this and run: -# $ ./translate_mu && ./a.elf -#? fn main -> _/ebx: int { -#? run-tests -#? r <- copy 0 -#? } - # write a grapheme to a stream of bytes # this is like write-to-stream, except we skip leading 0 bytes fn write-grapheme out: (addr stream byte), g: grapheme { |