From d253a3182859c7c989449122a60d5f362f19ded0 Mon Sep 17 00:00:00 2001 From: "Kartik K. Agaram" Date: Tue, 9 Nov 2021 08:12:11 -0800 Subject: rename grapheme to code-point-utf8 Longer name, but it doesn't lie. We have no data structure right now for combining multiple code points. And it makes no sense for the notion of a grapheme to conflate its Unicode encoding. --- 403unicode.mu | 141 ++++++++++++++++++++++++++++------------------------------ 1 file changed, 68 insertions(+), 73 deletions(-) (limited to '403unicode.mu') diff --git a/403unicode.mu b/403unicode.mu index e1bfba3f..5d465d71 100644 --- a/403unicode.mu +++ b/403unicode.mu @@ -1,16 +1,11 @@ # Helpers for Unicode. # -# Mu has no characters, only code points and graphemes. -# Code points are the indivisible atoms of text streams. +# The basic unit for rendering Unicode is the code point. # https://en.wikipedia.org/wiki/Code_point -# Graphemes are the smallest self-contained unit of text. -# Graphemes may consist of multiple code points. +# The glyph a non-cursive font displays may represent multiple code points. # -# Mu graphemes are always represented in utf-8, and they are required to fit -# in 4 bytes. (This can be confusing if you focus just on ASCII, where Mu's -# graphemes and code-points are identical.) -# -# Mu doesn't yet support graphemes consisting of multiple code points. +# In addition to raw code points (just integers assigned special meaning), Mu +# provides a common encoding as a convenience: code-point-utf8. 
fn test-unicode-serialization-and-deserialization { var i/ebx: int <- copy 0 @@ -20,8 +15,8 @@ fn test-unicode-serialization-and-deserialization { # but not emoji break-if->= var c/eax: code-point <- copy i - var _g/eax: grapheme <- to-grapheme c - var g/ecx: grapheme <- copy _g + var _g/eax: code-point-utf8 <- to-utf8 c + var g/ecx: code-point-utf8 <- copy _g var c2/eax: code-point <- to-code-point g compare i, c2 { @@ -51,7 +46,7 @@ fn test-unicode-serialization-and-deserialization { } # transliterated from tb_utf8_char_to_unicode in https://github.com/nsf/termbox -fn to-code-point in: grapheme -> _/eax: code-point { +fn to-code-point in: code-point-utf8 -> _/eax: code-point { var g/ebx: int <- copy in # if single byte, just return it { @@ -61,7 +56,7 @@ fn to-code-point in: grapheme -> _/eax: code-point { return result } # - var len/edx: int <- grapheme-length in + var len/edx: int <- utf8-length in # extract bits from first byte var b/eax: byte <- copy-byte g var result/edi: code-point <- copy b @@ -99,16 +94,16 @@ fn to-code-point in: grapheme -> _/eax: code-point { # transliterated from tb_utf8_unicode_to_char in https://github.com/nsf/termbox # https://wiki.tcl-lang.org/page/UTF%2D8+bit+by+bit explains the algorithm -fn to-grapheme in: code-point -> _/eax: grapheme { +fn to-utf8 in: code-point -> _/eax: code-point-utf8 { var c/eax: int <- copy in var num-trailers/ecx: int <- copy 0 var first/edx: int <- copy 0 - $to-grapheme:compute-length: { + $to-utf8:compute-length: { # single byte: just return it compare c, 0x7f { break-if-> - var g/eax: grapheme <- copy c + var g/eax: code-point-utf8 <- copy c return g } # 2 bytes @@ -117,7 +112,7 @@ fn to-grapheme in: code-point -> _/eax: grapheme { break-if-> num-trailers <- copy 1 first <- copy 0xc0 - break $to-grapheme:compute-length + break $to-utf8:compute-length } # 3 bytes compare c, 0xffff @@ -125,7 +120,7 @@ fn to-grapheme in: code-point -> _/eax: grapheme { break-if-> num-trailers <- copy 2 first <- copy 0xe0 
- break $to-grapheme:compute-length + break $to-utf8:compute-length } # 4 bytes compare c, 0x1fffff @@ -133,7 +128,7 @@ fn to-grapheme in: code-point -> _/eax: grapheme { break-if-> num-trailers <- copy 3 first <- copy 0xf0 - break $to-grapheme:compute-length + break $to-utf8:compute-length } # more than 4 bytes: unsupported compare c, 0x1fffff @@ -144,7 +139,7 @@ fn to-grapheme in: code-point -> _/eax: grapheme { } } # emit trailer bytes, 6 bits from 'in', first two bits '10' - var result/edi: grapheme <- copy 0 + var result/edi: code-point-utf8 <- copy 0 { compare num-trailers, 0 break-if-<= @@ -166,16 +161,16 @@ fn to-grapheme in: code-point -> _/eax: grapheme { return result } -# single-byte code point have identical graphemes -fn test-to-grapheme-single-byte { +# single-byte code points have identical code-point-utf8s +fn test-to-utf8-single-byte { var in-int/ecx: int <- copy 0 { compare in-int, 0x7f break-if-> var in/eax: code-point <- copy in-int - var out/eax: grapheme <- to-grapheme in + var out/eax: code-point-utf8 <- to-utf8 in var out-int/eax: int <- copy out - check-ints-equal out-int, in-int, "F - test-to-grapheme-single-byte" + check-ints-equal out-int, in-int, "F - test-to-utf8-single-byte" in-int <- increment loop } @@ -183,55 +178,55 @@ fn test-to-grapheme-single-byte { # byte | byte | byte | byte # smallest 2-byte utf-8 -fn test-to-grapheme-two-bytes-min { +fn test-to-utf8-two-bytes-min { var in/eax: code-point <- copy 0x80 # 10 00-0000 - var out/eax: grapheme <- to-grapheme in + var out/eax: code-point-utf8 <- to-utf8 in var out-int/eax: int <- copy out - check-ints-equal out-int, 0x80c2, "F - to-grapheme/2a" # 110 0-0010 10 00-0000 + check-ints-equal out-int, 0x80c2, "F - to-utf8/2a" # 110 0-0010 10 00-0000 } # largest 2-byte utf-8 -fn test-to-grapheme-two-bytes-max { +fn test-to-utf8-two-bytes-max { var in/eax: code-point <- copy 0x7ff # 1-1111 11-1111 - var out/eax: grapheme <- to-grapheme in + var out/eax: code-point-utf8 <- to-utf8 in var
out-int/eax: int <- copy out - check-ints-equal out-int, 0xbfdf, "F - to-grapheme/2b" # 110 1-1111 10 11-1111 + check-ints-equal out-int, 0xbfdf, "F - to-utf8/2b" # 110 1-1111 10 11-1111 } # smallest 3-byte utf-8 -fn test-to-grapheme-three-bytes-min { +fn test-to-utf8-three-bytes-min { var in/eax: code-point <- copy 0x800 # 10-0000 00-0000 - var out/eax: grapheme <- to-grapheme in + var out/eax: code-point-utf8 <- to-utf8 in var out-int/eax: int <- copy out - check-ints-equal out-int, 0x80a0e0, "F - to-grapheme/3a" # 1110 0000 10 10-0000 10 00-0000 + check-ints-equal out-int, 0x80a0e0, "F - to-utf8/3a" # 1110 0000 10 10-0000 10 00-0000 } # largest 3-byte utf-8 -fn test-to-grapheme-three-bytes-max { +fn test-to-utf8-three-bytes-max { var in/eax: code-point <- copy 0xffff # 1111 11-1111 11-1111 - var out/eax: grapheme <- to-grapheme in + var out/eax: code-point-utf8 <- to-utf8 in var out-int/eax: int <- copy out - check-ints-equal out-int, 0xbfbfef, "F - to-grapheme/3b" # 1110 1111 10 11-1111 10 11-1111 + check-ints-equal out-int, 0xbfbfef, "F - to-utf8/3b" # 1110 1111 10 11-1111 10 11-1111 } # smallest 4-byte utf-8 -fn test-to-grapheme-four-bytes-min { +fn test-to-utf8-four-bytes-min { var in/eax: code-point <- copy 0x10000 # 1-0000 00-0000 00-0000 - var out/eax: grapheme <- to-grapheme in + var out/eax: code-point-utf8 <- to-utf8 in var out-int/eax: int <- copy out - check-ints-equal out-int, 0x808090f0, "F - to-grapheme/4a" # 1111-0 000 10 01-0000 10 00-0000 10 00-0000 + check-ints-equal out-int, 0x808090f0, "F - to-utf8/4a" # 1111-0 000 10 01-0000 10 00-0000 10 00-0000 } # largest 4-byte utf-8 -fn test-to-grapheme-four-bytes-max { +fn test-to-utf8-four-bytes-max { var in/eax: code-point <- copy 0x1fffff # 111 11-1111 11-1111 11-1111 - var out/eax: grapheme <- to-grapheme in + var out/eax: code-point-utf8 <- to-utf8 in var out-int/eax: int <- copy out - check-ints-equal out-int, 0xbfbfbff7, "F - to-grapheme/4b" # 1111-0 111 10 11-1111 10 11-1111 10 11-1111 + 
check-ints-equal out-int, 0xbfbfbff7, "F - to-utf8/4b" # 1111-0 111 10 11-1111 10 11-1111 10 11-1111 } -# read the next grapheme from a stream of bytes -fn read-grapheme in: (addr stream byte) -> _/eax: grapheme { +# read the next code-point-utf8 from a stream of bytes +fn read-code-point-utf8 in: (addr stream byte) -> _/eax: code-point-utf8 { # if at eof, return EOF { var eof?/eax: boolean <- stream-empty? in @@ -241,18 +236,18 @@ fn read-grapheme in: (addr stream byte) -> _/eax: grapheme { } var c/eax: byte <- read-byte in var num-trailers/ecx: int <- copy 0 - $read-grapheme:compute-length: { + $read-code-point-utf8:compute-length: { # single byte: just return it compare c, 0xc0 { break-if->= - var g/eax: grapheme <- copy c + var g/eax: code-point-utf8 <- copy c return g } compare c, 0xfe { break-if-< - var g/eax: grapheme <- copy c + var g/eax: code-point-utf8 <- copy c return g } # 2 bytes @@ -260,27 +255,27 @@ fn read-grapheme in: (addr stream byte) -> _/eax: grapheme { { break-if->= num-trailers <- copy 1 - break $read-grapheme:compute-length + break $read-code-point-utf8:compute-length } # 3 bytes compare c, 0xf0 { break-if->= num-trailers <- copy 2 - break $read-grapheme:compute-length + break $read-code-point-utf8:compute-length } # 4 bytes compare c, 0xf8 { break-if->= num-trailers <- copy 3 - break $read-grapheme:compute-length + break $read-code-point-utf8:compute-length } abort "utf-8 encodings larger than 4 bytes are not yet supported" return 0 } # prepend trailer bytes - var result/edi: grapheme <- copy c + var result/edi: code-point-utf8 <- copy c var num-byte-shifts/edx: int <- copy 1 { compare num-trailers, 0 @@ -297,34 +292,34 @@ fn read-grapheme in: (addr stream byte) -> _/eax: grapheme { return result } -fn test-read-grapheme { +fn test-read-code-point-utf8 { var s: (stream byte 0x30) var s2/ecx: (addr stream byte) <- address s write s2, "aΒc世d界e" - var c/eax: grapheme <- read-grapheme s2 + var c/eax: code-point-utf8 <- read-code-point-utf8 s2 
var n/eax: int <- copy c - check-ints-equal n, 0x61, "F - test grapheme/0" - var c/eax: grapheme <- read-grapheme s2 + check-ints-equal n, 0x61, "F - test code-point-utf8/0" + var c/eax: code-point-utf8 <- read-code-point-utf8 s2 var n/eax: int <- copy c - check-ints-equal n, 0x92ce/greek-capital-letter-beta, "F - test grapheme/1" - var c/eax: grapheme <- read-grapheme s2 + check-ints-equal n, 0x92ce/greek-capital-letter-beta, "F - test code-point-utf8/1" + var c/eax: code-point-utf8 <- read-code-point-utf8 s2 var n/eax: int <- copy c - check-ints-equal n, 0x63, "F - test grapheme/2" - var c/eax: grapheme <- read-grapheme s2 + check-ints-equal n, 0x63, "F - test code-point-utf8/2" + var c/eax: code-point-utf8 <- read-code-point-utf8 s2 var n/eax: int <- copy c - check-ints-equal n, 0x96b8e4, "F - test grapheme/3" - var c/eax: grapheme <- read-grapheme s2 + check-ints-equal n, 0x96b8e4, "F - test code-point-utf8/3" + var c/eax: code-point-utf8 <- read-code-point-utf8 s2 var n/eax: int <- copy c - check-ints-equal n, 0x64, "F - test grapheme/4" - var c/eax: grapheme <- read-grapheme s2 + check-ints-equal n, 0x64, "F - test code-point-utf8/4" + var c/eax: code-point-utf8 <- read-code-point-utf8 s2 var n/eax: int <- copy c - check-ints-equal n, 0x8c95e7, "F - test grapheme/5" - var c/eax: grapheme <- read-grapheme s2 + check-ints-equal n, 0x8c95e7, "F - test code-point-utf8/5" + var c/eax: code-point-utf8 <- read-code-point-utf8 s2 var n/eax: int <- copy c - check-ints-equal n, 0x65, "F - test grapheme/6" + check-ints-equal n, 0x65, "F - test code-point-utf8/6" } -fn grapheme-length g: grapheme -> _/edx: int { +fn utf8-length g: code-point-utf8 -> _/edx: int { { compare g, 0xff break-if-> @@ -389,23 +384,23 @@ fn test-shift-left-bytes-5 { check-ints-equal result, 0, "F - shift-left-bytes >4" } -# write a grapheme to a stream of bytes +# write a code-point-utf8 to a stream of bytes # this is like write-to-stream, except we skip leading 0 bytes -fn write-grapheme out: 
(addr stream byte), g: grapheme { -$write-grapheme:body: { +fn write-code-point-utf8 out: (addr stream byte), g: code-point-utf8 { +$write-code-point-utf8:body: { var c/eax: int <- copy g append-byte out, c # first byte is always written c <- shift-right 8 compare c, 0 - break-if-= $write-grapheme:body + break-if-= $write-code-point-utf8:body append-byte out, c c <- shift-right 8 compare c, 0 - break-if-= $write-grapheme:body + break-if-= $write-code-point-utf8:body append-byte out, c c <- shift-right 8 compare c, 0 - break-if-= $write-grapheme:body + break-if-= $write-code-point-utf8:body append-byte out, c } } -- cgit 1.4.1-2-gfad0