From 61ec86b71928afc4a7b0a9c787ba88d5b4b3040b Mon Sep 17 00:00:00 2001 From: Kartik Agaram Date: Mon, 30 Aug 2021 09:21:52 -0700 Subject: . --- html/403unicode.mu.html | 517 +++++++++++++++++++++++++++++++----------------- 1 file changed, 341 insertions(+), 176 deletions(-) (limited to 'html/403unicode.mu.html') diff --git a/html/403unicode.mu.html b/html/403unicode.mu.html index 6cee5b30..e8a93129 100644 --- a/html/403unicode.mu.html +++ b/html/403unicode.mu.html @@ -16,14 +16,16 @@ a { color:inherit; } * { font-size:12pt; font-size: 1em; } .PreProc { color: #c000c0; } .muRegEdx { color: #af5f00; } -.Special { color: #ff6060; } .LineNr { } +.muRegEdi { color: #00af00; } .muRegEsi { color: #005faf; } +.muRegEbx { color: #5f00ff; } .Constant { color: #008787; } -.muRegEdi { color: #00af00; } +.muFunction { color: #af5f00; text-decoration: underline; } .muRegEcx { color: #870000; } .Delimiter { color: #c000c0; } -.muFunction { color: #af5f00; text-decoration: underline; } +.Special { color: #ff6060; } +.muTest { color: #5f8700; } .muComment { color: #005faf; } --> @@ -69,190 +71,353 @@ if ('onhashchange' in window) { 7 # Graphemes may consist of multiple code points. 8 # 9 # Mu graphemes are always represented in utf-8, and they are required to fit - 10 # in 4 bytes. - 11 # - 12 # Mu doesn't currently support combining code points, or graphemes made of - 13 # multiple code points. One day we will. - 14 # We also don't currently support code points that translate into multiple - 15 # or wide graphemes. (In particular, Tab will never be supported.) + 10 # in 4 bytes. (This can be confusing if you focus just on ASCII, where Mu's + 11 # graphemes and code-points are identical.) + 12 # + 13 # Mu doesn't currently support combining code points, or graphemes made of + 14 # multiple code points. One day we will. + 15 # https://en.wikipedia.org/wiki/Combining_character 16 - 17 # transliterated from tb_utf8_unicode_to_char in https://github.com/nsf/termbox - 18 # https://wiki.tcl-lang.org/page/UTF%2D8+bit+by+bit explains the algorithm - 19 # - 20 # The day we want to support combining characters, this function will need to - 21 # take multiple code points. Or something. - 22 fn to-grapheme in: code-point -> _/eax: grapheme { - 23 var c/eax: int <- copy in - 24 var num-trailers/ecx: int <- copy 0 - 25 var first/edx: int <- copy 0 - 26 $to-grapheme:compute-length: { - 27 # single byte: just return it - 28 compare c, 0x7f + 17 fn test-unicode-serialization-and-deserialization { + 18 var i/ebx: int <- copy 0 + 19 var init?/esi: boolean <- copy 1/true + 20 { + 21 compare i, 0x10000 # 32 bits of utf-8 are sufficient for https://en.wikipedia.org/wiki/Plane_(Unicode)#Basic_Multilingual_Plane + 22 # but not emoji + 23 break-if->= + 24 var c/eax: code-point <- copy i + 25 var _g/eax: grapheme <- to-grapheme c + 26 var g/ecx: grapheme <- copy _g + 27 var c2/eax: code-point <- to-code-point g + 28 compare i, c2 29 { - 30 break-if-> - 31 var g/eax: grapheme <- copy c - 32 return g - 33 } - 34 # 2 bytes - 35 compare c, 0x7ff - 36 { - 37 break-if-> - 38 num-trailers <- copy 1 - 39 first <- copy 0xc0 - 40 break $to-grapheme:compute-length - 41 } - 42 # 3 bytes - 43 compare c, 0xffff - 44 { - 45 break-if-> - 46 num-trailers <- copy 2 - 47 first <- copy 0xe0 - 48 break $to-grapheme:compute-length + 30 break-if-= + 31 { + 32 compare init?, 0/false + 33 break-if-= + 34 draw-text-wrapping-right-then-down-from-cursor-over-full-screen 0/screen, "F - test-unicode-serialization-and-deserialization: ", 3/fg 0/bg + 35 } + 36 init? <- copy 0/false + 37 draw-int32-hex-wrapping-right-then-down-from-cursor-over-full-screen 0/screen, i, 3/fg 0/bg + 38 draw-text-wrapping-right-then-down-from-cursor-over-full-screen 0/screen, "/", 3/fg 0/bg + 39 { + 40 var x/eax: int <- copy g + 41 draw-int32-hex-wrapping-right-then-down-from-cursor-over-full-screen 0/screen, x, 3/fg 0/bg + 42 } + 43 draw-text-wrapping-right-then-down-from-cursor-over-full-screen 0/screen, "/", 3/fg 0/bg + 44 { + 45 var x2/eax: int <- copy c2 + 46 draw-int32-hex-wrapping-right-then-down-from-cursor-over-full-screen 0/screen, x2, 3/fg 0/bg + 47 } + 48 draw-text-wrapping-right-then-down-from-cursor-over-full-screen 0/screen, " ", 3/fg 0/bg 49 } - 50 # 4 bytes - 51 compare c, 0x1fffff - 52 { - 53 break-if-> - 54 num-trailers <- copy 3 - 55 first <- copy 0xf0 - 56 break $to-grapheme:compute-length - 57 } - 58 # more than 4 bytes: unsupported - 59 # TODO: print error message to stderr - 60 compare c, 0x1fffff - 61 { - 62 break-if-> - 63 return 0 - 64 } - 65 } - 66 # emit trailer bytes, 6 bits from 'in', first two bits '10' - 67 var result/edi: grapheme <- copy 0 - 68 { - 69 compare num-trailers, 0 - 70 break-if-<= - 71 var tmp/esi: int <- copy c - 72 tmp <- and 0x3f - 73 tmp <- or 0x80 - 74 result <- shift-left 8 - 75 result <- or tmp - 76 # update loop state - 77 c <- shift-right 6 - 78 num-trailers <- decrement - 79 loop - 80 } - 81 # emit engine - 82 result <- shift-left 8 - 83 result <- or c - 84 result <- or first - 85 # - 86 return result - 87 } - 88 - 89 # TODO: bring in tests once we have check-ints-equal - 90 - 91 # read the next grapheme from a stream of bytes - 92 fn read-grapheme in: (addr stream byte) -> _/eax: grapheme { - 93 # if at eof, return EOF - 94 { - 95 var eof?/eax: boolean <- stream-empty? in - 96 compare eof?, 0/false - 97 break-if-= - 98 return 0xffffffff - 99 } -100 var c/eax: byte <- read-byte in -101 var num-trailers/ecx: int <- copy 0 -102 $read-grapheme:compute-length: { -103 # single byte: just return it -104 compare c, 0xc0 -105 { -106 break-if->= -107 var g/eax: grapheme <- copy c -108 return g -109 } -110 compare c, 0xfe + 50 i <- add 0xf # to speed things up; ensure increment is not a power of 2 + 51 loop + 52 } + 53 } + 54 + 55 # transliterated from tb_utf8_char_to_unicode in https://github.com/nsf/termbox + 56 fn to-code-point in: grapheme -> _/eax: code-point { + 57 var g/ebx: int <- copy in + 58 # if single byte, just return it + 59 { + 60 compare g, 0xff + 61 break-if-> + 62 var result/eax: code-point <- copy g + 63 return result + 64 } + 65 # + 66 var len/edx: int <- grapheme-length in + 67 # extract bits from first byte + 68 var b/eax: byte <- copy-byte g + 69 var result/edi: code-point <- copy b + 70 { + 71 compare len, 2 + 72 break-if-!= + 73 result <- and 0x1f + 74 } + 75 { + 76 compare len, 3 + 77 break-if-!= + 78 result <- and 0x0f + 79 } + 80 { + 81 compare len, 4 + 82 break-if-!= + 83 result <- and 0x07 + 84 } + 85 # extract bits from remaining bytes + 86 g <- shift-right 8 + 87 var i/ecx: int <- copy 1 + 88 { + 89 compare i, len + 90 break-if->= + 91 var b/eax: byte <- copy-byte g + 92 b <- and 0x3f + 93 result <- shift-left 6 + 94 result <- or b + 95 g <- shift-right 8 + 96 i <- increment + 97 loop + 98 } + 99 return result +100 } +101 +102 # transliterated from tb_utf8_unicode_to_char in https://github.com/nsf/termbox +103 # https://wiki.tcl-lang.org/page/UTF%2D8+bit+by+bit explains the algorithm +104 fn to-grapheme in: code-point -> _/eax: grapheme { +105 var c/eax: int <- copy in +106 var num-trailers/ecx: int <- copy 0 +107 var first/edx: int <- copy 0 +108 $to-grapheme:compute-length: { +109 # single byte: just return it +110 compare c, 0x7f 111 { -112 break-if-< +112 break-if-> 113 var g/eax: grapheme <- copy c 114 return g 115 } 116 # 2 bytes -117 compare c, 0xe0 +117 compare c, 0x7ff 118 { -119 break-if->= +119 break-if-> 120 num-trailers <- copy 1 -121 break $read-grapheme:compute-length -122 } -123 # 3 bytes -124 compare c, 0xf0 -125 { -126 break-if->= -127 num-trailers <- copy 2 -128 break $read-grapheme:compute-length -129 } -130 # 4 bytes -131 compare c, 0xf8 -132 { -133 break-if->= -134 num-trailers <- copy 3 -135 break $read-grapheme:compute-length -136 } -137 # TODO: print error message -138 return 0 -139 } -140 # prepend trailer bytes -141 var result/edi: grapheme <- copy c -142 var num-byte-shifts/edx: int <- copy 1 -143 { -144 compare num-trailers, 0 -145 break-if-<= -146 var tmp/eax: byte <- read-byte in -147 var tmp2/eax: int <- copy tmp -148 tmp2 <- shift-left-bytes tmp2, num-byte-shifts -149 result <- or tmp2 -150 # update loop state -151 num-byte-shifts <- increment -152 num-trailers <- decrement -153 loop -154 } -155 return result -156 } -157 -158 # needed because available primitives only shift by a literal/constant number of bits -159 fn shift-left-bytes n: int, k: int -> _/eax: int { -160 var i/ecx: int <- copy 0 -161 var result/eax: int <- copy n -162 { -163 compare i, k -164 break-if->= -165 compare i, 4 # only 4 bytes in 32 bits -166 break-if->= -167 result <- shift-left 8 -168 i <- increment -169 loop -170 } -171 return result -172 } -173 -174 # write a grapheme to a stream of bytes -175 # this is like write-to-stream, except we skip leading 0 bytes -176 fn write-grapheme out: (addr stream byte), g: grapheme { -177 $write-grapheme:body: { -178 var c/eax: int <- copy g -179 append-byte out, c # first byte is always written -180 c <- shift-right 8 -181 compare c, 0 -182 break-if-= $write-grapheme:body -183 append-byte out, c -184 c <- shift-right 8 -185 compare c, 0 -186 break-if-= $write-grapheme:body -187 append-byte out, c -188 c <- shift-right 8 -189 compare c, 0 -190 break-if-= $write-grapheme:body -191 append-byte out, c -192 } +121 first <- copy 0xc0 +122 break $to-grapheme:compute-length +123 } +124 # 3 bytes +125 compare c, 0xffff +126 { +127 break-if-> +128 num-trailers <- copy 2 +129 first <- copy 0xe0 +130 break $to-grapheme:compute-length +131 } +132 # 4 bytes +133 compare c, 0x1fffff +134 { +135 break-if-> +136 num-trailers <- copy 3 +137 first <- copy 0xf0 +138 break $to-grapheme:compute-length +139 } +140 # more than 4 bytes: unsupported +141 # TODO: print error message to stderr +142 compare c, 0x1fffff +143 { +144 break-if-> +145 return 0 +146 } +147 } +148 # emit trailer bytes, 6 bits from 'in', first two bits '10' +149 var result/edi: grapheme <- copy 0 +150 { +151 compare num-trailers, 0 +152 break-if-<= +153 var tmp/esi: int <- copy c +154 tmp <- and 0x3f +155 tmp <- or 0x80 +156 result <- shift-left 8 +157 result <- or tmp +158 # update loop state +159 c <- shift-right 6 +160 num-trailers <- decrement +161 loop +162 } +163 # emit engine +164 result <- shift-left 8 +165 result <- or c +166 result <- or first +167 # +168 return result +169 } +170 +171 # single-byte code point have identical graphemes +172 fn test-to-grapheme-single-byte { +173 var in-int/ecx: int <- copy 0 +174 { +175 compare in-int, 0x7f +176 break-if-> +177 var in/eax: code-point <- copy in-int +178 var out/eax: grapheme <- to-grapheme in +179 var out-int/eax: int <- copy out +180 check-ints-equal out-int, in-int, "F - test-to-grapheme-single-byte" +181 in-int <- increment +182 loop +183 } +184 } +185 +186 # byte | byte | byte | byte +187 # smallest 2-byte utf-8 +188 fn test-to-grapheme-two-bytes-min { +189 var in/eax: code-point <- copy 0x80 # 10 00-0000 +190 var out/eax: grapheme <- to-grapheme in +191 var out-int/eax: int <- copy out +192 check-ints-equal out-int, 0x80c2, "F - to-grapheme/2a" # 110 0-0010 10 00-0000 193 } +194 +195 # largest 2-byte utf-8 +196 fn test-to-grapheme-two-bytes-max { +197 var in/eax: code-point <- copy 0x7ff # 1-1111 11-1111 +198 var out/eax: grapheme <- to-grapheme in +199 var out-int/eax: int <- copy out +200 check-ints-equal out-int, 0xbfdf, "F - to-grapheme/2b" # 110 1-1111 10 11-1111 +201 } +202 +203 # smallest 3-byte utf-8 +204 fn test-to-grapheme-three-bytes-min { +205 var in/eax: code-point <- copy 0x800 # 10-0000 00-0000 +206 var out/eax: grapheme <- to-grapheme in +207 var out-int/eax: int <- copy out +208 check-ints-equal out-int, 0x80a0e0, "F - to-grapheme/3a" # 1110 0000 10 10-0000 10 00-0000 +209 } +210 +211 # largest 3-byte utf-8 +212 fn test-to-grapheme-three-bytes-max { +213 var in/eax: code-point <- copy 0xffff # 1111 11-1111 11-1111 +214 var out/eax: grapheme <- to-grapheme in +215 var out-int/eax: int <- copy out +216 check-ints-equal out-int, 0xbfbfef, "F - to-grapheme/3b" # 1110 1111 10 11-1111 10 11-1111 +217 } +218 +219 # smallest 4-byte utf-8 +220 fn test-to-grapheme-four-bytes-min { +221 var in/eax: code-point <- copy 0x10000 # 1-0000 00-0000 00-0000 +222 var out/eax: grapheme <- to-grapheme in +223 var out-int/eax: int <- copy out +224 check-ints-equal out-int, 0x808090f0, "F - to-grapheme/4a" # 1111-0 000 10 01-0000 10 00-0000 10 00-0000 +225 } +226 +227 # largest 4-byte utf-8 +228 fn test-to-grapheme-four-bytes-max { +229 var in/eax: code-point <- copy 0x1fffff # 111 11-1111 11-1111 11-1111 +230 var out/eax: grapheme <- to-grapheme in +231 var out-int/eax: int <- copy out +232 check-ints-equal out-int, 0xbfbfbff7, "F - to-grapheme/4b" # 1111-0 111 10 11-1111 10 11-1111 10 11-1111 +233 } +234 +235 # read the next grapheme from a stream of bytes +236 fn read-grapheme in: (addr stream byte) -> _/eax: grapheme { +237 # if at eof, return EOF +238 { +239 var eof?/eax: boolean <- stream-empty? in +240 compare eof?, 0/false +241 break-if-= +242 return 0xffffffff +243 } +244 var c/eax: byte <- read-byte in +245 var num-trailers/ecx: int <- copy 0 +246 $read-grapheme:compute-length: { +247 # single byte: just return it +248 compare c, 0xc0 +249 { +250 break-if->= +251 var g/eax: grapheme <- copy c +252 return g +253 } +254 compare c, 0xfe +255 { +256 break-if-< +257 var g/eax: grapheme <- copy c +258 return g +259 } +260 # 2 bytes +261 compare c, 0xe0 +262 { +263 break-if->= +264 num-trailers <- copy 1 +265 break $read-grapheme:compute-length +266 } +267 # 3 bytes +268 compare c, 0xf0 +269 { +270 break-if->= +271 num-trailers <- copy 2 +272 break $read-grapheme:compute-length +273 } +274 # 4 bytes +275 compare c, 0xf8 +276 { +277 break-if->= +278 num-trailers <- copy 3 +279 break $read-grapheme:compute-length +280 } +281 # TODO: print error message +282 return 0 +283 } +284 # prepend trailer bytes +285 var result/edi: grapheme <- copy c +286 var num-byte-shifts/edx: int <- copy 1 +287 { +288 compare num-trailers, 0 +289 break-if-<= +290 var tmp/eax: byte <- read-byte in +291 var tmp2/eax: int <- copy tmp +292 tmp2 <- shift-left-bytes tmp2, num-byte-shifts +293 result <- or tmp2 +294 # update loop state +295 num-byte-shifts <- increment +296 num-trailers <- decrement +297 loop +298 } +299 return result +300 } +301 +302 fn grapheme-length g: grapheme -> _/edx: int { +303 { +304 compare g, 0xff +305 break-if-> +306 return 1 +307 } +308 { +309 compare g, 0xffff +310 break-if-> +311 return 2 +312 } +313 { +314 compare g, 0xffffff +315 break-if-> +316 return 3 +317 } +318 return 4 +319 } +320 +321 # needed because available primitives only shift by a literal/constant number of bits +322 fn shift-left-bytes n: int, k: int -> _/eax: int { +323 var i/ecx: int <- copy 0 +324 var result/eax: int <- copy n +325 { +326 compare i, k +327 break-if->= +328 compare i, 4 # only 4 bytes in 32 bits +329 break-if->= +330 result <- shift-left 8 +331 i <- increment +332 loop +333 } +334 return result +335 } +336 +337 # write a grapheme to a stream of bytes +338 # this is like write-to-stream, except we skip leading 0 bytes +339 fn write-grapheme out: (addr stream byte), g: grapheme { +340 $write-grapheme:body: { +341 var c/eax: int <- copy g +342 append-byte out, c # first byte is always written +343 c <- shift-right 8 +344 compare c, 0 +345 break-if-= $write-grapheme:body +346 append-byte out, c +347 c <- shift-right 8 +348 compare c, 0 +349 break-if-= $write-grapheme:body +350 append-byte out, c +351 c <- shift-right 8 +352 compare c, 0 +353 break-if-= $write-grapheme:body +354 append-byte out, c +355 } +356 } -- cgit 1.4.1-2-gfad0