From 3350c34a74844e21ea69077e01efff3bae64bdcd Mon Sep 17 00:00:00 2001 From: Kartik Agaram Date: Tue, 23 Mar 2021 17:31:08 -0700 Subject: . --- html/linux/403unicode.mu.html | 452 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 452 insertions(+) create mode 100644 html/linux/403unicode.mu.html (limited to 'html/linux/403unicode.mu.html') diff --git a/html/linux/403unicode.mu.html b/html/linux/403unicode.mu.html new file mode 100644 index 00000000..298b5c1f --- /dev/null +++ b/html/linux/403unicode.mu.html @@ -0,0 +1,452 @@ + + + + +Mu - linux/403unicode.mu + + + + + + + + + + +https://github.com/akkartik/mu/blob/main/linux/403unicode.mu +
+  1 # Helpers for Unicode.
+  2 #
+  3 # Mu has no characters, only code points and graphemes.
+  4 # Code points are the indivisible atoms of text streams.
+  5 #   https://en.wikipedia.org/wiki/Code_point
+  6 # Graphemes are the smallest self-contained unit of text.
+  7 # Graphemes may consist of multiple code points.
+  8 #
+  9 # Mu graphemes are always represented in utf-8, and they are required to fit
+ 10 # in 4 bytes.
+ 11 #
+ 12 # Mu doesn't currently support combining code points, or graphemes made of
+ 13 # multiple code points. One day we will.
+ 14 # We also don't currently support code points that translate into multiple
+ 15 # or wide graphemes. (In particular, Tab will never be supported.)
+ 16 
+ 17 # transliterated from tb_utf8_unicode_to_char in https://github.com/nsf/termbox
+ 18 # https://wiki.tcl-lang.org/page/UTF%2D8+bit+by+bit explains the algorithm
+ 19 #
+ 20 # The day we want to support combining characters, this function will need to
+ 21 # take multiple code points. Or something.
+ 22 fn to-grapheme in: code-point -> _/eax: grapheme {
+ 23   var c/eax: int <- copy in
+ 24   var num-trailers/ecx: int <- copy 0  # how many continuation bytes follow the first byte
+ 25   var first/edx: int <- copy 0  # length-marker bits for the first byte
+ 26   $to-grapheme:compute-length: {
+ 27     # single byte: just return it
+ 28     compare c, 0x7f
+ 29     {
+ 30       break-if->
+ 31       var g/eax: grapheme <- copy c
+ 32       return g
+ 33     }
+ 34     # 2 bytes
+ 35     compare c, 0x7ff
+ 36     {
+ 37       break-if->
+ 38       num-trailers <- copy 1
+ 39       first <- copy 0xc0
+ 40       break $to-grapheme:compute-length
+ 41     }
+ 42     # 3 bytes
+ 43     compare c, 0xffff
+ 44     {
+ 45       break-if->
+ 46       num-trailers <- copy 2
+ 47       first <- copy 0xe0
+ 48       break $to-grapheme:compute-length
+ 49     }
+ 50     # 4 bytes
+ 51     compare c, 0x1fffff
+ 52     {
+ 53       break-if->
+ 54       num-trailers <- copy 3
+ 55       first <- copy 0xf0
+ 56       break $to-grapheme:compute-length
+ 57     }
+ 58     # more than 4 bytes: unsupported (every smaller code point has already left this block)
+ 59     # TODO: print to stderr
+ 60     compare c, 0x1fffff
+ 61     {
+ 62       break-if-<=  # abort only for code points past the 4-byte range
+ 63       print-string-to-real-screen "unsupported code point "
+ 64       print-int32-hex-to-real-screen c
+ 65       print-string-to-real-screen "\n"
+ 66       var exit-status/ebx: int <- copy 1
+ 67       syscall_exit
+ 68     }
+ 69   }
+ 70   # emit trailer bytes, 6 bits from 'in', first two bits '10'
+ 71   var result/edi: grapheme <- copy 0
+ 72   {
+ 73     compare num-trailers, 0
+ 74     break-if-<=
+ 75     var tmp/esi: int <- copy c
+ 76     tmp <- and 0x3f
+ 77     tmp <- or 0x80
+ 78     result <- shift-left 8  # trailers emitted earlier move up so the last utf-8 byte ends up highest
+ 79     result <- or tmp
+ 80     # update loop state
+ 81     c <- shift-right 6
+ 82     num-trailers <- decrement
+ 83     loop
+ 84   }
+ 85   # emit engine: the first byte, made of the remaining high bits of 'in' plus the length marker
+ 86   result <- shift-left 8
+ 87   result <- or c
+ 88   result <- or first
+ 89   # the first utf-8 byte now sits in the lowest byte of result
+ 90   return result
+ 91 }
+ 92 
+ 93 # single-byte code points have identical graphemes: checks every value from 0 through 0x7f
+ 94 fn test-to-grapheme-single-byte {
+ 95   var in-int/ecx: int <- copy 0
+ 96   {
+ 97     compare in-int, 0x7f
+ 98     break-if->  # stop once past the largest single-byte code point
+ 99     var in/eax: code-point <- copy in-int
+100     var out/eax: grapheme <- to-grapheme in
+101     var out-int/eax: int <- copy out
+102     check-ints-equal out-int, in-int, "F - test-to-grapheme-single-byte"
+103     in-int <- increment
+104     loop
+105   }
+106 }
+107 
+108                                                               # byte       | byte      | byte      | byte
+109 # smallest 2-byte utf-8: code point 0x80 becomes bytes c2 80 (first byte in the low-order byte of the result)
+110 fn test-to-grapheme-two-bytes-min {
+111   var in/eax: code-point <- copy 0x80                         #                                 10     00-0000
+112   var out/eax: grapheme <- to-grapheme in
+113   var out-int/eax: int <- copy out
+114   check-ints-equal out-int, 0x80c2, "F - to-grapheme/2a"      #                         110 0-0010  10 00-0000
+115 }
+116 
+117 # largest 2-byte utf-8: code point 0x7ff becomes bytes df bf (first byte in the low-order byte of the result)
+118 fn test-to-grapheme-two-bytes-max {
+119   var in/eax: code-point <- copy 0x7ff                        #                             1-1111     11-1111
+120   var out/eax: grapheme <- to-grapheme in
+121   var out-int/eax: int <- copy out
+122   check-ints-equal out-int, 0xbfdf, "F - to-grapheme/2b"      #                         110 1-1111  10 11-1111
+123 }
+124 
+125 # smallest 3-byte utf-8: code point 0x800 becomes bytes e0 a0 80 (first byte in the low-order byte of the result)
+126 fn test-to-grapheme-three-bytes-min {
+127   var in/eax: code-point <- copy 0x800                        #                            10-0000     00-0000
+128   var out/eax: grapheme <- to-grapheme in
+129   var out-int/eax: int <- copy out
+130   check-ints-equal out-int, 0x80a0e0, "F - to-grapheme/3a"    #              1110 0000  10 10-0000  10 00-0000
+131 }
+132 
+133 # largest 3-byte utf-8: code point 0xffff becomes bytes ef bf bf (first byte in the low-order byte of the result)
+134 fn test-to-grapheme-three-bytes-max {
+135   var in/eax: code-point <- copy 0xffff                       #                   1111     11-1111     11-1111
+136   var out/eax: grapheme <- to-grapheme in
+137   var out-int/eax: int <- copy out
+138   check-ints-equal out-int, 0xbfbfef, "F - to-grapheme/3b"    #              1110 1111  10 11-1111  10 11-1111
+139 }
+140 
+141 # smallest 4-byte utf-8: code point 0x10000 becomes bytes f0 90 80 80 (first byte in the low-order byte of the result)
+142 fn test-to-grapheme-four-bytes-min {
+143   var in/eax: code-point <- copy 0x10000                      #                 1-0000     00-0000     00-0000
+144   var out/eax: grapheme <- to-grapheme in
+145   var out-int/eax: int <- copy out
+146   check-ints-equal out-int, 0x808090f0, "F - to-grapheme/4a"  # 1111-0 000  10 01-0000  10 00-0000  10 00-0000
+147 }
+148 
+149 # largest 4-byte utf-8: 0x1fffff, the largest value to-grapheme accepts, becomes bytes f7 bf bf bf
+150 fn test-to-grapheme-four-bytes-max {
+151   var in/eax: code-point <- copy 0x1fffff                     #        111     11-1111     11-1111     11-1111
+152   var out/eax: grapheme <- to-grapheme in
+153   var out-int/eax: int <- copy out
+154   check-ints-equal out-int, 0xbfbfbff7, "F - to-grapheme/4b"  # 1111-0 111  10 11-1111  10 11-1111  10 11-1111
+155 }
+156 
+157 # read the next grapheme from a stream of bytes; the first byte read lands in the lowest byte of the result
+158 fn read-grapheme in: (addr stream byte) -> _/eax: grapheme {
+159   # if at eof, return EOF (0xffffffff)
+160   {
+161     var eof?/eax: boolean <- stream-empty? in
+162     compare eof?, 0/false
+163     break-if-=
+164     return 0xffffffff
+165   }
+166   var c/eax: byte <- read-byte in
+167   var num-trailers/ecx: int <- copy 0  # how many more bytes belong to this grapheme
+168   $read-grapheme:compute-length: {
+169     # single byte: just return it (anything below 0xc0 is passed through unchanged)
+170     compare c, 0xc0
+171     {
+172       break-if->=
+173       var g/eax: grapheme <- copy c
+174       return g
+175     }
+176     compare c, 0xfe  # 0xfe and above are also returned unchanged as a single byte
+177     {
+178       break-if-<
+179       var g/eax: grapheme <- copy c
+180       return g
+181     }
+182     # 2 bytes (first byte 0xc0 through 0xdf)
+183     compare c, 0xe0
+184     {
+185       break-if->=
+186       num-trailers <- copy 1
+187       break $read-grapheme:compute-length
+188     }
+189     # 3 bytes (first byte 0xe0 through 0xef)
+190     compare c, 0xf0
+191     {
+192       break-if->=
+193       num-trailers <- copy 2
+194       break $read-grapheme:compute-length
+195     }
+196     # 4 bytes (first byte 0xf0 through 0xf7)
+197     compare c, 0xf8
+198     {
+199       break-if->=
+200       num-trailers <- copy 3
+201       break $read-grapheme:compute-length
+202     }
+203 $read-grapheme:abort: {
+204       # only first bytes 0xf8 through 0xfd reach this point; TODO: print to stderr
+205       print-string-to-real-screen "utf-8 encodings larger than 4 bytes are not supported. First byte seen: "
+206       var n/eax: int <- copy c
+207       print-int32-hex-to-real-screen n
+208       print-string-to-real-screen "\n"
+209       var exit-status/ebx: int <- copy 1
+210       syscall_exit
+211     }
+212   }
+213   # prepend trailer bytes: each one is placed one byte higher than the previous
+214   var result/edi: grapheme <- copy c
+215   var num-byte-shifts/edx: int <- copy 1  # byte position for the next trailer
+216   {
+217     compare num-trailers, 0
+218     break-if-<=
+219     var tmp/eax: byte <- read-byte in  # NOTE(review): no emptiness check before reading a trailer; confirm behavior on truncated input
+220     var tmp2/eax: int <- copy tmp
+221     tmp2 <- shift-left-bytes tmp2, num-byte-shifts
+222     result <- or tmp2
+223     # update loop state
+224     num-byte-shifts <- increment
+225     num-trailers <- decrement
+226     loop
+227   }
+228   return result
+229 }
+230 
+231 fn test-read-grapheme {
+232   var s: (stream byte 0x30)
+233   var s2/ecx: (addr stream byte) <- address s
+234   write s2, "aΒc世d界e"  # alternates 1-byte graphemes with a 2-byte and two 3-byte ones
+235   var c/eax: grapheme <- read-grapheme s2
+236   var n/eax: int <- copy c
+237   check-ints-equal n, 0x61, "F - test grapheme/0"  # 'a'
+238   var c/eax: grapheme <- read-grapheme s2
+239   var n/eax: int <- copy c
+240   check-ints-equal n, 0x92ce/greek-capital-letter-beta, "F - test grapheme/1"  # utf-8 bytes ce 92
+241   var c/eax: grapheme <- read-grapheme s2
+242   var n/eax: int <- copy c
+243   check-ints-equal n, 0x63, "F - test grapheme/2"  # 'c'
+244   var c/eax: grapheme <- read-grapheme s2
+245   var n/eax: int <- copy c
+246   check-ints-equal n, 0x96b8e4, "F - test grapheme/3"  # utf-8 bytes e4 b8 96
+247   var c/eax: grapheme <- read-grapheme s2
+248   var n/eax: int <- copy c
+249   check-ints-equal n, 0x64, "F - test grapheme/4"  # 'd'
+250   var c/eax: grapheme <- read-grapheme s2
+251   var n/eax: int <- copy c
+252   check-ints-equal n, 0x8c95e7, "F - test grapheme/5"  # utf-8 bytes e7 95 8c
+253   var c/eax: grapheme <- read-grapheme s2
+254   var n/eax: int <- copy c
+255   check-ints-equal n, 0x65, "F - test grapheme/6"  # 'e'
+256 }
+257 
+258 fn read-grapheme-buffered in: (addr buffered-file) -> _/eax: grapheme {
+259   var c/eax: byte <- read-byte-buffered in  # NOTE(review): unlike read-grapheme there is no EOF check first; confirm what happens at end of file
+260   var num-trailers/ecx: int <- copy 0  # how many more bytes belong to this grapheme
+261   $read-grapheme-buffered:compute-length: {
+262     # single byte: just return it (anything below 0xc0 is passed through unchanged)
+263     compare c, 0xc0
+264     {
+265       break-if->=
+266       var g/eax: grapheme <- copy c
+267       return g
+268     }
+269     compare c, 0xfe  # 0xfe and above are also returned unchanged as a single byte
+270     {
+271       break-if-<
+272       var g/eax: grapheme <- copy c
+273       return g
+274     }
+275     # 2 bytes (first byte 0xc0 through 0xdf)
+276     compare c, 0xe0
+277     {
+278       break-if->=
+279       num-trailers <- copy 1
+280       break $read-grapheme-buffered:compute-length
+281     }
+282     # 3 bytes (first byte 0xe0 through 0xef)
+283     compare c, 0xf0
+284     {
+285       break-if->=
+286       num-trailers <- copy 2
+287       break $read-grapheme-buffered:compute-length
+288     }
+289     # 4 bytes (first byte 0xf0 through 0xf7)
+290     compare c, 0xf8
+291     {
+292       break-if->=
+293       num-trailers <- copy 3
+294       break $read-grapheme-buffered:compute-length
+295     }
+296 $read-grapheme-buffered:abort: {
+297       # only first bytes 0xf8 through 0xfd reach this point; TODO: print to stderr
+298       print-string-to-real-screen "utf-8 encodings larger than 4 bytes are not supported. First byte seen: "
+299       var n/eax: int <- copy c
+300       print-int32-hex-to-real-screen n
+301       print-string-to-real-screen "\n"
+302       var exit-status/ebx: int <- copy 1
+303       syscall_exit
+304     }
+305   }
+306   # prepend trailer bytes: each one is placed one byte higher than the previous
+307   var result/edi: grapheme <- copy c
+308   var num-byte-shifts/edx: int <- copy 1  # byte position for the next trailer
+309   {
+310     compare num-trailers, 0
+311     break-if-<=
+312     var tmp/eax: byte <- read-byte-buffered in
+313     var tmp2/eax: int <- copy tmp
+314     tmp2 <- shift-left-bytes tmp2, num-byte-shifts
+315     result <- or tmp2
+316     # update loop state
+317     num-byte-shifts <- increment
+318     num-trailers <- decrement
+319     loop
+320   }
+321   return result
+322 }
+323 
+324 # shift n left by k bytes; needed because available primitives only shift by a literal/constant number of bits
+325 fn shift-left-bytes n: int, k: int -> _/eax: int {
+326   var i/ecx: int <- copy 0  # number of one-byte shifts done so far
+327   var result/eax: int <- copy n
+328   {
+329     compare i, k
+330     break-if->=  # also exits immediately when k is zero or negative, returning n unchanged
+331     compare i, 4  # only 4 bytes in 32 bits
+332     break-if->=  # after 4 shifts every bit has been shifted out, so larger k adds nothing
+333     result <- shift-left 8
+334     i <- increment
+335     loop
+336   }
+337   return result
+338 }
+339 
+340 fn test-shift-left-bytes-0 {
+341   var result/eax: int <- shift-left-bytes 1, 0  # shifting by 0 bytes leaves the value unchanged
+342   check-ints-equal result, 1, "F - shift-left-bytes 0"
+343 }
+344 
+345 fn test-shift-left-bytes-1 {
+346   var result/eax: int <- shift-left-bytes 1, 1  # 1 byte = 8 bits
+347   check-ints-equal result, 0x100, "F - shift-left-bytes 1"
+348 }
+349 
+350 fn test-shift-left-bytes-2 {
+351   var result/eax: int <- shift-left-bytes 1, 2  # 2 bytes = 16 bits
+352   check-ints-equal result, 0x10000, "F - shift-left-bytes 2"
+353 }
+354 
+355 fn test-shift-left-bytes-3 {
+356   var result/eax: int <- shift-left-bytes 1, 3  # 3 bytes = 24 bits
+357   check-ints-equal result, 0x1000000, "F - shift-left-bytes 3"
+358 }
+359 
+360 fn test-shift-left-bytes-4 {
+361   var result/eax: int <- shift-left-bytes 1, 4  # 4 bytes shifts every bit out of a 32-bit value
+362   check-ints-equal result, 0, "F - shift-left-bytes 4"
+363 }
+364 
+365 fn test-shift-left-bytes-5 {
+366   var result/eax: int <- shift-left-bytes 1, 5  # counts above 4 are capped at 4 shifts, so the result is still 0
+367   check-ints-equal result, 0, "F - shift-left-bytes >4"
+368 }
+369 
+370 # write a grapheme to a stream of bytes, lowest byte first (between 1 and 4 bytes are appended)
+371 # this is like write-to-stream, except we stop once all remaining higher bytes are 0
+372 fn write-grapheme out: (addr stream byte), g: grapheme {
+373 $write-grapheme:body: {
+374   var c/eax: int <- copy g
+375   append-byte out, c  # first byte is always written (presumably only the low byte of c is appended; confirm in append-byte)
+376   c <- shift-right 8
+377   compare c, 0  # checks everything still left in c, not just the next byte
+378   break-if-= $write-grapheme:body
+379   append-byte out, c  # second byte
+380   c <- shift-right 8
+381   compare c, 0
+382   break-if-= $write-grapheme:body
+383   append-byte out, c  # third byte
+384   c <- shift-right 8
+385   compare c, 0
+386   break-if-= $write-grapheme:body
+387   append-byte out, c  # fourth and last byte
+388 }
+389 }
+
+ + + -- cgit 1.4.1-2-gfad0