https://github.com/akkartik/mu/blob/master/403code-point.mu
  1 # Helpers for Unicode "code points".
  2 # https://en.wikipedia.org/wiki/Code_point
  3 #
  4 # Mu has no characters, only code points and graphemes.
  5 # Code points are the indivisible atoms of text streams.
  6 # Graphemes are the smallest self-contained unit of text.
  7 # Graphemes may consist of multiple code points.
  8 #
  9 # Mu graphemes are always represented in utf-8, and they are required to fit
 10 # in 4 bytes.
 11 #
 12 # Mu doesn't currently support combining code points, or graphemes made of
 13 # multiple code points. One day we will.
 14 
 15 # transliterated from tb_utf8_unicode_to_char in https://github.com/nsf/termbox
 16 # https://wiki.tcl-lang.org/page/UTF%2D8+bit+by+bit explains the algorithm
 17 #
 18 # The day we want to support combining characters, this function will need to
 19 # take multiple code points. Or something.
 20 fn to-grapheme in: code-point -> out/eax: grapheme {
 21 $to-grapheme:body: {
 22   var c/eax: int <- copy in
 23   var num-trailers/ecx: int <- copy 0
 24   var first/edx: int <- copy 0
 25   $to-grapheme:compute-length: {
 26     # single byte: just return it
 27     compare c, 0x7f
 28     {
 29       break-if->
 30       out <- copy c
 31       break $to-grapheme:body
 32     }
 33     # 2 bytes
 34     compare c, 0x7ff
 35     {
 36       break-if->
 37       num-trailers <- copy 1
 38       first <- copy 0xc0
 39       break $to-grapheme:compute-length
 40     }
 41     # 3 bytes
 42     compare c, 0xffff
 43     {
 44       break-if->
 45       num-trailers <- copy 2
 46       first <- copy 0xe0
 47       break $to-grapheme:compute-length
 48     }
 49     # 4 bytes
 50     compare c, 0x1fffff
 51     {
 52       break-if->
 53       num-trailers <- copy 3
 54       first <- copy 0xf0
 55       break $to-grapheme:compute-length
 56     }
 57     # more than 4 bytes: unsupported
 58     compare c, 0x1fffff
 59     {
 60       break-if->
 61       print-string-to-real-screen "unsupported code point "
 62       print-int32-hex-to-real-screen c
 63       print-string-to-real-screen "\n"
 64       var exit-status/ebx: int <- copy 1
 65       syscall_exit
 66     }
 67   }
 68   # emit trailer bytes, 6 bits from 'in', first two bits '10'
 69   var result/edi: int <- copy 0
 70   {
 71     compare num-trailers, 0
 72     break-if-<=
 73     var tmp/esi: int <- copy c
 74     tmp <- and 0x3f
 75     tmp <- or 0x80
 76     result <- shift-left 8
 77     result <- or tmp
 78     # update loop state
 79     c <- shift-right 6
 80     num-trailers <- decrement
 81     loop
 82   }
 83   # emit engine
 84   result <- shift-left 8
 85   result <- or c
 86   result <- or first
 87   #
 88   out <- copy result
 89 }
 90 }
 91 
 92 # single-byte code point have identical graphemes
 93 fn test-to-grapheme-single-byte {
 94   var in-int/ecx: int <- copy 0
 95   {
 96     compare in-int, 0x7f
 97     break-if->
 98     var in/eax: code-point <- copy in-int
 99     var out/eax: grapheme <- to-grapheme in
100     var out-int/eax: int <- copy out
101     check-ints-equal out-int, in-int, "F - test-to-grapheme-single-byte"
102     in-int <- increment
103     loop
104   }
105 }
106 
107                                                               # byte       | byte      | byte      | byte
# 0x80 is the smallest code point that needs 2 bytes of utf-8
fn test-to-grapheme-two-bytes-min {
  var cp/eax: code-point <- copy 0x80                         #                                 10     00-0000
  var enc/eax: grapheme <- to-grapheme cp
  var enc-int/eax: int <- copy enc
  check-ints-equal enc-int, 0x80c2, "F - to-grapheme/2a"      #                         110 0-0010  10 00-0000
}
115 
# 0x7ff is the largest code point that fits in 2 bytes of utf-8
fn test-to-grapheme-two-bytes-max {
  var cp/eax: code-point <- copy 0x7ff                        #                             1-1111     11-1111
  var enc/eax: grapheme <- to-grapheme cp
  var enc-int/eax: int <- copy enc
  check-ints-equal enc-int, 0xbfdf, "F - to-grapheme/2b"      #                         110 1-1111  10 11-1111
}
123 
# 0x800 is the smallest code point that needs 3 bytes of utf-8
fn test-to-grapheme-three-bytes-min {
  var cp/eax: code-point <- copy 0x800                        #                            10-0000     00-0000
  var enc/eax: grapheme <- to-grapheme cp
  var enc-int/eax: int <- copy enc
  check-ints-equal enc-int, 0x80a0e0, "F - to-grapheme/3a"    #              1110 0000  10 10-0000  10 00-0000
}
131 
# 0xffff is the largest code point that fits in 3 bytes of utf-8
fn test-to-grapheme-three-bytes-max {
  var cp/eax: code-point <- copy 0xffff                       #                   1111     11-1111     11-1111
  var enc/eax: grapheme <- to-grapheme cp
  var enc-int/eax: int <- copy enc
  check-ints-equal enc-int, 0xbfbfef, "F - to-grapheme/3b"    #              1110 1111  10 11-1111  10 11-1111
}
139 
# 0x10000 is the smallest code point that needs 4 bytes of utf-8
fn test-to-grapheme-four-bytes-min {
  var cp/eax: code-point <- copy 0x10000                      #                 1-0000     00-0000     00-0000
  var enc/eax: grapheme <- to-grapheme cp
  var enc-int/eax: int <- copy enc
  check-ints-equal enc-int, 0x808090f0, "F - to-grapheme/4a"  # 1111-0 000  10 01-0000  10 00-0000  10 00-0000
}
147 
# 0x1fffff is the largest code point that fits in 4 bytes of utf-8
fn test-to-grapheme-four-bytes-max {
  var cp/eax: code-point <- copy 0x1fffff                     #        111     11-1111     11-1111     11-1111
  var enc/eax: grapheme <- to-grapheme cp
  var enc-int/eax: int <- copy enc
  check-ints-equal enc-int, 0xbfbfbff7, "F - to-grapheme/4b"  # 1111-0 111  10 11-1111  10 11-1111  10 11-1111
}
155 
156 # To run all tests, uncomment this and run:
157 #   $ ./translate_mu ''  &&  ./a.elf
158 #? fn main -> r/ebx: int {
159 #?   run-tests
160 #?   r <- copy 0
161 #? }