https://github.com/akkartik/mu/blob/main/403unicode.mu
  1 # Helpers for Unicode.
  2 #
  3 # Mu has no characters, only code points and graphemes.
  4 # Code points are the indivisible atoms of text streams.
  5 #   https://en.wikipedia.org/wiki/Code_point
  6 # Graphemes are the smallest self-contained unit of text.
  7 # Graphemes may consist of multiple code points.
  8 #
  9 # Mu graphemes are always represented in utf-8, and they are required to fit
 10 # in 4 bytes. (This can be confusing if you focus just on ASCII, where Mu's
 11 # graphemes and code-points are identical.)
 12 #
 13 # Mu doesn't currently support combining code points, or graphemes made of
 14 # multiple code points. One day we will.
 15 #   https://en.wikipedia.org/wiki/Combining_character
 16 
 17 fn test-unicode-serialization-and-deserialization {
 18   var i/ebx: int <- copy 0
 19   var init?/esi: boolean <- copy 1/true
 20   {
 21     compare i, 0x10000  # 32 bits of utf-8 are sufficient for https://en.wikipedia.org/wiki/Plane_(Unicode)#Basic_Multilingual_Plane
 22                         # but not emoji
 23     break-if->=
 24     var c/eax: code-point <- copy i
 25     var _g/eax: grapheme <- to-grapheme c
 26     var g/ecx: grapheme <- copy _g
 27     var c2/eax: code-point <- to-code-point g
 28     compare i, c2
 29     {
 30       break-if-=
 31       {
 32         compare init?, 0/false
 33         break-if-=
 34         draw-text-wrapping-right-then-down-from-cursor-over-full-screen 0/screen, "F - test-unicode-serialization-and-deserialization: ", 3/fg 0/bg
 35       }
 36       init? <- copy 0/false
 37       draw-int32-hex-wrapping-right-then-down-from-cursor-over-full-screen 0/screen, i, 3/fg 0/bg
 38       draw-text-wrapping-right-then-down-from-cursor-over-full-screen 0/screen, "/", 3/fg 0/bg
 39       {
 40         var x/eax: int <- copy g
 41         draw-int32-hex-wrapping-right-then-down-from-cursor-over-full-screen 0/screen, x, 3/fg 0/bg
 42       }
 43       draw-text-wrapping-right-then-down-from-cursor-over-full-screen 0/screen, "/", 3/fg 0/bg
 44       {
 45         var x2/eax: int <- copy c2
 46         draw-int32-hex-wrapping-right-then-down-from-cursor-over-full-screen 0/screen, x2, 3/fg 0/bg
 47       }
 48       draw-text-wrapping-right-then-down-from-cursor-over-full-screen 0/screen, " ", 3/fg 0/bg
 49     }
 50     i <- add 0xf  # to speed things up; ensure increment is not a power of 2
 51     loop
 52   }
 53 }
 54 
 55 # transliterated from tb_utf8_char_to_unicode in https://github.com/nsf/termbox
 56 fn to-code-point in: grapheme -> _/eax: code-point {
 57   var g/ebx: int <- copy in
 58   # if single byte, just return it
 59   {
 60     compare g, 0xff
 61     break-if->
 62     var result/eax: code-point <- copy g
 63     return result
 64   }
 65   #
 66   var len/edx: int <- grapheme-length in
 67   # extract bits from first byte
 68   var b/eax: byte <- copy-byte g
 69   var result/edi: code-point <- copy b
 70   {
 71     compare len, 2
 72     break-if-!=
 73     result <- and 0x1f
 74   }
 75   {
 76     compare len, 3
 77     break-if-!=
 78     result <- and 0x0f
 79   }
 80   {
 81     compare len, 4
 82     break-if-!=
 83     result <- and 0x07
 84   }
 85   # extract bits from remaining bytes
 86   g <- shift-right 8
 87   var i/ecx: int <- copy 1
 88   {
 89     compare i, len
 90     break-if->=
 91     var b/eax: byte <- copy-byte g
 92     b <- and 0x3f
 93     result <- shift-left 6
 94     result <- or b
 95     g <- shift-right 8
 96     i <- increment
 97     loop
 98   }
 99   return result
100 }
101 
102 # transliterated from tb_utf8_unicode_to_char in https://github.com/nsf/termbox
103 # https://wiki.tcl-lang.org/page/UTF%2D8+bit+by+bit explains the algorithm
# Encode a code point as utf-8, packed into a grapheme with the first byte of
# the encoding in the lowest-order byte and trailers in successively higher
# bytes.
# Code points above 0x1fffff would need more than 4 bytes; they yield 0.
fn to-grapheme in: code-point -> _/eax: grapheme {
  var c/eax: int <- copy in
  var num-trailers/ecx: int <- copy 0
  var first/edx: int <- copy 0
  # Range checks are unsigned (break-if-addr>) so that a value with its top bit
  # set counts as too large rather than as a negative single-byte value.
  $to-grapheme:compute-length: {
    # single byte: just return it
    compare c, 0x7f
    {
      break-if-addr>
      var g/eax: grapheme <- copy c
      return g
    }
    # 2 bytes
    compare c, 0x7ff
    {
      break-if-addr>
      num-trailers <- copy 1
      first <- copy 0xc0
      break $to-grapheme:compute-length
    }
    # 3 bytes
    compare c, 0xffff
    {
      break-if-addr>
      num-trailers <- copy 2
      first <- copy 0xe0
      break $to-grapheme:compute-length
    }
    # 4 bytes
    compare c, 0x1fffff
    {
      break-if-addr>
      num-trailers <- copy 3
      first <- copy 0xf0
      break $to-grapheme:compute-length
    }
    # more than 4 bytes: unsupported
    # Every supported range has already left this block, so reaching this
    # point means the code point is too large.
    # TODO: print error message to stderr
    return 0
  }
  # emit trailer bytes, 6 bits from 'in', first two bits '10'
  # The lowest 6 bits are emitted first and pushed up by each later trailer,
  # so they end up in the highest-order byte of the result.
  var result/edi: grapheme <- copy 0
  {
    compare num-trailers, 0
    break-if-<=
    var tmp/esi: int <- copy c
    tmp <- and 0x3f
    tmp <- or 0x80
    result <- shift-left 8
    result <- or tmp
    # update loop state
    c <- shift-right 6
    num-trailers <- decrement
    loop
  }
  # emit the first byte (the 'engine' in front of the trailers): length marker
  # plus whatever bits of the code point remain
  result <- shift-left 8
  result <- or c
  result <- or first
  #
  return result
}
170 
 171 # single-byte code points have identical graphemes
# Every code point from 0 through 0x7f must encode to a grapheme with the
# same numeric value.
fn test-to-grapheme-single-byte {
  var expected/ecx: int <- copy 0
  {
    # done once we pass 0x7f
    compare expected, 0x80
    break-if->=
    var cp/eax: code-point <- copy expected
    var g/eax: grapheme <- to-grapheme cp
    var actual/eax: int <- copy g
    check-ints-equal actual, expected, "F - test-to-grapheme-single-byte"
    expected <- increment
    loop
  }
}
185 
186                                                               # byte       | byte      | byte      | byte
187 # smallest 2-byte utf-8
fn test-to-grapheme-two-bytes-min {
  # code point 0x80 in binary:                         10  00-0000
  var cp/eax: code-point <- copy 0x80
  var g/eax: grapheme <- to-grapheme cp
  var actual/eax: int <- copy g
  # expected utf-8 bytes, first to last:  110 0-0010 (0xc2)  10 00-0000 (0x80)
  # the first byte sits in the lowest-order byte of the grapheme
  check-ints-equal actual, 0x80c2, "F - to-grapheme/2a"
}
194 
195 # largest 2-byte utf-8
fn test-to-grapheme-two-bytes-max {
  # code point 0x7ff in binary:                    1-1111  11-1111
  var cp/eax: code-point <- copy 0x7ff
  var g/eax: grapheme <- to-grapheme cp
  var actual/eax: int <- copy g
  # expected utf-8 bytes, first to last:  110 1-1111 (0xdf)  10 11-1111 (0xbf)
  # the first byte sits in the lowest-order byte of the grapheme
  check-ints-equal actual, 0xbfdf, "F - to-grapheme/2b"
}
202 
203 # smallest 3-byte utf-8
fn test-to-grapheme-three-bytes-min {
  # code point 0x800 in binary:                   10-0000  00-0000
  var cp/eax: code-point <- copy 0x800
  var g/eax: grapheme <- to-grapheme cp
  var actual/eax: int <- copy g
  # expected utf-8 bytes, first to last:
  #   1110 0000 (0xe0)  10 10-0000 (0xa0)  10 00-0000 (0x80)
  # the first byte sits in the lowest-order byte of the grapheme
  check-ints-equal actual, 0x80a0e0, "F - to-grapheme/3a"
}
210 
211 # largest 3-byte utf-8
fn test-to-grapheme-three-bytes-max {
  # code point 0xffff in binary:            1111  11-1111  11-1111
  var cp/eax: code-point <- copy 0xffff
  var g/eax: grapheme <- to-grapheme cp
  var actual/eax: int <- copy g
  # expected utf-8 bytes, first to last:
  #   1110 1111 (0xef)  10 11-1111 (0xbf)  10 11-1111 (0xbf)
  # the first byte sits in the lowest-order byte of the grapheme
  check-ints-equal actual, 0xbfbfef, "F - to-grapheme/3b"
}
218 
219 # smallest 4-byte utf-8
fn test-to-grapheme-four-bytes-min {
  # code point 0x10000 in binary:         1-0000  00-0000  00-0000
  var cp/eax: code-point <- copy 0x10000
  var g/eax: grapheme <- to-grapheme cp
  var actual/eax: int <- copy g
  # expected utf-8 bytes, first to last:
  #   1111-0 000 (0xf0)  10 01-0000 (0x90)  10 00-0000 (0x80)  10 00-0000 (0x80)
  # the first byte sits in the lowest-order byte of the grapheme
  check-ints-equal actual, 0x808090f0, "F - to-grapheme/4a"
}
226 
227 # largest 4-byte utf-8
fn test-to-grapheme-four-bytes-max {
  # code point 0x1fffff in binary:     111  11-1111  11-1111  11-1111
  var cp/eax: code-point <- copy 0x1fffff
  var g/eax: grapheme <- to-grapheme cp
  var actual/eax: int <- copy g
  # expected utf-8 bytes, first to last:
  #   1111-0 111 (0xf7)  10 11-1111 (0xbf)  10 11-1111 (0xbf)  10 11-1111 (0xbf)
  # the first byte sits in the lowest-order byte of the grapheme
  check-ints-equal actual, 0xbfbfbff7, "F - to-grapheme/4b"
}
234 
235 # read the next grapheme from a stream of bytes
# Read one utf-8-encoded grapheme from 'in'.
# The first byte read lands in the lowest-order byte of the result; each
# trailer byte goes one byte higher.
# Returns 0xffffffff if 'in' is empty.
# A first byte below 0xc0, or 0xfe and above, is returned unchanged.
# A first byte from 0xf8 up to (not including) 0xfe yields 0.
fn read-grapheme in: (addr stream byte) -> _/eax: grapheme {
  # nothing left to read: return EOF
  {
    var empty?/eax: boolean <- stream-empty? in
    compare empty?, 0/false
    break-if-=
    return 0xffffffff
  }
  var lead/eax: byte <- read-byte in
  var trailers-left/ecx: int <- copy 0
  $read-grapheme:compute-length: {
    # 0xfe and above: hand the byte back unchanged
    compare lead, 0xfe
    {
      break-if-<
      var g/eax: grapheme <- copy lead
      return g
    }
    # below 0xc0: hand the byte back unchanged
    compare lead, 0xc0
    {
      break-if->=
      var g/eax: grapheme <- copy lead
      return g
    }
    # below 0xe0: one trailer follows (2 bytes total)
    compare lead, 0xe0
    {
      break-if->=
      trailers-left <- copy 1
      break $read-grapheme:compute-length
    }
    # below 0xf0: two trailers follow (3 bytes total)
    compare lead, 0xf0
    {
      break-if->=
      trailers-left <- copy 2
      break $read-grapheme:compute-length
    }
    # below 0xf8: three trailers follow (4 bytes total)
    compare lead, 0xf8
    {
      break-if->=
      trailers-left <- copy 3
      break $read-grapheme:compute-length
    }
    # anything else is unsupported
    # TODO: print error message
    return 0
  }
  # start with the first byte, then OR each trailer in at the next byte position
  var result/edi: grapheme <- copy lead
  var byte-index/edx: int <- copy 1
  {
    compare trailers-left, 0
    break-if-<=
    var trailer/eax: byte <- read-byte in
    var shifted/eax: int <- copy trailer
    shifted <- shift-left-bytes shifted, byte-index
    result <- or shifted
    # update loop state
    trailers-left <- decrement
    byte-index <- increment
    loop
  }
  return result
}
301 
# Number of bytes (1 to 4) a grapheme occupies, judged by the magnitude of the
# packed 32-bit value.
#
# Comparisons are unsigned (break-if-addr>). The highest byte of a 4-byte
# grapheme produced by to-grapheme is a trailer (10xx-xxxx), so bit 31 is set;
# a signed comparison would see a negative number and report a length of 1.
fn grapheme-length g: grapheme -> _/edx: int {
  {
    compare g, 0xff
    break-if-addr>
    return 1
  }
  {
    compare g, 0xffff
    break-if-addr>
    return 2
  }
  {
    compare g, 0xffffff
    break-if-addr>
    return 3
  }
  return 4
}
320 
321 # needed because available primitives only shift by a literal/constant number of bits
# Shift 'n' left by 'k' whole bytes and return the result.
# At most 4 shifts are performed, since 32 bits hold only 4 bytes.
# If 'k' is zero or negative, 'n' is returned unchanged.
fn shift-left-bytes n: int, k: int -> _/eax: int {
  var result/eax: int <- copy n
  var shifts-done/ecx: int <- copy 0
  {
    # never more than 4 shifts..
    compare shifts-done, 4
    break-if->=
    # ..and never more than requested
    compare shifts-done, k
    break-if->=
    result <- shift-left 8
    shifts-done <- increment
    loop
  }
  return result
}
336 
337 # write a grapheme to a stream of bytes
338 # this is like write-to-stream, except we skip leading 0 bytes
# Append the bytes of grapheme 'g' to 'out', lowest-order byte first.
# The lowest-order byte is always written; after that, writing stops as soon
# as all remaining higher-order bytes are zero.
fn write-grapheme out: (addr stream byte), g: grapheme {
  var rest/eax: int <- copy g
  {
    append-byte out, rest
    # drop the byte just written; shift-right fills with zeroes from the top,
    # so a 32-bit value is exhausted after at most 4 rounds
    rest <- shift-right 8
    compare rest, 0
    loop-if-!=
  }
}
356 }