https://github.com/akkartik/mu/blob/main/403unicode.mu
  1 # Helpers for Unicode.
  2 #
  3 # Mu has no characters, only code points and graphemes.
  4 # Code points are the indivisible atoms of text streams.
  5 #   https://en.wikipedia.org/wiki/Code_point
  6 # Graphemes are the smallest self-contained unit of text.
  7 # Graphemes may consist of multiple code points.
  8 #
  9 # Mu graphemes are always represented in utf-8, and they are required to fit
 10 # in 4 bytes.
 11 #
 12 # Mu doesn't currently support combining code points, or graphemes made of
 13 # multiple code points. One day we will.
 14 # We also don't currently support code points that translate into multiple
 15 # or wide graphemes. (In particular, Tab will never be supported.)
 16 
 17 # transliterated from tb_utf8_unicode_to_char in https://github.com/nsf/termbox
 18 # https://wiki.tcl-lang.org/page/UTF%2D8+bit+by+bit explains the algorithm
 19 #
 20 # The day we want to support combining characters, this function will need to
 21 # take multiple code points. Or something.
 22 fn to-grapheme in: code-point -> _/eax: grapheme {
 23   var c/eax: int <- copy in
 24   var num-trailers/ecx: int <- copy 0
 25   var first/edx: int <- copy 0
 26   $to-grapheme:compute-length: {
 27     # single byte: just return it
 28     compare c, 0x7f
 29     {
 30       break-if->
 31       var g/eax: grapheme <- copy c
 32       return g
 33     }
 34     # 2 bytes
 35     compare c, 0x7ff
 36     {
 37       break-if->
 38       num-trailers <- copy 1
 39       first <- copy 0xc0
 40       break $to-grapheme:compute-length
 41     }
 42     # 3 bytes
 43     compare c, 0xffff
 44     {
 45       break-if->
 46       num-trailers <- copy 2
 47       first <- copy 0xe0
 48       break $to-grapheme:compute-length
 49     }
 50     # 4 bytes
 51     compare c, 0x1fffff
 52     {
 53       break-if->
 54       num-trailers <- copy 3
 55       first <- copy 0xf0
 56       break $to-grapheme:compute-length
 57     }
 58     # more than 4 bytes: unsupported
 59     # TODO: print to stderr
 60     compare c, 0x1fffff
 61     {
 62       break-if->
 63       print-string-to-real-screen "unsupported code point "
 64       print-int32-hex-to-real-screen c
 65       print-string-to-real-screen "\n"
 66       var exit-status/ebx: int <- copy 1
 67       syscall_exit
 68     }
 69   }
 70   # emit trailer bytes, 6 bits from 'in', first two bits '10'
 71   var result/edi: grapheme <- copy 0
 72   {
 73     compare num-trailers, 0
 74     break-if-<=
 75     var tmp/esi: int <- copy c
 76     tmp <- and 0x3f
 77     tmp <- or 0x80
 78     result <- shift-left 8
 79     result <- or tmp
 80     # update loop state
 81     c <- shift-right 6
 82     num-trailers <- decrement
 83     loop
 84   }
 85   # emit engine
 86   result <- shift-left 8
 87   result <- or c
 88   result <- or first
 89   #
 90   return result
 91 }
 92 
  93 # single-byte code points have identical graphemes
 94 fn test-to-grapheme-single-byte {
 95   var in-int/ecx: int <- copy 0
 96   {
 97     compare in-int, 0x7f
 98     break-if->
 99     var in/eax: code-point <- copy in-int
100     var out/eax: grapheme <- to-grapheme in
101     var out-int/eax: int <- copy out
102     check-ints-equal out-int, in-int, "F - test-to-grapheme-single-byte"
103     in-int <- increment
104     loop
105   }
106 }
107 
108                                                               # byte       | byte      | byte      | byte
109 # smallest 2-byte utf-8
fn test-to-grapheme-two-bytes-min {
  # 0x80 is the first code point past the single-byte range
  var cp/ecx: code-point <- copy 0x80
  var encoded/eax: grapheme <- to-grapheme cp
  var actual/eax: int <- copy encoded
  # utf-8 bytes c2 80, with c2 in the lowest-order position
  check-ints-equal actual, 0x80c2, "F - to-grapheme/2a"
}
116 
117 # largest 2-byte utf-8
fn test-to-grapheme-two-bytes-max {
  # 0x7ff has all 11 payload bits of a 2-byte encoding set
  var cp/ecx: code-point <- copy 0x7ff
  var encoded/eax: grapheme <- to-grapheme cp
  var actual/eax: int <- copy encoded
  # utf-8 bytes df bf, with df in the lowest-order position
  check-ints-equal actual, 0xbfdf, "F - to-grapheme/2b"
}
124 
125 # smallest 3-byte utf-8
fn test-to-grapheme-three-bytes-min {
  # 0x800 is the first code point past the 2-byte range
  var cp/ecx: code-point <- copy 0x800
  var encoded/eax: grapheme <- to-grapheme cp
  var actual/eax: int <- copy encoded
  # utf-8 bytes e0 a0 80, with e0 in the lowest-order position
  check-ints-equal actual, 0x80a0e0, "F - to-grapheme/3a"
}
132 
133 # largest 3-byte utf-8
fn test-to-grapheme-three-bytes-max {
  # 0xffff has all 16 payload bits of a 3-byte encoding set
  var cp/ecx: code-point <- copy 0xffff
  var encoded/eax: grapheme <- to-grapheme cp
  var actual/eax: int <- copy encoded
  # utf-8 bytes ef bf bf, with ef in the lowest-order position
  check-ints-equal actual, 0xbfbfef, "F - to-grapheme/3b"
}
140 
141 # smallest 4-byte utf-8
fn test-to-grapheme-four-bytes-min {
  # 0x10000 is the first code point past the 3-byte range
  var cp/ecx: code-point <- copy 0x10000
  var encoded/eax: grapheme <- to-grapheme cp
  var actual/eax: int <- copy encoded
  # utf-8 bytes f0 90 80 80, with f0 in the lowest-order position
  check-ints-equal actual, 0x808090f0, "F - to-grapheme/4a"
}
148 
149 # largest 4-byte utf-8
fn test-to-grapheme-four-bytes-max {
  # 0x1fffff has all 21 payload bits of a 4-byte encoding set
  var cp/ecx: code-point <- copy 0x1fffff
  var encoded/eax: grapheme <- to-grapheme cp
  var actual/eax: int <- copy encoded
  # utf-8 bytes f7 bf bf bf, with f7 in the lowest-order position
  check-ints-equal actual, 0xbfbfbff7, "F - to-grapheme/4b"
}
156 
157 # read the next grapheme from a stream of bytes
# Returns 0xffffffff if the stream is already empty. Otherwise the bytes of
# the grapheme are packed with the first byte read in the lowest-order
# position. A first byte in the range 0xf8..0xfd prints an error to the real
# screen and exits with status 1.
fn read-grapheme in: (addr stream byte) -> _/eax: grapheme {
  # nothing left to read: hand back the EOF marker
  {
    var done?/eax: boolean <- stream-empty? in
    compare done?, 0  # false
    break-if-=
    return 0xffffffff
  }
  var lead/eax: byte <- read-byte in
  var remaining/ecx: int <- copy 0
  $read-grapheme:classify: {
    # anything below 0xc0 is returned as-is, as a 1-byte grapheme
    {
      compare lead, 0xc0
      break-if->=
      var single/eax: grapheme <- copy lead
      return single
    }
    # 0xfe and above are also passed through untouched
    {
      compare lead, 0xfe
      break-if-<
      var single/eax: grapheme <- copy lead
      return single
    }
    # below 0xe0: one trailer follows
    remaining <- copy 1
    compare lead, 0xe0
    break-if-< $read-grapheme:classify
    # below 0xf0: two trailers follow
    remaining <- copy 2
    compare lead, 0xf0
    break-if-< $read-grapheme:classify
    # below 0xf8: three trailers follow
    remaining <- copy 3
    compare lead, 0xf8
    break-if-< $read-grapheme:classify
    # anything else would need more than 4 bytes: give up
    {
      # TODO: print to stderr
      print-string-to-real-screen "utf-8 encodings larger than 4 bytes are not supported. First byte seen: "
      var n/eax: int <- copy lead
      print-int32-hex-to-real-screen n
      print-string-to-real-screen "\n"
      var exit-status/ebx: int <- copy 1
      syscall_exit
    }
  }
  # slot each trailer into the next-higher byte of the result
  var result/edi: grapheme <- copy lead
  var byte-index/edx: int <- copy 1
  {
    compare remaining, 0
    break-if-<=
    var trailer/eax: byte <- read-byte in
    var shifted/eax: int <- copy trailer
    shifted <- shift-left-bytes shifted, byte-index
    result <- or shifted
    # one fewer trailer to go; the next one lands one byte higher
    remaining <- decrement
    byte-index <- increment
    loop
  }
  return result
}
230 
fn test-read-grapheme {
  # a mix of 1-, 2- and 3-byte graphemes, read back one at a time
  var storage: (stream byte 0x30)
  var input/ecx: (addr stream byte) <- address storage
  write input, "aΒc世d界e"
  # 'a'
  var g/eax: grapheme <- read-grapheme input
  var actual/eax: int <- copy g
  check-ints-equal actual, 0x61, "F - test grapheme/0"
  # greek capital letter beta: 2 bytes
  var g/eax: grapheme <- read-grapheme input
  var actual/eax: int <- copy g
  check-ints-equal actual, 0x92ce, "F - test grapheme/1"
  # 'c'
  var g/eax: grapheme <- read-grapheme input
  var actual/eax: int <- copy g
  check-ints-equal actual, 0x63, "F - test grapheme/2"
  # 3 bytes
  var g/eax: grapheme <- read-grapheme input
  var actual/eax: int <- copy g
  check-ints-equal actual, 0x96b8e4, "F - test grapheme/3"
  # 'd'
  var g/eax: grapheme <- read-grapheme input
  var actual/eax: int <- copy g
  check-ints-equal actual, 0x64, "F - test grapheme/4"
  # 3 bytes
  var g/eax: grapheme <- read-grapheme input
  var actual/eax: int <- copy g
  check-ints-equal actual, 0x8c95e7, "F - test grapheme/5"
  # 'e'
  var g/eax: grapheme <- read-grapheme input
  var actual/eax: int <- copy g
  check-ints-equal actual, 0x65, "F - test grapheme/6"
}
257 
# Read the next utf-8 grapheme from a buffered file.
# The bytes of the grapheme are packed with the first byte read in the
# lowest-order position. A first byte in the range 0xf8..0xfd prints an error
# to the real screen and exits with status 1.
fn read-grapheme-buffered in: (addr buffered-file) -> _/eax: grapheme {
  var lead/eax: byte <- read-byte-buffered in
  var remaining/ecx: int <- copy 0
  $read-grapheme-buffered:classify: {
    # anything below 0xc0 is returned as-is, as a 1-byte grapheme
    {
      compare lead, 0xc0
      break-if->=
      var single/eax: grapheme <- copy lead
      return single
    }
    # 0xfe and above are also passed through untouched
    {
      compare lead, 0xfe
      break-if-<
      var single/eax: grapheme <- copy lead
      return single
    }
    # below 0xe0: one trailer follows
    remaining <- copy 1
    compare lead, 0xe0
    break-if-< $read-grapheme-buffered:classify
    # below 0xf0: two trailers follow
    remaining <- copy 2
    compare lead, 0xf0
    break-if-< $read-grapheme-buffered:classify
    # below 0xf8: three trailers follow
    remaining <- copy 3
    compare lead, 0xf8
    break-if-< $read-grapheme-buffered:classify
    # anything else would need more than 4 bytes: give up
    {
      # TODO: print to stderr
      print-string-to-real-screen "utf-8 encodings larger than 4 bytes are not supported. First byte seen: "
      var n/eax: int <- copy lead
      print-int32-hex-to-real-screen n
      print-string-to-real-screen "\n"
      var exit-status/ebx: int <- copy 1
      syscall_exit
    }
  }
  # slot each trailer into the next-higher byte of the result
  var result/edi: grapheme <- copy lead
  var byte-index/edx: int <- copy 1
  {
    compare remaining, 0
    break-if-<=
    var trailer/eax: byte <- read-byte-buffered in
    var shifted/eax: int <- copy trailer
    shifted <- shift-left-bytes shifted, byte-index
    result <- or shifted
    # one fewer trailer to go; the next one lands one byte higher
    remaining <- decrement
    byte-index <- increment
    loop
  }
  return result
}
323 
324 # needed because available primitives only shift by a literal/constant number of bits
# Shift 'n' left by 'k' whole bytes, 8 bits at a time.
# At most 4 shifts are performed, so any k >= 4 yields 0; k <= 0 returns n
# unchanged.
fn shift-left-bytes n: int, k: int -> _/eax: int {
  var shifted/eax: int <- copy n
  var done/ecx: int <- copy 0
  {
    # a 32-bit value is empty after 4 byte-shifts; no point going further
    compare done, 4
    break-if->=
    # stop once the requested number of shifts has been performed
    compare done, k
    break-if->=
    shifted <- shift-left 8
    done <- increment
    loop
  }
  return shifted
}
339 
fn test-shift-left-bytes-0 {
  # shifting by zero bytes leaves the value alone
  var shifted/eax: int <- shift-left-bytes 1, 0
  check-ints-equal shifted, 1, "F - shift-left-bytes 0"
}
344 
fn test-shift-left-bytes-1 {
  # one byte = 8 bits
  var shifted/eax: int <- shift-left-bytes 1, 1
  check-ints-equal shifted, 0x100, "F - shift-left-bytes 1"
}
349 
fn test-shift-left-bytes-2 {
  # two bytes = 16 bits
  var shifted/eax: int <- shift-left-bytes 1, 2
  check-ints-equal shifted, 0x10000, "F - shift-left-bytes 2"
}
354 
fn test-shift-left-bytes-3 {
  # three bytes = 24 bits
  var shifted/eax: int <- shift-left-bytes 1, 3
  check-ints-equal shifted, 0x1000000, "F - shift-left-bytes 3"
}
359 
fn test-shift-left-bytes-4 {
  # four bytes pushes every bit out of a 32-bit int
  var shifted/eax: int <- shift-left-bytes 1, 4
  check-ints-equal shifted, 0, "F - shift-left-bytes 4"
}
364 
fn test-shift-left-bytes-5 {
  # requests beyond four bytes behave like four
  var shifted/eax: int <- shift-left-bytes 1, 5
  check-ints-equal shifted, 0, "F - shift-left-bytes >4"
}
369 
370 # To run all tests, uncomment this and run:
371 #   $ ./translate_mu  &&  ./a.elf
372 #? fn main -> _/ebx: int {
373 #?   run-tests
374 #?   r <- copy 0
375 #? }
376 
377 # write a grapheme to a stream of bytes
378 # this is like write-to-stream, except we skip leading 0 bytes
fn write-grapheme out: (addr stream byte), g: grapheme {
  var pending/eax: int <- copy g
  var written/ecx: int <- copy 0
  {
    # the lowest-order byte always goes out, even if it is 0 on the first pass
    append-byte out, pending
    written <- increment
    # a grapheme never holds more than 4 bytes
    compare written, 4
    break-if->=
    # move on to the next byte; stop as soon as only zeroes remain
    pending <- shift-right 8
    compare pending, 0
    loop-if-!=
  }
}