https://github.com/akkartik/mu/blob/master/403unicode.mu
  1 # Helpers for Unicode.
  2 #
  3 # Mu has no characters, only code points and graphemes.
  4 # Code points are the indivisible atoms of text streams.
  5 #   https://en.wikipedia.org/wiki/Code_point
  6 # Graphemes are the smallest self-contained units of text.
  7 # Graphemes may consist of multiple code points.
  8 #
  9 # Mu graphemes are always represented in utf-8, and they are required to fit
 10 # in 4 bytes.
 11 #
 12 # Mu doesn't currently support combining code points, or graphemes made of
 13 # multiple code points. One day we will.
 14 # We also don't currently support code points that translate into multiple
 15 # or wide graphemes. (In particular, Tab will never be supported.)
 16 
 17 # Encode code point 'in' as utf-8, packed into 'out' with the first utf-8 byte lowest.
 18 # Code points above 0x1fffff need more than 4 bytes: print a message and exit with status 1.
 19 # Transliterated from tb_utf8_unicode_to_char in https://github.com/nsf/termbox
 20 # https://wiki.tcl-lang.org/page/UTF%2D8+bit+by+bit explains the algorithm
 21 # To support combining characters this will someday need to take multiple code points.
 22 fn to-grapheme in: code-point -> out/eax: grapheme {
 23 $to-grapheme:body: {
 24   var c/eax: int <- copy in
 25   var num-trailers/ecx: int <- copy 0
 26   var first/edx: int <- copy 0  # marker bits for the lead byte
 27   $to-grapheme:compute-length: {
 28     # single byte: just return it
 29     compare c, 0x7f
 30     {
 31       break-if->
 32       out <- copy c
 33       break $to-grapheme:body
 34     }
 35     # 2 bytes
 36     compare c, 0x7ff
 37     {
 38       break-if->
 39       num-trailers <- copy 1
 40       first <- copy 0xc0
 41       break $to-grapheme:compute-length
 42     }
 43     # 3 bytes
 44     compare c, 0xffff
 45     {
 46       break-if->
 47       num-trailers <- copy 2
 48       first <- copy 0xe0
 49       break $to-grapheme:compute-length
 50     }
 51     # 4 bytes
 52     compare c, 0x1fffff
 53     {
 54       break-if->
 55       num-trailers <- copy 3
 56       first <- copy 0xf0
 57       break $to-grapheme:compute-length
 58     }
 59     # more than 4 bytes: unsupported
 60     # Each branch above leaves this block when it matches, so reaching this
 61     # point already means c > 0x1fffff; abort without any further test.
 62     $to-grapheme:abort: {
 63       # TODO: print to stderr
 64       print-string-to-real-screen "unsupported code point "
 65       print-int32-hex-to-real-screen c
 66       print-string-to-real-screen "\n"
 67       var exit-status/ebx: int <- copy 1
 68       syscall_exit
 69     }
 70   }
 71   # emit trailer bytes, 6 bits from 'in', first two bits '10'
 72   var result/edi: int <- copy 0
 73   {
 74     compare num-trailers, 0
 75     break-if-<=
 76     var tmp/esi: int <- copy c
 77     tmp <- and 0x3f
 78     tmp <- or 0x80
 79     result <- shift-left 8  # trailers emitted so far move up to make room
 80     result <- or tmp
 81     # update loop state
 82     c <- shift-right 6
 83     num-trailers <- decrement
 84     loop
 85   }
 86   # emit engine: the lead byte, i.e. the remaining high bits of 'in' under the length marker
 87   result <- shift-left 8
 88   result <- or c
 89   result <- or first
 90   # the lead byte now sits in the lowest byte of result
 91   out <- copy result
 92 }
 93 }
 94 
 95 # every code point in 0..0x7f must encode to a grapheme with the same numeric value
 96 fn test-to-grapheme-single-byte {
 97   var expected/ecx: int <- copy 0  # sweeps the whole single-byte range
 98   {
 99     compare expected, 0x7f
100     break-if->  # past the last single-byte code point: done
101     var cp/eax: code-point <- copy expected
102     var g/eax: grapheme <- to-grapheme cp
103     var actual/eax: int <- copy g
104     check-ints-equal actual, expected, "F - test-to-grapheme-single-byte"
105     expected <- increment
106     loop
107   }
108 }
109 
110                                                               # byte       | byte      | byte      | byte
111 # 0x80 is the first code point that needs a lead byte plus one trailer
112 fn test-to-grapheme-two-bytes-min {
113   var cp/eax: code-point <- copy 0x80                         #                                 10     00-0000
114   var g/eax: grapheme <- to-grapheme cp
115   var encoded/eax: int <- copy g
116   check-ints-equal encoded, 0x80c2, "F - to-grapheme/2a"      #                         110 0-0010  10 00-0000
117 }
118 
119 # 0x7ff is the last code point that still fits in a lead byte plus one trailer
120 fn test-to-grapheme-two-bytes-max {
121   var cp/eax: code-point <- copy 0x7ff                        #                             1-1111     11-1111
122   var g/eax: grapheme <- to-grapheme cp
123   var encoded/eax: int <- copy g
124   check-ints-equal encoded, 0xbfdf, "F - to-grapheme/2b"      #                         110 1-1111  10 11-1111
125 }
126 
127 # 0x800 is the first code point that needs a lead byte plus two trailers
128 fn test-to-grapheme-three-bytes-min {
129   var cp/eax: code-point <- copy 0x800                        #                            10-0000     00-0000
130   var g/eax: grapheme <- to-grapheme cp
131   var encoded/eax: int <- copy g
132   check-ints-equal encoded, 0x80a0e0, "F - to-grapheme/3a"    #              1110 0000  10 10-0000  10 00-0000
133 }
134 
135 # 0xffff is the last code point that still fits in a lead byte plus two trailers
136 fn test-to-grapheme-three-bytes-max {
137   var cp/eax: code-point <- copy 0xffff                       #                   1111     11-1111     11-1111
138   var g/eax: grapheme <- to-grapheme cp
139   var encoded/eax: int <- copy g
140   check-ints-equal encoded, 0xbfbfef, "F - to-grapheme/3b"    #              1110 1111  10 11-1111  10 11-1111
141 }
142 
143 # 0x10000 is the first code point that needs a lead byte plus three trailers
144 fn test-to-grapheme-four-bytes-min {
145   var cp/eax: code-point <- copy 0x10000                      #                 1-0000     00-0000     00-0000
146   var g/eax: grapheme <- to-grapheme cp
147   var encoded/eax: int <- copy g
148   check-ints-equal encoded, 0x808090f0, "F - to-grapheme/4a"  # 1111-0 000  10 01-0000  10 00-0000  10 00-0000
149 }
150 
151 # 0x1fffff is the last code point to-grapheme accepts: a lead byte plus three trailers
152 fn test-to-grapheme-four-bytes-max {
153   var cp/eax: code-point <- copy 0x1fffff                     #        111     11-1111     11-1111     11-1111
154   var g/eax: grapheme <- to-grapheme cp
155   var encoded/eax: int <- copy g
156   check-ints-equal encoded, 0xbfbfbff7, "F - to-grapheme/4b"  # 1111-0 111  10 11-1111  10 11-1111  10 11-1111
157 }
158 
159 # Read one utf-8 grapheme from 'in'; bytes are packed in read order, lead byte lowest.
160 fn read-grapheme in: (addr stream byte) -> out/eax: grapheme {
161 $read-grapheme:body: {
162   var lead/eax: byte <- read-byte in
163   var pending/ecx: int <- copy 0  # trailer bytes still to read
164   $read-grapheme:count-trailers: {
165     # below 0xc0: not a multi-byte lead, so the byte is the whole grapheme
166     compare lead, 0xc0
167     {
168       break-if->=
169       out <- copy lead
170       break $read-grapheme:body
171     }
172     compare lead, 0xfe  # 0xfe and 0xff are likewise passed through unchanged
173     {
174       break-if-<
175       out <- copy lead
176       break $read-grapheme:body
177     }
178     # 0xc0..0xdf: one trailer follows
179     compare lead, 0xe0
180     {
181       break-if->=
182       pending <- copy 1
183       break $read-grapheme:count-trailers
184     }
185     # 0xe0..0xef: two trailers follow
186     compare lead, 0xf0
187     {
188       break-if->=
189       pending <- copy 2
190       break $read-grapheme:count-trailers
191     }
192     # 0xf0..0xf7: three trailers follow
193     compare lead, 0xf8
194     {
195       break-if->=
196       pending <- copy 3
197       break $read-grapheme:count-trailers
198     }
199     # 0xf8..0xfd: would start an encoding longer than 4 bytes; give up
200     $read-grapheme:abort: {
201       # TODO: print to stderr
202       print-string-to-real-screen "utf-8 encodings larger than 4 bytes are not supported. First byte seen: "
203       var lead-int/eax: int <- copy lead
204       print-int32-hex-to-real-screen lead-int
205       print-string-to-real-screen "\n"
206       var exit-status/ebx: int <- copy 1
207       syscall_exit
208     }
209   }
210   # splice each trailer into the next-higher byte of the result
211   var packed/edi: int <- copy lead
212   var slot/edx: int <- copy 1  # byte position the next trailer lands in
213   {
214     compare pending, 0
215     break-if-<=
216     var next/eax: byte <- read-byte in
217     var shifted/eax: int <- copy next
218     shifted <- shift-left-bytes shifted, slot
219     packed <- or shifted
220     # one fewer trailer to go, one byte position higher
221     pending <- decrement
222     slot <- increment
223     loop
224   }
225   out <- copy packed
226 }
227 }
228 
229 fn test-read-grapheme {
230   var text: (stream byte 0x30)
231   var text-addr/ecx: (addr stream byte) <- address text
232   write text-addr, "aΒc世d界e"  # 1-, 2- and 3-byte graphemes interleaved
233   var g/eax: grapheme <- read-grapheme text-addr
234   var actual/eax: int <- copy g
235   check-ints-equal actual, 0x61, "F - test grapheme/0"  # a
236   var g/eax: grapheme <- read-grapheme text-addr
237   var actual/eax: int <- copy g
238   check-ints-equal actual, 0x92ce, "F - test grapheme/1"  # Β (greek capital letter beta): bytes ce 92
239   var g/eax: grapheme <- read-grapheme text-addr
240   var actual/eax: int <- copy g
241   check-ints-equal actual, 0x63, "F - test grapheme/2"  # c
242   var g/eax: grapheme <- read-grapheme text-addr
243   var actual/eax: int <- copy g
244   check-ints-equal actual, 0x96b8e4, "F - test grapheme/3"  # 世: bytes e4 b8 96
245   var g/eax: grapheme <- read-grapheme text-addr
246   var actual/eax: int <- copy g
247   check-ints-equal actual, 0x64, "F - test grapheme/4"  # d
248   var g/eax: grapheme <- read-grapheme text-addr
249   var actual/eax: int <- copy g
250   check-ints-equal actual, 0x8c95e7, "F - test grapheme/5"  # 界: bytes e7 95 8c
251   var g/eax: grapheme <- read-grapheme text-addr
252   var actual/eax: int <- copy g
253   check-ints-equal actual, 0x65, "F - test grapheme/6"  # e
254 }
255 
256 fn read-grapheme-buffered in: (addr buffered-file) -> out/eax: grapheme {
257 $read-grapheme-buffered:body: {
258   # Read one utf-8 grapheme from buffered file 'in', packing bytes in read order (lead byte lowest).
259   var lead/eax: byte <- read-byte-buffered in
260   var pending/ecx: int <- copy 0  # trailer bytes still to read
261   $read-grapheme-buffered:count-trailers: {
262     # below 0xc0: not a multi-byte lead, so the byte is the whole grapheme
263     compare lead, 0xc0
264     {
265       break-if->=
266       out <- copy lead
267       break $read-grapheme-buffered:body
268     }
269     compare lead, 0xfe  # 0xfe and 0xff are likewise passed through unchanged
270     {
271       break-if-<
272       out <- copy lead
273       break $read-grapheme-buffered:body
274     }
275     # 0xc0..0xdf: one trailer follows
276     compare lead, 0xe0
277     {
278       break-if->=
279       pending <- copy 1
280       break $read-grapheme-buffered:count-trailers
281     }
282     # 0xe0..0xef: two trailers follow
283     compare lead, 0xf0
284     {
285       break-if->=
286       pending <- copy 2
287       break $read-grapheme-buffered:count-trailers
288     }
289     # 0xf0..0xf7: three trailers follow
290     compare lead, 0xf8
291     {
292       break-if->=
293       pending <- copy 3
294       break $read-grapheme-buffered:count-trailers
295     }
296     # 0xf8..0xfd: would start an encoding longer than 4 bytes; give up
297     $read-grapheme-buffered:abort: {
298       # TODO: print to stderr
299       print-string-to-real-screen "utf-8 encodings larger than 4 bytes are not supported. First byte seen: "
300       var lead-int/eax: int <- copy lead
301       print-int32-hex-to-real-screen lead-int
302       print-string-to-real-screen "\n"
303       var exit-status/ebx: int <- copy 1
304       syscall_exit
305     }
306   }
307   # splice each trailer into the next-higher byte of the result
308   var packed/edi: int <- copy lead
309   var slot/edx: int <- copy 1  # byte position the next trailer lands in
310   {
311     compare pending, 0
312     break-if-<=
313     var next/eax: byte <- read-byte-buffered in
314     var shifted/eax: int <- copy next
315     shifted <- shift-left-bytes shifted, slot
316     packed <- or shifted
317     pending <- decrement
318     slot <- increment
319     loop
320   }
321   out <- copy packed
322 }
323 }
324 
325 # Shift n left by k whole bytes (0 once k >= 4); shift primitives only take a literal bit count.
326 fn shift-left-bytes n: int, k: int -> result/eax: int {
327   result <- copy n
328   var done/ecx: int <- copy 0  # byte-shifts performed so far
329   {
330     compare done, 4  # a 32-bit word has nothing left after 4 byte-shifts
331     break-if->=
332     compare done, k
333     break-if->=
334     done <- increment
335     result <- shift-left 8
336     loop
337   }
338 }
339 
340 fn test-shift-left-bytes-0 {
341   var shifted/eax: int <- shift-left-bytes 1, 0  # no byte-shifts: value comes back untouched
342   check-ints-equal shifted, 1, "F - shift-left-bytes 0"
343 }
344 
345 fn test-shift-left-bytes-1 {
346   var shifted/eax: int <- shift-left-bytes 1, 1  # the set bit moves up into byte 1
347   check-ints-equal shifted, 0x100, "F - shift-left-bytes 1"
348 }
349 
350 fn test-shift-left-bytes-2 {
351   var shifted/eax: int <- shift-left-bytes 1, 2  # the set bit moves up into byte 2
352   check-ints-equal shifted, 0x10000, "F - shift-left-bytes 2"
353 }
354 
355 fn test-shift-left-bytes-3 {
356   var shifted/eax: int <- shift-left-bytes 1, 3  # the set bit moves up into the top byte
357   check-ints-equal shifted, 0x1000000, "F - shift-left-bytes 3"
358 }
359 
360 fn test-shift-left-bytes-4 {
361   var shifted/eax: int <- shift-left-bytes 1, 4  # the set bit falls off the end of the word
362   check-ints-equal shifted, 0, "F - shift-left-bytes 4"
363 }
364 
365 fn test-shift-left-bytes-5 {
366   var shifted/eax: int <- shift-left-bytes 1, 5  # shifting stops after 4 bytes; result stays 0
367   check-ints-equal shifted, 0, "F - shift-left-bytes >4"
368 }
369 
370 # To run all tests, uncomment this and run:
371 #   $ ./translate_mu  &&  ./a.elf
372 #? fn main -> r/ebx: int {
373 #?   run-tests
374 #?   r <- copy 0
375 #? }
376 
377 # Append the bytes of grapheme g to stream 'out', lowest byte first.
378 # This is like write-to-stream, except that we stop once only zero bytes remain above those written.
379 fn write-grapheme out: (addr stream byte), g: grapheme {
380 $write-grapheme:body: {
381   var rest/eax: int <- copy g
382   append-byte out, rest  # byte 0 goes out unconditionally
383   rest <- shift-right 8
384   compare rest, 0
385   break-if-= $write-grapheme:body  # nothing above byte 0
386   append-byte out, rest
387   rest <- shift-right 8
388   compare rest, 0
389   break-if-= $write-grapheme:body  # nothing above byte 1
390   append-byte out, rest
391   rest <- shift-right 8
392   compare rest, 0
393   break-if-= $write-grapheme:body  # nothing above byte 2
394   append-byte out, rest
395 }
396 }