https://github.com/akkartik/mu/blob/main/403unicode.mu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
# Encode the code point `in` as UTF-8 and pack the resulting bytes into a
# single grapheme, with the first (lead) byte in the least significant byte
# and each following byte one position higher.
#
# Code points up to 0x1fffff (at most 4 bytes) are supported. Anything larger
# prints "unsupported code point <hex>" to the real screen and exits with
# status 1.
fn to-grapheme in: code-point -> _/eax: grapheme {
  var c/eax: int <- copy in
  var num-trailers/ecx: int <- copy 0
  var first/edx: int <- copy 0
  $to-grapheme:compute-length: {
    # single byte: return the value unchanged
    compare c, 0x7f
    {
      break-if->
      var g/eax: grapheme <- copy c
      return g
    }
    # 2 bytes: lead byte marker 0xc0, 1 trailer
    compare c, 0x7ff
    {
      break-if->
      num-trailers <- copy 1
      first <- copy 0xc0
      break $to-grapheme:compute-length
    }
    # 3 bytes: lead byte marker 0xe0, 2 trailers
    compare c, 0xffff
    {
      break-if->
      num-trailers <- copy 2
      first <- copy 0xe0
      break $to-grapheme:compute-length
    }
    # 4 bytes: lead byte marker 0xf0, 3 trailers
    compare c, 0x1fffff
    {
      break-if->
      num-trailers <- copy 3
      first <- copy 0xf0
      break $to-grapheme:compute-length
    }
    # more than 4 bytes: unsupported
    # Every supported range has already left this block above, so only values
    # above 0x1fffff get here. The guard must therefore skip the abort on
    # `<=`, not on `>`; with `>` the abort could never run.
    compare c, 0x1fffff
    {
      break-if-<=
      print-string-to-real-screen "unsupported code point "
      print-int32-hex-to-real-screen c
      print-string-to-real-screen "\n"
      var exit-status/ebx: int <- copy 1
      syscall_exit
    }
  }
  # emit trailer bytes, starting from the low 6 bits of the code point
  var result/edi: grapheme <- copy 0
  {
    compare num-trailers, 0
    break-if-<=
    # next trailer: low 6 bits of c, tagged with 0x80
    var tmp/esi: int <- copy c
    tmp <- and 0x3f
    tmp <- or 0x80
    # earlier trailers move up a byte; the new one goes in the low byte
    result <- shift-left 8
    result <- or tmp
    # update loop state
    c <- shift-right 6
    num-trailers <- decrement
    loop
  }
  # lead byte: remaining bits of c combined with the length marker,
  # placed in the lowest byte
  result <- shift-left 8
  result <- or c
  result <- or first
  return result
}
92
93
# Every code point from 0 through 0x7f must come back from to-grapheme
# unchanged.
fn test-to-grapheme-single-byte {
  var n/ecx: int <- copy 0
  {
    # stop once n has passed 0x7f
    compare n, 0x80
    break-if->=
    var cp/eax: code-point <- copy n
    var encoded/eax: grapheme <- to-grapheme cp
    var encoded-int/eax: int <- copy encoded
    check-ints-equal encoded-int, n, "F - test-to-grapheme-single-byte"
    n <- increment
    loop
  }
}
107
108
109
# 0x80 is the smallest code point that takes the 2-byte path in to-grapheme.
# Expected packing: lead byte 0xc2 in the low byte, trailer 0x80 above it.
fn test-to-grapheme-two-bytes-min {
  var cp/eax: code-point <- copy 0x80
  var encoded/eax: grapheme <- to-grapheme cp
  var encoded-int/eax: int <- copy encoded
  check-ints-equal encoded-int, 0x80c2, "F - to-grapheme/2a"
}
116
117
# 0x7ff is the largest code point that takes the 2-byte path in to-grapheme.
# Expected packing: lead byte 0xdf in the low byte, trailer 0xbf above it.
fn test-to-grapheme-two-bytes-max {
  var cp/eax: code-point <- copy 0x7ff
  var encoded/eax: grapheme <- to-grapheme cp
  var encoded-int/eax: int <- copy encoded
  check-ints-equal encoded-int, 0xbfdf, "F - to-grapheme/2b"
}
124
125
# 0x800 is the smallest code point that takes the 3-byte path in to-grapheme.
# Expected packing, low byte first: 0xe0, 0xa0, 0x80.
fn test-to-grapheme-three-bytes-min {
  var cp/eax: code-point <- copy 0x800
  var encoded/eax: grapheme <- to-grapheme cp
  var encoded-int/eax: int <- copy encoded
  check-ints-equal encoded-int, 0x80a0e0, "F - to-grapheme/3a"
}
132
133
# 0xffff is the largest code point that takes the 3-byte path in to-grapheme.
# Expected packing, low byte first: 0xef, 0xbf, 0xbf.
fn test-to-grapheme-three-bytes-max {
  var cp/eax: code-point <- copy 0xffff
  var encoded/eax: grapheme <- to-grapheme cp
  var encoded-int/eax: int <- copy encoded
  check-ints-equal encoded-int, 0xbfbfef, "F - to-grapheme/3b"
}
140
141
# 0x10000 is the smallest code point that takes the 4-byte path in to-grapheme.
# Expected packing, low byte first: 0xf0, 0x90, 0x80, 0x80.
fn test-to-grapheme-four-bytes-min {
  var cp/eax: code-point <- copy 0x10000
  var encoded/eax: grapheme <- to-grapheme cp
  var encoded-int/eax: int <- copy encoded
  check-ints-equal encoded-int, 0x808090f0, "F - to-grapheme/4a"
}
148
149
# 0x1fffff is the largest code point that takes the 4-byte path in to-grapheme.
# Expected packing, low byte first: 0xf7, 0xbf, 0xbf, 0xbf.
fn test-to-grapheme-four-bytes-max {
  var cp/eax: code-point <- copy 0x1fffff
  var encoded/eax: grapheme <- to-grapheme cp
  var encoded-int/eax: int <- copy encoded
  check-ints-equal encoded-int, 0xbfbfbff7, "F - to-grapheme/4b"
}
156
157
# Read one grapheme from the byte stream `in`.
#
# Returns 0xffffffff when the stream is empty. Otherwise the first byte read
# decides how many further bytes are consumed (0 to 3), and all bytes are
# packed into the result with the first byte lowest.
# First bytes from 0xf8 through 0xfd print an error to the real screen and
# exit with status 1.
fn read-grapheme in: (addr stream byte) -> _/eax: grapheme {
  # nothing left to read
  {
    var empty?/eax: boolean <- stream-empty? in
    compare empty?, 0
    break-if-=
    return 0xffffffff
  }
  var lead/eax: byte <- read-byte in
  var trailers-left/ecx: int <- copy 0
  $read-grapheme:compute-length: {
    # below 0xc0: return the byte by itself
    compare lead, 0xc0
    {
      break-if->=
      var single/eax: grapheme <- copy lead
      return single
    }
    # 0xfe and above: also returned by itself
    compare lead, 0xfe
    {
      break-if-<
      var single/eax: grapheme <- copy lead
      return single
    }
    # 0xc0 through 0xdf: 1 more byte
    compare lead, 0xe0
    {
      break-if->=
      trailers-left <- copy 1
      break $read-grapheme:compute-length
    }
    # 0xe0 through 0xef: 2 more bytes
    compare lead, 0xf0
    {
      break-if->=
      trailers-left <- copy 2
      break $read-grapheme:compute-length
    }
    # 0xf0 through 0xf7: 3 more bytes
    compare lead, 0xf8
    {
      break-if->=
      trailers-left <- copy 3
      break $read-grapheme:compute-length
    }
    # anything else would need more than 4 bytes
    $read-grapheme:abort: {
      print-string-to-real-screen "utf-8 encodings larger than 4 bytes are not supported. First byte seen: "
      var lead-int/eax: int <- copy lead
      print-int32-hex-to-real-screen lead-int
      print-string-to-real-screen "\n"
      var exit-status/ebx: int <- copy 1
      syscall_exit
    }
  }
  # pack the remaining bytes above the first one
  var packed/edi: grapheme <- copy lead
  var byte-index/edx: int <- copy 1
  {
    compare trailers-left, 0
    break-if-<=
    var next/eax: byte <- read-byte in
    var next-int/eax: int <- copy next
    # move the new byte up to its position within the grapheme
    next-int <- shift-left-bytes next-int, byte-index
    packed <- or next-int
    # update loop state
    byte-index <- increment
    trailers-left <- decrement
    loop
  }
  return packed
}
230
# Write a string mixing 1-, 2- and 3-byte characters into a stream, then
# read it back one grapheme at a time and check each packed value.
fn test-read-grapheme {
  var buf: (stream byte 0x30)
  var buf-addr/ecx: (addr stream byte) <- address buf
  write buf-addr, "aΒc世d界e"
  # 'a'
  var g/eax: grapheme <- read-grapheme buf-addr
  var g-int/eax: int <- copy g
  check-ints-equal g-int, 0x61, "F - test grapheme/0"
  # 2-byte character
  var g/eax: grapheme <- read-grapheme buf-addr
  var g-int/eax: int <- copy g
  check-ints-equal g-int, 0x92ce, "F - test grapheme/1"
  # 'c'
  var g/eax: grapheme <- read-grapheme buf-addr
  var g-int/eax: int <- copy g
  check-ints-equal g-int, 0x63, "F - test grapheme/2"
  # 3-byte character
  var g/eax: grapheme <- read-grapheme buf-addr
  var g-int/eax: int <- copy g
  check-ints-equal g-int, 0x96b8e4, "F - test grapheme/3"
  # 'd'
  var g/eax: grapheme <- read-grapheme buf-addr
  var g-int/eax: int <- copy g
  check-ints-equal g-int, 0x64, "F - test grapheme/4"
  # 3-byte character
  var g/eax: grapheme <- read-grapheme buf-addr
  var g-int/eax: int <- copy g
  check-ints-equal g-int, 0x8c95e7, "F - test grapheme/5"
  # 'e'
  var g/eax: grapheme <- read-grapheme buf-addr
  var g-int/eax: int <- copy g
  check-ints-equal g-int, 0x65, "F - test grapheme/6"
}
257
# Read one grapheme from the buffered file `in`.
#
# The first byte read decides how many further bytes are consumed (0 to 3);
# all bytes are packed into the result with the first byte lowest.
# First bytes from 0xf8 through 0xfd print an error to the real screen and
# exit with status 1.
# NOTE(review): there is no explicit end-of-file check here; presumably
# read-byte-buffered signals EOF through its return value -- confirm.
fn read-grapheme-buffered in: (addr buffered-file) -> _/eax: grapheme {
  var lead/eax: byte <- read-byte-buffered in
  var trailers-left/ecx: int <- copy 0
  $read-grapheme-buffered:compute-length: {
    # below 0xc0: return the byte by itself
    compare lead, 0xc0
    {
      break-if->=
      var single/eax: grapheme <- copy lead
      return single
    }
    # 0xfe and above: also returned by itself
    compare lead, 0xfe
    {
      break-if-<
      var single/eax: grapheme <- copy lead
      return single
    }
    # 0xc0 through 0xdf: 1 more byte
    compare lead, 0xe0
    {
      break-if->=
      trailers-left <- copy 1
      break $read-grapheme-buffered:compute-length
    }
    # 0xe0 through 0xef: 2 more bytes
    compare lead, 0xf0
    {
      break-if->=
      trailers-left <- copy 2
      break $read-grapheme-buffered:compute-length
    }
    # 0xf0 through 0xf7: 3 more bytes
    compare lead, 0xf8
    {
      break-if->=
      trailers-left <- copy 3
      break $read-grapheme-buffered:compute-length
    }
    # anything else would need more than 4 bytes
    $read-grapheme-buffered:abort: {
      print-string-to-real-screen "utf-8 encodings larger than 4 bytes are not supported. First byte seen: "
      var lead-int/eax: int <- copy lead
      print-int32-hex-to-real-screen lead-int
      print-string-to-real-screen "\n"
      var exit-status/ebx: int <- copy 1
      syscall_exit
    }
  }
  # pack the remaining bytes above the first one
  var packed/edi: grapheme <- copy lead
  var byte-index/edx: int <- copy 1
  {
    compare trailers-left, 0
    break-if-<=
    var next/eax: byte <- read-byte-buffered in
    var next-int/eax: int <- copy next
    # move the new byte up to its position within the grapheme
    next-int <- shift-left-bytes next-int, byte-index
    packed <- or next-int
    # update loop state
    byte-index <- increment
    trailers-left <- decrement
    loop
  }
  return packed
}
323
324
# Shift `n` left by `k` whole bytes (8 bits at a time) and return the result.
# At most 4 byte-shifts are performed, however large `k` is.
fn shift-left-bytes n: int, k: int -> _/eax: int {
  var shifts-done/ecx: int <- copy 0
  var acc/eax: int <- copy n
  {
    # never shift more than 4 bytes
    compare shifts-done, 4
    break-if->=
    # stop after k shifts
    compare shifts-done, k
    break-if->=
    acc <- shift-left 8
    shifts-done <- increment
    loop
  }
  return acc
}
339
# Shifting by 0 bytes leaves the value unchanged.
fn test-shift-left-bytes-0 {
  var shifted/eax: int <- shift-left-bytes 1, 0
  check-ints-equal shifted, 1, "F - shift-left-bytes 0"
}
344
# Shifting 1 by one byte gives 0x100.
fn test-shift-left-bytes-1 {
  var shifted/eax: int <- shift-left-bytes 1, 1
  check-ints-equal shifted, 0x100, "F - shift-left-bytes 1"
}
349
# Shifting 1 by two bytes gives 0x10000.
fn test-shift-left-bytes-2 {
  var shifted/eax: int <- shift-left-bytes 1, 2
  check-ints-equal shifted, 0x10000, "F - shift-left-bytes 2"
}
354
# Shifting 1 by three bytes gives 0x1000000.
fn test-shift-left-bytes-3 {
  var shifted/eax: int <- shift-left-bytes 1, 3
  check-ints-equal shifted, 0x1000000, "F - shift-left-bytes 3"
}
359
# Shifting by four bytes is expected to leave nothing behind: the result is 0.
fn test-shift-left-bytes-4 {
  var shifted/eax: int <- shift-left-bytes 1, 4
  check-ints-equal shifted, 0, "F - shift-left-bytes 4"
}
364
# Asking for more than four byte-shifts gives the same result as four: 0.
fn test-shift-left-bytes-5 {
  var shifted/eax: int <- shift-left-bytes 1, 5
  check-ints-equal shifted, 0, "F - shift-left-bytes >4"
}
369
370
371
372
373
374
375
376
377
378
# Append the bytes of grapheme `g` to the stream `out`, lowest byte first.
# The lowest byte is always written. Each higher byte is written only while
# the bits still remaining in `g` are non-zero; at most 4 bytes are written.
fn write-grapheme out: (addr stream byte), g: grapheme {
  var rest/eax: int <- copy g
  # the first byte is written unconditionally
  append-byte out, rest
  # up to 3 more bytes, stopping as soon as nothing is left
  var more-allowed/ecx: int <- copy 3
  {
    compare more-allowed, 0
    break-if-<=
    rest <- shift-right 8
    compare rest, 0
    break-if-=
    append-byte out, rest
    more-allowed <- decrement
    loop
  }
}