https://github.com/akkartik/mu/blob/main/403unicode.mu
1
2
3
4
5
6
7
8
9
# Round-trip check: each sampled code point is encoded with to-utf8 and decoded
# again with to-code-point. Every mismatch is printed to the screen as
# code-point/utf8/decoded (all in hex), preceded once by the test name.
fn test-unicode-serialization-and-deserialization {
  var curr/ebx: int <- copy 0
  var header-pending?/esi: boolean <- copy 1/true
  {
    # only code points below 0x10000 are sampled
    compare curr, 0x10000
    break-if->=
    var before/eax: code-point <- copy curr
    var _encoded/eax: code-point-utf8 <- to-utf8 before
    var encoded/ecx: code-point-utf8 <- copy _encoded  # move out of eax before it is reused
    var after/eax: code-point <- to-code-point encoded
    compare curr, after
    {
      break-if-=
      # print the test name ahead of the first mismatch only
      {
        compare header-pending?, 0/false
        break-if-=
        draw-text-wrapping-right-then-down-from-cursor-over-full-screen 0/screen, "F - test-unicode-serialization-and-deserialization: ", 3/fg 0/bg
      }
      header-pending? <- copy 0/false
      # original code point
      draw-int32-hex-wrapping-right-then-down-from-cursor-over-full-screen 0/screen, curr, 3/fg 0/bg
      draw-text-wrapping-right-then-down-from-cursor-over-full-screen 0/screen, "/", 3/fg 0/bg
      # its utf-8 encoding
      {
        var encoded-int/eax: int <- copy encoded
        draw-int32-hex-wrapping-right-then-down-from-cursor-over-full-screen 0/screen, encoded-int, 3/fg 0/bg
      }
      draw-text-wrapping-right-then-down-from-cursor-over-full-screen 0/screen, "/", 3/fg 0/bg
      # what the encoding decoded back to
      {
        var after-int/eax: int <- copy after
        draw-int32-hex-wrapping-right-then-down-from-cursor-over-full-screen 0/screen, after-int, 3/fg 0/bg
      }
      draw-text-wrapping-right-then-down-from-cursor-over-full-screen 0/screen, " ", 3/fg 0/bg
    }
    # sample every 15th code point rather than all of them
    curr <- add 0xf
    loop
  }
}
47
48
# Decode a packed utf-8 sequence into the code point it encodes.
#   in: utf-8 bytes packed into one word, first byte in the lowest-order byte
#   returns: the decoded code point in eax
# The number of bytes in the sequence comes from utf8-length.
fn to-code-point in: code-point-utf8 -> _/eax: code-point {
  var g/ebx: int <- copy in
  # if single byte, just return it
  # Test that nothing lies above the low byte instead of comparing against
  # 0xff: a 4-byte sequence has its top bit set, so a signed comparison would
  # treat it as a small (negative) value and return it undecoded.
  {
    var upper/eax: int <- copy g
    upper <- shift-right 8
    compare upper, 0
    break-if-!=
    var result/eax: code-point <- copy g
    return result
  }
  #
  var len/edx: int <- utf8-length in
  # extract payload bits from the first byte; the mask depends on the length
  var b/eax: byte <- copy-byte g
  var result/edi: code-point <- copy b
  {
    compare len, 2
    break-if-!=
    result <- and 0x1f
  }
  {
    compare len, 3
    break-if-!=
    result <- and 0x0f
  }
  {
    compare len, 4
    break-if-!=
    result <- and 0x07
  }
  # each remaining byte contributes its low 6 bits
  g <- shift-right 8
  var i/ecx: int <- copy 1
  {
    compare i, len
    break-if->=
    var b/eax: byte <- copy-byte g
    b <- and 0x3f
    result <- shift-left 6
    result <- or b
    g <- shift-right 8
    i <- increment
    loop
  }
  return result
}
94
95
96
# Encode a code point as utf-8.
#   in: the code point to encode
#   returns: the utf-8 bytes packed into one word in eax, first byte in the
#            lowest-order byte
# Aborts with "unsupported code point" if 'in' is larger than 0x1fffff, the
# largest value a 4-byte encoding can hold.
# NOTE(review): the range checks are signed comparisons, so a value with its
# top bit set is returned unchanged by the single-byte case — confirm whether
# callers can ever pass one.
fn to-utf8 in: code-point -> _/eax: code-point-utf8 {
  var c/eax: int <- copy in
  var num-trailers/ecx: int <- copy 0
  var first/edx: int <- copy 0
  $to-utf8:compute-length: {
    # single byte: just return it
    compare c, 0x7f
    {
      break-if->
      var g/eax: code-point-utf8 <- copy c
      return g
    }
    # 2 bytes
    compare c, 0x7ff
    {
      break-if->
      num-trailers <- copy 1
      first <- copy 0xc0
      break $to-utf8:compute-length
    }
    # 3 bytes
    compare c, 0xffff
    {
      break-if->
      num-trailers <- copy 2
      first <- copy 0xe0
      break $to-utf8:compute-length
    }
    # 4 bytes
    compare c, 0x1fffff
    {
      break-if->
      num-trailers <- copy 3
      first <- copy 0xf0
      break $to-utf8:compute-length
    }
    # more than 4 bytes: unsupported
    # Every supported range has already left this block, so abort
    # unconditionally. (A guard here that repeats the 4-byte comparison can
    # never succeed and would let oversized values fall through.)
    abort "unsupported code point"
    return 0
  }
  # emit trailer bytes: 6 bits from 'c' each, prefixed with bits '10'
  var result/edi: code-point-utf8 <- copy 0
  {
    compare num-trailers, 0
    break-if-<=
    var tmp/esi: int <- copy c
    tmp <- and 0x3f
    tmp <- or 0x80
    result <- shift-left 8
    result <- or tmp
    # update loop state
    c <- shift-right 6
    num-trailers <- decrement
    loop
  }
  # emit the first byte: the leftover bits of 'c' plus the length prefix
  result <- shift-left 8
  result <- or c
  result <- or first
  #
  return result
}
163
164
# Every code point from 0 through 0x7f must encode to itself.
fn test-to-utf8-single-byte {
  var expected/ecx: int <- copy 0
  {
    compare expected, 0x7f
    break-if->
    var cp/eax: code-point <- copy expected
    var encoded/eax: code-point-utf8 <- to-utf8 cp
    var actual/eax: int <- copy encoded
    check-ints-equal actual, expected, "F - test-to-utf8-single-byte"
    expected <- increment
    loop
  }
}
178
179
180
# 0x80 is the smallest code point that takes the 2-byte path in to-utf8.
fn test-to-utf8-two-bytes-min {
  var cp/eax: code-point <- copy 0x80
  var encoded/eax: code-point-utf8 <- to-utf8 cp
  var actual/eax: int <- copy encoded
  # expected bytes c2 80, first byte in the lowest-order position
  check-ints-equal actual, 0x80c2, "F - to-utf8/2a"
}
187
188
# 0x7ff is the largest code point that takes the 2-byte path in to-utf8.
fn test-to-utf8-two-bytes-max {
  var cp/eax: code-point <- copy 0x7ff
  var encoded/eax: code-point-utf8 <- to-utf8 cp
  var actual/eax: int <- copy encoded
  # expected bytes df bf, first byte in the lowest-order position
  check-ints-equal actual, 0xbfdf, "F - to-utf8/2b"
}
195
196
# 0x800 is the smallest code point that takes the 3-byte path in to-utf8.
fn test-to-utf8-three-bytes-min {
  var cp/eax: code-point <- copy 0x800
  var encoded/eax: code-point-utf8 <- to-utf8 cp
  var actual/eax: int <- copy encoded
  # expected bytes e0 a0 80, first byte in the lowest-order position
  check-ints-equal actual, 0x80a0e0, "F - to-utf8/3a"
}
203
204
# 0xffff is the largest code point that takes the 3-byte path in to-utf8.
fn test-to-utf8-three-bytes-max {
  var cp/eax: code-point <- copy 0xffff
  var encoded/eax: code-point-utf8 <- to-utf8 cp
  var actual/eax: int <- copy encoded
  # expected bytes ef bf bf, first byte in the lowest-order position
  check-ints-equal actual, 0xbfbfef, "F - to-utf8/3b"
}
211
212
# 0x10000 is the smallest code point that takes the 4-byte path in to-utf8.
fn test-to-utf8-four-bytes-min {
  var cp/eax: code-point <- copy 0x10000
  var encoded/eax: code-point-utf8 <- to-utf8 cp
  var actual/eax: int <- copy encoded
  # expected bytes f0 90 80 80, first byte in the lowest-order position
  check-ints-equal actual, 0x808090f0, "F - to-utf8/4a"
}
219
220
# 0x1fffff is the largest code point that takes the 4-byte path in to-utf8.
fn test-to-utf8-four-bytes-max {
  var cp/eax: code-point <- copy 0x1fffff
  var encoded/eax: code-point-utf8 <- to-utf8 cp
  var actual/eax: int <- copy encoded
  # expected bytes f7 bf bf bf, first byte in the lowest-order position
  check-ints-equal actual, 0xbfbfbff7, "F - to-utf8/4b"
}
227
228
# Read one utf-8 encoded character from the stream 'in' and pack its bytes
# into a single word, first byte in the lowest-order byte.
# Returns 0xffffffff if 'in' is empty.
# Aborts on a first byte in the range 0xf8 through 0xfd.
fn read-code-point-utf8 in: (addr stream byte) -> _/eax: code-point-utf8 {
  # nothing left to read
  {
    var empty?/eax: boolean <- stream-empty? in
    compare empty?, 0/false
    break-if-=
    return 0xffffffff
  }
  var lead/eax: byte <- read-byte in
  var pending/ecx: int <- copy 0
  $read-code-point-utf8:count-trailers: {
    # first byte below 0xc0: return it as is
    compare lead, 0xc0
    {
      break-if->=
      var single/eax: code-point-utf8 <- copy lead
      return single
    }
    # first byte 0xfe or 0xff: also return it as is
    compare lead, 0xfe
    {
      break-if-<
      var single/eax: code-point-utf8 <- copy lead
      return single
    }
    # 0xc0 through 0xdf: one more byte follows
    compare lead, 0xe0
    {
      break-if->=
      pending <- copy 1
      break $read-code-point-utf8:count-trailers
    }
    # 0xe0 through 0xef: two more bytes follow
    compare lead, 0xf0
    {
      break-if->=
      pending <- copy 2
      break $read-code-point-utf8:count-trailers
    }
    # 0xf0 through 0xf7: three more bytes follow
    compare lead, 0xf8
    {
      break-if->=
      pending <- copy 3
      break $read-code-point-utf8:count-trailers
    }
    abort "utf-8 encodings larger than 4 bytes are not yet supported"
    return 0
  }
  # place each following byte one position higher than the previous one
  var packed/edi: code-point-utf8 <- copy lead
  var position/edx: int <- copy 1
  {
    compare pending, 0
    break-if-<=
    var next/eax: byte <- read-byte in
    var next-int/eax: int <- copy next
    next-int <- shift-left-bytes next-int, position
    packed <- or next-int
    # advance to the next byte position
    position <- increment
    pending <- decrement
    loop
  }
  return packed
}
294
# Reads a string mixing 1-, 2- and 3-byte characters back out of a stream, one
# character at a time, and checks the packed value of each.
fn test-read-code-point-utf8 {
  var buf: (stream byte 0x30)
  var in/ecx: (addr stream byte) <- address buf
  write in, "aΒc世d界e"
  # 'a'
  var g/eax: code-point-utf8 <- read-code-point-utf8 in
  var packed/eax: int <- copy g
  check-ints-equal packed, 0x61, "F - test code-point-utf8/0"
  # 2-byte character
  var g/eax: code-point-utf8 <- read-code-point-utf8 in
  var packed/eax: int <- copy g
  check-ints-equal packed, 0x92ce/greek-capital-letter-beta, "F - test code-point-utf8/1"
  # 'c'
  var g/eax: code-point-utf8 <- read-code-point-utf8 in
  var packed/eax: int <- copy g
  check-ints-equal packed, 0x63, "F - test code-point-utf8/2"
  # 3-byte character
  var g/eax: code-point-utf8 <- read-code-point-utf8 in
  var packed/eax: int <- copy g
  check-ints-equal packed, 0x96b8e4, "F - test code-point-utf8/3"
  # 'd'
  var g/eax: code-point-utf8 <- read-code-point-utf8 in
  var packed/eax: int <- copy g
  check-ints-equal packed, 0x64, "F - test code-point-utf8/4"
  # 3-byte character
  var g/eax: code-point-utf8 <- read-code-point-utf8 in
  var packed/eax: int <- copy g
  check-ints-equal packed, 0x8c95e7, "F - test code-point-utf8/5"
  # 'e'
  var g/eax: code-point-utf8 <- read-code-point-utf8 in
  var packed/eax: int <- copy g
  check-ints-equal packed, 0x65, "F - test code-point-utf8/6"
}
321
# Number of bytes occupied by a packed utf-8 sequence.
#   g: utf-8 bytes packed into one word, first byte in the lowest-order byte
#   returns: 1, 2, 3 or 4 in edx, according to the highest non-zero byte of 'g'
# Bytes are counted by shifting rather than by comparing 'g' against 0xff,
# 0xffff and 0xffffff: a 4-byte sequence has its top bit set, so a signed
# comparison would see it as negative and report a length of 1.
fn utf8-length g: code-point-utf8 -> _/edx: int {
  var rest/edx: int <- copy g
  # anything above the first byte?
  rest <- shift-right 8
  {
    compare rest, 0
    break-if-!=
    return 1
  }
  # anything above the second byte?
  rest <- shift-right 8
  {
    compare rest, 0
    break-if-!=
    return 2
  }
  # anything above the third byte?
  rest <- shift-right 8
  {
    compare rest, 0
    break-if-!=
    return 3
  }
  return 4
}
340
341
# Shift 'n' left by 'k' bytes (8 bits each), performing at most 4 shifts.
#   returns: the shifted value in eax; unchanged when 'k' is zero or negative
fn shift-left-bytes n: int, k: int -> _/eax: int {
  var acc/eax: int <- copy n
  var done/ecx: int <- copy 0
  {
    # never shift more than 4 times
    compare done, 4
    break-if->=
    # stop once 'k' shifts have been performed
    compare done, k
    break-if->=
    acc <- shift-left 8
    done <- increment
    loop
  }
  return acc
}
356
# Shifting by zero bytes leaves the value unchanged.
fn test-shift-left-bytes-0 {
  var shifted/eax: int <- shift-left-bytes 1, 0
  check-ints-equal shifted, 1, "F - shift-left-bytes 0"
}
361
# Shifting by one byte moves the value up 8 bits.
fn test-shift-left-bytes-1 {
  var shifted/eax: int <- shift-left-bytes 1, 1
  check-ints-equal shifted, 0x100, "F - shift-left-bytes 1"
}
366
# Shifting by two bytes moves the value up 16 bits.
fn test-shift-left-bytes-2 {
  var shifted/eax: int <- shift-left-bytes 1, 2
  check-ints-equal shifted, 0x10000, "F - shift-left-bytes 2"
}
371
# Shifting by three bytes moves the value up 24 bits.
fn test-shift-left-bytes-3 {
  var shifted/eax: int <- shift-left-bytes 1, 3
  check-ints-equal shifted, 0x1000000, "F - shift-left-bytes 3"
}
376
# Shifting by four bytes pushes every bit out of the word.
fn test-shift-left-bytes-4 {
  var shifted/eax: int <- shift-left-bytes 1, 4
  check-ints-equal shifted, 0, "F - shift-left-bytes 4"
}
381
# Shift counts above four give the same result as four.
fn test-shift-left-bytes-5 {
  var shifted/eax: int <- shift-left-bytes 1, 5
  check-ints-equal shifted, 0, "F - shift-left-bytes >4"
}
386
387
388
# Append the bytes of the packed utf-8 sequence 'g' to the stream 'out',
# lowest-order byte first.
# The lowest byte is always written; higher bytes are written until the
# remaining value is zero, so at most 4 bytes are appended.
fn write-code-point-utf8 out: (addr stream byte), g: code-point-utf8 {
  var remaining/eax: int <- copy g
  {
    append-byte out, remaining
    # drop the byte just written; a 32-bit word is empty after 4 such shifts
    remaining <- shift-right 8
    compare remaining, 0
    break-if-=
    loop
  }
}