https://github.com/akkartik/mu/blob/main/linux/403unicode.mu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
# Encode code point 'in' as UTF-8 and return the bytes packed into a single
# code-point-utf8, with the lead byte of the encoding in the lowest-order byte
# and each following byte one position higher.
#
# Code points up to 0x7f are returned unchanged.
# Code points above 0x1fffff don't fit in 4 bytes: in that case an error is
# printed to the real screen, ebx is set to 1 and syscall_exit is called.
fn to-utf8 in: code-point -> _/eax: code-point-utf8 {
  var c/eax: int <- copy in
  var num-trailers/ecx: int <- copy 0
  var first/edx: int <- copy 0
  $to-utf8:compute-length: {
    # single byte: just return it
    compare c, 0x7f
    {
      break-if->
      var g/eax: code-point-utf8 <- copy c
      return g
    }
    # 2 bytes
    compare c, 0x7ff
    {
      break-if->
      num-trailers <- copy 1
      first <- copy 0xc0
      break $to-utf8:compute-length
    }
    # 3 bytes
    compare c, 0xffff
    {
      break-if->
      num-trailers <- copy 2
      first <- copy 0xe0
      break $to-utf8:compute-length
    }
    # 4 bytes
    compare c, 0x1fffff
    {
      break-if->
      num-trailers <- copy 3
      first <- copy 0xf0
      break $to-utf8:compute-length
    }
    # more than 4 bytes: unsupported
    # Only values above 0x1fffff get this far, so the guard must skip the abort
    # for '<=' (never for '>'); otherwise the abort can never run and the code
    # point would be returned unencoded.
    compare c, 0x1fffff
    {
      break-if-<=
      print-string-to-real-screen "unsupported code point "
      print-int32-hex-to-real-screen c
      print-string-to-real-screen "\n"
      var exit-status/ebx: int <- copy 1
      syscall_exit
    }
  }

  # emit trailer bytes: low 6 bits of what's left of 'c', tagged with 0x80.
  # The lowest 6 bits are emitted first and pushed up by each later shift, so
  # they end up in the highest-order byte of the result.
  var result/edi: code-point-utf8 <- copy 0
  {
    compare num-trailers, 0
    break-if-<=
    var tmp/esi: int <- copy c
    tmp <- and 0x3f
    tmp <- or 0x80
    result <- shift-left 8
    result <- or tmp
    # update loop state
    c <- shift-right 6
    num-trailers <- decrement
    loop
  }

  # emit the lead byte: remaining bits of 'c' combined with the length marker
  result <- shift-left 8
  result <- or c
  result <- or first

  return result
}
89
90
# Every code point from 0 through 0x7f must come back from to-utf8 unchanged.
fn test-to-utf8-single-byte {
  var expected/ecx: int <- copy 0
  {
    # done once we've gone past 0x7f
    compare expected, 0x7f
    break-if->
    var cp/eax: code-point <- copy expected
    var encoded/eax: code-point-utf8 <- to-utf8 cp
    var actual/eax: int <- copy encoded
    check-ints-equal actual, expected, "F - test-to-utf8-single-byte"
    expected <- increment
    loop
  }
}
104
105
106
# 0x80 is the first code point past the single-byte range.
# Expected packing: lead byte 0xc2 in the low byte, trailer 0x80 above it.
fn test-to-utf8-two-bytes-min {
  var cp/eax: code-point <- copy 0x80
  var encoded/eax: code-point-utf8 <- to-utf8 cp
  var actual/eax: int <- copy encoded
  check-ints-equal actual, 0x80c2, "F - to-utf8/2a"
}
113
114
# 0x7ff is the last code point that to-utf8 encodes with a single trailer.
# Expected packing: lead byte 0xdf in the low byte, trailer 0xbf above it.
fn test-to-utf8-two-bytes-max {
  var cp/eax: code-point <- copy 0x7ff
  var encoded/eax: code-point-utf8 <- to-utf8 cp
  var actual/eax: int <- copy encoded
  check-ints-equal actual, 0xbfdf, "F - to-utf8/2b"
}
121
122
# 0x800 is the first code point that to-utf8 encodes with two trailers.
# Expected packing, low byte first: 0xe0, 0xa0, 0x80.
fn test-to-utf8-three-bytes-min {
  var cp/eax: code-point <- copy 0x800
  var encoded/eax: code-point-utf8 <- to-utf8 cp
  var actual/eax: int <- copy encoded
  check-ints-equal actual, 0x80a0e0, "F - to-utf8/3a"
}
129
130
# 0xffff is the last code point that to-utf8 encodes with two trailers.
# Expected packing, low byte first: 0xef, 0xbf, 0xbf.
fn test-to-utf8-three-bytes-max {
  var cp/eax: code-point <- copy 0xffff
  var encoded/eax: code-point-utf8 <- to-utf8 cp
  var actual/eax: int <- copy encoded
  check-ints-equal actual, 0xbfbfef, "F - to-utf8/3b"
}
137
138
# 0x10000 is the first code point that to-utf8 encodes with three trailers.
# Expected packing, low byte first: 0xf0, 0x90, 0x80, 0x80.
fn test-to-utf8-four-bytes-min {
  var cp/eax: code-point <- copy 0x10000
  var encoded/eax: code-point-utf8 <- to-utf8 cp
  var actual/eax: int <- copy encoded
  check-ints-equal actual, 0x808090f0, "F - to-utf8/4a"
}
145
146
# 0x1fffff is the largest value to-utf8 accepts for a 4-byte encoding.
# Expected packing, low byte first: 0xf7, 0xbf, 0xbf, 0xbf.
fn test-to-utf8-four-bytes-max {
  var cp/eax: code-point <- copy 0x1fffff
  var encoded/eax: code-point-utf8 <- to-utf8 cp
  var actual/eax: int <- copy encoded
  check-ints-equal actual, 0xbfbfbff7, "F - to-utf8/4b"
}
153
154
# Read one UTF-8 encoded character from stream 'in' and return its bytes
# packed into a code-point-utf8: first byte read in the lowest-order byte,
# each later byte one position higher.
#
# Returns 0xffffffff when 'in' is empty.
# A first byte below 0xc0, or 0xfe and above, is returned by itself.
# A first byte in 0xf8..0xfd prints an error to the real screen, sets ebx to 1
# and calls syscall_exit.
fn read-code-point-utf8 in: (addr stream byte) -> _/eax: code-point-utf8 {
  # nothing left to read
  {
    var done?/eax: boolean <- stream-empty? in
    compare done?, 0/false
    break-if-=
    return 0xffffffff
  }
  var lead/eax: byte <- read-byte in
  var remaining/ecx: int <- copy 0
  $read-code-point-utf8:compute-length: {
    # below 0xc0: hand the byte back on its own
    compare lead, 0xc0
    {
      break-if->=
      var single/eax: code-point-utf8 <- copy lead
      return single
    }
    # 0xfe and above: also handed back on its own
    compare lead, 0xfe
    {
      break-if-<
      var single/eax: code-point-utf8 <- copy lead
      return single
    }
    # 0xc0..0xdf: one more byte follows
    compare lead, 0xe0
    {
      break-if->=
      remaining <- copy 1
      break $read-code-point-utf8:compute-length
    }
    # 0xe0..0xef: two more bytes follow
    compare lead, 0xf0
    {
      break-if->=
      remaining <- copy 2
      break $read-code-point-utf8:compute-length
    }
    # 0xf0..0xf7: three more bytes follow
    compare lead, 0xf8
    {
      break-if->=
      remaining <- copy 3
      break $read-code-point-utf8:compute-length
    }
    # 0xf8..0xfd: give up
    $read-code-point-utf8:abort: {
      print-string-to-real-screen "utf-8 encodings larger than 4 bytes are not yet supported. First byte seen: "
      var lead-int/eax: int <- copy lead
      print-int32-hex-to-real-screen lead-int
      print-string-to-real-screen "\n"
      var exit-status/ebx: int <- copy 1
      syscall_exit
    }
  }

  # gather the remaining bytes, each one byte-position higher than the last
  var packed/edi: code-point-utf8 <- copy lead
  var position/edx: int <- copy 1
  {
    compare remaining, 0
    break-if-<=
    var next/eax: byte <- read-byte in
    var shifted/eax: int <- copy next
    shifted <- shift-left-bytes shifted, position
    packed <- or shifted
    # advance to the next byte position
    position <- increment
    remaining <- decrement
    loop
  }
  return packed
}
227
# Write a string mixing 1-, 2- and 3-byte characters into a stream, then read
# it back one character at a time and check the packed bytes of each.
fn test-read-code-point-utf8 {
  var buf: (stream byte 0x30)
  var in/ecx: (addr stream byte) <- address buf
  write in, "aΒc世d界e"
  # 'a'
  var g/eax: code-point-utf8 <- read-code-point-utf8 in
  var actual/eax: int <- copy g
  check-ints-equal actual, 0x61, "F - test code-point-utf8/0"
  # 2-byte character
  var g/eax: code-point-utf8 <- read-code-point-utf8 in
  var actual/eax: int <- copy g
  check-ints-equal actual, 0x92ce/greek-capital-letter-beta, "F - test code-point-utf8/1"
  # 'c'
  var g/eax: code-point-utf8 <- read-code-point-utf8 in
  var actual/eax: int <- copy g
  check-ints-equal actual, 0x63, "F - test code-point-utf8/2"
  # 3-byte character
  var g/eax: code-point-utf8 <- read-code-point-utf8 in
  var actual/eax: int <- copy g
  check-ints-equal actual, 0x96b8e4, "F - test code-point-utf8/3"
  # 'd'
  var g/eax: code-point-utf8 <- read-code-point-utf8 in
  var actual/eax: int <- copy g
  check-ints-equal actual, 0x64, "F - test code-point-utf8/4"
  # 3-byte character
  var g/eax: code-point-utf8 <- read-code-point-utf8 in
  var actual/eax: int <- copy g
  check-ints-equal actual, 0x8c95e7, "F - test code-point-utf8/5"
  # 'e'
  var g/eax: code-point-utf8 <- read-code-point-utf8 in
  var actual/eax: int <- copy g
  check-ints-equal actual, 0x65, "F - test code-point-utf8/6"
}
254
# Read one UTF-8 encoded character from buffered file 'in' and return its
# bytes packed into a code-point-utf8: first byte read in the lowest-order
# byte, each later byte one position higher.
#
# A first byte that compares below 0xc0, or 0xfe and above, is returned by
# itself.
# A first byte in 0xf8..0xfd prints an error to the real screen, sets ebx to 1
# and calls syscall_exit.
fn read-code-point-utf8-buffered in: (addr buffered-file) -> _/eax: code-point-utf8 {
  var lead/eax: byte <- read-byte-buffered in
  var remaining/ecx: int <- copy 0
  $read-code-point-utf8-buffered:compute-length: {
    # below 0xc0: hand the byte back on its own
    compare lead, 0xc0
    {
      break-if->=
      var single/eax: code-point-utf8 <- copy lead
      return single
    }
    # 0xfe and above: also handed back on its own
    compare lead, 0xfe
    {
      break-if-<
      var single/eax: code-point-utf8 <- copy lead
      return single
    }
    # 0xc0..0xdf: one more byte follows
    compare lead, 0xe0
    {
      break-if->=
      remaining <- copy 1
      break $read-code-point-utf8-buffered:compute-length
    }
    # 0xe0..0xef: two more bytes follow
    compare lead, 0xf0
    {
      break-if->=
      remaining <- copy 2
      break $read-code-point-utf8-buffered:compute-length
    }
    # 0xf0..0xf7: three more bytes follow
    compare lead, 0xf8
    {
      break-if->=
      remaining <- copy 3
      break $read-code-point-utf8-buffered:compute-length
    }
    # 0xf8..0xfd: give up
    $read-code-point-utf8-buffered:abort: {
      print-string-to-real-screen "utf-8 encodings larger than 4 bytes are not supported. First byte seen: "
      var lead-int/eax: int <- copy lead
      print-int32-hex-to-real-screen lead-int
      print-string-to-real-screen "\n"
      var exit-status/ebx: int <- copy 1
      syscall_exit
    }
  }

  # gather the remaining bytes, each one byte-position higher than the last
  var packed/edi: code-point-utf8 <- copy lead
  var position/edx: int <- copy 1
  {
    compare remaining, 0
    break-if-<=
    var next/eax: byte <- read-byte-buffered in
    var shifted/eax: int <- copy next
    shifted <- shift-left-bytes shifted, position
    packed <- or shifted
    # advance to the next byte position
    position <- increment
    remaining <- decrement
    loop
  }
  return packed
}
320
321
# Return 'n' shifted left by 'k' bytes (8 bits per byte).
# No more than 4 shifts are ever performed, whatever the value of 'k'; a 'k'
# of zero or less performs none.
fn shift-left-bytes n: int, k: int -> _/eax: int {
  var shifts-done/ecx: int <- copy 0
  var acc/eax: int <- copy n
  {
    # hard cap: never shift more than 4 times
    compare shifts-done, 4
    break-if->=
    # requested number of shifts reached
    compare shifts-done, k
    break-if->=
    acc <- shift-left 8
    shifts-done <- increment
    loop
  }
  return acc
}
336
# Shifting by zero bytes leaves the value alone.
fn test-shift-left-bytes-0 {
  var shifted/eax: int <- shift-left-bytes 1, 0
  check-ints-equal shifted, 1, "F - shift-left-bytes 0"
}
341
# One byte: 1 becomes 0x100.
fn test-shift-left-bytes-1 {
  var shifted/eax: int <- shift-left-bytes 1, 1
  check-ints-equal shifted, 0x100, "F - shift-left-bytes 1"
}
346
# Two bytes: 1 becomes 0x10000.
fn test-shift-left-bytes-2 {
  var shifted/eax: int <- shift-left-bytes 1, 2
  check-ints-equal shifted, 0x10000, "F - shift-left-bytes 2"
}
351
# Three bytes: 1 becomes 0x1000000.
fn test-shift-left-bytes-3 {
  var shifted/eax: int <- shift-left-bytes 1, 3
  check-ints-equal shifted, 0x1000000, "F - shift-left-bytes 3"
}
356
# Four bytes pushes the single set bit out of the result entirely.
fn test-shift-left-bytes-4 {
  var shifted/eax: int <- shift-left-bytes 1, 4
  check-ints-equal shifted, 0, "F - shift-left-bytes 4"
}
361
# Asking for more than four bytes gives the same result as four.
fn test-shift-left-bytes-5 {
  var shifted/eax: int <- shift-left-bytes 1, 5
  check-ints-equal shifted, 0, "F - shift-left-bytes >4"
}
366
367
368
# Append the bytes packed in 'g' to stream 'out', lowest-order byte first.
# The lowest byte is always written; each higher byte is written only while
# something non-zero remains above the bytes already written (at most 4 bytes
# in total).
fn write-code-point-utf8 out: (addr stream byte), g: code-point-utf8 {
  $write-code-point-utf8:body: {
    var rest/eax: int <- copy g
    # byte 1: unconditional
    append-byte out, rest
    rest <- shift-right 8
    compare rest, 0
    break-if-= $write-code-point-utf8:body
    # byte 2
    append-byte out, rest
    rest <- shift-right 8
    compare rest, 0
    break-if-= $write-code-point-utf8:body
    # byte 3
    append-byte out, rest
    rest <- shift-right 8
    compare rest, 0
    break-if-= $write-code-point-utf8:body
    # byte 4
    append-byte out, rest
  }
}