https://github.com/akkartik/mu/blob/master/403unicode.mu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
# Encode code point 'in' as UTF-8 and pack the bytes into a single word, with
# the lead byte in the lowest position and each trailer one byte higher.
#   in:  the code point to encode
#   out: the packed bytes; code points up to 0x7f are returned unchanged
# Code points above 0x1fffff do not fit in 4 bytes. For those, print a message
# to the real screen, set exit-status to 1 and call syscall_exit.
fn to-grapheme in: code-point -> out/eax: grapheme {
  $to-grapheme:body: {
    var c/eax: int <- copy in
    var num-trailers/ecx: int <- copy 0
    var first/edx: int <- copy 0
    $to-grapheme:compute-length: {
      # single byte: just return it
      compare c, 0x7f
      {
        break-if->
        out <- copy c
        break $to-grapheme:body
      }
      # 2 bytes: lead byte marker 0xc0, one trailer
      compare c, 0x7ff
      {
        break-if->
        num-trailers <- copy 1
        first <- copy 0xc0
        break $to-grapheme:compute-length
      }
      # 3 bytes: lead byte marker 0xe0, two trailers
      compare c, 0xffff
      {
        break-if->
        num-trailers <- copy 2
        first <- copy 0xe0
        break $to-grapheme:compute-length
      }
      # 4 bytes: lead byte marker 0xf0, three trailers
      compare c, 0x1fffff
      {
        break-if->
        num-trailers <- copy 3
        first <- copy 0xf0
        break $to-grapheme:compute-length
      }
      # more than 4 bytes: unsupported
      # Every code point <= 0x1fffff has already left this block, so abort
      # unconditionally. (Guarding this with another 'break-if->' would skip
      # the abort for exactly the values it exists to reject.)
      $to-grapheme:abort: {
        print-string-to-real-screen "unsupported code point "
        print-int32-hex-to-real-screen c
        print-string-to-real-screen "\n"
        var exit-status/ebx: int <- copy 1
        syscall_exit
      }
    }
    # emit trailer bytes: low 6 bits of 'c' with the top two bits set to '10'
    var result/edi: int <- copy 0
    {
      compare num-trailers, 0
      break-if-<=
      var tmp/esi: int <- copy c
      tmp <- and 0x3f
      tmp <- or 0x80
      # earlier trailers move one byte higher to make room for this one
      result <- shift-left 8
      result <- or tmp
      # update loop state
      c <- shift-right 6
      num-trailers <- decrement
      loop
    }
    # emit the lead byte: whatever bits of 'c' remain, plus the length marker
    result <- shift-left 8
    result <- or c
    result <- or first
    #
    out <- copy result
  }
}
94
95
# Every code point from 0 through 0x7f must come back from to-grapheme
# unchanged.
fn test-to-grapheme-single-byte {
  var expected/ecx: int <- copy 0
  {
    # stop once we've gone past the last single-byte value
    compare expected, 0x7f
    break-if->
    var cp/eax: code-point <- copy expected
    var g/eax: grapheme <- to-grapheme cp
    var actual/eax: int <- copy g
    check-ints-equal actual, expected, "F - test-to-grapheme-single-byte"
    expected <- increment
    loop
  }
}
109
110
111
# 0x80 is the first code point above the single-byte limit of 0x7f.
fn test-to-grapheme-two-bytes-min {
  var cp/eax: code-point <- copy 0x80
  var g/eax: grapheme <- to-grapheme cp
  var actual/eax: int <- copy g
  # bytes c2 80, lead byte in the lowest position
  check-ints-equal actual, 0x80c2, "F - to-grapheme/2a"
}
118
119
# 0x7ff is the largest code point checked against the 2-byte limit.
fn test-to-grapheme-two-bytes-max {
  var cp/eax: code-point <- copy 0x7ff
  var g/eax: grapheme <- to-grapheme cp
  var actual/eax: int <- copy g
  # bytes df bf, lead byte in the lowest position
  check-ints-equal actual, 0xbfdf, "F - to-grapheme/2b"
}
126
127
# 0x800 is the first code point above the 2-byte limit of 0x7ff.
fn test-to-grapheme-three-bytes-min {
  var cp/eax: code-point <- copy 0x800
  var g/eax: grapheme <- to-grapheme cp
  var actual/eax: int <- copy g
  # bytes e0 a0 80, lead byte in the lowest position
  check-ints-equal actual, 0x80a0e0, "F - to-grapheme/3a"
}
134
135
# 0xffff is the largest code point checked against the 3-byte limit.
fn test-to-grapheme-three-bytes-max {
  var cp/eax: code-point <- copy 0xffff
  var g/eax: grapheme <- to-grapheme cp
  var actual/eax: int <- copy g
  # bytes ef bf bf, lead byte in the lowest position
  check-ints-equal actual, 0xbfbfef, "F - to-grapheme/3b"
}
142
143
# 0x10000 is the first code point above the 3-byte limit of 0xffff.
fn test-to-grapheme-four-bytes-min {
  var cp/eax: code-point <- copy 0x10000
  var g/eax: grapheme <- to-grapheme cp
  var actual/eax: int <- copy g
  # bytes f0 90 80 80, lead byte in the lowest position
  check-ints-equal actual, 0x808090f0, "F - to-grapheme/4a"
}
150
151
# 0x1fffff is the largest code point checked against the 4-byte limit.
fn test-to-grapheme-four-bytes-max {
  var cp/eax: code-point <- copy 0x1fffff
  var g/eax: grapheme <- to-grapheme cp
  var actual/eax: int <- copy g
  # bytes f7 bf bf bf, lead byte in the lowest position
  check-ints-equal actual, 0xbfbfbff7, "F - to-grapheme/4b"
}
158
159
# Read one grapheme's worth of bytes from stream 'in' and pack them into a
# single word: the first byte read lands in the lowest position, and each
# following byte goes one position higher.
#   - first byte below 0xc0, or 0xfe and above: returned by itself
#   - first byte in [0xc0, 0xe0): 1 more byte is read
#   - first byte in [0xe0, 0xf0): 2 more bytes are read
#   - first byte in [0xf0, 0xf8): 3 more bytes are read
#   - anything else: print a message to the real screen and call syscall_exit
#     with exit-status 1
fn read-grapheme in: (addr stream byte) -> out/eax: grapheme {
  $read-grapheme:body: {
    var lead/eax: byte <- read-byte in
    var trailers-left/ecx: int <- copy 0
    $read-grapheme:classify: {
      # 0xfe and above: hand the byte back untouched
      compare lead, 0xfe
      {
        break-if-<
        out <- copy lead
        break $read-grapheme:body
      }
      # below 0xc0: hand the byte back untouched
      compare lead, 0xc0
      {
        break-if->=
        out <- copy lead
        break $read-grapheme:body
      }
      # from here on, try each length in turn and leave as soon as one fits
      trailers-left <- copy 1
      compare lead, 0xe0
      break-if-< $read-grapheme:classify
      trailers-left <- copy 2
      compare lead, 0xf0
      break-if-< $read-grapheme:classify
      trailers-left <- copy 3
      compare lead, 0xf8
      break-if-< $read-grapheme:classify
      # nothing fit: give up
      {
        print-string-to-real-screen "utf-8 encodings larger than 4 bytes are not supported. First byte seen: "
        var n/eax: int <- copy lead
        print-int32-hex-to-real-screen n
        print-string-to-real-screen "\n"
        var exit-status/ebx: int <- copy 1
        syscall_exit
      }
    }
    # gather the remaining bytes above the lead byte
    var packed/edi: int <- copy lead
    var byte-index/edx: int <- copy 1
    {
      compare trailers-left, 0
      break-if-<=
      var next/eax: byte <- read-byte in
      var shifted/eax: int <- copy next
      # move the new byte up to its slot before merging it in
      shifted <- shift-left-bytes shifted, byte-index
      packed <- or shifted
      # update loop state
      trailers-left <- decrement
      byte-index <- increment
      loop
    }
    out <- copy packed
  }
}
228
# Write a string mixing 1-, 2- and 3-byte characters into a stream, then read
# it back one grapheme at a time and check each packed value.
fn test-read-grapheme {
  var buf: (stream byte 0x30)
  var in/ecx: (addr stream byte) <- address buf
  write in, "aΒc世d界e"
  # 1 byte
  var g/eax: grapheme <- read-grapheme in
  var actual/eax: int <- copy g
  check-ints-equal actual, 0x61, "F - test grapheme/0"
  # 2 bytes: ce 92
  var g/eax: grapheme <- read-grapheme in
  var actual/eax: int <- copy g
  check-ints-equal actual, 0x92ce, "F - test grapheme/1"
  # 1 byte
  var g/eax: grapheme <- read-grapheme in
  var actual/eax: int <- copy g
  check-ints-equal actual, 0x63, "F - test grapheme/2"
  # 3 bytes: e4 b8 96
  var g/eax: grapheme <- read-grapheme in
  var actual/eax: int <- copy g
  check-ints-equal actual, 0x96b8e4, "F - test grapheme/3"
  # 1 byte
  var g/eax: grapheme <- read-grapheme in
  var actual/eax: int <- copy g
  check-ints-equal actual, 0x64, "F - test grapheme/4"
  # 3 bytes: e7 95 8c
  var g/eax: grapheme <- read-grapheme in
  var actual/eax: int <- copy g
  check-ints-equal actual, 0x8c95e7, "F - test grapheme/5"
  # 1 byte
  var g/eax: grapheme <- read-grapheme in
  var actual/eax: int <- copy g
  check-ints-equal actual, 0x65, "F - test grapheme/6"
}
255
# Read one grapheme's worth of bytes from buffered file 'in' and pack them
# into a single word: the first byte read lands in the lowest position, and
# each following byte goes one position higher.
#   - first byte below 0xc0, or 0xfe and above: returned by itself
#   - first byte in [0xc0, 0xe0): 1 more byte is read
#   - first byte in [0xe0, 0xf0): 2 more bytes are read
#   - first byte in [0xf0, 0xf8): 3 more bytes are read
#   - anything else: print a message to the real screen and call syscall_exit
#     with exit-status 1
fn read-grapheme-buffered in: (addr buffered-file) -> out/eax: grapheme {
  $read-grapheme-buffered:body: {
    var first-byte/eax: byte <- read-byte-buffered in
    var pending/ecx: int <- copy 0
    $read-grapheme-buffered:classify: {
      # 0xfe and above: return the byte as is
      compare first-byte, 0xfe
      {
        break-if-<
        out <- copy first-byte
        break $read-grapheme-buffered:body
      }
      # below 0xc0: return the byte as is
      compare first-byte, 0xc0
      {
        break-if->=
        out <- copy first-byte
        break $read-grapheme-buffered:body
      }
      # try each length in turn; leave as soon as the first byte fits
      pending <- copy 1
      compare first-byte, 0xe0
      break-if-< $read-grapheme-buffered:classify
      pending <- copy 2
      compare first-byte, 0xf0
      break-if-< $read-grapheme-buffered:classify
      pending <- copy 3
      compare first-byte, 0xf8
      break-if-< $read-grapheme-buffered:classify
      # nothing fit: give up
      {
        print-string-to-real-screen "utf-8 encodings larger than 4 bytes are not supported. First byte seen: "
        var n/eax: int <- copy first-byte
        print-int32-hex-to-real-screen n
        print-string-to-real-screen "\n"
        var exit-status/ebx: int <- copy 1
        syscall_exit
      }
    }
    # gather the remaining bytes above the first one
    var word/edi: int <- copy first-byte
    var slot/edx: int <- copy 1
    {
      compare pending, 0
      break-if-<=
      var next/eax: byte <- read-byte-buffered in
      var shifted/eax: int <- copy next
      # move the new byte up to its slot before merging it in
      shifted <- shift-left-bytes shifted, slot
      word <- or shifted
      # update loop state
      pending <- decrement
      slot <- increment
      loop
    }
    out <- copy word
  }
}
324
325
# Shift 'n' left by 'k' whole bytes (8 bits each).
# At most 4 shifts are performed, so any k >= 4 yields 0.
# A k of 0 or less returns n unchanged.
fn shift-left-bytes n: int, k: int -> result/eax: int {
  result <- copy n
  var shifts-done/ecx: int <- copy 0
  $shift-left-bytes:loop: {
    # never shift more than a full word's worth of bytes
    compare shifts-done, 4
    break-if->=
    # done once we've shifted the requested number of bytes
    compare shifts-done, k
    break-if->=
    result <- shift-left 8
    shifts-done <- increment
    loop
  }
}
339
# Shifting by zero bytes leaves the value alone.
fn test-shift-left-bytes-0 {
  var shifted/eax: int <- shift-left-bytes 1, 0
  check-ints-equal shifted, 1, "F - shift-left-bytes 0"
}
344
# One byte to the left: 1 becomes 0x100.
fn test-shift-left-bytes-1 {
  var shifted/eax: int <- shift-left-bytes 1, 1
  check-ints-equal shifted, 0x100, "F - shift-left-bytes 1"
}
349
# Two bytes to the left: 1 becomes 0x10000.
fn test-shift-left-bytes-2 {
  var shifted/eax: int <- shift-left-bytes 1, 2
  check-ints-equal shifted, 0x10000, "F - shift-left-bytes 2"
}
354
# Three bytes to the left: 1 becomes 0x1000000.
fn test-shift-left-bytes-3 {
  var shifted/eax: int <- shift-left-bytes 1, 3
  check-ints-equal shifted, 0x1000000, "F - shift-left-bytes 3"
}
359
# Four bytes to the left pushes every bit out of the word.
fn test-shift-left-bytes-4 {
  var shifted/eax: int <- shift-left-bytes 1, 4
  check-ints-equal shifted, 0, "F - shift-left-bytes 4"
}
364
# Requests beyond four bytes behave like four: the result is still 0.
fn test-shift-left-bytes-5 {
  var shifted/eax: int <- shift-left-bytes 1, 5
  check-ints-equal shifted, 0, "F - shift-left-bytes >4"
}
369
370
371
372
373
374
375
376
377
378
# Pass the bytes packed in grapheme 'g' to append-byte on stream 'out', lowest
# byte first.
# The lowest byte is always written. After that, writing stops as soon as the
# remaining higher bytes are all zero, or once 4 bytes have been written.
fn write-grapheme out: (addr stream byte), g: grapheme {
  var rest/eax: int <- copy g
  var bytes-left/ecx: int <- copy 4
  {
    append-byte out, rest
    # a grapheme holds at most 4 bytes
    bytes-left <- decrement
    compare bytes-left, 0
    break-if-<=
    # drop the byte just written; stop if nothing is left above it
    rest <- shift-right 8
    compare rest, 0
    break-if-=
    loop
  }
}