https://github.com/akkartik/mu/blob/main/403unicode.mu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
# Round-trip test: convert sampled code points to graphemes and back again,
# and draw a report for every value that does not survive the trip.
# Each failure is drawn as <code point>/<grapheme>/<decoded code point> in hex.
fn test-unicode-serialization-and-deserialization {
  var cp-int/ebx: int <- copy 0
  var header-pending?/esi: boolean <- copy 1/true
  {
    # only values below 0x10000 are sampled
    compare cp-int, 0x10000
    break-if->=
    # code point -> grapheme -> code point
    var cp/eax: code-point <- copy cp-int
    var encoded/eax: grapheme <- to-grapheme cp
    # park the grapheme in ecx; eax is reused for the decoded value
    var saved-grapheme/ecx: grapheme <- copy encoded
    var decoded/eax: code-point <- to-code-point saved-grapheme
    compare cp-int, decoded
    {
      # nothing to report when the round trip preserved the value
      break-if-=
      {
        # the failure banner is drawn only before the first mismatch
        compare header-pending?, 0/false
        break-if-=
        draw-text-wrapping-right-then-down-from-cursor-over-full-screen 0/screen, "F - test-unicode-serialization-and-deserialization: ", 3/fg 0/bg
      }
      header-pending? <- copy 0/false
      draw-int32-hex-wrapping-right-then-down-from-cursor-over-full-screen 0/screen, cp-int, 3/fg 0/bg
      draw-text-wrapping-right-then-down-from-cursor-over-full-screen 0/screen, "/", 3/fg 0/bg
      {
        # nested block: eax is borrowed here while 'decoded' is still needed below
        var grapheme-bits/eax: int <- copy saved-grapheme
        draw-int32-hex-wrapping-right-then-down-from-cursor-over-full-screen 0/screen, grapheme-bits, 3/fg 0/bg
      }
      draw-text-wrapping-right-then-down-from-cursor-over-full-screen 0/screen, "/", 3/fg 0/bg
      {
        var decoded-bits/eax: int <- copy decoded
        draw-int32-hex-wrapping-right-then-down-from-cursor-over-full-screen 0/screen, decoded-bits, 3/fg 0/bg
      }
      draw-text-wrapping-right-then-down-from-cursor-over-full-screen 0/screen, " ", 3/fg 0/bg
    }
    # step by 0xf, so only every 15th value is checked
    cp-int <- add 0xf
    loop
  }
}
54
55
# Decode a grapheme into a code point.
#
# The grapheme holds utf-8 bytes packed into one word: the first (lead) byte
# sits in the lowest 8 bits and each later byte 8 bits higher.
#
#   in: the packed grapheme
#   returns (eax): the decoded code point
#
# Values up to 0xff are returned as-is. So is 0xffffffff, the value
# read-grapheme returns for an empty stream.
fn to-code-point in: grapheme -> _/eax: code-point {
  var g/ebx: int <- copy in
  # pass the 0xffffffff sentinel through unchanged
  {
    compare g, 0xffffffff
    break-if-!=
    var result/eax: code-point <- copy g
    return result
  }
  # if single byte, just return it
  {
    compare g, 0xff
    # Unsigned comparison: 4-byte graphemes have their top bit set, so a
    # signed comparison would mistake them for small (negative) values and
    # return them undecoded.
    break-if-addr>
    var result/eax: code-point <- copy g
    return result
  }
  # number of bytes in the grapheme
  var len/edx: int <- grapheme-length in
  # extract payload bits from the lead byte; the mask depends on the length
  var b/eax: byte <- copy-byte g
  var result/edi: code-point <- copy b
  {
    compare len, 2
    break-if-!=
    result <- and 0x1f
  }
  {
    compare len, 3
    break-if-!=
    result <- and 0x0f
  }
  {
    compare len, 4
    break-if-!=
    result <- and 0x07
  }
  # fold in 6 payload bits from each remaining byte
  g <- shift-right 8
  var i/ecx: int <- copy 1
  {
    compare i, len
    break-if->=
    var b/eax: byte <- copy-byte g
    b <- and 0x3f
    result <- shift-left 6
    result <- or b
    g <- shift-right 8
    i <- increment
    loop
  }
  return result
}
101
102
103
# Encode a code point as a grapheme: its utf-8 bytes packed into one word,
# with the lead byte in the lowest 8 bits and each trailer 8 bits higher.
#
#   in: the code point to encode
#   returns (eax): the packed grapheme, or 0 when the code point is larger
#                  than 0x1fffff and so cannot be encoded in 4 bytes
fn to-grapheme in: code-point -> _/eax: grapheme {
  var c/eax: int <- copy in
  var num-trailers/ecx: int <- copy 0
  var first/edx: int <- copy 0
  $to-grapheme:compute-length: {
    # single byte: the value is its own encoding
    compare c, 0x7f
    {
      break-if->
      var g/eax: grapheme <- copy c
      return g
    }
    # 2 bytes
    compare c, 0x7ff
    {
      break-if->
      num-trailers <- copy 1
      first <- copy 0xc0
      break $to-grapheme:compute-length
    }
    # 3 bytes
    compare c, 0xffff
    {
      break-if->
      num-trailers <- copy 2
      first <- copy 0xe0
      break $to-grapheme:compute-length
    }
    # 4 bytes
    compare c, 0x1fffff
    {
      break-if->
      num-trailers <- copy 3
      first <- copy 0xf0
      break $to-grapheme:compute-length
    }
    # More than 4 bytes: unsupported.
    # Every smaller value has already left this block above, so no further
    # comparison is needed. (Guarding this with another 'break-if->' made the
    # return unreachable and let oversized values fall through.)
    return 0
  }
  # emit trailer bytes, taking 6 bits at a time from the low end of c
  var result/edi: grapheme <- copy 0
  {
    compare num-trailers, 0
    break-if-<=
    var tmp/esi: int <- copy c
    tmp <- and 0x3f
    tmp <- or 0x80
    result <- shift-left 8
    result <- or tmp
    # update loop state
    c <- shift-right 6
    num-trailers <- decrement
    loop
  }
  # emit the lead byte: remaining bits of c combined with the length marker
  result <- shift-left 8
  result <- or c
  result <- or first
  return result
}
170
171
# Every code point from 0 through 0x7f must encode to a grapheme with the
# same numeric value.
fn test-to-grapheme-single-byte {
  var expected/ecx: int <- copy 0
  {
    # done after 0x7f has been checked
    compare expected, 0x7f
    break-if->
    var cp/eax: code-point <- copy expected
    var encoded/eax: grapheme <- to-grapheme cp
    var actual/eax: int <- copy encoded
    check-ints-equal actual, expected, "F - test-to-grapheme-single-byte"
    expected <- increment
    loop
  }
}
185
186
187
# Smallest code point that needs 2 bytes: 0x80 encodes as c2 80
# (lead byte in the low 8 bits of the grapheme).
fn test-to-grapheme-two-bytes-min {
  var cp/eax: code-point <- copy 0x80
  var encoded/eax: grapheme <- to-grapheme cp
  var actual/eax: int <- copy encoded
  check-ints-equal actual, 0x80c2, "F - to-grapheme/2a"
}
194
195
# Largest code point that fits in 2 bytes: 0x7ff encodes as df bf
# (lead byte in the low 8 bits of the grapheme).
fn test-to-grapheme-two-bytes-max {
  var cp/eax: code-point <- copy 0x7ff
  var encoded/eax: grapheme <- to-grapheme cp
  var actual/eax: int <- copy encoded
  check-ints-equal actual, 0xbfdf, "F - to-grapheme/2b"
}
202
203
# Smallest code point that needs 3 bytes: 0x800 encodes as e0 a0 80
# (lead byte in the low 8 bits of the grapheme).
fn test-to-grapheme-three-bytes-min {
  var cp/eax: code-point <- copy 0x800
  var encoded/eax: grapheme <- to-grapheme cp
  var actual/eax: int <- copy encoded
  check-ints-equal actual, 0x80a0e0, "F - to-grapheme/3a"
}
210
211
# Largest code point that fits in 3 bytes: 0xffff encodes as ef bf bf
# (lead byte in the low 8 bits of the grapheme).
fn test-to-grapheme-three-bytes-max {
  var cp/eax: code-point <- copy 0xffff
  var encoded/eax: grapheme <- to-grapheme cp
  var actual/eax: int <- copy encoded
  check-ints-equal actual, 0xbfbfef, "F - to-grapheme/3b"
}
218
219
# Smallest code point that needs 4 bytes: 0x10000 encodes as f0 90 80 80
# (lead byte in the low 8 bits of the grapheme).
fn test-to-grapheme-four-bytes-min {
  var cp/eax: code-point <- copy 0x10000
  var encoded/eax: grapheme <- to-grapheme cp
  var actual/eax: int <- copy encoded
  check-ints-equal actual, 0x808090f0, "F - to-grapheme/4a"
}
226
227
# Largest value to-grapheme encodes in 4 bytes: 0x1fffff encodes as f7 bf bf bf
# (lead byte in the low 8 bits of the grapheme).
fn test-to-grapheme-four-bytes-max {
  var cp/eax: code-point <- copy 0x1fffff
  var encoded/eax: grapheme <- to-grapheme cp
  var actual/eax: int <- copy encoded
  check-ints-equal actual, 0xbfbfbff7, "F - to-grapheme/4b"
}
234
235
# Read one grapheme from a byte stream.
#
#   in: stream to consume bytes from
#   returns (eax):
#     0xffffffff if the stream is empty on entry
#     the byte itself if it is below 0xc0, or is 0xfe or 0xff
#     0 if the lead byte is in 0xf8..0xfd (unsupported length)
#     otherwise the lead byte in the low 8 bits, with each following byte
#     placed 8 bits higher than the previous one
fn read-grapheme in: (addr stream byte) -> _/eax: grapheme {
  # nothing left to read
  {
    var empty?/eax: boolean <- stream-empty? in
    compare empty?, 0/false
    break-if-=
    return 0xffffffff
  }
  var lead/eax: byte <- read-byte in
  var pending/ecx: int <- copy 0
  $read-grapheme:compute-length: {
    # below 0xc0: hand the byte back unchanged
    {
      compare lead, 0xc0
      break-if->=
      var single/eax: grapheme <- copy lead
      return single
    }
    # 0xfe and above: also handed back unchanged
    {
      compare lead, 0xfe
      break-if-<
      var single/eax: grapheme <- copy lead
      return single
    }
    # 0xc0..0xdf: one more byte follows
    {
      compare lead, 0xe0
      break-if->=
      pending <- copy 1
      break $read-grapheme:compute-length
    }
    # 0xe0..0xef: two more bytes follow
    {
      compare lead, 0xf0
      break-if->=
      pending <- copy 2
      break $read-grapheme:compute-length
    }
    # 0xf0..0xf7: three more bytes follow
    {
      compare lead, 0xf8
      break-if->=
      pending <- copy 3
      break $read-grapheme:compute-length
    }
    # 0xf8..0xfd: unsupported
    return 0
  }
  # place each following byte one byte position higher than the last
  var packed/edi: grapheme <- copy lead
  var position/edx: int <- copy 1
  {
    compare pending, 0
    break-if-<=
    var trailer/eax: byte <- read-byte in
    var widened/eax: int <- copy trailer
    widened <- shift-left-bytes widened, position
    packed <- or widened
    # update loop state
    position <- increment
    pending <- decrement
    loop
  }
  return packed
}
301
# Number of bytes (1 to 4) occupied by a grapheme, judged by the magnitude of
# its packed value: up to 0xff is 1 byte, up to 0xffff is 2, up to 0xffffff
# is 3, anything larger is 4.
#
#   g: the packed grapheme
#   returns (edx): the byte count
#
# All comparisons are unsigned. 4-byte graphemes have their top bit set, so a
# signed comparison would treat them as negative and report a length of 1.
fn grapheme-length g: grapheme -> _/edx: int {
  {
    compare g, 0xff
    break-if-addr>
    return 1
  }
  {
    compare g, 0xffff
    break-if-addr>
    return 2
  }
  {
    compare g, 0xffffff
    break-if-addr>
    return 3
  }
  return 4
}
320
321
# Shift n left by k whole bytes (8 bits per step).
#
#   n: value to shift
#   k: number of byte positions requested
#   returns (eax): the shifted value
#
# At most 4 steps are performed, however large k is.
fn shift-left-bytes n: int, k: int -> _/eax: int {
  var steps/ecx: int <- copy 0
  var shifted/eax: int <- copy n
  {
    # hard cap of 4 steps
    compare steps, 4
    break-if->=
    # stop once k steps are done
    compare steps, k
    break-if->=
    shifted <- shift-left 8
    steps <- increment
    loop
  }
  return shifted
}
336
337
338
# Append the bytes of a grapheme to a stream, lowest byte first.
#
#   out: stream to append to
#   g: the packed grapheme
#
# The lowest byte is always written. Higher bytes are written only while the
# part of the grapheme not yet written is non-zero.
fn write-grapheme out: (addr stream byte), g: grapheme {
  var remaining/eax: int <- copy g
  {
    append-byte out, remaining
    # Drop the byte just written. shift-right fills with zeroes, so a 32-bit
    # value is exhausted after at most 4 passes.
    remaining <- shift-right 8
    compare remaining, 0
    loop-if-!=
  }
}