https://github.com/akkartik/mu/blob/master/403unicode.mu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
# Encode code-point 'in' as utf-8 and pack the bytes into a single grapheme.
# The first (lead) byte of the encoding ends up in the lowest byte of the
# result and each trailer byte in the next higher one, e.g. code-point 0x80
# (utf-8: c2 80) becomes 0x80c2.
# Code points above 0x1fffff would need more than 4 bytes. For them we print
# "unsupported code point <hex>" using the real-screen print helpers and call
# syscall_exit with 1 loaded into ebx.
# NOTE(review): the comparisons below look signed, so an input with its top
# bit set would take the single-byte path -- confirm whether callers can pass one.
fn to-grapheme in: code-point -> out/eax: grapheme {
$to-grapheme:body: {
  var c/eax: int <- copy in
  var num-trailers/ecx: int <- copy 0
  var first/edx: int <- copy 0
  $to-grapheme:compute-length: {
    # single byte: just return it
    compare c, 0x7f
    {
      break-if->
      out <- copy c
      break $to-grapheme:body
    }
    # 2 bytes: lead byte has the form 110xxxxx
    compare c, 0x7ff
    {
      break-if->
      num-trailers <- copy 1
      first <- copy 0xc0
      break $to-grapheme:compute-length
    }
    # 3 bytes: lead byte has the form 1110xxxx
    compare c, 0xffff
    {
      break-if->
      num-trailers <- copy 2
      first <- copy 0xe0
      break $to-grapheme:compute-length
    }
    # 4 bytes: lead byte has the form 11110xxx
    compare c, 0x1fffff
    {
      break-if->
      num-trailers <- copy 3
      first <- copy 0xf0
      break $to-grapheme:compute-length
    }
    # more than 4 bytes: unsupported
    # Every value <= 0x1fffff has already left this block above, so the guard
    # has to skip the abort on '<=' (the earlier '>' made it unreachable).
    compare c, 0x1fffff
    {
      break-if-<=
      print-string-to-real-screen "unsupported code point "
      print-int32-hex-to-real-screen c
      print-string-to-real-screen "\n"
      var exit-status/ebx: int <- copy 1
      syscall_exit
    }
  }
  # emit trailer bytes, consuming 6 bits of c at a time starting from the
  # lowest; earlier trailers get pushed towards the higher bytes of result
  var result/edi: int <- copy 0
  {
    compare num-trailers, 0
    break-if-<=
    var tmp/esi: int <- copy c
    tmp <- and 0x3f
    tmp <- or 0x80
    result <- shift-left 8
    result <- or tmp
    # update loop state
    c <- shift-right 6
    num-trailers <- decrement
    loop
  }
  # emit the lead byte into the lowest byte: remaining bits of c plus the
  # length marker in 'first'
  result <- shift-left 8
  result <- or c
  result <- or first
  #
  out <- copy result
}
}
94
95
# Every code point from 0 through 0x7f must encode to itself.
fn test-to-grapheme-single-byte {
  var expected/ecx: int <- copy 0
  {
    # stop once we're past the single-byte range
    compare expected, 0x7f
    break-if->
    var cp/eax: code-point <- copy expected
    var g/eax: grapheme <- to-grapheme cp
    var actual/eax: int <- copy g
    check-ints-equal actual, expected, "F - test-to-grapheme-single-byte"
    expected <- increment
    loop
  }
}
109
110
111
# Smallest code point that needs 2 bytes: 0x80 is c2 80 in utf-8.
fn test-to-grapheme-two-bytes-min {
  var cp/eax: code-point <- copy 0x80
  var g/eax: grapheme <- to-grapheme cp
  var actual/eax: int <- copy g
  check-ints-equal actual, 0x80c2, "F - to-grapheme/2a"
}
118
119
# Largest code point that fits in 2 bytes: 0x7ff is df bf in utf-8.
fn test-to-grapheme-two-bytes-max {
  var cp/eax: code-point <- copy 0x7ff
  var g/eax: grapheme <- to-grapheme cp
  var actual/eax: int <- copy g
  check-ints-equal actual, 0xbfdf, "F - to-grapheme/2b"
}
126
127
# Smallest code point that needs 3 bytes: 0x800 is e0 a0 80 in utf-8.
fn test-to-grapheme-three-bytes-min {
  var cp/eax: code-point <- copy 0x800
  var g/eax: grapheme <- to-grapheme cp
  var actual/eax: int <- copy g
  check-ints-equal actual, 0x80a0e0, "F - to-grapheme/3a"
}
134
135
# Largest code point that fits in 3 bytes: 0xffff is ef bf bf in utf-8.
fn test-to-grapheme-three-bytes-max {
  var cp/eax: code-point <- copy 0xffff
  var g/eax: grapheme <- to-grapheme cp
  var actual/eax: int <- copy g
  check-ints-equal actual, 0xbfbfef, "F - to-grapheme/3b"
}
142
143
# Smallest code point that needs 4 bytes: 0x10000 is f0 90 80 80 in utf-8.
fn test-to-grapheme-four-bytes-min {
  var cp/eax: code-point <- copy 0x10000
  var g/eax: grapheme <- to-grapheme cp
  var actual/eax: int <- copy g
  check-ints-equal actual, 0x808090f0, "F - to-grapheme/4a"
}
150
151
# Largest value to-grapheme encodes in 4 bytes: 0x1fffff becomes f7 bf bf bf.
fn test-to-grapheme-four-bytes-max {
  var cp/eax: code-point <- copy 0x1fffff
  var g/eax: grapheme <- to-grapheme cp
  var actual/eax: int <- copy g
  check-ints-equal actual, 0xbfbfbff7, "F - to-grapheme/4b"
}
158
159
# Read one utf-8 encoded grapheme from the byte stream 'in'.
# Bytes are packed in the order they are read: the first byte sits in the
# lowest byte of the result, each following byte in the next higher one.
# A first byte below 0xc0, or one that is 0xfe or 0xff, is returned as is
# without reading anything further.
# A first byte in the range 0xf8..0xfd would start an encoding longer than 4
# bytes; we print an error to the real screen and call syscall_exit with 1
# loaded into ebx.
fn read-grapheme in: (addr stream byte) -> out/eax: grapheme {
$read-grapheme:body: {
  var trailers-left/ecx: int <- copy 0
  var lead/eax: byte <- read-byte in
  $read-grapheme:compute-length: {
    # below 0xc0: not the start of a multi-byte sequence, so return it alone
    compare lead, 0xc0
    {
      break-if->=
      out <- copy lead
      # trailers-left is still 0 from its initialization; stated explicitly
      trailers-left <- copy 0
      break $read-grapheme:body
    }
    # 0xfe and 0xff: returned unchanged as well
    compare lead, 0xfe
    {
      break-if-<
      out <- copy lead
      break $read-grapheme:body
    }
    # 0xc0..0xdf: one more byte follows
    compare lead, 0xe0
    {
      break-if->=
      trailers-left <- copy 1
      break $read-grapheme:compute-length
    }
    # 0xe0..0xef: two more bytes follow
    compare lead, 0xf0
    {
      break-if->=
      trailers-left <- copy 2
      break $read-grapheme:compute-length
    }
    # 0xf0..0xf7: three more bytes follow
    compare lead, 0xf8
    {
      break-if->=
      trailers-left <- copy 3
      break $read-grapheme:compute-length
    }
    # 0xf8..0xfd: longer than 4 bytes, give up
    $read-grapheme:abort: {
      print-string-to-real-screen "utf-8 encodings larger than 4 bytes are not supported. First byte seen: "
      var lead-int/eax: int <- copy lead
      print-int32-hex-to-real-screen lead-int
      print-string-to-real-screen "\n"
      var exit-status/ebx: int <- copy 1
      syscall_exit
    }
  }
  # the lead byte occupies byte 0; trailer number k goes into byte k
  var next-slot/edx: int <- copy 1
  var packed/edi: int <- copy lead
  {
    compare trailers-left, 0
    break-if-<=
    var trailer/eax: byte <- read-byte in
    var shifted/eax: int <- copy trailer
    shifted <- shift-left-bytes shifted, next-slot
    packed <- or shifted
    # advance to the next slot
    next-slot <- increment
    trailers-left <- decrement
    loop
  }
  out <- copy packed
}
}
228
# Write a string mixing 1-, 2- and 3-byte characters into a stream, then
# check that read-grapheme returns each one in turn with its utf-8 bytes
# packed first-byte-lowest.
fn test-read-grapheme {
  var buf: (stream byte 0x30)
  var buf-addr/ecx: (addr stream byte) <- address buf
  write buf-addr, "aΒc世d界e"
  # 'a': single byte
  var g/eax: grapheme <- read-grapheme buf-addr
  var got/eax: int <- copy g
  check-ints-equal got, 0x61, "F - test grapheme/0"
  # 'Β': two bytes (ce 92)
  var g/eax: grapheme <- read-grapheme buf-addr
  var got/eax: int <- copy g
  check-ints-equal got, 0x92ce, "F - test grapheme/1"
  # 'c': single byte
  var g/eax: grapheme <- read-grapheme buf-addr
  var got/eax: int <- copy g
  check-ints-equal got, 0x63, "F - test grapheme/2"
  # '世': three bytes (e4 b8 96)
  var g/eax: grapheme <- read-grapheme buf-addr
  var got/eax: int <- copy g
  check-ints-equal got, 0x96b8e4, "F - test grapheme/3"
  # 'd': single byte
  var g/eax: grapheme <- read-grapheme buf-addr
  var got/eax: int <- copy g
  check-ints-equal got, 0x64, "F - test grapheme/4"
  # '界': three bytes (e7 95 8c)
  var g/eax: grapheme <- read-grapheme buf-addr
  var got/eax: int <- copy g
  check-ints-equal got, 0x8c95e7, "F - test grapheme/5"
  # 'e': single byte
  var g/eax: grapheme <- read-grapheme buf-addr
  var got/eax: int <- copy g
  check-ints-equal got, 0x65, "F - test grapheme/6"
}
255
256
# Shift 'n' left by 'k' whole bytes (8 bits each) and return the result.
# At most 4 byte-shifts are performed, so for k >= 4 every bit of a 32-bit
# value has been shifted out and the result is 0.
fn shift-left-bytes n: int, k: int -> result/eax: int {
  result <- copy n
  var shifts-done/ecx: int <- copy 0
  {
    # never shift more than 4 bytes
    compare shifts-done, 4
    break-if->=
    # stop once we've shifted k bytes
    compare shifts-done, k
    break-if->=
    result <- shift-left 8
    shifts-done <- increment
    loop
  }
}
270
# Shifting by 0 bytes leaves the value untouched.
fn test-shift-left-bytes-0 {
  var actual/eax: int <- shift-left-bytes 1, 0
  check-ints-equal actual, 1, "F - shift-left-bytes 0"
}
275
# Shifting by 1 byte moves the value up 8 bits.
fn test-shift-left-bytes-1 {
  var actual/eax: int <- shift-left-bytes 1, 1
  check-ints-equal actual, 0x100, "F - shift-left-bytes 1"
}
280
# Shifting by 2 bytes moves the value up 16 bits.
fn test-shift-left-bytes-2 {
  var actual/eax: int <- shift-left-bytes 1, 2
  check-ints-equal actual, 0x10000, "F - shift-left-bytes 2"
}
285
# Shifting by 3 bytes moves the value up 24 bits.
fn test-shift-left-bytes-3 {
  var actual/eax: int <- shift-left-bytes 1, 3
  check-ints-equal actual, 0x1000000, "F - shift-left-bytes 3"
}
290
# Shifting by 4 bytes pushes every bit out, leaving 0.
fn test-shift-left-bytes-4 {
  var actual/eax: int <- shift-left-bytes 1, 4
  check-ints-equal actual, 0, "F - shift-left-bytes 4"
}
295
# Shift counts above 4 behave like 4: the result is 0.
fn test-shift-left-bytes-5 {
  var actual/eax: int <- shift-left-bytes 1, 5
  check-ints-equal actual, 0, "F - shift-left-bytes >4"
}
300
301
302
303
304
305
306