https://github.com/akkartik/mu/blob/main/shell/tokenize.mu
1
2
3
4
# Split the contents of gap buffer 'in' into tokens, appending one cell per
# token to 'out'.
# Scanning always restarts from the beginning of 'in' (rewind-gap-buffer).
# If 'trace' reports errors after a token has been read, return immediately:
# the offending token is not written to 'out' and the closing trace-higher is
# skipped.
fn tokenize in: (addr gap-buffer), out: (addr stream cell), trace: (addr trace) {
  trace-text trace, "read", "tokenize"
  trace-lower trace
  rewind-gap-buffer in
  # a single scratch cell is reused for every token
  var scratch-storage: cell
  var scratch/edx: (addr cell) <- address scratch-storage
  $tokenize:loop: {
    # stop once nothing but whitespace remains
    skip-whitespace-from-gap-buffer in
    var at-end?/eax: boolean <- gap-buffer-scan-done? in
    compare at-end?, 0/false
    break-if-!=
    # populate text-data afresh on every iteration -- presumably so that
    # tokens already copied into 'out' don't share a stream with later ones
    var text-ah/eax: (addr handle stream byte) <- get scratch, text-data
    populate-stream text-ah, 0x40/max-token-size
    # read one token into the scratch cell
    next-token in, scratch, trace
    # bail out on any error
    {
      var failed?/eax: boolean <- has-errors? trace
      compare failed?, 0/false
      break-if-=
      return
    }
    write-to-stream out, scratch
    loop
  }
  trace-higher trace
}
32
# Tokenize "(a . b)" and check the open paren, the dot and the close paren.
# The tokens for 'a' and 'b' are read from the stream but not inspected.
fn test-tokenize-dotted-list {
  # input: "(a . b)"
  var input-storage: gap-buffer
  var input/esi: (addr gap-buffer) <- address input-storage
  initialize-gap-buffer input, 0x10
  add-code-point-at-gap input, 0x28/open-paren
  add-code-point-at-gap input, 0x61/a
  add-code-point-at-gap input, 0x20/space
  add-code-point-at-gap input, 0x2e/dot
  add-code-point-at-gap input, 0x20/space
  add-code-point-at-gap input, 0x62/b
  add-code-point-at-gap input, 0x29/close-paren
  # destination for the tokens
  var tokens-storage: (stream cell 0x10)
  var tokens/edi: (addr stream cell) <- address tokens-storage
  #
  tokenize input, tokens, 0/no-trace
  #
  var tok-storage: cell
  var tok/ebx: (addr cell) <- address tok-storage
  # token 1: (
  read-from-stream tokens, tok
  var is-open-paren?/eax: boolean <- open-paren-token? tok
  check is-open-paren?, "F - test-tokenize-dotted-list: open paren"
  # token 2: a (skipped)
  read-from-stream tokens, tok
  # token 3: .
  read-from-stream tokens, tok
  var is-dot?/eax: boolean <- dot-token? tok
  check is-dot?, "F - test-tokenize-dotted-list: dot"
  # token 4: b (skipped)
  read-from-stream tokens, tok
  # token 5: )
  read-from-stream tokens, tok
  var is-close-paren?/eax: boolean <- close-paren-token? tok
  check is-close-paren?, "F - test-tokenize-dotted-list: close paren"
}
65
# Read a single token from 'in' into the text-data stream of '_out-cell'.
# The text-data handle is looked up without any checks, so the caller is
# expected to have populated it already.
#
# The first grapheme after any whitespace decides the kind of token:
#   decimal digit      -> next-number-token
#   symbol grapheme    -> next-symbol-token
#   bracket            -> consumed here and emitted as a one-grapheme token
#   operator grapheme  -> next-operator-token
# Any other grapheme (double-quote, backquote and hash fail all four
# predicates) is consumed and reported via 'error', leaving the token empty.
# Consuming it guarantees that a caller looping over the input always makes
# progress, even when 'trace' is null and the error cannot be observed.
fn next-token in: (addr gap-buffer), _out-cell: (addr cell), trace: (addr trace) {
  trace-text trace, "read", "next-token"
  trace-lower trace
  var out-cell/eax: (addr cell) <- copy _out-cell
  var out-ah/eax: (addr handle stream byte) <- get out-cell, text-data
  var _out/eax: (addr stream byte) <- lookup *out-ah
  var out/edi: (addr stream byte) <- copy _out
  $next-token:body: {
    clear-stream out
    skip-whitespace-from-gap-buffer in
    # peek only: the helpers below consume what they need
    var g/eax: grapheme <- peek-from-gap-buffer in
    # trace the grapheme we're dispatching on
    {
      var stream-storage: (stream byte 0x40)
      var stream/esi: (addr stream byte) <- address stream-storage
      write stream, "next: "
      var gval/eax: int <- copy g
      write-int32-hex stream, gval
      trace trace, "read", stream
    }
    # digit
    {
      var digit?/eax: boolean <- decimal-digit? g
      compare digit?, 0/false
      break-if-=
      next-number-token in, out, trace
      break $next-token:body
    }
    # other symbol grapheme
    {
      var symbol?/eax: boolean <- symbol-grapheme? g
      compare symbol?, 0/false
      break-if-=
      next-symbol-token in, out, trace
      break $next-token:body
    }
    # brackets are always single-grapheme tokens
    {
      var bracket?/eax: boolean <- bracket-grapheme? g
      compare bracket?, 0/false
      break-if-=
      var g/eax: grapheme <- read-from-gap-buffer in
      next-bracket-token g, out, trace
      break $next-token:body
    }
    # non-symbol operators
    {
      var operator?/eax: boolean <- operator-grapheme? g
      compare operator?, 0/false
      break-if-=
      next-operator-token in, out, trace
      break $next-token:body
    }
    # none of the above: drop the grapheme so we can't get stuck on it, and
    # flag the problem
    var unknown/eax: grapheme <- read-from-gap-buffer in
    error trace, "unrecognized character"
  }
  trace-higher trace
  # trace the token we ended up with
  var stream-storage: (stream byte 0x40)
  var stream/eax: (addr stream byte) <- address stream-storage
  write stream, "=> "
  rewind-stream out
  write-stream stream, out
  trace trace, "read", stream
}
127
# Append graphemes from 'in' to 'out' for as long as they satisfy
# symbol-grapheme?. Stops (without consuming) at the first grapheme that
# doesn't, or when 'in' is exhausted. Every grapheme inspected is traced.
fn next-symbol-token in: (addr gap-buffer), out: (addr stream byte), trace: (addr trace) {
  trace-text trace, "read", "looking for a symbol"
  trace-lower trace
  $next-symbol-token:loop: {
    # out of input?
    var exhausted?/eax: boolean <- gap-buffer-scan-done? in
    compare exhausted?, 0/false
    break-if-!=
    # look at the next grapheme without consuming it
    var next/eax: grapheme <- peek-from-gap-buffer in
    {
      var msg-storage: (stream byte 0x40)
      var msg/esi: (addr stream byte) <- address msg-storage
      write msg, "next: "
      var next-val/eax: int <- copy next
      write-int32-hex msg, next-val
      trace trace, "read", msg
    }
    # a non-symbol grapheme ends the token
    {
      var part-of-symbol?/eax: boolean <- symbol-grapheme? next
      compare part-of-symbol?, 0/false
      break-if-!=
      trace-text trace, "read", "stop"
      break $next-symbol-token:loop
    }
    # consume it and keep going
    var curr/eax: grapheme <- read-from-gap-buffer in
    write-grapheme out, curr
    loop
  }
  trace-higher trace
  # trace the token collected so far
  var result-storage: (stream byte 0x40)
  var result/esi: (addr stream byte) <- address result-storage
  write result, "=> "
  rewind-stream out
  write-stream result, out
  trace trace, "read", result
}
164
# Append graphemes from 'in' to 'out' for as long as they satisfy
# operator-grapheme?. Stops (without consuming) at the first grapheme that
# doesn't, or when 'in' is exhausted. Every grapheme inspected is traced.
fn next-operator-token in: (addr gap-buffer), out: (addr stream byte), trace: (addr trace) {
  trace-text trace, "read", "looking for a operator"
  trace-lower trace
  $next-operator-token:loop: {
    # out of input?
    var exhausted?/eax: boolean <- gap-buffer-scan-done? in
    compare exhausted?, 0/false
    break-if-!=
    # look at the next grapheme without consuming it
    var next/eax: grapheme <- peek-from-gap-buffer in
    {
      var msg-storage: (stream byte 0x40)
      var msg/esi: (addr stream byte) <- address msg-storage
      write msg, "next: "
      var next-val/eax: int <- copy next
      write-int32-hex msg, next-val
      trace trace, "read", msg
    }
    # a non-operator grapheme ends the token
    {
      var part-of-operator?/eax: boolean <- operator-grapheme? next
      compare part-of-operator?, 0/false
      break-if-!=
      trace-text trace, "read", "stop"
      break $next-operator-token:loop
    }
    # consume it and keep going
    var curr/eax: grapheme <- read-from-gap-buffer in
    write-grapheme out, curr
    loop
  }
  trace-higher trace
  # trace the token collected so far
  var result-storage: (stream byte 0x40)
  var result/esi: (addr stream byte) <- address result-storage
  write result, "=> "
  rewind-stream out
  write-stream result, out
  trace trace, "read", result
}
201
# Append decimal digits from 'in' to 'out'.
# Stops (without consuming) at the first grapheme that fails
# symbol-grapheme?, or when 'in' is exhausted.
# A grapheme that passes symbol-grapheme? but is not a decimal digit (say the
# 'x' in "12x") is an error: "invalid number" is reported to 'trace' and we
# return on the spot, without reaching the trace-higher at the end.
fn next-number-token in: (addr gap-buffer), out: (addr stream byte), trace: (addr trace) {
  trace-text trace, "read", "looking for a number"
  trace-lower trace
  $next-number-token:loop: {
    # out of input?
    var exhausted?/eax: boolean <- gap-buffer-scan-done? in
    compare exhausted?, 0/false
    break-if-!=
    # look at the next grapheme without consuming it
    var next/eax: grapheme <- peek-from-gap-buffer in
    {
      var msg-storage: (stream byte 0x40)
      var msg/esi: (addr stream byte) <- address msg-storage
      write msg, "next: "
      var next-val/eax: int <- copy next
      write-int32-hex msg, next-val
      trace trace, "read", msg
    }
    # a non-symbol grapheme ends the number
    {
      var could-continue?/eax: boolean <- symbol-grapheme? next
      compare could-continue?, 0/false
      break-if-!=
      trace-text trace, "read", "stop"
      break $next-number-token:loop
    }
    # a symbol grapheme that isn't a digit makes the number malformed
    {
      var is-digit?/eax: boolean <- decimal-digit? next
      compare is-digit?, 0/false
      break-if-!=
      error trace, "invalid number"
      return
    }
    # consume the digit and keep going
    trace-text trace, "read", "append"
    var curr/eax: grapheme <- read-from-gap-buffer in
    write-grapheme out, curr
    loop
  }
  trace-higher trace
}
241
# Emit the already-consumed bracket 'g' as a token: append it to 'out', then
# trace the resulting contents of 'out'.
fn next-bracket-token g: grapheme, out: (addr stream byte), trace: (addr trace) {
  trace-text trace, "read", "bracket"
  write-grapheme out, g
  # 'out' is read back from its start when building the trace line below
  rewind-stream out
  var msg-storage: (stream byte 0x40)
  var msg/esi: (addr stream byte) <- address msg-storage
  write msg, "=> "
  write-stream msg, out
  trace trace, "read", msg
}
252
# Can 'g' be part of a symbol?
# Returns false for whitespace, quotes, brackets, hash and the operator
# characters listed below; returns true for every other grapheme. Note that
# decimal digits, '$', '!', '?' and '_' are not on the list, so they count as
# symbol graphemes.
fn symbol-grapheme? g: grapheme -> _/eax: boolean {
  ## whitespace
  {
    compare g, 9/tab
    break-if-!=
    return 0/false
  }
  {
    compare g, 0xa/newline
    break-if-!=
    return 0/false
  }
  {
    compare g, 0x20/space
    break-if-!=
    return 0/false
  }
  ## quotes
  {
    compare g, 0x22/double-quote
    break-if-!=
    return 0/false
  }
  {
    compare g, 0x60/backquote
    break-if-!=
    return 0/false
  }
  ## brackets
  {
    compare g, 0x28/open-paren
    break-if-!=
    return 0/false
  }
  {
    compare g, 0x29/close-paren
    break-if-!=
    return 0/false
  }
  {
    compare g, 0x5b/open-square-bracket
    break-if-!=
    return 0/false
  }
  {
    compare g, 0x5d/close-square-bracket
    break-if-!=
    return 0/false
  }
  {
    compare g, 0x7b/open-curly-bracket
    break-if-!=
    return 0/false
  }
  {
    compare g, 0x7d/close-curly-bracket
    break-if-!=
    return 0/false
  }
  ## hash
  {
    compare g, 0x23/hash
    break-if-!=
    return 0/false
  }
  ## operator characters; 0x24 (dollar) is deliberately absent
  {
    compare g, 0x25/percent
    break-if-!=
    return 0/false
  }
  {
    compare g, 0x26/ampersand
    break-if-!=
    return 0/false
  }
  {
    compare g, 0x27/single-quote
    break-if-!=
    return 0/false
  }
  {
    compare g, 0x2a/asterisk
    break-if-!=
    return 0/false
  }
  {
    compare g, 0x2b/plus
    break-if-!=
    return 0/false
  }
  {
    compare g, 0x2c/comma
    break-if-!=
    return 0/false
  }
  {
    compare g, 0x2d/dash
    break-if-!=
    return 0/false
  }
  {
    compare g, 0x2e/period
    break-if-!=
    return 0/false
  }
  {
    compare g, 0x2f/slash
    break-if-!=
    return 0/false
  }
  {
    compare g, 0x3a/colon
    break-if-!=
    return 0/false
  }
  {
    compare g, 0x3b/semi-colon
    break-if-!=
    return 0/false
  }
  {
    compare g, 0x3c/less-than
    break-if-!=
    return 0/false
  }
  {
    compare g, 0x3d/equal
    break-if-!=
    return 0/false
  }
  {
    compare g, 0x3e/greater-than
    break-if-!=
    return 0/false
  }
  {
    compare g, 0x40/at-sign
    break-if-!=
    return 0/false
  }
  {
    compare g, 0x5c/backslash
    break-if-!=
    return 0/false
  }
  {
    compare g, 0x5e/caret
    break-if-!=
    return 0/false
  }
  {
    compare g, 0x7c/vertical-line
    break-if-!=
    return 0/false
  }
  {
    compare g, 0x7e/tilde
    break-if-!=
    return 0/false
  }
  # everything else can be part of a symbol
  return 1/true
}
419
# Is 'g' one of the six bracket characters: ( ) [ ] { } ?
fn bracket-grapheme? g: grapheme -> _/eax: boolean {
  # parens
  {
    compare g, 0x28/open-paren
    break-if-!=
    return 1/true
  }
  {
    compare g, 0x29/close-paren
    break-if-!=
    return 1/true
  }
  # square brackets
  {
    compare g, 0x5b/open-square-bracket
    break-if-!=
    return 1/true
  }
  {
    compare g, 0x5d/close-square-bracket
    break-if-!=
    return 1/true
  }
  # curly brackets
  {
    compare g, 0x7b/open-curly-bracket
    break-if-!=
    return 1/true
  }
  {
    compare g, 0x7d/close-curly-bracket
    break-if-!=
    return 1/true
  }
  # anything else is not a bracket
  return 0/false
}
453
# Is 'g' an operator character?
# Returns true for exactly these:
#   % & ' * + , - . / : ; < = > @ \ ^ | ~
# and false for everything else. In particular 0x24 (dollar) is not checked
# here, so '$' is not an operator.
fn operator-grapheme? g: grapheme -> _/eax: boolean {
  # 0x24 (dollar) is intentionally missing from this list
  compare g, 0x25/percent
  {
    break-if-!=
    # label used to read '1/false', contradicting the value actually returned
    return 1/true
  }
  compare g, 0x26/ampersand
  {
    break-if-!=
    return 1/true
  }
  compare g, 0x27/single-quote
  {
    break-if-!=
    return 1/true
  }
  compare g, 0x2a/asterisk
  {
    break-if-!=
    return 1/true
  }
  compare g, 0x2b/plus
  {
    break-if-!=
    return 1/true
  }
  compare g, 0x2c/comma
  {
    break-if-!=
    return 1/true
  }
  compare g, 0x2d/dash
  {
    break-if-!=
    return 1/true
  }
  compare g, 0x2e/period
  {
    break-if-!=
    return 1/true
  }
  compare g, 0x2f/slash
  {
    break-if-!=
    return 1/true
  }
  compare g, 0x3a/colon
  {
    break-if-!=
    return 1/true
  }
  compare g, 0x3b/semi-colon
  {
    break-if-!=
    return 1/true
  }
  compare g, 0x3c/less-than
  {
    break-if-!=
    return 1/true
  }
  compare g, 0x3d/equal
  {
    break-if-!=
    return 1/true
  }
  compare g, 0x3e/greater-than
  {
    break-if-!=
    return 1/true
  }
  # 0x41-0x5a (capital letters) are not operators
  compare g, 0x40/at-sign
  {
    break-if-!=
    return 1/true
  }
  compare g, 0x5c/backslash
  {
    break-if-!=
    return 1/true
  }
  compare g, 0x5e/caret
  {
    break-if-!=
    return 1/true
  }
  # 0x61-0x7a (small letters) are not operators
  compare g, 0x7c/vertical-line
  {
    break-if-!=
    return 1/true
  }
  compare g, 0x7e/tilde
  {
    break-if-!=
    return 1/true
  }
  return 0/false
}
555
# Does the token in '_in' start with a decimal digit?
# Only the first grapheme of the token's text-data is examined. The stream is
# rewound first, and is left positioned after that first grapheme.
fn number-token? _in: (addr cell) -> _/eax: boolean {
  var self/eax: (addr cell) <- copy _in
  var text-ah/eax: (addr handle stream byte) <- get self, text-data
  var text/eax: (addr stream byte) <- lookup *text-ah
  rewind-stream text
  var first/eax: grapheme <- read-grapheme text
  var starts-with-digit?/eax: boolean <- decimal-digit? first
  return starts-with-digit?
}
565
# Does the token in '_in' start with a bracket (see bracket-grapheme?)?
# Only the first grapheme of the token's text-data is examined. The stream is
# rewound first, and is left positioned after that first grapheme.
fn bracket-token? _in: (addr cell) -> _/eax: boolean {
  var self/eax: (addr cell) <- copy _in
  var text-ah/eax: (addr handle stream byte) <- get self, text-data
  var text/eax: (addr stream byte) <- lookup *text-ah
  rewind-stream text
  var first/eax: grapheme <- read-grapheme text
  var starts-with-bracket?/eax: boolean <- bracket-grapheme? first
  return starts-with-bracket?
}
575
# Does the token in '_in' start with a single-quote?
# Only the first grapheme of the token's text-data is examined; anything
# after it is ignored.
fn quote-token? _in: (addr cell) -> _/eax: boolean {
  var self/eax: (addr cell) <- copy _in
  var text-ah/eax: (addr handle stream byte) <- get self, text-data
  var text/eax: (addr stream byte) <- lookup *text-ah
  rewind-stream text
  var first/eax: grapheme <- read-grapheme text
  {
    compare first, 0x27/single-quote
    break-if-!=
    return 1/true
  }
  return 0/false
}
589
# Is the token in '_in' an open paren?
# The first grapheme of the token's text-data must be '('; the result is then
# whatever stream-empty? reports for the stream once that grapheme has been
# read (presumably true when nothing follows the paren).
fn open-paren-token? _in: (addr cell) -> _/eax: boolean {
  var self/eax: (addr cell) <- copy _in
  var text-ah/eax: (addr handle stream byte) <- get self, text-data
  var _text/eax: (addr stream byte) <- lookup *text-ah
  # keep the stream out of eax, which the calls below overwrite
  var text/ecx: (addr stream byte) <- copy _text
  rewind-stream text
  var first/eax: grapheme <- read-grapheme text
  {
    compare first, 0x28/open-paren
    break-if-!=
    var nothing-left?/eax: boolean <- stream-empty? text
    return nothing-left?
  }
  return 0/false
}
605
# Is the token in '_in' a close paren?
# The first grapheme of the token's text-data must be ')'; the result is then
# whatever stream-empty? reports for the stream once that grapheme has been
# read (presumably true when nothing follows the paren).
fn close-paren-token? _in: (addr cell) -> _/eax: boolean {
  var self/eax: (addr cell) <- copy _in
  var text-ah/eax: (addr handle stream byte) <- get self, text-data
  var _text/eax: (addr stream byte) <- lookup *text-ah
  # keep the stream out of eax, which the calls below overwrite
  var text/ecx: (addr stream byte) <- copy _text
  rewind-stream text
  var first/eax: grapheme <- read-grapheme text
  {
    compare first, 0x29/close-paren
    break-if-!=
    var nothing-left?/eax: boolean <- stream-empty? text
    return nothing-left?
  }
  return 0/false
}
621
# Is the token in '_in' a dot?
# The first grapheme of the token's text-data must be '.'; the result is then
# whatever stream-empty? reports for the stream once that grapheme has been
# read (presumably true when nothing follows the dot).
fn dot-token? _in: (addr cell) -> _/eax: boolean {
  var self/eax: (addr cell) <- copy _in
  var text-ah/eax: (addr handle stream byte) <- get self, text-data
  var _text/eax: (addr stream byte) <- lookup *text-ah
  # keep the stream out of eax, which the calls below overwrite
  var text/ecx: (addr stream byte) <- copy _text
  rewind-stream text
  var first/eax: grapheme <- read-grapheme text
  {
    compare first, 0x2e/dot
    break-if-!=
    var nothing-left?/eax: boolean <- stream-empty? text
    return nothing-left?
  }
  return 0/false
}
637
# A cell created by new-symbol with the text "." must satisfy dot-token?.
fn test-dot-token {
  var dot-storage: (handle cell)
  var dot-ah/eax: (addr handle cell) <- address dot-storage
  new-symbol dot-ah, "."
  var dot/eax: (addr cell) <- lookup *dot-ah
  var is-dot?/eax: boolean <- dot-token? dot
  check is-dot?, "F - test-dot-token"
}