https://github.com/akkartik/mu/blob/main/shell/tokenize.mu
1
2
3
4
# Split the text in gap buffer 'in' into tokens, appending each one to 'out'.
# Comment tokens are dropped. If 'trace' reports an error after a token is
# read, tokenizing stops immediately (without the closing trace-higher).
fn tokenize in: (addr gap-buffer), out: (addr stream cell), trace: (addr trace) {
  trace-text trace, "read", "tokenize"
  trace-lower trace
  rewind-gap-buffer in
  # a single scratch cell is reused for every token
  var scratch: cell
  var curr/edx: (addr cell) <- address scratch
  $tokenize:loop: {
    skip-whitespace-from-gap-buffer in
    var exhausted?/eax: boolean <- gap-buffer-scan-done? in
    compare exhausted?, 0/false
    break-if-!=
    # populate text-data anew on each iteration
    # (presumably so cells already written to 'out' don't share a buffer -- TODO confirm)
    var text-ah/eax: (addr handle stream byte) <- get curr, text-data
    populate-stream text-ah, 0x100/max-definition-size
    next-token in, curr, trace
    # comments never reach 'out'
    var comment?/eax: boolean <- comment-token? curr
    compare comment?, 0/false
    loop-if-!=
    var failed?/eax: boolean <- has-errors? trace
    {
      compare failed?, 0/false
      break-if-=
      return
    }
    write-to-stream out, curr
    loop
  }
  trace-higher trace
}
35
# Tokenize "(a . b)" and spot-check the 1st, 3rd and 5th tokens.
fn test-tokenize-dotted-list {
  var buf-storage: gap-buffer
  var buf/esi: (addr gap-buffer) <- address buf-storage
  initialize-gap-buffer-with buf, "(a . b)"
  var tokens-storage: (stream cell 0x10)
  var tokens/edi: (addr stream cell) <- address tokens-storage
  tokenize buf, tokens, 0/no-trace
  var tok-storage: cell
  var tok/ebx: (addr cell) <- address tok-storage
  # token 1
  read-from-stream tokens, tok
  var is-open?/eax: boolean <- open-paren-token? tok
  check is-open?, "F - test-tokenize-dotted-list: open paren"
  # token 2 is read but not checked
  read-from-stream tokens, tok
  # token 3
  read-from-stream tokens, tok
  var is-dot?/eax: boolean <- dot-token? tok
  check is-dot?, "F - test-tokenize-dotted-list: dot"
  # token 4 is read but not checked
  read-from-stream tokens, tok
  # token 5
  read-from-stream tokens, tok
  var is-close?/eax: boolean <- close-paren-token? tok
  check is-close?, "F - test-tokenize-dotted-list: close paren"
}
60
# Tokenize "[abc def]" and expect exactly one stream token holding "abc def".
fn test-tokenize-stream-literal {
  var buf-storage: gap-buffer
  var buf/esi: (addr gap-buffer) <- address buf-storage
  initialize-gap-buffer-with buf, "[abc def]"
  var tokens-storage: (stream cell 0x10)
  var tokens/edi: (addr stream cell) <- address tokens-storage
  tokenize buf, tokens, 0/no-trace
  var tok-storage: cell
  var tok/ebx: (addr cell) <- address tok-storage
  read-from-stream tokens, tok
  # the token is typed as a stream
  var is-stream?/eax: boolean <- stream-token? tok
  check is-stream?, "F - test-tokenize-stream-literal: type"
  # its text excludes the square brackets
  var text-ah/eax: (addr handle stream byte) <- get tok, text-data
  var text/eax: (addr stream byte) <- lookup *text-ah
  var same?/eax: boolean <- stream-data-equal? text, "abc def"
  check same?, "F - test-tokenize-stream-literal"
  # nothing else was emitted
  var drained?/eax: boolean <- stream-empty? tokens
  check drained?, "F - test-tokenize-stream-literal: empty?"
}
83
# Tokenize "([abc def])" and expect: bracket, stream "abc def", bracket, then nothing.
fn test-tokenize-stream-literal-in-tree {
  var buf-storage: gap-buffer
  var buf/esi: (addr gap-buffer) <- address buf-storage
  initialize-gap-buffer-with buf, "([abc def])"
  var tokens-storage: (stream cell 0x10)
  var tokens/edi: (addr stream cell) <- address tokens-storage
  tokenize buf, tokens, 0/no-trace
  var tok-storage: cell
  var tok/ebx: (addr cell) <- address tok-storage
  # first token: a bracket
  read-from-stream tokens, tok
  var opens?/eax: boolean <- bracket-token? tok
  check opens?, "F - test-tokenize-stream-literal-in-tree: open paren"
  # second token: the stream literal, without its square brackets
  read-from-stream tokens, tok
  var is-stream?/eax: boolean <- stream-token? tok
  check is-stream?, "F - test-tokenize-stream-literal-in-tree: type"
  var text-ah/eax: (addr handle stream byte) <- get tok, text-data
  var text/eax: (addr stream byte) <- lookup *text-ah
  var same?/eax: boolean <- stream-data-equal? text, "abc def"
  check same?, "F - test-tokenize-stream-literal-in-tree"
  # third token: a bracket
  read-from-stream tokens, tok
  var closes?/eax: boolean <- bracket-token? tok
  check closes?, "F - test-tokenize-stream-literal-in-tree: close paren"
  # nothing else was emitted
  var drained?/eax: boolean <- stream-empty? tokens
  check drained?, "F - test-tokenize-stream-literal-in-tree: empty?"
}
112
# Read one token from 'in' into the text-data stream of '_out-cell'.
# The cell's type is first reset to 0/uninitialized; only '[...]' literals
# change it afterwards (to 3/stream). The grapheme at the read position picks
# the kind of token, checked in this order:
#   '#'               -> comment (rest of line)
#   decimal digit     -> number
#   symbol grapheme   -> symbol
#   '['               -> stream literal
#   ']'               -> error "unbalanced ']'" (returns at once, no trace-higher)
#   any other bracket -> single-grapheme bracket token
#   operator grapheme -> operator
# NOTE(review): a grapheme matching none of these (it looks like '"' and '`'
# qualify) leaves the token empty and consumes no input; callers that loop
# until the buffer is exhausted should confirm they still make progress.
fn next-token in: (addr gap-buffer), _out-cell: (addr cell), trace: (addr trace) {
  trace-text trace, "read", "next-token"
  trace-lower trace
  var dest/eax: (addr cell) <- copy _out-cell
  {
    # scoped so that eax goes back to holding 'dest' afterwards
    var type-a/eax: (addr int) <- get dest, type
    copy-to *type-a, 0/uninitialized
  }
  var text-ah/eax: (addr handle stream byte) <- get dest, text-data
  var _text/eax: (addr stream byte) <- lookup *text-ah
  var text/edi: (addr stream byte) <- copy _text
  clear-stream text
  $next-token:body: {
    var first/eax: grapheme <- peek-from-gap-buffer in
    # trace the grapheme we're dispatching on
    {
      var line-storage: (stream byte 0x40)
      var line/esi: (addr stream byte) <- address line-storage
      write line, "next: "
      var first-int/eax: int <- copy first
      write-int32-hex line, first-int
      trace trace, "read", line
    }
    # comment
    {
      compare first, 0x23/comment
      break-if-!=
      rest-of-line in, text, trace
      break $next-token:body
    }
    # number; must be tested before symbols
    {
      var is-digit?/eax: boolean <- decimal-digit? first
      compare is-digit?, 0/false
      break-if-=
      next-number-token in, text, trace
      break $next-token:body
    }
    # symbol
    {
      var is-symbol?/eax: boolean <- symbol-grapheme? first
      compare is-symbol?, 0/false
      break-if-=
      next-symbol-token in, text, trace
      break $next-token:body
    }
    # stream literal; must be tested before generic brackets
    {
      compare first, 0x5b/open-square-bracket
      break-if-!=
      # consume the '[' so it doesn't end up in the token
      first <- read-from-gap-buffer in
      next-stream-token in, text, trace
      var stream-cell/eax: (addr cell) <- copy _out-cell
      var stream-type-a/eax: (addr int) <- get stream-cell, type
      copy-to *stream-type-a, 3/stream
      break $next-token:body
    }
    # a ']' without a preceding '['
    {
      compare first, 0x5d/close-square-bracket
      break-if-!=
      error trace, "unbalanced ']'"
      return
    }
    # remaining brackets are tokens by themselves
    {
      var is-bracket?/eax: boolean <- bracket-grapheme? first
      compare is-bracket?, 0/false
      break-if-=
      var consumed/eax: grapheme <- read-from-gap-buffer in
      next-bracket-token consumed, text, trace
      break $next-token:body
    }
    # operator
    {
      var is-operator?/eax: boolean <- operator-grapheme? first
      compare is-operator?, 0/false
      break-if-=
      next-operator-token in, text, trace
      break $next-token:body
    }
  }
  trace-higher trace
  # trace the token that was read
  var summary-storage: (stream byte 0x40)
  var summary/eax: (addr stream byte) <- address summary-storage
  write summary, "=> "
  rewind-stream text
  write-stream summary, text
  trace trace, "read", summary
}
202
# Move graphemes from 'in' to 'out' for as long as they satisfy
# symbol-grapheme?, stopping at the first one that doesn't or when 'in' is
# exhausted. The grapheme that stops the scan is left unread.
fn next-symbol-token in: (addr gap-buffer), out: (addr stream byte), trace: (addr trace) {
  trace-text trace, "read", "looking for a symbol"
  trace-lower trace
  $next-symbol-token:loop: {
    var exhausted?/eax: boolean <- gap-buffer-scan-done? in
    compare exhausted?, 0/false
    break-if-!=
    var next/eax: grapheme <- peek-from-gap-buffer in
    # trace the grapheme under consideration
    {
      var line-storage: (stream byte 0x40)
      var line/esi: (addr stream byte) <- address line-storage
      write line, "next: "
      var next-int/eax: int <- copy next
      write-int32-hex line, next-int
      trace trace, "read", line
    }
    # anything that can't be part of a symbol ends the token
    {
      var continues?/eax: boolean <- symbol-grapheme? next
      compare continues?, 0/false
      break-if-!=
      trace-text trace, "read", "stop"
      break $next-symbol-token:loop
    }
    var consumed/eax: grapheme <- read-from-gap-buffer in
    write-grapheme out, consumed
    loop
  }
  trace-higher trace
  # trace the symbol that was read
  var summary-storage: (stream byte 0x40)
  var summary/esi: (addr stream byte) <- address summary-storage
  write summary, "=> "
  rewind-stream out
  write-stream summary, out
  trace trace, "read", summary
}
239
# Move graphemes from 'in' to 'out' for as long as they satisfy
# operator-grapheme?, stopping at the first one that doesn't or when 'in' is
# exhausted. The grapheme that stops the scan is left unread.
fn next-operator-token in: (addr gap-buffer), out: (addr stream byte), trace: (addr trace) {
  trace-text trace, "read", "looking for a operator"
  trace-lower trace
  $next-operator-token:loop: {
    var exhausted?/eax: boolean <- gap-buffer-scan-done? in
    compare exhausted?, 0/false
    break-if-!=
    var next/eax: grapheme <- peek-from-gap-buffer in
    # trace the grapheme under consideration
    {
      var line-storage: (stream byte 0x40)
      var line/esi: (addr stream byte) <- address line-storage
      write line, "next: "
      var next-int/eax: int <- copy next
      write-int32-hex line, next-int
      trace trace, "read", line
    }
    # anything that isn't an operator grapheme ends the token
    {
      var continues?/eax: boolean <- operator-grapheme? next
      compare continues?, 0/false
      break-if-!=
      trace-text trace, "read", "stop"
      break $next-operator-token:loop
    }
    var consumed/eax: grapheme <- read-from-gap-buffer in
    write-grapheme out, consumed
    loop
  }
  trace-higher trace
  # trace the operator that was read
  var summary-storage: (stream byte 0x40)
  var summary/esi: (addr stream byte) <- address summary-storage
  write summary, "=> "
  rewind-stream out
  write-stream summary, out
  trace trace, "read", summary
}
276
# Move decimal digits from 'in' to 'out'. The scan ends at the first grapheme
# that fails symbol-grapheme? (left unread) or when 'in' is exhausted.
# A symbol grapheme that isn't a decimal digit raises the error
# "invalid number" and returns at once (without the closing trace-higher).
fn next-number-token in: (addr gap-buffer), out: (addr stream byte), trace: (addr trace) {
  trace-text trace, "read", "looking for a number"
  trace-lower trace
  $next-number-token:loop: {
    var exhausted?/eax: boolean <- gap-buffer-scan-done? in
    compare exhausted?, 0/false
    break-if-!=
    var next/eax: grapheme <- peek-from-gap-buffer in
    # trace the grapheme under consideration
    {
      var line-storage: (stream byte 0x40)
      var line/esi: (addr stream byte) <- address line-storage
      write line, "next: "
      var next-int/eax: int <- copy next
      write-int32-hex line, next-int
      trace trace, "read", line
    }
    # whitespace, brackets, operators and the like end the number
    {
      var continues?/eax: boolean <- symbol-grapheme? next
      compare continues?, 0/false
      break-if-!=
      trace-text trace, "read", "stop"
      break $next-number-token:loop
    }
    # anything else has to be a digit
    {
      var is-digit?/eax: boolean <- decimal-digit? next
      compare is-digit?, 0/false
      break-if-!=
      error trace, "invalid number"
      return
    }
    trace-text trace, "read", "append"
    var consumed/eax: grapheme <- read-from-gap-buffer in
    write-grapheme out, consumed
    loop
  }
  trace-higher trace
}
316
# Copy graphemes from 'in' to 'out' up to the next ']'. The ']' is consumed
# but not copied. Running out of input first raises the error
# "unbalanced '['" and returns at once.
fn next-stream-token in: (addr gap-buffer), out: (addr stream byte), trace: (addr trace) {
  trace-text trace, "read", "stream"
  {
    # input ended before the closing bracket
    {
      var exhausted?/eax: boolean <- gap-buffer-scan-done? in
      compare exhausted?, 0/false
      break-if-=
      error trace, "unbalanced '['"
      return
    }
    var next/eax: grapheme <- read-from-gap-buffer in
    compare next, 0x5d/close-square-bracket
    break-if-=
    write-grapheme out, next
    loop
  }
  # trace the contents that were read
  var summary-storage: (stream byte 0x40)
  var summary/esi: (addr stream byte) <- address summary-storage
  write summary, "=> "
  rewind-stream out
  write-stream summary, out
  trace trace, "read", summary
}
340
# Append the (already consumed) bracket 'g' to 'out' and trace the result.
fn next-bracket-token g: grapheme, out: (addr stream byte), trace: (addr trace) {
  trace-text trace, "read", "bracket"
  write-grapheme out, g
  # trace the token that was emitted
  var summary-storage: (stream byte 0x40)
  var summary/esi: (addr stream byte) <- address summary-storage
  write summary, "=> "
  rewind-stream out
  write-stream summary, out
  trace trace, "read", summary
}
351
# Copy graphemes from 'in' to 'out' up to the next newline. The newline is
# consumed but not copied. If the input runs out first, return right away
# without emitting the closing "=> " trace line.
fn rest-of-line in: (addr gap-buffer), out: (addr stream byte), trace: (addr trace) {
  trace-text trace, "read", "comment"
  {
    # input ended before a newline
    {
      var exhausted?/eax: boolean <- gap-buffer-scan-done? in
      compare exhausted?, 0/false
      break-if-=
      return
    }
    var next/eax: grapheme <- read-from-gap-buffer in
    compare next, 0xa/newline
    break-if-=
    write-grapheme out, next
    loop
  }
  # trace the line that was read
  var summary-storage: (stream byte 0x80)
  var summary/esi: (addr stream byte) <- address summary-storage
  write summary, "=> "
  rewind-stream out
  write-stream summary, out
  trace trace, "read", summary
}
374
# Can 'g' be part of a symbol? True for every grapheme except the ones listed
# below: whitespace, quotes, brackets, '#', and the operator graphemes.
fn symbol-grapheme? g: grapheme -> _/eax: boolean {
  {
    # any match leaves this block and falls through to the 'false' below
    # whitespace
    compare g, 9/tab
    break-if-=
    compare g, 0xa/newline
    break-if-=
    compare g, 0x20/space
    break-if-=
    # quotes
    compare g, 0x22/double-quote
    break-if-=
    compare g, 0x60/backquote
    break-if-=
    # brackets
    compare g, 0x28/open-paren
    break-if-=
    compare g, 0x29/close-paren
    break-if-=
    compare g, 0x5b/open-square-bracket
    break-if-=
    compare g, 0x5d/close-square-bracket
    break-if-=
    compare g, 0x7b/open-curly-bracket
    break-if-=
    compare g, 0x7d/close-curly-bracket
    break-if-=
    # comment marker
    compare g, 0x23/hash
    break-if-=
    # operator graphemes
    compare g, 0x25/percent
    break-if-=
    compare g, 0x26/ampersand
    break-if-=
    compare g, 0x27/single-quote
    break-if-=
    compare g, 0x2a/asterisk
    break-if-=
    compare g, 0x2b/plus
    break-if-=
    compare g, 0x2c/comma
    break-if-=
    compare g, 0x2d/dash
    break-if-=
    compare g, 0x2e/period
    break-if-=
    compare g, 0x2f/slash
    break-if-=
    compare g, 0x3a/colon
    break-if-=
    compare g, 0x3b/semi-colon
    break-if-=
    compare g, 0x3c/less-than
    break-if-=
    compare g, 0x3d/equal
    break-if-=
    compare g, 0x3e/greater-than
    break-if-=
    compare g, 0x40/at-sign
    break-if-=
    compare g, 0x5c/backslash
    break-if-=
    compare g, 0x5e/caret
    break-if-=
    compare g, 0x7c/vertical-line
    break-if-=
    compare g, 0x7e/tilde
    break-if-=
    # nothing matched
    return 1/true
  }
  return 0/false
}
541
# Is 'g' one of the six brackets ( ) [ ] { } ?
fn bracket-grapheme? g: grapheme -> _/eax: boolean {
  {
    # any match leaves this block and falls through to the 'true' below
    compare g, 0x28/open-paren
    break-if-=
    compare g, 0x29/close-paren
    break-if-=
    compare g, 0x5b/open-square-bracket
    break-if-=
    compare g, 0x5d/close-square-bracket
    break-if-=
    compare g, 0x7b/open-curly-bracket
    break-if-=
    compare g, 0x7d/close-curly-bracket
    break-if-=
    # nothing matched
    return 0/false
  }
  return 1/true
}
575
# Is 'g' an operator grapheme? True for exactly:
#   % & ' * + , - . / : ; < = > @ \ ^ | ~
# Every match shares the single 'return 1/true' at the bottom, so no individual
# case can carry a label that contradicts its value (the percent case used to
# read 'return 1/false').
fn operator-grapheme? g: grapheme -> _/eax: boolean {
  {
    # any match leaves this block and falls through to the 'true' below
    compare g, 0x25/percent
    break-if-=
    compare g, 0x26/ampersand
    break-if-=
    compare g, 0x27/single-quote
    break-if-=
    compare g, 0x2a/asterisk
    break-if-=
    compare g, 0x2b/plus
    break-if-=
    compare g, 0x2c/comma
    break-if-=
    compare g, 0x2d/dash
    break-if-=
    compare g, 0x2e/period
    break-if-=
    compare g, 0x2f/slash
    break-if-=
    compare g, 0x3a/colon
    break-if-=
    compare g, 0x3b/semi-colon
    break-if-=
    compare g, 0x3c/less-than
    break-if-=
    compare g, 0x3d/equal
    break-if-=
    compare g, 0x3e/greater-than
    break-if-=
    compare g, 0x40/at-sign
    break-if-=
    compare g, 0x5c/backslash
    break-if-=
    compare g, 0x5e/caret
    break-if-=
    compare g, 0x7c/vertical-line
    break-if-=
    compare g, 0x7e/tilde
    break-if-=
    # nothing matched
    return 0/false
  }
  return 1/true
}
677
# Does the text of token '_in' start with a decimal digit?
# Rewinds the token's text-data stream and reads its first grapheme.
fn number-token? _in: (addr cell) -> _/eax: boolean {
  var self/eax: (addr cell) <- copy _in
  var text-ah/eax: (addr handle stream byte) <- get self, text-data
  var text/eax: (addr stream byte) <- lookup *text-ah
  rewind-stream text
  var first/eax: grapheme <- read-grapheme text
  var digit?/eax: boolean <- decimal-digit? first
  return digit?
}
687
# Does the text of token '_in' start with a bracket? Tokens of type 3/stream
# never count, whatever their text.
# Rewinds the token's text-data stream and reads its first grapheme.
fn bracket-token? _in: (addr cell) -> _/eax: boolean {
  var self/eax: (addr cell) <- copy _in
  # stream literals are opaque text
  {
    var type-a/eax: (addr int) <- get self, type
    compare *type-a, 3/stream
    break-if-!=
    return 0/false
  }
  var text-ah/eax: (addr handle stream byte) <- get self, text-data
  var text/eax: (addr stream byte) <- lookup *text-ah
  rewind-stream text
  var first/eax: grapheme <- read-grapheme text
  var bracket?/eax: boolean <- bracket-grapheme? first
  return bracket?
}
704
# Does the text of token '_in' start with a single quote?
# Rewinds the token's text-data stream and reads its first grapheme.
fn quote-token? _in: (addr cell) -> _/eax: boolean {
  var self/eax: (addr cell) <- copy _in
  var text-ah/eax: (addr handle stream byte) <- get self, text-data
  var text/eax: (addr stream byte) <- lookup *text-ah
  rewind-stream text
  var first/eax: grapheme <- read-grapheme text
  {
    compare first, 0x27/single-quote
    break-if-=
    return 0/false
  }
  return 1/true
}
718
# Is the text of token '_in' exactly "("? The first grapheme must be '(' and
# nothing may follow it.
# Rewinds the token's text-data stream and reads its first grapheme.
fn open-paren-token? _in: (addr cell) -> _/eax: boolean {
  var self/eax: (addr cell) <- copy _in
  var text-ah/eax: (addr handle stream byte) <- get self, text-data
  var _text/eax: (addr stream byte) <- lookup *text-ah
  # keep the stream in ecx; eax is about to be reused
  var text/ecx: (addr stream byte) <- copy _text
  rewind-stream text
  var first/eax: grapheme <- read-grapheme text
  {
    compare first, 0x28/open-paren
    break-if-=
    return 0/false
  }
  var alone?/eax: boolean <- stream-empty? text
  return alone?
}
734
# Is the text of token '_in' exactly ")"? The first grapheme must be ')' and
# nothing may follow it.
# Rewinds the token's text-data stream and reads its first grapheme.
fn close-paren-token? _in: (addr cell) -> _/eax: boolean {
  var self/eax: (addr cell) <- copy _in
  var text-ah/eax: (addr handle stream byte) <- get self, text-data
  var _text/eax: (addr stream byte) <- lookup *text-ah
  # keep the stream in ecx; eax is about to be reused
  var text/ecx: (addr stream byte) <- copy _text
  rewind-stream text
  var first/eax: grapheme <- read-grapheme text
  {
    compare first, 0x29/close-paren
    break-if-=
    return 0/false
  }
  var alone?/eax: boolean <- stream-empty? text
  return alone?
}
750
# Is the text of token '_in' exactly "."? The first grapheme must be '.' and
# nothing may follow it.
# Rewinds the token's text-data stream and reads its first grapheme.
fn dot-token? _in: (addr cell) -> _/eax: boolean {
  var self/eax: (addr cell) <- copy _in
  var text-ah/eax: (addr handle stream byte) <- get self, text-data
  var _text/eax: (addr stream byte) <- lookup *text-ah
  # keep the stream in ecx; eax is about to be reused
  var text/ecx: (addr stream byte) <- copy _text
  rewind-stream text
  var first/eax: grapheme <- read-grapheme text
  {
    compare first, 0x2e/dot
    break-if-=
    return 0/false
  }
  var alone?/eax: boolean <- stream-empty? text
  return alone?
}
766
# A symbol cell created from "." must satisfy dot-token?.
fn test-dot-token {
  var dot-storage: (handle cell)
  var dot-ah/eax: (addr handle cell) <- address dot-storage
  new-symbol dot-ah, "."
  var dot/eax: (addr cell) <- lookup *dot-ah
  var is-dot?/eax: boolean <- dot-token? dot
  check is-dot?, "F - test-dot-token"
}
775
# Is token '_in' a stream literal, i.e. is its type 3/stream?
fn stream-token? _in: (addr cell) -> _/eax: boolean {
  var self/eax: (addr cell) <- copy _in
  var type-a/eax: (addr int) <- get self, type
  {
    compare *type-a, 3/stream
    break-if-!=
    return 1/true
  }
  return 0/false
}
786
# Is token '_in' a comment, i.e. does its text start with '#'?
# Tokens of type 3/stream never count: the text of a stream literal is
# arbitrary, so '[# x]' is data rather than a comment. Without this guard
# (the same one bracket-token? has) tokenize would silently drop such literals.
# Rewinds the token's text-data stream and reads its first grapheme.
fn comment-token? _in: (addr cell) -> _/eax: boolean {
  var in/eax: (addr cell) <- copy _in
  # stream literals are opaque text
  {
    # scoped so that eax goes back to holding 'in' afterwards
    var in-type/eax: (addr int) <- get in, type
    compare *in-type, 3/stream
    break-if-!=
    return 0/false
  }
  var in-data-ah/eax: (addr handle stream byte) <- get in, text-data
  var in-data/eax: (addr stream byte) <- lookup *in-data-ah
  rewind-stream in-data
  var g/eax: grapheme <- read-grapheme in-data
  compare g, 0x23/hash
  {
    break-if-=
    return 0/false
  }
  return 1/true
}
799 }