https://github.com/akkartik/mu/blob/main/shell/tokenize.mu
  1 # We reuse the cell data structure for tokenization
  2 # Token cells are special, though. They have no type, they're always atoms,
  3 # they always have text-data.
  4 
  5 fn tokenize in: (addr gap-buffer), out: (addr stream cell), trace: (addr trace) {
      # Split the contents of `in` into token cells and append them to `out`.
      # Scanning restarts from the beginning of `in` (rewind-gap-buffer) and
      # stops when gap-buffer-scan-done? reports the end, or as soon as
      # has-errors? reports an error on `trace`.
  6   trace-text trace, "read", "tokenize"
  7   trace-lower trace
  8   rewind-gap-buffer in
  9   var token-storage: cell
 10   var token/edx: (addr cell) <- address token-storage
 11   {
 12     skip-whitespace-from-gap-buffer in
 13     var done?/eax: boolean <- gap-buffer-scan-done? in
 14     compare done?, 0/false
 15     break-if-!=
 16     # initialize token data each iteration to avoid aliasing:
        # `token` is reused across iterations and only shallow-copied into `out`
        # below, so each token needs its own text-data stream
 17     var dest-ah/eax: (addr handle stream byte) <- get token, text-data
 18     populate-stream dest-ah, 0x40/max-token-size
 19     #
 20     next-token in, token, trace
 21     var error?/eax: boolean <- has-errors? trace
 22     compare error?, 0/false
 23     {
 24       break-if-=
          # bail out on the first error; note this skips the trace-higher below
 25       return
 26     }
 27     write-to-stream out, token  # shallow-copy text-data
 28     loop
 29   }
 30   trace-higher trace
 31 }
 32 
 33 fn test-tokenize-dotted-list {
      # Tokenize "(a . b)" and check that the 1st, 3rd and 5th tokens are
      # an open paren, a dot and a close paren respectively.
 34   # in: "(a . b)"
 35   var in-storage: gap-buffer
 36   var in/esi: (addr gap-buffer) <- address in-storage
 37   initialize-gap-buffer in, 0x10
 38   add-code-point-at-gap in, 0x28/open-paren
 39   add-code-point-at-gap in, 0x61/a
 40   add-code-point-at-gap in, 0x20/space
 41   add-code-point-at-gap in, 0x2e/dot
 42   add-code-point-at-gap in, 0x20/space
 43   add-code-point-at-gap in, 0x62/b
 44   add-code-point-at-gap in, 0x29/close-paren
 45   # out: room for 0x10 token cells
 46   var stream-storage: (stream cell 0x10)
 47   var stream/edi: (addr stream cell) <- address stream-storage
 48   # run with tracing disabled
 49   tokenize in, stream, 0/no-trace
 50   # read the tokens back one at a time into a scratch cell
 51   var curr-token-storage: cell
 52   var curr-token/ebx: (addr cell) <- address curr-token-storage
 53   read-from-stream stream, curr-token
 54   var open-paren?/eax: boolean <- open-paren-token? curr-token
 55   check open-paren?, "F - test-tokenize-dotted-list: open paren"
 56   read-from-stream stream, curr-token  # skip a
 57   read-from-stream stream, curr-token
 58   var dot?/eax: boolean <- dot-token? curr-token
 59   check dot?, "F - test-tokenize-dotted-list: dot"
 60   read-from-stream stream, curr-token  # skip b
 61   read-from-stream stream, curr-token
 62   var close-paren?/eax: boolean <- close-paren-token? curr-token
 63   check close-paren?, "F - test-tokenize-dotted-list: close paren"
 64 }
 65 
 66 fn next-token in: (addr gap-buffer), _out-cell: (addr cell), trace: (addr trace) {
      # Read a single token from `in` into the text-data stream of `_out-cell`.
      # The first non-whitespace grapheme picks the kind of token, tried in
      # this order: decimal digit -> number, symbol grapheme -> symbol,
      # bracket -> single-grapheme token, operator grapheme -> operator.
      # A grapheme that fits none of these is consumed and reported as an error,
      # so that callers looping over `in` always make progress.
 67   trace-text trace, "read", "next-token"
 68   trace-lower trace
 69   var out-cell/eax: (addr cell) <- copy _out-cell
 70   var out-ah/eax: (addr handle stream byte) <- get out-cell, text-data
 71   var _out/eax: (addr stream byte) <- lookup *out-ah
      # keep the output stream in edi; eax is reused for everything below
 72   var out/edi: (addr stream byte) <- copy _out
 73   $next-token:body: {
 74     clear-stream out
 75     skip-whitespace-from-gap-buffer in
        # only peek: the helpers below consume the graphemes themselves
 76     var g/eax: grapheme <- peek-from-gap-buffer in
        # trace the grapheme we're dispatching on
 77     {
 78       var stream-storage: (stream byte 0x40)
 79       var stream/esi: (addr stream byte) <- address stream-storage
 80       write stream, "next: "
 81       var gval/eax: int <- copy g
 82       write-int32-hex stream, gval
 83       trace trace, "read", stream
 84     }
 85     # digit
 86     {
 87       var digit?/eax: boolean <- decimal-digit? g
 88       compare digit?, 0/false
 89       break-if-=
 90       next-number-token in, out, trace
 91       break $next-token:body
 92     }
 93     # other symbol char
 94     {
 95       var symbol?/eax: boolean <- symbol-grapheme? g
 96       compare symbol?, 0/false
 97       break-if-=
 98       next-symbol-token in, out, trace
 99       break $next-token:body
100     }
101     # brackets are always single-char tokens
102     {
103       var bracket?/eax: boolean <- bracket-grapheme? g
104       compare bracket?, 0/false
105       break-if-=
106       var g/eax: grapheme <- read-from-gap-buffer in
107       next-bracket-token g, out, trace
108       break $next-token:body
109     }
110     # non-symbol operators
111     {
112       var operator?/eax: boolean <- operator-grapheme? g
113       compare operator?, 0/false
114       break-if-=
115       next-operator-token in, out, trace
116       break $next-token:body
117     }
        # unrecognized grapheme (e.g. '#', '"' or '`'): none of the categories
        # above claims it. Without this case nothing would be read from `in`,
        # and a caller that loops until the gap buffer is exhausted would never
        # terminate. Consume the grapheme and report it.
        {
          var unused/eax: grapheme <- read-from-gap-buffer in
          error trace, "unrecognized token"
        }
118   }
119   trace-higher trace
      # trace the token we ended up with
120   var stream-storage: (stream byte 0x40)
121   var stream/eax: (addr stream byte) <- address stream-storage
122   write stream, "=> "
123   rewind-stream out
124   write-stream stream, out
125   trace trace, "read", stream
126 }
127 
128 fn next-symbol-token in: (addr gap-buffer), out: (addr stream byte), trace: (addr trace) {
      # Append graphemes from `in` to `out` for as long as symbol-grapheme?
      # accepts them. Stops at the end of `in` or at the first non-symbol
      # grapheme, which is only peeked at and so stays unread in `in`.
129   trace-text trace, "read", "looking for a symbol"
130   trace-lower trace
131   $next-symbol-token:loop: {
132     var done?/eax: boolean <- gap-buffer-scan-done? in
133     compare done?, 0/false
134     break-if-!=
135     var g/eax: grapheme <- peek-from-gap-buffer in
        # trace the grapheme under consideration
136     {
137       var stream-storage: (stream byte 0x40)
138       var stream/esi: (addr stream byte) <- address stream-storage
139       write stream, "next: "
140       var gval/eax: int <- copy g
141       write-int32-hex stream, gval
142       trace trace, "read", stream
143     }
144     # if non-symbol, return
145     {
146       var symbol-grapheme?/eax: boolean <- symbol-grapheme? g
147       compare symbol-grapheme?, 0/false
148       break-if-!=
149       trace-text trace, "read", "stop"
150       break $next-symbol-token:loop
151     }
        # accepted: now actually consume the grapheme and append it
152     var g/eax: grapheme <- read-from-gap-buffer in
153     write-grapheme out, g
154     loop
155   }
156   trace-higher trace
      # trace the resulting token
157   var stream-storage: (stream byte 0x40)
158   var stream/esi: (addr stream byte) <- address stream-storage
159   write stream, "=> "
160   rewind-stream out
161   write-stream stream, out
162   trace trace, "read", stream
163 }
164 
165 fn next-operator-token in: (addr gap-buffer), out: (addr stream byte), trace: (addr trace) {
      # Append graphemes from `in` to `out` for as long as operator-grapheme?
      # accepts them, so consecutive operator graphemes form a single token.
      # Stops at the end of `in` or at the first non-operator grapheme, which
      # is only peeked at and so stays unread in `in`.
166   trace-text trace, "read", "looking for a operator"
167   trace-lower trace
168   $next-operator-token:loop: {
169     var done?/eax: boolean <- gap-buffer-scan-done? in
170     compare done?, 0/false
171     break-if-!=
172     var g/eax: grapheme <- peek-from-gap-buffer in
        # trace the grapheme under consideration
173     {
174       var stream-storage: (stream byte 0x40)
175       var stream/esi: (addr stream byte) <- address stream-storage
176       write stream, "next: "
177       var gval/eax: int <- copy g
178       write-int32-hex stream, gval
179       trace trace, "read", stream
180     }
181     # if non-operator, return
182     {
183       var operator-grapheme?/eax: boolean <- operator-grapheme? g
184       compare operator-grapheme?, 0/false
185       break-if-!=
186       trace-text trace, "read", "stop"
187       break $next-operator-token:loop
188     }
        # accepted: now actually consume the grapheme and append it
189     var g/eax: grapheme <- read-from-gap-buffer in
190     write-grapheme out, g
191     loop
192   }
193   trace-higher trace
      # trace the resulting token
194   var stream-storage: (stream byte 0x40)
195   var stream/esi: (addr stream byte) <- address stream-storage
196   write stream, "=> "
197   rewind-stream out
198   write-stream stream, out
199   trace trace, "read", stream
200 }
201 
202 fn next-number-token in: (addr gap-buffer), out: (addr stream byte), trace: (addr trace) {
      # Append decimal digits from `in` to `out`.
      # Stops at the end of `in` or at the first non-symbol grapheme (left
      # unread). A symbol grapheme that is not a decimal digit, e.g. a letter
      # immediately after digits, raises "invalid number" on `trace`.
      # Unlike the symbol and operator readers, this function does not emit a
      # "=> " trace line for the finished token.
203   trace-text trace, "read", "looking for a number"
204   trace-lower trace
205   $next-number-token:loop: {
206     var done?/eax: boolean <- gap-buffer-scan-done? in
207     compare done?, 0/false
208     break-if-!=
209     var g/eax: grapheme <- peek-from-gap-buffer in
        # trace the grapheme under consideration
210     {
211       var stream-storage: (stream byte 0x40)
212       var stream/esi: (addr stream byte) <- address stream-storage
213       write stream, "next: "
214       var gval/eax: int <- copy g
215       write-int32-hex stream, gval
216       trace trace, "read", stream
217     }
218     # if not symbol grapheme, return
219     {
220       var symbol-grapheme?/eax: boolean <- symbol-grapheme? g
221       compare symbol-grapheme?, 0/false
222       break-if-!=
223       trace-text trace, "read", "stop"
224       break $next-number-token:loop
225     }
226     # if not digit grapheme, abort
227     {
228       var digit?/eax: boolean <- decimal-digit? g
229       compare digit?, 0/false
230       break-if-!=
231       error trace, "invalid number"
          # note: returns without the trace-higher at the end of the function
232       return
233     }
234     trace-text trace, "read", "append"
235     var g/eax: grapheme <- read-from-gap-buffer in
236     write-grapheme out, g
237     loop
238   }
239   trace-higher trace
240 }
241 
242 fn next-bracket-token g: grapheme, out: (addr stream byte), trace: (addr trace) {
      # Emit the already-read grapheme `g` as a single-grapheme token in `out`,
      # then trace the token. Does not check that `g` is a bracket; the caller
      # is expected to have done so.
243   trace-text trace, "read", "bracket"
244   write-grapheme out, g
      # trace the resulting token
245   var stream-storage: (stream byte 0x40)
246   var stream/esi: (addr stream byte) <- address stream-storage
247   write stream, "=> "
248   rewind-stream out
249   write-stream stream, out
250   trace trace, "read", stream
251 }
252 
253 fn symbol-grapheme? g: grapheme -> _/eax: boolean {
      # Return true if `g` may appear inside a symbol.
      # Implemented as a deny-list: whitespace, quotes, brackets and the
      # punctuation listed below return false; every other grapheme returns true.
254   ## whitespace
255   compare g, 9/tab
256   {
257     break-if-!=
258     return 0/false
259   }
260   compare g, 0xa/newline
261   {
262     break-if-!=
263     return 0/false
264   }
265   compare g, 0x20/space
266   {
267     break-if-!=
268     return 0/false
269   }
270   ## quotes
271   compare g, 0x22/double-quote
272   {
273     break-if-!=
274     return 0/false
275   }
276   compare g, 0x60/backquote
277   {
278     break-if-!=
279     return 0/false
280   }
281   ## brackets
282   compare g, 0x28/open-paren
283   {
284     break-if-!=
285     return 0/false
286   }
287   compare g, 0x29/close-paren
288   {
289     break-if-!=
290     return 0/false
291   }
292   compare g, 0x5b/open-square-bracket
293   {
294     break-if-!=
295     return 0/false
296   }
297   compare g, 0x5d/close-square-bracket
298   {
299     break-if-!=
300     return 0/false
301   }
302   compare g, 0x7b/open-curly-bracket
303   {
304     break-if-!=
305     return 0/false
306   }
307   compare g, 0x7d/close-curly-bracket
308   {
309     break-if-!=
310     return 0/false
311   }
312   ## other punctuation
313   # '!' (0x21) is a symbol char, so there is no check for it
314   compare g, 0x23/hash
315   {
316     break-if-!=
317     return 0/false
318   }
319   # '$' (0x24) is a symbol char, so there is no check for it
320   compare g, 0x25/percent
321   {
322     break-if-!=
323     return 0/false
324   }
325   compare g, 0x26/ampersand
326   {
327     break-if-!=
328     return 0/false
329   }
330   compare g, 0x27/single-quote
331   {
332     break-if-!=
333     return 0/false
334   }
335   compare g, 0x2a/asterisk
336   {
337     break-if-!=
338     return 0/false
339   }
340   compare g, 0x2b/plus
341   {
342     break-if-!=
343     return 0/false
344   }
345   compare g, 0x2c/comma
346   {
347     break-if-!=
348     return 0/false
349   }
350   compare g, 0x2d/dash  # '-' not allowed in symbols
351   {
352     break-if-!=
353     return 0/false
354   }
355   compare g, 0x2e/period
356   {
357     break-if-!=
358     return 0/false
359   }
360   compare g, 0x2f/slash
361   {
362     break-if-!=
363     return 0/false
364   }
365   compare g, 0x3a/colon
366   {
367     break-if-!=
368     return 0/false
369   }
370   compare g, 0x3b/semi-colon
371   {
372     break-if-!=
373     return 0/false
374   }
375   compare g, 0x3c/less-than
376   {
377     break-if-!=
378     return 0/false
379   }
380   compare g, 0x3d/equal
381   {
382     break-if-!=
383     return 0/false
384   }
385   compare g, 0x3e/greater-than
386   {
387     break-if-!=
388     return 0/false
389   }
390   # '?' (0x3f) is a symbol char, so there is no check for it
391   compare g, 0x40/at-sign
392   {
393     break-if-!=
394     return 0/false
395   }
396   compare g, 0x5c/backslash
397   {
398     break-if-!=
399     return 0/false
400   }
401   compare g, 0x5e/caret
402   {
403     break-if-!=
404     return 0/false
405   }
406   # '_' (0x5f) is a symbol char, so there is no check for it
407   compare g, 0x7c/vertical-line
408   {
409     break-if-!=
410     return 0/false
411   }
412   compare g, 0x7e/tilde
413   {
414     break-if-!=
415     return 0/false
416   }
      # anything not rejected above is allowed in a symbol
417   return 1/true
418 }
419 
420 fn bracket-grapheme? g: grapheme -> _/eax: boolean {
      # Return true if `g` is one of the six bracket characters: ( ) [ ] { }
421   compare g, 0x28/open-paren
422   {
423     break-if-!=
424     return 1/true
425   }
426   compare g, 0x29/close-paren
427   {
428     break-if-!=
429     return 1/true
430   }
431   compare g, 0x5b/open-square-bracket
432   {
433     break-if-!=
434     return 1/true
435   }
436   compare g, 0x5d/close-square-bracket
437   {
438     break-if-!=
439     return 1/true
440   }
441   compare g, 0x7b/open-curly-bracket
442   {
443     break-if-!=
444     return 1/true
445   }
446   compare g, 0x7d/close-curly-bracket
447   {
448     break-if-!=
449     return 1/true
450   }
      # not a bracket
451   return 0/false
452 }
453 
454 fn operator-grapheme? g: grapheme -> _/eax: boolean {
      # Return true if `g` is one of the punctuation characters that form
      # operator tokens. Implemented as an allow-list: only the graphemes
      # compared against below return true; everything else returns false.
      # Note: '#' (0x23) is rejected by symbol-grapheme? but is not listed
      # here either.
455   # '$' (0x24) is a symbol char, so there is no check for it
456   compare g, 0x25/percent
457   {
458     break-if-!=
459     return 1/true
460   }
461   compare g, 0x26/ampersand
462   {
463     break-if-!=
464     return 1/true
465   }
466   compare g, 0x27/single-quote
467   {
468     break-if-!=
469     return 1/true
470   }
471   compare g, 0x2a/asterisk
472   {
473     break-if-!=
474     return 1/true
475   }
476   compare g, 0x2b/plus
477   {
478     break-if-!=
479     return 1/true
480   }
481   compare g, 0x2c/comma
482   {
483     break-if-!=
484     return 1/true
485   }
486   compare g, 0x2d/dash  # '-' not allowed in symbols
487   {
488     break-if-!=
489     return 1/true
490   }
491   compare g, 0x2e/period
492   {
493     break-if-!=
494     return 1/true
495   }
496   compare g, 0x2f/slash
497   {
498     break-if-!=
499     return 1/true
500   }
501   compare g, 0x3a/colon
502   {
503     break-if-!=
504     return 1/true
505   }
506   compare g, 0x3b/semi-colon
507   {
508     break-if-!=
509     return 1/true
510   }
511   compare g, 0x3c/less-than
512   {
513     break-if-!=
514     return 1/true
515   }
516   compare g, 0x3d/equal
517   {
518     break-if-!=
519     return 1/true
520   }
521   compare g, 0x3e/greater-than
522   {
523     break-if-!=
524     return 1/true
525   }
526   # '?' (0x3f) is a symbol char, so there is no check for it
527   compare g, 0x40/at-sign
528   {
529     break-if-!=
530     return 1/true
531   }
532   compare g, 0x5c/backslash
533   {
534     break-if-!=
535     return 1/true
536   }
537   compare g, 0x5e/caret
538   {
539     break-if-!=
540     return 1/true
541   }
542   # '_' (0x5f) is a symbol char, so there is no check for it
543   compare g, 0x7c/vertical-line
544   {
545     break-if-!=
546     return 1/true
547   }
548   compare g, 0x7e/tilde
549   {
550     break-if-!=
551     return 1/true
552   }
      # not an operator grapheme
553   return 0/false
554 }
555 
556 fn number-token? _in: (addr cell) -> _/eax: boolean {
      # Return true if the first grapheme of the token's text-data is a
      # decimal digit. Only the first grapheme is inspected.
      # The text-data stream is rewound before reading.
557   var in/eax: (addr cell) <- copy _in
558   var in-data-ah/eax: (addr handle stream byte) <- get in, text-data
559   var in-data/eax: (addr stream byte) <- lookup *in-data-ah
560   rewind-stream in-data
561   var g/eax: grapheme <- read-grapheme in-data
562   var result/eax: boolean <- decimal-digit? g
563   return result
564 }
565 
566 fn bracket-token? _in: (addr cell) -> _/eax: boolean {
      # Return true if the first grapheme of the token's text-data is a
      # bracket according to bracket-grapheme?. Only the first grapheme is
      # inspected. The text-data stream is rewound before reading.
567   var in/eax: (addr cell) <- copy _in
568   var in-data-ah/eax: (addr handle stream byte) <- get in, text-data
569   var in-data/eax: (addr stream byte) <- lookup *in-data-ah
570   rewind-stream in-data
571   var g/eax: grapheme <- read-grapheme in-data
572   var result/eax: boolean <- bracket-grapheme? g
573   return result
574 }
575 
576 fn quote-token? _in: (addr cell) -> _/eax: boolean {
      # Return true if the first grapheme of the token's text-data is a
      # single quote (0x27). Unlike the paren and dot predicates, this does
      # not check that the token contains nothing else.
      # The text-data stream is rewound before reading.
577   var in/eax: (addr cell) <- copy _in
578   var in-data-ah/eax: (addr handle stream byte) <- get in, text-data
579   var in-data/eax: (addr stream byte) <- lookup *in-data-ah
580   rewind-stream in-data
581   var g/eax: grapheme <- read-grapheme in-data
582   compare g, 0x27/single-quote
583   {
584     break-if-!=
585     return 1/true
586   }
587   return 0/false
588 }
589 
590 fn open-paren-token? _in: (addr cell) -> _/eax: boolean {
      # Return true if the token's text-data is exactly "(": the first
      # grapheme is an open paren and nothing remains in the stream after it.
      # The text-data stream is rewound before reading.
591   var in/eax: (addr cell) <- copy _in
592   var in-data-ah/eax: (addr handle stream byte) <- get in, text-data
593   var _in-data/eax: (addr stream byte) <- lookup *in-data-ah
      # keep the stream in ecx, since eax is reused for g and the result
594   var in-data/ecx: (addr stream byte) <- copy _in-data
595   rewind-stream in-data
596   var g/eax: grapheme <- read-grapheme in-data
597   compare g, 0x28/open-paren
598   {
599     break-if-!=
        # first grapheme matches; the token qualifies only if it has no more data
600     var result/eax: boolean <- stream-empty? in-data
601     return result
602   }
603   return 0/false
604 }
605 
606 fn close-paren-token? _in: (addr cell) -> _/eax: boolean {
      # Return true if the token's text-data is exactly ")": the first
      # grapheme is a close paren and nothing remains in the stream after it.
      # The text-data stream is rewound before reading.
607   var in/eax: (addr cell) <- copy _in
608   var in-data-ah/eax: (addr handle stream byte) <- get in, text-data
609   var _in-data/eax: (addr stream byte) <- lookup *in-data-ah
      # keep the stream in ecx, since eax is reused for g and the result
610   var in-data/ecx: (addr stream byte) <- copy _in-data
611   rewind-stream in-data
612   var g/eax: grapheme <- read-grapheme in-data
613   compare g, 0x29/close-paren
614   {
615     break-if-!=
        # first grapheme matches; the token qualifies only if it has no more data
616     var result/eax: boolean <- stream-empty? in-data
617     return result
618   }
619   return 0/false
620 }
621 
622 fn dot-token? _in: (addr cell) -> _/eax: boolean {
      # Return true if the token's text-data is exactly ".": the first
      # grapheme is a period (0x2e) and nothing remains in the stream after it.
      # The text-data stream is rewound before reading.
623   var in/eax: (addr cell) <- copy _in
624   var in-data-ah/eax: (addr handle stream byte) <- get in, text-data
625   var _in-data/eax: (addr stream byte) <- lookup *in-data-ah
      # keep the stream in ecx, since eax is reused for g and the result
626   var in-data/ecx: (addr stream byte) <- copy _in-data
627   rewind-stream in-data
628   var g/eax: grapheme <- read-grapheme in-data
629   compare g, 0x2e/dot
630   {
631     break-if-!=
        # first grapheme matches; the token qualifies only if it has no more data
632     var result/eax: boolean <- stream-empty? in-data
633     return result
634   }
635   return 0/false
636 }
637 
638 fn test-dot-token {
      # A cell created by new-symbol from "." should be accepted by dot-token?.
639   var tmp-storage: (handle cell)
640   var tmp-ah/eax: (addr handle cell) <- address tmp-storage
641   new-symbol tmp-ah, "."
642   var tmp/eax: (addr cell) <- lookup *tmp-ah
643   var result/eax: boolean <- dot-token? tmp
644   check result, "F - test-dot-token"
645 }
645 }