https://github.com/akkartik/mu/blob/main/shell/tokenize.mu
  1 # We reuse the cell data structure for tokenization
  2 # Token cells are special, though. They have no type, they're always atoms,
  3 # they always have text-data.
  4 
  5 fn tokenize in: (addr gap-buffer), out: (addr stream cell), trace: (addr trace) {
  6   trace-text trace, "read", "tokenize"
  7   trace-lower trace
  8   rewind-gap-buffer in
  9   var token-storage: cell
 10   var token/edx: (addr cell) <- address token-storage
 11   {
 12     skip-whitespace-from-gap-buffer in
 13     var done?/eax: boolean <- gap-buffer-scan-done? in
 14     compare done?, 0/false
 15     break-if-!=
 16     # initialize token data each iteration to avoid aliasing
 17     var dest-ah/eax: (addr handle stream byte) <- get token, text-data
 18     populate-stream dest-ah, 0x40/max-token-size
 19     #
 20     next-token in, token, trace
 21     var error?/eax: boolean <- has-errors? trace
 22     compare error?, 0/false
 23     {
 24       break-if-=
 25       return
 26     }
 27     write-to-stream out, token  # shallow-copy text-data
 28     loop
 29   }
 30   trace-higher trace
 31 }
 32 
 33 fn next-token in: (addr gap-buffer), _out-cell: (addr cell), trace: (addr trace) {
 34   trace-text trace, "read", "next-token"
 35   trace-lower trace
 36   var out-cell/eax: (addr cell) <- copy _out-cell
 37   var out-ah/eax: (addr handle stream byte) <- get out-cell, text-data
 38   var _out/eax: (addr stream byte) <- lookup *out-ah
 39   var out/edi: (addr stream byte) <- copy _out
 40   $next-token:body: {
 41     clear-stream out
 42     skip-whitespace-from-gap-buffer in
 43     var g/eax: grapheme <- peek-from-gap-buffer in
 44     {
 45       var stream-storage: (stream byte 0x40)
 46       var stream/esi: (addr stream byte) <- address stream-storage
 47       write stream, "next: "
 48       var gval/eax: int <- copy g
 49       write-int32-hex stream, gval
 50       trace trace, "read", stream
 51     }
 52     # digit
 53     {
 54       var digit?/eax: boolean <- decimal-digit? g
 55       compare digit?, 0/false
 56       break-if-=
 57       next-number-token in, out, trace
 58       break $next-token:body
 59     }
 60     # other symbol char
 61     {
 62       var symbol?/eax: boolean <- symbol-grapheme? g
 63       compare symbol?, 0/false
 64       break-if-=
 65       next-symbol-token in, out, trace
 66       break $next-token:body
 67     }
 68     # brackets are always single-char tokens
 69     {
 70       var bracket?/eax: boolean <- bracket-grapheme? g
 71       compare bracket?, 0/false
 72       break-if-=
 73       var g/eax: grapheme <- read-from-gap-buffer in
 74       next-bracket-token g, out, trace
 75       break $next-token:body
 76     }
 77     # non-symbol operators
 78     {
 79       var operator?/eax: boolean <- operator-grapheme? g
 80       compare operator?, 0/false
 81       break-if-=
 82       next-operator-token in, out, trace
 83       break $next-token:body
 84     }
 85   }
 86   trace-higher trace
 87   var stream-storage: (stream byte 0x40)
 88   var stream/eax: (addr stream byte) <- address stream-storage
 89   write stream, "=> "
 90   rewind-stream out
 91   write-stream stream, out
 92   trace trace, "read", stream
 93 }
 94 
 95 fn next-symbol-token in: (addr gap-buffer), out: (addr stream byte), trace: (addr trace) {
 96   trace-text trace, "read", "looking for a symbol"
 97   trace-lower trace
 98   $next-symbol-token:loop: {
 99     var done?/eax: boolean <- gap-buffer-scan-done? in
100     compare done?, 0/false
101     break-if-!=
102     var g/eax: grapheme <- peek-from-gap-buffer in
103     {
104       var stream-storage: (stream byte 0x40)
105       var stream/esi: (addr stream byte) <- address stream-storage
106       write stream, "next: "
107       var gval/eax: int <- copy g
108       write-int32-hex stream, gval
109       trace trace, "read", stream
110     }
111     # if non-symbol, return
112     {
113       var symbol-grapheme?/eax: boolean <- symbol-grapheme? g
114       compare symbol-grapheme?, 0/false
115       break-if-!=
116       trace-text trace, "read", "stop"
117       break $next-symbol-token:loop
118     }
119     var g/eax: grapheme <- read-from-gap-buffer in
120     write-grapheme out, g
121     loop
122   }
123   trace-higher trace
124   var stream-storage: (stream byte 0x40)
125   var stream/esi: (addr stream byte) <- address stream-storage
126   write stream, "=> "
127   rewind-stream out
128   write-stream stream, out
129   trace trace, "read", stream
130 }
131 
# Append graphemes from 'in' to 'out' for as long as they are operator
# graphemes, stopping at the first non-operator grapheme (left unconsumed)
# or when 'in' is exhausted. Each step and the final token are traced.
fn next-operator-token in: (addr gap-buffer), out: (addr stream byte), trace: (addr trace) {
  trace-text trace, "read", "looking for a operator"
  trace-lower trace
  $next-operator-token:scan: {
    # out of input?
    var at-end?/eax: boolean <- gap-buffer-scan-done? in
    compare at-end?, 0/false
    break-if-!=
    # look at the next grapheme without consuming it
    var c/eax: grapheme <- peek-from-gap-buffer in
    {
      var msg-storage: (stream byte 0x40)
      var msg/esi: (addr stream byte) <- address msg-storage
      write msg, "next: "
      var c-val/eax: int <- copy c
      write-int32-hex msg, c-val
      trace trace, "read", msg
    }
    # a non-operator grapheme ends the token
    {
      var part-of-operator?/eax: boolean <- operator-grapheme? c
      compare part-of-operator?, 0/false
      break-if-!=
      trace-text trace, "read", "stop"
      break $next-operator-token:scan
    }
    # consume it and add it to the token
    var c/eax: grapheme <- read-from-gap-buffer in
    write-grapheme out, c
    loop
  }
  trace-higher trace
  # trace the finished token
  var msg-storage: (stream byte 0x40)
  var msg/esi: (addr stream byte) <- address msg-storage
  write msg, "=> "
  rewind-stream out
  write-stream msg, out
  trace trace, "read", msg
}
168 
# Append decimal digits from 'in' to 'out' until a non-symbol grapheme
# (left unconsumed) or the end of 'in'.
# A symbol grapheme that is not a decimal digit makes the number invalid: an
# error is recorded in 'trace' and we return immediately (without the
# closing trace-higher).
fn next-number-token in: (addr gap-buffer), out: (addr stream byte), trace: (addr trace) {
  trace-text trace, "read", "looking for a number"
  trace-lower trace
  $next-number-token:scan: {
    # out of input?
    var at-end?/eax: boolean <- gap-buffer-scan-done? in
    compare at-end?, 0/false
    break-if-!=
    # look at the next grapheme without consuming it
    var c/eax: grapheme <- peek-from-gap-buffer in
    {
      var msg-storage: (stream byte 0x40)
      var msg/esi: (addr stream byte) <- address msg-storage
      write msg, "next: "
      var c-val/eax: int <- copy c
      write-int32-hex msg, c-val
      trace trace, "read", msg
    }
    # a non-symbol grapheme ends the number
    {
      var part-of-symbol?/eax: boolean <- symbol-grapheme? c
      compare part-of-symbol?, 0/false
      break-if-!=
      trace-text trace, "read", "stop"
      break $next-number-token:scan
    }
    # a symbol grapheme that isn't a digit can't appear in a number
    {
      var is-digit?/eax: boolean <- decimal-digit? c
      compare is-digit?, 0/false
      break-if-!=
      error trace, "invalid number"
      return
    }
    # consume the digit and add it to the token
    trace-text trace, "read", "append"
    var c/eax: grapheme <- read-from-gap-buffer in
    write-grapheme out, c
    loop
  }
  trace-higher trace
}
208 
# Emit the single-grapheme bracket token 'g' to 'out', and trace it.
# 'g' is passed in by the caller; nothing is read from the input here.
fn next-bracket-token g: grapheme, out: (addr stream byte), trace: (addr trace) {
  trace-text trace, "read", "bracket"
  write-grapheme out, g
  # trace the finished token
  var msg-storage: (stream byte 0x40)
  var msg/esi: (addr stream byte) <- address msg-storage
  write msg, "=> "
  rewind-stream out
  write-stream msg, out
  trace trace, "read", msg
}
219 
# Can 'g' be part of a symbol?
# Returns false for whitespace, quotes, brackets and the punctuation listed
# below; true for every other grapheme.
fn symbol-grapheme? g: grapheme -> _/eax: boolean {
  {
    # Each match below leaves this block and falls out to 'return 0/false'.
    ## whitespace
    compare g, 9/tab
    break-if-=
    compare g, 0xa/newline
    break-if-=
    compare g, 0x20/space
    break-if-=
    ## quotes
    compare g, 0x22/double-quote
    break-if-=
    compare g, 0x60/backquote
    break-if-=
    ## brackets
    compare g, 0x28/open-paren
    break-if-=
    compare g, 0x29/close-paren
    break-if-=
    compare g, 0x5b/open-square-bracket
    break-if-=
    compare g, 0x5d/close-square-bracket
    break-if-=
    compare g, 0x7b/open-curly-bracket
    break-if-=
    compare g, 0x7d/close-curly-bracket
    break-if-=
    ## other punctuation
    # '!' is a symbol char
    compare g, 0x23/hash
    break-if-=
    # '$' is a symbol char
    compare g, 0x25/percent
    break-if-=
    compare g, 0x26/ampersand
    break-if-=
    compare g, 0x27/single-quote
    break-if-=
    compare g, 0x2a/asterisk
    break-if-=
    compare g, 0x2b/plus
    break-if-=
    compare g, 0x2c/comma
    break-if-=
    compare g, 0x2d/dash  # '-' not allowed in symbols
    break-if-=
    compare g, 0x2e/period
    break-if-=
    compare g, 0x2f/slash
    break-if-=
    compare g, 0x3a/colon
    break-if-=
    compare g, 0x3b/semi-colon
    break-if-=
    compare g, 0x3c/less-than
    break-if-=
    compare g, 0x3d/equal
    break-if-=
    compare g, 0x3e/greater-than
    break-if-=
    # '?' is a symbol char
    compare g, 0x40/at-sign
    break-if-=
    compare g, 0x5c/backslash
    break-if-=
    compare g, 0x5e/caret
    break-if-=
    # '_' is a symbol char
    compare g, 0x7c/vertical-line
    break-if-=
    compare g, 0x7e/tilde
    break-if-=
    # nothing matched
    return 1/true
  }
  return 0/false
}
386 
# Is 'g' one of the six bracket characters: ( ) [ ] { } ?
fn bracket-grapheme? g: grapheme -> _/eax: boolean {
  {
    # Each match below leaves this block and falls out to 'return 1/true'.
    compare g, 0x28/open-paren
    break-if-=
    compare g, 0x29/close-paren
    break-if-=
    compare g, 0x5b/open-square-bracket
    break-if-=
    compare g, 0x5d/close-square-bracket
    break-if-=
    compare g, 0x7b/open-curly-bracket
    break-if-=
    compare g, 0x7d/close-curly-bracket
    break-if-=
    # nothing matched
    return 0/false
  }
  return 1/true
}
420 
# Is 'g' one of the punctuation characters that make up operator tokens?
# Returns true for exactly the characters listed below and false for
# everything else.
#
# The percent case used to be written 'return 1/false': the value was true
# but the label claimed false. With a single 'return 1/true' shared by every
# case, the value and its label can no longer disagree.
fn operator-grapheme? g: grapheme -> _/eax: boolean {
  {
    # Each match below leaves this block and falls out to 'return 1/true'.
    # '$' is a symbol char
    compare g, 0x25/percent
    break-if-=
    compare g, 0x26/ampersand
    break-if-=
    compare g, 0x27/single-quote
    break-if-=
    compare g, 0x2a/asterisk
    break-if-=
    compare g, 0x2b/plus
    break-if-=
    compare g, 0x2c/comma
    break-if-=
    compare g, 0x2d/dash  # '-' not allowed in symbols
    break-if-=
    compare g, 0x2e/period
    break-if-=
    compare g, 0x2f/slash
    break-if-=
    compare g, 0x3a/colon
    break-if-=
    compare g, 0x3b/semi-colon
    break-if-=
    compare g, 0x3c/less-than
    break-if-=
    compare g, 0x3d/equal
    break-if-=
    compare g, 0x3e/greater-than
    break-if-=
    # '?' is a symbol char
    compare g, 0x40/at-sign
    break-if-=
    compare g, 0x5c/backslash
    break-if-=
    compare g, 0x5e/caret
    break-if-=
    # '_' is a symbol char
    compare g, 0x7c/vertical-line
    break-if-=
    compare g, 0x7e/tilde
    break-if-=
    # nothing matched
    return 0/false
  }
  return 1/true
}
522 
# Does the token in '_in' start with a decimal digit?
# Side effect: the token's text-data stream is rewound, then read from.
fn number-token? _in: (addr cell) -> _/eax: boolean {
  var token/eax: (addr cell) <- copy _in
  var text-ah/eax: (addr handle stream byte) <- get token, text-data
  var text/eax: (addr stream byte) <- lookup *text-ah
  # only the first grapheme decides
  rewind-stream text
  var first/eax: grapheme <- read-grapheme text
  var digit?/eax: boolean <- decimal-digit? first
  return digit?
}
532 
# Does the token in '_in' start with a bracket grapheme?
# Side effect: the token's text-data stream is rewound, then read from.
fn bracket-token? _in: (addr cell) -> _/eax: boolean {
  var token/eax: (addr cell) <- copy _in
  var text-ah/eax: (addr handle stream byte) <- get token, text-data
  var text/eax: (addr stream byte) <- lookup *text-ah
  # only the first grapheme decides
  rewind-stream text
  var first/eax: grapheme <- read-grapheme text
  var bracket?/eax: boolean <- bracket-grapheme? first
  return bracket?
}
542 
# Does the token in '_in' start with a single-quote?
# Side effect: the token's text-data stream is rewound, then read from.
fn quote-token? _in: (addr cell) -> _/eax: boolean {
  var token/eax: (addr cell) <- copy _in
  var text-ah/eax: (addr handle stream byte) <- get token, text-data
  var text/eax: (addr stream byte) <- lookup *text-ah
  # only the first grapheme decides
  rewind-stream text
  var first/eax: grapheme <- read-grapheme text
  compare first, 0x27/single-quote
  {
    break-if-=
    return 0/false
  }
  return 1/true
}
556 
# Does the token in '_in' start with '('?
# Side effect: the token's text-data stream is rewound, then read from.
fn open-paren-token? _in: (addr cell) -> _/eax: boolean {
  var token/eax: (addr cell) <- copy _in
  var text-ah/eax: (addr handle stream byte) <- get token, text-data
  var text/eax: (addr stream byte) <- lookup *text-ah
  # only the first grapheme decides
  rewind-stream text
  var first/eax: grapheme <- read-grapheme text
  compare first, 0x28/open-paren
  {
    break-if-=
    return 0/false
  }
  return 1/true
}
570 
# Does the token in '_in' start with ')'?
# Side effect: the token's text-data stream is rewound, then read from.
fn close-paren-token? _in: (addr cell) -> _/eax: boolean {
  var in/eax: (addr cell) <- copy _in
  var in-data-ah/eax: (addr handle stream byte) <- get in, text-data
  var in-data/eax: (addr stream byte) <- lookup *in-data-ah
  # only the first grapheme decides
  rewind-stream in-data
  var g/eax: grapheme <- read-grapheme in-data
  # 0x29 is ')'; the label used to say open-paren
  compare g, 0x29/close-paren
  {
    break-if-!=
    return 1/true
  }
  return 0/false
}
583 }