https://github.com/akkartik/mu/blob/main/shell/parenthesize.mu
  1 ## insert explicit parens based on indentation
  2 
  3 # Design goals:
  4 #  keywords in other languages should look different from functions: def, if, while, etc.
  5 #  fully-parenthesized expressions should not be messed with
  6 #    ignore indent when lines start with parens
  7 #    ignore indent inside parens
  8 #    no modes to disable this pass
  9 #  introduce no new operators
 10 #    the language doesn't use nested lists like Scheme's `cond`
 11 #    lines with one word are never wrapped in parens
 12 #  encourage macros to explicitly insert all parens
 13 #    ignore indent inside backquote
 14 
 15 fn parenthesize in: (addr stream token), out: (addr stream token), trace: (addr trace) {
 16   trace-text trace, "parenthesize", "insert parens"
 17   trace-lower trace
 18   var buffer-storage: (stream token 0x40)
 19   var buffer/edi: (addr stream token) <- address buffer-storage
 20   var curr-line-indent: int
 21   var num-words-in-line: int
 22   var paren-at-start-of-line?: boolean
 23   var explicit-open-parens-storage: int
 24   var explicit-open-parens/ebx: (addr int) <- address explicit-open-parens-storage
 25   var implicit-open-parens-storage: int-stack
 26   var implicit-open-parens/esi: (addr int-stack) <- address implicit-open-parens-storage
 27   initialize-int-stack implicit-open-parens, 0x10  # potentially a major memory leak
 28   rewind-stream in
 29   {
 30     var done?/eax: boolean <- stream-empty? in
 31     compare done?, 0/false
 32     break-if-!=
 33     #
 34     var curr-token-storage: token
 35     var curr-token/ecx: (addr token) <- address curr-token-storage
 36     read-from-stream in, curr-token
 37 #?     dump-token-from-cursor curr-token
 38     # update state
 39     {
 40       var is-indent?/eax: boolean <- indent-token? curr-token
 41       compare is-indent?, 0/false
 42       break-if-=
 43       copy-to num-words-in-line, 0
 44       copy-to paren-at-start-of-line?, 0/false
 45       var tmp/eax: int <- indent-level curr-token
 46       copy-to curr-line-indent, tmp
 47     }
 48     {
 49       var is-word?/eax: boolean <- word-token? curr-token
 50       compare is-word?, 0/false
 51       break-if-=
 52       increment num-words-in-line
 53     }
 54     {
 55       compare num-words-in-line, 0
 56       break-if-!=
 57       var is-open?/eax: boolean <- open-paren-token? curr-token
 58       compare is-open?, 0/false
 59       break-if-=
 60       copy-to paren-at-start-of-line?, 1/true
 61     }
 62     #
 63     $parenthesize:emit: {
 64       {
 65         compare paren-at-start-of-line?, 0/false
 66         break-if-=
 67 #?         draw-text-wrapping-right-then-down-from-cursor-over-full-screen 0/screen "A", 7/fg 0/bg
 68         emit-all buffer, curr-token, out, explicit-open-parens
 69         break $parenthesize:emit
 70       }
 71       {
 72         var is-indent?/eax: boolean <- indent-token? curr-token
 73         compare is-indent?, 0/false
 74         break-if-=
 75 #?         draw-text-wrapping-right-then-down-from-cursor-over-full-screen 0/screen "B", 7/fg 0/bg
 76         emit-all buffer, curr-token, out, explicit-open-parens
 77         break $parenthesize:emit
 78       }
 79       {
 80         compare num-words-in-line, 2
 81         break-if->=
 82 #?         draw-text-wrapping-right-then-down-from-cursor-over-full-screen 0/screen "C", 7/fg 0/bg
 83         write-to-stream buffer, curr-token
 84         break $parenthesize:emit
 85       }
 86       {
 87         compare num-words-in-line, 2
 88         break-if-!=
 89         var is-word?/eax: boolean <- word-token? curr-token
 90         compare is-word?, 0/false
 91         break-if-=
 92         compare *explicit-open-parens, 0
 93         break-if-!=
 94 #?         draw-text-wrapping-right-then-down-from-cursor-over-full-screen 0/screen "(\n", 7/fg 0/bg
 95         var paren-storage: token
 96         var paren-token/eax: (addr token) <- address paren-storage
 97         initialize-token paren-token, "("
 98         write-to-stream out, paren-token
 99         push-int-stack implicit-open-parens, curr-line-indent
100       }
101       emit-all buffer, curr-token, out, explicit-open-parens
102     }
103     {
104       var is-indent?/eax: boolean <- indent-token? curr-token
105       compare is-indent?, 0/false
106       break-if-=
107       {
108         # . loop check
109         var done?/eax: boolean <- int-stack-empty? implicit-open-parens
110         compare done?, 0/false
111         break-if-!=
112         var top-indent/eax: int <- int-stack-top implicit-open-parens
113         compare top-indent, curr-line-indent
114         break-if-<
115         # . loop body
116         var paren-storage: token
117         var paren-token/eax: (addr token) <- address paren-storage
118         initialize-token paren-token, ")"
119         write-to-stream out, paren-token
120         # . update
121         var dummy/eax: int <- pop-int-stack implicit-open-parens
122         loop
123       }
124     }
125     loop
126   }
127   emit-all buffer, 0/no-curr-token, out, explicit-open-parens
128   {
129     # . loop check
130     var done?/eax: boolean <- int-stack-empty? implicit-open-parens
131     compare done?, 0/false
132     break-if-!=
133     # . loop body
134     var paren-storage: token
135     var paren-token/eax: (addr token) <- address paren-storage
136     initialize-token paren-token, ")"
137     write-to-stream out, paren-token
138     # . update
139     var dummy/eax: int <- pop-int-stack implicit-open-parens
140     loop
141   }
142   trace-higher trace
143 }
144 
# Return the int stored in a token's 'number-data' field.
# parenthesize calls this on indent tokens to learn the indent of a line.
fn indent-level _in: (addr token) -> _/eax: int {
  var tok/eax: (addr token) <- copy _in
  var level-addr/eax: (addr int) <- get tok, number-data
  return *level-addr
}
150 
# Report whether 'in' counts as a word when parenthesize counts the words on a line.
# Everything is a word except indent tokens, brackets, and the prefix tokens
# quote, backquote, unquote and unquote-splice.
fn word-token? in: (addr token) -> _/eax: boolean {
  var indent?/eax: boolean <- indent-token? in
  compare indent?, 0/false
  {
    break-if-=
    return 0/false
  }
  var bracket?/eax: boolean <- bracket-token? in  # overzealously checks for [], but shouldn't ever encounter it
  compare bracket?, 0/false
  {
    break-if-=
    return 0/false
  }
  var quote?/eax: boolean <- quote-token? in
  compare quote?, 0/false
  {
    break-if-=
    return 0/false
  }
  var backquote?/eax: boolean <- backquote-token? in
  compare backquote?, 0/false
  {
    break-if-=
    return 0/false
  }
  var unquote?/eax: boolean <- unquote-token? in
  compare unquote?, 0/false
  {
    break-if-=
    return 0/false
  }
  var splice?/eax: boolean <- unquote-splice-token? in
  compare splice?, 0/false
  {
    break-if-=
    return 0/false
  }
  # none of the exclusions matched
  return 1/true
}
175 
# Send every token buffered in 'first' through emit, empty the buffer, then
# send 'second' through emit as well.
#
#   first: buffer to drain; rewound, read to its end, then cleared
#   second: one more token to emit afterwards, or 0 for none
#   out: destination stream
#   explicit-open-parens: count of open parens, kept up to date by emit
fn emit-all first: (addr stream token), second: (addr token), out: (addr stream token), explicit-open-parens: (addr int) {
  rewind-stream first
  {
    var drained?/eax: boolean <- stream-empty? first
    compare drained?, 0/false
    break-if-!=
    var buffered-storage: token
    var buffered/eax: (addr token) <- address buffered-storage
    read-from-stream first, buffered
    emit buffered, out, explicit-open-parens
    loop
  }
  clear-stream first
  # a null 'second' means there is nothing more to send
  compare second, 0
  {
    break-if-=
    emit second, out, explicit-open-parens
  }
}
195 
# Append token 't' to 'out' unless it is an indent token, and keep
# *explicit-open-parens in step: +1 for an open paren, -1 for a close paren.
# Aborts with "emit: extra ')'" if the count drops below 0.
fn emit t: (addr token), out: (addr stream token), explicit-open-parens: (addr int) {
  # indent tokens never reach the output
  var indent?/eax: boolean <- indent-token? t
  compare indent?, 0/false
  {
    break-if-=
    return
  }
  write-to-stream out, t
  # the counter has to be in a register before it can be dereferenced
  var depth/edi: (addr int) <- copy explicit-open-parens
  {
    var open?/eax: boolean <- open-paren-token? t
    compare open?, 0/false
    break-if-=
    increment *depth
  }
  {
    var close?/eax: boolean <- close-paren-token? t
    compare close?, 0/false
    break-if-=
    decrement *depth
    # more closes than opens
    compare *depth, 0
    break-if->=
    abort "emit: extra ')'"
  }
}
221 
# Helper for checking parenthesize: copy 'in' to 'out', leaving out skip
# tokens and indent tokens. 'in' is rewound first.
fn emit-salient-tokens in: (addr stream token), out: (addr stream token) {
  rewind-stream in
  {
    var exhausted?/eax: boolean <- stream-empty? in
    compare exhausted?, 0/false
    break-if-!=
    var tok-storage: token
    var tok/edx: (addr token) <- address tok-storage
    read-from-stream in, tok
    {
      # drop skip tokens
      var skip?/eax: boolean <- skip-token? tok
      compare skip?, 0/false
      break-if-!=
      # drop indent tokens
      var indent?/eax: boolean <- indent-token? tok
      compare indent?, 0/false
      break-if-!=
      # anything else is salient
      write-to-stream out, tok  # shallow copy
    }
    loop
  }
}
245 
# Main table of cases for parenthesize.
# Each check-parenthesize call gives: the source text to tokenize and
# parenthesize, the text whose tokens the result must match, and the message
# to print on failure.
fn test-parenthesize {
  # a line with several words gets wrapped, including parens it already contains
  check-parenthesize "a b c  ", "(a b c)", "F - test-parenthesize/1"
  check-parenthesize "a (b)", "(a (b))", "F - test-parenthesize/2"
  check-parenthesize "a (b c)", "(a (b c))", "F - test-parenthesize/3"
  check-parenthesize "a (b c) d", "(a (b c) d)", "F - test-parenthesize/4"
  # lines at the same indent become siblings; more-indented lines nest inside
  check-parenthesize "a b c\nd ef", "(a b c) (d ef)", "F - test-parenthesize/5-multiple-lines"
  check-parenthesize "a b c\n  d ef", "(a b c (d ef))", "F - test-parenthesize/6-indented"
  check-parenthesize "a b c\n  (d ef)", "(a b c (d ef))", "F - test-parenthesize/7-indented"
  check-parenthesize "a b c\n  (d ef)\n  g", "(a b c (d ef) g)", "F - test-parenthesize/8-indented"
  check-parenthesize "a b c\n  d e\n    f\ny", "(a b c (d e f)) y", "F - test-parenthesize/9-indented"
  # comments and blank lines don't disturb grouping
  check-parenthesize "#a\na b", "(a b)", "F - test-parenthesize/10-initial-comment"
  # (disabled lines below: a sketch of the input layout, then earlier variants of case 11)
#? a b c
#?     d ef
#? 
#?   g
#?   check-parenthesize "a b c\n    d ef\n\n  g", "(a b c (d ef) g)", "F - test-parenthesize/11-comments"
#?   check-parenthesize "a b c\n    d ef\n\n  g #abc", "(a b c (d ef)) g", "F - test-parenthesize/11-comments"
  check-parenthesize "a b c\n    d ef\n\n  g #abc", "(a b c (d ef) g)", "F - test-parenthesize/11-comments"
  # (disabled lines below: a sketch of the input layout, then an earlier variant of case 12)
#? a b c
#?   '(d ef)
#? 
#?   g #abc
#?   check-parenthesize "a b c\n  '(d ef)\n  g #abc", "(a b c '(d ef) g)", "F - test-parenthesize/12-quotes-and-comments"
  check-parenthesize "a b c\n  '(d ef)\n\n  g #abc", "(a b c '(d ef) g)", "F - test-parenthesize/12-quotes-and-comments"
  # the first line may itself be indented
  check-parenthesize "  a b c", "(a b c)", "F - test-parenthesize/13-initial-indent"
  check-parenthesize "    a b c\n  34", "(a b c) 34", "F - test-parenthesize/14-initial-indent"
  # outdenting closes the groups opened at deeper or equal indent
  check-parenthesize "def foo\n    a b c\n  d e\nnewdef", "(def foo (a b c) (d e)) newdef", "F - test-parenthesize/14"
  check-parenthesize "  a a\n    a\ny", "(a a a) y", "F - test-parenthesize/15-group-before-too-much-outdent"
  # quote-like prefixes stay attached to what follows and aren't counted as words
  check-parenthesize "a `(b c)", "(a `(b c))", "F - test-parenthesize/16-backquote"
  check-parenthesize "'a b c", "('a b c)", "F - test-parenthesize/17-quote"
  check-parenthesize ",a b c", "(,a b c)", "F - test-parenthesize/18-unquote"
  check-parenthesize ",@a b c", "(,@a b c)", "F - test-parenthesize/19-unquote-splice"
  check-parenthesize "a b\n  'c\n  ,d\n  e", "(a b 'c ,d e)", "F - test-parenthesize/20-quotes-are-not-words"
  # a comment line between a group's lines doesn't end the group
  check-parenthesize "def foo\n#a b c\n  d e\nnew", "(def foo (d e)) new", "F - test-parenthesize/21-group-across-comments"
}
281 
# Cases where the input already carries its own parens: no parens get inserted
# inside them, regardless of how the lines within are indented.
fn test-parenthesize-skips-lines-with-initial-parens {
  # single lines that start with an open paren come out unchanged
  check-parenthesize "(a b c)", "(a b c)", "F - test-parenthesize-skips-lines-with-initial-parens/1"
  check-parenthesize "(a (b c))", "(a (b c))", "F - test-parenthesize-skips-lines-with-initial-parens/2"
  check-parenthesize "(a () b)", "(a () b)", "F - test-parenthesize-skips-lines-with-initial-parens/3"
  check-parenthesize "  (a b c)", "(a b c)", "F - test-parenthesize-skips-lines-with-initial-parens/initial-indent"
  # indentation inside an open explicit paren has no effect
  check-parenthesize "(a b c\n  bc\n    def\n  gh)", "(a b c bc def gh)", "F - test-parenthesize-skips-lines-with-initial-parens/outdent"
  check-parenthesize "(a b c\n  (def gh)\n    (i j k)\n  lm\n\n\n    (no p))", "(a b c (def gh) (i j k) lm (no p))", "F - test-parenthesize-skips-lines-with-initial-parens/fully-parenthesized"
  # unquote prefixes ahead of the open paren still count as starting with a paren
  check-parenthesize ",(a b c)", ",(a b c)", "F - test-parenthesize-skips-lines-with-initial-parens/after-unquote"
  check-parenthesize ",@(a b c)", ",@(a b c)", "F - test-parenthesize-skips-lines-with-initial-parens/after-unquote-splice"
  check-parenthesize ",,(a b c)", ",,(a b c)", "F - test-parenthesize-skips-lines-with-initial-parens/after-nested-unquote"
  # comment lines inside explicit parens
  check-parenthesize "(def foo\n    #a b c\n  d e)\nnew", "(def foo d e) new", "F - test-parenthesize-skips-lines-with-initial-parens/across-comment"
  check-parenthesize "`(def foo\n    #a b c\n  d e)\nnew", "`(def foo d e) new", "F - test-parenthesize-skips-lines-with-initial-parens/across-comment-after-backquote"
  check-parenthesize "  (a b c\n    d e)", "(a b c d e)", "F - test-parenthesize-skips-lines-with-initial-parens/with-indent"
  # explicit parens opened mid-line suppress insertion until they close
  check-parenthesize "def foo(a (b)\n    c d)\n  d e\nnew", "(def foo (a (b) c d) (d e)) new", "F - test-parenthesize-skips-lines-with-initial-parens/inside-arg-lists"
}
297 
# Cases showing that a line holding a single word is never wrapped in parens,
# and so never groups the lines indented under it.
fn test-parenthesize-skips-single-word-lines {
  # lines usually get grouped with later indented lines
  check-parenthesize "a b\n  c", "(a b c)", "F - test-parenthesize-skips-single-word-lines/0"
  # but single-word lines don't
  check-parenthesize "a\n  c", "a c", "F - test-parenthesize-skips-single-word-lines/1"
  check-parenthesize "a", "a", "F - test-parenthesize-skips-single-word-lines/2"
  # trailing spaces after the word don't change that
  check-parenthesize "a  \nb\nc", "a b c", "F - test-parenthesize-skips-single-word-lines/3"
}
306 
# Test helper: tokenize and parenthesize the text 'actual', tokenize the text
# 'expected' keeping only its salient tokens, then check that the two token
# streams agree. 'message' is handed on for reporting a failure.
fn check-parenthesize actual: (addr array byte), expected: (addr array byte), message: (addr array byte) {
  # one trace is shared by both tokenizing runs
  var errors-storage: trace
  var errors/edx: (addr trace) <- address errors-storage
  initialize-trace errors, 1/only-errors, 0x10/capacity, 0/visible
  # run the pass under test over 'actual'
  var input-storage: gap-buffer
  var input/eax: (addr gap-buffer) <- address input-storage
  initialize-gap-buffer-with input, actual
  var got-storage: (stream token 0x40)
  var got/esi: (addr stream token) <- address got-storage
  tokenize-and-parenthesize input, got, errors
  # 'expected' is only tokenized, never parenthesized
  var reference-storage: gap-buffer
  var reference/eax: (addr gap-buffer) <- address reference-storage
  initialize-gap-buffer-with reference, expected
  var want-storage: (stream token 0x40)
  var want/edi: (addr stream token) <- address want-storage
  tokenize-salient reference, want, errors
  # compare from the start of both streams
  rewind-stream got
  check-token-streams-data-equal got, want, message
}
329 
# Test helper: walk two token streams in lockstep and report on screen whether
# they hold equal tokens (as judged by tokens-equal?).
#
#   actual, expected: streams to compare; both are rewound first
#   message: printed when the check fails
#
# Outcomes:
#   - streams end together with every pair equal: prints "."
#   - 'actual' ends first: prints message + ": too short", counts a failure
#   - 'expected' ends first: prints message + ": too long", counts a failure
#   - a pair differs: prints message + ": mismatch", counts a failure
# Every failure line ends in a newline so that later output starts on a fresh line.
fn check-token-streams-data-equal actual: (addr stream token), expected: (addr stream token), message: (addr array byte) {
  rewind-stream actual
  rewind-stream expected
  {
    # loop termination checks
    var actual-done?/eax: boolean <- stream-empty? actual
    {
      compare actual-done?, 0/false
      break-if-=
      var expected-done?/eax: boolean <- stream-empty? expected
      compare expected-done?, 0/false
      {
        break-if-!=
        # actual empty, but expected not empty
        draw-text-wrapping-right-then-down-from-cursor-over-full-screen 0/screen, message, 3/fg=cyan 0/bg
        draw-text-wrapping-right-then-down-from-cursor-over-full-screen 0/screen, ": too short\n", 3/fg=cyan 0/bg
        count-test-failure
        return
      }
      # both streams ran out together without a mismatch: success
      draw-text-wrapping-right-then-down-from-cursor-over-full-screen 0/screen, ".", 3/fg=cyan 0/bg
      return
    }
    var expected-done?/eax: boolean <- stream-empty? expected
    compare expected-done?, 0/false
    {
      break-if-=
      # actual not empty, but expected empty
      draw-text-wrapping-right-then-down-from-cursor-over-full-screen 0/screen, message, 3/fg=cyan 0/bg
      draw-text-wrapping-right-then-down-from-cursor-over-full-screen 0/screen, ": too long\n", 3/fg=cyan 0/bg
      count-test-failure
      return
    }
    # loop body
    var curr-token-storage: token
    var curr-token/ecx: (addr token) <- address curr-token-storage
    read-from-stream actual, curr-token
#?     dump-token-from-cursor curr-token
    var expected-token-storage: token
    var expected-token/edx: (addr token) <- address expected-token-storage
    read-from-stream expected, expected-token
#?     dump-token-from-cursor expected-token
    var match?/eax: boolean <- tokens-equal? curr-token, expected-token
    compare match?, 0/false
    {
      break-if-!=
      # tokens at the same position differ; say so and end the line, like the
      # other two failure paths
      draw-text-wrapping-right-then-down-from-cursor-over-full-screen 0/screen, message, 3/fg=cyan 0/bg
      draw-text-wrapping-right-then-down-from-cursor-over-full-screen 0/screen, ": mismatch\n", 3/fg=cyan 0/bg
      count-test-failure
      return
    }
    loop
  }
}
382 
# Tokenize the text in 'in', then run parenthesize over the tokens, writing
# the result to 'out'. If 'trace' reports errors after tokenizing, 'out' is
# left untouched.
fn tokenize-and-parenthesize in: (addr gap-buffer), out: (addr stream token), trace: (addr trace) {
  var raw-storage: (stream token 0x400)
  var raw/edx: (addr stream token) <- address raw-storage
  tokenize in, raw, trace
  var failed?/eax: boolean <- has-errors? trace
  compare failed?, 0/false
  {
    # only go on if tokenizing went cleanly
    break-if-!=
    parenthesize raw, out, trace
  }
}
395 
# Tokenize the text in 'in' and copy just the salient tokens (no skip or
# indent tokens) to 'out'. If 'trace' reports errors after tokenizing, 'out'
# is left untouched.
fn tokenize-salient in: (addr gap-buffer), out: (addr stream token), trace: (addr trace) {
  var raw-storage: (stream token 0x400)
  var raw/edx: (addr stream token) <- address raw-storage
  tokenize in, raw, trace
  var failed?/eax: boolean <- has-errors? trace
  compare failed?, 0/false
  {
    # only go on if tokenizing went cleanly
    break-if-!=
    emit-salient-tokens raw, out
  }
}
407 }