author     Kartik K. Agaram <vc@akkartik.com>    2021-07-28 20:44:02 -0700
committer  Kartik K. Agaram <vc@akkartik.com>    2021-07-28 20:44:02 -0700
commit     bec33a7067288570619c3c12fb0543e61bb562d2
tree       67e58909aeeaad542900ed8fcb334ff1ddd1a577
parent     267c74b59a5f148bd28233f25bc794a3a4893e8e
download   mu-bec33a7067288570619c3c12fb0543e61bb562d2.tar.gz

shell: second notation for string literals
I've always been dissatisfied with the notion of escaping. It introduces
a special-case meta-notation within the tokenizer, and the conventional
approach leads to exponential "leaning toothpick syndrome" with each
level of escaping.
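
For example, with conventional backslash escaping, each additional
level of quoting more than doubles the backslashes:

  "a\"b"          => a"b
  "\"a\\\"b\""    => "a\"b"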

One potential "correct" solution is to keep string terminals
parameterizable:

  [abc]           => abc
  [=]             => =
  [=[abc]=]       => abc
  [=[a]bc]=]      => a]bc
  [==[a]=]bc]==]  => a]=]bc

...and so on. Basically the terminals grow linearly as the number of
escaping levels grows.
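
A scanner for these would just count the '=' signs after the opening
'[' and build the matching terminal. A minimal sketch, in Python purely
for illustration (scan_literal and its signature are made up, not part
of this commit):

  def scan_literal(src, i):          # assumes src[i] == '['
      j = i + 1
      while j < len(src) and src[j] == '=':
          j += 1
      if j > i + 1 and j < len(src) and src[j] == '[':
          close = ']' + '=' * (j - i - 1) + ']'   # ]=], ]==], ...
          j += 1
      else:
          close, j = ']', i + 1                   # plain [abc]
      end = src.index(close, j)      # raises ValueError if unterminated
      return src[j:end], end + len(close)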

While this is workable, I'd like to wait until I actually need it, and
then gauge whether the need is a sign of the stack growing too complex,
with too many layers of notation/parsing. Mu's goal is just 3 notations,
and it's going to require constant vigilance to keep that from growing.

Therefore, for now, there are two notations for string literals, one
symmetric and one balanced:

  "abc"           => abc
  [abc]           => abc

The balanced notation permits nested brackets as long as they balance:
  [abc [def]]     => abc [def]

If you need unbalanced square brackets, use the symmetric terminals:
  "abc [def"      => abc [def

If you need double quotes inside strings, use the balanced notation:
  [abc "def]      => abc "def

If you need _both_ square brackets (whether balanced or unbalanced) and
double quotes, you're currently shit outta luck.
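
The new next-balanced-stream-token in the diff below implements the
balanced form with a simple depth counter. The same loop, sketched in
Python purely for illustration:

  def scan_balanced(src, i):         # assumes src[i-1] was the opening '['
      depth, out = 0, []
      while i < len(src):
          g = src[i]
          i += 1
          if g == '[':
              depth += 1
          elif g == ']':
              if depth == 0:
                  return ''.join(out), i    # matching close bracket
              depth -= 1
          out.append(g)
      raise SyntaxError("unbalanced '['")
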
Diffstat (limited to 'shell')
-rw-r--r--  shell/tokenize.mu  131
1 file changed, 126 insertions(+), 5 deletions(-)
diff --git a/shell/tokenize.mu b/shell/tokenize.mu
index e097e460..2ba391a9 100644
--- a/shell/tokenize.mu
+++ b/shell/tokenize.mu
@@ -248,10 +248,11 @@ fn test-tokenize-dotted-list {
   check close-paren?, "F - test-tokenize-dotted-list: close paren"
 }
 
+# double quotes with zero escaping support
 fn test-tokenize-stream-literal {
   var in-storage: gap-buffer
   var in/esi: (addr gap-buffer) <- address in-storage
-  initialize-gap-buffer-with in, "[abc def]"
+  initialize-gap-buffer-with in, "\"abc def\""
   #
   var stream-storage: (stream token 0x10)
   var stream/edi: (addr stream token) <- address stream-storage
@@ -279,6 +280,69 @@ fn test-tokenize-stream-literal {
   check empty?, "F - test-tokenize-stream-literal: empty?"
 }
 
+# alternative syntax for strings, with balanced brackets
+fn test-tokenize-balanced-stream-literal {
+  var in-storage: gap-buffer
+  var in/esi: (addr gap-buffer) <- address in-storage
+  initialize-gap-buffer-with in, "[abc def]"
+  #
+  var stream-storage: (stream token 0x10)
+  var stream/edi: (addr stream token) <- address stream-storage
+  #
+  var trace-storage: trace
+  var trace/edx: (addr trace) <- address trace-storage
+  initialize-trace trace, 1/only-errors, 0x10/capacity, 0/visible
+  tokenize in, stream, trace
+  #
+  var curr-token-storage: token
+  var curr-token/ebx: (addr token) <- address curr-token-storage
+  read-from-stream stream, curr-token
+  var curr-token-type/eax: (addr int) <- get curr-token, type
+  check-ints-equal *curr-token-type, 3/indent, "F - test-tokenize-balanced-stream-literal/before-indent-type"
+  var curr-token-data/eax: (addr int) <- get curr-token, number-data
+  check-ints-equal *curr-token-data, 0/spaces, "F - test-tokenize-balanced-stream-literal/before-indent"
+  read-from-stream stream, curr-token
+  var stream?/eax: boolean <- stream-token? curr-token
+  check stream?, "F - test-tokenize-balanced-stream-literal: type"
+  var curr-token-data-ah/eax: (addr handle stream byte) <- get curr-token, text-data
+  var curr-token-data/eax: (addr stream byte) <- lookup *curr-token-data-ah
+  var data-equal?/eax: boolean <- stream-data-equal? curr-token-data, "abc def"
+  check data-equal?, "F - test-tokenize-balanced-stream-literal"
+  var empty?/eax: boolean <- stream-empty? stream
+  check empty?, "F - test-tokenize-balanced-stream-literal: empty?"
+}
+
+fn test-tokenize-nested-stream-literal {
+  var in-storage: gap-buffer
+  var in/esi: (addr gap-buffer) <- address in-storage
+  initialize-gap-buffer-with in, "[abc [def]]"
+  #
+  var stream-storage: (stream token 0x10)
+  var stream/edi: (addr stream token) <- address stream-storage
+  #
+  var trace-storage: trace
+  var trace/edx: (addr trace) <- address trace-storage
+  initialize-trace trace, 1/only-errors, 0x10/capacity, 0/visible
+  tokenize in, stream, trace
+  #
+  var curr-token-storage: token
+  var curr-token/ebx: (addr token) <- address curr-token-storage
+  read-from-stream stream, curr-token
+  var curr-token-type/eax: (addr int) <- get curr-token, type
+  check-ints-equal *curr-token-type, 3/indent, "F - test-tokenize-nested-stream-literal/before-indent-type"
+  var curr-token-data/eax: (addr int) <- get curr-token, number-data
+  check-ints-equal *curr-token-data, 0/spaces, "F - test-tokenize-nested-stream-literal/before-indent"
+  read-from-stream stream, curr-token
+  var stream?/eax: boolean <- stream-token? curr-token
+  check stream?, "F - test-tokenize-nested-stream-literal: type"
+  var curr-token-data-ah/eax: (addr handle stream byte) <- get curr-token, text-data
+  var curr-token-data/eax: (addr stream byte) <- lookup *curr-token-data-ah
+  var data-equal?/eax: boolean <- stream-data-equal? curr-token-data, "abc [def]"
+  check data-equal?, "F - test-tokenize-nested-stream-literal"
+  var empty?/eax: boolean <- stream-empty? stream
+  check empty?, "F - test-tokenize-nested-stream-literal: empty?"
+}
+
 fn test-tokenize-stream-literal-in-tree {
   var in-storage: gap-buffer
   var in/esi: (addr gap-buffer) <- address in-storage
@@ -411,12 +475,20 @@ fn next-token in: (addr gap-buffer), out: (addr token), start-of-line?: boolean,
     trace trace, "tokenize", stream
   }
   $next-token:case: {
-    # open square brackets begin streams
+    # double quotes begin streams
+    {
+      compare g, 0x22/double-quote
+      break-if-!=
+      var dummy/eax: grapheme <- read-from-gap-buffer in  # skip open quote
+      next-stream-token in, out, trace
+      break $next-token:case
+    }
+    # open square brackets begin balanced streams
     {
       compare g, 0x5b/open-square-bracket
       break-if-!=
       var dummy/eax: grapheme <- read-from-gap-buffer in  # skip open bracket
-      next-stream-token in, out, trace
+      next-balanced-stream-token in, out, trace
       break $next-token:case
     }
     # other symbol char
@@ -621,11 +693,11 @@ fn next-stream-token in: (addr gap-buffer), _out: (addr token), trace: (addr tra
     compare empty?, 0/false
     {
       break-if-=
-      error trace, "unbalanced '['"
+      error trace, "unbalanced '\"'"
       return
     }
     var g/eax: grapheme <- read-from-gap-buffer in
-    compare g, 0x5d/close-square-bracket
+    compare g, 0x22/double-quote
     break-if-=
     write-grapheme out-data, g
     loop
@@ -643,6 +715,55 @@ fn next-stream-token in: (addr gap-buffer), _out: (addr token), trace: (addr tra
   }
 }
 
+fn next-balanced-stream-token in: (addr gap-buffer), _out: (addr token), trace: (addr trace) {
+  trace-text trace, "tokenize", "balanced stream"
+  var out/edi: (addr token) <- copy _out
+  var out-type/eax: (addr int) <- get out, type
+  copy-to *out-type, 1/stream
+  var out-data-ah/eax: (addr handle stream byte) <- get out, text-data
+  var bracket-count: int
+  # stream tokens contain whole function definitions on boot, so we always
+  # give them plenty of space
+  populate-stream out-data-ah, 0x400/max-definition-size=1KB
+  var _out-data/eax: (addr stream byte) <- lookup *out-data-ah
+  var out-data/edi: (addr stream byte) <- copy _out-data
+  $next-balanced-stream-token:loop: {
+    var empty?/eax: boolean <- gap-buffer-scan-done? in
+    compare empty?, 0/false
+    {
+      break-if-=
+      error trace, "unbalanced '['"
+      return
+    }
+    var g/eax: grapheme <- read-from-gap-buffer in
+    {
+      compare g, 0x5b/open-square-bracket
+      break-if-!=
+      increment bracket-count
+    }
+    {
+      compare g, 0x5d/close-square-bracket
+      break-if-!=
+      compare bracket-count, 0
+      break-if-= $next-balanced-stream-token:loop
+      decrement bracket-count
+    }
+    write-grapheme out-data, g
+    loop
+  }
+  {
+    var should-trace?/eax: boolean <- should-trace? trace
+    compare should-trace?, 0/false
+    break-if-=
+    var stream-storage: (stream byte 0x400)  # max-definition-size
+    var stream/esi: (addr stream byte) <- address stream-storage
+    write stream, "=> "
+    rewind-stream out-data
+    write-stream-immutable stream, out-data
+    trace trace, "tokenize", stream
+  }
+}
+
 fn next-bracket-token g: grapheme, _out: (addr token), trace: (addr trace) {
   trace-text trace, "tokenize", "bracket"
   var out/eax: (addr token) <- copy _out