author | Kartik K. Agaram <vc@akkartik.com> | 2021-07-28 20:44:02 -0700
---|---|---
committer | Kartik K. Agaram <vc@akkartik.com> | 2021-07-28 20:44:02 -0700
commit | bec33a7067288570619c3c12fb0543e61bb562d2 (patch) |
tree | 67e58909aeeaad542900ed8fcb334ff1ddd1a577 /shell |
parent | 267c74b59a5f148bd28233f25bc794a3a4893e8e (diff) |
download | mu-bec33a7067288570619c3c12fb0543e61bb562d2.tar.gz |
shell: second notation for string literals
I've always been dissatisfied with the notion of escaping. It introduces a special-case meta-notation within the tokenizer, and the conventional approach leads to exponential "leaning toothpick syndrome" with each level of escaping.

One potential "correct" solution is to keep string terminals parameterizable:

  [abc]          => abc
  [=]            => =
  [=[abc]=]      => abc
  [=[a]bc]=]     => a]bc
  [==[a]=]bc]==] => a]=]bc

...and so on. Basically the terminals grow linearly as the number of escapings grows. While this is workable, I'd like to wait until I actually need it, and then gauge whether the need is a sign of the stack growing too complex, with too many layers of notation/parsing. Mu's goal is just 3 notations, and it's going to require constant vigilance to keep that from growing.

Therefore, for now, there are two notations for string literals, one symmetric and one balanced:

  "abc" => abc
  [abc] => abc

The balanced notation permits nested brackets as long as they balance:

  [abc [def]] => abc [def]

If you need unbalanced square brackets, use the symmetric terminals:

  "abc [def" => abc [def

If you need double quotes inside strings, use the balanced notation:

  [abc "def] => abc "def

If you need _both_ square brackets (whether balanced or unbalanced) and double quotes, you're currently shit outta luck.
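To make the parameterizable-terminal idea concrete, here is a minimal Python sketch. This is my illustration, not part of this commit, and `read_parameterized_literal` is a hypothetical name: the opening terminal `[` + `=`*n + `[` selects the closing terminal `]` + `=`*n + `]`, with n = 0 degenerating to a plain `[...]` literal.

```python
def read_parameterized_literal(s, i):
    """Read a string literal starting at s[i] == '['.

    Hypothetical scheme from the commit message (not what this commit
    implements): '[' + '='*n + '[' opens a literal that runs until the
    first ']' + '='*n + ']'. With n == 0 it is a plain [...] literal.
    Returns (contents, index just past the closing terminal)."""
    assert s[i] == '['
    n, j = 0, i + 1
    while j < len(s) and s[j] == '=':   # count the '='s of the opener
        n, j = n + 1, j + 1
    if n == 0 or j == len(s) or s[j] != '[':
        n, j = 0, i + 1                 # no inner '[': plain form, '='s are content
    else:
        j += 1                          # skip the inner '['
    close = ']' + '=' * n + ']' if n else ']'
    end = s.find(close, j)
    if end < 0:
        raise SyntaxError("unterminated string literal")
    return s[j:end], end + len(close)

# the examples from the commit message
for text, want in [('[abc]', 'abc'), ('[=]', '='), ('[=[abc]=]', 'abc'),
                   ('[=[a]bc]=]', 'a]bc'), ('[==[a]=]bc]==]', 'a]=]bc')]:
    assert read_parameterized_literal(text, 0)[0] == want
```

Note how each extra level of nesting costs only one more '=' in each terminal, which is the linear growth the message refers to.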
Diffstat (limited to 'shell')
-rw-r--r-- | shell/tokenize.mu | 131
1 file changed, 126 insertions, 5 deletions
diff --git a/shell/tokenize.mu b/shell/tokenize.mu
index e097e460..2ba391a9 100644
--- a/shell/tokenize.mu
+++ b/shell/tokenize.mu
@@ -248,10 +248,11 @@ fn test-tokenize-dotted-list {
   check close-paren?, "F - test-tokenize-dotted-list: close paren"
 }

+# double quotes with zero escaping support
 fn test-tokenize-stream-literal {
   var in-storage: gap-buffer
   var in/esi: (addr gap-buffer) <- address in-storage
-  initialize-gap-buffer-with in, "[abc def]"
+  initialize-gap-buffer-with in, "\"abc def\""
   #
   var stream-storage: (stream token 0x10)
   var stream/edi: (addr stream token) <- address stream-storage
@@ -279,6 +280,69 @@ fn test-tokenize-stream-literal {
   check empty?, "F - test-tokenize-stream-literal: empty?"
 }

+# alternative syntax for strings with balancing brackets
+fn test-tokenize-balanced-stream-literal {
+  var in-storage: gap-buffer
+  var in/esi: (addr gap-buffer) <- address in-storage
+  initialize-gap-buffer-with in, "[abc def]"
+  #
+  var stream-storage: (stream token 0x10)
+  var stream/edi: (addr stream token) <- address stream-storage
+  #
+  var trace-storage: trace
+  var trace/edx: (addr trace) <- address trace-storage
+  initialize-trace trace, 1/only-errors, 0x10/capacity, 0/visible
+  tokenize in, stream, trace
+  #
+  var curr-token-storage: token
+  var curr-token/ebx: (addr token) <- address curr-token-storage
+  read-from-stream stream, curr-token
+  var curr-token-type/eax: (addr int) <- get curr-token, type
+  check-ints-equal *curr-token-type, 3/indent, "F - test-tokenize-balanced-stream-literal/before-indent-type"
+  var curr-token-data/eax: (addr int) <- get curr-token, number-data
+  check-ints-equal *curr-token-data, 0/spaces, "F - test-tokenize-balanced-stream-literal/before-indent"
+  read-from-stream stream, curr-token
+  var stream?/eax: boolean <- stream-token? curr-token
+  check stream?, "F - test-tokenize-balanced-stream-literal: type"
+  var curr-token-data-ah/eax: (addr handle stream byte) <- get curr-token, text-data
+  var curr-token-data/eax: (addr stream byte) <- lookup *curr-token-data-ah
+  var data-equal?/eax: boolean <- stream-data-equal? curr-token-data, "abc def"
+  check data-equal?, "F - test-tokenize-balanced-stream-literal"
+  var empty?/eax: boolean <- stream-empty? stream
+  check empty?, "F - test-tokenize-balanced-stream-literal: empty?"
+}
+
+fn test-tokenize-nested-stream-literal {
+  var in-storage: gap-buffer
+  var in/esi: (addr gap-buffer) <- address in-storage
+  initialize-gap-buffer-with in, "[abc [def]]"
+  #
+  var stream-storage: (stream token 0x10)
+  var stream/edi: (addr stream token) <- address stream-storage
+  #
+  var trace-storage: trace
+  var trace/edx: (addr trace) <- address trace-storage
+  initialize-trace trace, 1/only-errors, 0x10/capacity, 0/visible
+  tokenize in, stream, trace
+  #
+  var curr-token-storage: token
+  var curr-token/ebx: (addr token) <- address curr-token-storage
+  read-from-stream stream, curr-token
+  var curr-token-type/eax: (addr int) <- get curr-token, type
+  check-ints-equal *curr-token-type, 3/indent, "F - test-tokenize-nested-stream-literal/before-indent-type"
+  var curr-token-data/eax: (addr int) <- get curr-token, number-data
+  check-ints-equal *curr-token-data, 0/spaces, "F - test-tokenize-nested-stream-literal/before-indent"
+  read-from-stream stream, curr-token
+  var stream?/eax: boolean <- stream-token? curr-token
+  check stream?, "F - test-tokenize-nested-stream-literal: type"
+  var curr-token-data-ah/eax: (addr handle stream byte) <- get curr-token, text-data
+  var curr-token-data/eax: (addr stream byte) <- lookup *curr-token-data-ah
+  var data-equal?/eax: boolean <- stream-data-equal? curr-token-data, "abc [def]"
+  check data-equal?, "F - test-tokenize-nested-stream-literal"
+  var empty?/eax: boolean <- stream-empty? stream
+  check empty?, "F - test-tokenize-nested-stream-literal: empty?"
+}
+
 fn test-tokenize-stream-literal-in-tree {
   var in-storage: gap-buffer
   var in/esi: (addr gap-buffer) <- address in-storage
@@ -411,12 +475,20 @@ fn next-token in: (addr gap-buffer), out: (addr token), start-of-line?: boolean,
     trace trace, "tokenize", stream
   }
   $next-token:case: {
-    # open square brackets begin streams
+    # double quotes begin streams
+    {
+      compare g, 0x22/double-quote
+      break-if-!=
+      var dummy/eax: grapheme <- read-from-gap-buffer in  # skip
+      next-stream-token in, out, trace
+      break $next-token:case
+    }
+    # open square brackets begin balanced streams
     {
       compare g, 0x5b/open-square-bracket
       break-if-!=
       var dummy/eax: grapheme <- read-from-gap-buffer in  # skip open bracket
-      next-stream-token in, out, trace
+      next-balanced-stream-token in, out, trace
       break $next-token:case
     }
     # other symbol char
@@ -621,11 +693,11 @@ fn next-stream-token in: (addr gap-buffer), _out: (addr token), trace: (addr tra
     compare empty?, 0/false
     {
       break-if-=
-      error trace, "unbalanced '['"
+      error trace, "unbalanced '\"'"
       return
     }
     var g/eax: grapheme <- read-from-gap-buffer in
-    compare g, 0x5d/close-square-bracket
+    compare g, 0x22/double-quote
     break-if-=
     write-grapheme out-data, g
     loop
@@ -643,6 +715,55 @@ fn next-stream-token in: (addr gap-buffer), _out: (addr token), trace: (addr tra
   }
 }

+fn next-balanced-stream-token in: (addr gap-buffer), _out: (addr token), trace: (addr trace) {
+  trace-text trace, "tokenize", "balanced stream"
+  var out/edi: (addr token) <- copy _out
+  var out-type/eax: (addr int) <- get out, type
+  copy-to *out-type, 1/stream
+  var out-data-ah/eax: (addr handle stream byte) <- get out, text-data
+  var bracket-count: int
+  # stream tokens contain whole function definitions on boot, so we always
+  # give them plenty of space
+  populate-stream out-data-ah, 0x400/max-definition-size=1KB
+  var _out-data/eax: (addr stream byte) <- lookup *out-data-ah
+  var out-data/edi: (addr stream byte) <- copy _out-data
+  $next-balanced-stream-token:loop: {
+    var empty?/eax: boolean <- gap-buffer-scan-done? in
+    compare empty?, 0/false
+    {
+      break-if-=
+      error trace, "unbalanced '['"
+      return
+    }
+    var g/eax: grapheme <- read-from-gap-buffer in
+    {
+      compare g, 0x5b/open-square-bracket
+      break-if-!=
+      increment bracket-count
+    }
+    {
+      compare g, 0x5d/close-square-bracket
+      break-if-!=
+      compare bracket-count, 0
+      break-if-= $next-balanced-stream-token:loop
+      decrement bracket-count
+    }
+    write-grapheme out-data, g
+    loop
+  }
+  {
+    var should-trace?/eax: boolean <- should-trace? trace
+    compare should-trace?, 0/false
+    break-if-=
+    var stream-storage: (stream byte 0x400)  # max-definition-size
+    var stream/esi: (addr stream byte) <- address stream-storage
+    write stream, "=> "
+    rewind-stream out-data
+    write-stream-immutable stream, out-data
+    trace trace, "tokenize", stream
+  }
+}
+
 fn next-bracket-token g: grapheme, _out: (addr token), trace: (addr trace) {
   trace-text trace, "tokenize", "bracket"
   var out/eax: (addr token) <- copy _out
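For readers who don't read Mu, the scanning behavior this diff implements in next-stream-token and next-balanced-stream-token boils down to the following Python sketch (my rendering; the function names are mine). Neither notation has any escaping: the symmetric form scans to the next double quote, and the balanced form tracks bracket depth the way the Mu code's bracket-count does.

```python
def read_symmetric_literal(s, i):
    """Mirrors next-stream-token: s[i] == '"'; scan to the next '"'.
    No escaping of any kind. Returns (contents, index past close quote)."""
    j = i + 1
    while j < len(s) and s[j] != '"':
        j += 1
    if j == len(s):
        raise SyntaxError("unbalanced '\"'")
    return s[i + 1:j], j + 1

def read_balanced_literal(s, i):
    """Mirrors next-balanced-stream-token: s[i] == '['; scan to the
    matching ']', counting nested '['/']' pairs along the way."""
    depth, j = 0, i + 1
    while j < len(s):
        if s[j] == '[':
            depth += 1
        elif s[j] == ']':
            if depth == 0:                 # matching close: token ends here
                return s[i + 1:j], j + 1
            depth -= 1
        j += 1
    raise SyntaxError("unbalanced '['")

# the tests and commit-message examples, restated
assert read_symmetric_literal('"abc def"', 0)[0] == 'abc def'
assert read_balanced_literal('[abc [def]]', 0)[0] == 'abc [def]'
assert read_symmetric_literal('"abc [def"', 0)[0] == 'abc [def'  # unbalanced bracket: use quotes
assert read_balanced_literal('[abc "def]', 0)[0] == 'abc "def'   # embedded quote: use brackets
```

The two forms fail in complementary ways, which is the tradeoff the commit message describes: quotes cannot contain quotes, and brackets cannot contain unbalanced brackets.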