diff options
author | Kartik Agaram <vc@akkartik.com> | 2019-07-27 16:01:55 -0700 |
---|---|---|
committer | Kartik Agaram <vc@akkartik.com> | 2019-07-27 17:47:59 -0700 |
commit | 6e1eeeebfb453fa7c871869c19375ce60fbd7413 (patch) | |
tree | 539c4a3fdf1756ae79770d5c4aaf6366f1d1525e /archive/2.vm/014literal_string.cc | |
parent | 8846a7f85cc04b77b2fe8a67b6d317723437b00c (diff) | |
download | mu-6e1eeeebfb453fa7c871869c19375ce60fbd7413.tar.gz |
5485 - promote SubX to top-level
Diffstat (limited to 'archive/2.vm/014literal_string.cc')
-rw-r--r-- | archive/2.vm/014literal_string.cc | 274 |
1 files changed, 274 insertions, 0 deletions
diff --git a/archive/2.vm/014literal_string.cc b/archive/2.vm/014literal_string.cc new file mode 100644 index 00000000..84dbe8d0 --- /dev/null +++ b/archive/2.vm/014literal_string.cc @@ -0,0 +1,274 @@ +//: For convenience, some instructions will take literal arrays of characters +//: (text or strings). +//: +//: Instead of quotes, we'll use [] to delimit strings. That'll reduce the +//: need for escaping since we can support nested brackets. And we can also +//: imagine that 'recipe' might one day itself be defined in Mu, doing its own +//: parsing. + +void test_string_literal() { + load( + "def main [\n" + " 1:address:array:character <- copy [abc def]\n" + "]\n" + ); + CHECK_TRACE_CONTENTS( + "parse: ingredient: {\"abc def\": \"literal-string\"}\n" + ); +} + +void test_string_literal_with_colons() { + load( + "def main [\n" + " 1:address:array:character <- copy [abc:def/ghi]\n" + "]\n" + ); + CHECK_TRACE_CONTENTS( + "parse: ingredient: {\"abc:def/ghi\": \"literal-string\"}\n" + ); +} + +:(before "End Mu Types Initialization") +put(Type_ordinal, "literal-string", 0); + +:(before "End next_word Special-cases") +if (in.peek() == '[') { + string result = slurp_quoted(in); + skip_whitespace_and_comments_but_not_newline(in); + return result; +} + +:(code) +string slurp_quoted(istream& in) { + ostringstream out; + assert(has_data(in)); assert(in.peek() == '['); out << static_cast<char>(in.get()); // slurp the '[' + if (is_code_string(in, out)) + slurp_quoted_comment_aware(in, out); + else + slurp_quoted_comment_oblivious(in, out); + return out.str(); +} + +// A string is a code string (ignores comments when scanning for matching +// brackets) if it contains a newline at the start before any non-whitespace. +bool is_code_string(istream& in, ostream& out) { + while (has_data(in)) { + char c = in.get(); + if (!isspace(c)) { + in.putback(c); + return false; + } + out << c; + if (c == '\n') { + return true; + } + } + return false; +} + +// Read a regular string. Regular strings can only contain other regular +// strings. +void slurp_quoted_comment_oblivious(istream& in, ostream& out) { + int brace_depth = 1; + while (has_data(in)) { + char c = in.get(); + if (c == '\\') { + slurp_one_past_backslashes(in, out); + continue; + } + out << c; + if (c == '[') ++brace_depth; + if (c == ']') --brace_depth; + if (brace_depth == 0) break; + } + if (!has_data(in) && brace_depth > 0) { + raise << "unbalanced '['\n" << end(); + out.clear(); + } +} + +// Read a code string. Code strings can contain either code or regular strings. +void slurp_quoted_comment_aware(istream& in, ostream& out) { + char c; + while (in >> c) { + if (c == '\\') { + slurp_one_past_backslashes(in, out); + continue; + } + if (c == '#') { + out << c; + while (has_data(in) && in.peek() != '\n') out << static_cast<char>(in.get()); + continue; + } + if (c == '[') { + in.putback(c); + // recurse + out << slurp_quoted(in); + continue; + } + out << c; + if (c == ']') return; + } + raise << "unbalanced '['\n" << end(); + out.clear(); +} + +:(after "Parsing reagent(string s)") +if (starts_with(s, "[")) { + if (*s.rbegin() != ']') return; // unbalanced bracket; handled elsewhere + name = s; + // delete [] delimiters + name.erase(0, 1); + strip_last(name); + type = new type_tree("literal-string", 0); + return; +} + +//: Unlike other reagents, escape newlines in literal strings to make them +//: more friendly to trace(). + +:(after "string to_string(const reagent& r)") + if (is_literal_text(r)) + return emit_literal_string(r.name); + +:(code) +bool is_literal_text(const reagent& x) { + return x.type && x.type->name == "literal-string"; +} + +string emit_literal_string(string name) { + size_t pos = 0; + while (pos != string::npos) + pos = replace(name, "\n", "\\n", pos); + return "{\""+name+"\": \"literal-string\"}"; +} + +size_t replace(string& str, const string& from, const string& to, size_t n) { + size_t result = str.find(from, n); + if (result != string::npos) + str.replace(result, from.length(), to); + return result; +} + +void strip_last(string& s) { + if (!s.empty()) s.erase(SIZE(s)-1); +} + +void slurp_one_past_backslashes(istream& in, ostream& out) { + // When you encounter a backslash, strip it out and pass through any + // following run of backslashes. If we 'escaped' a single following + // character, then the character '\' would be: + // '\\' escaped once + // '\\\\' escaped twice + // '\\\\\\\\' escaped thrice (8 backslashes) + // ..and so on. With our approach it'll be: + // '\\' escaped once + // '\\\' escaped twice + // '\\\\' escaped thrice + // This only works as long as backslashes aren't also overloaded to create + // special characters. So Mu doesn't follow C's approach of overloading + // backslashes both to escape quote characters and also as a notation for + // unprintable characters like '\n'. + while (has_data(in)) { + char c = in.get(); + out << c; + if (c != '\\') break; + } +} + +void test_string_literal_nested() { + load( + "def main [\n" + " 1:address:array:character <- copy [abc [def]]\n" + "]\n" + ); + CHECK_TRACE_CONTENTS( + "parse: ingredient: {\"abc [def]\": \"literal-string\"}\n" + ); +} + +void test_string_literal_escaped() { + load( + "def main [\n" + " 1:address:array:character <- copy [abc \\[def]\n" + "]\n" + ); + CHECK_TRACE_CONTENTS( + "parse: ingredient: {\"abc [def\": \"literal-string\"}\n" + ); +} + +void test_string_literal_escaped_twice() { + load( + "def main [\n" + " 1:address:array:character <- copy [\n" + "abc \\\\[def]\n" + "]\n" + ); + CHECK_TRACE_CONTENTS( + "parse: ingredient: {\"\\nabc \\[def\": \"literal-string\"}\n" + ); +} + +void test_string_literal_and_comment() { + load( + "def main [\n" + " 1:address:array:character <- copy [abc] # comment\n" + "]\n" + ); + CHECK_TRACE_CONTENTS( + "parse: --- defining main\n" + "parse: instruction: copy\n" + "parse: number of ingredients: 1\n" + "parse: ingredient: {\"abc\": \"literal-string\"}\n" + "parse: product: {1: (\"address\" \"array\" \"character\")}\n" + ); +} + +void test_string_literal_escapes_newlines_in_trace() { + load( + "def main [\n" + " copy [abc\n" + "def]\n" + "]\n" + ); + CHECK_TRACE_CONTENTS( + "parse: ingredient: {\"abc\\ndef\": \"literal-string\"}\n" + ); +} + +void test_string_literal_can_skip_past_comments() { + load( + "def main [\n" + " copy [\n" + " # ']' inside comment\n" + " bar\n" + " ]\n" + "]\n" + ); + CHECK_TRACE_CONTENTS( + "parse: ingredient: {\"\\n # ']' inside comment\\n bar\\n \": \"literal-string\"}\n" + ); +} + +void test_string_literal_empty() { + load( + "def main [\n" + " copy []\n" + "]\n" + ); + CHECK_TRACE_CONTENTS( + "parse: ingredient: {\"\": \"literal-string\"}\n" + ); +} + +void test_multiple_unfinished_recipes() { + Hide_errors = true; + load( + "def f1 [\n" + "def f2 [\n" + ); + CHECK_TRACE_CONTENTS( + "error: unbalanced '['\n" + ); +} |