about summary refs log blame commit diff stats
path: root/014literal_string.cc
blob: 84dbe8d0673994d74259029ed7f5dae9ccefee48 (plain) (tree)
1
2
3
4
5
6
7
8

                                                                             

                                                                          
                                                                           
                                                                              
            
 




















                                                                    
 
                                       
                                       
 
                                       

                                   
                                                   

                


                                  
                    
                                                                                                         
                              





                                            

                                                                             
                                                
                        


                      



                    







                                                                        
                                                                
                      
                        
                      
                    
                                          

               
             


                                
   
                                         
                                         




                                                                               
                                                            

                   
                    
                                          

               

                   
                                                                                   








                              
                         
   
                                       
              
 
 
                                    
                          
                                                                           
           


                         
                                            

         
 

                                                                          
 
                                             
                         
                                       

       
                                        
                                                    

 



                                          
                                              








                                                                             



                                     





















                                                                            































































































                                                                                                   
//: For convenience, some instructions will take literal arrays of characters
//: (text or strings).
//:
//: Instead of quotes, we'll use [] to delimit strings. That'll reduce the
//: need for escaping since we can support nested brackets. And we can also
//: imagine that 'recipe' might one day itself be defined in Mu, doing its own
//: parsing.

// A bracketed ingredient is parsed as one literal-string reagent whose text
// is everything between the brackets, embedded space included.
void test_string_literal() {
  load(
      "def main [\n"
      "  1:address:array:character <- copy [abc def]\n"
      "]\n"
  );
  CHECK_TRACE_CONTENTS(
      "parse:   ingredient: {\"abc def\": \"literal-string\"}\n"
  );
}

// ':' and '/' inside the brackets stay part of the literal's text; the
// expected reagent still has the single type "literal-string".
void test_string_literal_with_colons() {
  load(
      "def main [\n"
      "  1:address:array:character <- copy [abc:def/ghi]\n"
      "]\n"
  );
  CHECK_TRACE_CONTENTS(
      "parse:   ingredient: {\"abc:def/ghi\": \"literal-string\"}\n"
  );
}

:(before "End Mu Types Initialization")
// Make "literal-string" a known type name by adding it to Type_ordinal with
// value 0. NOTE(review): 0 is presumably the ordinal shared by literal types;
// confirm where Type_ordinal is defined.
put(Type_ordinal, "literal-string", 0);

:(before "End next_word Special-cases")
// A word that starts with '[' is a string literal: read all of it, brackets
// included, and hand it back as a single word.
if (in.peek() == '[') {
  string result = slurp_quoted(in);
  // Going by the helper's name, this steps past whatever trails the literal
  // on the same line (whitespace, a comment) but leaves the newline unread.
  skip_whitespace_and_comments_but_not_newline(in);
  return result;
}

:(code)
// Read one bracket-delimited string from 'in' and return its text, delimiters
// included. 'in' must be positioned at the opening '['.
string slurp_quoted(istream& in) {
  assert(has_data(in));
  assert(in.peek() == '[');
  ostringstream text;
  text << static_cast<char>(in.get());  // the opening '[' is part of the result
  // the characters right after the '[' decide which scanner reads the rest
  const bool comment_aware = is_code_string(in, text);
  if (comment_aware) {
    slurp_quoted_comment_aware(in, text);
  }
  else {
    slurp_quoted_comment_oblivious(in, text);
  }
  return text.str();
}

// Decide whether the string being read is a 'code string': one whose opening
// '[' is followed by a newline before any non-whitespace character. Code
// strings ignore comments when scanning for matching brackets.
//
// Leading whitespace up to and including that newline is consumed from 'in'
// and copied to 'out'. If a non-whitespace character shows up first it is put
// back unread and the result is false. Running out of input also gives false.
bool is_code_string(istream& in, ostream& out) {
  while (has_data(in)) {
    char c = in.get();
    // Cast before classifying: isspace() is undefined for negative arguments
    // other than EOF, and a non-ASCII byte held in a signed char is negative.
    if (!isspace(static_cast<unsigned char>(c))) {
      in.putback(c);
      return false;
    }
    out << c;
    if (c == '\n') {
      return true;
    }
  }
  return false;
}

// Read the rest of a regular string, whose opening '[' has already been
// consumed. Regular strings can only contain other regular strings, so
// matching the closing ']' is a matter of counting brackets; '#' gets no
// special treatment. Everything read, closing ']' included, goes to 'out'.
void slurp_quoted_comment_oblivious(istream& in, ostream& out) {
  int open_brackets = 1;  // the '[' consumed before this call
  while (open_brackets != 0 && has_data(in)) {
    const char curr = in.get();
    if (curr == '\\') {
      // escape: the backslash itself is dropped and what follows is copied
      // through without affecting the bracket count
      slurp_one_past_backslashes(in, out);
    }
    else {
      out << curr;
      if (curr == '[') ++open_brackets;
      else if (curr == ']') --open_brackets;
    }
  }
  // input ended while brackets were still open
  if (!has_data(in) && open_brackets > 0) {
    raise << "unbalanced '['\n" << end();
    // NOTE(review): on an ostream, clear() only resets the stream's state
    // flags; text already written to 'out' stays there.
    out.clear();
  }
}

// Read the rest of a code string, whose opening '[' has already been consumed.
// Code strings can contain either code or regular strings: nested '[' are
// handled by recursing into slurp_quoted(), and anything from '#' to the end
// of the line is copied through without looking for brackets in it.
// Everything read, closing ']' included, goes to 'out'.
//
// The 'in >> c' read has to deliver whitespace as well, so the stream is
// presumably configured not to skip it -- TODO confirm where 'in' is created.
void slurp_quoted_comment_aware(istream& in, ostream& out) {
  char c;
  while (in >> c) {
    switch (c) {
      case '\\':
        // escape: drop this backslash, copy what follows through as-is
        slurp_one_past_backslashes(in, out);
        break;
      case '#':
        // comment: copy up to (not including) the newline
        out << c;
        while (has_data(in) && in.peek() != '\n')
          out << static_cast<char>(in.get());
        break;
      case '[':
        // nested string: un-read the bracket and recurse
        in.putback(c);
        out << slurp_quoted(in);
        break;
      case ']':
        // the bracket that closes this string
        out << c;
        return;
      default:
        out << c;
        break;
    }
  }
  // ran out of input before the closing ']'
  raise << "unbalanced '['\n" << end();
  // NOTE(review): on an ostream, clear() only resets the stream's state
  // flags; text already written to 'out' stays there.
  out.clear();
}

:(after "Parsing reagent(string s)")
// A reagent spelled [...] is a literal string: its name becomes the text
// between the outermost brackets and its type becomes "literal-string".
if (starts_with(s, "[")) {
  if (*s.rbegin() != ']') return;  // unbalanced bracket; handled elsewhere
  name = s;
  // delete [] delimiters
  name.erase(0, 1);
  strip_last(name);
  type = new type_tree("literal-string", 0);
  return;
}

//: Unlike other reagents, escape newlines in literal strings to make them
//: more friendly to trace().

:(after "string to_string(const reagent& r)")
  if (is_literal_text(r))
    return emit_literal_string(r.name);

:(code)
// True when 'x' has a type and that type's name is "literal-string".
bool is_literal_text(const reagent& x) {
  if (!x.type) return false;  // no type at all
  return x.type->name == "literal-string";
}

// Render the text of a literal string as {"<text>": "literal-string"}, with
// every newline in the text shown as the two characters '\' 'n'. 'name' is
// taken by value, so the caller's string is left alone.
string emit_literal_string(string name) {
  // replace() rewrites one newline per call and reports where it was; it
  // returns npos once none remain
  for (size_t from = 0;  from != string::npos;  )
    from = replace(name, "\n", "\\n", from);
  string result("{\"");
  result += name;
  result += "\": \"literal-string\"}";
  return result;
}

// Replace the first occurrence of 'from' in 'str' at or after index 'n' with
// 'to'. Returns the index where 'from' was found, or string::npos (leaving
// 'str' unchanged) if there is no such occurrence.
size_t replace(string& str, const string& from, const string& to, size_t n) {
  const size_t where = str.find(from, n);
  if (where == string::npos) return where;
  str.replace(where, from.length(), to);
  return where;
}

// Drop the final character of 's' in place; an empty string is left as is.
void strip_last(string& s) {
  if (s.empty()) return;
  s.erase(SIZE(s)-1);
}

// Finish handling an escape. The caller has just read a backslash and not
// copied it to 'out'; this copies through any run of backslashes that follows
// plus the first non-backslash character after it, stopping early if the
// input runs out.
//
// If we instead 'escaped' a single following character, then the character
// '\' would be:
//   '\\' escaped once
//   '\\\\' escaped twice
//   '\\\\\\\\' escaped thrice (8 backslashes)
// ..and so on. Stripping one backslash and passing the rest through gives:
//   '\\' escaped once
//   '\\\' escaped twice
//   '\\\\' escaped thrice
// This only works as long as backslashes aren't also overloaded to create
// special characters. So Mu doesn't follow C's approach of overloading
// backslashes both to escape quote characters and also as a notation for
// unprintable characters like '\n'.
void slurp_one_past_backslashes(istream& in, ostream& out) {
  for (;;) {
    if (!has_data(in)) return;
    const char curr = in.get();
    out << curr;
    if (curr != '\\') return;  // copied one character past the backslashes
  }
}

// Balanced brackets nested inside a literal are kept as part of its text;
// only the outermost pair is treated as delimiters.
void test_string_literal_nested() {
  load(
      "def main [\n"
      "  1:address:array:character <- copy [abc [def]]\n"
      "]\n"
  );
  CHECK_TRACE_CONTENTS(
      "parse:   ingredient: {\"abc [def]\": \"literal-string\"}\n"
  );
}

// A backslash before '[' makes the bracket plain text: the backslash is
// dropped, the '[' is kept, and it needs no matching ']'.
void test_string_literal_escaped() {
  load(
      "def main [\n"
      "  1:address:array:character <- copy [abc \\[def]\n"
      "]\n"
  );
  CHECK_TRACE_CONTENTS(
      "parse:   ingredient: {\"abc [def\": \"literal-string\"}\n"
  );
}

// Two backslashes before '[' leave one backslash in the text, still followed
// by an unmatched '['. The literal here starts with a newline, which shows up
// escaped at the front of the traced text.
void test_string_literal_escaped_twice() {
  load(
      "def main [\n"
      "  1:address:array:character <- copy [\n"
      "abc \\\\[def]\n"
      "]\n"
  );
  CHECK_TRACE_CONTENTS(
      "parse:   ingredient: {\"\\nabc \\[def\": \"literal-string\"}\n"
  );
}

// A comment trailing a literal on the same line is not picked up as another
// ingredient: the instruction still parses with exactly one ingredient and
// one product.
void test_string_literal_and_comment() {
  load(
      "def main [\n"
      "  1:address:array:character <- copy [abc]  # comment\n"
      "]\n"
  );
  CHECK_TRACE_CONTENTS(
      "parse: --- defining main\n"
      "parse: instruction: copy\n"
      "parse:   number of ingredients: 1\n"
      "parse:   ingredient: {\"abc\": \"literal-string\"}\n"
      "parse:   product: {1: (\"address\" \"array\" \"character\")}\n"
  );
}

// A literal may span lines; in the trace its newline is printed as the two
// characters backslash-n so the traced reagent stays on one line.
void test_string_literal_escapes_newlines_in_trace() {
  load(
      "def main [\n"
      "  copy [abc\n"
      "def]\n"
      "]\n"
  );
  CHECK_TRACE_CONTENTS(
      "parse:   ingredient: {\"abc\\ndef\": \"literal-string\"}\n"
  );
}

// In a literal that begins with a newline, a ']' inside a '#' comment does
// not close the string. The comment text itself remains part of the literal.
void test_string_literal_can_skip_past_comments() {
  load(
      "def main [\n"
      "  copy [\n"
      "    # ']' inside comment\n"
      "    bar\n"
      "  ]\n"
      "]\n"
  );
  CHECK_TRACE_CONTENTS(
      "parse:   ingredient: {\"\\n    # ']' inside comment\\n    bar\\n  \": \"literal-string\"}\n"
  );
}

// '[]' parses to a literal-string reagent whose text is empty.
void test_string_literal_empty() {
  load(
      "def main [\n"
      "  copy []\n"
      "]\n"
  );
  CHECK_TRACE_CONTENTS(
      "parse:   ingredient: {\"\": \"literal-string\"}\n"
  );
}

// Input that opens brackets and never closes them is reported as an
// "unbalanced '['" error. Hide_errors is set because the error is expected.
void test_multiple_unfinished_recipes() {
  Hide_errors = true;
  load(
      "def f1 [\n"
      "def f2 [\n"
  );
  CHECK_TRACE_CONTENTS(
      "error: unbalanced '['\n"
  );
}