Mu - 038literal_strings.cc

From fcc161e70502caf34bc0206d2c428e8341e97fa6 Mon Sep 17 00:00:00 2001 From: Kartik Agaram Date: Sun, 24 May 2020 22:43:18 -0700 Subject: 6397 Drop '---' section boundaries from filenames. I noticed them confusing tab-completion for certain advanced shell setups. --- html/038literal_strings.cc.html | 426 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 426 insertions(+) create mode 100644 html/038literal_strings.cc.html (limited to 'html/038literal_strings.cc.html') diff --git a/html/038literal_strings.cc.html b/html/038literal_strings.cc.html new file mode 100644 index 00000000..31960151 --- /dev/null +++ b/html/038literal_strings.cc.html @@ -0,0 +1,426 @@ + + + + +Mu - 038literal_strings.cc + + + + + + + + + + +https://github.com/akkartik/mu/blob/master/038literal_strings.cc +
+  1 //: Allow instructions to mention literals directly.
+  2 //:
+  3 //: This layer will transparently move them to the segment named 'data' (looked
+  4 //: up by name; it must be present, even if otherwise empty).
+  5 
+  6 void test_transform_literal_string() {
+  7   run(
+  8       "== code 0x1\n"
+  9       "b8/copy  \"test\"/imm32\n"
+ 10       "== data 0x2000\n"  // need an empty segment
+ 11   );
+ 12   CHECK_TRACE_CONTENTS(
+ 13       "transform: -- move literal strings to data segment\n"
+ 14       "transform: adding global variable '__subx_global_1' containing \"test\"\n"
+ 15       "transform: line after transform: 'b8 __subx_global_1'\n"
+ 16   );
+ 17 }
+ 18 
+ 19 //: We don't rely on any transforms running in previous layers, but this layer
+ 20 //: knows about labels and global variables and will emit them for previous
+ 21 //: layers to transform.
+ 22 :(after "Begin Transforms")
+ 23 Transform.push_back(transform_literal_strings);
+ 24 
+ 25 :(before "End Globals")
+ 26 int Next_auto_global = 1;
+ 27 :(before "End Reset")
+ 28 Next_auto_global = 1;
+ 29 :(code)
+ 30 void transform_literal_strings(program& p) {
+ 31   trace(3, "transform") << "-- move literal strings to data segment" << end();
+ 32   if (p.segments.empty()) return;
+ 33   vector<line> new_lines;
+ 34   for (int s = 0;  s < SIZE(p.segments);  ++s) {
+ 35     segment& seg = p.segments.at(s);
+ 36     trace(99, "transform") << "segment '" << seg.name << "'" << end();
+ 37     for (int i = 0;  i < SIZE(seg.lines);  ++i) {
+ 38 //?       cerr << seg.name << '/' << i << '\n';
+ 39       line& line = seg.lines.at(i);
+ 40       for (int j = 0;  j < SIZE(line.words);  ++j) {
+ 41         word& curr = line.words.at(j);
+ 42         if (curr.data.at(0) != '"') continue;
+ 43         ostringstream global_name;
+ 44         global_name << "__subx_global_" << Next_auto_global;
+ 45         ++Next_auto_global;
+ 46         add_global_to_data_segment(global_name.str(), curr, new_lines);
+ 47         curr.data = global_name.str();
+ 48       }
+ 49       trace(99, "transform") << "line after transform: '" << data_to_string(line) << "'" << end();
+ 50     }
+ 51   }
+ 52   segment* data = find(p, "data");
+ 53   if (data)
+ 54     data->lines.insert(data->lines.end(), new_lines.begin(), new_lines.end());
+ 55 }
+ 56 
+ 57 void add_global_to_data_segment(const string& name, const word& value, vector<line>& out) {
+ 58   trace(99, "transform") << "adding global variable '" << name << "' containing " << value.data << end();
+ 59   // emit label
+ 60   out.push_back(label(name));
+ 61   // emit size for size-prefixed array
+ 62   out.push_back(line());
+ 63   emit_hex_bytes(out.back(), SIZE(value.data)-/*skip quotes*/2, 4/*bytes*/);
+ 64   // emit data byte by byte
+ 65   out.push_back(line());
+ 66   line& curr = out.back();
+ 67   for (int i = /*skip start quote*/1;  i < SIZE(value.data)-/*skip end quote*/1;  ++i) {
+ 68     char c = value.data.at(i);
+ 69     curr.words.push_back(word());
+ 70     curr.words.back().data = hex_byte_to_string(c);
+ 71     curr.words.back().metadata.push_back(string(1, c));
+ 72   }
+ 73 }
+ 74 
+ 75 //: Within strings, whitespace is significant. So we need to redo our instruction
+ 76 //: parsing.
+ 77 
+ 78 void test_instruction_with_string_literal() {
+ 79   parse_instruction_character_by_character(
+ 80       "a \"abc  def\" z\n"  // two spaces inside string
+ 81   );
+ 82   CHECK_TRACE_CONTENTS(
+ 83       "parse2: word: a\n"
+ 84       "parse2: word: \"abc  def\"\n"
+ 85       "parse2: word: z\n"
+ 86   );
+ 87   // no other words
+ 88   CHECK_TRACE_COUNT("parse2", 3);
+ 89 }
+ 90 
+ 91 void test_string_literal_in_data_segment() {
+ 92   run(
+ 93       "== code 0x1\n"
+ 94       "b8/copy  X/imm32\n"
+ 95       "== data 0x2000\n"
+ 96       "X:\n"
+ 97       "\"test\"/imm32\n"
+ 98   );
+ 99   CHECK_TRACE_CONTENTS(
+100       "transform: -- move literal strings to data segment\n"
+101       "transform: adding global variable '__subx_global_1' containing \"test\"\n"
+102       "transform: line after transform: '__subx_global_1'\n"
+103   );
+104 }
+105 
+106 void test_string_literal_with_missing_quote() {
+107   Hide_errors = true;
+108   run(
+109       "== code 0x1\n"
+110       "b8/copy  \"test/imm32\n"
+111       "== data 0x2000\n"
+112   );
+113   CHECK_TRACE_CONTENTS(
+114       "error: unclosed string in: b8/copy  \"test/imm32"
+115   );
+116 }
+117 
+118 :(before "End Line Parsing Special-cases(line_data -> l)")
+119 if (line_data.find('"') != string::npos) {  // can cause false-positives, but we can handle them
+120   parse_instruction_character_by_character(line_data, l);
+121   continue;
+122 }
+123 
+124 :(code)
+125 void parse_instruction_character_by_character(const string& line_data, vector<line>& out) {
+126   if (line_data.find('\n') != string::npos  && line_data.find('\n') != line_data.size()-1) {
+127     raise << "parse_instruction_character_by_character: should receive only a single line\n" << end();
+128     return;
+129   }
+130   // parse literals
+131   istringstream in(line_data);
+132   in >> std::noskipws;
+133   line result;
+134   result.original = line_data;
+135   // add tokens (words or strings) one by one
+136   while (has_data(in)) {
+137     skip_whitespace(in);
+138     if (!has_data(in)) break;
+139     char c = in.get();
+140     if (c == '#') break;  // comment; drop rest of line
+141     if (c == ':') break;  // line metadata; skip for now
+142     if (c == '.') {
+143       if (!has_data(in)) break;  // comment token at end of line
+144       if (isspace(in.peek()))
+145         continue;  // '.' followed by space is comment token; skip
+146     }
+147     result.words.push_back(word());
+148     if (c == '"') {
+149       // string literal; slurp everything between quotes into data
+150       ostringstream d;
+151       d << c;
+152       while (true) {
+153         if (!has_data(in)) {
+154           raise << "unclosed string in: " << line_data << end();
+155           return;
+156         }
+157         in >> c;
+158         if (c == '\\') {
+159           in >> c;
+160           if (c == 'n') d << '\n';
+161           else if (c == '"') d << '"';
+162           else if (c == '\\') d << '\\';
+163           else {
+164             raise << "parse_instruction_character_by_character: unknown escape sequence '\\" << c << "'\n" << end();
+165             return;
+166           }
+167           continue;
+168         } else {
+169           d << c;
+170         }
+171         if (c == '"') break;
+172       }
+173       result.words.back().data = d.str();
+174       result.words.back().original = d.str();
+175       // slurp metadata
+176       ostringstream m;
+177       while (!isspace(in.peek()) && has_data(in)) {  // peek can sometimes trigger eof(), so do it first
+178         in >> c;
+179         if (c == '/') {
+180           if (!m.str().empty()) result.words.back().metadata.push_back(m.str());
+181           m.str("");
+182         }
+183         else {
+184           m << c;
+185         }
+186       }
+187       if (!m.str().empty()) result.words.back().metadata.push_back(m.str());
+188     }
+189     else {
+190       // not a string literal; slurp all characters until whitespace
+191       ostringstream w;
+192       w << c;
+193       while (!isspace(in.peek()) && has_data(in)) {  // peek can sometimes trigger eof(), so do it first
+194         in >> c;
+195         w << c;
+196       }
+197       parse_word(w.str(), result.words.back());
+198     }
+199     trace(99, "parse2") << "word: " << to_string(result.words.back()) << end();
+200   }
+201   if (!result.words.empty())
+202     out.push_back(result);
+203 }
+204 
+205 void skip_whitespace(istream& in) {
+206   while (has_data(in) && isspace(in.peek())) {
+207     in.get();
+208   }
+209 }
+210 
+211 void skip_comment(istream& in) {
+212   if (has_data(in) && in.peek() == '#') {
+213     in.get();
+214     while (has_data(in) && in.peek() != '\n') in.get();
+215   }
+216 }
+217 
+218 line label(string s) {
+219   line result;
+220   result.words.push_back(word());
+221   result.words.back().data = (s+":");
+222   return result;
+223 }
+224 
+225 // helper for tests
+226 void parse_instruction_character_by_character(const string& line_data) {
+227   vector<line> out;
+228   parse_instruction_character_by_character(line_data, out);
+229 }
+230 
+231 void test_parse2_comment_token_in_middle() {
+232   parse_instruction_character_by_character(
+233       "a . z\n"
+234   );
+235   CHECK_TRACE_CONTENTS(
+236       "parse2: word: a\n"
+237       "parse2: word: z\n"
+238   );
+239   CHECK_TRACE_DOESNT_CONTAIN("parse2: word: .");
+240   // no other words
+241   CHECK_TRACE_COUNT("parse2", 2);
+242 }
+243 
+244 void test_parse2_word_starting_with_dot() {
+245   parse_instruction_character_by_character(
+246       "a .b c\n"
+247   );
+248   CHECK_TRACE_CONTENTS(
+249       "parse2: word: a\n"
+250       "parse2: word: .b\n"
+251       "parse2: word: c\n"
+252   );
+253 }
+254 
+255 void test_parse2_comment_token_at_start() {
+256   parse_instruction_character_by_character(
+257       ". a b\n"
+258   );
+259   CHECK_TRACE_CONTENTS(
+260       "parse2: word: a\n"
+261       "parse2: word: b\n"
+262   );
+263   CHECK_TRACE_DOESNT_CONTAIN("parse2: word: .");
+264 }
+265 
+266 void test_parse2_comment_token_at_end() {
+267   parse_instruction_character_by_character(
+268       "a b .\n"
+269   );
+270   CHECK_TRACE_CONTENTS(
+271       "parse2: word: a\n"
+272       "parse2: word: b\n"
+273   );
+274   CHECK_TRACE_DOESNT_CONTAIN("parse2: word: .");
+275 }
+276 
+277 void test_parse2_word_starting_with_dot_at_start() {
+278   parse_instruction_character_by_character(
+279       ".a b c\n"
+280   );
+281   CHECK_TRACE_CONTENTS(
+282       "parse2: word: .a\n"
+283       "parse2: word: b\n"
+284       "parse2: word: c\n"
+285   );
+286 }
+287 
+288 void test_parse2_metadata() {
+289   parse_instruction_character_by_character(
+290       ".a b/c d\n"
+291   );
+292   CHECK_TRACE_CONTENTS(
+293       "parse2: word: .a\n"
+294       "parse2: word: b /c\n"
+295       "parse2: word: d\n"
+296   );
+297 }
+298 
+299 void test_parse2_string_with_metadata() {
+300   parse_instruction_character_by_character(
+301       "a \"bc  def\"/disp32 g\n"
+302   );
+303   CHECK_TRACE_CONTENTS(
+304       "parse2: word: a\n"
+305       "parse2: word: \"bc  def\" /disp32\n"
+306       "parse2: word: g\n"
+307   );
+308 }
+309 
+310 void test_parse2_string_with_metadata_at_end() {
+311   parse_instruction_character_by_character(
+312       "a \"bc  def\"/disp32\n"
+313   );
+314   CHECK_TRACE_CONTENTS(
+315       "parse2: word: a\n"
+316       "parse2: word: \"bc  def\" /disp32\n"
+317   );
+318 }
+319 
+320 void test_parse2_string_with_metadata_at_end_of_line_without_newline() {
+321   parse_instruction_character_by_character(
+322       "68/push \"test\"/f"  // no newline, which is how calls from parse() will look
+323   );
+324   CHECK_TRACE_CONTENTS(
+325       "parse2: word: 68 /push\n"
+326       "parse2: word: \"test\" /f\n"
+327   );
+328 }
+329 
+330 //: Make sure slashes inside strings don't trigger adding stuff from inside the
+331 //: string to metadata.
+332 
+333 void test_parse2_string_containing_slashes() {
+334   parse_instruction_character_by_character(
+335       "a \"bc/def\"/disp32\n"
+336   );
+337   CHECK_TRACE_CONTENTS(
+338       "parse2: word: \"bc/def\" /disp32\n"
+339   );
+340 }
+341 
+342 void test_instruction_with_string_literal_with_escaped_quote() {
+343   parse_instruction_character_by_character(
+344       "\"a\\\"b\"\n"  // escaped quote inside string
+345   );
+346   CHECK_TRACE_CONTENTS(
+347       "parse2: word: \"a\"b\"\n"
+348   );
+349   // no other words
+350   CHECK_TRACE_COUNT("parse2", 1);
+351 }
+352 
+353 void test_instruction_with_string_literal_with_escaped_backslash() {
+354   parse_instruction_character_by_character(
+355       "\"a\\\\b\"\n"  // escaped backslash inside string
+356   );
+357   CHECK_TRACE_CONTENTS(
+358       "parse2: word: \"a\\b\"\n"
+359   );
+360   // no other words
+361   CHECK_TRACE_COUNT("parse2", 1);
+362 }
+
+ + + -- cgit 1.4.1-2-gfad0