https://github.com/akkartik/mu/blob/main/038literal_strings.cc
  1 //: Allow instructions to mention literals directly.
  2 //:
  3 //: This layer will transparently move them to the global segment (assumed to
  4 //: always be the second segment).
  5 
   6 void test_transform_literal_string() {  // a string literal used as an operand in the code segment is replaced by a generated global
   7   run(
   8       "== code 0x1\n"
   9       "b8/copy  \"test\"/imm32\n"  // instruction whose operand is a string literal
  10       "== data 0x2000\n"  // need an empty segment
  11   );
  12   CHECK_TRACE_CONTENTS(
  13       "transform: -- move literal strings to data segment\n"
  14       "transform: adding global variable '__subx_global_1' containing \"test\"\n"  // first generated name carries suffix 1
  15       "transform: line after transform: 'b8 __subx_global_1'\n"  // the literal has been swapped for the global's name
  16   );
  17 }
 18 
 19 //: We don't rely on any transforms running in previous layers, but this layer
 20 //: knows about labels and global variables and will emit them for previous
 21 //: layers to transform.
  22 :(after "Begin Transforms")
  23 Transform.push_back(transform_literal_strings);  // add this layer's pass (defined below) to the Transform list
 24 
  25 :(before "End Globals")
  26 int Next_auto_global = 1;  // numeric suffix for the next generated '__subx_global_N' name
  27 :(before "End Reset")
  28 Next_auto_global = 1;  // start numbering from 1 again
 29 :(code)
 30 void transform_literal_strings(program& p) {
 31   trace(3, "transform") << "-- move literal strings to data segment" << end();
 32   if (p.segments.empty()) return;
 33   vector<line> new_lines;
 34   for (int s = 0;  s < SIZE(p.segments);  ++s) {
 35     segment& seg = p.segments.at(s);
 36     trace(99, "transform") << "segment '" << seg.name << "'" << end();
 37     for (int i = 0;  i < SIZE(seg.lines);  ++i) {
 38 //?       cerr << seg.name << '/' << i << '\n';
 39       line& line = seg.lines.at(i);
 40       for (int j = 0;  j < SIZE(line.words);  ++j) {
 41         word& curr = line.words.at(j);
 42         if (curr.data.at(0) != '"') continue;
 43         ostringstream global_name;
 44         global_name << "__subx_global_" << Next_auto_global;
 45         ++Next_auto_global;
 46         add_global_to_data_segment(global_name.str(), curr, new_lines);
 47         curr.data = global_name.str();
 48       }
 49       trace(99, "transform") << "line after transform: '" << data_to_string(line) << "'" << end();
 50     }
 51   }
 52   segment* data = find(p, "data");
 53   if (data)
 54     data->lines.insert(data->lines.end(), new_lines.begin(), new_lines.end());
 55 }
 56 
 57 void add_global_to_data_segment(const string& name, const word& value, vector<line>& out) {
 58   trace(99, "transform") << "adding global variable '" << name << "' containing " << value.data << end();
 59   // emit label
 60   out.push_back(label(name));
 61   // emit size for size-prefixed array
 62   out.push_back(line());
 63   emit_hex_bytes(out.back(), SIZE(value.data)-/*skip quotes*/2, 4/*bytes*/);
 64   // emit data byte by byte
 65   out.push_back(line());
 66   line& curr = out.back();
 67   for (int i = /*skip start quote*/1;  i < SIZE(value.data)-/*skip end quote*/1;  ++i) {
 68     char c = value.data.at(i);
 69     curr.words.push_back(word());
 70     curr.words.back().data = hex_byte_to_string(c);
 71     curr.words.back().metadata.push_back(string(1, c));
 72   }
 73 }
 74 
 75 //: Within strings, whitespace is significant. So we need to redo our instruction
 76 //: parsing.
 77 
 78 void test_instruction_with_string_literal() {
 79   parse_instruction_character_by_character(
 80       "a \"abc  def\" z\n"  // two spaces inside string
 81   );
 82   CHECK_TRACE_CONTENTS(
 83       "parse2: word: a\n"
 84       "parse2: word: \"abc  def\"\n"
 85       "parse2: word: z\n"
 86   );
 87   // no other words
 88   CHECK_TRACE_COUNT("parse2", 3);
 89 }
 90 
  91 void test_string_literal_in_data_segment() {  // a string literal appearing in the data segment is replaced by a generated global too
  92   run(
  93       "== code 0x1\n"
  94       "b8/copy  X/imm32\n"
  95       "== data 0x2000\n"
  96       "X:\n"
  97       "\"test\"/imm32\n"  // the literal sits in the data segment this time
  98   );
  99   CHECK_TRACE_CONTENTS(
 100       "transform: -- move literal strings to data segment\n"
 101       "transform: adding global variable '__subx_global_1' containing \"test\"\n"
 102       "transform: line after transform: '__subx_global_1'\n"  // the data line now holds only the global's name
 103   );
 104 }
105 
 106 void test_string_literal_with_missing_quote() {  // a string that is never closed must be reported as an error
 107   Hide_errors = true;
 108   run(
 109       "== code 0x1\n"
 110       "b8/copy  \"test/imm32\n"  // opening quote without a closing one
 111       "== data 0x2000\n"
 112   );
 113   CHECK_TRACE_CONTENTS(
 114       "error: unclosed string in: b8/copy  \"test/imm32"  // the message quotes the offending line
 115   );
 116 }
117 
 118 :(before "End Line Parsing Special-cases(line_data -> l)")
 119 if (line_data.find('"') != string::npos) {  // can cause false-positives, but we can handle them
 120   parse_instruction_character_by_character(line_data, l);  // whitespace-preserving parse; appends the parsed line (if any) to l
 121   continue;  // NOTE(review): presumably skips the enclosing loop's default parsing for this line -- confirm
 122 }
123 
124 :(code)
125 void parse_instruction_character_by_character(const string& line_data, vector<line>& out) {
126   if (line_data.find('\n') != string::npos  && line_data.find('\n') != line_data.size()-1) {
127     raise << "parse_instruction_character_by_character: should receive only a single line\n" << end();
128     return;
129   }
130   // parse literals
131   istringstream in(line_data);
132   in >> std::noskipws;
133   line result;
134   result.original = line_data;
135   // add tokens (words or strings) one by one
136   while (has_data(in)) {
137     skip_whitespace(in);
138     if (!has_data(in)) break;
139     char c = in.get();
140     if (c == '#') break;  // comment; drop rest of line
141     if (c == ':') break;  // line metadata; skip for now
142     if (c == '.') {
143       if (!has_data(in)) break;  // comment token at end of line
144       if (isspace(in.peek()))
145         continue;  // '.' followed by space is comment token; skip
146     }
147     result.words.push_back(word());
148     if (c == '"') {
149       // string literal; slurp everything between quotes into data
150       ostringstream d;
151       d << c;
152       while (true) {
153         if (!has_data(in)) {
154           raise << "unclosed string in: " << line_data << end();
155           return;
156         }
157         in >> c;
158         if (c == '\\') {
159           in >> c;
160           if (c == 'n') d << '\n';
161           else if (c == '"') d << '"';
162           else if (c == '\\') d << '\\';
163           else {
164             raise << "parse_instruction_character_by_character: unknown escape sequence '\\" << c << "'\n" << end();
165             return;
166           }
167           continue;
168         } else {
169           d << c;
170         }
171         if (c == '"') break;
172       }
173       result.words.back().data = d.str();
174       result.words.back().original = d.str();
175       // slurp metadata
176       ostringstream m;
177       while (!isspace(in.peek()) && has_data(in)) {  // peek can sometimes trigger eof(), so do it first
178         in >> c;
179         if (c == '/') {
180           if (!m.str().empty()) result.words.back().metadata.push_back(m.str());
181           m.str("");
182         }
183         else {
184           m << c;
185         }
186       }
187       if (!m.str().empty()) result.words.back().metadata.push_back(m.str());
188     }
189     else {
190       // not a string literal; slurp all characters until whitespace
191       ostringstream w;
192       w << c;
193       while (!isspace(in.peek()) && has_data(in)) {  // peek can sometimes trigger eof(), so do it first
194         in >> c;
195         w << c;
196       }
197       parse_word(w.str(), result.words.back());
198     }
199     trace(99, "parse2") << "word: " << to_string(result.words.back()) << end();
200   }
201   if (!result.words.empty())
202     out.push_back(result);
203 }
204 
205 void skip_whitespace(istream& in) {
206   while (has_data(in) && isspace(in.peek())) {
207     in.get();
208   }
209 }
210 
211 void skip_comment(istream& in) {
212   if (has_data(in) && in.peek() == '#') {
213     in.get();
214     while (has_data(in) && in.peek() != '\n') in.get();
215   }
216 }
217 
218 line label(string s) {
219   line result;
220   result.words.push_back(word());
221   result.words.back().data = (s+":");
222   return result;
223 }
224 
225 // helper for tests
226 void parse_instruction_character_by_character(const string& line_data) {
227   vector<line> out;
228   parse_instruction_character_by_character(line_data, out);
229 }
230 
231 void test_parse2_comment_token_in_middle() {
232   parse_instruction_character_by_character(
233       "a . z\n"
234   );
235   CHECK_TRACE_CONTENTS(
236       "parse2: word: a\n"
237       "parse2: word: z\n"
238   );
239   CHECK_TRACE_DOESNT_CONTAIN("parse2: word: .");
240   // no other words
241   CHECK_TRACE_COUNT("parse2", 2);
242 }
243 
244 void test_parse2_word_starting_with_dot() {
245   parse_instruction_character_by_character(
246       "a .b c\n"
247   );
248   CHECK_TRACE_CONTENTS(
249       "parse2: word: a\n"
250       "parse2: word: .b\n"
251       "parse2: word: c\n"
252   );
253 }
254 
255 void test_parse2_comment_token_at_start() {
256   parse_instruction_character_by_character(
257       ". a b\n"
258   );
259   CHECK_TRACE_CONTENTS(
260       "parse2: word: a\n"
261       "parse2: word: b\n"
262   );
263   CHECK_TRACE_DOESNT_CONTAIN("parse2: word: .");
264 }
265 
266 void test_parse2_comment_token_at_end() {
267   parse_instruction_character_by_character(
268       "a b .\n"
269   );
270   CHECK_TRACE_CONTENTS(
271       "parse2: word: a\n"
272       "parse2: word: b\n"
273   );
274   CHECK_TRACE_DOESNT_CONTAIN("parse2: word: .");
275 }
276 
277 void test_parse2_word_starting_with_dot_at_start() {
278   parse_instruction_character_by_character(
279       ".a b c\n"
280   );
281   CHECK_TRACE_CONTENTS(
282       "parse2: word: .a\n"
283       "parse2: word: b\n"
284       "parse2: word: c\n"
285   );
286 }
287 
288 void test_parse2_metadata() {
289   parse_instruction_character_by_character(
290       ".a b/c d\n"
291   );
292   CHECK_TRACE_CONTENTS(
293       "parse2: word: .a\n"
294       "parse2: word: b /c\n"
295       "parse2: word: d\n"
296   );
297 }
298 
299 void test_parse2_string_with_metadata() {
300   parse_instruction_character_by_character(
301       "a \"bc  def\"/disp32 g\n"
302   );
303   CHECK_TRACE_CONTENTS(
304       "parse2: word: a\n"
305       "parse2: word: \"bc  def\" /disp32\n"
306       "parse2: word: g\n"
307   );
308 }
309 
310 void test_parse2_string_with_metadata_at_end() {
311   parse_instruction_character_by_character(
312       "a \"bc  def\"/disp32\n"
313   );
314   CHECK_TRACE_CONTENTS(
315       "parse2: word: a\n"
316       "parse2: word: \"bc  def\" /disp32\n"
317   );
318 }
319 
320 void test_parse2_string_with_metadata_at_end_of_line_without_newline() {
321   parse_instruction_character_by_character(
322       "68/push \"test\"/f"  // no newline, which is how calls from parse() will look
323   );
324   CHECK_TRACE_CONTENTS(
325       "parse2: word: 68 /push\n"
326       "parse2: word: \"test\" /f\n"
327   );
328 }
329 
330 //: Make sure slashes inside strings don't trigger adding stuff from inside the
331 //: string to metadata.
332 
333 void test_parse2_string_containing_slashes() {
334   parse_instruction_character_by_character(
335       "a \"bc/def\"/disp32\n"
336   );
337   CHECK_TRACE_CONTENTS(
338       "parse2: word: \"bc/def\" /disp32\n"
339   );
340 }
341 
342 void test_instruction_with_string_literal_with_escaped_quote() {
343   parse_instruction_character_by_character(
344       "\"a\\\"b\"\n"  // escaped quote inside string
345   );
346   CHECK_TRACE_CONTENTS(
347       "parse2: word: \"a\"b\"\n"
348   );
349   // no other words
350   CHECK_TRACE_COUNT("parse2", 1);
351 }
352 
353 void test_instruction_with_string_literal_with_escaped_backslash() {
354   parse_instruction_character_by_character(
355       "\"a\\\\b\"\n"  // escaped backslash inside string
356   );
357   CHECK_TRACE_CONTENTS(
358       "parse2: word: \"a\\b\"\n"
359   );
360   // no other words
361   CHECK_TRACE_COUNT("parse2", 1);
362 }