https://github.com/akkartik/mu/blob/master/038---literal_strings.cc
  1 //: Allow instructions to mention literals directly.
  2 //:
  3 //: This layer will transparently move them to the data segment (the segment
  4 //: named 'data', which must already exist in the program).
  5 
  6 void test_transform_literal_string() {
  7   run(
  8       "== code 0x1\n"
  9       "b8/copy  \"test\"/imm32\n"
 10       "== data 0x2000\n"  // need an empty segment
 11   );
 12   CHECK_TRACE_CONTENTS(
 13       "transform: -- move literal strings to data segment\n"
 14       "transform: adding global variable '__subx_global_1' containing \"test\"\n"
 15       "transform: line after transform: 'b8 __subx_global_1'\n"
 16   );
 17 }
 18 
 19 //: We don't rely on any transforms running in previous layers, but this layer
 20 //: knows about labels and global variables and will emit them for previous
 21 //: layers to transform.
  22 :(after "Begin Transforms")
  23 Transform.push_back(transform_literal_strings);  // append this layer's pass to the list of transforms
 24 
  25 :(before "End Globals")
  26 int Next_auto_global = 1;  // suffix N used for the next generated '__subx_global_N' name
  27 :(before "End Reset")
  28 Next_auto_global = 1;  // numbering of generated globals starts over at 1 on reset
 29 :(code)
 30 void transform_literal_strings(program& p) {
 31   trace(3, "transform") << "-- move literal strings to data segment" << end();
 32   if (p.segments.empty()) return;
 33   vector<line> new_lines;
 34   for (int s = 0;  s < SIZE(p.segments);  ++s) {
 35     segment& seg = p.segments.at(s);
 36     trace(99, "transform") << "segment '" << seg.name << "'" << end();
 37     for (int i = 0;  i < SIZE(seg.lines);  ++i) {
 38 //?       cerr << seg.name << '/' << i << '\n';
 39       line& line = seg.lines.at(i);
 40       for (int j = 0;  j < SIZE(line.words);  ++j) {
 41         word& curr = line.words.at(j);
 42         if (curr.data.at(0) != '"') continue;
 43         ostringstream global_name;
 44         global_name << "__subx_global_" << Next_auto_global;
 45         ++Next_auto_global;
 46         add_global_to_data_segment(global_name.str(), curr, new_lines);
 47         curr.data = global_name.str();
 48       }
 49       trace(99, "transform") << "line after transform: '" << data_to_string(line) << "'" << end();
 50     }
 51   }
 52   segment* data = find(p, "data");
 53   if (data)
 54     data->lines.insert(data->lines.end(), new_lines.begin(), new_lines.end());
 55 }
 56 
 57 void add_global_to_data_segment(const string& name, const word& value, vector<line>& out) {
 58   trace(99, "transform") << "adding global variable '" << name << "' containing " << value.data << end();
 59   // emit label
 60   out.push_back(label(name));
 61   // emit size for size-prefixed array
 62   out.push_back(line());
 63   emit_hex_bytes(out.back(), SIZE(value.data)-/*skip quotes*/2, 4/*bytes*/);
 64   // emit data byte by byte
 65   out.push_back(line());
 66   line& curr = out.back();
 67   for (int i = /*skip start quote*/1;  i < SIZE(value.data)-/*skip end quote*/1;  ++i) {
 68     char c = value.data.at(i);
 69     curr.words.push_back(word());
 70     curr.words.back().data = hex_byte_to_string(c);
 71     curr.words.back().metadata.push_back(string(1, c));
 72   }
 73 }
 74 
 75 //: Within strings, whitespace is significant. So we need to redo our instruction
 76 //: parsing.
 77 
 78 void test_instruction_with_string_literal() {
 79   parse_instruction_character_by_character(
 80       "a \"abc  def\" z\n"  // two spaces inside string
 81   );
 82   CHECK_TRACE_CONTENTS(
 83       "parse2: word: a\n"
 84       "parse2: word: \"abc  def\"\n"
 85       "parse2: word: z\n"
 86   );
 87   // no other words
 88   CHECK_TRACE_COUNT("parse2", 3);
 89 }
 90 
 91 void test_string_literal_in_data_segment() {
 92   run(
 93       "== code 0x1\n"
 94       "b8/copy  X/imm32\n"
 95       "== data 0x2000\n"
 96       "X:\n"
 97       "\"test\"/imm32\n"
 98   );
 99   CHECK_TRACE_CONTENTS(
100       "transform: -- move literal strings to data segment\n"
101       "transform: adding global variable '__subx_global_1' containing \"test\"\n"
102       "transform: line after transform: '__subx_global_1'\n"
103   );
104 }
105 
 106 :(before "End Line Parsing Special-cases(line_data -> l)")
 107 if (line_data.find('"') != string::npos) {  // can cause false-positives, but we can handle them
 108   parse_instruction_character_by_character(line_data, l);  // quote-aware parser; appends the parsed line (if any) to l
 109   continue;  // NOTE(review): presumably skips the default parsing in the enclosing loop, which isn't visible here -- confirm
 110 }
111 
112 :(code)
113 void parse_instruction_character_by_character(const string& line_data, vector<line>& out) {
114   if (line_data.find('\n') != string::npos  && line_data.find('\n') != line_data.size()-1) {
115     raise << "parse_instruction_character_by_character: should receive only a single line\n" << end();
116     return;
117   }
118   // parse literals
119   istringstream in(line_data);
120   in >> std::noskipws;
121   line result;
122   result.original = line_data;
123   // add tokens (words or strings) one by one
124   while (has_data(in)) {
125     skip_whitespace(in);
126     if (!has_data(in)) break;
127     char c = in.get();
128     if (c == '#') break;  // comment; drop rest of line
129     if (c == ':') break;  // line metadata; skip for now
130     if (c == '.') {
131       if (!has_data(in)) break;  // comment token at end of line
132       if (isspace(in.peek()))
133         continue;  // '.' followed by space is comment token; skip
134     }
135     result.words.push_back(word());
136     if (c == '"') {
137       // string literal; slurp everything between quotes into data
138       ostringstream d;
139       d << c;
140       while (has_data(in)) {
141         in >> c;
142         if (c == '\\') {
143           in >> c;
144           if (c == 'n') d << '\n';
145           else if (c == '"') d << '"';
146           else if (c == '\\') d << '\\';
147           else {
148             raise << "parse_instruction_character_by_character: unknown escape sequence '\\" << c << "'\n" << end();
149             return;
150           }
151           continue;
152         } else {
153           d << c;
154         }
155         if (c == '"') break;
156       }
157       result.words.back().data = d.str();
158       result.words.back().original = d.str();
159       // slurp metadata
160       ostringstream m;
161       while (!isspace(in.peek()) && has_data(in)) {  // peek can sometimes trigger eof(), so do it first
162         in >> c;
163         if (c == '/') {
164           if (!m.str().empty()) result.words.back().metadata.push_back(m.str());
165           m.str("");
166         }
167         else {
168           m << c;
169         }
170       }
171       if (!m.str().empty()) result.words.back().metadata.push_back(m.str());
172     }
173     else {
174       // not a string literal; slurp all characters until whitespace
175       ostringstream w;
176       w << c;
177       while (!isspace(in.peek()) && has_data(in)) {  // peek can sometimes trigger eof(), so do it first
178         in >> c;
179         w << c;
180       }
181       parse_word(w.str(), result.words.back());
182     }
183     trace(99, "parse2") << "word: " << to_string(result.words.back()) << end();
184   }
185   if (!result.words.empty())
186     out.push_back(result);
187 }
188 
189 void skip_whitespace(istream& in) {
190   while (has_data(in) && isspace(in.peek())) {
191     in.get();
192   }
193 }
194 
195 void skip_comment(istream& in) {
196   if (has_data(in) && in.peek() == '#') {
197     in.get();
198     while (has_data(in) && in.peek() != '\n') in.get();
199   }
200 }
201 
202 line label(string s) {
203   line result;
204   result.words.push_back(word());
205   result.words.back().data = (s+":");
206   return result;
207 }
208 
209 // helper for tests
210 void parse_instruction_character_by_character(const string& line_data) {
211   vector<line> out;
212   parse_instruction_character_by_character(line_data, out);
213 }
214 
215 void test_parse2_comment_token_in_middle() {
216   parse_instruction_character_by_character(
217       "a . z\n"
218   );
219   CHECK_TRACE_CONTENTS(
220       "parse2: word: a\n"
221       "parse2: word: z\n"
222   );
223   CHECK_TRACE_DOESNT_CONTAIN("parse2: word: .");
224   // no other words
225   CHECK_TRACE_COUNT("parse2", 2);
226 }
227 
228 void test_parse2_word_starting_with_dot() {
229   parse_instruction_character_by_character(
230       "a .b c\n"
231   );
232   CHECK_TRACE_CONTENTS(
233       "parse2: word: a\n"
234       "parse2: word: .b\n"
235       "parse2: word: c\n"
236   );
237 }
238 
239 void test_parse2_comment_token_at_start() {
240   parse_instruction_character_by_character(
241       ". a b\n"
242   );
243   CHECK_TRACE_CONTENTS(
244       "parse2: word: a\n"
245       "parse2: word: b\n"
246   );
247   CHECK_TRACE_DOESNT_CONTAIN("parse2: word: .");
248 }
249 
250 void test_parse2_comment_token_at_end() {
251   parse_instruction_character_by_character(
252       "a b .\n"
253   );
254   CHECK_TRACE_CONTENTS(
255       "parse2: word: a\n"
256       "parse2: word: b\n"
257   );
258   CHECK_TRACE_DOESNT_CONTAIN("parse2: word: .");
259 }
260 
261 void test_parse2_word_starting_with_dot_at_start() {
262   parse_instruction_character_by_character(
263       ".a b c\n"
264   );
265   CHECK_TRACE_CONTENTS(
266       "parse2: word: .a\n"
267       "parse2: word: b\n"
268       "parse2: word: c\n"
269   );
270 }
271 
272 void test_parse2_metadata() {
273   parse_instruction_character_by_character(
274       ".a b/c d\n"
275   );
276   CHECK_TRACE_CONTENTS(
277       "parse2: word: .a\n"
278       "parse2: word: b /c\n"
279       "parse2: word: d\n"
280   );
281 }
282 
283 void test_parse2_string_with_metadata() {
284   parse_instruction_character_by_character(
285       "a \"bc  def\"/disp32 g\n"
286   );
287   CHECK_TRACE_CONTENTS(
288       "parse2: word: a\n"
289       "parse2: word: \"bc  def\" /disp32\n"
290       "parse2: word: g\n"
291   );
292 }
293 
294 void test_parse2_string_with_metadata_at_end() {
295   parse_instruction_character_by_character(
296       "a \"bc  def\"/disp32\n"
297   );
298   CHECK_TRACE_CONTENTS(
299       "parse2: word: a\n"
300       "parse2: word: \"bc  def\" /disp32\n"
301   );
302 }
303 
304 void test_parse2_string_with_metadata_at_end_of_line_without_newline() {
305   parse_instruction_character_by_character(
306       "68/push \"test\"/f"  // no newline, which is how calls from parse() will look
307   );
308   CHECK_TRACE_CONTENTS(
309       "parse2: word: 68 /push\n"
310       "parse2: word: \"test\" /f\n"
311   );
312 }
313 
314 //: Make sure slashes inside strings don't trigger adding stuff from inside the
315 //: string to metadata.
316 
317 void test_parse2_string_containing_slashes() {
318   parse_instruction_character_by_character(
319       "a \"bc/def\"/disp32\n"
320   );
321   CHECK_TRACE_CONTENTS(
322       "parse2: word: \"bc/def\" /disp32\n"
323   );
324 }
325 
326 void test_instruction_with_string_literal_with_escaped_quote() {
327   parse_instruction_character_by_character(
328       "\"a\\\"b\"\n"  // escaped quote inside string
329   );
330   CHECK_TRACE_CONTENTS(
331       "parse2: word: \"a\"b\"\n"
332   );
333   // no other words
334   CHECK_TRACE_COUNT("parse2", 1);
335 }
336 
337 void test_instruction_with_string_literal_with_escaped_backslash() {
338   parse_instruction_character_by_character(
339       "\"a\\\\b\"\n"  // escaped backslash inside string
340   );
341   CHECK_TRACE_CONTENTS(
342       "parse2: word: \"a\\b\"\n"
343   );
344   // no other words
345   CHECK_TRACE_COUNT("parse2", 1);
346 }