https://github.com/akkartik/mu/blob/master/subx/038---literal_strings.cc
  1 //: Allow instructions to mention literals directly.
  2 //:
  3 //: This layer will transparently move them to the segment named 'data'
  4 //: (looked up by name; the test below supplies one even though it is empty).
  5 
  6 void test_transform_literal_string() {
  7   run(
  8       "== code 0x1\n"
  9       "b8/copy  \"test\"/imm32\n"
 10       "== data 0x2000\n"  // need an empty segment
 11   );
 12   CHECK_TRACE_CONTENTS(
 13       "transform: -- move literal strings to data segment\n"
 14       "transform: adding global variable '__subx_global_1' containing \"test\"\n"
 15       "transform: instruction after transform: 'b8 __subx_global_1'\n"
 16   );
 17 }
 18 
 19 //: We don't rely on any transforms running in previous layers, but this layer
 20 //: knows about labels and global variables and will emit them for previous
 21 //: layers to transform.
 22 :(after "Begin Transforms")
 23 // Begin Level-3 Transforms
    // Append the literal-string pass to the list of transforms.
    // NOTE(review): the Begin/End comments around this statement look like
    // anchors that other layers' directives match on -- keep them verbatim.
 24 Transform.push_back(transform_literal_strings);
 25 // End Level-3 Transforms
 26 
 27 :(before "End Globals")
    // Suffix for the next auto-generated global name ('__subx_global_<n>').
    // transform_literal_strings() increments it once per literal it moves.
    // NOTE(review): nothing in this layer resets it; confirm whether tests rely
    // on a reset elsewhere.
 28 int Next_auto_global = 1;
 29 :(code)
 30 void transform_literal_strings(program& p) {
 31   trace(3, "transform") << "-- move literal strings to data segment" << end();
 32   if (p.segments.empty()) return;
 33   segment& code = *find(p, "code");
 34   segment& data = *find(p, "data");
 35   for (int i = 0;  i < SIZE(code.lines);  ++i) {
 36     line& inst = code.lines.at(i);
 37     for (int j = 0;  j < SIZE(inst.words);  ++j) {
 38       word& curr = inst.words.at(j);
 39       if (curr.data.at(0) != '"') continue;
 40       ostringstream global_name;
 41       global_name << "__subx_global_" << Next_auto_global;
 42       ++Next_auto_global;
 43       add_global_to_data_segment(global_name.str(), curr, data);
 44       curr.data = global_name.str();
 45     }
 46     trace(99, "transform") << "instruction after transform: '" << data_to_string(inst) << "'" << end();
 47   }
 48 }
 49 
 50 void add_global_to_data_segment(const string& name, const word& value, segment& data) {
 51   trace(99, "transform") << "adding global variable '" << name << "' containing " << value.data << end();
 52   // emit label
 53   data.lines.push_back(label(name));
 54   // emit size for size-prefixed array
 55   data.lines.push_back(line());
 56   emit_hex_bytes(data.lines.back(), SIZE(value.data)-/*skip quotes*/2, 4/*bytes*/);
 57   // emit data byte by byte
 58   data.lines.push_back(line());
 59   line& curr = data.lines.back();
 60   for (int i = /*skip start quote*/1;  i < SIZE(value.data)-/*skip end quote*/1;  ++i) {
 61     char c = value.data.at(i);
 62     curr.words.push_back(word());
 63     curr.words.back().data = hex_byte_to_string(c);
 64     curr.words.back().metadata.push_back(string(1, c));
 65   }
 66 }
 67 
 68 //: Within strings, whitespace is significant. So we need to redo our instruction
 69 //: parsing.
 70 
 71 void test_instruction_with_string_literal() {
 72   parse_instruction_character_by_character(
 73       "a \"abc  def\" z\n"  // two spaces inside string
 74   );
 75   CHECK_TRACE_CONTENTS(
 76       "parse2: word: a\n"
 77       "parse2: word: \"abc  def\"\n"
 78       "parse2: word: z\n"
 79   );
 80   // no other words
 81   CHECK_TRACE_COUNT("parse2", 3);
 82 }
 83 
 84 :(before "End Line Parsing Special-cases(line_data -> l)")
    // Any line containing a double quote is routed to the character-level
    // parser, which keeps whitespace inside string literals intact, and the
    // enclosing loop moves on to the next line.
 85 if (line_data.find('"') != string::npos) {  // can cause false-positives, but we can handle them
 86   parse_instruction_character_by_character(line_data, l);
 87   continue;
 88 }
 89 
 90 :(code)
 91 void parse_instruction_character_by_character(const string& line_data, vector<line>& out) {
 92   if (line_data.find('\n') != string::npos  && line_data.find('\n') != line_data.size()-1) {
 93     raise << "parse_instruction_character_by_character: should receive only a single line\n" << end();
 94     return;
 95   }
 96   // parse literals
 97   istringstream in(line_data);
 98   in >> std::noskipws;
 99   line result;
100   result.original = line_data;
101   // add tokens (words or strings) one by one
102   while (has_data(in)) {
103     skip_whitespace(in);
104     if (!has_data(in)) break;
105     char c = in.get();
106     if (c == '#') break;  // comment; drop rest of line
107     if (c == ':') break;  // line metadata; skip for now
108     if (c == '.') {
109       if (!has_data(in)) break;  // comment token at end of line
110       if (isspace(in.peek()))
111         continue;  // '.' followed by space is comment token; skip
112     }
113     result.words.push_back(word());
114     if (c == '"') {
115       // string literal; slurp everything between quotes into data
116       ostringstream d;
117       d << c;
118       while (has_data(in)) {
119         in >> c;
120         if (c == '\\') {
121           in >> c;
122           if (c == 'n') d << '\n';
123           else if (c == '"') d << '"';
124           else if (c == '\\') d << '\\';
125           else {
126             raise << "parse_instruction_character_by_character: unknown escape sequence '\\" << c << "'\n" << end();
127             return;
128           }
129           continue;
130         } else {
131           d << c;
132         }
133         if (c == '"') break;
134       }
135       result.words.back().data = d.str();
136       // slurp metadata
137       ostringstream m;
138       while (!isspace(in.peek()) && has_data(in)) {  // peek can sometimes trigger eof(), so do it first
139         in >> c;
140         if (c == '/') {
141           if (!m.str().empty()) result.words.back().metadata.push_back(m.str());
142           m.str("");
143         }
144         else {
145           m << c;
146         }
147       }
148       if (!m.str().empty()) result.words.back().metadata.push_back(m.str());
149     }
150     else {
151       // not a string literal; slurp all characters until whitespace
152       ostringstream w;
153       w << c;
154       while (!isspace(in.peek()) && has_data(in)) {  // peek can sometimes trigger eof(), so do it first
155         in >> c;
156         w << c;
157       }
158       parse_word(w.str(), result.words.back());
159     }
160     trace(99, "parse2") << "word: " << to_string(result.words.back()) << end();
161   }
162   if (!result.words.empty())
163     out.push_back(result);
164 }
165 
166 void skip_whitespace(istream& in) {
167   while (true) {
168     if (has_data(in) && isspace(in.peek())) in.get();
169     else break;
170   }
171 }
172 
173 void skip_comment(istream& in) {
174   if (has_data(in) && in.peek() == '#') {
175     in.get();
176     while (has_data(in) && in.peek() != '\n') in.get();
177   }
178 }
179 
180 line label(string s) {
181   line result;
182   result.words.push_back(word());
183   result.words.back().data = (s+":");
184   return result;
185 }
186 
187 // helper for tests
188 void parse_instruction_character_by_character(const string& line_data) {
189   vector<line> out;
190   parse_instruction_character_by_character(line_data, out);
191 }
192 
193 void test_parse2_comment_token_in_middle() {
194   parse_instruction_character_by_character(
195       "a . z\n"
196   );
197   CHECK_TRACE_CONTENTS(
198       "parse2: word: a\n"
199       "parse2: word: z\n"
200   );
201   CHECK_TRACE_DOESNT_CONTAIN("parse2: word: .");
202   // no other words
203   CHECK_TRACE_COUNT("parse2", 2);
204 }
205 
206 void test_parse2_word_starting_with_dot() {
207   parse_instruction_character_by_character(
208       "a .b c\n"
209   );
210   CHECK_TRACE_CONTENTS(
211       "parse2: word: a\n"
212       "parse2: word: .b\n"
213       "parse2: word: c\n"
214   );
215 }
216 
217 void test_parse2_comment_token_at_start() {
218   parse_instruction_character_by_character(
219       ". a b\n"
220   );
221   CHECK_TRACE_CONTENTS(
222       "parse2: word: a\n"
223       "parse2: word: b\n"
224   );
225   CHECK_TRACE_DOESNT_CONTAIN("parse2: word: .");
226 }
227 
228 void test_parse2_comment_token_at_end() {
229   parse_instruction_character_by_character(
230       "a b .\n"
231   );
232   CHECK_TRACE_CONTENTS(
233       "parse2: word: a\n"
234       "parse2: word: b\n"
235   );
236   CHECK_TRACE_DOESNT_CONTAIN("parse2: word: .");
237 }
238 
239 void test_parse2_word_starting_with_dot_at_start() {
240   parse_instruction_character_by_character(
241       ".a b c\n"
242   );
243   CHECK_TRACE_CONTENTS(
244       "parse2: word: .a\n"
245       "parse2: word: b\n"
246       "parse2: word: c\n"
247   );
248 }
249 
250 void test_parse2_metadata() {
251   parse_instruction_character_by_character(
252       ".a b/c d\n"
253   );
254   CHECK_TRACE_CONTENTS(
255       "parse2: word: .a\n"
256       "parse2: word: b /c\n"
257       "parse2: word: d\n"
258   );
259 }
260 
261 void test_parse2_string_with_metadata() {
262   parse_instruction_character_by_character(
263       "a \"bc  def\"/disp32 g\n"
264   );
265   CHECK_TRACE_CONTENTS(
266       "parse2: word: a\n"
267       "parse2: word: \"bc  def\" /disp32\n"
268       "parse2: word: g\n"
269   );
270 }
271 
272 void test_parse2_string_with_metadata_at_end() {
273   parse_instruction_character_by_character(
274       "a \"bc  def\"/disp32\n"
275   );
276   CHECK_TRACE_CONTENTS(
277       "parse2: word: a\n"
278       "parse2: word: \"bc  def\" /disp32\n"
279   );
280 }
281 
282 void test_parse2_string_with_metadata_at_end_of_line_without_newline() {
283   parse_instruction_character_by_character(
284       "68/push \"test\"/f"  // no newline, which is how calls from parse() will look
285   );
286   CHECK_TRACE_CONTENTS(
287       "parse2: word: 68 /push\n"
288       "parse2: word: \"test\" /f\n"
289   );
290 }
291 
292 //: Make sure slashes inside strings don't trigger adding stuff from inside the
293 //: string to metadata.
294 
295 void test_parse2_string_containing_slashes() {
296   parse_instruction_character_by_character(
297       "a \"bc/def\"/disp32\n"
298   );
299   CHECK_TRACE_CONTENTS(
300       "parse2: word: \"bc/def\" /disp32\n"
301   );
302 }
303 
304 void test_instruction_with_string_literal_with_escaped_quote() {
305   parse_instruction_character_by_character(
306       "\"a\\\"b\"\n"  // escaped quote inside string
307   );
308   CHECK_TRACE_CONTENTS(
309       "parse2: word: \"a\"b\"\n"
310   );
311   // no other words
312   CHECK_TRACE_COUNT("parse2", 1);
313 }
314 
315 void test_instruction_with_string_literal_with_escaped_backslash() {
316   parse_instruction_character_by_character(
317       "\"a\\\\b\"\n"  // escaped backslash inside string
318   );
319   CHECK_TRACE_CONTENTS(
320       "parse2: word: \"a\\b\"\n"
321   );
322   // no other words
323   CHECK_TRACE_COUNT("parse2", 1);
324 }