https://github.com/akkartik/mu/blob/master/subx/038---literal_strings.cc
  1 //: Allow instructions to mention literals directly.
  2 //:
  3 //: This layer will transparently move them to the global segment (assumed to
  4 //: always be the second segment).
  5 
  6 void test_transform_literal_string() {
  7   run(
  8       "== code\n"
  9       "b8/copy  \"test\"/imm32\n"
 10       "== data\n"  // need to manually create the segment for now
 11   );
 12   CHECK_TRACE_CONTENTS(
 13       "transform: -- move literal strings to data segment\n"
 14       "transform: adding global variable '__subx_global_1' containing \"test\"\n"
 15       "transform: instruction after transform: 'b8 __subx_global_1'\n"
 16   );
 17 }
 18 
 19 //: We don't rely on any transforms running in previous layers, but this layer
 20 //: knows about labels and global variables and will emit them for previous
 21 //: layers to transform.
:(after "Begin Transforms")
// Begin Level-3 Transforms
//: Register the literal-string pass in the list of transforms.
Transform.push_back(transform_literal_strings);
// End Level-3 Transforms
 26 
:(before "End Globals")
//: Suffix for the next auto-generated global name ('__subx_global_<n>').
//: transform_literal_strings() increments it once per literal it moves.
int Next_auto_global = 1;
 29 :(code)
 30 void transform_literal_strings(program& p) {
 31   trace(3, "transform") << "-- move literal strings to data segment" << end();
 32   if (p.segments.empty()) return;
 33   segment& code = p.segments.at(0);
 34   segment data;
 35   for (int i = 0;  i < SIZE(code.lines);  ++i) {
 36     line& inst = code.lines.at(i);
 37     for (int j = 0;  j < SIZE(inst.words);  ++j) {
 38       word& curr = inst.words.at(j);
 39       if (curr.data.at(0) != '"') continue;
 40       ostringstream global_name;
 41       global_name << "__subx_global_" << Next_auto_global;
 42       ++Next_auto_global;
 43       add_global_to_data_segment(global_name.str(), curr, data);
 44       curr.data = global_name.str();
 45     }
 46     trace(99, "transform") << "instruction after transform: '" << data_to_string(inst) << "'" << end();
 47   }
 48   if (data.lines.empty()) return;
 49   if (SIZE(p.segments) < 2) {
 50     p.segments.resize(2);
 51     p.segments.at(1).lines.swap(data.lines);
 52   }
 53   vector<line>& existing_data = p.segments.at(1).lines;
 54   existing_data.insert(existing_data.end(), data.lines.begin(), data.lines.end());
 55 }
 56 
 57 void add_global_to_data_segment(const string& name, const word& value, segment& data) {
 58   trace(99, "transform") << "adding global variable '" << name << "' containing " << value.data << end();
 59   // emit label
 60   data.lines.push_back(label(name));
 61   // emit size for size-prefixed array
 62   data.lines.push_back(line());
 63   emit_hex_bytes(data.lines.back(), SIZE(value.data)-/*skip quotes*/2, 4/*bytes*/);
 64   // emit data byte by byte
 65   data.lines.push_back(line());
 66   line& curr = data.lines.back();
 67   for (int i = /*skip start quote*/1;  i < SIZE(value.data)-/*skip end quote*/1;  ++i) {
 68     char c = value.data.at(i);
 69     curr.words.push_back(word());
 70     curr.words.back().data = hex_byte_to_string(c);
 71     curr.words.back().metadata.push_back(string(1, c));
 72   }
 73 }
 74 
 75 //: Within strings, whitespace is significant. So we need to redo our instruction
 76 //: parsing.
 77 
 78 void test_instruction_with_string_literal() {
 79   parse_instruction_character_by_character(
 80       "a \"abc  def\" z\n"  // two spaces inside string
 81   );
 82   CHECK_TRACE_CONTENTS(
 83       "parse2: word: a\n"
 84       "parse2: word: \"abc  def\"\n"
 85       "parse2: word: z\n"
 86   );
 87   // no other words
 88   CHECK_TRACE_COUNT("parse2", 3);
 89 }
 90 
:(before "End Line Parsing Special-cases(line_data -> l)")
//: Any line containing a double quote is handed to the character-level
//: parser, which keeps whitespace inside string literals intact.
if (line_data.find('"') != string::npos) {  // can cause false-positives, but we can handle them
  parse_instruction_character_by_character(line_data, l);
  continue;
}
 96 
 97 :(code)
 98 void parse_instruction_character_by_character(const string& line_data, vector<line>& out) {
 99   if (line_data.find('\n') != string::npos  && line_data.find('\n') != line_data.size()-1) {
100     raise << "parse_instruction_character_by_character: should receive only a single line\n" << end();
101     return;
102   }
103   // parse literals
104   istringstream in(line_data);
105   in >> std::noskipws;
106   line result;
107   // add tokens (words or strings) one by one
108   while (has_data(in)) {
109     skip_whitespace(in);
110     if (!has_data(in)) break;
111     char c = in.get();
112     if (c == '#') break;  // comment; drop rest of line
113     if (c == ':') break;  // line metadata; skip for now
114     if (c == '.') {
115       if (!has_data(in)) break;  // comment token at end of line
116       if (isspace(in.peek()))
117         continue;  // '.' followed by space is comment token; skip
118     }
119     result.words.push_back(word());
120     if (c == '"') {
121       // slurp word data
122       ostringstream d;
123       d << c;
124       while (has_data(in)) {
125         in >> c;
126         d << c;
127         if (c == '"') break;
128       }
129       result.words.back().data = d.str();
130       // slurp metadata
131       ostringstream m;
132       while (!isspace(in.peek()) && has_data(in)) {
133         in >> c;
134         if (c == '/') {
135           if (!m.str().empty()) result.words.back().metadata.push_back(m.str());
136           m.str("");
137         }
138         else {
139           m << c;
140         }
141       }
142       if (!m.str().empty()) result.words.back().metadata.push_back(m.str());
143     }
144     else {
145       // slurp all characters until whitespace
146       ostringstream w;
147       w << c;
148       while (!isspace(in.peek()) && has_data(in)) {  // peek can sometimes trigger eof(), so do it first
149         in >> c;
150         w << c;
151       }
152       parse_word(w.str(), result.words.back());
153     }
154     trace(99, "parse2") << "word: " << to_string(result.words.back()) << end();
155   }
156   if (!result.words.empty())
157     out.push_back(result);
158 }
159 
160 void skip_whitespace(istream& in) {
161   while (true) {
162     if (has_data(in) && isspace(in.peek())) in.get();
163     else break;
164   }
165 }
166 
167 void skip_comment(istream& in) {
168   if (has_data(in) && in.peek() == '#') {
169     in.get();
170     while (has_data(in) && in.peek() != '\n') in.get();
171   }
172 }
173 
174 // helper for tests
175 void parse_instruction_character_by_character(const string& line_data) {
176   vector<line> out;
177   parse_instruction_character_by_character(line_data, out);
178 }
179 
180 void test_parse2_comment_token_in_middle() {
181   parse_instruction_character_by_character(
182       "a . z\n"
183   );
184   CHECK_TRACE_CONTENTS(
185       "parse2: word: a\n"
186       "parse2: word: z\n"
187   );
188   CHECK_TRACE_DOESNT_CONTAIN("parse2: word: .");
189   // no other words
190   CHECK_TRACE_COUNT("parse2", 2);
191 }
192 
193 void test_parse2_word_starting_with_dot() {
194   parse_instruction_character_by_character(
195       "a .b c\n"
196   );
197   CHECK_TRACE_CONTENTS(
198       "parse2: word: a\n"
199       "parse2: word: .b\n"
200       "parse2: word: c\n"
201   );
202 }
203 
204 void test_parse2_comment_token_at_start() {
205   parse_instruction_character_by_character(
206       ". a b\n"
207   );
208   CHECK_TRACE_CONTENTS(
209       "parse2: word: a\n"
210       "parse2: word: b\n"
211   );
212   CHECK_TRACE_DOESNT_CONTAIN("parse2: word: .");
213 }
214 
215 void test_parse2_comment_token_at_end() {
216   parse_instruction_character_by_character(
217       "a b .\n"
218   );
219   CHECK_TRACE_CONTENTS(
220       "parse2: word: a\n"
221       "parse2: word: b\n"
222   );
223   CHECK_TRACE_DOESNT_CONTAIN("parse2: word: .");
224 }
225 
226 void test_parse2_word_starting_with_dot_at_start() {
227   parse_instruction_character_by_character(
228       ".a b c\n"
229   );
230   CHECK_TRACE_CONTENTS(
231       "parse2: word: .a\n"
232       "parse2: word: b\n"
233       "parse2: word: c\n"
234   );
235 }
236 
237 void test_parse2_metadata() {
238   parse_instruction_character_by_character(
239       ".a b/c d\n"
240   );
241   CHECK_TRACE_CONTENTS(
242       "parse2: word: .a\n"
243       "parse2: word: b /c\n"
244       "parse2: word: d\n"
245   );
246 }
247 
248 void test_parse2_string_with_metadata() {
249   parse_instruction_character_by_character(
250       "a \"bc  def\"/disp32 g\n"
251   );
252   CHECK_TRACE_CONTENTS(
253       "parse2: word: a\n"
254       "parse2: word: \"bc  def\" /disp32\n"
255       "parse2: word: g\n"
256   );
257 }
258 
259 void test_parse2_string_with_metadata_at_end() {
260   parse_instruction_character_by_character(
261       "a \"bc  def\"/disp32\n"
262   );
263   CHECK_TRACE_CONTENTS(
264       "parse2: word: a\n"
265       "parse2: word: \"bc  def\" /disp32\n"
266   );
267 }
268 
269 void test_parse2_string_with_metadata_at_end_of_line_without_newline() {
270   parse_instruction_character_by_character(
271       "68/push \"test\"/f"  // no newline, which is how calls from parse() will look
272   );
273   CHECK_TRACE_CONTENTS(
274       "parse2: word: 68 /push\n"
275       "parse2: word: \"test\" /f\n"
276   );
277 }
278 
279 //: Make sure slashes inside strings don't trigger adding stuff from inside the
280 //: string to metadata.
281 
282 void test_parse2_string_containing_slashes() {
283   parse_instruction_character_by_character(
284       "a \"bc/def\"/disp32\n"
285   );
286   CHECK_TRACE_CONTENTS(
287       "parse2: word: \"bc/def\" /disp32\n"
288   );
289 }