https://github.com/akkartik/mu/blob/master/subx/038---literal_strings.cc
   1 //: Allow instructions to mention string literals directly.
   2 //:
   3 //: This layer will transparently move them to the data segment (assumed to
   4 //: always be the second segment).
  5 
  6 :(scenario transform_literal_string)
  7 == code
  8 b8/copy  "test"/imm32
  9 == data  # need to manually create this for now
 10 +transform: -- move literal strings to data segment
 11 +transform: adding global variable '__subx_global_1' containing "test"
 12 +transform: instruction after transform: 'b8 __subx_global_1'
 13 
 14 //: We don't rely on any transforms running in previous layers, but this layer
 15 //: knows about labels and global variables and will emit them for previous
 16 //: layers to transform.
  17 :(after "Begin Transforms")
  18 // Begin Level-3 Transforms
     // Register the pass that moves string literals out of instructions
     // (transform_literal_strings, defined in this file) by appending it to
     // Transform.
     // NOTE(review): the Begin/End comments around this statement look like
     // anchors for other layers to hook into -- confirm before rewording them.
  19 Transform.push_back(transform_literal_strings);
  20 // End Level-3 Transforms
 21 
  22 :(before "End Globals")
     // Number used in the name of the next auto-generated global
     // ("__subx_global_<n>"). transform_literal_strings() reads it and then
     // increments it once for every string literal it moves out of an
     // instruction, so numbering starts at 1.
  23 int Next_auto_global = 1;
 24 :(code)
 25 void transform_literal_strings(program& p) {
 26   trace(99, "transform") << "-- move literal strings to data segment" << end();
 27   if (p.segments.empty()) return;
 28   segment& code = p.segments.at(0);
 29   segment data;
 30   for (int i = 0;  i < SIZE(code.lines);  ++i) {
 31     line& inst = code.lines.at(i);
 32     for (int j = 0;  j < SIZE(inst.words);  ++j) {
 33       word& curr = inst.words.at(j);
 34       if (curr.data.at(0) != '"') continue;
 35       ostringstream global_name;
 36       global_name << "__subx_global_" << Next_auto_global;
 37       ++Next_auto_global;
 38       add_global_to_data_segment(global_name.str(), curr, data);
 39       curr.data = global_name.str();
 40     }
 41     trace(99, "transform") << "instruction after transform: '" << data_to_string(inst) << "'" << end();
 42   }
 43   if (data.lines.empty()) return;
 44   if (SIZE(p.segments) < 2) {
 45     p.segments.resize(2);
 46     p.segments.at(1).lines.swap(data.lines);
 47   }
 48   vector<line>& existing_data = p.segments.at(1).lines;
 49   existing_data.insert(existing_data.end(), data.lines.begin(), data.lines.end());
 50 }
 51 
 52 void add_global_to_data_segment(const string& name, const word& value, segment& data) {
 53   trace(99, "transform") << "adding global variable '" << name << "' containing " << value.data << end();
 54   // emit label
 55   data.lines.push_back(label(name));
 56   // emit size for size-prefixed array
 57   data.lines.push_back(line());
 58   emit_hex_bytes(data.lines.back(), SIZE(value.data)-/*skip quotes*/2, 4/*bytes*/);
 59   // emit data byte by byte
 60   data.lines.push_back(line());
 61   line& curr = data.lines.back();
 62   for (int i = /*skip start quote*/1;  i < SIZE(value.data)-/*skip end quote*/1;  ++i) {
 63     char c = value.data.at(i);
 64     curr.words.push_back(word());
 65     curr.words.back().data = hex_byte_to_string(c);
 66     curr.words.back().metadata.push_back(string(1, c));
 67   }
 68 }
 69 
 70 line label(string s) {
 71   line result;
 72   result.words.push_back(word());
 73   result.words.back().data = (s+":");
 74   return result;
 75 }
 76 
 77 //: Within strings, whitespace is significant. So we need to redo our instruction
 78 //: parsing.
 79 
 80 :(scenarios parse_instruction_character_by_character)
 81 :(scenario instruction_with_string_literal)
 82 a "abc  def" z  # two spaces inside string
 83 +parse2: word: a
 84 +parse2: word: "abc  def"
 85 +parse2: word: z
 86 # no other words
 87 $parse2: 3
 88 
  89 :(before "End Line Parsing Special-cases(line_data -> l)")
     // Whitespace is significant inside strings, so any line containing a double
     // quote is handed to the character-by-character parser, which appends the
     // parsed line to l if it contains any words.
     // NOTE(review): the `continue` presumably moves the enclosing per-line loop
     // on to the next line; that loop is not visible from this layer.
  90 if (line_data.find('"') != string::npos) {  // can cause false-positives, but we can handle them
  91   parse_instruction_character_by_character(line_data, l);
  92   continue;
  93 }
 94 
 95 :(code)
 96 void parse_instruction_character_by_character(const string& line_data, vector<line>& out) {
 97   if (line_data.find('\n') != string::npos  && line_data.find('\n') != line_data.size()-1) {
 98     raise << "parse_instruction_character_by_character: should receive only a single line\n" << end();
 99     return;
100   }
101   // parse literals
102   istringstream in(line_data);
103   in >> std::noskipws;
104   line result;
105   // add tokens (words or strings) one by one
106   while (has_data(in)) {
107     skip_whitespace(in);
108     if (!has_data(in)) break;
109     char c = in.get();
110     if (c == '#') break;  // comment; drop rest of line
111     if (c == ':') break;  // line metadata; skip for now
112     if (c == '.') {
113       if (!has_data(in)) break;  // comment token at end of line
114       if (isspace(in.peek()))
115         continue;  // '.' followed by space is comment token; skip
116     }
117     result.words.push_back(word());
118     if (c == '"') {
119       // slurp word data
120       ostringstream d;
121       d << c;
122       while (has_data(in)) {
123         in >> c;
124         d << c;
125         if (c == '"') break;
126       }
127       result.words.back().data = d.str();
128       // slurp metadata
129       ostringstream m;
130       while (!isspace(in.peek()) && has_data(in)) {
131         in >> c;
132         if (c == '/') {
133           if (!m.str().empty()) result.words.back().metadata.push_back(m.str());
134           m.str("");
135         }
136         else {
137           m << c;
138         }
139       }
140       if (!m.str().empty()) result.words.back().metadata.push_back(m.str());
141     }
142     else {
143       // slurp all characters until whitespace
144       ostringstream w;
145       w << c;
146       while (!isspace(in.peek()) && has_data(in)) {  // peek can sometimes trigger eof(), so do it first
147         in >> c;
148         w << c;
149       }
150       parse_word(w.str(), result.words.back());
151     }
152     trace(99, "parse2") << "word: " << to_string(result.words.back()) << end();
153   }
154   if (!result.words.empty())
155     out.push_back(result);
156 }
157 
158 void skip_whitespace(istream& in) {
159   while (true) {
160     if (has_data(in) && isspace(in.peek())) in.get();
161     else break;
162   }
163 }
164 
165 void skip_comment(istream& in) {
166   if (has_data(in) && in.peek() == '#') {
167     in.get();
168     while (has_data(in) && in.peek() != '\n') in.get();
169   }
170 }
171 
172 // helper for tests
173 void parse_instruction_character_by_character(const string& line_data) {
174   vector<line> out;
175   parse_instruction_character_by_character(line_data, out);
176 }
177 
178 :(scenario parse2_comment_token_in_middle)
179 a . z
180 +parse2: word: a
181 +parse2: word: z
182 -parse2: word: .
183 # no other words
184 $parse2: 2
185 
186 :(scenario parse2_word_starting_with_dot)
187 a .b c
188 +parse2: word: a
189 +parse2: word: .b
190 +parse2: word: c
191 
192 :(scenario parse2_comment_token_at_start)
193 . a b
194 +parse2: word: a
195 +parse2: word: b
196 -parse2: word: .
197 
198 :(scenario parse2_comment_token_at_end)
199 a b .
200 +parse2: word: a
201 +parse2: word: b
202 -parse2: word: .
203 
204 :(scenario parse2_word_starting_with_dot_at_start)
205 .a b c
206 +parse2: word: .a
207 +parse2: word: b
208 +parse2: word: c
209 
210 :(scenario parse2_metadata)
211 .a b/c d
212 +parse2: word: .a
213 +parse2: word: b /c
214 +parse2: word: d
215 
216 :(scenario parse2_string_with_metadata)
217 a "bc  def"/disp32 g
218 +parse2: word: a
219 +parse2: word: "bc  def" /disp32
220 +parse2: word: g
221 
222 :(scenario parse2_string_with_metadata_at_end)
223 a "bc  def"/disp32
224 +parse2: word: a
225 +parse2: word: "bc  def" /disp32
226 
227 :(code)
228 void test_parse2_string_with_metadata_at_end_of_line_without_newline() {
229   parse_instruction_character_by_character(
230       "68/push \"test\"/f"  // no newline, which is how calls from parse() will look
231   );
232   CHECK_TRACE_CONTENTS(
233       "parse2: word: 68 /push^D"
234       "parse2: word: \"test\" /f^D"
235   );
236 }
237 
238 //: Make sure slashes inside strings don't trigger adding stuff from inside the
239 //: string to metadata.
240 :(scenario parse2_string_containing_slashes)
241 a "bc/def"/disp32
242 +parse2: word: "bc/def" /disp32