https://github.com/akkartik/mu/blob/main/linux/bootstrap/036labels.cc
  1 //: Labels are defined by ending names with a ':'. This layer will compute
  2 //: displacements for labels, and compute the offset for instructions using them.
  3 //:
  4 //: We won't check this, but our convention will be that jump targets will
  5 //: start with a '$', while functions will not. Function names will never be
  6 //: jumped to, and jump targets will never be called.
  7 
  8 //: We're introducing non-number names for the first time, so it's worth
  9 //: laying down some ground rules all transforms will follow, so things don't
 10 //: get too confusing:
 11 //:   - if it starts with a digit, it's treated as a number. If it can't be
 12 //:     parsed as hex it will raise an error.
 13 //:   - if it starts with '-' it's treated as a number.
 14 //:   - if it starts with '0x' it's treated as a number.
 15 //:   - if it's two characters long, it can't be a name. Either it's a hex
 16 //:     byte, or it raises an error.
 17 //: That's it. Names can start with any non-digit that isn't a dash. They can
 18 //: be a single character long. 'a' is not a hex number, it's a variable.
 19 //: Later layers may add more conventions partitioning the space of names. But
 20 //: the above rules will remain inviolate.
 21 
 22 //: One special label is 'Entry', the address to start running the program at.
 23 //: It can be non-unique; the last declaration overrides earlier ones.
 24 //: It must exist in a program. Otherwise we don't know where to start running
 25 //: programs.
 26 
 27 void test_Entry_label() {
 28   run(
 29       "== code 0x1\n"
 30       "05 0x0d0c0b0a/imm32\n"
 31       "Entry:\n"
 32       "05 0x0d0c0b0a/imm32\n"
 33   );
 34   CHECK_TRACE_CONTENTS(
 35       "run: 0x00000006 opcode: 05\n"
 36   );
 37   CHECK_TRACE_DOESNT_CONTAIN("run: 0x00000001 opcode: 05");
 38 }
 39 
 40 :(before "End looks_like_hex_int(s) Detectors")
 41 if (SIZE(s) == 2) return true;
 42 
 43 :(code)
 44 void test_pack_immediate_ignores_single_byte_nondigit_argument() {
 45   Hide_errors = true;
 46   transform(
 47       "== code 0x1\n"
 48       "b9/copy  a/imm32\n"
 49   );
 50   CHECK_TRACE_CONTENTS(
 51       "transform: packing instruction 'b9/copy a/imm32'\n"
 52       // no change (we're just not printing metadata to the trace)
 53       "transform: instruction after packing: 'b9 a'\n"
 54   );
 55 }
 56 
 57 void test_pack_immediate_ignores_3_hex_digit_argument() {
 58   Hide_errors = true;
 59   transform(
 60       "== code 0x1\n"
 61       "b9/copy  aaa/imm32\n"
 62   );
 63   CHECK_TRACE_CONTENTS(
 64       "transform: packing instruction 'b9/copy aaa/imm32'\n"
 65       // no change (we're just not printing metadata to the trace)
 66       "transform: instruction after packing: 'b9 aaa'\n"
 67   );
 68 }
 69 
 70 void test_pack_immediate_ignores_non_hex_argument() {
 71   Hide_errors = true;
 72   transform(
 73       "== code 0x1\n"
 74       "b9/copy xxx/imm32\n"
 75   );
 76   CHECK_TRACE_CONTENTS(
 77       "transform: packing instruction 'b9/copy xxx/imm32'\n"
 78       // no change (we're just not printing metadata to the trace)
 79       "transform: instruction after packing: 'b9 xxx'\n"
 80   );
 81 }
 82 
 83 //: a helper we'll find handy later
 84 void check_valid_name(const string& s) {
 85   if (s.empty()) {
 86     raise << "empty name!\n" << end();
 87     return;
 88   }
 89   if (s.at(0) == '-')
 90     raise << "'" << s << "' starts with '-', which can be confused with a negative number; use a different name\n" << end();
 91   if (s.substr(0, 2) == "0x") {
 92     raise << "'" << s << "' looks like a hex number; use a different name\n" << end();
 93     return;
 94   }
 95   if (isdigit(s.at(0)))
 96     raise << "'" << s << "' starts with a digit, and so can be confused with a number; use a different name.\n" << end();
 97   if (SIZE(s) == 2)
 98     raise << "'" << s << "' is two characters long, which can look like raw hex bytes at a glance; use a different name\n" << end();
 99 }
100 
101 //: Now that that's done, let's start using names as labels.
102 
103 void test_map_label() {
104   transform(
105       "== code 0x1\n"
106       "loop:\n"
107       "  05  0x0d0c0b0a/imm32\n"
108   );
109   CHECK_TRACE_CONTENTS(
110       "transform: label 'loop' is at address 1\n"
111   );
112 }
113 
114 :(before "End Transforms")
115 Transform.push_back(rewrite_labels);
116 :(code)
117 void rewrite_labels(program& p) {
118   trace(3, "transform") << "-- rewrite labels" << end();
119   if (p.segments.empty()) return;
120   segment& code = *find(p, "code");
121   map<string, int32_t> byte_index;  // values are unsigned, but we're going to do subtractions on them so they need to fit in 31 bits
122   compute_byte_indices_for_labels(code, byte_index);
123   if (trace_contains_errors()) return;
124   drop_labels(code);
125   if (trace_contains_errors()) return;
126   replace_labels_with_displacements(code, byte_index);
127   if (contains_key(byte_index, "Entry"))
128     p.entry = code.start + get(byte_index, "Entry");
129 }
130 
131 void compute_byte_indices_for_labels(const segment& code, map<string, int32_t>& byte_index) {
132   int current_byte = 0;
133   for (int i = 0;  i < SIZE(code.lines);  ++i) {
134     const line& inst = code.lines.at(i);
135     if (Source_lines_file.is_open() && !inst.original.empty() && /*not a label*/ *inst.words.at(0).data.rbegin() != ':')
136       Source_lines_file << "0x" << HEXWORD << (code.start + current_byte) << ' ' << inst.original << '\n';
137     for (int j = 0;  j < SIZE(inst.words);  ++j) {
138       const word& curr = inst.words.at(j);
139       // hack: if we have any argument metadata left after previous transforms,
140       // deduce its size
141       // Maybe we should just move this transform to before instruction
142       // packing, and deduce the size of *all* arguments. But then we'll also
143       // have to deal with bitfields.
144       if (has_argument_metadata(curr, "disp32") || has_argument_metadata(curr, "imm32")) {
145         if (*curr.data.rbegin() == ':')
146           raise << "'" << to_string(inst) << "': don't use ':' when jumping to labels\n" << end();
147         current_byte += 4;
148       }
149       else if (has_argument_metadata(curr, "disp16")) {
150         if (*curr.data.rbegin() == ':')
151           raise << "'" << to_string(inst) << "': don't use ':' when jumping to labels\n" << end();
152         current_byte += 2;
153       }
154       // automatically handle /disp8 and /imm8 here
155       else if (*curr.data.rbegin() != ':') {
156         ++current_byte;
157       }
158       else {
159         string label = drop_last(curr.data);
160         // ensure labels look sufficiently different from raw hex
161         check_valid_name(label);
162         if (trace_contains_errors()) return;
163         if (contains_any_argument_metadata(curr))
164           raise << "'" << to_string(inst) << "': label definition (':') not allowed in argument\n" << end();
165         if (j > 0)
166           raise << "'" << to_string(inst) << "': labels can only be the first word in a line.\n" << end();
167         if (Labels_file.is_open())
168           Labels_file << "0x" << HEXWORD << (code.start + current_byte) << ' ' << label << '\n';
169         if (contains_key(byte_index, label) && label != "Entry") {
170           raise << "duplicate label '" << label << "'\n" << end();
171           return;
172         }
173         put(byte_index, label, current_byte);
174         trace(99, "transform") << "label '" << label << "' is at address " << (current_byte+code.start) << end();
175         // no modifying current_byte; label definitions won't be in the final binary
176       }
177     }
178   }
179 }
180 
181 :(before "End Globals")
182 bool Dump_debug_info = false;  // currently used only by 'bootstrap translate'
183 ofstream Labels_file;
184 ofstream Source_lines_file;
185 :(before "End Commandline Options")
186 else if (is_equal(*arg, "--debug")) {
187   Dump_debug_info = true;
188   // End --debug Settings
189 }
190 //: wait to open "labels" for writing until we're sure we aren't trying to read it
191 :(after "Begin bootstrap translate")
192 if (Dump_debug_info) {
193   cerr << "saving address->label information to 'labels'\n";
194   Labels_file.open("labels");
195   cerr << "saving address->source information to 'source_lines'\n";
196   Source_lines_file.open("source_lines");
197 }
198 :(before "End bootstrap translate")
199 if (Dump_debug_info) {
200   Labels_file.close();
201   Source_lines_file.close();
202 }
203 
204 :(code)
205 void drop_labels(segment& code) {
206   for (int i = 0;  i < SIZE(code.lines);  ++i) {
207     line& inst = code.lines.at(i);
208     vector<word>::iterator new_end = remove_if(inst.words.begin(), inst.words.end(), is_label);
209     inst.words.erase(new_end, inst.words.end());
210   }
211 }
212 
213 bool is_label(const word& w) {
214   return *w.data.rbegin() == ':';
215 }
216 
217 void replace_labels_with_displacements(segment& code, const map<string, int32_t>& byte_index) {
218   int32_t byte_index_next_instruction_starts_at = 0;
219   for (int i = 0;  i < SIZE(code.lines);  ++i) {
220     line& inst = code.lines.at(i);
221     byte_index_next_instruction_starts_at += num_bytes(inst);
222     line new_inst;
223     for (int j = 0;  j < SIZE(inst.words);  ++j) {
224       const word& curr = inst.words.at(j);
225       if (contains_key(byte_index, curr.data)) {
226         int32_t displacement = static_cast<int32_t>(get(byte_index, curr.data)) - byte_index_next_instruction_starts_at;
227         int32_t absolute_address = code.start + get(byte_index, curr.data);
228         if (has_argument_metadata(curr, "disp8")) {
229           if (displacement > 0x7f || displacement < -0x7f)
230             raise << "'" << to_string(inst) << "': label too far away for displacement " << std::hex << displacement << " to fit in 8 signed bits\n" << end();
231           else
232             emit_hex_bytes(new_inst, displacement, 1);
233         }
234         else if (has_argument_metadata(curr, "disp16")) {
235           if (displacement > 0x7fff || displacement < -0x7fff)
236             raise << "'" << to_string(inst) << "': label too far away for displacement " << std::hex << displacement << " to fit in 16 signed bits\n" << end();
237           else
238             emit_hex_bytes(new_inst, displacement, 2);
239         }
240         else if (has_argument_metadata(curr, "disp32")) {
241           if (is_far_jump_or_call(new_inst))
242             emit_hex_bytes(new_inst, displacement, 4);
243           else
244             emit_hex_bytes(new_inst, absolute_address, 4);
245         } else if (has_argument_metadata(curr, "imm32")) {
246           emit_hex_bytes(new_inst, absolute_address, 4);
247         }
248       }
249       else {
250         new_inst.words.push_back(curr);
251       }
252     }
253     inst.words.swap(new_inst.words);
254     trace(99, "transform") << "instruction after transform: '" << data_to_string(inst) << "'" << end();
255   }
256 }
257 
258 bool is_far_jump_or_call(const line& inst) {
259   string first_opcode = inst.words.at(0).data;
260   if (first_opcode == "e8" || first_opcode == "e9") return true;
261   if (SIZE(inst.words) < 2) return false;
262   if (first_opcode != "0f") return false;
263   string second_opcode = inst.words.at(1).data;
264   return starts_with(second_opcode, "8");
265 }
266 
267 string data_to_string(const line& inst) {
268   ostringstream out;
269   for (int i = 0;  i < SIZE(inst.words);  ++i) {
270     if (i > 0) out << ' ';
271     out << inst.words.at(i).data;
272   }
273   return out.str();
274 }
275 
276 string drop_last(const string& s) {
277   return string(s.begin(), --s.end());
278 }
279 
280 //: Label definitions must be the first word on a line. No jumping inside
281 //: instructions.
282 //: They should also be the only word on a line.
283 //: However, you can absolutely have multiple labels map to the same address,
284 //: as long as they're on separate lines.
285 
286 void test_multiple_labels_at() {
287   transform(
288       "== code 0x1\n"
289       // address 1
290       "loop:\n"
291       " $loop2:\n"
292       // address 1 (labels take up no space)
293       "    05  0x0d0c0b0a/imm32\n"
294       // address 6
295       "    eb  $loop2/disp8\n"
296       // address 8
297       "    eb  $loop3/disp8\n"
298       // address 0xa
299       " $loop3:\n"
300   );
301   CHECK_TRACE_CONTENTS(
302       "transform: label 'loop' is at address 1\n"
303       "transform: label '$loop2' is at address 1\n"
304       "transform: label '$loop3' is at address a\n"
305       // first jump is to -7
306       "transform: instruction after transform: 'eb f9'\n"
307       // second jump is to 0 (fall through)
308       "transform: instruction after transform: 'eb 00'\n"
309   );
310 }
311 
312 void test_loading_label_as_imm32() {
313   transform(
314       "== code 0x1\n"
315       "label:\n"
316       "  be/copy-to-ESI  label/imm32\n"
317   );
318   CHECK_TRACE_CONTENTS(
319       "transform: label 'label' is at address 1\n"
320       "transform: instruction after transform: 'be 01 00 00 00'\n"
321   );
322 }
323 
324 void test_duplicate_label() {
325   Hide_errors = true;
326   transform(
327       "== code 0x1\n"
328       "loop:\n"
329       "loop:\n"
330       "    05  0x0d0c0b0a/imm32\n"
331   );
332   CHECK_TRACE_CONTENTS(
333       "error: duplicate label 'loop'\n"
334   );
335 }
336 
337 void test_label_too_short() {
338   Hide_errors = true;
339   transform(
340       "== code 0x1\n"
341       "xz:\n"
342       "  05  0x0d0c0b0a/imm32\n"
343   );
344   CHECK_TRACE_CONTENTS(
345       "error: 'xz' is two characters long, which can look like raw hex bytes at a glance; use a different name\n"
346   );
347 }
348 
349 void test_label_hex() {
350   Hide_errors = true;
351   transform(
352       "== code 0x1\n"
353       "0xab:\n"
354       "  05  0x0d0c0b0a/imm32\n"
355   );
356   CHECK_TRACE_CONTENTS(
357       "error: '0xab' looks like a hex number; use a different name\n"
358   );
359 }
360 
361 void test_label_negative_hex() {
362   Hide_errors = true;
363   transform(
364       "== code 0x1\n"
365       "-a:\n"
366       "    05  0x0d0c0b0a/imm32\n"
367   );
368   CHECK_TRACE_CONTENTS(
369       "error: '-a' starts with '-', which can be confused with a negative number; use a different name\n"
370   );
371 }
372 
373 //: As said up top, the 'Entry' label is special.
374 //: It can be non-unique; the last declaration overrides earlier ones.
375 //: It must exist in a program. Otherwise we don't know where to start running
376 //: programs.
377 
378 void test_duplicate_Entry_label() {
379   transform(
380       "== code 0x1\n"
381       "Entry:\n"
382       "Entry:\n"
383       "    05  0x0d0c0b0a/imm32\n"
384   );
385   CHECK_TRACE_DOESNT_CONTAIN_ERRORS();
386 }
387 
388 // This test could do with some refactoring.
389 // We're duplicating the flow inside `bootstrap translate`, but without
390 // reading/writing files.
391 // We can't just use run(string) because most of our tests allow programs
392 // without 'Entry' labels, as a convenience.
393 void test_programs_without_Entry_label() {
394   Hide_errors = true;
395   program p;
396   istringstream in(
397       "== code 0x1\n"
398       "05 0x0d0c0b0a/imm32\n"
399       "05 0x0d0c0b0a/imm32\n"
400   );
401   parse(in, p);
402   transform(p);
403   ostringstream dummy;
404   save_elf(p, dummy);
405   CHECK_TRACE_CONTENTS(
406       "error: no 'Entry' label found\n"
407   );
408 }
409 
410 //: now that we have labels, we need to adjust segment size computation to
411 //: ignore them.
412 
413 void test_segment_size_ignores_labels() {
414   transform(
415       "== code 0x09000074\n"
416       "  05/add  0x0d0c0b0a/imm32\n"  // 5 bytes
417       "foo:\n"                        // 0 bytes
418       "== data 0x0a000000\n"
419       "bar:\n"
420       "  00\n"
421   );
422   CHECK_TRACE_CONTENTS(
423       "transform: segment 1 begins at address 0x0a000079\n"
424   );
425 }
426 
427 :(before "End size_of(word w) Special-cases")
428 else if (is_label(w))
429   return 0;