https://github.com/akkartik/mu/blob/master/subx/035labels.cc
  1 //: Labels are defined by ending names with a ':'. This layer will compute
  2 //: displacements for labels, and compute the offset for instructions using them.
  3 //:
  4 //: We won't check this, but our convention will be that jump targets will
  5 //: start with a '$', while functions will not. Function names will never be
  6 //: jumped to, and jump targets will never be called.
  7 
  8 //: We're introducing non-number names for the first time, so it's worth
  9 //: laying down some ground rules all transforms will follow, so things don't
 10 //: get too confusing:
 11 //:   - if it starts with a digit, it's treated as a number. If it can't be
 12 //:     parsed as hex it will raise an error.
 13 //:   - if it starts with '-' it's treated as a number.
 14 //:   - if it starts with '0x' it's treated as a number.
 15 //:   - if it's two characters long, it can't be a name. Either it's a hex
 16 //:     byte, or it raises an error.
 17 //: That's it. Names can start with any non-digit that isn't a dash. They can
 18 //: be a single character long. 'a' is not a hex number, it's a variable.
 19 //: Later layers may add more conventions partitioning the space of names. But
 20 //: the above rules will remain inviolate.
 21 
 22 //: One special label: the address to start running the program at.
 23 
 24 void test_entry_label() {
 25   run(
 26       "== 0x1\n"  // code segment
 27       "05 0x0d0c0b0a/imm32\n"
 28       "Entry:\n"
 29       "05 0x0d0c0b0a/imm32\n"
 30   );
 31   CHECK_TRACE_CONTENTS(
 32       "run: 0x00000006 opcode: 05\n"
 33   );
 34   CHECK_TRACE_DOESNT_CONTAIN("run: 0x00000001 opcode: 05");
 35 }
 36 
  //: Entry_address is the absolute address of the 'Entry' label. It stays 0
  //: when the program defines no such label; rewrite_labels() assigns it.
  37 :(before "End Globals")
  38 uint32_t Entry_address = 0;
  //: Cleared again at the "End Reset" hook.
  39 :(before "End Reset")
  40 Entry_address = 0;
  //: A non-zero Entry_address overrides the initial EIP...
  41 :(before "End Initialize EIP")
  42 if (Entry_address) EIP = Entry_address;
  //: ...and e_entry (presumably the entry point written to the ELF header;
  //: confirm against the layer defining "Override e_entry").
  43 :(after "Override e_entry")
  44 if (Entry_address) e_entry = Entry_address;
 45 
  //: Any two-character word counts as a hex int, per the ground rule above
  //: that a two-character word can never be a name.
  46 :(before "End looks_like_hex_int(s) Detectors")
  47 if (SIZE(s) == 2) return true;
 48 
 49 :(code)
 50 void test_pack_immediate_ignores_single_byte_nondigit_operand() {
 51   Hide_errors = true;
 52   transform(
 53       "== 0x1\n"  // code segment
 54       "b9/copy  a/imm32\n"
 55   );
 56   CHECK_TRACE_CONTENTS(
 57       "transform: packing instruction 'b9/copy a/imm32'\n"
 58       // no change (we're just not printing metadata to the trace)
 59       "transform: instruction after packing: 'b9 a'\n"
 60   );
 61 }
 62 
 63 void test_pack_immediate_ignores_3_hex_digit_operand() {
 64   Hide_errors = true;
 65   transform(
 66       "== 0x1\n"  // code segment
 67       "b9/copy  aaa/imm32\n"
 68   );
 69   CHECK_TRACE_CONTENTS(
 70       "transform: packing instruction 'b9/copy aaa/imm32'\n"
 71       // no change (we're just not printing metadata to the trace)
 72       "transform: instruction after packing: 'b9 aaa'\n"
 73   );
 74 }
 75 
 76 void test_pack_immediate_ignores_non_hex_operand() {
 77   Hide_errors = true;
 78   transform(
 79       "== 0x1\n"  // code segment
 80       "b9/copy xxx/imm32\n"
 81   );
 82   CHECK_TRACE_CONTENTS(
 83       "transform: packing instruction 'b9/copy xxx/imm32'\n"
 84       // no change (we're just not printing metadata to the trace)
 85       "transform: instruction after packing: 'b9 xxx'\n"
 86   );
 87 }
 88 
 89 //: a helper we'll find handy later
 90 void check_valid_name(const string& s) {
 91   if (s.empty()) {
 92     raise << "empty name!\n" << end();
 93     return;
 94   }
 95   if (s.at(0) == '-')
 96     raise << "'" << s << "' starts with '-', which can be confused with a negative number; use a different name\n" << end();
 97   if (s.substr(0, 2) == "0x") {
 98     raise << "'" << s << "' looks like a hex number; use a different name\n" << end();
 99     return;
100   }
101   if (isdigit(s.at(0)))
102     raise << "'" << s << "' starts with a digit, and so can be confused with a negative number; use a different name.\n" << end();
103   if (SIZE(s) == 2)
104     raise << "'" << s << "' is two characters long which can look like raw hex bytes at a glance; use a different name\n" << end();
105 }
106 
107 //: Now that that's done, let's start using names as labels.
108 
109 void test_map_label() {
110   transform(
111       "== 0x1\n"  // code segment
112       "loop:\n"
113       "  05  0x0d0c0b0a/imm32\n"
114   );
115   CHECK_TRACE_CONTENTS(
116       "transform: label 'loop' is at address 1\n"
117   );
118 }
119 
 //: Register rewrite_labels() in the Transform list, at the
 //: "End Level-2 Transforms" hook.
 120 :(before "End Level-2 Transforms")
 121 Transform.push_back(rewrite_labels);
122 :(code)
123 void rewrite_labels(program& p) {
124   trace(3, "transform") << "-- rewrite labels" << end();
125   if (p.segments.empty()) return;
126   segment& code = p.segments.at(0);
127   map<string, int32_t> byte_index;  // values are unsigned, but we're going to do subtractions on them so they need to fit in 31 bits
128   compute_byte_indices_for_labels(code, byte_index);
129   if (trace_contains_errors()) return;
130   drop_labels(code);
131   if (trace_contains_errors()) return;
132   replace_labels_with_displacements(code, byte_index);
133   if (contains_key(byte_index, "Entry"))
134     Entry_address = code.start + get(byte_index, "Entry");
135 }
136 
137 void compute_byte_indices_for_labels(const segment& code, map<string, int32_t>& byte_index) {
138   int current_byte = 0;
139   for (int i = 0;  i < SIZE(code.lines);  ++i) {
140     const line& inst = code.lines.at(i);
141     for (int j = 0;  j < SIZE(inst.words);  ++j) {
142       const word& curr = inst.words.at(j);
143       // hack: if we have any operand metadata left after previous transforms,
144       // deduce its size
145       // Maybe we should just move this transform to before instruction
146       // packing, and deduce the size of *all* operands. But then we'll also
147       // have to deal with bitfields.
148       if (has_operand_metadata(curr, "disp32") || has_operand_metadata(curr, "imm32")) {
149         if (*curr.data.rbegin() == ':')
150           raise << "'" << to_string(inst) << "': don't use ':' when jumping to labels\n" << end();
151         current_byte += 4;
152       }
153       else if (has_operand_metadata(curr, "disp16")) {
154         if (*curr.data.rbegin() == ':')
155           raise << "'" << to_string(inst) << "': don't use ':' when jumping to labels\n" << end();
156         current_byte += 2;
157       }
158       // automatically handle /disp8 and /imm8 here
159       else if (*curr.data.rbegin() != ':') {
160         ++current_byte;
161       }
162       else {
163         string label = drop_last(curr.data);
164         // ensure labels look sufficiently different from raw hex
165         check_valid_name(label);
166         if (trace_contains_errors()) return;
167         if (contains_any_operand_metadata(curr))
168           raise << "'" << to_string(inst) << "': label definition (':') not allowed in operand\n" << end();
169         if (j > 0)
170           raise << "'" << to_string(inst) << "': labels can only be the first word in a line.\n" << end();
171         if (Map_file.is_open())
172           Map_file << "0x" << HEXWORD << (code.start + current_byte) << ' ' << label << '\n';
173         if (contains_key(byte_index, label) && label != "Entry") {
174           raise << "duplicate label '" << label << "'\n" << end();
175           return;
176         }
177         put(byte_index, label, current_byte);
178         trace(99, "transform") << "label '" << label << "' is at address " << (current_byte+code.start) << end();
179         // no modifying current_byte; label definitions won't be in the final binary
180       }
181     }
182   }
183 }
184 
 //: Optional label map. While Map_file is open,
 //: compute_byte_indices_for_labels() appends one "0x<address> <label>" line
 //: to it per label definition.
 185 :(before "End Globals")
 186 bool Dump_map = false;  // currently used only by 'subx translate'
 187 ofstream Map_file;
 //: The '--map' commandline option turns the dump on.
 188 :(before "End Commandline Options")
 189 else if (is_equal(*arg, "--map")) {
 190   Dump_map = true;
 191   // End --map Settings
 192 }
 193 //: wait to open "map" for writing until we're sure we aren't trying to read it
 194 :(after "Begin subx translate")
 195 if (Dump_map)
 196   Map_file.open("map");
 //: Close the file once 'subx translate' is done.
 197 :(before "End subx translate")
 198 if (Dump_map)
 199   Map_file.close();
200 
201 :(code)
202 void drop_labels(segment& code) {
203   for (int i = 0;  i < SIZE(code.lines);  ++i) {
204     line& inst = code.lines.at(i);
205     vector<word>::iterator new_end = remove_if(inst.words.begin(), inst.words.end(), is_label);
206     inst.words.erase(new_end, inst.words.end());
207   }
208 }
209 
210 bool is_label(const word& w) {
211   return *w.data.rbegin() == ':';
212 }
213 
214 void replace_labels_with_displacements(segment& code, const map<string, int32_t>& byte_index) {
215   int32_t byte_index_next_instruction_starts_at = 0;
216   for (int i = 0;  i < SIZE(code.lines);  ++i) {
217     line& inst = code.lines.at(i);
218     byte_index_next_instruction_starts_at += num_bytes(inst);
219     line new_inst;
220     for (int j = 0;  j < SIZE(inst.words);  ++j) {
221       const word& curr = inst.words.at(j);
222       if (contains_key(byte_index, curr.data)) {
223         int32_t displacement = static_cast<int32_t>(get(byte_index, curr.data)) - byte_index_next_instruction_starts_at;
224         if (has_operand_metadata(curr, "disp8")) {
225           if (displacement > 0x7f || displacement < -0x7f)
226             raise << "'" << to_string(inst) << "': label too far away for displacement " << std::hex << displacement << " to fit in 8 signed bits\n" << end();
227           else
228             emit_hex_bytes(new_inst, displacement, 1);
229         }
230         else if (has_operand_metadata(curr, "disp16")) {
231           if (displacement > 0x7fff || displacement < -0x7fff)
232             raise << "'" << to_string(inst) << "': label too far away for displacement " << std::hex << displacement << " to fit in 16 signed bits\n" << end();
233           else
234             emit_hex_bytes(new_inst, displacement, 2);
235         }
236         else if (has_operand_metadata(curr, "disp32")) {
237           emit_hex_bytes(new_inst, displacement, 4);
238         }
239       }
240       else {
241         new_inst.words.push_back(curr);
242       }
243     }
244     inst.words.swap(new_inst.words);
245     trace(99, "transform") << "instruction after transform: '" << data_to_string(inst) << "'" << end();
246   }
247 }
248 
249 string data_to_string(const line& inst) {
250   ostringstream out;
251   for (int i = 0;  i < SIZE(inst.words);  ++i) {
252     if (i > 0) out << ' ';
253     out << inst.words.at(i).data;
254   }
255   return out.str();
256 }
257 
//: Return a copy of `s` without its final character (used to strip the ':'
//: off a label definition). An empty string is returned unchanged.
string drop_last(const string& s) {
  if (s.empty()) return s;
  return s.substr(0, s.size()-1);
}
261 
262 //: Label definitions must be the first word on a line. No jumping inside
263 //: instructions.
264 //: They should also be the only word on a line.
265 //: However, you can absolutely have multiple labels map to the same address,
266 //: as long as they're on separate lines.
267 
268 void test_multiple_labels_at() {
269   transform(
270       "== 0x1\n"  // code segment
271       // address 1
272       "loop:\n"
273       " $loop2:\n"
274       // address 1 (labels take up no space)
275       "    05  0x0d0c0b0a/imm32\n"
276       // address 6
277       "    eb  $loop2/disp8\n"
278       // address 8
279       "    eb  $loop3/disp8\n"
280       // address 0xa
281       " $loop3:\n"
282   );
283   CHECK_TRACE_CONTENTS(
284       "transform: label 'loop' is at address 1\n"
285       "transform: label '$loop2' is at address 1\n"
286       "transform: label '$loop3' is at address a\n"
287       // first jump is to -7
288       "transform: instruction after transform: 'eb f9'\n"
289       // second jump is to 0 (fall through)
290       "transform: instruction after transform: 'eb 00'\n"
291   );
292 }
293 
294 void test_duplicate_label() {
295   Hide_errors = true;
296   transform(
297       "== 0x1\n"
298       "loop:\n"
299       "loop:\n"
300       "    05  0x0d0c0b0a/imm32\n"
301   );
302   CHECK_TRACE_CONTENTS(
303       "error: duplicate label 'loop'\n"
304   );
305 }
306 
307 void test_label_too_short() {
308   Hide_errors = true;
309   transform(
310       "== 0x1\n"
311       "xz:\n"
312       "  05  0x0d0c0b0a/imm32\n"
313   );
314   CHECK_TRACE_CONTENTS(
315       "error: 'xz' is two characters long which can look like raw hex bytes at a glance; use a different name\n"
316   );
317 }
318 
319 void test_label_hex() {
320   Hide_errors = true;
321   transform(
322       "== 0x1\n"
323       "0xab:\n"
324       "  05  0x0d0c0b0a/imm32\n"
325   );
326   CHECK_TRACE_CONTENTS(
327       "error: '0xab' looks like a hex number; use a different name\n"
328   );
329 }
330 
331 void test_label_negative_hex() {
332   Hide_errors = true;
333   transform(
334       "== 0x1\n"
335       "-a:\n"
336       "    05  0x0d0c0b0a/imm32\n"
337   );
338   CHECK_TRACE_CONTENTS(
339       "error: '-a' starts with '-', which can be confused with a negative number; use a different name\n"
340   );
341 }
342 
343 //: now that we have labels, we need to adjust segment size computation to
344 //: ignore them.
345 
346 void test_segment_size_ignores_labels() {
347   transform(
348       "== code\n"  // 0x09000074
349       "  05/add  0x0d0c0b0a/imm32\n"  // 5 bytes
350       "foo:\n"                        // 0 bytes
351       "== data\n"  // 0x0a000079
352       "bar:\n"
353       "  00\n"
354   );
355   CHECK_TRACE_CONTENTS(
356       "transform: segment 1 begins at address 0x0a000079\n"
357   );
358 }
359 
 //: Extra case for size_of(word w): a label definition contributes 0 bytes.
 360 :(before "End size_of(word w) Special-cases")
 361 else if (is_label(w))
 362   return 0;