https://github.com/akkartik/mu/blob/master/subx/035labels.cc
  1 //: Labels are defined by ending names with a ':'. This layer will compute
  2 //: displacements for labels, and compute the offset for instructions using them.
  3 //:
  4 //: We won't check this, but our convention will be that jump targets will
  5 //: start with a '$', while functions will not. Function names will never be
  6 //: jumped to, and jump targets will never be called.
  7 
  8 //: We're introducing non-number names for the first time, so it's worth
  9 //: laying down some ground rules all transforms will follow, so things don't
 10 //: get too confusing:
 11 //:   - if it starts with a digit, it's treated as a number. If it can't be
 12 //:     parsed as hex it will raise an error.
 13 //:   - if it starts with '-' it's treated as a number.
 14 //:   - if it starts with '0x' it's treated as a number.
 15 //:   - if it's two characters long, it can't be a name. Either it's a hex
 16 //:     byte, or it raises an error.
 17 //: That's it. Names can start with any non-digit that isn't a dash. They can
 18 //: be a single character long. 'a' is not a hex number, it's a variable.
 19 //: Later layers may add more conventions partitioning the space of names. But
 20 //: the above rules will remain inviolate.
 21 
 22 //: One special label: the address to start running the program at.
 23 
 24 :(scenario entry_label)
 25 == 0x1
 26 05 0x0d0c0b0a/imm32
 27 Entry:
 28 05 0x0d0c0b0a/imm32
 29 +run: inst: 0x00000006
 30 -run: inst: 0x00000001
 31 
  32 :(before "End Globals")
//: Address of the special 'Entry' label, as computed by rewrite_labels().
//: 0 means no 'Entry' label has been recorded.
  33 uint32_t Entry_address = 0;
  34 :(before "End Reset")
  35 Entry_address = 0;
  36 :(before "End Initialize EIP")
//: if an 'Entry' label was recorded, use its address for EIP
  37 if (Entry_address) EIP = Entry_address;
  38 :(after "Override e_entry")
//: likewise use it for e_entry
  39 if (Entry_address) e_entry = Entry_address;
 40 
  41 :(before "End looks_like_hex_int(s) Detectors")
//: Per the ground rules above: anything exactly two characters long is
//: treated as a hex byte, never as a name.
  42 if (SIZE(s) == 2) return true;
 43 
 44 :(scenarios transform)
 45 :(scenario pack_immediate_ignores_single_byte_nondigit_operand)
 46 % Hide_errors = true;
 47 == 0x1
 48 b9/copy  a/imm32
 49 +transform: packing instruction 'b9/copy a/imm32'
 50 # no change (we're just not printing metadata to the trace)
 51 +transform: instruction after packing: 'b9 a'
 52 
 53 :(scenario pack_immediate_ignores_3_hex_digit_operand)
 54 % Hide_errors = true;
 55 == 0x1
 56 b9/copy  aaa/imm32
 57 +transform: packing instruction 'b9/copy aaa/imm32'
 58 # no change (we're just not printing metadata to the trace)
 59 +transform: instruction after packing: 'b9 aaa'
 60 
 61 :(scenario pack_immediate_ignores_non_hex_operand)
 62 % Hide_errors = true;
 63 == 0x1
 64 b9/copy xxx/imm32
 65 +transform: packing instruction 'b9/copy xxx/imm32'
 66 # no change (we're just not printing metadata to the trace)
 67 +transform: instruction after packing: 'b9 xxx'
 68 
 69 //: a helper we'll find handy later
 70 :(code)
 71 void check_valid_name(const string& s) {
 72   if (s.empty()) {
 73     raise << "empty name!\n" << end();
 74     return;
 75   }
 76   if (s.at(0) == '-')
 77     raise << "'" << s << "' starts with '-', which can be confused with a negative number; use a different name\n" << end();
 78   if (s.substr(0, 2) == "0x") {
 79     raise << "'" << s << "' looks like a hex number; use a different name\n" << end();
 80     return;
 81   }
 82   if (isdigit(s.at(0)))
 83     raise << "'" << s << "' starts with a digit, and so can be confused with a negative number; use a different name.\n" << end();
 84   if (SIZE(s) == 2)
 85     raise << "'" << s << "' is two characters long which can look like raw hex bytes at a glance; use a different name\n" << end();
 86 }
 87 
 88 //: Now that that's done, let's start using names as labels.
 89 
 90 :(scenario map_label)
 91 == 0x1
 92 loop:
 93   05  0x0d0c0b0a/imm32
 94 +transform: label 'loop' is at address 1
 95 
 96 :(before "End Level-2 Transforms")
 97 Transform.push_back(rewrite_labels);
 98 :(code)
 99 void rewrite_labels(program& p) {
100   trace(99, "transform") << "-- rewrite labels" << end();
101   if (p.segments.empty()) return;
102   segment& code = p.segments.at(0);
103   map<string, int32_t> byte_index;  // values are unsigned, but we're going to do subtractions on them so they need to fit in 31 bits
104   compute_byte_indices_for_labels(code, byte_index);
105   if (trace_contains_errors()) return;
106   drop_labels(code);
107   if (trace_contains_errors()) return;
108   replace_labels_with_displacements(code, byte_index);
109   if (contains_key(byte_index, "Entry"))
110     Entry_address = code.start + get(byte_index, "Entry");
111 }
112 
113 void compute_byte_indices_for_labels(const segment& code, map<string, int32_t>& byte_index) {
114   int current_byte = 0;
115   for (int i = 0;  i < SIZE(code.lines);  ++i) {
116     const line& inst = code.lines.at(i);
117     for (int j = 0;  j < SIZE(inst.words);  ++j) {
118       const word& curr = inst.words.at(j);
119       // hack: if we have any operand metadata left after previous transforms,
120       // deduce its size
121       // Maybe we should just move this transform to before instruction
122       // packing, and deduce the size of *all* operands. But then we'll also
123       // have to deal with bitfields.
124       if (has_operand_metadata(curr, "disp32") || has_operand_metadata(curr, "imm32")) {
125         if (*curr.data.rbegin() == ':')
126           raise << "'" << to_string(inst) << "': don't use ':' when jumping to labels\n" << end();
127         current_byte += 4;
128       }
129       else if (has_operand_metadata(curr, "disp16")) {
130         if (*curr.data.rbegin() == ':')
131           raise << "'" << to_string(inst) << "': don't use ':' when jumping to labels\n" << end();
132         current_byte += 2;
133       }
134       // automatically handle /disp8 and /imm8 here
135       else if (*curr.data.rbegin() != ':') {
136         ++current_byte;
137       }
138       else {
139         string label = drop_last(curr.data);
140         // ensure labels look sufficiently different from raw hex
141         check_valid_name(label);
142         if (trace_contains_errors()) return;
143         if (contains_any_operand_metadata(curr))
144           raise << "'" << to_string(inst) << "': label definition (':') not allowed in operand\n" << end();
145         if (j > 0)
146           raise << "'" << to_string(inst) << "': labels can only be the first word in a line.\n" << end();
147         if (Map_file.is_open())
148           Map_file << "0x" << HEXWORD << (code.start + current_byte) << ' ' << label << '\n';
149         if (contains_key(byte_index, label) && label != "Entry") {
150           raise << "duplicate label '" << label << "'\n" << end();
151           return;
152         }
153         put(byte_index, label, current_byte);
154         trace(99, "transform") << "label '" << label << "' is at address " << (current_byte+code.start) << end();
155         // no modifying current_byte; label definitions won't be in the final binary
156       }
157     }
158   }
159 }
160 
 161 :(before "End Globals")
//: Set by the '--map' commandline flag.
 162 bool Dump_map = false;  // currently used only by 'subx translate'
//: While open, compute_byte_indices_for_labels() writes one line per label
//: to it: the label's address followed by its name.
 163 ofstream Map_file;
 164 :(before "End Commandline Options")
 165 else if (is_equal(*arg, "--map")) {
 166   Dump_map = true;
 167   // End --map Settings
 168 }
 169 //: wait to open "map" for writing until we're sure we aren't trying to read it
 170 :(after "Begin subx translate")
 171 if (Dump_map)
 172   Map_file.open("map");
 173 :(before "End subx translate")
 174 if (Dump_map)
 175   Map_file.close();
176 
177 :(code)
178 void drop_labels(segment& code) {
179   for (int i = 0;  i < SIZE(code.lines);  ++i) {
180     line& inst = code.lines.at(i);
181     vector<word>::iterator new_end = remove_if(inst.words.begin(), inst.words.end(), is_label);
182     inst.words.erase(new_end, inst.words.end());
183   }
184 }
185 
186 bool is_label(const word& w) {
187   return *w.data.rbegin() == ':';
188 }
189 
190 void replace_labels_with_displacements(segment& code, const map<string, int32_t>& byte_index) {
191   int32_t byte_index_next_instruction_starts_at = 0;
192   for (int i = 0;  i < SIZE(code.lines);  ++i) {
193     line& inst = code.lines.at(i);
194     byte_index_next_instruction_starts_at += num_bytes(inst);
195     line new_inst;
196     for (int j = 0;  j < SIZE(inst.words);  ++j) {
197       const word& curr = inst.words.at(j);
198       if (contains_key(byte_index, curr.data)) {
199         int32_t displacement = static_cast<int32_t>(get(byte_index, curr.data)) - byte_index_next_instruction_starts_at;
200         if (has_operand_metadata(curr, "disp8")) {
201           if (displacement > 0xff || displacement < -0x7f)
202             raise << "'" << to_string(inst) << "': label too far away for displacement " << std::hex << displacement << " to fit in 8 bits\n" << end();
203           else
204             emit_hex_bytes(new_inst, displacement, 1);
205         }
206         else if (has_operand_metadata(curr, "disp16")) {
207           if (displacement > 0xffff || displacement < -0x7fff)
208             raise << "'" << to_string(inst) << "': label too far away for displacement " << std::hex << displacement << " to fit in 16 bits\n" << end();
209           else
210             emit_hex_bytes(new_inst, displacement, 2);
211         }
212         else if (has_operand_metadata(curr, "disp32")) {
213           emit_hex_bytes(new_inst, displacement, 4);
214         }
215       }
216       else {
217         new_inst.words.push_back(curr);
218       }
219     }
220     inst.words.swap(new_inst.words);
221     trace(99, "transform") << "instruction after transform: '" << data_to_string(inst) << "'" << end();
222   }
223 }
224 
225 string data_to_string(const line& inst) {
226   ostringstream out;
227   for (int i = 0;  i < SIZE(inst.words);  ++i) {
228     if (i > 0) out << ' ';
229     out << inst.words.at(i).data;
230   }
231   return out.str();
232 }
233 
//: Return a copy of 's' without its final character.
//: An empty string is returned unchanged.
string drop_last(const string& s) {
  // guard: there's nothing to drop, and decrementing end() of an empty
  // string is undefined behavior
  if (s.empty()) return s;
  return s.substr(0, s.size()-1);
}
237 
238 //: Label definitions must be the first word on a line. No jumping inside
239 //: instructions.
240 //: They should also be the only word on a line.
241 //: However, you can absolutely have multiple labels map to the same address,
242 //: as long as they're on separate lines.
243 
244 :(scenario multiple_labels_at)
245 == 0x1
246 # address 1
247 loop:
248  $loop2:
249 # address 1 (labels take up no space)
250     05  0x0d0c0b0a/imm32
251 # address 6
252     eb  $loop2/disp8
253 # address 8
254     eb  $loop3/disp8
255 # address 0xa
256  $loop3:
257 +transform: label 'loop' is at address 1
258 +transform: label '$loop2' is at address 1
259 +transform: label '$loop3' is at address a
260 # first jump is to -7
261 +transform: instruction after transform: 'eb f9'
262 # second jump is to 0 (fall through)
263 +transform: instruction after transform: 'eb 00'
264 
265 :(scenario duplicate_label)
266 % Hide_errors = true;
267 == 0x1
268 loop:
269 loop:
270     05  0x0d0c0b0a/imm32
271 +error: duplicate label 'loop'
272 
273 :(scenario label_too_short)
274 % Hide_errors = true;
275 == 0x1
276 xz:
277   05  0x0d0c0b0a/imm32
278 +error: 'xz' is two characters long which can look like raw hex bytes at a glance; use a different name
279 
280 :(scenario label_hex)
281 % Hide_errors = true;
282 == 0x1
283 0xab:
284   05  0x0d0c0b0a/imm32
285 +error: '0xab' looks like a hex number; use a different name
286 
287 :(scenario label_negative_hex)
288 % Hide_errors = true;
289 == 0x1
290  -a:  # indent to avoid looking like a trace_should_not_contain command for this scenario
291     05  0x0d0c0b0a/imm32
292 +error: '-a' starts with '-', which can be confused with a negative number; use a different name
293 
294 //: now that we have labels, we need to adjust segment size computation to
295 //: ignore them.
296 
297 :(scenario segment_size_ignores_labels)
298 == code  # 0x09000074
299   05/add  0x0d0c0b0a/imm32  # 5 bytes
300 foo:                      # 0 bytes
301 == data  # 0x0a000079
302 bar:
303   00
304 +transform: segment 1 begins at address 0x0a000079
305 
 306 :(before "End size_of(word w) Special-cases")
//: label definitions take up no space
 307 else if (is_label(w))
 308   return 0;