1 //: Labels are defined by ending names with a ':'. This layer will compute
  2 //: addresses for labels, and compute the offset for instructions using them.
  3 
  4 //: We're introducing non-number names for the first time, so it's worth
  5 //: laying down some ground rules all transforms will follow, so things don't
  6 //: get too confusing:
  7 //:   - if it starts with a digit, it's treated as a number. If it can't be
  8 //:     parsed as hex it will raise an error.
  9 //:   - if it starts with '-' it's treated as a number.
 10 //:   - if it starts with '0x' it's treated as a number.
 11 //:   - if it's two characters long, it can't be a name. Either it's a hex
 12 //:     byte, or it raises an error.
 13 //: That's it. Names can start with any non-digit that isn't a dash. They can
 14 //: be a single character long. 'a' is not a hex number, it's a variable.
 15 //: Later layers may add more conventions partitioning the space of names. But
 16 //: the above rules will remain inviolate.
 17 bool is_number(const string& s) {
 18   if (s.at(0) == '-') return true;
 19   if (isdigit(s.at(0))) return true;
 20   return SIZE(s) == 2;
 21 }
 22 :(before "End Unit Tests")
// Single-character names are allowed: 'a' starts with neither '-' nor a digit
// and isn't two characters long, so is_number() must not treat it as a number.
void test_is_number() {
  CHECK(!is_number("a"));
}
 26 :(code)
 27 void check_valid_name(const string& s) {
 28   if (s.empty()) {
 29     raise << "empty name!\n" << end();
 30     return;
 31   }
 32   if (s.at(0) == '-')
 33     raise << "'" << s << "' starts with '-', which can be confused with a negative number; use a different name\n" << end();
 34   if (s.substr(0, 2) == "0x") {
 35     raise << "'" << s << "' looks like a hex number; use a different name\n" << end();
 36     return;
 37   }
 38   if (isdigit(s.at(0)))
 39     raise << "'" << s << "' starts with a digit, and so can be confused with a negative number; use a different name.\n" << end();
 40   if (SIZE(s) == 2)
 41     raise << "'" << s << "' is two characters long which can look like raw hex bytes at a glance; use a different name\n" << end();
 42 }
 43 
 44 :(scenarios transform)
 45 :(scenario map_label)
 46 == 0x1
 47           # instruction                     effective address                                                   operand     displacement    immediate
 48           # op          subop               mod             rm32          base        index         scale       r32
 49           # 1-3 bytes   3 bits              2 bits          3 bits        3 bits      3 bits        2 bits      2 bits      0/1/2/4 bytes   0/1/2/4 bytes
 50 loop:
 51             05                                                                                                                              0x0d0c0b0a/imm32  # add to EAX
 52 +transform: label 'loop' is at address 1
 53 
 54 :(before "End Level-2 Transforms")
 55 Transform.push_back(rewrite_labels);
 56 :(code)
 57 void rewrite_labels(program& p) {
 58   trace(99, "transform") << "-- rewrite labels" << end();
 59   if (p.segments.empty()) return;
 60   segment& code = p.segments.at(0);
 61   // Rewrite Labels(segment code)
 62   map<string, int32_t> address;  // values are unsigned, but we're going to do subtractions on them so they need to fit in 31 bits
 63   compute_addresses_for_labels(code, address);
 64   if (trace_contains_errors()) return;
 65   drop_labels(code);
 66   if (trace_contains_errors()) return;
 67   replace_labels_with_addresses(code, address);
 68 }
 69 
 70 void compute_addresses_for_labels(const segment& code, map<string, int32_t>& address) {
 71   int current_byte = 0;
 72   for (int i = 0;  i < SIZE(code.lines);  ++i) {
 73     const line& inst = code.lines.at(i);
 74     for (int j = 0;  j < SIZE(inst.words);  ++j) {
 75       const word& curr = inst.words.at(j);
 76       // hack: if we have any operand metadata left after previous transforms,
 77       // deduce its size
 78       // Maybe we should just move this transform to before instruction
 79       // packing, and deduce the size of *all* operands. But then we'll also
 80       // have to deal with bitfields.
 81       if (has_metadata(curr, "disp32") || has_metadata(curr, "imm32")) {
 82         if (*curr.data.rbegin() == ':')
 83           raise << "'" << to_string(inst) << "': don't use ':' when jumping to labels\n" << end();
 84         current_byte += 4;
 85       }
 86       // automatically handle /disp8 and /imm8 here
 87       else if (*curr.data.rbegin() != ':') {
 88         ++current_byte;
 89       }
 90       else {
 91         string label = drop_last(curr.data);
 92         // ensure labels look sufficiently different from raw hex
 93         check_valid_name(label);
 94         if (trace_contains_errors()) return;
 95         if (contains_any_operand_metadata(curr))
 96           raise << "'" << to_string(inst) << "': label definition (':') not allowed in operand\n" << end();
 97         if (j > 0)
 98           raise << "'" << to_string(inst) << "': labels can only be the first word in a line.\n" << end();
 99         put(address, label, current_byte);
100         trace(99, "transform") << "label '" << label << "' is at address " << (current_byte+code.start) << end();
101         // no modifying current_byte; label definitions won't be in the final binary
102       }
103     }
104   }
105 }
106 
107 void drop_labels(segment& code) {
108   for (int i = 0;  i < SIZE(code.lines);  ++i) {
109     line& inst = code.lines.at(i);
110     vector<word>::iterator new_end = remove_if(inst.words.begin(), inst.words.end(), is_label);
111     inst.words.erase(new_end, inst.words.end());
112   }
113 }
114 
115 bool is_label(const word& w) {
116   return *w.data.rbegin() == ':';
117 }
118 
119 void replace_labels_with_addresses(segment& code, const map<string, int32_t>& address) {
120   int32_t byte_next_instruction_starts_at = 0;
121   for (int i = 0;  i < SIZE(code.lines);  ++i) {
122     line& inst = code.lines.at(i);
123     byte_next_instruction_starts_at += num_bytes(inst);
124     line new_inst;
125     for (int j = 0;  j < SIZE(inst.words);  ++j) {
126       const word& curr = inst.words.at(j);
127       if (contains_key(address, curr.data)) {
128         int32_t offset = static_cast<int32_t>(get(address, curr.data)) - byte_next_instruction_starts_at;
129         if (has_metadata(curr, "disp8") || has_metadata(curr, "imm8")) {
130           if (offset > 0xff || offset < -0x7f)
131             raise << "'" << to_string(inst) << "': label too far away for distance " << std::hex << offset << " to fit in 8 bits\n" << end();
132           else
133             emit_hex_bytes(new_inst, offset, 1);
134         }
135         else if (has_metadata(curr, "disp16")) {
136           if (offset > 0xffff || offset < -0x7fff)
137             raise << "'" << to_string(inst) << "': label too far away for distance " << std::hex << offset << " to fit in 16 bits\n" << end();
138           else
139             emit_hex_bytes(new_inst, offset, 2);
140         }
141         else if (has_metadata(curr, "disp32") || has_metadata(curr, "imm32")) {
142           emit_hex_bytes(new_inst, offset, 4);
143         }
144       }
145       else {
146         new_inst.words.push_back(curr);
147       }
148     }
149     inst.words.swap(new_inst.words);
150     trace(99, "transform") << "instruction after transform: '" << data_to_string(inst) << "'" << end();
151   }
152 }
153 
154 // Assumes all bitfields are packed.
155 uint32_t num_bytes(const line& inst) {
156   uint32_t sum = 0;
157   for (int i = 0;  i < SIZE(inst.words);  ++i) {
158     const word& curr = inst.words.at(i);
159     if (has_metadata(curr, "disp32") || has_metadata(curr, "imm32"))  // only multi-byte operands
160       sum += 4;
161     else
162       sum++;
163   }
164   return sum;
165 }
166 
167 string data_to_string(const line& inst) {
168   ostringstream out;
169   for (int i = 0;  i < SIZE(inst.words);  ++i) {
170     if (i > 0) out << ' ';
171     out << inst.words.at(i).data;
172   }
173   return out.str();
174 }
175 
// Return 's' without its final character (used to strip the trailing ':'
// from a label definition). An empty string is returned unchanged.
string drop_last(const string& s) {
  if (s.empty()) return s;  // there's no last character; decrementing end() would be undefined
  return s.substr(0, s.size()-1);
}
179 
180 //: Label definitions must be the first word on a line. No jumping inside
181 //: instructions.
182 //: They should also be the only word on a line.
183 //: However, you can absolutely have multiple labels map to the same address,
184 //: as long as they're on separate lines.
185 
186 :(scenario multiple_labels_at)
187 == 0x1
188           # instruction                     effective address                                                   operand     displacement    immediate
189           # op          subop               mod             rm32          base        index         scale       r32
190           # 1-3 bytes   3 bits              2 bits          3 bits        3 bits      3 bits        2 bits      2 bits      0/1/2/4 bytes   0/1/2/4 bytes
191 # address 1
192 loop:
193  $loop2:
194 # address 1 (labels take up no space)
195             05                                                                                                                              0x0d0c0b0a/imm32  # add to EAX
196 # address 6
197             eb                                                                                                              $loop2/disp8
198 # address 8
199             eb                                                                                                              $loop3/disp8
200 # address 0xa
201  $loop3:
202 +transform: label 'loop' is at address 1
203 +transform: label '$loop2' is at address 1
204 +transform: label '$loop3' is at address a
205 # first jump is to -7
206 +transform: instruction after transform: 'eb f9'
207 # second jump is to 0 (fall through)
208 +transform: instruction after transform: 'eb 00'
209 
210 :(scenario label_too_short)
211 % Hide_errors = true;
212 == 0x1
213           # instruction                     effective address                                                   operand     displacement    immediate
214           # op          subop               mod             rm32          base        index         scale       r32
215           # 1-3 bytes   3 bits              2 bits          3 bits        3 bits      3 bits        2 bits      2 bits      0/1/2/4 bytes   0/1/2/4 bytes
216 xz:
217             05                                                                                                                              0x0d0c0b0a/imm32  # add to EAX
218 +error: 'xz' is two characters long which can look like raw hex bytes at a glance; use a different name
219 
220 :(scenario label_hex)
221 % Hide_errors = true;
222 == 0x1
223           # instruction                     effective address                                                   operand     displacement    immediate
224           # op          subop               mod             rm32          base        index         scale       r32
225           # 1-3 bytes   3 bits              2 bits          3 bits        3 bits      3 bits        2 bits      2 bits      0/1/2/4 bytes   0/1/2/4 bytes
226 0xab:
227             05                                                                                                                              0x0d0c0b0a/imm32  # add to EAX
228 +error: '0xab' looks like a hex number; use a different name
229 
230 :(scenario label_negative_hex)
231 % Hide_errors = true;
232 == 0x1
233           # instruction                     effective address                                                   operand     displacement    immediate
234           # op          subop               mod             rm32          base        index         scale       r32
235           # 1-3 bytes   3 bits              2 bits          3 bits        3 bits      3 bits        2 bits      2 bits      0/1/2/4 bytes   0/1/2/4 bytes
236  -a:  # indent to avoid looking like a trace_should_not_contain command for this scenario
237             05                                                                                                                              0x0d0c0b0a/imm32  # add to EAX
238 +error: '-a' starts with '-', which can be confused with a negative number; use a different name