1 //: Beginning of "level 2": tagging bytes with metadata around what field of
  2 //: an x86 instruction they're for.
  3 //:
  4 //: The x86 instruction set is variable-length, and how a byte is interpreted
  5 //: affects later instruction boundaries. A lot of the pain in programming
  6 //: machine code stems from computer and programmer going out of sync on what
  7 //: a byte means. The miscommunication is usually not immediately caught, and
  8 //: metastasizes at runtime into kilobytes of misinterpreted instructions.
  9 //:
 10 //: To mitigate these issues, we'll start programming in terms of logical
 11 //: operands rather than physical bytes. Some operands are smaller than a
 12 //: byte, and others may consist of multiple bytes. This layer will correctly
 13 //: pack and order the bytes corresponding to the operands in an instruction.
 14 
//: Register the top-level 'instructions' help page, which lists every operand
//: type and points the reader at the per-operand pages.
:(before "End Help Texts")
put(Help, "instructions",
  "Each x86 instruction consists of an instruction or opcode and some number\n"
  "of operands.\n"
  "Each operand has a type. An instruction won't have more than one operand of\n"
  "any type.\n"
  "Each instruction has some set of allowed operand types. It'll reject others.\n"
  "The complete list of operand types: mod, subop, r32 (register), rm32\n"
  "(register or memory), scale, index, base, disp8, disp16, disp32, imm8,\n"
  "imm32.\n"
  "Each of these has its own help page. Try reading 'subx help mod' next.\n"
);
//: Print the name of the new page at the "End Help Contents" waypoint
//: (presumably the listing of available help topics -- confirm in the layer
//: that defines that waypoint).
:(before "End Help Contents")
cerr << "  instructions\n";
 29 
 30 :(scenario pack_immediate_constants)
 31 == 0x1
 32 # instruction                     effective address                                                   operand     displacement    immediate
 33 # op          subop               mod             rm32          base        index         scale       r32
# 1-3 bytes   3 bits              2 bits          3 bits        3 bits      3 bits        2 bits      3 bits      0/1/2/4 bytes   0/1/2/4 bytes
 35   bb                                                                                                                              0x2a/imm32        # copy 42 to EBX
 36 +transform: packing instruction 'bb 0x2a/imm32'
 37 +transform: instruction after packing: 'bb 2a 00 00 00'
 38 +run: copy imm32 0x0000002a to EBX
 39 
 40 //: complete set of valid operand types
 41 
:(before "End Globals")
// Every metadata tag that names an operand type. contains_any_operand_metadata()
// and has_operand_metadata() ignore metadata that isn't in this set.
set<string> Instruction_operands;
:(before "End One-time Setup")
// operands packed by add_modrm_byte()
Instruction_operands.insert("subop");
Instruction_operands.insert("mod");
Instruction_operands.insert("rm32");
// operands packed by add_sib_byte()
Instruction_operands.insert("base");
Instruction_operands.insert("index");
Instruction_operands.insert("scale");
// also packed by add_modrm_byte(), into the same bits as 'subop'
Instruction_operands.insert("r32");
// operands emitted by add_disp_bytes()
Instruction_operands.insert("disp8");
Instruction_operands.insert("disp16");
Instruction_operands.insert("disp32");
// operands emitted by add_imm_bytes()
Instruction_operands.insert("imm8");
Instruction_operands.insert("imm32");
 57 
//: Register one help page per operand type.
:(before "End Help Texts")
init_operand_type_help();
:(code)
// Add a page to Help for each of the operand types in Instruction_operands.
// The page name is the operand type itself.
void init_operand_type_help() {
  // operands that add_modrm_byte() packs into a single byte
  put(Help, "mod",
    "2-bit operand controlling the _addressing mode_ of many instructions,\n"
    "to determine how to compute the _effective address_ to look up memory at\n"
    "based on the 'rm32' operand and potentially others.\n"
    "\n"
    "If mod = 3, just operate on the contents of the register specified by rm32\n"
    "            (direct mode).\n"
    "If mod = 2, effective address is usually* rm32 + disp32\n"
    "            (indirect mode with displacement).\n"
    "If mod = 1, effective address is usually* rm32 + disp8\n"
    "            (indirect mode with displacement).\n"
    "If mod = 0, effective address is usually* rm32 (indirect mode).\n"
    "(* - The exception is when rm32 is '4'. Register 4 is the stack pointer (ESP).\n"
    "     Using it as an address gets more involved. For more details,\n"
    "     try reading the help pages for 'base', 'index' and 'scale'.)\n"
    "\n"
    "For complete details consult the IA-32 software developer's manual, table 2-2,\n"
    "\"32-bit addressing forms with the ModR/M byte\".\n"
    "  https://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf\n"
  );
  put(Help, "subop",
    "Additional 3-bit operand for determining the instruction when the opcode is 81, 8f or ff.\n"
    "Can't coexist with operand of type 'r32' in a single instruction, because the two use the same bits.\n"
  );
  put(Help, "r32",
    "3-bit operand specifying a register operand used directly, without any further addressing modes.\n"
  );
  put(Help, "rm32",
    "32-bit value in register or memory. The precise details of its construction depend on the eponymous 3-bit\n"
    "'rm32' operand, the 'mod' operand, and also potentially the 'SIB' operands ('scale', 'index' and 'base')\n"
    "and a displacement ('disp8' or 'disp32').\n"
    "For complete details consult the IA-32 software developer's manual, table 2-2,\n"
    "\"32-bit addressing forms with the ModR/M byte\".\n"
    "  https://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf\n"
  );
  // operands that add_sib_byte() packs into a single byte
  put(Help, "base",
    "Additional 3-bit operand (when 'rm32' is 4 unless 'mod' is 3) specifying the register containing an address to look up.\n"
    "This address may be further modified by 'index' and 'scale' operands.\n"
    "  effective address = base + index*scale + displacement (disp8 or disp32)\n"
    "For complete details consult the IA-32 software developer's manual, table 2-3,\n"
    "\"32-bit addressing forms with the SIB byte\".\n"
    "  https://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf\n"
  );
  put(Help, "index",
    "Optional 3-bit operand (when 'rm32' is 4 unless 'mod' is 3) that can be added to the 'base' operand to compute the 'effective address' at which to look up memory.\n"
    "  effective address = base + index*scale + displacement (disp8 or disp32)\n"
    "For complete details consult the IA-32 software developer's manual, table 2-3,\n"
    "\"32-bit addressing forms with the SIB byte\".\n"
    "  https://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf\n"
  );
  put(Help, "scale",
    "Optional 2-bit operand (when 'rm32' is 4 unless 'mod' is 3) that can be multiplied to the 'index' operand before adding the result to the 'base' operand to compute the _effective address_ to operate on.\n"
    "  effective address = base + index * scale + displacement (disp8 or disp32)\n"
    "For complete details consult the IA-32 software developer's manual, table 2-3,\n"
    "\"32-bit addressing forms with the SIB byte\".\n"
    "  https://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf\n"
  );
  // operands that add_disp_bytes() emits as 1, 2 or 4 bytes
  put(Help, "disp8",
    "8-bit value to be added in many instructions.\n"
  );
  put(Help, "disp16",
    "16-bit value to be added in many instructions.\n"
  );
  put(Help, "disp32",
    "32-bit value to be added in many instructions.\n"
  );
  // operands that add_imm_bytes() emits as 1 or 4 bytes
  put(Help, "imm8",
    "8-bit value for many instructions.\n"
  );
  put(Help, "imm32",
    "32-bit value for many instructions.\n"
  );
}
135 
136 //:: transform packing operands into bytes in the right order
137 
//: Append pack_operands to the Transform list. The Begin/End comments below
//: are left in the tangled output, presumably as waypoints for other layers
//: to attach their own level-2 transforms to -- confirm against later layers.
:(after "Begin Transforms")
// Begin Level-2 Transforms
Transform.push_back(pack_operands);
// End Level-2 Transforms
142 
143 :(code)
144 void pack_operands(program& p) {
145   if (p.segments.empty()) return;
146   segment& code = p.segments.at(0);
147   // Pack Operands(segment code)
148   trace(99, "transform") << "-- pack operands" << end();
149   for (int i = 0;  i < SIZE(code.lines);  ++i) {
150     line& inst = code.lines.at(i);
151     if (all_hex_bytes(inst)) continue;
152     trace(99, "transform") << "packing instruction '" << to_string(/*with metadata*/inst) << "'" << end();
153     pack_operands(inst);
154     trace(99, "transform") << "instruction after packing: '" << to_string(/*without metadata*/inst.words) << "'" << end();
155   }
156 }
157 
158 void pack_operands(line& inst) {
159   line new_inst;
160   add_opcodes(inst, new_inst);
161   add_modrm_byte(inst, new_inst);
162   add_sib_byte(inst, new_inst);
163   add_disp_bytes(inst, new_inst);
164   add_imm_bytes(inst, new_inst);
165   inst.words.swap(new_inst.words);
166 }
167 
168 void add_opcodes(const line& in, line& out) {
169   out.words.push_back(in.words.at(0));
170   if (in.words.at(0).data == "0f" || in.words.at(0).data == "f2" || in.words.at(0).data == "f3")
171     out.words.push_back(in.words.at(1));
172   if (in.words.at(0).data == "f3" && in.words.at(1).data == "0f")
173     out.words.push_back(in.words.at(2));
174   if (in.words.at(0).data == "f2" && in.words.at(1).data == "0f")
175     out.words.push_back(in.words.at(2));
176 }
177 
178 void add_modrm_byte(const line& in, line& out) {
179   uint8_t mod=0, reg_subop=0, rm32=0;
180   bool emit = false;
181   for (int i = 0;  i < SIZE(in.words);  ++i) {
182     const word& curr = in.words.at(i);
183     if (has_operand_metadata(curr, "mod")) {
184       mod = hex_byte(curr.data);
185       emit = true;
186     }
187     else if (has_operand_metadata(curr, "rm32")) {
188       rm32 = hex_byte(curr.data);
189       emit = true;
190     }
191     else if (has_operand_metadata(curr, "r32")) {
192       reg_subop = hex_byte(curr.data);
193       emit = true;
194     }
195     else if (has_operand_metadata(curr, "subop")) {
196       reg_subop = hex_byte(curr.data);
197       emit = true;
198     }
199   }
200   if (emit)
201     out.words.push_back(hex_byte_text((mod << 6) | (reg_subop << 3) | rm32));
202 }
203 
204 void add_sib_byte(const line& in, line& out) {
205   uint8_t scale=0, index=0, base=0;
206   bool emit = false;
207   for (int i = 0;  i < SIZE(in.words);  ++i) {
208     const word& curr = in.words.at(i);
209     if (has_operand_metadata(curr, "scale")) {
210       scale = hex_byte(curr.data);
211       emit = true;
212     }
213     else if (has_operand_metadata(curr, "index")) {
214       index = hex_byte(curr.data);
215       emit = true;
216     }
217     else if (has_operand_metadata(curr, "base")) {
218       base = hex_byte(curr.data);
219       emit = true;
220     }
221   }
222   if (emit)
223     out.words.push_back(hex_byte_text((scale << 6) | (index << 3) | base));
224 }
225 
226 void add_disp_bytes(const line& in, line& out) {
227   for (int i = 0;  i < SIZE(in.words);  ++i) {
228     const word& curr = in.words.at(i);
229     if (has_operand_metadata(curr, "disp8"))
230       emit_hex_bytes(out, curr, 1);
231     if (has_operand_metadata(curr, "disp16"))
232       emit_hex_bytes(out, curr, 2);
233     else if (has_operand_metadata(curr, "disp32"))
234       emit_hex_bytes(out, curr, 4);
235   }
236 }
237 
238 void add_imm_bytes(const line& in, line& out) {
239   for (int i = 0;  i < SIZE(in.words);  ++i) {
240     const word& curr = in.words.at(i);
241     if (has_operand_metadata(curr, "imm8"))
242       emit_hex_bytes(out, curr, 1);
243     else if (has_operand_metadata(curr, "imm32"))
244       emit_hex_bytes(out, curr, 4);
245   }
246 }
247 
248 void emit_hex_bytes(line& out, const word& w, int num) {
249   assert(num <= 4);
250   if (num == 1 || !is_hex_int(w.data)) {
251     out.words.push_back(w);
252     if (is_hex_int(w.data))
253       out.words.back().data = hex_byte_to_string(parse_int(w.data));
254     return;
255   }
256   emit_hex_bytes(out, static_cast<uint32_t>(parse_int(w.data)), num);
257 }
258 
259 void emit_hex_bytes(line& out, uint32_t val, int num) {
260   assert(num <= 4);
261   for (int i = 0;  i < num;  ++i) {
262     out.words.push_back(hex_byte_text(val & 0xff));
263     val = val >> 8;
264   }
265 }
266 
267 word hex_byte_text(uint8_t val) {
268   word result;
269   result.data = hex_byte_to_string(val);
270   result.original = result.data+"/auto";
271   return result;
272 }
273 
274 string hex_byte_to_string(uint8_t val) {
275   ostringstream out;
276   // uint8_t prints without padding, but int8_t will expand to 32 bits again
277   out << HEXBYTE << NUM(val);
278   return out.str();
279 }
280 
281 string to_string(const vector<word>& in) {
282   ostringstream out;
283   for (int i = 0;  i < SIZE(in);  ++i) {
284     if (i > 0) out << ' ';
285     out << in.at(i).data;
286   }
287   return out.str();
288 }
289 
290 :(before "End Unit Tests")
291 void test_preserve_metadata_when_emitting_single_byte() {
292   word in;
293   in.data = "f0";
294   in.original = "f0/foo";
295   line out;
296   emit_hex_bytes(out, in, 1);
297   CHECK_EQ(out.words.at(0).data, "f0");
298   CHECK_EQ(out.words.at(0).original, "f0/foo");
299 }
300 
301 :(scenario pack_disp8)
302 == 0x1
303 74 2/disp8  # jump 2 bytes away if ZF is set
304 +transform: packing instruction '74 2/disp8'
305 +transform: instruction after packing: '74 02'
306 
307 :(scenarios transform)
308 :(scenario pack_disp8_negative)
309 == 0x1
310 # running this will cause an infinite loop
311 74 -1/disp8  # jump 1 byte before if ZF is set
312 +transform: packing instruction '74 -1/disp8'
313 +transform: instruction after packing: '74 ff'
314 :(scenarios run)
315 
316 //: helper for scenario
317 :(code)
318 void transform(const string& text_bytes) {
319   program p;
320   istringstream in(text_bytes);
321   parse(in, p);
322   if (trace_contains_errors()) return;
323   transform(p);
324 }
325 
326 :(scenario pack_modrm_imm32)
327 == 0x1
328 # instruction                     effective address                                                   operand     displacement    immediate
329 # op          subop               mod             rm32          base        index         scale       r32
# 1-3 bytes   3 bits              2 bits          3 bits        3 bits      3 bits        2 bits      3 bits      0/1/2/4 bytes   0/1/2/4 bytes
331   81          0/add/subop         3/mod/direct    3/ebx/rm32                                                                      1/imm32           # add 1 to EBX
332 +transform: packing instruction '81 0/add/subop 3/mod/direct 3/ebx/rm32 1/imm32'
333 +transform: instruction after packing: '81 c3 01 00 00 00'
334 
335 :(scenario pack_imm32_large)
336 == 0x1
337 b9 0x080490a7/imm32  # copy to ECX
338 +transform: packing instruction 'b9 0x080490a7/imm32'
339 +transform: instruction after packing: 'b9 a7 90 04 08'
340 
341 :(scenario pack_immediate_constants_hex)
342 == 0x1
343 # instruction                     effective address                                                   operand     displacement    immediate
344 # op          subop               mod             rm32          base        index         scale       r32
# 1-3 bytes   3 bits              2 bits          3 bits        3 bits      3 bits        2 bits      3 bits      0/1/2/4 bytes   0/1/2/4 bytes
346   bb                                                                                                                              0x2a/imm32        # copy 42 to EBX
347 +transform: packing instruction 'bb 0x2a/imm32'
348 +transform: instruction after packing: 'bb 2a 00 00 00'
349 +run: copy imm32 0x0000002a to EBX
350 
351 :(scenarios transform)
352 :(scenario pack_silently_ignores_non_hex)
353 == 0x1
354 # instruction                     effective address                                                   operand     displacement    immediate
355 # op          subop               mod             rm32          base        index         scale       r32
# 1-3 bytes   3 bits              2 bits          3 bits        3 bits      3 bits        2 bits      3 bits      0/1/2/4 bytes   0/1/2/4 bytes
357   bb                                                                                                                              foo/imm32         # copy foo to EBX
358 +transform: packing instruction 'bb foo/imm32'
359 # no change (we're just not printing metadata to the trace)
360 +transform: instruction after packing: 'bb foo'
361 $error: 0
362 :(scenarios run)
363 
364 //:: helpers
365 
366 :(code)
367 bool all_hex_bytes(const line& inst) {
368   for (int i = 0;  i < SIZE(inst.words);  ++i)
369     if (!is_hex_byte(inst.words.at(i)))
370       return false;
371   return true;
372 }
373 
374 bool is_hex_byte(const word& curr) {
375   if (contains_any_operand_metadata(curr))
376     return false;
377   if (SIZE(curr.data) != 2)
378     return false;
379   if (curr.data.find_first_not_of("0123456789abcdefABCDEF") != string::npos)
380     return false;
381   return true;
382 }
383 
384 bool contains_any_operand_metadata(const word& word) {
385   for (int i = 0;  i < SIZE(word.metadata);  ++i)
386     if (Instruction_operands.find(word.metadata.at(i)) != Instruction_operands.end())
387       return true;
388   return false;
389 }
390 
391 bool has_operand_metadata(const line& inst, const string& m) {
392   bool result = false;
393   for (int i = 0;  i < SIZE(inst.words);  ++i) {
394     if (!has_operand_metadata(inst.words.at(i), m)) continue;
395     if (result) {
396       raise << "'" << to_string(inst) << "' has conflicting " << m << " operands\n" << end();
397       return false;
398     }
399     result = true;
400   }
401   return result;
402 }
403 
404 bool has_operand_metadata(const word& w, const string& m) {
405   bool result = false;
406   bool metadata_found = false;
407   for (int i = 0;  i < SIZE(w.metadata);  ++i) {
408     const string& curr = w.metadata.at(i);
409     if (Instruction_operands.find(curr) == Instruction_operands.end()) continue;  // ignore unrecognized metadata
410     if (metadata_found) {
411       raise << "'" << w.original << "' has conflicting operand types; it should have only one\n" << end();
412       return false;
413     }
414     metadata_found = true;
415     result = (curr == m);
416   }
417   return result;
418 }
419 
420 word metadata(const line& inst, const string& m) {
421   for (int i = 0;  i < SIZE(inst.words);  ++i)
422     if (has_operand_metadata(inst.words.at(i), m))
423       return inst.words.at(i);
424   assert(false);
425 }
426 
// Report whether 's' is spelled like a hex integer: an optional sign ('-' or
// '+'), then an optional "0x" prefix, then one or more hex digits (either
// case). The empty string is not a hex integer.
bool is_hex_int(const string& s) {
  if (s.empty()) return false;
  size_t pos = 0;
  if (s.at(0) == '-' || s.at(0) == '+') pos++;
  // Compare exactly 2 characters. (The second argument of substr() is a
  // length, so substr(pos, pos+2) compared 3 characters after a sign and
  // wrongly rejected values like "-0x1".)
  if (s.compare(pos, 2, "0x") == 0) pos += 2;
  // a sign and/or prefix with no digits after it isn't a number
  if (pos >= s.size()) return false;
  return s.find_first_not_of("0123456789abcdefABCDEF", pos) == string::npos;
}
434 
435 :(code)
436 string to_string(const line& inst) {
437   ostringstream out;
438   for (int i = 0;  i < SIZE(inst.words);  ++i) {
439     if (i > 0) out << ' ';
440     out << inst.words.at(i).original;
441   }
442   return out.str();
443 }