1 //: Beginning of "level 2": tagging bytes with metadata around what field of
  2 //: an x86 instruction they're for.
  3 //:
  4 //: The x86 instruction set is variable-length, and how a byte is interpreted
  5 //: affects later instruction boundaries. A lot of the pain in programming
  6 //: machine code stems from computer and programmer going out of sync on what
  7 //: a byte means. The miscommunication is usually not immediately caught, and
  8 //: metastasizes at runtime into kilobytes of misinterpreted instructions.
  9 //:
 10 //: To mitigate these issues, we'll start programming in terms of logical
 11 //: operands rather than physical bytes. Some operands are smaller than a
 12 //: byte, and others may consist of multiple bytes. This layer will correctly
 13 //: pack and order the bytes corresponding to the operands in an instruction.
 14 
 15 :(before "End Help Texts")
    //: Help page for the 'instructions' topic: an overview of how an instruction
    //: is made up of an opcode plus typed operands. The operand types it lists
    //: are the same names that get inserted into Instruction_operands.
 16 put(Help, "instructions",
 17   "Each x86 instruction consists of an instruction or opcode and some number\n"
 18   "of operands.\n"
 19   "Each operand has a type. An instruction won't have more than one operand of\n"
 20   "any type.\n"
 21   "Each instruction has some set of allowed operand types. It'll reject others.\n"
 22   "The complete list of operand types: mod, subop, r32 (register), rm32\n"
 23   "(register or memory), scale, index, base, disp8, disp16, disp32, imm8,\n"
 24   "imm32.\n"
 25   "Each of these has its own help page. Try reading 'subx help mod' next.\n"
 26 );
 27 :(before "End Help Contents")
    //: print the name of the 'instructions' topic at the "End Help Contents" waypoint
 28 cerr << "  instructions\n";
 29 
 30 :(scenario pack_immediate_constants)
 31 == 0x1
 32 # instruction                     effective address                                                   operand     displacement    immediate
 33 # op          subop               mod             rm32          base        index         scale       r32
 34 # 1-3 bytes   3 bits              2 bits          3 bits        3 bits      3 bits        2 bits      3 bits      0/1/2/4 bytes   0/1/2/4 bytes
 35   bb                                                                                                                              0x2a/imm32        # copy 42 to EBX
 36 +transform: packing instruction 'bb 0x2a/imm32'
 37 +transform: instruction after packing: 'bb 2a 00 00 00'
 38 +run: copy imm32 0x0000002a to EBX
 39 
 40 //: complete set of valid operand types
 41 
 42 :(before "End Globals")
    //: Names of the metadata tags that are treated as operand types. Metadata
    //: not in this set is ignored when deciding what field a word belongs to.
 43 set<string> Instruction_operands;
 44 :(before "End One-time Setup")
    //: fields of the ModR/M byte
 45 Instruction_operands.insert("subop");
 46 Instruction_operands.insert("mod");
 47 Instruction_operands.insert("rm32");
    //: fields of the SIB byte
 48 Instruction_operands.insert("base");
 49 Instruction_operands.insert("index");
 50 Instruction_operands.insert("scale");
    //: r32 shares its bits of the ModR/M byte with subop (see add_modrm_byte)
 51 Instruction_operands.insert("r32");
    //: displacements
 52 Instruction_operands.insert("disp8");
 53 Instruction_operands.insert("disp16");
 54 Instruction_operands.insert("disp32");
    //: immediates
 55 Instruction_operands.insert("imm8");
 56 Instruction_operands.insert("imm32");
 57 
 58 :(before "End Help Texts")
    //: register one help page per operand type
 59 init_operand_type_help();
 60 :(code)
    // Register a help page in Help for each operand type: mod, subop, r32, rm32,
    // base, index, scale, disp8, disp16, disp32, imm8 and imm32.
    // The text is shown to users verbatim, so it is part of the program's behavior.
 61 void init_operand_type_help() {
 62   put(Help, "mod",
 63     "2-bit operand controlling the _addressing mode_ of many instructions,\n"
 64     "to determine how to compute the _effective address_ to look up memory at\n"
 65     "based on the 'rm32' operand and potentially others.\n"
 66     "\n"
 67     "If mod = 3, just operate on the contents of the register specified by rm32\n"
 68     "            (direct mode).\n"
 69     "If mod = 2, effective address is usually* rm32 + disp32\n"
 70     "            (indirect mode with displacement).\n"
 71     "If mod = 1, effective address is usually* rm32 + disp8\n"
 72     "            (indirect mode with displacement).\n"
 73     "If mod = 0, effective address is usually* rm32 (indirect mode).\n"
 74     "(* - The exception is when rm32 is '4'. Register 4 is the stack pointer (ESP).\n"
 75     "     Using it as an address gets more involved. For more details,\n"
 76     "     try reading the help pages for 'base', 'index' and 'scale'.)\n"
 77     "\n"
 78     "For complete details consult the IA-32 software developer's manual, table 2-2,\n"
 79     "\"32-bit addressing forms with the ModR/M byte\".\n"
 80     "  https://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf\n"
 81   );
      // subop and r32 are packed into the same bits of the ModR/M byte
 82   put(Help, "subop",
 83     "Additional 3-bit operand for determining the instruction when the opcode is 81, 8f or ff.\n"
 84     "Can't coexist with operand of type 'r32' in a single instruction, because the two use the same bits.\n"
 85   );
 86   put(Help, "r32",
 87     "3-bit operand specifying a register operand used directly, without any further addressing modes.\n"
 88   );
 89   put(Help, "rm32",
 90     "3-bit operand specifying a register operand whose precise interpretation interacts with 'mod'.\n"
 91     "For complete details consult the IA-32 software developer's manual, table 2-2,\n"
 92     "\"32-bit addressing forms with the ModR/M byte\".\n"
 93     "  https://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf\n"
 94   );
      // base, index and scale are packed together into the SIB byte
 95   put(Help, "base",
 96     "Additional 3-bit operand (when 'rm32' is 4 unless 'mod' is 3) specifying the register containing an address to look up.\n"
 97     "This address may be further modified by 'index' and 'scale' operands.\n"
 98     "  effective address = base + index*scale + displacement (disp8 or disp32)\n"
 99     "For complete details consult the IA-32 software developer's manual, table 2-3,\n"
100     "\"32-bit addressing forms with the SIB byte\".\n"
101     "  https://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf\n"
102   );
103   put(Help, "index",
104     "Optional 3-bit operand (when 'rm32' is 4 unless 'mod' is 3) that can be added to the 'base' operand to compute the 'effective address' at which to look up memory.\n"
105     "  effective address = base + index*scale + displacement (disp8 or disp32)\n"
106     "For complete details consult the IA-32 software developer's manual, table 2-3,\n"
107     "\"32-bit addressing forms with the SIB byte\".\n"
108     "  https://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf\n"
109   );
      // NOTE(review): the x86 SIB byte encodes the multiplier as a power of two
      // (2^scale); confirm that 'index*scale' below describes what the emulator
      // actually computes.
110   put(Help, "scale",
111     "Optional 2-bit operand (when 'rm32' is 4 unless 'mod' is 3) that can be multiplied to the 'index' operand before adding the result to the 'base' operand to compute the _effective address_ to operate on.\n"
112     "  effective address = base + index * scale + displacement (disp8 or disp32)\n"
113     "For complete details consult the IA-32 software developer's manual, table 2-3,\n"
114     "\"32-bit addressing forms with the SIB byte\".\n"
115     "  https://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf\n"
116   );
117   put(Help, "disp8",
118     "8-bit value to be added in many instructions.\n"
119   );
120   put(Help, "disp16",
121     "16-bit value to be added in many instructions.\n"
122   );
123   put(Help, "disp32",
124     "32-bit value to be added in many instructions.\n"
125   );
126   put(Help, "imm8",
127     "8-bit value for many instructions.\n"
128   );
129   put(Help, "imm32",
130     "32-bit value for many instructions.\n"
131   );
132 }
133 
134 //:: transform packing operands into bytes in the right order
135 
136 :(after "Begin Transforms")
    //: Append pack_operands to the list of transforms. The Begin/End comments
    //: below look like waypoints for directives in other layers (presumably so
    //: they can add transforms around this one), so keep their text unchanged.
137 // Begin Level-2 Transforms
138 Transform.push_back(pack_operands);
139 // End Level-2 Transforms
140 
141 :(code)
142 void pack_operands(program& p) {
143   if (p.segments.empty()) return;
144   segment& code = p.segments.at(0);
145   // Pack Operands(segment code)
146   trace(99, "transform") << "-- pack operands" << end();
147   for (int i = 0;  i < SIZE(code.lines);  ++i) {
148     line& inst = code.lines.at(i);
149     if (all_hex_bytes(inst)) continue;
150     trace(99, "transform") << "packing instruction '" << to_string(/*with metadata*/inst) << "'" << end();
151     pack_operands(inst);
152     trace(99, "transform") << "instruction after packing: '" << to_string(/*without metadata*/inst.words) << "'" << end();
153   }
154 }
155 
156 void pack_operands(line& inst) {
157   line new_inst;
158   add_opcodes(inst, new_inst);
159   add_modrm_byte(inst, new_inst);
160   add_sib_byte(inst, new_inst);
161   add_disp_bytes(inst, new_inst);
162   add_imm_bytes(inst, new_inst);
163   inst.words.swap(new_inst.words);
164 }
165 
166 void add_opcodes(const line& in, line& out) {
167   out.words.push_back(in.words.at(0));
168   if (in.words.at(0).data == "0f" || in.words.at(0).data == "f2" || in.words.at(0).data == "f3")
169     out.words.push_back(in.words.at(1));
170   if (in.words.at(0).data == "f3" && in.words.at(1).data == "0f")
171     out.words.push_back(in.words.at(2));
172   if (in.words.at(0).data == "f2" && in.words.at(1).data == "0f")
173     out.words.push_back(in.words.at(2));
174 }
175 
176 void add_modrm_byte(const line& in, line& out) {
177   uint8_t mod=0, reg_subop=0, rm32=0;
178   bool emit = false;
179   for (int i = 0;  i < SIZE(in.words);  ++i) {
180     const word& curr = in.words.at(i);
181     if (has_metadata(curr, "mod")) {
182       mod = hex_byte(curr.data);
183       emit = true;
184     }
185     else if (has_metadata(curr, "rm32")) {
186       rm32 = hex_byte(curr.data);
187       emit = true;
188     }
189     else if (has_metadata(curr, "r32")) {
190       reg_subop = hex_byte(curr.data);
191       emit = true;
192     }
193     else if (has_metadata(curr, "subop")) {
194       reg_subop = hex_byte(curr.data);
195       emit = true;
196     }
197   }
198   if (emit)
199     out.words.push_back(hex_byte_text((mod << 6) | (reg_subop << 3) | rm32));
200 }
201 
202 void add_sib_byte(const line& in, line& out) {
203   uint8_t scale=0, index=0, base=0;
204   bool emit = false;
205   for (int i = 0;  i < SIZE(in.words);  ++i) {
206     const word& curr = in.words.at(i);
207     if (has_metadata(curr, "scale")) {
208       scale = hex_byte(curr.data);
209       emit = true;
210     }
211     else if (has_metadata(curr, "index")) {
212       index = hex_byte(curr.data);
213       emit = true;
214     }
215     else if (has_metadata(curr, "base")) {
216       base = hex_byte(curr.data);
217       emit = true;
218     }
219   }
220   if (emit)
221     out.words.push_back(hex_byte_text((scale << 6) | (index << 3) | base));
222 }
223 
224 void add_disp_bytes(const line& in, line& out) {
225   for (int i = 0;  i < SIZE(in.words);  ++i) {
226     const word& curr = in.words.at(i);
227     if (has_metadata(curr, "disp8"))
228       emit_hex_bytes(out, curr, 1);
229     if (has_metadata(curr, "disp16"))
230       emit_hex_bytes(out, curr, 2);
231     else if (has_metadata(curr, "disp32"))
232       emit_hex_bytes(out, curr, 4);
233   }
234 }
235 
236 void add_imm_bytes(const line& in, line& out) {
237   for (int i = 0;  i < SIZE(in.words);  ++i) {
238     const word& curr = in.words.at(i);
239     if (has_metadata(curr, "imm8"))
240       emit_hex_bytes(out, curr, 1);
241     else if (has_metadata(curr, "imm32"))
242       emit_hex_bytes(out, curr, 4);
243   }
244 }
245 
246 void emit_hex_bytes(line& out, const word& w, int num) {
247   assert(num <= 4);
248   if (num == 1 || !is_hex_int(w.data)) {
249     out.words.push_back(w);
250     if (is_hex_int(w.data))
251       out.words.back().data = hex_byte_to_string(parse_int(w.data));
252     return;
253   }
254   emit_hex_bytes(out, static_cast<uint32_t>(parse_int(w.data)), num);
255 }
256 
257 void emit_hex_bytes(line& out, uint32_t val, int num) {
258   assert(num <= 4);
259   for (int i = 0;  i < num;  ++i) {
260     out.words.push_back(hex_byte_text(val & 0xff));
261     val = val >> 8;
262   }
263 }
264 
265 word hex_byte_text(uint8_t val) {
266   word result;
267   result.data = hex_byte_to_string(val);
268   result.original = result.data+"/auto";
269   return result;
270 }
271 
272 string hex_byte_to_string(uint8_t val) {
273   ostringstream out;
274   // uint8_t prints without padding, but int8_t will expand to 32 bits again
275   out << HEXBYTE << NUM(val);
276   return out.str();
277 }
278 
279 string to_string(const vector<word>& in) {
280   ostringstream out;
281   for (int i = 0;  i < SIZE(in);  ++i) {
282     if (i > 0) out << ' ';
283     out << in.at(i).data;
284   }
285   return out.str();
286 }
287 
288 :(before "End Unit Tests")
289 void test_preserve_metadata_when_emitting_single_byte() {
290   word in;
291   in.data = "f0";
292   in.original = "f0/foo";
293   line out;
294   emit_hex_bytes(out, in, 1);
295   CHECK_EQ(out.words.at(0).data, "f0");
296   CHECK_EQ(out.words.at(0).original, "f0/foo");
297 }
298 
299 :(scenario pack_disp8)
300 == 0x1
301 74 2/disp8  # jump 2 bytes away if ZF is set
302 +transform: packing instruction '74 2/disp8'
303 +transform: instruction after packing: '74 02'
304 
305 :(scenarios transform)
306 :(scenario pack_disp8_negative)
307 == 0x1
308 # running this will cause an infinite loop
309 74 -1/disp8  # jump 1 byte before if ZF is set
310 +transform: packing instruction '74 -1/disp8'
311 +transform: instruction after packing: '74 ff'
312 :(scenarios run)
313 
314 //: helper for scenario
315 :(code)
316 void transform(const string& text_bytes) {
317   program p;
318   istringstream in(text_bytes);
319   parse(in, p);
320   if (trace_contains_errors()) return;
321   transform(p);
322 }
323 
324 :(scenario pack_modrm_imm32)
325 == 0x1
326 # instruction                     effective address                                                   operand     displacement    immediate
327 # op          subop               mod             rm32          base        index         scale       r32
328 # 1-3 bytes   3 bits              2 bits          3 bits        3 bits      3 bits        2 bits      3 bits      0/1/2/4 bytes   0/1/2/4 bytes
329   81          0/add/subop         3/mod/direct    3/ebx/rm32                                                                      1/imm32           # add 1 to EBX
330 +transform: packing instruction '81 0/add/subop 3/mod/direct 3/ebx/rm32 1/imm32'
331 +transform: instruction after packing: '81 c3 01 00 00 00'
332 
333 :(scenario pack_imm32_large)
334 == 0x1
335 b9 0x080490a7/imm32  # copy to ECX
336 +transform: packing instruction 'b9 0x080490a7/imm32'
337 +transform: instruction after packing: 'b9 a7 90 04 08'
338 
339 :(scenario pack_immediate_constants_hex)
340 == 0x1
341 # instruction                     effective address                                                   operand     displacement    immediate
342 # op          subop               mod             rm32          base        index         scale       r32
343 # 1-3 bytes   3 bits              2 bits          3 bits        3 bits      3 bits        2 bits      3 bits      0/1/2/4 bytes   0/1/2/4 bytes
344   bb                                                                                                                              0x2a/imm32        # copy 42 to EBX
345 +transform: packing instruction 'bb 0x2a/imm32'
346 +transform: instruction after packing: 'bb 2a 00 00 00'
347 +run: copy imm32 0x0000002a to EBX
348 
349 :(scenarios transform)
350 :(scenario pack_silently_ignores_non_hex)
351 == 0x1
352 # instruction                     effective address                                                   operand     displacement    immediate
353 # op          subop               mod             rm32          base        index         scale       r32
354 # 1-3 bytes   3 bits              2 bits          3 bits        3 bits      3 bits        2 bits      3 bits      0/1/2/4 bytes   0/1/2/4 bytes
355   bb                                                                                                                              foo/imm32         # copy foo to EBX
356 +transform: packing instruction 'bb foo/imm32'
357 # no change (we're just not printing metadata to the trace)
358 +transform: instruction after packing: 'bb foo'
359 $error: 0
360 :(scenarios run)
361 
362 //:: helpers
363 
364 :(code)
365 bool all_hex_bytes(const line& inst) {
366   for (int i = 0;  i < SIZE(inst.words);  ++i)
367     if (!is_hex_byte(inst.words.at(i)))
368       return false;
369   return true;
370 }
371 
372 bool is_hex_byte(const word& curr) {
373   if (contains_any_operand_metadata(curr))
374     return false;
375   if (SIZE(curr.data) != 2)
376     return false;
377   if (curr.data.find_first_not_of("0123456789abcdefABCDEF") != string::npos)
378     return false;
379   return true;
380 }
381 
382 bool contains_any_operand_metadata(const word& word) {
383   for (int i = 0;  i < SIZE(word.metadata);  ++i)
384     if (Instruction_operands.find(word.metadata.at(i)) != Instruction_operands.end())
385       return true;
386   return false;
387 }
388 
389 bool has_metadata(const line& inst, const string& m) {
390   bool result = false;
391   for (int i = 0;  i < SIZE(inst.words);  ++i) {
392     if (!has_metadata(inst.words.at(i), m)) continue;
393     if (result) {
394       raise << "'" << to_string(inst) << "' has conflicting " << m << " operands\n" << end();
395       return false;
396     }
397     result = true;
398   }
399   return result;
400 }
401 
402 bool has_metadata(const word& w, const string& m) {
403   bool result = false;
404   bool metadata_found = false;
405   for (int i = 0;  i < SIZE(w.metadata);  ++i) {
406     const string& curr = w.metadata.at(i);
407     if (Instruction_operands.find(curr) == Instruction_operands.end()) continue;  // ignore unrecognized metadata
408     if (metadata_found) {
409       raise << "'" << w.original << "' has conflicting operand types; it should have only one\n" << end();
410       return false;
411     }
412     metadata_found = true;
413     result = (curr == m);
414   }
415   return result;
416 }
417 
418 word metadata(const line& inst, const string& m) {
419   for (int i = 0;  i < SIZE(inst.words);  ++i)
420     if (has_metadata(inst.words.at(i), m))
421       return inst.words.at(i);
422   assert(false);
423 }
424 
// Report whether `s` is written like a hexadecimal integer: an optional sign
// ('+' or '-'), an optional "0x" prefix, then one or more hex digits.
bool is_hex_int(const string& s) {
  if (s.empty()) return false;
  size_t pos = 0;
  if (s.at(0) == '-' || s.at(0) == '+') ++pos;
  // Compare exactly two characters. (substr's second argument is a length,
  // not an end position, so substr(pos, pos+2) fails to spot the prefix
  // whenever a sign precedes it.)
  if (s.compare(pos, 2, "0x") == 0) pos += 2;
  // a bare sign or a bare prefix contains no digits and so isn't a number
  if (pos == s.size()) return false;
  return s.find_first_not_of("0123456789abcdefABCDEF", pos) == string::npos;
}
432 
433 :(code)
434 string to_string(const line& inst) {
435   ostringstream out;
436   for (int i = 0;  i < SIZE(inst.words);  ++i) {
437     if (i > 0) out << ' ';
438     out << inst.words.at(i).original;
439   }
440   return out.str();
441 }