1 //: Beginning of "level 2": tagging bytes with metadata around what field of
  2 //: an x86 instruction they're for.
  3 //:
  4 //: The x86 instruction set is variable-length, and how a byte is interpreted
  5 //: affects later instruction boundaries. A lot of the pain in programming
  6 //: machine code stems from computer and programmer going out of sync on what
  7 //: a byte means. The miscommunication is usually not immediately caught, and
  8 //: metastasizes at runtime into kilobytes of misinterpreted instructions.
  9 //:
 10 //: To mitigate these issues, we'll start programming in terms of logical
 11 //: operands rather than physical bytes. Some operands are smaller than a
 12 //: byte, and others may consist of multiple bytes. This layer will correctly
 13 //: pack and order the bytes corresponding to the operands in an instruction.
 14 
//: Register the 'instructions' help page under the key "instructions", and
//: print its name at the "End Help Contents" waypoint.
//: The operand types listed in the text mirror the names inserted into
//: Instruction_operands further down in this layer.
:(before "End Help Texts")
put(Help, "instructions",
  "Each x86 instruction consists of an instruction or opcode and some number\n"
  "of operands.\n"
  "Each operand has a type. An instruction won't have more than one operand of\n"
  "any type.\n"
  "Each instruction has some set of allowed operand types. It'll reject others.\n"
  "The complete list of operand types: mod, subop, r32 (register), rm32\n"
  "(register or memory), scale, index, base, disp8, disp16, disp32, imm8,\n"
  "imm32.\n"
  "Each of these has its own help page. Try reading 'subx help mod' next.\n"
);
:(before "End Help Contents")
cerr << "  instructions\n";
 29 
 30 :(scenario pack_immediate_constants)
 31 == 0x1
 32 # instruction                     effective address                                                   operand     displacement    immediate
 33 # op          subop               mod             rm32          base        index         scale       r32
# 1-3 bytes   3 bits              2 bits          3 bits        3 bits      3 bits        2 bits      3 bits      0/1/2/4 bytes   0/1/2/4 bytes
 35   bb                                                                                                                              0x2a/imm32        # copy 42 to EBX
 36 +transform: packing instruction 'bb 0x2a/imm32'
 37 +transform: instruction after packing: 'bb 2a 00 00 00'
 38 +run: copy imm32 0x0000002a to EBX
 39 
 40 //: complete set of valid operand types
 41 
//: Names of the metadata tags that mark a word as an instruction operand
//: (e.g. the 'imm32' in '0x2a/imm32'). has_metadata() ignores any metadata
//: on a word that isn't in this set, and contains_any_operand_metadata()
//: uses it to tell operands apart from plain hex bytes.
:(before "End Globals")
set<string> Instruction_operands;
:(before "End One-time Setup")
// operands packed into the ModR/M byte
Instruction_operands.insert("subop");
Instruction_operands.insert("mod");
Instruction_operands.insert("rm32");
// operands packed into the SIB byte
Instruction_operands.insert("base");
Instruction_operands.insert("index");
Instruction_operands.insert("scale");
// also packed into the ModR/M byte (shares its bits with 'subop')
Instruction_operands.insert("r32");
// displacements
Instruction_operands.insert("disp8");
Instruction_operands.insert("disp16");
Instruction_operands.insert("disp32");
// immediates
Instruction_operands.insert("imm8");
Instruction_operands.insert("imm32");
 57 
 58 :(before "End Help Texts")
 59 init_operand_type_help();
 60 :(code)
// Register one help page per operand type in Instruction_operands.
// Each page is stored in Help under the operand type's name, so the keys
// here should stay in sync with the names inserted into Instruction_operands.
void init_operand_type_help() {
  // operands that get packed into the ModR/M byte
  put(Help, "mod",
    "2-bit operand controlling the _addressing mode_ of many instructions,\n"
    "to determine how to compute the _effective address_ to look up memory at\n"
    "based on the 'rm32' operand and potentially others.\n"
    "\n"
    "If mod = 3, just operate on the contents of the register specified by rm32\n"
    "            (direct mode).\n"
    "If mod = 2, effective address is usually* rm32 + disp32\n"
    "            (indirect mode with displacement).\n"
    "If mod = 1, effective address is usually* rm32 + disp8\n"
    "            (indirect mode with displacement).\n"
    "If mod = 0, effective address is usually* rm32 (indirect mode).\n"
    "(* - The exception is when rm32 is '4'. Register 4 is the stack pointer (ESP).\n"
    "     Using it as an address gets more involved. For more details,\n"
    "     try reading the help pages for 'base', 'index' and 'scale'.)\n"
    "\n"
    "For complete details consult the IA-32 software developer's manual, table 2-2,\n"
    "\"32-bit addressing forms with the ModR/M byte\".\n"
    "  https://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf\n"
  );
  put(Help, "subop",
    "Additional 3-bit operand for determining the instruction when the opcode is 81, 8f or ff.\n"
    "Can't coexist with operand of type 'r32' in a single instruction, because the two use the same bits.\n"
  );
  put(Help, "r32",
    "3-bit operand specifying a register operand used directly, without any further addressing modes.\n"
  );
  put(Help, "rm32",
    "3-bit operand specifying a register operand whose precise interpretation interacts with 'mod'.\n"
    "For complete details consult the IA-32 software developer's manual, table 2-2,\n"
    "\"32-bit addressing forms with the ModR/M byte\".\n"
    "  https://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf\n"
  );
  // operands that get packed into the SIB byte
  put(Help, "base",
    "Additional 3-bit operand (when 'rm32' is 4 unless 'mod' is 3) specifying the register containing an address to look up.\n"
    "This address may be further modified by 'index' and 'scale' operands.\n"
    "  effective address = base + index*scale + displacement (disp8 or disp32)\n"
    "For complete details consult the IA-32 software developer's manual, table 2-3,\n"
    "\"32-bit addressing forms with the SIB byte\".\n"
    "  https://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf\n"
  );
  put(Help, "index",
    "Optional 3-bit operand (when 'rm32' is 4 unless 'mod' is 3) that can be added to the 'base' operand to compute the 'effective address' at which to look up memory.\n"
    "  effective address = base + index*scale + displacement (disp8 or disp32)\n"
    "For complete details consult the IA-32 software developer's manual, table 2-3,\n"
    "\"32-bit addressing forms with the SIB byte\".\n"
    "  https://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf\n"
  );
  put(Help, "scale",
    "Optional 2-bit operand (when 'rm32' is 4 unless 'mod' is 3) that can be multiplied to the 'index' operand before adding the result to the 'base' operand to compute the _effective address_ to operate on.\n"
    "  effective address = base + index * scale + displacement (disp8 or disp32)\n"
    "For complete details consult the IA-32 software developer's manual, table 2-3,\n"
    "\"32-bit addressing forms with the SIB byte\".\n"
    "  https://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf\n"
  );
  // displacements
  put(Help, "disp8",
    "8-bit value to be added in many instructions.\n"
  );
  put(Help, "disp16",
    "16-bit value to be added in many instructions.\n"
  );
  put(Help, "disp32",
    "32-bit value to be added in many instructions.\n"
  );
  // immediates
  put(Help, "imm8",
    "8-bit value for many instructions.\n"
  );
  put(Help, "imm32",
    "32-bit value for many instructions.\n"
  );
}
133 
134 //:: transform packing operands into bytes in the right order
135 
//: Append pack_operands to the list of transforms.
//: NOTE(review): the Begin/End comments below look like waypoints for other
//: layers to insert transforms around this one -- keep their text unchanged.
:(before "End Transforms")
// Begin Level-2 Transforms
Transform.push_back(pack_operands);
// End Level-2 Transforms
140 
141 :(code)
142 void pack_operands(program& p) {
143   if (p.segments.empty()) return;
144   segment& code = p.segments.at(0);
145   // Pack Operands(segment code)
146   trace(99, "transform") << "-- pack operands" << end();
147   for (int i = 0;  i < SIZE(code.lines);  ++i) {
148     line& inst = code.lines.at(i);
149     if (all_hex_bytes(inst)) continue;
150     trace(99, "transform") << "packing instruction '" << to_string(/*with metadata*/inst) << "'" << end();
151     pack_operands(inst);
152     trace(99, "transform") << "instruction after packing: '" << to_string(/*without metadata*/inst.words) << "'" << end();
153   }
154 }
155 
156 void pack_operands(line& inst) {
157   line new_inst;
158   add_opcodes(inst, new_inst);
159   add_modrm_byte(inst, new_inst);
160   add_sib_byte(inst, new_inst);
161   add_disp_bytes(inst, new_inst);
162   add_imm_bytes(inst, new_inst);
163   inst.words.swap(new_inst.words);
164 }
165 
166 void add_opcodes(const line& in, line& out) {
167   out.words.push_back(in.words.at(0));
168   if (in.words.at(0).data == "0f" || in.words.at(0).data == "f2" || in.words.at(0).data == "f3")
169     out.words.push_back(in.words.at(1));
170   if (in.words.at(0).data == "f3" && in.words.at(1).data == "0f")
171     out.words.push_back(in.words.at(2));
172   if (in.words.at(0).data == "f2" && in.words.at(1).data == "0f")
173     out.words.push_back(in.words.at(2));
174 }
175 
176 void add_modrm_byte(const line& in, line& out) {
177   uint8_t mod=0, reg_subop=0, rm32=0;
178   bool emit = false;
179   for (int i = 0;  i < SIZE(in.words);  ++i) {
180     const word& curr = in.words.at(i);
181     if (has_metadata(curr, "mod")) {
182       mod = hex_byte(curr.data);
183       emit = true;
184     }
185     else if (has_metadata(curr, "rm32")) {
186       rm32 = hex_byte(curr.data);
187       emit = true;
188     }
189     else if (has_metadata(curr, "r32")) {
190       reg_subop = hex_byte(curr.data);
191       emit = true;
192     }
193     else if (has_metadata(curr, "subop")) {
194       reg_subop = hex_byte(curr.data);
195       emit = true;
196     }
197   }
198   if (emit)
199     out.words.push_back(hex_byte_text((mod << 6) | (reg_subop << 3) | rm32));
200 }
201 
202 void add_sib_byte(const line& in, line& out) {
203   uint8_t scale=0, index=0, base=0;
204   bool emit = false;
205   for (int i = 0;  i < SIZE(in.words);  ++i) {
206     const word& curr = in.words.at(i);
207     if (has_metadata(curr, "scale")) {
208       scale = hex_byte(curr.data);
209       emit = true;
210     }
211     else if (has_metadata(curr, "index")) {
212       index = hex_byte(curr.data);
213       emit = true;
214     }
215     else if (has_metadata(curr, "base")) {
216       base = hex_byte(curr.data);
217       emit = true;
218     }
219   }
220   if (emit)
221     out.words.push_back(hex_byte_text((scale << 6) | (index << 3) | base));
222 }
223 
224 void add_disp_bytes(const line& in, line& out) {
225   for (int i = 0;  i < SIZE(in.words);  ++i) {
226     const word& curr = in.words.at(i);
227     if (has_metadata(curr, "disp8"))
228       emit_hex_bytes(out, curr, 1);
229     if (has_metadata(curr, "disp16"))
230       emit_hex_bytes(out, curr, 2);
231     else if (has_metadata(curr, "disp32"))
232       emit_hex_bytes(out, curr, 4);
233   }
234 }
235 
236 void add_imm_bytes(const line& in, line& out) {
237   for (int i = 0;  i < SIZE(in.words);  ++i) {
238     const word& curr = in.words.at(i);
239     if (has_metadata(curr, "imm8"))
240       emit_hex_bytes(out, curr, 1);
241     else if (has_metadata(curr, "imm32"))
242       emit_hex_bytes(out, curr, 4);
243   }
244 }
245 
246 void emit_hex_bytes(line& out, const word& w, int num) {
247   assert(num <= 4);
248   if (num == 1 || !is_hex_int(w.data)) {
249     out.words.push_back(w);
250     if (is_hex_int(w.data))
251       out.words.back().data = hex_byte_to_string(parse_int(w.data));
252     return;
253   }
254   emit_hex_bytes(out, static_cast<uint32_t>(parse_int(w.data)), num);
255 }
256 
257 void emit_hex_bytes(line& out, uint32_t val, int num) {
258   assert(num <= 4);
259   for (int i = 0;  i < num;  ++i) {
260     out.words.push_back(hex_byte_text(val & 0xff));
261     val = val >> 8;
262   }
263 }
264 
265 word hex_byte_text(uint8_t val) {
266   word result;
267   result.data = hex_byte_to_string(val);
268   result.original = result.data+"/auto";
269   return result;
270 }
271 
// Render a single byte as text using the HEXBYTE and NUM stream helpers
// (both defined outside this layer).
// NOTE(review): the scenarios in this layer expect two lowercase hex digits
// ('02', 'ff', '2a'); confirm against the definition of HEXBYTE.
string hex_byte_to_string(uint8_t val) {
  ostringstream out;
  out << HEXBYTE << NUM(val);
  return out.str();
}
277 
278 string to_string(const vector<word>& in) {
279   ostringstream out;
280   for (int i = 0;  i < SIZE(in);  ++i) {
281     if (i > 0) out << ' ';
282     out << in.at(i).data;
283   }
284   return out.str();
285 }
286 
287 :(before "End Unit Tests")
// Emitting a 1-byte operand copies the incoming word rather than building a
// fresh one, so the word's 'original' text (including metadata this layer
// doesn't recognize, like '/foo') must survive packing.
void test_preserve_metadata_when_emitting_single_byte() {
  word in;
  in.data = "f0";
  in.original = "f0/foo";
  line out;
  emit_hex_bytes(out, in, 1);
  // data is still a normalized hex byte
  CHECK_EQ(out.words.at(0).data, "f0");
  // original is untouched
  CHECK_EQ(out.words.at(0).original, "f0/foo");
}
297 
298 :(scenario pack_disp8)
299 == 0x1
300 74 2/disp8  # jump 2 bytes away if ZF is set
301 +transform: packing instruction '74 2/disp8'
302 +transform: instruction after packing: '74 02'
303 
304 :(scenarios transform)
305 :(scenario pack_disp8_negative)
306 == 0x1
307 # running this will cause an infinite loop
308 74 -1/disp8  # jump 1 byte before if ZF is set
309 +transform: packing instruction '74 -1/disp8'
310 +transform: instruction after packing: '74 ff'
311 :(scenarios run)
312 
313 //: helper for scenario
314 :(code)
315 void transform(const string& text_bytes) {
316   program p;
317   istringstream in(text_bytes);
318   parse(in, p);
319   if (trace_contains_errors()) return;
320   transform(p);
321 }
322 
323 :(scenario pack_modrm_imm32)
324 == 0x1
325 # instruction                     effective address                                                   operand     displacement    immediate
326 # op          subop               mod             rm32          base        index         scale       r32
# 1-3 bytes   3 bits              2 bits          3 bits        3 bits      3 bits        2 bits      3 bits      0/1/2/4 bytes   0/1/2/4 bytes
328   81          0/add/subop         3/mod/direct    3/ebx/rm32                                                                      1/imm32           # add 1 to EBX
329 +transform: packing instruction '81 0/add/subop 3/mod/direct 3/ebx/rm32 1/imm32'
330 +transform: instruction after packing: '81 c3 01 00 00 00'
331 
332 :(scenario pack_imm32_large)
333 == 0x1
334 b9 0x080490a7/imm32  # copy to ECX
335 +transform: packing instruction 'b9 0x080490a7/imm32'
336 +transform: instruction after packing: 'b9 a7 90 04 08'
337 
338 :(scenario pack_immediate_constants_hex)
339 == 0x1
340 # instruction                     effective address                                                   operand     displacement    immediate
341 # op          subop               mod             rm32          base        index         scale       r32
# 1-3 bytes   3 bits              2 bits          3 bits        3 bits      3 bits        2 bits      3 bits      0/1/2/4 bytes   0/1/2/4 bytes
343   bb                                                                                                                              0x2a/imm32        # copy 42 to EBX
344 +transform: packing instruction 'bb 0x2a/imm32'
345 +transform: instruction after packing: 'bb 2a 00 00 00'
346 +run: copy imm32 0x0000002a to EBX
347 
348 :(scenarios transform)
349 :(scenario pack_silently_ignores_non_hex)
350 == 0x1
351 # instruction                     effective address                                                   operand     displacement    immediate
352 # op          subop               mod             rm32          base        index         scale       r32
# 1-3 bytes   3 bits              2 bits          3 bits        3 bits      3 bits        2 bits      3 bits      0/1/2/4 bytes   0/1/2/4 bytes
354   bb                                                                                                                              foo/imm32         # copy foo to EBX
355 +transform: packing instruction 'bb foo/imm32'
356 # no change (we're just not printing metadata to the trace)
357 +transform: instruction after packing: 'bb foo'
358 $error: 0
359 :(scenarios run)
360 
361 //:: helpers
362 
363 :(code)
364 bool all_hex_bytes(const line& inst) {
365   for (int i = 0;  i < SIZE(inst.words);  ++i)
366     if (!is_hex_byte(inst.words.at(i)))
367       return false;
368   return true;
369 }
370 
371 bool is_hex_byte(const word& curr) {
372   if (contains_any_operand_metadata(curr))
373     return false;
374   if (SIZE(curr.data) != 2)
375     return false;
376   if (curr.data.find_first_not_of("0123456789abcdefABCDEF") != string::npos)
377     return false;
378   return true;
379 }
380 
381 bool contains_any_operand_metadata(const word& word) {
382   for (int i = 0;  i < SIZE(word.metadata);  ++i)
383     if (Instruction_operands.find(word.metadata.at(i)) != Instruction_operands.end())
384       return true;
385   return false;
386 }
387 
388 bool has_metadata(const line& inst, const string& m) {
389   bool result = false;
390   for (int i = 0;  i < SIZE(inst.words);  ++i) {
391     if (!has_metadata(inst.words.at(i), m)) continue;
392     if (result) {
393       raise << "'" << to_string(inst) << "' has conflicting " << m << " operands\n" << end();
394       return false;
395     }
396     result = true;
397   }
398   return result;
399 }
400 
401 bool has_metadata(const word& w, const string& m) {
402   bool result = false;
403   bool metadata_found = false;
404   for (int i = 0;  i < SIZE(w.metadata);  ++i) {
405     const string& curr = w.metadata.at(i);
406     if (Instruction_operands.find(curr) == Instruction_operands.end()) continue;  // ignore unrecognized metadata
407     if (metadata_found) {
408       raise << "'" << w.original << "' has conflicting operand types; it should have only one\n" << end();
409       return false;
410     }
411     metadata_found = true;
412     result = (curr == m);
413   }
414   return result;
415 }
416 
417 word metadata(const line& inst, const string& m) {
418   for (int i = 0;  i < SIZE(inst.words);  ++i)
419     if (has_metadata(inst.words.at(i), m))
420       return inst.words.at(i);
421   assert(false);
422 }
423 
// Does 's' look like a hexadecimal integer?
// Accepted form: an optional sign ('-' or '+'), an optional "0x" prefix, then
// one or more hex digits. Numbers are always hex here, with or without the
// prefix.
bool is_hex_int(const string& s) {
  if (s.empty()) return false;
  size_t pos = 0;
  if (s.at(0) == '-' || s.at(0) == '+') pos++;
  // compare() takes a length. The previous substr(pos, pos+2) read one
  // character too many after a sign, so "-0x1" was rejected.
  if (s.compare(pos, 2, "0x") == 0) pos += 2;
  // a bare sign or a bare "0x" is not a number
  if (pos >= s.size()) return false;
  return s.find_first_not_of("0123456789abcdefABCDEF", pos) == string::npos;
}
431 
432 int32_t parse_int(const string& s) {
433   if (s.empty()) return 0;
434   istringstream in(s);
435   in >> std::hex;
436   if (s.at(0) == '-') {
437     int32_t result = 0;
438     in >> result;
439     if (!in || !in.eof()) {
440       raise << "not a number: " << s << '\n' << end();
441       return 0;
442     }
443     return result;
444   }
445   uint32_t uresult = 0;
446   in >> uresult;
447   if (!in || !in.eof()) {
448     raise << "not a number: " << s << '\n' << end();
449     return 0;
450   }
451   return static_cast<int32_t>(uresult);
452 }
453 :(before "End Unit Tests")
454 void test_parse_int() {
455   CHECK_EQ(0, parse_int("0"));
456   CHECK_EQ(0, parse_int("0x0"));
457   CHECK_EQ(0, parse_int("0x0"));
458   CHECK_EQ(16, parse_int("10"));  // hex always
459   CHECK_EQ(-1, parse_int("-1"));
460   CHECK_EQ(-1, parse_int("0xffffffff"));
461 }
462 
463 :(code)
464 string to_string(const line& inst) {
465   ostringstream out;
466   for (int i = 0;  i < SIZE(inst.words);  ++i) {
467     if (i > 0) out << ' ';
468     out << inst.words.at(i).original;
469   }
470   return out.str();
471 }