1 //: Beginning of "level 2": tagging bytes with metadata around what field of
  2 //: an x86 instruction they're for.
  3 //:
  4 //: The x86 instruction set is variable-length, and how a byte is interpreted
  5 //: affects later instruction boundaries. A lot of the pain in programming
  6 //: machine code stems from computer and programmer going out of sync on what
  7 //: a byte means. The miscommunication is usually not immediately caught, and
  8 //: metastasizes at runtime into kilobytes of misinterpreted instructions.
  9 //:
 10 //: To mitigate these issues, we'll start programming in terms of logical
 11 //: operands rather than physical bytes. Some operands are smaller than a
 12 //: byte, and others may consist of multiple bytes. This layer will correctly
 13 //: pack and order the bytes corresponding to the operands in an instruction.
 14 
//: Register the overview help page under the topic name "instructions".
//: (Presumably what 'subx help instructions' prints; the lookup code is not
//: in this layer.)
:(before "End Help Texts")
put(Help, "instructions",
  "Each x86 instruction consists of an instruction or opcode and some number\n"
  "of operands.\n"
  "Each operand has a type. An instruction won't have more than one operand of\n"
  "any type.\n"
  "Each instruction has some set of allowed operand types. It'll reject others.\n"
  "The complete list of operand types: mod, subop, r32 (register), rm32\n"
  "(register or memory), scale, index, base, disp8, disp16, disp32, imm8,\n"
  "imm32.\n"
  "Each of these has its own help page. Try reading 'subx help mod' next.\n"
);
//: Also print the topic name at the "End Help Contents" waypoint.
:(before "End Help Contents")
cerr << "  instructions\n";
 29 
 30 :(scenario pack_immediate_constants)
 31 == 0x1
 32 # instruction                     effective address                                                   operand     displacement    immediate
 33 # op          subop               mod             rm32          base        index         scale       r32
# 1-3 bytes   3 bits              2 bits          3 bits        3 bits      3 bits        2 bits      3 bits      0/1/2/4 bytes   0/1/2/4 bytes
 35   bb                                                                                                                              0x2a/imm32        # copy 42 to EBX
 36 +transform: packing instruction 'bb 0x2a/imm32'
 37 +transform: instruction after packing: 'bb 2a 00 00 00'
 38 +run: copy imm32 0x0000002a to EBX
 39 
 40 //: complete set of valid operand types
 41 
:(before "End Globals")
// Names of every metadata tag that denotes an operand type. Metadata that is
// not in this set is skipped when looking for a word's operand type.
set<string> Instruction_operands;
:(before "End One-time Setup")
// operands consulted when building the ModR/M byte
Instruction_operands.insert("subop");
Instruction_operands.insert("mod");
Instruction_operands.insert("rm32");
// operands consulted when building the SIB byte
Instruction_operands.insert("base");
Instruction_operands.insert("index");
Instruction_operands.insert("scale");
// also consulted when building the ModR/M byte (shares bits with 'subop')
Instruction_operands.insert("r32");
// displacements
Instruction_operands.insert("disp8");
Instruction_operands.insert("disp16");
Instruction_operands.insert("disp32");
// immediates
Instruction_operands.insert("imm8");
Instruction_operands.insert("imm32");
 57 
//: Install one help page per operand type alongside the other help texts.
:(before "End Help Texts")
init_operand_type_help();
:(code)
// Register a help page in 'Help' for each operand type, keyed by the same
// name that is used as metadata on instruction words.
void init_operand_type_help() {
  // -- operands that are packed into the ModR/M byte
  put(Help, "mod",
    "2-bit operand controlling the _addressing mode_ of many instructions,\n"
    "to determine how to compute the _effective address_ to look up memory at\n"
    "based on the 'rm32' operand and potentially others.\n"
    "\n"
    "If mod = 3, just operate on the contents of the register specified by rm32\n"
    "            (direct mode).\n"
    "If mod = 2, effective address is usually* rm32 + disp32\n"
    "            (indirect mode with displacement).\n"
    "If mod = 1, effective address is usually* rm32 + disp8\n"
    "            (indirect mode with displacement).\n"
    "If mod = 0, effective address is usually* rm32 (indirect mode).\n"
    "(* - The exception is when rm32 is '4'. Register 4 is the stack pointer (ESP).\n"
    "     Using it as an address gets more involved. For more details,\n"
    "     try reading the help pages for 'base', 'index' and 'scale'.)\n"
    "\n"
    "For complete details consult the IA-32 software developer's manual, table 2-2,\n"
    "\"32-bit addressing forms with the ModR/M byte\".\n"
    "  https://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf\n"
  );
  put(Help, "subop",
    "Additional 3-bit operand for determining the instruction when the opcode is 81, 8f or ff.\n"
    "Can't coexist with operand of type 'r32' in a single instruction, because the two use the same bits.\n"
  );
  put(Help, "r32",
    "3-bit operand specifying a register operand used directly, without any further addressing modes.\n"
  );
  put(Help, "rm32",
    "32-bit value in register or memory. The precise details of its construction depend on the eponymous 3-bit\n"
    "'rm32' operand, the 'mod' operand, and also potentially the 'SIB' operands ('scale', 'index' and 'base')\n"
    "and a displacement ('disp8' or 'disp32').\n"
    "For complete details consult the IA-32 software developer's manual, table 2-2,\n"
    "\"32-bit addressing forms with the ModR/M byte\".\n"
    "  https://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf\n"
  );
  // -- operands that are packed into the SIB byte
  put(Help, "base",
    "Additional 3-bit operand (when 'rm32' is 4 unless 'mod' is 3) specifying the register containing an address to look up.\n"
    "This address may be further modified by 'index' and 'scale' operands.\n"
    "  effective address = base + index*scale + displacement (disp8 or disp32)\n"
    "For complete details consult the IA-32 software developer's manual, table 2-3,\n"
    "\"32-bit addressing forms with the SIB byte\".\n"
    "  https://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf\n"
  );
  put(Help, "index",
    "Optional 3-bit operand (when 'rm32' is 4 unless 'mod' is 3) that can be added to the 'base' operand to compute the 'effective address' at which to look up memory.\n"
    "  effective address = base + index*scale + displacement (disp8 or disp32)\n"
    "For complete details consult the IA-32 software developer's manual, table 2-3,\n"
    "\"32-bit addressing forms with the SIB byte\".\n"
    "  https://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf\n"
  );
  put(Help, "scale",
    "Optional 2-bit operand (when 'rm32' is 4 unless 'mod' is 3) that can be multiplied to the 'index' operand before adding the result to the 'base' operand to compute the _effective address_ to operate on.\n"
    "  effective address = base + index * scale + displacement (disp8 or disp32)\n"
    "For complete details consult the IA-32 software developer's manual, table 2-3,\n"
    "\"32-bit addressing forms with the SIB byte\".\n"
    "  https://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf\n"
  );
  // -- displacement operands
  put(Help, "disp8",
    "8-bit value to be added in many instructions.\n"
  );
  put(Help, "disp16",
    "16-bit value to be added in many instructions.\n"
  );
  put(Help, "disp32",
    "32-bit value to be added in many instructions.\n"
  );
  // -- immediate operands
  put(Help, "imm8",
    "8-bit value for many instructions.\n"
  );
  put(Help, "imm32",
    "32-bit value for many instructions.\n"
  );
}
135 
136 //:: transform packing operands into bytes in the right order
137 
//: Append pack_operands to the Transform list. The Begin/End comments below
//: are kept as waypoints (presumably so other layers can add their own
//: level-2 transforms relative to them).
:(after "Begin Transforms")
// Begin Level-2 Transforms
Transform.push_back(pack_operands);
// End Level-2 Transforms
142 
143 :(code)
144 void pack_operands(program& p) {
145   if (p.segments.empty()) return;
146   segment& code = p.segments.at(0);
147   // Pack Operands(segment code)
148   trace(99, "transform") << "-- pack operands" << end();
149   for (int i = 0;  i < SIZE(code.lines);  ++i) {
150     line& inst = code.lines.at(i);
151     if (all_hex_bytes(inst)) continue;
152     trace(99, "transform") << "packing instruction '" << to_string(/*with metadata*/inst) << "'" << end();
153     pack_operands(inst);
154     trace(99, "transform") << "instruction after packing: '" << to_string(/*without metadata*/inst.words) << "'" << end();
155   }
156 }
157 
158 void pack_operands(line& inst) {
159   line new_inst;
160   add_opcodes(inst, new_inst);
161   add_modrm_byte(inst, new_inst);
162   add_sib_byte(inst, new_inst);
163   add_disp_bytes(inst, new_inst);
164   add_imm_bytes(inst, new_inst);
165   inst.words.swap(new_inst.words);
166 }
167 
168 void add_opcodes(const line& in, line& out) {
169   out.words.push_back(in.words.at(0));
170   if (in.words.at(0).data == "0f" || in.words.at(0).data == "f2" || in.words.at(0).data == "f3")
171     out.words.push_back(in.words.at(1));
172   if (in.words.at(0).data == "f3" && in.words.at(1).data == "0f")
173     out.words.push_back(in.words.at(2));
174   if (in.words.at(0).data == "f2" && in.words.at(1).data == "0f")
175     out.words.push_back(in.words.at(2));
176 }
177 
178 void add_modrm_byte(const line& in, line& out) {
179   uint8_t mod=0, reg_subop=0, rm32=0;
180   bool emit = false;
181   for (int i = 0;  i < SIZE(in.words);  ++i) {
182     const word& curr = in.words.at(i);
183     if (has_operand_metadata(curr, "mod")) {
184       mod = hex_byte(curr.data);
185       emit = true;
186     }
187     else if (has_operand_metadata(curr, "rm32")) {
188       rm32 = hex_byte(curr.data);
189       emit = true;
190     }
191     else if (has_operand_metadata(curr, "r32")) {
192       reg_subop = hex_byte(curr.data);
193       emit = true;
194     }
195     else if (has_operand_metadata(curr, "subop")) {
196       reg_subop = hex_byte(curr.data);
197       emit = true;
198     }
199   }
200   if (emit)
201     out.words.push_back(hex_byte_text((mod << 6) | (reg_subop << 3) | rm32));
202 }
203 
204 void add_sib_byte(const line& in, line& out) {
205   uint8_t scale=0, index=0, base=0;
206   bool emit = false;
207   for (int i = 0;  i < SIZE(in.words);  ++i) {
208     const word& curr = in.words.at(i);
209     if (has_operand_metadata(curr, "scale")) {
210       scale = hex_byte(curr.data);
211       emit = true;
212     }
213     else if (has_operand_metadata(curr, "index")) {
214       index = hex_byte(curr.data);
215       emit = true;
216     }
217     else if (has_operand_metadata(curr, "base")) {
218       base = hex_byte(curr.data);
219       emit = true;
220     }
221   }
222   if (emit)
223     out.words.push_back(hex_byte_text((scale << 6) | (index << 3) | base));
224 }
225 
226 void add_disp_bytes(const line& in, line& out) {
227   for (int i = 0;  i < SIZE(in.words);  ++i) {
228     const word& curr = in.words.at(i);
229     if (has_operand_metadata(curr, "disp8"))
230       emit_hex_bytes(out, curr, 1);
231     if (has_operand_metadata(curr, "disp16"))
232       emit_hex_bytes(out, curr, 2);
233     else if (has_operand_metadata(curr, "disp32"))
234       emit_hex_bytes(out, curr, 4);
235   }
236 }
237 
238 void add_imm_bytes(const line& in, line& out) {
239   for (int i = 0;  i < SIZE(in.words);  ++i) {
240     const word& curr = in.words.at(i);
241     if (has_operand_metadata(curr, "imm8"))
242       emit_hex_bytes(out, curr, 1);
243     else if (has_operand_metadata(curr, "imm32"))
244       emit_hex_bytes(out, curr, 4);
245   }
246 }
247 
248 void emit_hex_bytes(line& out, const word& w, int num) {
249   assert(num <= 4);
250   bool is_number = looks_like_hex_int(w.data);
251   if (num == 1 || !is_number) {
252     out.words.push_back(w);  // preserve existing metadata
253     if (is_number)
254       out.words.back().data = hex_byte_to_string(parse_int(w.data));
255     return;
256   }
257   emit_hex_bytes(out, static_cast<uint32_t>(parse_int(w.data)), num);
258 }
259 
260 void emit_hex_bytes(line& out, uint32_t val, int num) {
261   assert(num <= 4);
262   for (int i = 0;  i < num;  ++i) {
263     out.words.push_back(hex_byte_text(val & 0xff));
264     val = val >> 8;
265   }
266 }
267 
268 word hex_byte_text(uint8_t val) {
269   word result;
270   result.data = hex_byte_to_string(val);
271   result.original = result.data+"/auto";
272   return result;
273 }
274 
275 string hex_byte_to_string(uint8_t val) {
276   ostringstream out;
277   // uint8_t prints without padding, but int8_t will expand to 32 bits again
278   out << HEXBYTE << NUM(val);
279   return out.str();
280 }
281 
282 string to_string(const vector<word>& in) {
283   ostringstream out;
284   for (int i = 0;  i < SIZE(in);  ++i) {
285     if (i > 0) out << ' ';
286     out << in.at(i).data;
287   }
288   return out.str();
289 }
290 
291 :(before "End Unit Tests")
292 void test_preserve_metadata_when_emitting_single_byte() {
293   word in;
294   in.data = "f0";
295   in.original = "f0/foo";
296   line out;
297   emit_hex_bytes(out, in, 1);
298   CHECK_EQ(out.words.at(0).data, "f0");
299   CHECK_EQ(out.words.at(0).original, "f0/foo");
300 }
301 
302 :(scenario pack_disp8)
303 == 0x1
304 74 2/disp8  # jump 2 bytes away if ZF is set
305 +transform: packing instruction '74 2/disp8'
306 +transform: instruction after packing: '74 02'
307 
308 :(scenarios transform)
309 :(scenario pack_disp8_negative)
310 == 0x1
311 # running this will cause an infinite loop
312 74 -1/disp8  # jump 1 byte before if ZF is set
313 +transform: packing instruction '74 -1/disp8'
314 +transform: instruction after packing: '74 ff'
315 :(scenarios run)
316 
317 //: helper for scenario
318 :(code)
319 void transform(const string& text_bytes) {
320   program p;
321   istringstream in(text_bytes);
322   parse(in, p);
323   if (trace_contains_errors()) return;
324   transform(p);
325 }
326 
327 :(scenario pack_modrm_imm32)
328 == 0x1
329 # instruction                     effective address                                                   operand     displacement    immediate
330 # op          subop               mod             rm32          base        index         scale       r32
# 1-3 bytes   3 bits              2 bits          3 bits        3 bits      3 bits        2 bits      3 bits      0/1/2/4 bytes   0/1/2/4 bytes
332   81          0/add/subop         3/mod/direct    3/ebx/rm32                                                                      1/imm32           # add 1 to EBX
333 +transform: packing instruction '81 0/add/subop 3/mod/direct 3/ebx/rm32 1/imm32'
334 +transform: instruction after packing: '81 c3 01 00 00 00'
335 
336 :(scenario pack_imm32_large)
337 == 0x1
338 b9 0x080490a7/imm32  # copy to ECX
339 +transform: packing instruction 'b9 0x080490a7/imm32'
340 +transform: instruction after packing: 'b9 a7 90 04 08'
341 
342 :(scenario pack_immediate_constants_hex)
343 == 0x1
344 # instruction                     effective address                                                   operand     displacement    immediate
345 # op          subop               mod             rm32          base        index         scale       r32
# 1-3 bytes   3 bits              2 bits          3 bits        3 bits      3 bits        2 bits      3 bits      0/1/2/4 bytes   0/1/2/4 bytes
347   bb                                                                                                                              0x2a/imm32        # copy 42 to EBX
348 +transform: packing instruction 'bb 0x2a/imm32'
349 +transform: instruction after packing: 'bb 2a 00 00 00'
350 +run: copy imm32 0x0000002a to EBX
351 
352 :(scenarios transform)
353 :(scenario pack_silently_ignores_non_hex)
354 % Hide_errors = true;
355 == 0x1
356 # instruction                     effective address                                                   operand     displacement    immediate
357 # op          subop               mod             rm32          base        index         scale       r32
# 1-3 bytes   3 bits              2 bits          3 bits        3 bits      3 bits        2 bits      3 bits      0/1/2/4 bytes   0/1/2/4 bytes
359   bb                                                                                                                              foo/imm32         # copy to EBX
360 +transform: packing instruction 'bb foo/imm32'
361 # no change (we're just not printing metadata to the trace)
362 +transform: instruction after packing: 'bb foo'
363 :(scenarios run)
364 
365 :(scenario pack_flags_bad_hex)
366 % Hide_errors = true;
367 == 0x1
368 # instruction                     effective address                                                   operand     displacement    immediate
369 # op          subop               mod             rm32          base        index         scale       r32
# 1-3 bytes   3 bits              2 bits          3 bits        3 bits      3 bits        2 bits      3 bits      0/1/2/4 bytes   0/1/2/4 bytes
371   bb                                                                                                                              0xfoo/imm32       # copy to EBX
372 +error: not a number: 0xfoo
373 
374 //:: helpers
375 
376 :(code)
377 bool all_hex_bytes(const line& inst) {
378   for (int i = 0;  i < SIZE(inst.words);  ++i)
379     if (!is_hex_byte(inst.words.at(i)))
380       return false;
381   return true;
382 }
383 
384 bool is_hex_byte(const word& curr) {
385   if (contains_any_operand_metadata(curr))
386     return false;
387   if (SIZE(curr.data) != 2)
388     return false;
389   if (curr.data.find_first_not_of("0123456789abcdefABCDEF") != string::npos)
390     return false;
391   return true;
392 }
393 
394 bool contains_any_operand_metadata(const word& word) {
395   for (int i = 0;  i < SIZE(word.metadata);  ++i)
396     if (Instruction_operands.find(word.metadata.at(i)) != Instruction_operands.end())
397       return true;
398   return false;
399 }
400 
401 bool has_operand_metadata(const line& inst, const string& m) {
402   bool result = false;
403   for (int i = 0;  i < SIZE(inst.words);  ++i) {
404     if (!has_operand_metadata(inst.words.at(i), m)) continue;
405     if (result) {
406       raise << "'" << to_string(inst) << "' has conflicting " << m << " operands\n" << end();
407       return false;
408     }
409     result = true;
410   }
411   return result;
412 }
413 
414 bool has_operand_metadata(const word& w, const string& m) {
415   bool result = false;
416   bool metadata_found = false;
417   for (int i = 0;  i < SIZE(w.metadata);  ++i) {
418     const string& curr = w.metadata.at(i);
419     if (Instruction_operands.find(curr) == Instruction_operands.end()) continue;  // ignore unrecognized metadata
420     if (metadata_found) {
421       raise << "'" << w.original << "' has conflicting operand types; it should have only one\n" << end();
422       return false;
423     }
424     metadata_found = true;
425     result = (curr == m);
426   }
427   return result;
428 }
429 
430 word metadata(const line& inst, const string& m) {
431   for (int i = 0;  i < SIZE(inst.words);  ++i)
432     if (has_operand_metadata(inst.words.at(i), m))
433       return inst.words.at(i);
434   assert(false);
435 }
436 
// Cheap guess at whether 's' is meant to be a number, based only on its
// first character: a sign or a decimal digit. It does not validate the rest
// of the string. Empty strings are not numbers.
bool looks_like_hex_int(const string& s) {
  if (s.empty()) return false;
  if (s.at(0) == '-' || s.at(0) == '+') return true;
  // isdigit() has undefined behavior for negative arguments, which a plain
  // char holds for any non-ASCII byte; go through unsigned char first.
  if (isdigit(static_cast<unsigned char>(s.at(0)))) return true;  // includes '0x' prefix
  // End looks_like_hex_int(s) Detectors
  return false;
}
444 
445 :(code)
446 string to_string(const line& inst) {
447   ostringstream out;
448   for (int i = 0;  i < SIZE(inst.words);  ++i) {
449     if (i > 0) out << ' ';
450     out << inst.words.at(i).original;
451   }
452   return out.str();
453 }