https://github.com/akkartik/mu/blob/master/032---operands.cc
  1 //: Beginning of "level 2": tagging bytes with metadata around what field of
  2 //: an x86 instruction they're for.
  3 //:
  4 //: The x86 instruction set is variable-length, and how a byte is interpreted
  5 //: affects later instruction boundaries. A lot of the pain in programming
  6 //: machine code stems from computer and programmer going out of sync on what
  7 //: a byte means. The miscommunication is usually not immediately caught, and
  8 //: metastasizes at runtime into kilobytes of misinterpreted instructions.
  9 //:
 10 //: To mitigate these issues, we'll start programming in terms of logical
 11 //: operands rather than physical bytes. Some operands are smaller than a
 12 //: byte, and others may consist of multiple bytes. This layer will correctly
 13 //: pack and order the bytes corresponding to the operands in an instruction.
 14 
 15 :(before "End Help Texts")
 16 put_new(Help, "instructions",
 17   "Each x86 instruction consists of an instruction or opcode and some number\n"
 18   "of operands.\n"
 19   "Each operand has a type. An instruction won't have more than one operand of\n"
 20   "any type.\n"
 21   "Each instruction has some set of allowed operand types. It'll reject others.\n"
 22   "The complete list of operand types: mod, subop, r32 (register), rm32\n"
 23   "(register or memory), scale, index, base, disp8, disp16, disp32, imm8,\n"
 24   "imm32.\n"
 25   "Each of these has its own help page. Try reading 'subx help mod' next.\n"
 26 );
 27 :(before "End Help Contents")
 28 cerr << "  instructions\n";
 29 
 30 :(code)
 31 void test_pack_immediate_constants() {
 32   run(
 33       "== code 0x1\n"
 34       "bb  0x2a/imm32\n"
 35   );
 36   CHECK_TRACE_CONTENTS(
 37       "transform: packing instruction 'bb 0x2a/imm32'\n"
 38       "transform: instruction after packing: 'bb 2a 00 00 00'\n"
 39       "run: copy imm32 0x0000002a to EBX\n"
 40   );
 41 }
 42 
 43 //: complete set of valid operand types
 44 
 45 :(before "End Globals")
 46 set<string> Instruction_operands;
 47 :(before "End One-time Setup")
 48 Instruction_operands.insert("subop");
 49 Instruction_operands.insert("mod");
 50 Instruction_operands.insert("rm32");
 51 Instruction_operands.insert("base");
 52 Instruction_operands.insert("index");
 53 Instruction_operands.insert("scale");
 54 Instruction_operands.insert("r32");
 55 Instruction_operands.insert("disp8");
 56 Instruction_operands.insert("disp16");
 57 Instruction_operands.insert("disp32");
 58 Instruction_operands.insert("imm8");
 59 Instruction_operands.insert("imm32");
 60 
 61 :(before "End Help Texts")
 62 init_operand_type_help();
 63 :(code)
 64 void init_operand_type_help() {
 65   put(Help, "mod",
 66     "2-bit operand controlling the _addressing mode_ of many instructions,\n"
 67     "to determine how to compute the _effective address_ to look up memory at\n"
 68     "based on the 'rm32' operand and potentially others.\n"
 69     "\n"
 70     "If mod = 3, just operate on the contents of the register specified by rm32\n"
 71     "            (direct mode).\n"
 72     "If mod = 2, effective address is usually* rm32 + disp32\n"
 73     "            (indirect mode with displacement).\n"
 74     "If mod = 1, effective address is usually* rm32 + disp8\n"
 75     "            (indirect mode with displacement).\n"
 76     "If mod = 0, effective address is usually* rm32 (indirect mode).\n"
 77     "(* - The exception is when rm32 is '4'. Register 4 is the stack pointer (ESP).\n"
 78     "     Using it as an address gets more involved. For more details,\n"
 79     "     try reading the help pages for 'base', 'index' and 'scale'.)\n"
 80     "\n"
 81     "For complete details, spend some time with two tables in the IA-32 software\n"
 82     "developer's manual that are also included in this repo:\n"
 83     "  - modrm.pdf: volume 2, table 2-2, \"32-bit addressing with the ModR/M byte.\".\n"
 84     "  - sib.pdf: volume 2, table 2-3, \"32-bit addressing with the SIB byte.\".\n"
 85   );
 86   put(Help, "subop",
 87     "Additional 3-bit operand for determining the instruction when the opcode\n"
 88     "is 81, 8f, d3, f7 or ff.\n"
 89     "Can't coexist with operand of type 'r32' in a single instruction, because\n"
 90     "the two use the same bits.\n"
 91   );
 92   put(Help, "r32",
 93     "3-bit operand specifying a register operand used directly, without any further addressing modes.\n"
 94   );
 95   put(Help, "rm32",
 96     "32-bit value in register or memory. The precise details of its construction\n"
 97     "depend on the eponymous 3-bit 'rm32' operand, the 'mod' operand, and also\n"
 98     "potentially the 'SIB' operands ('scale', 'index' and 'base') and a displacement\n"
 99     "('disp8' or 'disp32').\n"
100     "\n"
101     "For complete details, spend some time with two tables in the IA-32 software\n"
102     "developer's manual that are also included in this repo:\n"
103     "  - modrm.pdf: volume 2, table 2-2, \"32-bit addressing with the ModR/M byte.\".\n"
104     "  - sib.pdf: volume 2, table 2-3, \"32-bit addressing with the SIB byte.\".\n"
105   );
106   put(Help, "base",
107     "Additional 3-bit operand (when 'rm32' is 4, unless 'mod' is 3) specifying the\n"
108     "register containing an address to look up.\n"
109     "This address may be further modified by 'index' and 'scale' operands.\n"
110     "  effective address = base + index*scale + displacement (disp8 or disp32)\n"
111     "For complete details, spend some time with the IA-32 software developer's manual,\n"
112     "volume 2, table 2-3, \"32-bit addressing with the SIB byte\".\n"
113     "It is included in this repository as 'sib.pdf'.\n"
114   );
115   put(Help, "index",
116     "Optional 3-bit operand (when 'rm32' is 4 unless 'mod' is 3) that can be added to\n"
117     "the 'base' operand to compute the 'effective address' at which to look up memory.\n"
118     "  effective address = base + index*scale + displacement (disp8 or disp32)\n"
119     "For complete details, spend some time with the IA-32 software developer's manual,\n"
120     "volume 2, table 2-3, \"32-bit addressing with the SIB byte\".\n"
121     "It is included in this repository as 'sib.pdf'.\n"
122   );
123   put(Help, "scale",
124     "Optional 2-bit operand (when 'rm32' is 4 unless 'mod' is 3) that encodes a\n"
125     "power of 2 to be multiplied to the 'index' operand before adding the result to\n"
126     "the 'base' operand to compute the _effective address_ to operate on.\n"
127     "  effective address = base + index * scale + displacement (disp8 or disp32)\n"
128     "\n"
129     "When scale is 0, use index unmodified.\n"
130     "When scale is 1, multiply index by 2.\n"
131     "When scale is 2, multiply index by 4.\n"
132     "When scale is 3, multiply index by 8.\n"
133     "\n"
134     "For complete details, spend some time with the IA-32 software developer's manual,\n"
135     "volume 2, table 2-3, \"32-bit addressing with the SIB byte\".\n"
136     "It is included in this repository as 'sib.pdf'.\n"
137   );
138   put(Help, "disp8",
139     "8-bit value to be added in many instructions.\n"
140   );
141   put(Help, "disp16",
142     "16-bit value to be added in many instructions.\n"
143     "Currently not used in any SubX instructions.\n"
144   );
145   put(Help, "disp32",
146     "32-bit value to be added in many instructions.\n"
147   );
148   put(Help, "imm8",
149     "8-bit value for many instructions.\n"
150   );
151   put(Help, "imm32",
152     "32-bit value for many instructions.\n"
153   );
154 }
155 
156 //:: transform packing operands into bytes in the right order
157 
158 :(after "Begin Transforms")
159 // Begin Level-2 Transforms
160 Transform.push_back(pack_operands);
161 // End Level-2 Transforms
162 
163 :(code)
164 void pack_operands(program& p) {
165   if (p.segments.empty()) return;
166   segment& code = *find(p, "code");
167   // Pack Operands(segment code)
168   trace(3, "transform") << "-- pack operands" << end();
169   for (int i = 0;  i < SIZE(code.lines);  ++i) {
170     line& inst = code.lines.at(i);
171     if (all_hex_bytes(inst)) continue;
172     trace(99, "transform") << "packing instruction '" << to_string(/*with metadata*/inst) << "'" << end();
173     pack_operands(inst);
174     trace(99, "transform") << "instruction after packing: '" << to_string(/*without metadata*/inst.words) << "'" << end();
175   }
176 }
177 
178 void pack_operands(line& inst) {
179   line new_inst;
180   add_opcodes(inst, new_inst);
181   add_modrm_byte(inst, new_inst);
182   add_sib_byte(inst, new_inst);
183   add_disp_bytes(inst, new_inst);
184   add_imm_bytes(inst, new_inst);
185   inst.words.swap(new_inst.words);
186 }
187 
188 void add_opcodes(const line& in, line& out) {
189   out.words.push_back(in.words.at(0));
190   if (in.words.at(0).data == "0f" || in.words.at(0).data == "f2" || in.words.at(0).data == "f3")
191     out.words.push_back(in.words.at(1));
192   if (in.words.at(0).data == "f3" && in.words.at(1).data == "0f")
193     out.words.push_back(in.words.at(2));
194   if (in.words.at(0).data == "f2" && in.words.at(1).data == "0f")
195     out.words.push_back(in.words.at(2));
196 }
197 
198 void add_modrm_byte(const line& in, line& out) {
199   uint8_t mod=0, reg_subop=0, rm32=0;
200   bool emit = false;
201   for (int i = 0;  i < SIZE(in.words);  ++i) {
202     const word& curr = in.words.at(i);
203     if (has_operand_metadata(curr, "mod")) {
204       mod = hex_byte(curr.data);
205       emit = true;
206     }
207     else if (has_operand_metadata(curr, "rm32")) {
208       rm32 = hex_byte(curr.data);
209       emit = true;
210     }
211     else if (has_operand_metadata(curr, "r32")) {
212       reg_subop = hex_byte(curr.data);
213       emit = true;
214     }
215     else if (has_operand_metadata(curr, "subop")) {
216       reg_subop = hex_byte(curr.data);
217       emit = true;
218     }
219   }
220   if (emit)
221     out.words.push_back(hex_byte_text((mod << 6) | (reg_subop << 3) | rm32));
222 }
223 
224 void add_sib_byte(const line& in, line& out) {
225   uint8_t scale=0, index=0, base=0;
226   bool emit = false;
227   for (int i = 0;  i < SIZE(in.words);  ++i) {
228     const word& curr = in.words.at(i);
229     if (has_operand_metadata(curr, "scale")) {
230       scale = hex_byte(curr.data);
231       emit = true;
232     }
233     else if (has_operand_metadata(curr, "index")) {
234       index = hex_byte(curr.data);
235       emit = true;
236     }
237     else if (has_operand_metadata(curr, "base")) {
238       base = hex_byte(curr.data);
239       emit = true;
240     }
241   }
242   if (emit)
243     out.words.push_back(hex_byte_text((scale << 6) | (index << 3) | base));
244 }
245 
246 void add_disp_bytes(const line& in, line& out) {
247   for (int i = 0;  i < SIZE(in.words);  ++i) {
248     const word& curr = in.words.at(i);
249     if (has_operand_metadata(curr, "disp8"))
250       emit_hex_bytes(out, curr, 1);
251     if (has_operand_metadata(curr, "disp16"))
252       emit_hex_bytes(out, curr, 2);
253     else if (has_operand_metadata(curr, "disp32"))
254       emit_hex_bytes(out, curr, 4);
255   }
256 }
257 
258 void add_imm_bytes(const line& in, line& out) {
259   for (int i = 0;  i < SIZE(in.words);  ++i) {
260     const word& curr = in.words.at(i);
261     if (has_operand_metadata(curr, "imm8"))
262       emit_hex_bytes(out, curr, 1);
263     else if (has_operand_metadata(curr, "imm32"))
264       emit_hex_bytes(out, curr, 4);
265   }
266 }
267 
268 void emit_hex_bytes(line& out, const word& w, int num) {
269   assert(num <= 4);
270   bool is_number = looks_like_hex_int(w.data);
271   if (num == 1 || !is_number) {
272     out.words.push_back(w);  // preserve existing metadata
273     if (is_number)
274       out.words.back().data = hex_byte_to_string(parse_int(w.data));
275     return;
276   }
277   emit_hex_bytes(out, static_cast<uint32_t>(parse_int(w.data)), num);
278 }
279 
280 void emit_hex_bytes(line& out, uint32_t val, int num) {
281   assert(num <= 4);
282   for (int i = 0;  i < num;  ++i) {
283     out.words.push_back(hex_byte_text(val & 0xff));
284     val = val >> 8;
285   }
286 }
287 
288 word hex_byte_text(uint8_t val) {
289   word result;
290   result.data = hex_byte_to_string(val);
291   result.original = result.data+"/auto";
292   return result;
293 }
294 
295 string hex_byte_to_string(uint8_t val) {
296   ostringstream out;
297   // uint8_t prints without padding, but int8_t will expand to 32 bits again
298   out << HEXBYTE << NUM(val);
299   return out.str();
300 }
301 
302 string to_string(const vector<word>& in) {
303   ostringstream out;
304   for (int i = 0;  i < SIZE(in);  ++i) {
305     if (i > 0) out << ' ';
306     out << in.at(i).data;
307   }
308   return out.str();
309 }
310 
311 :(before "End Unit Tests")
312 void test_preserve_metadata_when_emitting_single_byte() {
313   word in;
314   in.data = "f0";
315   in.original = "f0/foo";
316   line out;
317   emit_hex_bytes(out, in, 1);
318   CHECK_EQ(out.words.at(0).data, "f0");
319   CHECK_EQ(out.words.at(0).original, "f0/foo");
320 }
321 
322 :(code)
323 void test_pack_disp8() {
324   run(
325       "== code 0x1\n"
326       "74 2/disp8\n"  // jump 2 bytes away if ZF is set
327   );
328   CHECK_TRACE_CONTENTS(
329       "transform: packing instruction '74 2/disp8'\n"
330       "transform: instruction after packing: '74 02'\n"
331   );
332 }
333 
334 void test_pack_disp8_negative() {
335   transform(
336       "== code 0x1\n"
337       // running this will cause an infinite loop
338       "74 -1/disp8\n"  // jump 1 byte before if ZF is set
339   );
340   CHECK_TRACE_CONTENTS(
341       "transform: packing instruction '74 -1/disp8'\n"
342       "transform: instruction after packing: '74 ff'\n"
343   );
344 }
345 
346 //: helper for scenario
347 void transform(const string& text_bytes) {
348   program p;
349   istringstream in(text_bytes);
350   parse(in, p);
351   if (trace_contains_errors()) return;
352   transform(p);
353 }
354 
355 void test_pack_modrm_imm32() {
356   run(
357       "== code 0x1\n"
358       // instruction                     effective address                                                   operand     displacement    immediate\n"
359       // op          subop               mod             rm32          base        index         scale       r32\n"
360       // 1-3 bytes   3 bits              2 bits          3 bits        3 bits      3 bits        2 bits      2 bits      0/1/2/4 bytes   0/1/2/4 bytes\n"
361       "  81          0/add/subop         3/mod/direct    3/ebx/rm32                                                                      1/imm32      \n"  // add 1 to EBX
362   );
363   CHECK_TRACE_CONTENTS(
364       "transform: packing instruction '81 0/add/subop 3/mod/direct 3/ebx/rm32 1/imm32'\n"
365       "transform: instruction after packing: '81 c3 01 00 00 00'\n"
366   );
367 }
368 
369 void test_pack_imm32_large() {
370   run(
371       "== code 0x1\n"
372       "b9  0x080490a7/imm32\n"
373   );
374   CHECK_TRACE_CONTENTS(
375       "transform: packing instruction 'b9 0x080490a7/imm32'\n"
376       "transform: instruction after packing: 'b9 a7 90 04 08'\n"
377   );
378 }
379 
380 void test_pack_immediate_constants_hex() {
381   run(
382       "== code 0x1\n"
383       "b9  0x2a/imm32\n"
384   );
385   CHECK_TRACE_CONTENTS(
386       "transform: packing instruction 'b9 0x2a/imm32'\n"
387       "transform: instruction after packing: 'b9 2a 00 00 00'\n"
388       "run: copy imm32 0x0000002a to ECX\n"
389   );
390 }
391 
392 void test_pack_silently_ignores_non_hex() {
393   Hide_errors = true;
394   transform(
395       "== code 0x1\n"
396       "b9  foo/imm32\n"
397   );
398   CHECK_TRACE_CONTENTS(
399       "transform: packing instruction 'b9 foo/imm32'\n"
400       // no change (we're just not printing metadata to the trace)
401       "transform: instruction after packing: 'b9 foo'\n"
402   );
403 }
404 
405 void test_pack_flags_bad_hex() {
406   Hide_errors = true;
407   run(
408       "== code 0x1\n"
409       "b9  0xfoo/imm32\n"
410   );
411   CHECK_TRACE_CONTENTS(
412       "error: not a number: 0xfoo\n"
413   );
414 }
415 
416 void test_pack_flags_uppercase_hex() {
417   Hide_errors = true;
418   run(
419       "== code 0x1\n"
420       "b9 0xAb/imm32\n"
421   );
422   CHECK_TRACE_CONTENTS(
423       "error: uppercase hex not allowed: 0xAb\n"
424   );
425 }
426 
427 //:: helpers
428 
429 bool all_hex_bytes(const line& inst) {
430   for (int i = 0;  i < SIZE(inst.words);  ++i)
431     if (!is_hex_byte(inst.words.at(i)))
432       return false;
433   return true;
434 }
435 
436 bool is_hex_byte(const word& curr) {
437   if (contains_any_operand_metadata(curr))
438     return false;
439   if (SIZE(curr.data) != 2)
440     return false;
441   if (curr.data.find_first_not_of("0123456789abcdef") != string::npos)
442     return false;
443   return true;
444 }
445 
446 bool contains_any_operand_metadata(const word& word) {
447   for (int i = 0;  i < SIZE(word.metadata);  ++i)
448     if (Instruction_operands.find(word.metadata.at(i)) != Instruction_operands.end())
449       return true;
450   return false;
451 }
452 
453 bool has_operand_metadata(const line& inst, const string& m) {
454   bool result = false;
455   for (int i = 0;  i < SIZE(inst.words);  ++i) {
456     if (!has_operand_metadata(inst.words.at(i), m)) continue;
457     if (result) {
458       raise << "'" << to_string(inst) << "' has conflicting " << m << " operands\n" << end();
459       return false;
460     }
461     result = true;
462   }
463   return result;
464 }
465 
466 bool has_operand_metadata(const word& w, const string& m) {
467   bool result = false;
468   bool metadata_found = false;
469   for (int i = 0;  i < SIZE(w.metadata);  ++i) {
470     const string& curr = w.metadata.at(i);
471     if (Instruction_operands.find(curr) == Instruction_operands.end()) continue;  // ignore unrecognized metadata
472     if (metadata_found) {
473       raise << "'" << w.original << "' has conflicting operand types; it should have only one\n" << end();
474       return false;
475     }
476     metadata_found = true;
477     result = (curr == m);
478   }
479   return result;
480 }
481 
482 word metadata(const line& inst, const string& m) {
483   for (int i = 0;  i < SIZE(inst.words);  ++i)
484     if (has_operand_metadata(inst.words.at(i), m))
485       return inst.words.at(i);
486   assert(false);
487 }
488 
//: Cheap check for whether 's' is meant to be a number: it starts with a sign
//: or a decimal digit. Nothing past the first character is validated here.
bool looks_like_hex_int(const string& s) {
  if (s.empty()) return false;
  const char first = s.at(0);
  const bool is_sign = (first == '-' || first == '+');
  if (is_sign || isdigit(first)) return true;  // digits include the '0x' prefix
  // End looks_like_hex_int(s) Detectors
  return false;
}
496 
497 string to_string(const line& inst) {
498   ostringstream out;
499   for (int i = 0;  i < SIZE(inst.words);  ++i) {
500     if (i > 0) out << ' ';
501     out << inst.words.at(i).original;
502   }
503   return out.str();
504 }
505 
506 int32_t parse_int(const string& s) {
507   if (s.empty()) return 0;
508   if (contains_uppercase(s)) {
509     raise << "uppercase hex not allowed: " << s << '\n' << end();
510     return 0;
511   }
512   istringstream in(s);
513   in >> std::hex;
514   if (s.at(0) == '-') {
515     int32_t result = 0;
516     in >> result;
517     if (!in || !in.eof()) {
518       raise << "not a number: " << s << '\n' << end();
519       return 0;
520     }
521     return result;
522   }
523   uint32_t uresult = 0;
524   in >> uresult;
525   if (!in || !in.eof()) {
526     raise << "not a number: " << s << '\n' << end();
527     return 0;
528   }
529   return static_cast<int32_t>(uresult);
530 }
531 :(before "End Unit Tests")
532 void test_parse_int() {
533   CHECK_EQ(0, parse_int("0"));
534   CHECK_EQ(0, parse_int("0x0"));
535   CHECK_EQ(0, parse_int("0x0"));
536   CHECK_EQ(16, parse_int("10"));  // hex always
537   CHECK_EQ(-1, parse_int("-1"));
538   CHECK_EQ(-1, parse_int("0xffffffff"));
539 }