https://github.com/akkartik/mu/blob/master/032---operands.cc
  1 //: Metadata for fields of an x86 instruction.
  2 //:
  3 //: The x86 instruction set is variable-length, and how a byte is interpreted
  4 //: affects later instruction boundaries. A lot of the pain in programming
  5 //: machine code stems from computer and programmer going out of sync on what
  6 //: a byte means. The miscommunication is usually not immediately caught, and
  7 //: metastasizes at runtime into kilobytes of misinterpreted instructions.
  8 //:
  9 //: To mitigate these issues, we'll start programming in terms of logical
 10 //: operands rather than physical bytes. Some operands are smaller than a
 11 //: byte, and others may consist of multiple bytes. This layer will correctly
 12 //: pack and order the bytes corresponding to the operands in an instruction.
 13 
//: Register the overview help page under the key "instructions". It lists
//: every operand type that has its own help page.
 14 :(before "End Help Texts")
 15 put_new(Help, "instructions",
 16   "Each x86 instruction consists of an instruction or opcode and some number\n"
 17   "of operands.\n"
 18   "Each operand has a type. An instruction won't have more than one operand of\n"
 19   "any type.\n"
 20   "Each instruction has some set of allowed operand types. It'll reject others.\n"
 21   "The complete list of operand types: mod, subop, r32 (register), rm32\n"
 22   "(register or memory), scale, index, base, disp8, disp16, disp32, imm8,\n"
 23   "imm32.\n"
 24   "Each of these has its own help page. Try reading 'bootstrap help mod' next.\n"
 25 );
//: Print the topic name ahead of the 'End Help Contents' waypoint.
 26 :(before "End Help Contents")
 27 cerr << "  instructions\n";
 28 
//: Call transform(p) ahead of the 'Running Test Program' waypoint, and give up
//: on the test program if the trace now contains errors.
 29 :(before "Running Test Program")
 30 transform(p);
 31 if (trace_contains_errors()) return;
 32 
 33 :(code)
//: An imm32 operand is expected to expand into 4 bytes, least significant byte
//: first, and the packed instruction is then expected to run (copy to EBX).
 34 void test_pack_immediate_constants() {
 35   run(
 36       "== code 0x1\n"
 37       "bb  0x2a/imm32\n"
 38   );
 39   CHECK_TRACE_CONTENTS(
 40       "transform: packing instruction 'bb 0x2a/imm32'\n"
 41       "transform: instruction after packing: 'bb 2a 00 00 00'\n"
 42       "run: copy imm32 0x0000002a to EBX\n"
 43   );
 44 }
 45 
 46 //: complete set of valid operand types
 47 
//: Names of the operand types this layer recognizes in word metadata.
//: Metadata strings outside this set are not treated as operand types.
 48 :(before "End Globals")
 49 set<string> Instruction_operands;
//: Populate the set; one entry per operand type named in the 'instructions'
//: help text.
 50 :(before "End One-time Setup")
 51 Instruction_operands.insert("subop");
 52 Instruction_operands.insert("mod");
 53 Instruction_operands.insert("rm32");
 54 Instruction_operands.insert("base");
 55 Instruction_operands.insert("index");
 56 Instruction_operands.insert("scale");
 57 Instruction_operands.insert("r32");
 58 Instruction_operands.insert("disp8");
 59 Instruction_operands.insert("disp16");
 60 Instruction_operands.insert("disp32");
 61 Instruction_operands.insert("imm8");
 62 Instruction_operands.insert("imm32");
 63 
//: Hook the per-operand help pages in ahead of the 'End Help Texts' waypoint.
 64 :(before "End Help Texts")
 65 init_operand_type_help();
 66 :(code)
//: Store one help page in Help for each operand type: mod, subop, r32, rm32,
//: base, index, scale, disp8, disp16, disp32, imm8 and imm32.
 67 void init_operand_type_help() {
 68   put(Help, "mod",
 69     "2-bit operand controlling the _addressing mode_ of many instructions,\n"
 70     "to determine how to compute the _effective address_ to look up memory at\n"
 71     "based on the 'rm32' operand and potentially others.\n"
 72     "\n"
 73     "If mod = 3, just operate on the contents of the register specified by rm32\n"
 74     "            (direct mode).\n"
 75     "If mod = 2, effective address is usually* rm32 + disp32\n"
 76     "            (indirect mode with displacement).\n"
 77     "If mod = 1, effective address is usually* rm32 + disp8\n"
 78     "            (indirect mode with displacement).\n"
 79     "If mod = 0, effective address is usually* rm32 (indirect mode).\n"
 80     "(* - The exception is when rm32 is '4'. Register 4 is the stack pointer (ESP).\n"
 81     "     Using it as an address gets more involved. For more details,\n"
 82     "     try reading the help pages for 'base', 'index' and 'scale'.)\n"
 83     "\n"
 84     "For complete details, spend some time with two tables in the IA-32 software\n"
 85     "developer's manual that are also included in this repo:\n"
 86     "  - modrm.pdf: volume 2, table 2-2, \"32-bit addressing with the ModR/M byte.\".\n"
 87     "  - sib.pdf: volume 2, table 2-3, \"32-bit addressing with the SIB byte.\".\n"
 88   );
 89   put(Help, "subop",
 90     "Additional 3-bit operand for determining the instruction when the opcode\n"
 91     "is 81, 8f, d3, f7 or ff.\n"
 92     "Can't coexist with operand of type 'r32' in a single instruction, because\n"
 93     "the two use the same bits.\n"
 94   );
 95   put(Help, "r32",
 96     "3-bit operand specifying a register operand used directly, without any further addressing modes.\n"
 97   );
 98   put(Help, "rm32",
 99     "32-bit value in register or memory. The precise details of its construction\n"
100     "depend on the eponymous 3-bit 'rm32' operand, the 'mod' operand, and also\n"
101     "potentially the 'SIB' operands ('scale', 'index' and 'base') and a displacement\n"
102     "('disp8' or 'disp32').\n"
103     "\n"
104     "For complete details, spend some time with two tables in the IA-32 software\n"
105     "developer's manual that are also included in this repo:\n"
106     "  - modrm.pdf: volume 2, table 2-2, \"32-bit addressing with the ModR/M byte.\".\n"
107     "  - sib.pdf: volume 2, table 2-3, \"32-bit addressing with the SIB byte.\".\n"
108   );
109   put(Help, "base",
110     "Additional 3-bit operand (when 'rm32' is 4, unless 'mod' is 3) specifying the\n"
111     "register containing an address to look up.\n"
112     "This address may be further modified by 'index' and 'scale' operands.\n"
113     "  effective address = base + index*scale + displacement (disp8 or disp32)\n"
114     "For complete details, spend some time with the IA-32 software developer's manual,\n"
115     "volume 2, table 2-3, \"32-bit addressing with the SIB byte\".\n"
116     "It is included in this repository as 'sib.pdf'.\n"
117   );
118   put(Help, "index",
119     "Optional 3-bit operand (when 'rm32' is 4 unless 'mod' is 3) that can be added to\n"
120     "the 'base' operand to compute the 'effective address' at which to look up memory.\n"
121     "  effective address = base + index*scale + displacement (disp8 or disp32)\n"
122     "For complete details, spend some time with the IA-32 software developer's manual,\n"
123     "volume 2, table 2-3, \"32-bit addressing with the SIB byte\".\n"
124     "It is included in this repository as 'sib.pdf'.\n"
125   );
126   put(Help, "scale",
127     "Optional 2-bit operand (when 'rm32' is 4 unless 'mod' is 3) that encodes a\n"
128     "power of 2 to be multiplied to the 'index' operand before adding the result to\n"
129     "the 'base' operand to compute the _effective address_ to operate on.\n"
130     "  effective address = base + index * scale + displacement (disp8 or disp32)\n"
131     "\n"
132     "When scale is 0, use index unmodified.\n"
133     "When scale is 1, multiply index by 2.\n"
134     "When scale is 2, multiply index by 4.\n"
135     "When scale is 3, multiply index by 8.\n"
136     "\n"
137     "For complete details, spend some time with the IA-32 software developer's manual,\n"
138     "volume 2, table 2-3, \"32-bit addressing with the SIB byte\".\n"
139     "It is included in this repository as 'sib.pdf'.\n"
140   );
141   put(Help, "disp8",
142     "8-bit value to be added in many instructions.\n"
143   );
144   put(Help, "disp16",
145     "16-bit value to be added in many instructions.\n"
146     "Currently not used in any SubX instructions.\n"
147   );
148   put(Help, "disp32",
149     "32-bit value to be added in many instructions.\n"
150   );
151   put(Help, "imm8",
152     "8-bit value for many instructions.\n"
153   );
154   put(Help, "imm32",
155     "32-bit value for many instructions.\n"
156   );
157 }
158 
159 //:: transform packing operands into bytes in the right order
160 
//: Add pack_operands(program&) to the Transform list.
161 :(after "Begin Transforms")
162 Transform.push_back(pack_operands);
163 
164 :(code)
165 void pack_operands(program& p) {
166   if (p.segments.empty()) return;
167   segment& code = *find(p, "code");
168   // Pack Operands(segment code)
169   trace(3, "transform") << "-- pack operands" << end();
170   for (int i = 0;  i < SIZE(code.lines);  ++i) {
171     line& inst = code.lines.at(i);
172     if (all_hex_bytes(inst)) continue;
173     trace(99, "transform") << "packing instruction '" << to_string(/*with metadata*/inst) << "'" << end();
174     pack_operands(inst);
175     trace(99, "transform") << "instruction after packing: '" << to_string(/*without metadata*/inst.words) << "'" << end();
176   }
177 }
178 
179 void pack_operands(line& inst) {
180   line new_inst;
181   add_opcodes(inst, new_inst);
182   add_modrm_byte(inst, new_inst);
183   add_sib_byte(inst, new_inst);
184   add_disp_bytes(inst, new_inst);
185   add_imm_bytes(inst, new_inst);
186   inst.words.swap(new_inst.words);
187 }
188 
189 void add_opcodes(const line& in, line& out) {
190   out.words.push_back(in.words.at(0));
191   if (in.words.at(0).data == "0f" || in.words.at(0).data == "f2" || in.words.at(0).data == "f3")
192     out.words.push_back(in.words.at(1));
193   if (in.words.at(0).data == "f3" && in.words.at(1).data == "0f")
194     out.words.push_back(in.words.at(2));
195   if (in.words.at(0).data == "f2" && in.words.at(1).data == "0f")
196     out.words.push_back(in.words.at(2));
197 }
198 
199 void add_modrm_byte(const line& in, line& out) {
200   uint8_t mod=0, reg_subop=0, rm32=0;
201   bool emit = false;
202   for (int i = 0;  i < SIZE(in.words);  ++i) {
203     const word& curr = in.words.at(i);
204     if (has_operand_metadata(curr, "mod")) {
205       mod = hex_byte(curr.data);
206       emit = true;
207     }
208     else if (has_operand_metadata(curr, "rm32")) {
209       rm32 = hex_byte(curr.data);
210       emit = true;
211     }
212     else if (has_operand_metadata(curr, "r32")) {
213       reg_subop = hex_byte(curr.data);
214       emit = true;
215     }
216     else if (has_operand_metadata(curr, "subop")) {
217       reg_subop = hex_byte(curr.data);
218       emit = true;
219     }
220   }
221   if (emit)
222     out.words.push_back(hex_byte_text((mod << 6) | (reg_subop << 3) | rm32));
223 }
224 
225 void add_sib_byte(const line& in, line& out) {
226   uint8_t scale=0, index=0, base=0;
227   bool emit = false;
228   for (int i = 0;  i < SIZE(in.words);  ++i) {
229     const word& curr = in.words.at(i);
230     if (has_operand_metadata(curr, "scale")) {
231       scale = hex_byte(curr.data);
232       emit = true;
233     }
234     else if (has_operand_metadata(curr, "index")) {
235       index = hex_byte(curr.data);
236       emit = true;
237     }
238     else if (has_operand_metadata(curr, "base")) {
239       base = hex_byte(curr.data);
240       emit = true;
241     }
242   }
243   if (emit)
244     out.words.push_back(hex_byte_text((scale << 6) | (index << 3) | base));
245 }
246 
247 void add_disp_bytes(const line& in, line& out) {
248   for (int i = 0;  i < SIZE(in.words);  ++i) {
249     const word& curr = in.words.at(i);
250     if (has_operand_metadata(curr, "disp8"))
251       emit_hex_bytes(out, curr, 1);
252     if (has_operand_metadata(curr, "disp16"))
253       emit_hex_bytes(out, curr, 2);
254     else if (has_operand_metadata(curr, "disp32"))
255       emit_hex_bytes(out, curr, 4);
256   }
257 }
258 
259 void add_imm_bytes(const line& in, line& out) {
260   for (int i = 0;  i < SIZE(in.words);  ++i) {
261     const word& curr = in.words.at(i);
262     if (has_operand_metadata(curr, "imm8"))
263       emit_hex_bytes(out, curr, 1);
264     else if (has_operand_metadata(curr, "imm32"))
265       emit_hex_bytes(out, curr, 4);
266   }
267 }
268 
269 void emit_hex_bytes(line& out, const word& w, int num) {
270   assert(num <= 4);
271   bool is_number = looks_like_hex_int(w.data);
272   if (num == 1 || !is_number) {
273     out.words.push_back(w);  // preserve existing metadata
274     if (is_number)
275       out.words.back().data = hex_byte_to_string(parse_int(w.data));
276     return;
277   }
278   emit_hex_bytes(out, static_cast<uint32_t>(parse_int(w.data)), num);
279 }
280 
281 void emit_hex_bytes(line& out, uint32_t val, int num) {
282   assert(num <= 4);
283   for (int i = 0;  i < num;  ++i) {
284     out.words.push_back(hex_byte_text(val & 0xff));
285     val = val >> 8;
286   }
287 }
288 
289 word hex_byte_text(uint8_t val) {
290   word result;
291   result.data = hex_byte_to_string(val);
292   result.original = result.data+"/auto";
293   return result;
294 }
295 
296 string hex_byte_to_string(uint8_t val) {
297   ostringstream out;
298   // uint8_t prints without padding, but int8_t will expand to 32 bits again
299   out << HEXBYTE << NUM(val);
300   return out.str();
301 }
302 
303 string to_string(const vector<word>& in) {
304   ostringstream out;
305   for (int i = 0;  i < SIZE(in);  ++i) {
306     if (i > 0) out << ' ';
307     out << in.at(i).data;
308   }
309   return out.str();
310 }
311 
312 :(before "End Unit Tests")
//: Emitting a 1-byte operand is expected to keep the word's 'original' text
//: (and so its metadata) rather than replacing it with a generated word.
313 void test_preserve_metadata_when_emitting_single_byte() {
314   word in;
315   in.data = "f0";
316   in.original = "f0/foo";
317   line out;
318   emit_hex_bytes(out, in, 1);
319   CHECK_EQ(out.words.at(0).data, "f0");
320   CHECK_EQ(out.words.at(0).original, "f0/foo");
321 }
322 
323 :(code)
//: A disp8 operand is expected to be emitted as a single 2-digit hex byte.
324 void test_pack_disp8() {
325   run(
326       "== code 0x1\n"
327       "74 2/disp8\n"  // jump 2 bytes away if ZF is set
328   );
329   CHECK_TRACE_CONTENTS(
330       "transform: packing instruction '74 2/disp8'\n"
331       "transform: instruction after packing: '74 02'\n"
332   );
333 }
334 
//: A negative disp8 is expected to be emitted as one byte (-1 => ff).
//: Only transforms the program rather than running it.
335 void test_pack_disp8_negative() {
336   transform(
337       "== code 0x1\n"
338       // running this will cause an infinite loop
339       "74 -1/disp8\n"  // jump 1 byte before if ZF is set
340   );
341   CHECK_TRACE_CONTENTS(
342       "transform: packing instruction '74 -1/disp8'\n"
343       "transform: instruction after packing: '74 ff'\n"
344   );
345 }
346 
347 //: helper for scenario
348 void transform(const string& text_bytes) {
349   program p;
350   istringstream in(text_bytes);
351   parse(in, p);
352   if (trace_contains_errors()) return;
353   transform(p);
354 }
355 
//: subop, mod and rm32 are expected to be combined into a single ModR/M byte
//: (0<<3 | 3<<6 | 3 => c3), followed by the 4 bytes of the imm32.
356 void test_pack_modrm_imm32() {
357   run(
358       "== code 0x1\n"
359       // instruction                     effective address                                                   operand     displacement    immediate
360       // op          subop               mod             rm32          base        index         scale       r32
361       // 1-3 bytes   3 bits              2 bits          3 bits        3 bits      3 bits        2 bits      3 bits      0/1/2/4 bytes   0/1/2/4 bytes
362       "  81          0/add/subop         3/mod/direct    3/ebx/rm32                                                                      1/imm32      \n"  // add 1 to EBX
363   );
364   CHECK_TRACE_CONTENTS(
365       "transform: packing instruction '81 0/add/subop 3/mod/direct 3/ebx/rm32 1/imm32'\n"
366       "transform: instruction after packing: '81 c3 01 00 00 00'\n"
367   );
368 }
369 
//: All 4 bytes of a large imm32 are expected in the output, least significant
//: byte first.
370 void test_pack_imm32_large() {
371   run(
372       "== code 0x1\n"
373       "b9  0x080490a7/imm32\n"
374   );
375   CHECK_TRACE_CONTENTS(
376       "transform: packing instruction 'b9 0x080490a7/imm32'\n"
377       "transform: instruction after packing: 'b9 a7 90 04 08'\n"
378   );
379 }
380 
//: A '0x'-prefixed imm32 is expected to be zero-padded to 4 bytes and then
//: run (copy to ECX).
381 void test_pack_immediate_constants_hex() {
382   run(
383       "== code 0x1\n"
384       "b9  0x2a/imm32\n"
385   );
386   CHECK_TRACE_CONTENTS(
387       "transform: packing instruction 'b9 0x2a/imm32'\n"
388       "transform: instruction after packing: 'b9 2a 00 00 00'\n"
389       "run: copy imm32 0x0000002a to ECX\n"
390   );
391 }
392 
//: An operand that doesn't look like a number is expected to pass through
//: packing as a single unchanged word.
393 void test_pack_silently_ignores_non_hex() {
394   Hide_errors = true;
395   transform(
396       "== code 0x1\n"
397       "b9  foo/imm32\n"
398   );
399   CHECK_TRACE_CONTENTS(
400       "transform: packing instruction 'b9 foo/imm32'\n"
401       // no change (we're just not printing metadata to the trace)
402       "transform: instruction after packing: 'b9 foo'\n"
403   );
404 }
405 
//: An operand that starts like a number but doesn't parse as hex is expected
//: to raise a 'not a number' error.
406 void test_pack_flags_bad_hex() {
407   Hide_errors = true;
408   run(
409       "== code 0x1\n"
410       "b9  0xfoo/imm32\n"
411   );
412   CHECK_TRACE_CONTENTS(
413       "error: not a number: 0xfoo\n"
414   );
415 }
416 
//: Uppercase hex digits in a numeric operand are expected to raise an error.
417 void test_pack_flags_uppercase_hex() {
418   Hide_errors = true;
419   run(
420       "== code 0x1\n"
421       "b9 0xAb/imm32\n"
422   );
423   CHECK_TRACE_CONTENTS(
424       "error: uppercase hex not allowed: 0xAb\n"
425   );
426 }
427 
428 //:: helpers
429 
430 bool all_hex_bytes(const line& inst) {
431   for (int i = 0;  i < SIZE(inst.words);  ++i)
432     if (!is_hex_byte(inst.words.at(i)))
433       return false;
434   return true;
435 }
436 
437 bool is_hex_byte(const word& curr) {
438   if (contains_any_operand_metadata(curr))
439     return false;
440   if (SIZE(curr.data) != 2)
441     return false;
442   if (curr.data.find_first_not_of("0123456789abcdef") != string::npos)
443     return false;
444   return true;
445 }
446 
447 bool contains_any_operand_metadata(const word& word) {
448   for (int i = 0;  i < SIZE(word.metadata);  ++i)
449     if (Instruction_operands.find(word.metadata.at(i)) != Instruction_operands.end())
450       return true;
451   return false;
452 }
453 
454 bool has_operand_metadata(const line& inst, const string& m) {
455   bool result = false;
456   for (int i = 0;  i < SIZE(inst.words);  ++i) {
457     if (!has_operand_metadata(inst.words.at(i), m)) continue;
458     if (result) {
459       raise << "'" << to_string(inst) << "' has conflicting " << m << " operands\n" << end();
460       return false;
461     }
462     result = true;
463   }
464   return result;
465 }
466 
467 bool has_operand_metadata(const word& w, const string& m) {
468   bool result = false;
469   bool metadata_found = false;
470   for (int i = 0;  i < SIZE(w.metadata);  ++i) {
471     const string& curr = w.metadata.at(i);
472     if (Instruction_operands.find(curr) == Instruction_operands.end()) continue;  // ignore unrecognized metadata
473     if (metadata_found) {
474       raise << "'" << w.original << "' has conflicting operand types; it should have only one\n" << end();
475       return false;
476     }
477     metadata_found = true;
478     result = (curr == m);
479   }
480   return result;
481 }
482 
483 word metadata(const line& inst, const string& m) {
484   for (int i = 0;  i < SIZE(inst.words);  ++i)
485     if (has_operand_metadata(inst.words.at(i), m))
486       return inst.words.at(i);
487   assert(false);
488 }
489 
//: Cheap guess at whether 's' is meant to be a number, based only on its
//: first character: a sign or a decimal digit. Never true for an empty string.
bool looks_like_hex_int(const string& s) {
  if (s.empty()) return false;
  const char first = s.at(0);
  const bool has_sign = (first == '-' || first == '+');
  if (has_sign || isdigit(first)) return true;  // a digit includes the '0' of a '0x' prefix
  // End looks_like_hex_int(s) Detectors
  return false;
}
497 
498 string to_string(const line& inst) {
499   ostringstream out;
500   for (int i = 0;  i < SIZE(inst.words);  ++i) {
501     if (i > 0) out << ' ';
502     out << inst.words.at(i).original;
503   }
504   return out.str();
505 }
506 
507 int32_t parse_int(const string& s) {
508   if (s.empty()) return 0;
509   if (contains_uppercase(s)) {
510     raise << "uppercase hex not allowed: " << s << '\n' << end();
511     return 0;
512   }
513   istringstream in(s);
514   in >> std::hex;
515   if (s.at(0) == '-') {
516     int32_t result = 0;
517     in >> result;
518     if (!in || !in.eof()) {
519       raise << "not a number: " << s << '\n' << end();
520       return 0;
521     }
522     return result;
523   }
524   uint32_t uresult = 0;
525   in >> uresult;
526   if (!in || !in.eof()) {
527     raise << "not a number: " << s << '\n' << end();
528     return 0;
529   }
530   return static_cast<int32_t>(uresult);
531 }
532 :(before "End Unit Tests")
533 void test_parse_int() {
534   CHECK_EQ(0, parse_int("0"));
535   CHECK_EQ(0, parse_int("0x0"));
536   CHECK_EQ(0, parse_int("0x0"));
537   CHECK_EQ(16, parse_int("10"));  // hex always
538   CHECK_EQ(-1, parse_int("-1"));
539   CHECK_EQ(-1, parse_int("0xffffffff"));
540 }