Mu - 032---operands.cc

From 91624dbacabd0b437bbcce3fdb3dc8e67f577fa6 Mon Sep 17 00:00:00 2001 From: Kartik Agaram Date: Mon, 9 Dec 2019 01:32:48 -0800 Subject: 5807 --- html/032---operands.cc.html | 603 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 603 insertions(+) create mode 100644 html/032---operands.cc.html (limited to 'html/032---operands.cc.html') diff --git a/html/032---operands.cc.html b/html/032---operands.cc.html new file mode 100644 index 00000000..ac421edd --- /dev/null +++ b/html/032---operands.cc.html @@ -0,0 +1,603 @@ + + + + +Mu - 032---operands.cc + + + + + + + + + + +https://github.com/akkartik/mu/blob/master/032---operands.cc +
+  1 //: Beginning of "level 2": tagging bytes with metadata around what field of
+  2 //: an x86 instruction they're for.
+  3 //:
+  4 //: The x86 instruction set is variable-length, and how a byte is interpreted
+  5 //: affects later instruction boundaries. A lot of the pain in programming
+  6 //: machine code stems from computer and programmer going out of sync on what
+  7 //: a byte means. The miscommunication is usually not immediately caught, and
+  8 //: metastasizes at runtime into kilobytes of misinterpreted instructions.
+  9 //:
+ 10 //: To mitigate these issues, we'll start programming in terms of logical
+ 11 //: operands rather than physical bytes. Some operands are smaller than a
+ 12 //: byte, and others may consist of multiple bytes. This layer will correctly
+ 13 //: pack and order the bytes corresponding to the operands in an instruction.
+ 14 
//: Help page for the topic 'instructions', summarizing the operand model this
//: layer introduces, followed by the matching entry in the help contents
//: listing.
+ 15 :(before "End Help Texts")
+ 16 put_new(Help, "instructions",
+ 17   "Each x86 instruction consists of an instruction or opcode and some number\n"
+ 18   "of operands.\n"
+ 19   "Each operand has a type. An instruction won't have more than one operand of\n"
+ 20   "any type.\n"
+ 21   "Each instruction has some set of allowed operand types. It'll reject others.\n"
+ 22   "The complete list of operand types: mod, subop, r32 (register), rm32\n"
+ 23   "(register or memory), scale, index, base, disp8, disp16, disp32, imm8,\n"
+ 24   "imm32.\n"
+ 25   "Each of these has its own help page. Try reading 'subx help mod' next.\n"
+ 26 );
+ 27 :(before "End Help Contents")
+ 28 cerr << "  instructions\n";
+ 29 
+ 30 :(code)
+ 31 void test_pack_immediate_constants() {
+ 32   run(
+ 33       "== code 0x1\n"
+ 34       "bb  0x2a/imm32\n"
+ 35   );
+ 36   CHECK_TRACE_CONTENTS(
+ 37       "transform: packing instruction 'bb 0x2a/imm32'\n"
+ 38       "transform: instruction after packing: 'bb 2a 00 00 00'\n"
+ 39       "run: copy imm32 0x0000002a to EBX\n"
+ 40   );
+ 41 }
+ 42 
+ 43 //: complete set of valid operand types
+ 44 
//: Instruction_operands holds every metadata tag that names an operand type.
//: The has_operand_metadata() helpers skip any metadata that isn't in this set.
+ 45 :(before "End Globals")
+ 46 set<string> Instruction_operands;
+ 47 :(before "End One-time Setup")
+ 48 Instruction_operands.insert("subop");
+ 49 Instruction_operands.insert("mod");
+ 50 Instruction_operands.insert("rm32");
+ 51 Instruction_operands.insert("base");
+ 52 Instruction_operands.insert("index");
+ 53 Instruction_operands.insert("scale");
+ 54 Instruction_operands.insert("r32");
+ 55 Instruction_operands.insert("disp8");
+ 56 Instruction_operands.insert("disp16");
+ 57 Instruction_operands.insert("disp32");
+ 58 Instruction_operands.insert("imm8");
+ 59 Instruction_operands.insert("imm32");
+ 60 
//: Install the per-operand-type help pages alongside the other help texts.
+ 61 :(before "End Help Texts")
+ 62 init_operand_type_help();
+ 63 :(code)
// Add one page to the Help table for each operand type: mod, subop, r32,
// rm32, base, index, scale, disp8, disp16, disp32, imm8 and imm32.
// The texts are user-facing; the 'mod', 'rm32', 'base', 'index' and 'scale'
// pages point readers at the ModR/M and SIB tables of the IA-32 manual.
+ 64 void init_operand_type_help() {
+ 65   put(Help, "mod",
+ 66     "2-bit operand controlling the _addressing mode_ of many instructions,\n"
+ 67     "to determine how to compute the _effective address_ to look up memory at\n"
+ 68     "based on the 'rm32' operand and potentially others.\n"
+ 69     "\n"
+ 70     "If mod = 3, just operate on the contents of the register specified by rm32\n"
+ 71     "            (direct mode).\n"
+ 72     "If mod = 2, effective address is usually* rm32 + disp32\n"
+ 73     "            (indirect mode with displacement).\n"
+ 74     "If mod = 1, effective address is usually* rm32 + disp8\n"
+ 75     "            (indirect mode with displacement).\n"
+ 76     "If mod = 0, effective address is usually* rm32 (indirect mode).\n"
+ 77     "(* - The exception is when rm32 is '4'. Register 4 is the stack pointer (ESP).\n"
+ 78     "     Using it as an address gets more involved. For more details,\n"
+ 79     "     try reading the help pages for 'base', 'index' and 'scale'.)\n"
+ 80     "\n"
+ 81     "For complete details, spend some time with two tables in the IA-32 software\n"
+ 82     "developer's manual that are also included in this repo:\n"
+ 83     "  - modrm.pdf: volume 2, table 2-2, \"32-bit addressing with the ModR/M byte.\".\n"
+ 84     "  - sib.pdf: volume 2, table 2-3, \"32-bit addressing with the SIB byte.\".\n"
+ 85   );
+ 86   put(Help, "subop",
+ 87     "Additional 3-bit operand for determining the instruction when the opcode\n"
+ 88     "is 81, 8f, d3, f7 or ff.\n"
+ 89     "Can't coexist with operand of type 'r32' in a single instruction, because\n"
+ 90     "the two use the same bits.\n"
+ 91   );
+ 92   put(Help, "r32",
+ 93     "3-bit operand specifying a register operand used directly, without any further addressing modes.\n"
+ 94   );
+ 95   put(Help, "rm32",
+ 96     "32-bit value in register or memory. The precise details of its construction\n"
+ 97     "depend on the eponymous 3-bit 'rm32' operand, the 'mod' operand, and also\n"
+ 98     "potentially the 'SIB' operands ('scale', 'index' and 'base') and a displacement\n"
+ 99     "('disp8' or 'disp32').\n"
+100     "\n"
+101     "For complete details, spend some time with two tables in the IA-32 software\n"
+102     "developer's manual that are also included in this repo:\n"
+103     "  - modrm.pdf: volume 2, table 2-2, \"32-bit addressing with the ModR/M byte.\".\n"
+104     "  - sib.pdf: volume 2, table 2-3, \"32-bit addressing with the SIB byte.\".\n"
+105   );
+106   put(Help, "base",
+107     "Additional 3-bit operand (when 'rm32' is 4, unless 'mod' is 3) specifying the\n"
+108     "register containing an address to look up.\n"
+109     "This address may be further modified by 'index' and 'scale' operands.\n"
+110     "  effective address = base + index*scale + displacement (disp8 or disp32)\n"
+111     "For complete details, spend some time with the IA-32 software developer's manual,\n"
+112     "volume 2, table 2-3, \"32-bit addressing with the SIB byte\".\n"
+113     "It is included in this repository as 'sib.pdf'.\n"
+114   );
+115   put(Help, "index",
+116     "Optional 3-bit operand (when 'rm32' is 4 unless 'mod' is 3) that can be added to\n"
+117     "the 'base' operand to compute the 'effective address' at which to look up memory.\n"
+118     "  effective address = base + index*scale + displacement (disp8 or disp32)\n"
+119     "For complete details, spend some time with the IA-32 software developer's manual,\n"
+120     "volume 2, table 2-3, \"32-bit addressing with the SIB byte\".\n"
+121     "It is included in this repository as 'sib.pdf'.\n"
+122   );
+123   put(Help, "scale",
+124     "Optional 2-bit operand (when 'rm32' is 4 unless 'mod' is 3) that encodes a\n"
+125     "power of 2 to be multiplied to the 'index' operand before adding the result to\n"
+126     "the 'base' operand to compute the _effective address_ to operate on.\n"
+127     "  effective address = base + index * scale + displacement (disp8 or disp32)\n"
+128     "\n"
+129     "When scale is 0, use index unmodified.\n"
+130     "When scale is 1, multiply index by 2.\n"
+131     "When scale is 2, multiply index by 4.\n"
+132     "When scale is 3, multiply index by 8.\n"
+133     "\n"
+134     "For complete details, spend some time with the IA-32 software developer's manual,\n"
+135     "volume 2, table 2-3, \"32-bit addressing with the SIB byte\".\n"
+136     "It is included in this repository as 'sib.pdf'.\n"
+137   );
+138   put(Help, "disp8",
+139     "8-bit value to be added in many instructions.\n"
+140   );
+141   put(Help, "disp16",
+142     "16-bit value to be added in many instructions.\n"
+143     "Currently not used in any SubX instructions.\n"
+144   );
+145   put(Help, "disp32",
+146     "32-bit value to be added in many instructions.\n"
+147   );
+148   put(Help, "imm8",
+149     "8-bit value for many instructions.\n"
+150   );
+151   put(Help, "imm32",
+152     "32-bit value for many instructions.\n"
+153   );
+154 }
+155 
+156 //:: transform packing operands into bytes in the right order
+157 
//: Register pack_operands as a transform. The Begin/End Level-2 comments look
//: like anchors for other layers to attach transforms to -- TODO confirm.
+158 :(after "Begin Transforms")
+159 // Begin Level-2 Transforms
+160 Transform.push_back(pack_operands);
+161 // End Level-2 Transforms
+162 
+163 :(code)
+164 void pack_operands(program& p) {
+165   if (p.segments.empty()) return;
+166   segment& code = *find(p, "code");
+167   // Pack Operands(segment code)
+168   trace(3, "transform") << "-- pack operands" << end();
+169   for (int i = 0;  i < SIZE(code.lines);  ++i) {
+170     line& inst = code.lines.at(i);
+171     if (all_hex_bytes(inst)) continue;
+172     trace(99, "transform") << "packing instruction '" << to_string(/*with metadata*/inst) << "'" << end();
+173     pack_operands(inst);
+174     trace(99, "transform") << "instruction after packing: '" << to_string(/*without metadata*/inst.words) << "'" << end();
+175   }
+176 }
+177 
+178 void pack_operands(line& inst) {
+179   line new_inst;
+180   add_opcodes(inst, new_inst);
+181   add_modrm_byte(inst, new_inst);
+182   add_sib_byte(inst, new_inst);
+183   add_disp_bytes(inst, new_inst);
+184   add_imm_bytes(inst, new_inst);
+185   inst.words.swap(new_inst.words);
+186 }
+187 
+188 void add_opcodes(const line& in, line& out) {
+189   out.words.push_back(in.words.at(0));
+190   if (in.words.at(0).data == "0f" || in.words.at(0).data == "f2" || in.words.at(0).data == "f3")
+191     out.words.push_back(in.words.at(1));
+192   if (in.words.at(0).data == "f3" && in.words.at(1).data == "0f")
+193     out.words.push_back(in.words.at(2));
+194   if (in.words.at(0).data == "f2" && in.words.at(1).data == "0f")
+195     out.words.push_back(in.words.at(2));
+196 }
+197 
+198 void add_modrm_byte(const line& in, line& out) {
+199   uint8_t mod=0, reg_subop=0, rm32=0;
+200   bool emit = false;
+201   for (int i = 0;  i < SIZE(in.words);  ++i) {
+202     const word& curr = in.words.at(i);
+203     if (has_operand_metadata(curr, "mod")) {
+204       mod = hex_byte(curr.data);
+205       emit = true;
+206     }
+207     else if (has_operand_metadata(curr, "rm32")) {
+208       rm32 = hex_byte(curr.data);
+209       emit = true;
+210     }
+211     else if (has_operand_metadata(curr, "r32")) {
+212       reg_subop = hex_byte(curr.data);
+213       emit = true;
+214     }
+215     else if (has_operand_metadata(curr, "subop")) {
+216       reg_subop = hex_byte(curr.data);
+217       emit = true;
+218     }
+219   }
+220   if (emit)
+221     out.words.push_back(hex_byte_text((mod << 6) | (reg_subop << 3) | rm32));
+222 }
+223 
+224 void add_sib_byte(const line& in, line& out) {
+225   uint8_t scale=0, index=0, base=0;
+226   bool emit = false;
+227   for (int i = 0;  i < SIZE(in.words);  ++i) {
+228     const word& curr = in.words.at(i);
+229     if (has_operand_metadata(curr, "scale")) {
+230       scale = hex_byte(curr.data);
+231       emit = true;
+232     }
+233     else if (has_operand_metadata(curr, "index")) {
+234       index = hex_byte(curr.data);
+235       emit = true;
+236     }
+237     else if (has_operand_metadata(curr, "base")) {
+238       base = hex_byte(curr.data);
+239       emit = true;
+240     }
+241   }
+242   if (emit)
+243     out.words.push_back(hex_byte_text((scale << 6) | (index << 3) | base));
+244 }
+245 
+246 void add_disp_bytes(const line& in, line& out) {
+247   for (int i = 0;  i < SIZE(in.words);  ++i) {
+248     const word& curr = in.words.at(i);
+249     if (has_operand_metadata(curr, "disp8"))
+250       emit_hex_bytes(out, curr, 1);
+251     if (has_operand_metadata(curr, "disp16"))
+252       emit_hex_bytes(out, curr, 2);
+253     else if (has_operand_metadata(curr, "disp32"))
+254       emit_hex_bytes(out, curr, 4);
+255   }
+256 }
+257 
+258 void add_imm_bytes(const line& in, line& out) {
+259   for (int i = 0;  i < SIZE(in.words);  ++i) {
+260     const word& curr = in.words.at(i);
+261     if (has_operand_metadata(curr, "imm8"))
+262       emit_hex_bytes(out, curr, 1);
+263     else if (has_operand_metadata(curr, "imm32"))
+264       emit_hex_bytes(out, curr, 4);
+265   }
+266 }
+267 
+268 void emit_hex_bytes(line& out, const word& w, int num) {
+269   assert(num <= 4);
+270   bool is_number = looks_like_hex_int(w.data);
+271   if (num == 1 || !is_number) {
+272     out.words.push_back(w);  // preserve existing metadata
+273     if (is_number)
+274       out.words.back().data = hex_byte_to_string(parse_int(w.data));
+275     return;
+276   }
+277   emit_hex_bytes(out, static_cast<uint32_t>(parse_int(w.data)), num);
+278 }
+279 
+280 void emit_hex_bytes(line& out, uint32_t val, int num) {
+281   assert(num <= 4);
+282   for (int i = 0;  i < num;  ++i) {
+283     out.words.push_back(hex_byte_text(val & 0xff));
+284     val = val >> 8;
+285   }
+286 }
+287 
+288 word hex_byte_text(uint8_t val) {
+289   word result;
+290   result.data = hex_byte_to_string(val);
+291   result.original = result.data+"/auto";
+292   return result;
+293 }
+294 
+295 string hex_byte_to_string(uint8_t val) {
+296   ostringstream out;
+297   // uint8_t prints without padding, but int8_t will expand to 32 bits again
+298   out << HEXBYTE << NUM(val);
+299   return out.str();
+300 }
+301 
+302 string to_string(const vector<word>& in) {
+303   ostringstream out;
+304   for (int i = 0;  i < SIZE(in);  ++i) {
+305     if (i > 0) out << ' ';
+306     out << in.at(i).data;
+307   }
+308   return out.str();
+309 }
+310 
+311 :(before "End Unit Tests")
+312 void test_preserve_metadata_when_emitting_single_byte() {
+313   word in;
+314   in.data = "f0";
+315   in.original = "f0/foo";
+316   line out;
+317   emit_hex_bytes(out, in, 1);
+318   CHECK_EQ(out.words.at(0).data, "f0");
+319   CHECK_EQ(out.words.at(0).original, "f0/foo");
+320 }
+321 
+322 :(code)
+323 void test_pack_disp8() {
+324   run(
+325       "== code 0x1\n"
+326       "74 2/disp8\n"  // jump 2 bytes away if ZF is set
+327   );
+328   CHECK_TRACE_CONTENTS(
+329       "transform: packing instruction '74 2/disp8'\n"
+330       "transform: instruction after packing: '74 02'\n"
+331   );
+332 }
+333 
+334 void test_pack_disp8_negative() {
+335   transform(
+336       "== code 0x1\n"
+337       // running this will cause an infinite loop
+338       "74 -1/disp8\n"  // jump 1 byte before if ZF is set
+339   );
+340   CHECK_TRACE_CONTENTS(
+341       "transform: packing instruction '74 -1/disp8'\n"
+342       "transform: instruction after packing: '74 ff'\n"
+343   );
+344 }
+345 
+346 //: helper for scenario
+347 void transform(const string& text_bytes) {
+348   program p;
+349   istringstream in(text_bytes);
+350   parse(in, p);
+351   if (trace_contains_errors()) return;
+352   transform(p);
+353 }
+354 
+355 void test_pack_modrm_imm32() {
+356   run(
+357       "== code 0x1\n"
+358       // instruction                     effective address                                                   operand     displacement    immediate\n"
+359       // op          subop               mod             rm32          base        index         scale       r32\n"
+360       // 1-3 bytes   3 bits              2 bits          3 bits        3 bits      3 bits        2 bits      2 bits      0/1/2/4 bytes   0/1/2/4 bytes\n"
+361       "  81          0/add/subop         3/mod/direct    3/ebx/rm32                                                                      1/imm32      \n"  // add 1 to EBX
+362   );
+363   CHECK_TRACE_CONTENTS(
+364       "transform: packing instruction '81 0/add/subop 3/mod/direct 3/ebx/rm32 1/imm32'\n"
+365       "transform: instruction after packing: '81 c3 01 00 00 00'\n"
+366   );
+367 }
+368 
+369 void test_pack_imm32_large() {
+370   run(
+371       "== code 0x1\n"
+372       "b9  0x080490a7/imm32\n"
+373   );
+374   CHECK_TRACE_CONTENTS(
+375       "transform: packing instruction 'b9 0x080490a7/imm32'\n"
+376       "transform: instruction after packing: 'b9 a7 90 04 08'\n"
+377   );
+378 }
+379 
+380 void test_pack_immediate_constants_hex() {
+381   run(
+382       "== code 0x1\n"
+383       "b9  0x2a/imm32\n"
+384   );
+385   CHECK_TRACE_CONTENTS(
+386       "transform: packing instruction 'b9 0x2a/imm32'\n"
+387       "transform: instruction after packing: 'b9 2a 00 00 00'\n"
+388       "run: copy imm32 0x0000002a to ECX\n"
+389   );
+390 }
+391 
+392 void test_pack_silently_ignores_non_hex() {
+393   Hide_errors = true;
+394   transform(
+395       "== code 0x1\n"
+396       "b9  foo/imm32\n"
+397   );
+398   CHECK_TRACE_CONTENTS(
+399       "transform: packing instruction 'b9 foo/imm32'\n"
+400       // no change (we're just not printing metadata to the trace)
+401       "transform: instruction after packing: 'b9 foo'\n"
+402   );
+403 }
+404 
+405 void test_pack_flags_bad_hex() {
+406   Hide_errors = true;
+407   run(
+408       "== code 0x1\n"
+409       "b9  0xfoo/imm32\n"
+410   );
+411   CHECK_TRACE_CONTENTS(
+412       "error: not a number: 0xfoo\n"
+413   );
+414 }
+415 
+416 void test_pack_flags_uppercase_hex() {
+417   Hide_errors = true;
+418   run(
+419       "== code 0x1\n"
+420       "b9 0xAb/imm32\n"
+421   );
+422   CHECK_TRACE_CONTENTS(
+423       "error: uppercase hex not allowed: 0xAb\n"
+424   );
+425 }
+426 
+427 //:: helpers
+428 
+429 bool all_hex_bytes(const line& inst) {
+430   for (int i = 0;  i < SIZE(inst.words);  ++i)
+431     if (!is_hex_byte(inst.words.at(i)))
+432       return false;
+433   return true;
+434 }
+435 
+436 bool is_hex_byte(const word& curr) {
+437   if (contains_any_operand_metadata(curr))
+438     return false;
+439   if (SIZE(curr.data) != 2)
+440     return false;
+441   if (curr.data.find_first_not_of("0123456789abcdef") != string::npos)
+442     return false;
+443   return true;
+444 }
+445 
+446 bool contains_any_operand_metadata(const word& word) {
+447   for (int i = 0;  i < SIZE(word.metadata);  ++i)
+448     if (Instruction_operands.find(word.metadata.at(i)) != Instruction_operands.end())
+449       return true;
+450   return false;
+451 }
+452 
+453 bool has_operand_metadata(const line& inst, const string& m) {
+454   bool result = false;
+455   for (int i = 0;  i < SIZE(inst.words);  ++i) {
+456     if (!has_operand_metadata(inst.words.at(i), m)) continue;
+457     if (result) {
+458       raise << "'" << to_string(inst) << "' has conflicting " << m << " operands\n" << end();
+459       return false;
+460     }
+461     result = true;
+462   }
+463   return result;
+464 }
+465 
+466 bool has_operand_metadata(const word& w, const string& m) {
+467   bool result = false;
+468   bool metadata_found = false;
+469   for (int i = 0;  i < SIZE(w.metadata);  ++i) {
+470     const string& curr = w.metadata.at(i);
+471     if (Instruction_operands.find(curr) == Instruction_operands.end()) continue;  // ignore unrecognized metadata
+472     if (metadata_found) {
+473       raise << "'" << w.original << "' has conflicting operand types; it should have only one\n" << end();
+474       return false;
+475     }
+476     metadata_found = true;
+477     result = (curr == m);
+478   }
+479   return result;
+480 }
+481 
+482 word metadata(const line& inst, const string& m) {
+483   for (int i = 0;  i < SIZE(inst.words);  ++i)
+484     if (has_operand_metadata(inst.words.at(i), m))
+485       return inst.words.at(i);
+486   assert(false);
+487 }
+488 
// Cheap syntactic check for whether a word's data should be parsed as a
// number: anything starting with a sign or a decimal digit qualifies.
// Data starting with a bare hex letter (e.g. "ab") does not.
bool looks_like_hex_int(const string& s) {
  if (s.empty()) return false;
  const char first = s.at(0);
  if (first == '-' || first == '+') return true;
  // isdigit() is undefined for negative values, which a plain char can hold
  // for non-ASCII bytes, so convert to unsigned char first.
  if (isdigit(static_cast<unsigned char>(first))) return true;  // includes '0x' prefix
  // End looks_like_hex_int(s) Detectors
  return false;
}
+496 
+497 string to_string(const line& inst) {
+498   ostringstream out;
+499   for (int i = 0;  i < SIZE(inst.words);  ++i) {
+500     if (i > 0) out << ' ';
+501     out << inst.words.at(i).original;
+502   }
+503   return out.str();
+504 }
+505 
+506 int32_t parse_int(const string& s) {
+507   if (s.empty()) return 0;
+508   if (contains_uppercase(s)) {
+509     raise << "uppercase hex not allowed: " << s << '\n' << end();
+510     return 0;
+511   }
+512   istringstream in(s);
+513   in >> std::hex;
+514   if (s.at(0) == '-') {
+515     int32_t result = 0;
+516     in >> result;
+517     if (!in || !in.eof()) {
+518       raise << "not a number: " << s << '\n' << end();
+519       return 0;
+520     }
+521     return result;
+522   }
+523   uint32_t uresult = 0;
+524   in >> uresult;
+525   if (!in || !in.eof()) {
+526     raise << "not a number: " << s << '\n' << end();
+527     return 0;
+528   }
+529   return static_cast<int32_t>(uresult);
+530 }
+531 :(before "End Unit Tests")
+532 void test_parse_int() {
+533   CHECK_EQ(0, parse_int("0"));
+534   CHECK_EQ(0, parse_int("0x0"));
+535   CHECK_EQ(0, parse_int("0x0"));
+536   CHECK_EQ(16, parse_int("10"));  // hex always
+537   CHECK_EQ(-1, parse_int("-1"));
+538   CHECK_EQ(-1, parse_int("0xffffffff"));
+539 }
+
+ + + -- cgit 1.4.1-2-gfad0