Mu - 032operands.cc

From fcc161e70502caf34bc0206d2c428e8341e97fa6 Mon Sep 17 00:00:00 2001 From: Kartik Agaram Date: Sun, 24 May 2020 22:43:18 -0700 Subject: 6397 Drop '---' section boundaries from filenames. I noticed them confusing tab-completion for certain advanced shell setups. --- html/032operands.cc.html | 604 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 604 insertions(+) create mode 100644 html/032operands.cc.html (limited to 'html/032operands.cc.html') diff --git a/html/032operands.cc.html b/html/032operands.cc.html new file mode 100644 index 00000000..66c116e8 --- /dev/null +++ b/html/032operands.cc.html @@ -0,0 +1,604 @@ + + + + +Mu - 032operands.cc + + + + + + + + + + +https://github.com/akkartik/mu/blob/master/032operands.cc +
+  1 //: Metadata for fields of an x86 instruction.
+  2 //:
+  3 //: The x86 instruction set is variable-length, and how a byte is interpreted
+  4 //: affects later instruction boundaries. A lot of the pain in programming
+  5 //: machine code stems from computer and programmer going out of sync on what
+  6 //: a byte means. The miscommunication is usually not immediately caught, and
+  7 //: metastasizes at runtime into kilobytes of misinterpreted instructions.
+  8 //:
+  9 //: To mitigate these issues, we'll start programming in terms of logical
+ 10 //: operands rather than physical bytes. Some operands are smaller than a
+ 11 //: byte, and others may consist of multiple bytes. This layer will correctly
+ 12 //: pack and order the bytes corresponding to the operands in an instruction.
+ 13 
+ 14 :(before "End Help Texts")
+ 15 put_new(Help, "instructions",
+ 16   "Each x86 instruction consists of an instruction or opcode and some number\n"
+ 17   "of operands.\n"
+ 18   "Each operand has a type. An instruction won't have more than one operand of\n"
+ 19   "any type.\n"
+ 20   "Each instruction has some set of allowed operand types. It'll reject others.\n"
+ 21   "The complete list of operand types: mod, subop, r32 (register), rm32\n"
+ 22   "(register or memory), scale, index, base, disp8, disp16, disp32, imm8,\n"
+ 23   "imm32.\n"
+ 24   "Each of these has its own help page. Try reading 'bootstrap help mod' next.\n"
+ 25 );
+ 26 :(before "End Help Contents")
+ 27 cerr << "  instructions\n";
+ 28 
+ 29 :(before "Running Test Program")
+ 30 transform(p);
+ 31 if (trace_contains_errors()) return;
+ 32 
+ 33 :(code)
+ 34 void test_pack_immediate_constants() {
+ 35   run(
+ 36       "== code 0x1\n"
+ 37       "bb  0x2a/imm32\n"
+ 38   );
+ 39   CHECK_TRACE_CONTENTS(
+ 40       "transform: packing instruction 'bb 0x2a/imm32'\n"
+ 41       "transform: instruction after packing: 'bb 2a 00 00 00'\n"
+ 42       "run: copy imm32 0x0000002a to EBX\n"
+ 43   );
+ 44 }
+ 45 
+ 46 //: complete set of valid operand types
+ 47 
+ 48 :(before "End Globals")
+ 49 set<string> Instruction_operands;
+ 50 :(before "End One-time Setup")
+ 51 Instruction_operands.insert("subop");
+ 52 Instruction_operands.insert("mod");
+ 53 Instruction_operands.insert("rm32");
+ 54 Instruction_operands.insert("base");
+ 55 Instruction_operands.insert("index");
+ 56 Instruction_operands.insert("scale");
+ 57 Instruction_operands.insert("r32");
+ 58 Instruction_operands.insert("disp8");
+ 59 Instruction_operands.insert("disp16");
+ 60 Instruction_operands.insert("disp32");
+ 61 Instruction_operands.insert("imm8");
+ 62 Instruction_operands.insert("imm32");
+ 63 
+ 64 :(before "End Help Texts")
+ 65 init_operand_type_help();
+ 66 :(code)
+ 67 void init_operand_type_help() {
+ 68   put(Help, "mod",
+ 69     "2-bit operand controlling the _addressing mode_ of many instructions,\n"
+ 70     "to determine how to compute the _effective address_ to look up memory at\n"
+ 71     "based on the 'rm32' operand and potentially others.\n"
+ 72     "\n"
+ 73     "If mod = 3, just operate on the contents of the register specified by rm32\n"
+ 74     "            (direct mode).\n"
+ 75     "If mod = 2, effective address is usually* rm32 + disp32\n"
+ 76     "            (indirect mode with displacement).\n"
+ 77     "If mod = 1, effective address is usually* rm32 + disp8\n"
+ 78     "            (indirect mode with displacement).\n"
+ 79     "If mod = 0, effective address is usually* rm32 (indirect mode).\n"
+ 80     "(* - The exception is when rm32 is '4'. Register 4 is the stack pointer (ESP).\n"
+ 81     "     Using it as an address gets more involved. For more details,\n"
+ 82     "     try reading the help pages for 'base', 'index' and 'scale'.)\n"
+ 83     "\n"
+ 84     "For complete details, spend some time with two tables in the IA-32 software\n"
+ 85     "developer's manual that are also included in this repo:\n"
+ 86     "  - modrm.pdf: volume 2, table 2-2, \"32-bit addressing with the ModR/M byte.\".\n"
+ 87     "  - sib.pdf: volume 2, table 2-3, \"32-bit addressing with the SIB byte.\".\n"
+ 88   );
+ 89   put(Help, "subop",
+ 90     "Additional 3-bit operand for determining the instruction when the opcode\n"
+ 91     "is 81, 8f, d3, f7 or ff.\n"
+ 92     "Can't coexist with operand of type 'r32' in a single instruction, because\n"
+ 93     "the two use the same bits.\n"
+ 94   );
+ 95   put(Help, "r32",
+ 96     "3-bit operand specifying a register operand used directly, without any further addressing modes.\n"
+ 97   );
+ 98   put(Help, "rm32",
+ 99     "32-bit value in register or memory. The precise details of its construction\n"
+100     "depend on the eponymous 3-bit 'rm32' operand, the 'mod' operand, and also\n"
+101     "potentially the 'SIB' operands ('scale', 'index' and 'base') and a displacement\n"
+102     "('disp8' or 'disp32').\n"
+103     "\n"
+104     "For complete details, spend some time with two tables in the IA-32 software\n"
+105     "developer's manual that are also included in this repo:\n"
+106     "  - modrm.pdf: volume 2, table 2-2, \"32-bit addressing with the ModR/M byte.\".\n"
+107     "  - sib.pdf: volume 2, table 2-3, \"32-bit addressing with the SIB byte.\".\n"
+108   );
+109   put(Help, "base",
+110     "Additional 3-bit operand (when 'rm32' is 4, unless 'mod' is 3) specifying the\n"
+111     "register containing an address to look up.\n"
+112     "This address may be further modified by 'index' and 'scale' operands.\n"
+113     "  effective address = base + index*scale + displacement (disp8 or disp32)\n"
+114     "For complete details, spend some time with the IA-32 software developer's manual,\n"
+115     "volume 2, table 2-3, \"32-bit addressing with the SIB byte\".\n"
+116     "It is included in this repository as 'sib.pdf'.\n"
+117   );
+118   put(Help, "index",
+119     "Optional 3-bit operand (when 'rm32' is 4 unless 'mod' is 3) that can be added to\n"
+120     "the 'base' operand to compute the 'effective address' at which to look up memory.\n"
+121     "  effective address = base + index*scale + displacement (disp8 or disp32)\n"
+122     "For complete details, spend some time with the IA-32 software developer's manual,\n"
+123     "volume 2, table 2-3, \"32-bit addressing with the SIB byte\".\n"
+124     "It is included in this repository as 'sib.pdf'.\n"
+125   );
+126   put(Help, "scale",
+127     "Optional 2-bit operand (when 'rm32' is 4 unless 'mod' is 3) that encodes a\n"
+128     "power of 2 to be multiplied to the 'index' operand before adding the result to\n"
+129     "the 'base' operand to compute the _effective address_ to operate on.\n"
+130     "  effective address = base + index * scale + displacement (disp8 or disp32)\n"
+131     "\n"
+132     "When scale is 0, use index unmodified.\n"
+133     "When scale is 1, multiply index by 2.\n"
+134     "When scale is 2, multiply index by 4.\n"
+135     "When scale is 3, multiply index by 8.\n"
+136     "\n"
+137     "For complete details, spend some time with the IA-32 software developer's manual,\n"
+138     "volume 2, table 2-3, \"32-bit addressing with the SIB byte\".\n"
+139     "It is included in this repository as 'sib.pdf'.\n"
+140   );
+141   put(Help, "disp8",
+142     "8-bit value to be added in many instructions.\n"
+143   );
+144   put(Help, "disp16",
+145     "16-bit value to be added in many instructions.\n"
+146     "Currently not used in any SubX instructions.\n"
+147   );
+148   put(Help, "disp32",
+149     "32-bit value to be added in many instructions.\n"
+150   );
+151   put(Help, "imm8",
+152     "8-bit value for many instructions.\n"
+153   );
+154   put(Help, "imm32",
+155     "32-bit value for many instructions.\n"
+156   );
+157 }
+158 
+159 //:: transform packing operands into bytes in the right order
+160 
+161 :(after "Begin Transforms")
+162 Transform.push_back(pack_operands);
+163 
+164 :(code)
+165 void pack_operands(program& p) {
+166   if (p.segments.empty()) return;
+167   segment& code = *find(p, "code");
+168   // Pack Operands(segment code)
+169   trace(3, "transform") << "-- pack operands" << end();
+170   for (int i = 0;  i < SIZE(code.lines);  ++i) {
+171     line& inst = code.lines.at(i);
+172     if (all_hex_bytes(inst)) continue;
+173     trace(99, "transform") << "packing instruction '" << to_string(/*with metadata*/inst) << "'" << end();
+174     pack_operands(inst);
+175     trace(99, "transform") << "instruction after packing: '" << to_string(/*without metadata*/inst.words) << "'" << end();
+176   }
+177 }
+178 
+179 void pack_operands(line& inst) {
+180   line new_inst;
+181   add_opcodes(inst, new_inst);
+182   add_modrm_byte(inst, new_inst);
+183   add_sib_byte(inst, new_inst);
+184   add_disp_bytes(inst, new_inst);
+185   add_imm_bytes(inst, new_inst);
+186   inst.words.swap(new_inst.words);
+187 }
+188 
+189 void add_opcodes(const line& in, line& out) {
+190   out.words.push_back(in.words.at(0));
+191   if (in.words.at(0).data == "0f" || in.words.at(0).data == "f2" || in.words.at(0).data == "f3")
+192     out.words.push_back(in.words.at(1));
+193   if (in.words.at(0).data == "f3" && in.words.at(1).data == "0f")
+194     out.words.push_back(in.words.at(2));
+195   if (in.words.at(0).data == "f2" && in.words.at(1).data == "0f")
+196     out.words.push_back(in.words.at(2));
+197 }
+198 
+199 void add_modrm_byte(const line& in, line& out) {
+200   uint8_t mod=0, reg_subop=0, rm32=0;
+201   bool emit = false;
+202   for (int i = 0;  i < SIZE(in.words);  ++i) {
+203     const word& curr = in.words.at(i);
+204     if (has_operand_metadata(curr, "mod")) {
+205       mod = hex_byte(curr.data);
+206       emit = true;
+207     }
+208     else if (has_operand_metadata(curr, "rm32")) {
+209       rm32 = hex_byte(curr.data);
+210       emit = true;
+211     }
+212     else if (has_operand_metadata(curr, "r32")) {
+213       reg_subop = hex_byte(curr.data);
+214       emit = true;
+215     }
+216     else if (has_operand_metadata(curr, "subop")) {
+217       reg_subop = hex_byte(curr.data);
+218       emit = true;
+219     }
+220   }
+221   if (emit)
+222     out.words.push_back(hex_byte_text((mod << 6) | (reg_subop << 3) | rm32));
+223 }
+224 
+225 void add_sib_byte(const line& in, line& out) {
+226   uint8_t scale=0, index=0, base=0;
+227   bool emit = false;
+228   for (int i = 0;  i < SIZE(in.words);  ++i) {
+229     const word& curr = in.words.at(i);
+230     if (has_operand_metadata(curr, "scale")) {
+231       scale = hex_byte(curr.data);
+232       emit = true;
+233     }
+234     else if (has_operand_metadata(curr, "index")) {
+235       index = hex_byte(curr.data);
+236       emit = true;
+237     }
+238     else if (has_operand_metadata(curr, "base")) {
+239       base = hex_byte(curr.data);
+240       emit = true;
+241     }
+242   }
+243   if (emit)
+244     out.words.push_back(hex_byte_text((scale << 6) | (index << 3) | base));
+245 }
+246 
+247 void add_disp_bytes(const line& in, line& out) {
+248   for (int i = 0;  i < SIZE(in.words);  ++i) {
+249     const word& curr = in.words.at(i);
+250     if (has_operand_metadata(curr, "disp8"))
+251       emit_hex_bytes(out, curr, 1);
+252     if (has_operand_metadata(curr, "disp16"))
+253       emit_hex_bytes(out, curr, 2);
+254     else if (has_operand_metadata(curr, "disp32"))
+255       emit_hex_bytes(out, curr, 4);
+256   }
+257 }
+258 
+259 void add_imm_bytes(const line& in, line& out) {
+260   for (int i = 0;  i < SIZE(in.words);  ++i) {
+261     const word& curr = in.words.at(i);
+262     if (has_operand_metadata(curr, "imm8"))
+263       emit_hex_bytes(out, curr, 1);
+264     else if (has_operand_metadata(curr, "imm32"))
+265       emit_hex_bytes(out, curr, 4);
+266   }
+267 }
+268 
+269 void emit_hex_bytes(line& out, const word& w, int num) {
+270   assert(num <= 4);
+271   bool is_number = looks_like_hex_int(w.data);
+272   if (num == 1 || !is_number) {
+273     out.words.push_back(w);  // preserve existing metadata
+274     if (is_number)
+275       out.words.back().data = hex_byte_to_string(parse_int(w.data));
+276     return;
+277   }
+278   emit_hex_bytes(out, static_cast<uint32_t>(parse_int(w.data)), num);
+279 }
+280 
+281 void emit_hex_bytes(line& out, uint32_t val, int num) {
+282   assert(num <= 4);
+283   for (int i = 0;  i < num;  ++i) {
+284     out.words.push_back(hex_byte_text(val & 0xff));
+285     val = val >> 8;
+286   }
+287 }
+288 
+289 word hex_byte_text(uint8_t val) {
+290   word result;
+291   result.data = hex_byte_to_string(val);
+292   result.original = result.data+"/auto";
+293   return result;
+294 }
+295 
+296 string hex_byte_to_string(uint8_t val) {
+297   ostringstream out;
+298   // uint8_t prints without padding, but int8_t will expand to 32 bits again
+299   out << HEXBYTE << NUM(val);
+300   return out.str();
+301 }
+302 
+303 string to_string(const vector<word>& in) {
+304   ostringstream out;
+305   for (int i = 0;  i < SIZE(in);  ++i) {
+306     if (i > 0) out << ' ';
+307     out << in.at(i).data;
+308   }
+309   return out.str();
+310 }
+311 
+312 :(before "End Unit Tests")
+313 void test_preserve_metadata_when_emitting_single_byte() {
+314   word in;
+315   in.data = "f0";
+316   in.original = "f0/foo";
+317   line out;
+318   emit_hex_bytes(out, in, 1);
+319   CHECK_EQ(out.words.at(0).data, "f0");
+320   CHECK_EQ(out.words.at(0).original, "f0/foo");
+321 }
+322 
+323 :(code)
+324 void test_pack_disp8() {
+325   run(
+326       "== code 0x1\n"
+327       "74 2/disp8\n"  // jump 2 bytes away if ZF is set
+328   );
+329   CHECK_TRACE_CONTENTS(
+330       "transform: packing instruction '74 2/disp8'\n"
+331       "transform: instruction after packing: '74 02'\n"
+332   );
+333 }
+334 
+335 void test_pack_disp8_negative() {
+336   transform(
+337       "== code 0x1\n"
+338       // running this will cause an infinite loop
+339       "74 -1/disp8\n"  // jump 1 byte before if ZF is set
+340   );
+341   CHECK_TRACE_CONTENTS(
+342       "transform: packing instruction '74 -1/disp8'\n"
+343       "transform: instruction after packing: '74 ff'\n"
+344   );
+345 }
+346 
+347 //: helper for scenario
+348 void transform(const string& text_bytes) {
+349   program p;
+350   istringstream in(text_bytes);
+351   parse(in, p);
+352   if (trace_contains_errors()) return;
+353   transform(p);
+354 }
+355 
+356 void test_pack_modrm_imm32() {
+357   run(
+358       "== code 0x1\n"
+359       // instruction                     effective address                                                   operand     displacement    immediate\n"
+360       // op          subop               mod             rm32          base        index         scale       r32\n"
+361       // 1-3 bytes   3 bits              2 bits          3 bits        3 bits      3 bits        2 bits      2 bits      0/1/2/4 bytes   0/1/2/4 bytes\n"
+362       "  81          0/add/subop         3/mod/direct    3/ebx/rm32                                                                      1/imm32      \n"  // add 1 to EBX
+363   );
+364   CHECK_TRACE_CONTENTS(
+365       "transform: packing instruction '81 0/add/subop 3/mod/direct 3/ebx/rm32 1/imm32'\n"
+366       "transform: instruction after packing: '81 c3 01 00 00 00'\n"
+367   );
+368 }
+369 
+370 void test_pack_imm32_large() {
+371   run(
+372       "== code 0x1\n"
+373       "b9  0x080490a7/imm32\n"
+374   );
+375   CHECK_TRACE_CONTENTS(
+376       "transform: packing instruction 'b9 0x080490a7/imm32'\n"
+377       "transform: instruction after packing: 'b9 a7 90 04 08'\n"
+378   );
+379 }
+380 
+381 void test_pack_immediate_constants_hex() {
+382   run(
+383       "== code 0x1\n"
+384       "b9  0x2a/imm32\n"
+385   );
+386   CHECK_TRACE_CONTENTS(
+387       "transform: packing instruction 'b9 0x2a/imm32'\n"
+388       "transform: instruction after packing: 'b9 2a 00 00 00'\n"
+389       "run: copy imm32 0x0000002a to ECX\n"
+390   );
+391 }
+392 
+393 void test_pack_silently_ignores_non_hex() {
+394   Hide_errors = true;
+395   transform(
+396       "== code 0x1\n"
+397       "b9  foo/imm32\n"
+398   );
+399   CHECK_TRACE_CONTENTS(
+400       "transform: packing instruction 'b9 foo/imm32'\n"
+401       // no change (we're just not printing metadata to the trace)
+402       "transform: instruction after packing: 'b9 foo'\n"
+403   );
+404 }
+405 
+406 void test_pack_flags_bad_hex() {
+407   Hide_errors = true;
+408   run(
+409       "== code 0x1\n"
+410       "b9  0xfoo/imm32\n"
+411   );
+412   CHECK_TRACE_CONTENTS(
+413       "error: not a number: 0xfoo\n"
+414   );
+415 }
+416 
+417 void test_pack_flags_uppercase_hex() {
+418   Hide_errors = true;
+419   run(
+420       "== code 0x1\n"
+421       "b9 0xAb/imm32\n"
+422   );
+423   CHECK_TRACE_CONTENTS(
+424       "error: uppercase hex not allowed: 0xAb\n"
+425   );
+426 }
+427 
+428 //:: helpers
+429 
+430 bool all_hex_bytes(const line& inst) {
+431   for (int i = 0;  i < SIZE(inst.words);  ++i)
+432     if (!is_hex_byte(inst.words.at(i)))
+433       return false;
+434   return true;
+435 }
+436 
+437 bool is_hex_byte(const word& curr) {
+438   if (contains_any_operand_metadata(curr))
+439     return false;
+440   if (SIZE(curr.data) != 2)
+441     return false;
+442   if (curr.data.find_first_not_of("0123456789abcdef") != string::npos)
+443     return false;
+444   return true;
+445 }
+446 
+447 bool contains_any_operand_metadata(const word& word) {
+448   for (int i = 0;  i < SIZE(word.metadata);  ++i)
+449     if (Instruction_operands.find(word.metadata.at(i)) != Instruction_operands.end())
+450       return true;
+451   return false;
+452 }
+453 
+454 bool has_operand_metadata(const line& inst, const string& m) {
+455   bool result = false;
+456   for (int i = 0;  i < SIZE(inst.words);  ++i) {
+457     if (!has_operand_metadata(inst.words.at(i), m)) continue;
+458     if (result) {
+459       raise << "'" << to_string(inst) << "' has conflicting " << m << " operands\n" << end();
+460       return false;
+461     }
+462     result = true;
+463   }
+464   return result;
+465 }
+466 
+467 bool has_operand_metadata(const word& w, const string& m) {
+468   bool result = false;
+469   bool metadata_found = false;
+470   for (int i = 0;  i < SIZE(w.metadata);  ++i) {
+471     const string& curr = w.metadata.at(i);
+472     if (Instruction_operands.find(curr) == Instruction_operands.end()) continue;  // ignore unrecognized metadata
+473     if (metadata_found) {
+474       raise << "'" << w.original << "' has conflicting operand types; it should have only one\n" << end();
+475       return false;
+476     }
+477     metadata_found = true;
+478     result = (curr == m);
+479   }
+480   return result;
+481 }
+482 
+483 word metadata(const line& inst, const string& m) {
+484   for (int i = 0;  i < SIZE(inst.words);  ++i)
+485     if (has_operand_metadata(inst.words.at(i), m))
+486       return inst.words.at(i);
+487   assert(false);
+488 }
+489 
//: Cheap check on the first character only: does 's' start out like a
//: (possibly signed) number? Doesn't validate the rest of the string.
bool looks_like_hex_int(const string& s) {
  if (s.empty()) return false;
  if (s.at(0) == '-' || s.at(0) == '+') return true;
  // isdigit() is undefined for negative values other than EOF, so go through
  // unsigned char in case the word starts with a non-ASCII byte
  if (isdigit(static_cast<unsigned char>(s.at(0)))) return true;  // includes '0x' prefix
  // End looks_like_hex_int(s) Detectors
  return false;
}
+497 
+498 string to_string(const line& inst) {
+499   ostringstream out;
+500   for (int i = 0;  i < SIZE(inst.words);  ++i) {
+501     if (i > 0) out << ' ';
+502     out << inst.words.at(i).original;
+503   }
+504   return out.str();
+505 }
+506 
+507 int32_t parse_int(const string& s) {
+508   if (s.empty()) return 0;
+509   if (contains_uppercase(s)) {
+510     raise << "uppercase hex not allowed: " << s << '\n' << end();
+511     return 0;
+512   }
+513   istringstream in(s);
+514   in >> std::hex;
+515   if (s.at(0) == '-') {
+516     int32_t result = 0;
+517     in >> result;
+518     if (!in || !in.eof()) {
+519       raise << "not a number: " << s << '\n' << end();
+520       return 0;
+521     }
+522     return result;
+523   }
+524   uint32_t uresult = 0;
+525   in >> uresult;
+526   if (!in || !in.eof()) {
+527     raise << "not a number: " << s << '\n' << end();
+528     return 0;
+529   }
+530   return static_cast<int32_t>(uresult);
+531 }
+532 :(before "End Unit Tests")
+533 void test_parse_int() {
+534   CHECK_EQ(0, parse_int("0"));
+535   CHECK_EQ(0, parse_int("0x0"));
+536   CHECK_EQ(0, parse_int("0x0"));
+537   CHECK_EQ(16, parse_int("10"));  // hex always
+538   CHECK_EQ(-1, parse_int("-1"));
+539   CHECK_EQ(-1, parse_int("0xffffffff"));
+540 }
+
+ + + -- cgit 1.4.1-2-gfad0