From 7328af20a1921d9258a60803ee5367da97a6082e Mon Sep 17 00:00:00 2001 From: Kartik Agaram Date: Mon, 13 Aug 2018 21:25:22 -0700 Subject: 4521 --- html/subx/030---operands.cc.html | 536 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 536 insertions(+) create mode 100644 html/subx/030---operands.cc.html (limited to 'html/subx/030---operands.cc.html') diff --git a/html/subx/030---operands.cc.html b/html/subx/030---operands.cc.html new file mode 100644 index 00000000..ea38d64b --- /dev/null +++ b/html/subx/030---operands.cc.html @@ -0,0 +1,536 @@ + + + + +Mu - subx/030---operands.cc + + + + + + + + + + +
+  1 //: Beginning of "level 2": tagging bytes with metadata around what field of
+  2 //: an x86 instruction they're for.
+  3 //:
+  4 //: The x86 instruction set is variable-length, and how a byte is interpreted
+  5 //: affects later instruction boundaries. A lot of the pain in programming
+  6 //: machine code stems from computer and programmer going out of sync on what
+  7 //: a byte means. The miscommunication is usually not immediately caught, and
+  8 //: metastasizes at runtime into kilobytes of misinterpreted instructions.
+  9 //:
+ 10 //: To mitigate these issues, we'll start programming in terms of logical
+ 11 //: operands rather than physical bytes. Some operands are smaller than a
+ 12 //: byte, and others may consist of multiple bytes. This layer will correctly
+ 13 //: pack and order the bytes corresponding to the operands in an instruction.
+ 14 
//: Add an 'instructions' entry to Help, and print its name from the code
//: spliced in before the help-contents waypoint.
:(before "End Help Texts")
put(Help, "instructions",
  "Each x86 instruction consists of an instruction or opcode and some number\n"
  "of operands.\n"
  "Each operand has a type. An instruction won't have more than one operand of\n"
  "any type.\n"
  "Each instruction has some set of allowed operand types. It'll reject others.\n"
  "The complete list of operand types: mod, subop, r32 (register), rm32\n"
  "(register or memory), scale, index, base, disp8, disp16, disp32, imm8,\n"
  "imm32.\n"
  "Each of these has its own help page. Try reading 'subx help mod' next.\n"
);
:(before "End Help Contents")
cerr << "  instructions\n";
+ 29 
+ 30 :(scenario pack_immediate_constants)
+ 31 == 0x1
+ 32 # instruction                     effective address                                                   operand     displacement    immediate
+ 33 # op          subop               mod             rm32          base        index         scale       r32
+ 34 # 1-3 bytes   3 bits              2 bits          3 bits        3 bits      3 bits        2 bits      3 bits      0/1/2/4 bytes   0/1/2/4 bytes
+ 35   bb                                                                                                                              0x2a/imm32        # copy 42 to EBX
+ 36 +transform: packing instruction 'bb 0x2a/imm32'
+ 37 +transform: instruction after packing: 'bb 2a 00 00 00'
+ 38 +run: copy imm32 0x0000002a to EBX
+ 39 
+ 40 //: complete set of valid operand types
+ 41 
:(before "End Globals")
// Names of the operand types this layer recognizes in a word's metadata.
// Metadata strings outside this set are ignored by the helpers below.
set<string> Instruction_operands;
:(before "End One-time Setup")
// operands packed into the ModR/M byte
Instruction_operands.insert("subop");
Instruction_operands.insert("mod");
Instruction_operands.insert("rm32");
// operands packed into the SIB byte
Instruction_operands.insert("base");
Instruction_operands.insert("index");
Instruction_operands.insert("scale");
// r32 shares the ModR/M byte's middle bits with subop
Instruction_operands.insert("r32");
// displacements
Instruction_operands.insert("disp8");
Instruction_operands.insert("disp16");
Instruction_operands.insert("disp32");
// immediates
Instruction_operands.insert("imm8");
Instruction_operands.insert("imm32");
+ 57 
:(before "End Help Texts")
init_operand_type_help();
:(code)
// Register one Help entry per operand type, keyed by the operand's name
// (the same names that are inserted into Instruction_operands).
void init_operand_type_help() {
  // operands of the ModR/M byte
  put(Help, "mod",
    "2-bit operand controlling the _addressing mode_ of many instructions,\n"
    "to determine how to compute the _effective address_ to look up memory at\n"
    "based on the 'rm32' operand and potentially others.\n"
    "\n"
    "If mod = 3, just operate on the contents of the register specified by rm32\n"
    "            (direct mode).\n"
    "If mod = 2, effective address is usually* rm32 + disp32\n"
    "            (indirect mode with displacement).\n"
    "If mod = 1, effective address is usually* rm32 + disp8\n"
    "            (indirect mode with displacement).\n"
    "If mod = 0, effective address is usually* rm32 (indirect mode).\n"
    "(* - The exception is when rm32 is '4'. Register 4 is the stack pointer (ESP).\n"
    "     Using it as an address gets more involved. For more details,\n"
    "     try reading the help pages for 'base', 'index' and 'scale'.)\n"
    "\n"
    "For complete details consult the IA-32 software developer's manual, table 2-2,\n"
    "\"32-bit addressing forms with the ModR/M byte\".\n"
    "  https://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf\n"
  );
  put(Help, "subop",
    "Additional 3-bit operand for determining the instruction when the opcode is 81, 8f or ff.\n"
    "Can't coexist with operand of type 'r32' in a single instruction, because the two use the same bits.\n"
  );
  put(Help, "r32",
    "3-bit operand specifying a register operand used directly, without any further addressing modes.\n"
  );
  put(Help, "rm32",
    "3-bit operand specifying a register operand whose precise interpretation interacts with 'mod'.\n"
    "For complete details consult the IA-32 software developer's manual, table 2-2,\n"
    "\"32-bit addressing forms with the ModR/M byte\".\n"
    "  https://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf\n"
  );
  // operands of the SIB byte
  put(Help, "base",
    "Additional 3-bit operand (when 'rm32' is 4 unless 'mod' is 3) specifying the register containing an address to look up.\n"
    "This address may be further modified by 'index' and 'scale' operands.\n"
    "  effective address = base + index*scale + displacement (disp8 or disp32)\n"
    "For complete details consult the IA-32 software developer's manual, table 2-3,\n"
    "\"32-bit addressing forms with the SIB byte\".\n"
    "  https://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf\n"
  );
  put(Help, "index",
    "Optional 3-bit operand (when 'rm32' is 4 unless 'mod' is 3) that can be added to the 'base' operand to compute the 'effective address' at which to look up memory.\n"
    "  effective address = base + index*scale + displacement (disp8 or disp32)\n"
    "For complete details consult the IA-32 software developer's manual, table 2-3,\n"
    "\"32-bit addressing forms with the SIB byte\".\n"
    "  https://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf\n"
  );
  put(Help, "scale",
    "Optional 2-bit operand (when 'rm32' is 4 unless 'mod' is 3) that can be multiplied to the 'index' operand before adding the result to the 'base' operand to compute the _effective address_ to operate on.\n"
    "  effective address = base + index * scale + displacement (disp8 or disp32)\n"
    "For complete details consult the IA-32 software developer's manual, table 2-3,\n"
    "\"32-bit addressing forms with the SIB byte\".\n"
    "  https://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf\n"
  );
  // displacements
  put(Help, "disp8",
    "8-bit value to be added in many instructions.\n"
  );
  put(Help, "disp16",
    "16-bit value to be added in many instructions.\n"
  );
  put(Help, "disp32",
    "32-bit value to be added in many instructions.\n"
  );
  // immediates
  put(Help, "imm8",
    "8-bit value for many instructions.\n"
  );
  put(Help, "imm32",
    "32-bit value for many instructions.\n"
  );
}
+133 
+134 //:: transform packing operands into bytes in the right order
+135 
//: Append pack_operands to the list of transforms. The Begin/End comments
//: bracket this layer's registrations.
:(before "End Transforms")
// Begin Level-2 Transforms
Transform.push_back(pack_operands);
// End Level-2 Transforms
+140 
+141 :(code)
+142 void pack_operands(program& p) {
+143   if (p.segments.empty()) return;
+144   segment& code = p.segments.at(0);
+145   // Pack Operands(segment code)
+146   trace(99, "transform") << "-- pack operands" << end();
+147   for (int i = 0;  i < SIZE(code.lines);  ++i) {
+148     line& inst = code.lines.at(i);
+149     if (all_hex_bytes(inst)) continue;
+150     trace(99, "transform") << "packing instruction '" << to_string(/*with metadata*/inst) << "'" << end();
+151     pack_operands(inst);
+152     trace(99, "transform") << "instruction after packing: '" << to_string(/*without metadata*/inst.words) << "'" << end();
+153   }
+154 }
+155 
+156 void pack_operands(line& inst) {
+157   line new_inst;
+158   add_opcodes(inst, new_inst);
+159   add_modrm_byte(inst, new_inst);
+160   add_sib_byte(inst, new_inst);
+161   add_disp_bytes(inst, new_inst);
+162   add_imm_bytes(inst, new_inst);
+163   inst.words.swap(new_inst.words);
+164 }
+165 
+166 void add_opcodes(const line& in, line& out) {
+167   out.words.push_back(in.words.at(0));
+168   if (in.words.at(0).data == "0f" || in.words.at(0).data == "f2" || in.words.at(0).data == "f3")
+169     out.words.push_back(in.words.at(1));
+170   if (in.words.at(0).data == "f3" && in.words.at(1).data == "0f")
+171     out.words.push_back(in.words.at(2));
+172   if (in.words.at(0).data == "f2" && in.words.at(1).data == "0f")
+173     out.words.push_back(in.words.at(2));
+174 }
+175 
+176 void add_modrm_byte(const line& in, line& out) {
+177   uint8_t mod=0, reg_subop=0, rm32=0;
+178   bool emit = false;
+179   for (int i = 0;  i < SIZE(in.words);  ++i) {
+180     const word& curr = in.words.at(i);
+181     if (has_metadata(curr, "mod")) {
+182       mod = hex_byte(curr.data);
+183       emit = true;
+184     }
+185     else if (has_metadata(curr, "rm32")) {
+186       rm32 = hex_byte(curr.data);
+187       emit = true;
+188     }
+189     else if (has_metadata(curr, "r32")) {
+190       reg_subop = hex_byte(curr.data);
+191       emit = true;
+192     }
+193     else if (has_metadata(curr, "subop")) {
+194       reg_subop = hex_byte(curr.data);
+195       emit = true;
+196     }
+197   }
+198   if (emit)
+199     out.words.push_back(hex_byte_text((mod << 6) | (reg_subop << 3) | rm32));
+200 }
+201 
+202 void add_sib_byte(const line& in, line& out) {
+203   uint8_t scale=0, index=0, base=0;
+204   bool emit = false;
+205   for (int i = 0;  i < SIZE(in.words);  ++i) {
+206     const word& curr = in.words.at(i);
+207     if (has_metadata(curr, "scale")) {
+208       scale = hex_byte(curr.data);
+209       emit = true;
+210     }
+211     else if (has_metadata(curr, "index")) {
+212       index = hex_byte(curr.data);
+213       emit = true;
+214     }
+215     else if (has_metadata(curr, "base")) {
+216       base = hex_byte(curr.data);
+217       emit = true;
+218     }
+219   }
+220   if (emit)
+221     out.words.push_back(hex_byte_text((scale << 6) | (index << 3) | base));
+222 }
+223 
+224 void add_disp_bytes(const line& in, line& out) {
+225   for (int i = 0;  i < SIZE(in.words);  ++i) {
+226     const word& curr = in.words.at(i);
+227     if (has_metadata(curr, "disp8"))
+228       emit_hex_bytes(out, curr, 1);
+229     if (has_metadata(curr, "disp16"))
+230       emit_hex_bytes(out, curr, 2);
+231     else if (has_metadata(curr, "disp32"))
+232       emit_hex_bytes(out, curr, 4);
+233   }
+234 }
+235 
+236 void add_imm_bytes(const line& in, line& out) {
+237   for (int i = 0;  i < SIZE(in.words);  ++i) {
+238     const word& curr = in.words.at(i);
+239     if (has_metadata(curr, "imm8"))
+240       emit_hex_bytes(out, curr, 1);
+241     else if (has_metadata(curr, "imm32"))
+242       emit_hex_bytes(out, curr, 4);
+243   }
+244 }
+245 
+246 void emit_hex_bytes(line& out, const word& w, int num) {
+247   assert(num <= 4);
+248   if (num == 1 || !is_hex_int(w.data)) {
+249     out.words.push_back(w);
+250     if (is_hex_int(w.data))
+251       out.words.back().data = hex_byte_to_string(parse_int(w.data));
+252     return;
+253   }
+254   emit_hex_bytes(out, static_cast<uint32_t>(parse_int(w.data)), num);
+255 }
+256 
+257 void emit_hex_bytes(line& out, uint32_t val, int num) {
+258   assert(num <= 4);
+259   for (int i = 0;  i < num;  ++i) {
+260     out.words.push_back(hex_byte_text(val & 0xff));
+261     val = val >> 8;
+262   }
+263 }
+264 
+265 word hex_byte_text(uint8_t val) {
+266   word result;
+267   result.data = hex_byte_to_string(val);
+268   result.original = result.data+"/auto";
+269   return result;
+270 }
+271 
+272 string hex_byte_to_string(uint8_t val) {
+273   ostringstream out;
+274   out << HEXBYTE << NUM(val);
+275   return out.str();
+276 }
+277 
+278 string to_string(const vector<word>& in) {
+279   ostringstream out;
+280   for (int i = 0;  i < SIZE(in);  ++i) {
+281     if (i > 0) out << ' ';
+282     out << in.at(i).data;
+283   }
+284   return out.str();
+285 }
+286 
+287 :(before "End Unit Tests")
+288 void test_preserve_metadata_when_emitting_single_byte() {
+289   word in;
+290   in.data = "f0";
+291   in.original = "f0/foo";
+292   line out;
+293   emit_hex_bytes(out, in, 1);
+294   CHECK_EQ(out.words.at(0).data, "f0");
+295   CHECK_EQ(out.words.at(0).original, "f0/foo");
+296 }
+297 
+298 :(scenario pack_disp8)
+299 == 0x1
+300 74 2/disp8  # jump 2 bytes away if ZF is set
+301 +transform: packing instruction '74 2/disp8'
+302 +transform: instruction after packing: '74 02'
+303 
+304 :(scenarios transform)
+305 :(scenario pack_disp8_negative)
+306 == 0x1
+307 # running this will cause an infinite loop
+308 74 -1/disp8  # jump 1 byte before if ZF is set
+309 +transform: packing instruction '74 -1/disp8'
+310 +transform: instruction after packing: '74 ff'
+311 :(scenarios run)
+312 
+313 //: helper for scenario
+314 :(code)
+315 void transform(const string& text_bytes) {
+316   program p;
+317   istringstream in(text_bytes);
+318   parse(in, p);
+319   if (trace_contains_errors()) return;
+320   transform(p);
+321 }
+322 
+323 :(scenario pack_modrm_imm32)
+324 == 0x1
+325 # instruction                     effective address                                                   operand     displacement    immediate
+326 # op          subop               mod             rm32          base        index         scale       r32
+327 # 1-3 bytes   3 bits              2 bits          3 bits        3 bits      3 bits        2 bits      3 bits      0/1/2/4 bytes   0/1/2/4 bytes
+328   81          0/add/subop         3/mod/direct    3/ebx/rm32                                                                      1/imm32           # add 1 to EBX
+329 +transform: packing instruction '81 0/add/subop 3/mod/direct 3/ebx/rm32 1/imm32'
+330 +transform: instruction after packing: '81 c3 01 00 00 00'
+331 
+332 :(scenario pack_imm32_large)
+333 == 0x1
+334 b9 0x080490a7/imm32  # copy to ECX
+335 +transform: packing instruction 'b9 0x080490a7/imm32'
+336 +transform: instruction after packing: 'b9 a7 90 04 08'
+337 
+338 :(scenario pack_immediate_constants_hex)
+339 == 0x1
+340 # instruction                     effective address                                                   operand     displacement    immediate
+341 # op          subop               mod             rm32          base        index         scale       r32
+342 # 1-3 bytes   3 bits              2 bits          3 bits        3 bits      3 bits        2 bits      3 bits      0/1/2/4 bytes   0/1/2/4 bytes
+343   bb                                                                                                                              0x2a/imm32        # copy 42 to EBX
+344 +transform: packing instruction 'bb 0x2a/imm32'
+345 +transform: instruction after packing: 'bb 2a 00 00 00'
+346 +run: copy imm32 0x0000002a to EBX
+347 
+348 :(scenarios transform)
+349 :(scenario pack_silently_ignores_non_hex)
+350 == 0x1
+351 # instruction                     effective address                                                   operand     displacement    immediate
+352 # op          subop               mod             rm32          base        index         scale       r32
+353 # 1-3 bytes   3 bits              2 bits          3 bits        3 bits      3 bits        2 bits      3 bits      0/1/2/4 bytes   0/1/2/4 bytes
+354   bb                                                                                                                              foo/imm32         # copy foo to EBX
+355 +transform: packing instruction 'bb foo/imm32'
+356 # no change (we're just not printing metadata to the trace)
+357 +transform: instruction after packing: 'bb foo'
+358 $error: 0
+359 :(scenarios run)
+360 
+361 //:: helpers
+362 
+363 :(code)
+364 bool all_hex_bytes(const line& inst) {
+365   for (int i = 0;  i < SIZE(inst.words);  ++i)
+366     if (!is_hex_byte(inst.words.at(i)))
+367       return false;
+368   return true;
+369 }
+370 
+371 bool is_hex_byte(const word& curr) {
+372   if (contains_any_operand_metadata(curr))
+373     return false;
+374   if (SIZE(curr.data) != 2)
+375     return false;
+376   if (curr.data.find_first_not_of("0123456789abcdefABCDEF") != string::npos)
+377     return false;
+378   return true;
+379 }
+380 
+381 bool contains_any_operand_metadata(const word& word) {
+382   for (int i = 0;  i < SIZE(word.metadata);  ++i)
+383     if (Instruction_operands.find(word.metadata.at(i)) != Instruction_operands.end())
+384       return true;
+385   return false;
+386 }
+387 
+388 bool has_metadata(const line& inst, const string& m) {
+389   bool result = false;
+390   for (int i = 0;  i < SIZE(inst.words);  ++i) {
+391     if (!has_metadata(inst.words.at(i), m)) continue;
+392     if (result) {
+393       raise << "'" << to_string(inst) << "' has conflicting " << m << " operands\n" << end();
+394       return false;
+395     }
+396     result = true;
+397   }
+398   return result;
+399 }
+400 
+401 bool has_metadata(const word& w, const string& m) {
+402   bool result = false;
+403   bool metadata_found = false;
+404   for (int i = 0;  i < SIZE(w.metadata);  ++i) {
+405     const string& curr = w.metadata.at(i);
+406     if (Instruction_operands.find(curr) == Instruction_operands.end()) continue;  // ignore unrecognized metadata
+407     if (metadata_found) {
+408       raise << "'" << w.original << "' has conflicting operand types; it should have only one\n" << end();
+409       return false;
+410     }
+411     metadata_found = true;
+412     result = (curr == m);
+413   }
+414   return result;
+415 }
+416 
+417 word metadata(const line& inst, const string& m) {
+418   for (int i = 0;  i < SIZE(inst.words);  ++i)
+419     if (has_metadata(inst.words.at(i), m))
+420       return inst.words.at(i);
+421   assert(false);
+422 }
+423 
// True if 's' looks like a hex integer: an optional sign, an optional "0x"
// prefix, then one or more hex digits.
bool is_hex_int(const string& s) {
  if (s.empty()) return false;
  size_t pos = 0;
  if (s.at(0) == '-' || s.at(0) == '+') pos++;
  // compare exactly 2 characters at pos, so the prefix is also recognized
  // after a sign (e.g. "-0x1")
  if (s.compare(pos, 2, "0x") == 0) pos += 2;
  // a bare sign or a bare "0x" has no digits and is not a number
  if (pos >= s.size()) return false;
  return s.find_first_not_of("0123456789abcdefABCDEF", pos) == string::npos;
}
+431 
+432 int32_t parse_int(const string& s) {
+433   if (s.empty()) return 0;
+434   istringstream in(s);
+435   in >> std::hex;
+436   if (s.at(0) == '-') {
+437     int32_t result = 0;
+438     in >> result;
+439     if (!in || !in.eof()) {
+440       raise << "not a number: " << s << '\n' << end();
+441       return 0;
+442     }
+443     return result;
+444   }
+445   uint32_t uresult = 0;
+446   in >> uresult;
+447   if (!in || !in.eof()) {
+448     raise << "not a number: " << s << '\n' << end();
+449     return 0;
+450   }
+451   return static_cast<int32_t>(uresult);
+452 }
+453 :(before "End Unit Tests")
+454 void test_parse_int() {
+455   CHECK_EQ(0, parse_int("0"));
+456   CHECK_EQ(0, parse_int("0x0"));
+457   CHECK_EQ(0, parse_int("0x0"));
+458   CHECK_EQ(16, parse_int("10"));  // hex always
+459   CHECK_EQ(-1, parse_int("-1"));
+460   CHECK_EQ(-1, parse_int("0xffffffff"));
+461 }
+462 
+463 :(code)
+464 string to_string(const line& inst) {
+465   ostringstream out;
+466   for (int i = 0;  i < SIZE(inst.words);  ++i) {
+467     if (i > 0) out << ' ';
+468     out << inst.words.at(i).original;
+469   }
+470   return out.str();
+471 }
+
+ + + -- cgit 1.4.1-2-gfad0