https://github.com/akkartik/mu/blob/master/subx/030---operands.cc
  1 //: Beginning of "level 2": tagging bytes with metadata around what field of
  2 //: an x86 instruction they're for.
  3 //:
  4 //: The x86 instruction set is variable-length, and how a byte is interpreted
  5 //: affects later instruction boundaries. A lot of the pain in programming
  6 //: machine code stems from computer and programmer going out of sync on what
  7 //: a byte means. The miscommunication is usually not immediately caught, and
  8 //: metastasizes at runtime into kilobytes of misinterpreted instructions.
  9 //:
 10 //: To mitigate these issues, we'll start programming in terms of logical
 11 //: operands rather than physical bytes. Some operands are smaller than a
 12 //: byte, and others may consist of multiple bytes. This layer will correctly
 13 //: pack and order the bytes corresponding to the operands in an instruction.
 14 
 15 :(before "End Help Texts")
 16 put_new(Help, "instructions",
 17   "Each x86 instruction consists of an instruction or opcode and some number\n"
 18   "of operands.\n"
 19   "Each operand has a type. An instruction won't have more than one operand of\n"
 20   "any type.\n"
 21   "Each instruction has some set of allowed operand types. It'll reject others.\n"
 22   "The complete list of operand types: mod, subop, r32 (register), rm32\n"
 23   "(register or memory), scale, index, base, disp8, disp16, disp32, imm8,\n"
 24   "imm32.\n"
 25   "Each of these has its own help page. Try reading 'subx help mod' next.\n"
 26 );
 27 :(before "End Help Contents")
 28 cerr << "  instructions\n";
 29 
 30 :(scenario pack_immediate_constants)
 31 == 0x1
 32 bb  0x2a/imm32
 33 +transform: packing instruction 'bb 0x2a/imm32'
 34 +transform: instruction after packing: 'bb 2a 00 00 00'
 35 +run: copy imm32 0x0000002a to EBX
 36 
 37 //: complete set of valid operand types
 38 
 39 :(before "End Globals")
 40 set<string> Instruction_operands;
 41 :(before "End One-time Setup")
 42 Instruction_operands.insert("subop");
 43 Instruction_operands.insert("mod");
 44 Instruction_operands.insert("rm32");
 45 Instruction_operands.insert("base");
 46 Instruction_operands.insert("index");
 47 Instruction_operands.insert("scale");
 48 Instruction_operands.insert("r32");
 49 Instruction_operands.insert("disp8");
 50 Instruction_operands.insert("disp16");
 51 Instruction_operands.insert("disp32");
 52 Instruction_operands.insert("imm8");
 53 Instruction_operands.insert("imm32");
 54 
 55 :(before "End Help Texts")
 56 init_operand_type_help();
 57 :(code)
 58 void init_operand_type_help() {
 59   put(Help, "mod",
 60     "2-bit operand controlling the _addressing mode_ of many instructions,\n"
 61     "to determine how to compute the _effective address_ to look up memory at\n"
 62     "based on the 'rm32' operand and potentially others.\n"
 63     "\n"
 64     "If mod = 3, just operate on the contents of the register specified by rm32\n"
 65     "            (direct mode).\n"
 66     "If mod = 2, effective address is usually* rm32 + disp32\n"
 67     "            (indirect mode with displacement).\n"
 68     "If mod = 1, effective address is usually* rm32 + disp8\n"
 69     "            (indirect mode with displacement).\n"
 70     "If mod = 0, effective address is usually* rm32 (indirect mode).\n"
 71     "(* - The exception is when rm32 is '4'. Register 4 is the stack pointer (ESP).\n"
 72     "     Using it as an address gets more involved. For more details,\n"
 73     "     try reading the help pages for 'base', 'index' and 'scale'.)\n"
 74     "\n"
 75     "For complete details, spend some time with two tables in the IA-32 software\n"
 76     "developer's manual that are also included in this repo:\n"
 77     "  - modrm.pdf: volume 2, table 2-2, \"32-bit addressing with the ModR/M byte.\".\n"
 78     "  - sib.pdf: volume 2, table 2-3, \"32-bit addressing with the SIB byte.\".\n"
 79   );
 80   put(Help, "subop",
 81     "Additional 3-bit operand for determining the instruction when the opcode is 81, 8f or ff.\n"
 82     "Can't coexist with operand of type 'r32' in a single instruction, because the two use the same bits.\n"
 83   );
 84   put(Help, "r32",
 85     "3-bit operand specifying a register operand used directly, without any further addressing modes.\n"
 86   );
 87   put(Help, "rm32",
 88     "32-bit value in register or memory. The precise details of its construction\n"
 89     "depend on the eponymous 3-bit 'rm32' operand, the 'mod' operand, and also\n"
 90     "potentially the 'SIB' operands ('scale', 'index' and 'base') and a displacement\n"
 91     "('disp8' or 'disp32').\n"
 92     "\n"
 93     "For complete details, spend some time with two tables in the IA-32 software\n"
 94     "developer's manual that are also included in this repo:\n"
 95     "  - modrm.pdf: volume 2, table 2-2, \"32-bit addressing with the ModR/M byte.\".\n"
 96     "  - sib.pdf: volume 2, table 2-3, \"32-bit addressing with the SIB byte.\".\n"
 97   );
 98   put(Help, "base",
 99     "Additional 3-bit operand (when 'rm32' is 4, unless 'mod' is 3) specifying the\n"
100     "register containing an address to look up.\n"
101     "This address may be further modified by 'index' and 'scale' operands.\n"
102     "  effective address = base + index*scale + displacement (disp8 or disp32)\n"
103     "For complete details, spend some time with the IA-32 software developer's manual,\n"
104     "volume 2, table 2-3, \"32-bit addressing with the SIB byte\".\n"
105     "It is included in this repository as 'sib.pdf'.\n"
106   );
107   put(Help, "index",
108     "Optional 3-bit operand (when 'rm32' is 4 unless 'mod' is 3) that can be added to\n"
109     "the 'base' operand to compute the 'effective address' at which to look up memory.\n"
110     "  effective address = base + index*scale + displacement (disp8 or disp32)\n"
111     "For complete details, spend some time with the IA-32 software developer's manual,\n"
112     "volume 2, table 2-3, \"32-bit addressing with the SIB byte\".\n"
113     "It is included in this repository as 'sib.pdf'.\n"
114   );
115   put(Help, "scale",
116     "Optional 2-bit operand (when 'rm32' is 4 unless 'mod' is 3) that encodes a\n"
117     "power of 2 to be multiplied to the 'index' operand before adding the result to\n"
118     "the 'base' operand to compute the _effective address_ to operate on.\n"
119     "  effective address = base + index * scale + displacement (disp8 or disp32)\n"
120     "\n"
121     "When scale is 0, use index unmodified.\n"
122     "When scale is 1, multiply index by 2.\n"
123     "When scale is 2, multiply index by 4.\n"
124     "When scale is 3, multiply index by 8.\n"
125     "\n"
126     "For complete details, spend some time with the IA-32 software developer's manual,\n"
127     "volume 2, table 2-3, \"32-bit addressing with the SIB byte\".\n"
128     "It is included in this repository as 'sib.pdf'.\n"
129   );
130   put(Help, "disp8",
131     "8-bit value to be added in many instructions.\n"
132   );
133   put(Help, "disp16",
134     "16-bit value to be added in many instructions.\n"
135   );
136   put(Help, "disp32",
137     "32-bit value to be added in many instructions.\n"
138   );
139   put(Help, "imm8",
140     "8-bit value for many instructions.\n"
141   );
142   put(Help, "imm32",
143     "32-bit value for many instructions.\n"
144   );
145 }
146 
147 //:: transform packing operands into bytes in the right order
148 
149 :(after "Begin Transforms")
150 // Begin Level-2 Transforms
151 Transform.push_back(pack_operands);
152 // End Level-2 Transforms
153 
154 :(code)
155 void pack_operands(program& p) {
156   if (p.segments.empty()) return;
157   segment& code = p.segments.at(0);
158   // Pack Operands(segment code)
159   trace(3, "transform") << "-- pack operands" << end();
160   for (int i = 0;  i < SIZE(code.lines);  ++i) {
161     line& inst = code.lines.at(i);
162     if (all_hex_bytes(inst)) continue;
163     trace(99, "transform") << "packing instruction '" << to_string(/*with metadata*/inst) << "'" << end();
164     pack_operands(inst);
165     trace(99, "transform") << "instruction after packing: '" << to_string(/*without metadata*/inst.words) << "'" << end();
166   }
167 }
168 
169 void pack_operands(line& inst) {
170   line new_inst;
171   add_opcodes(inst, new_inst);
172   add_modrm_byte(inst, new_inst);
173   add_sib_byte(inst, new_inst);
174   add_disp_bytes(inst, new_inst);
175   add_imm_bytes(inst, new_inst);
176   inst.words.swap(new_inst.words);
177 }
178 
179 void add_opcodes(const line& in, line& out) {
180   out.words.push_back(in.words.at(0));
181   if (in.words.at(0).data == "0f" || in.words.at(0).data == "f2" || in.words.at(0).data == "f3")
182     out.words.push_back(in.words.at(1));
183   if (in.words.at(0).data == "f3" && in.words.at(1).data == "0f")
184     out.words.push_back(in.words.at(2));
185   if (in.words.at(0).data == "f2" && in.words.at(1).data == "0f")
186     out.words.push_back(in.words.at(2));
187 }
188 
189 void add_modrm_byte(const line& in, line& out) {
190   uint8_t mod=0, reg_subop=0, rm32=0;
191   bool emit = false;
192   for (int i = 0;  i < SIZE(in.words);  ++i) {
193     const word& curr = in.words.at(i);
194     if (has_operand_metadata(curr, "mod")) {
195       mod = hex_byte(curr.data);
196       emit = true;
197     }
198     else if (has_operand_metadata(curr, "rm32")) {
199       rm32 = hex_byte(curr.data);
200       emit = true;
201     }
202     else if (has_operand_metadata(curr, "r32")) {
203       reg_subop = hex_byte(curr.data);
204       emit = true;
205     }
206     else if (has_operand_metadata(curr, "subop")) {
207       reg_subop = hex_byte(curr.data);
208       emit = true;
209     }
210   }
211   if (emit)
212     out.words.push_back(hex_byte_text((mod << 6) | (reg_subop << 3) | rm32));
213 }
214 
215 void add_sib_byte(const line& in, line& out) {
216   uint8_t scale=0, index=0, base=0;
217   bool emit = false;
218   for (int i = 0;  i < SIZE(in.words);  ++i) {
219     const word& curr = in.words.at(i);
220     if (has_operand_metadata(curr, "scale")) {
221       scale = hex_byte(curr.data);
222       emit = true;
223     }
224     else if (has_operand_metadata(curr, "index")) {
225       index = hex_byte(curr.data);
226       emit = true;
227     }
228     else if (has_operand_metadata(curr, "base")) {
229       base = hex_byte(curr.data);
230       emit = true;
231     }
232   }
233   if (emit)
234     out.words.push_back(hex_byte_text((scale << 6) | (index << 3) | base));
235 }
236 
237 void add_disp_bytes(const line& in, line& out) {
238   for (int i = 0;  i < SIZE(in.words);  ++i) {
239     const word& curr = in.words.at(i);
240     if (has_operand_metadata(curr, "disp8"))
241       emit_hex_bytes(out, curr, 1);
242     if (has_operand_metadata(curr, "disp16"))
243       emit_hex_bytes(out, curr, 2);
244     else if (has_operand_metadata(curr, "disp32"))
245       emit_hex_bytes(out, curr, 4);
246   }
247 }
248 
249 void add_imm_bytes(const line& in, line& out) {
250   for (int i = 0;  i < SIZE(in.words);  ++i) {
251     const word& curr = in.words.at(i);
252     if (has_operand_metadata(curr, "imm8"))
253       emit_hex_bytes(out, curr, 1);
254     else if (has_operand_metadata(curr, "imm32"))
255       emit_hex_bytes(out, curr, 4);
256   }
257 }
258 
259 void emit_hex_bytes(line& out, const word& w, int num) {
260   assert(num <= 4);
261   bool is_number = looks_like_hex_int(w.data);
262   if (num == 1 || !is_number) {
263     out.words.push_back(w);  // preserve existing metadata
264     if (is_number)
265       out.words.back().data = hex_byte_to_string(parse_int(w.data));
266     return;
267   }
268   emit_hex_bytes(out, static_cast<uint32_t>(parse_int(w.data)), num);
269 }
270 
271 void emit_hex_bytes(line& out, uint32_t val, int num) {
272   assert(num <= 4);
273   for (int i = 0;  i < num;  ++i) {
274     out.words.push_back(hex_byte_text(val & 0xff));
275     val = val >> 8;
276   }
277 }
278 
279 word hex_byte_text(uint8_t val) {
280   word result;
281   result.data = hex_byte_to_string(val);
282   result.original = result.data+"/auto";
283   return result;
284 }
285 
286 string hex_byte_to_string(uint8_t val) {
287   ostringstream out;
288   // uint8_t prints without padding, but int8_t will expand to 32 bits again
289   out << HEXBYTE << NUM(val);
290   return out.str();
291 }
292 
293 string to_string(const vector<word>& in) {
294   ostringstream out;
295   for (int i = 0;  i < SIZE(in);  ++i) {
296     if (i > 0) out << ' ';
297     out << in.at(i).data;
298   }
299   return out.str();
300 }
301 
302 :(before "End Unit Tests")
303 void test_preserve_metadata_when_emitting_single_byte() {
304   word in;
305   in.data = "f0";
306   in.original = "f0/foo";
307   line out;
308   emit_hex_bytes(out, in, 1);
309   CHECK_EQ(out.words.at(0).data, "f0");
310   CHECK_EQ(out.words.at(0).original, "f0/foo");
311 }
312 
313 :(scenario pack_disp8)
314 == 0x1
315 74 2/disp8  # jump 2 bytes away if ZF is set
316 +transform: packing instruction '74 2/disp8'
317 +transform: instruction after packing: '74 02'
318 
319 :(scenarios transform)
320 :(scenario pack_disp8_negative)
321 == 0x1
322 # running this will cause an infinite loop
323 74 -1/disp8  # jump 1 byte before if ZF is set
324 +transform: packing instruction '74 -1/disp8'
325 +transform: instruction after packing: '74 ff'
326 :(scenarios run)
327 
328 //: helper for scenario
329 :(code)
330 void transform(const string& text_bytes) {
331   program p;
332   istringstream in(text_bytes);
333   parse(in, p);
334   if (trace_contains_errors()) return;
335   transform(p);
336 }
337 
338 :(scenario pack_modrm_imm32)
339 == 0x1
340 # instruction                     effective address                                                   operand     displacement    immediate
341 # op          subop               mod             rm32          base        index         scale       r32
 342 # 1-3 bytes   3 bits              2 bits          3 bits        3 bits      3 bits        2 bits      3 bits      0/1/2/4 bytes   0/1/2/4 bytes
343   81          0/add/subop         3/mod/direct    3/ebx/rm32                                                                      1/imm32           # add 1 to EBX
344 +transform: packing instruction '81 0/add/subop 3/mod/direct 3/ebx/rm32 1/imm32'
345 +transform: instruction after packing: '81 c3 01 00 00 00'
346 
347 :(scenario pack_imm32_large)
348 == 0x1
349 b9  0x080490a7/imm32
350 +transform: packing instruction 'b9 0x080490a7/imm32'
351 +transform: instruction after packing: 'b9 a7 90 04 08'
352 
353 :(scenario pack_immediate_constants_hex)
354 == 0x1
355 b9  0x2a/imm32
356 +transform: packing instruction 'b9 0x2a/imm32'
357 +transform: instruction after packing: 'b9 2a 00 00 00'
358 +run: copy imm32 0x0000002a to ECX
359 
360 :(scenarios transform)
361 :(scenario pack_silently_ignores_non_hex)
362 % Hide_errors = true;
363 == 0x1
364 b9  foo/imm32
365 +transform: packing instruction 'b9 foo/imm32'
366 # no change (we're just not printing metadata to the trace)
367 +transform: instruction after packing: 'b9 foo'
368 :(scenarios run)
369 
370 :(scenario pack_flags_bad_hex)
371 % Hide_errors = true;
372 == 0x1
373 b9  0xfoo/imm32
374 +error: not a number: 0xfoo
375 
376 //:: helpers
377 
378 :(code)
379 bool all_hex_bytes(const line& inst) {
380   for (int i = 0;  i < SIZE(inst.words);  ++i)
381     if (!is_hex_byte(inst.words.at(i)))
382       return false;
383   return true;
384 }
385 
386 bool is_hex_byte(const word& curr) {
387   if (contains_any_operand_metadata(curr))
388     return false;
389   if (SIZE(curr.data) != 2)
390     return false;
391   if (curr.data.find_first_not_of("0123456789abcdefABCDEF") != string::npos)
392     return false;
393   return true;
394 }
395 
396 bool contains_any_operand_metadata(const word& word) {
397   for (int i = 0;  i < SIZE(word.metadata);  ++i)
398     if (Instruction_operands.find(word.metadata.at(i)) != Instruction_operands.end())
399       return true;
400   return false;
401 }
402 
403 bool has_operand_metadata(const line& inst, const string& m) {
404   bool result = false;
405   for (int i = 0;  i < SIZE(inst.words);  ++i) {
406     if (!has_operand_metadata(inst.words.at(i), m)) continue;
407     if (result) {
408       raise << "'" << to_string(inst) << "' has conflicting " << m << " operands\n" << end();
409       return false;
410     }
411     result = true;
412   }
413   return result;
414 }
415 
416 bool has_operand_metadata(const word& w, const string& m) {
417   bool result = false;
418   bool metadata_found = false;
419   for (int i = 0;  i < SIZE(w.metadata);  ++i) {
420     const string& curr = w.metadata.at(i);
421     if (Instruction_operands.find(curr) == Instruction_operands.end()) continue;  // ignore unrecognized metadata
422     if (metadata_found) {
423       raise << "'" << w.original << "' has conflicting operand types; it should have only one\n" << end();
424       return false;
425     }
426     metadata_found = true;
427     result = (curr == m);
428   }
429   return result;
430 }
431 
432 word metadata(const line& inst, const string& m) {
433   for (int i = 0;  i < SIZE(inst.words);  ++i)
434     if (has_operand_metadata(inst.words.at(i), m))
435       return inst.words.at(i);
436   assert(false);
437 }
438 
// Cheap check, based only on the first character, for whether 's' is meant to
// be a number: a leading sign or a decimal digit qualifies.
// Returns false for the empty string.
bool looks_like_hex_int(const string& s) {
  if (s.empty()) return false;
  if (s.at(0) == '-' || s.at(0) == '+') return true;
  // isdigit() is only defined for values representable as unsigned char (or
  // EOF), so convert first; a plain char may be negative for non-ASCII bytes.
  if (isdigit(static_cast<unsigned char>(s.at(0)))) return true;  // includes '0x' prefix
  // End looks_like_hex_int(s) Detectors
  return false;
}
446 
447 :(code)
448 string to_string(const line& inst) {
449   ostringstream out;
450   for (int i = 0;  i < SIZE(inst.words);  ++i) {
451     if (i > 0) out << ' ';
452     out << inst.words.at(i).original;
453   }
454   return out.str();
455 }