1 //:: simulated x86 registers
  2 
  3 :(before "End Types")
  4 enum {
  5   EAX,
  6   ECX,
  7   EDX,
  8   EBX,
  9   ESP,
 10   EBP,
 11   ESI,
 12   EDI,
 13   NUM_INT_REGISTERS,
 14 };
// Contents of one register: the same 32 bits, viewable as either a signed or
// an unsigned integer.
union reg {
  int32_t i;  // signed view
  uint32_t u;  // unsigned view
};
 19 :(before "End Globals")
// the integer registers, indexed by register name: R[EAX], R[ECX], ...
reg R[NUM_INT_REGISTERS] = { {0} };
// address of the next instruction byte to fetch (see next())
uint32_t EIP = 0;
 22 :(before "End Reset")
 23 bzero(R, sizeof(R));
 24 EIP = 0;
 25 
 26 //:: simulated flag registers; just a subset that we care about
 27 
 28 :(before "End Globals")
// Updated by BINARY_ARITHMETIC_OP and BINARY_BITWISE_OP.
bool SF = false;  // sign flag: result was negative (top bit set)
bool ZF = false;  // zero flag: result was zero
bool OF = false;  // overflow flag: arithmetic result differs from the exact one; always cleared by bitwise ops
 32 :(before "End Reset")
 33 SF = ZF = OF = false;
 34 
 35 //: how the flag registers are updated after each instruction
 36 
 37 :(before "End Includes")
 38 // beware: no side-effects in args
 39 #define BINARY_ARITHMETIC_OP(op, arg1, arg2) { \
 40   /* arg1 and arg2 must be signed */ \
 41   int64_t tmp = arg1 op arg2; \
 42   arg1 = arg1 op arg2; \
 43   SF = (arg1 < 0); \
 44   ZF = (arg1 == 0); \
 45   OF = (arg1 != tmp); \
 46 }
 47 
 48 #define BINARY_BITWISE_OP(op, arg1, arg2) { \
 49   /* arg1 and arg2 must be unsigned */ \
 50   arg1 = arg1 op arg2; \
 51   SF = (arg1 >> 31); \
 52   ZF = (arg1 == 0); \
 53   OF = false; \
 54 }
 55 
 56 //:: simulated RAM
 57 
 58 :(before "End Globals")
// simulated RAM: maps an address to the byte stored there
map<uint32_t, uint8_t> Memory;
// one past the address of the last byte written by load_program(); run()
// stops once EIP reaches it
uint32_t End_of_program = 0;
 61 :(before "End Reset")
// forget any previously loaded program
Memory.clear();
End_of_program = 0;
 64 
 65 //:: core interpreter loop
 66 
 67 :(scenario add_imm32_to_eax)
 68 # In scenarios, programs are a series of hex bytes, each (variable-length)
 69 # instruction on one line.
 70 #
 71 # x86 instructions consist of the following parts (see cheatsheet.pdf):
 72 #   opcode        ModRM                 SIB                   displacement    immediate
 73 #   instruction   mod, reg, R/M bits    scale, index, base
 74 #   1-3 bytes     0/1 byte              0/1 byte              0/1/2/4 bytes   0/1/2/4 bytes
 75   ¦ 0x05                                                                      0a 0b 0c 0d  # add 0x0d0c0b0a to EAX
 76 +load: 1 -> 05
 77 +load: 2 -> 0a
 78 +load: 3 -> 0b
 79 +load: 4 -> 0c
 80 +load: 5 -> 0d
 81 +run: add imm32 0x0d0c0b0a to reg EAX
 82 +reg: storing 0x0d0c0b0a in reg EAX
 83 
 84 :(code)
 85 // helper for tests: load a program into memory from a textual representation
 86 // of its bytes, and run it
 87 void run(const string& text_bytes) {
 88   load_program(text_bytes);
 89   EIP = 1;  // preserve null pointer
 90   while (EIP < End_of_program)
 91   ¦ run_one_instruction();
 92 }
 93 
 94 void load_program(const string& text_bytes) {
 95   uint32_t addr = 1;
 96   // we'll use C's 'strtol` to parse ASCII hex bytes
 97   // strtol needs a char*, so we grab the buffer backing the string object
 98   char* curr = const_cast<char*>(&text_bytes[0]);   // non-portable, but blessed by Herb Sutter (http://herbsutter.com/2008/04/07/cringe-not-vectors-are-guaranteed-to-be-contiguous/#comment-483)
 99   char* max = curr + strlen(curr);
100   while (curr < max) {
101   ¦ // skip whitespace
102   ¦ while (*curr == ' ' || *curr == '\n') ++curr;
103   ¦ // skip comments
104   ¦ if (*curr == '#') {
105   ¦ ¦ while (*curr != '\n') {
106   ¦ ¦ ¦ ++curr;
107   ¦ ¦ ¦ if (curr >= max) break;
108   ¦ ¦ }
109   ¦ ¦ ++curr;
110   ¦ ¦ continue;
111   ¦ }
112   ¦ put(Memory, addr, strtol(curr, &curr, /*hex*/16));
113   ¦ trace(99, "load") << addr << " -> " << HEXBYTE << static_cast<unsigned int>(get_or_insert(Memory, addr)) << end();  // ugly that iostream doesn't print uint8_t as an integer
114   ¦ addr++;
115   }
116   End_of_program = addr;
117 }
118 
119 // skeleton of how x86 instructions are decoded
120 void run_one_instruction() {
121   uint8_t op=0, op2=0, op3=0;
122   switch(op = next()) {
123   // our first opcode
124   case 0xf4:  // hlt
125   ¦ EIP = End_of_program;
126   ¦ break;
127   case 0x05: {  // add imm32 to EAX
128   ¦ int32_t arg2 = imm32();
129   ¦ trace(2, "run") << "add imm32 0x" << HEXWORD << arg2 << " to reg EAX" << end();
130   ¦ BINARY_ARITHMETIC_OP(+, R[EAX].i, arg2);
131   ¦ trace(98, "reg") << "storing 0x" << HEXWORD << R[EAX].i << " in reg EAX" << end();
132   ¦ break;
133   }
134   // End Single-Byte Opcodes
135   case 0x0f:
136   ¦ switch(op2 = next()) {
137   ¦ // End Two-Byte Opcodes Starting With 0f
138   ¦ default:
139   ¦ ¦ cerr << "unrecognized second opcode after 0f: " << std::hex << static_cast<int>(op2) << '\n';
140   ¦ ¦ exit(1);
141   ¦ }
142   ¦ break;
143   case 0xf3:
144   ¦ switch(op2 = next()) {
145   ¦ // End Two-Byte Opcodes Starting With f3
146   ¦ case 0x0f:
147   ¦ ¦ switch(op3 = next()) {
148   ¦ ¦ // End Three-Byte Opcodes Starting With f3 0f
149   ¦ ¦ default:
150   ¦ ¦ ¦ cerr << "unrecognized third opcode after f3 0f: " << std::hex << static_cast<int>(op3) << '\n';
151   ¦ ¦ ¦ exit(1);
152   ¦ ¦ }
153   ¦ ¦ break;
154   ¦ default:
155   ¦ ¦ cerr << "unrecognized second opcode after f3: " << std::hex << static_cast<int>(op2) << '\n';
156   ¦ ¦ exit(1);
157   ¦ }
158   ¦ break;
159   default:
160   ¦ cerr << "unrecognized opcode: " << std::hex << static_cast<int>(op) << '\n';
161   ¦ exit(1);
162   }
163 }
164 
165 uint8_t next() {
166   return get_or_insert(Memory, EIP++);
167 }
168 
169 // read a 32-bit immediate in little-endian order from the instruction stream
170 int32_t imm32() {
171   int32_t result = next();
172   result |= (next()<<8);
173   result |= (next()<<16);
174   result |= (next()<<24);
175   return result;
176 }
177 
178 :(before "End Includes")
179 #include <iomanip>
// Stream manipulators for printing the next integer as zero-padded hex:
// 2 digits for a byte, 8 for a 32-bit word.
// std::setw applies only to the next item written, while std::hex and
// std::setfill stay in effect on the stream afterwards.
// A uint8_t must be cast to a wider integer first, or iostream prints it as a
// character.
#define HEXBYTE  std::hex << std::setw(2) << std::setfill('0')
#define HEXWORD  std::hex << std::setw(8) << std::setfill('0')
182 #include <stdint.h>