1 //:: simulated x86 registers
  2 
  3 :(before "End Types")
  4 enum {
  5   EAX,
  6   ECX,
  7   EDX,
  8   EBX,
  9   ESP,
 10   EBP,
 11   ESI,
 12   EDI,
 13   NUM_INT_REGISTERS,
 14 };
 15 union reg {
 16   int32_t i;
 17   uint32_t u;
 18 };
 19 :(before "End Globals")
 20 reg Reg[NUM_INT_REGISTERS] = { {0} };
 21 uint32_t EIP = 0;
 22 :(before "End Reset")
 23 bzero(Reg, sizeof(Reg));
 24 EIP = 0;
 25 
 26 //:: simulated flag registers; just a subset that we care about
 27 
 28 :(before "End Globals")
 29 bool SF = false;  // sign flag
 30 bool ZF = false;  // zero flag
 31 bool OF = false;  // overflow flag
 32 :(before "End Reset")
 33 SF = ZF = OF = false;
 34 
 35 //: how the flag registers are updated after each instruction
 36 
 37 :(before "End Includes")
 38 // Combine 'arg1' and 'arg2' with arithmetic operation 'op' and store the
 39 // result in 'arg1', then update flags.
 40 // beware: no side-effects in args
 41 #define BINARY_ARITHMETIC_OP(op, arg1, arg2) { \
 42   /* arg1 and arg2 must be signed */ \
 43   int64_t tmp = arg1 op arg2; \
 44   arg1 = arg1 op arg2; \
 45   trace(2, "run") << "storing 0x" << HEXWORD << arg1 << end(); \
 46   SF = (arg1 < 0); \
 47   ZF = (arg1 == 0); \
 48   OF = (arg1 != tmp); \
 49 }
 50 
 51 // Combine 'arg1' and 'arg2' with bitwise operation 'op' and store the result
 52 // in 'arg1', then update flags.
 53 #define BINARY_BITWISE_OP(op, arg1, arg2) { \
 54   /* arg1 and arg2 must be unsigned */ \
 55   arg1 = arg1 op arg2; \
 56   trace(2, "run") << "storing 0x" << HEXWORD << arg1 << end(); \
 57   SF = (arg1 >> 31); \
 58   ZF = (arg1 == 0); \
 59   OF = false; \
 60 }
 61 
 62 //:: simulated RAM
 63 
 64 :(before "End Globals")
 65 vector<uint8_t> Mem;
 66 uint32_t End_of_program = 0;
 67 :(before "End Reset")
 68 Mem.clear();
 69 Mem.resize(1024);
 70 End_of_program = 0;
 71 :(before "End Includes")
 72 // depends on Mem being laid out contiguously (so you can't use a map, etc.)
 73 // and on the host also being little-endian
 74 #define SET_WORD_IN_MEM(addr, val)  *reinterpret_cast<int32_t*>(&Mem.at(addr)) = val;
 75 
 76 //:: core interpreter loop
 77 
 78 :(scenario add_imm32_to_eax)
 79 # In scenarios, programs are a series of hex bytes, each (variable-length)
 80 # instruction on one line.
 81 #
 82 # x86 instructions consist of the following parts (see cheatsheet.pdf):
 83 #   opcode        ModR/M                    SIB                   displacement    immediate
 84 #   instruction   mod, reg, Reg/Mem bits    scale, index, base
 85 #   1-3 bytes     0/1 byte                  0/1 byte              0/1/2/4 bytes   0/1/2/4 bytes
 86   ¦ 05                                                                            0a 0b 0c 0d  # add 0x0d0c0b0a to EAX
 87 # All hex bytes must be exactly 2 characters each. No '0x' prefixes.
 88 +load: 1 -> 05
 89 +load: 2 -> 0a
 90 +load: 3 -> 0b
 91 +load: 4 -> 0c
 92 +load: 5 -> 0d
 93 +run: add imm32 0x0d0c0b0a to reg EAX
 94 +run: storing 0x0d0c0b0a
 95 
 96 :(code)
 97 // helper for tests: load a program into memory from a textual representation
 98 // of its bytes, and run it
 99 void run(const string& text_bytes) {
100   load_program(text_bytes);
101   EIP = 1;  // preserve null pointer
102   while (EIP < End_of_program)
103   ¦ run_one_instruction();
104 }
105 
106 // skeleton of how x86 instructions are decoded
107 void run_one_instruction() {
108   uint8_t op=0, op2=0, op3=0;
109   switch (op = next()) {
110   case 0xf4:  // hlt
111   ¦ EIP = End_of_program;
112   ¦ break;
113   // our first opcode
114   case 0x05: {  // add imm32 to EAX
115   ¦ int32_t arg2 = imm32();
116   ¦ trace(2, "run") << "add imm32 0x" << HEXWORD << arg2 << " to reg EAX" << end();
117   ¦ BINARY_ARITHMETIC_OP(+, Reg[EAX].i, arg2);
118   ¦ break;
119   }
120   // End Single-Byte Opcodes
121   case 0x0f:
122   ¦ switch(op2 = next()) {
123   ¦ // End Two-Byte Opcodes Starting With 0f
124   ¦ default:
125   ¦ ¦ cerr << "unrecognized second opcode after 0f: " << HEXBYTE << NUM(op2) << '\n';
126   ¦ ¦ exit(1);
127   ¦ }
128   ¦ break;
129   case 0xf3:
130   ¦ switch(op2 = next()) {
131   ¦ // End Two-Byte Opcodes Starting With f3
132   ¦ case 0x0f:
133   ¦ ¦ switch(op3 = next()) {
134   ¦ ¦ // End Three-Byte Opcodes Starting With f3 0f
135   ¦ ¦ default:
136   ¦ ¦ ¦ cerr << "unrecognized third opcode after f3 0f: " << HEXBYTE << NUM(op3) << '\n';
137   ¦ ¦ ¦ exit(1);
138   ¦ ¦ }
139   ¦ ¦ break;
140   ¦ default:
141   ¦ ¦ cerr << "unrecognized second opcode after f3: " << HEXBYTE << NUM(op2) << '\n';
142   ¦ ¦ exit(1);
143   ¦ }
144   ¦ break;
145   default:
146   ¦ cerr << "unrecognized opcode: " << HEXBYTE << NUM(op) << '\n';
147   ¦ exit(1);
148   }
149 }
150 
151 void load_program(const string& text_bytes) {
152   uint32_t addr = 1;
153   istringstream in(text_bytes);
154   in >> std::noskipws;
155   while (has_data(in)) {
156   ¦ char c1 = next_hex_byte(in);
157   ¦ if (c1 == '\0') break;
158   ¦ if (!has_data(in)) {
159   ¦ ¦ raise << "input program truncated mid-byte\n" << end();
160   ¦ ¦ return;
161   ¦ }
162   ¦ char c2 = next_hex_byte(in);
163   ¦ if (c2 == '\0') {
164   ¦ ¦ raise << "input program truncated mid-byte\n" << end();
165   ¦ ¦ return;
166   ¦ }
167   ¦ Mem.at(addr) = to_byte(c1, c2);
168   ¦ trace(99, "load") << addr << " -> " << HEXBYTE << NUM(Mem.at(addr)) << end();
169   ¦ addr++;
170   }
171   End_of_program = addr;
172 }
173 
174 char next_hex_byte(istream& in) {
175   while (has_data(in)) {
176   ¦ char c = '\0';
177   ¦ in >> c;
178   ¦ if (c == ' ' || c == '\n') continue;
179   ¦ while (c == '#') {
180   ¦ ¦ while (has_data(in)) {
181   ¦ ¦ ¦ in >> c;
182   ¦ ¦ ¦ if (c == '\n') {
183   ¦ ¦ ¦ ¦ in >> c;
184   ¦ ¦ ¦ ¦ break;
185   ¦ ¦ ¦ }
186   ¦ ¦ }
187   ¦ }
188   ¦ if (c >= '0' && c <= '9') return c;
189   ¦ else if (c >= 'a' && c <= 'f') return c;
190   ¦ else if (c >= 'A' && c <= 'F') return tolower(c);
191   ¦ // disallow any non-hex characters, including a '0x' prefix
192   ¦ if (!isspace(c)) {
193   ¦ ¦ raise << "invalid non-hex character '" << c << "'\n" << end();
194   ¦ ¦ break;
195   ¦ }
196   }
197   return '\0';
198 }
199 
200 uint8_t to_byte(char hex_byte1, char hex_byte2) {
201   return to_hex_num(hex_byte1)*16 + to_hex_num(hex_byte2);
202 }
// Return the value (0-15) of a single hex digit. Accepts '0'-'9', 'a'-'f' and
// 'A'-'F'. Any other character is a bug in the caller: it trips an assertion,
// or yields 0 when assertions are compiled out.
uint8_t to_hex_num(char c) {
  if (c >= '0' && c <= '9') return static_cast<uint8_t>(c - '0');
  if (c >= 'a' && c <= 'f') return static_cast<uint8_t>(c - 'a' + 10);
  if (c >= 'A' && c <= 'F') return static_cast<uint8_t>(c - 'A' + 10);
  assert(false);
  return 0;
}
209 
210 inline uint8_t next() {
211   return Mem.at(EIP++);
212 }
213 
214 // read a 32-bit immediate in little-endian order from the instruction stream
215 int32_t imm32() {
216   int32_t result = next();
217   result |= (next()<<8);
218   result |= (next()<<16);
219   result |= (next()<<24);
220   return result;
221 }
222 
223 :(before "End Includes")
#include <stdint.h>
#include <iomanip>
// Stream manipulators for printing the next integer as zero-padded hex:
// 2 digits with HEXBYTE, 8 digits with HEXWORD.
// The width applies to the next item only; hex and the fill character stay in
// effect on the stream afterwards.
#define HEXBYTE  std::setfill('0') << std::setw(2) << std::hex
#define HEXWORD  std::setfill('0') << std::setw(8) << std::hex
// iostream prints uint8_t as a character; convert it to an int to print it as
// a number
#define NUM(X) static_cast<int>(X)