1 //:: simulated x86 registers; just a subset
  2 //:    assume segment registers are hard-coded to 0
  3 //:    no floating-point, MMX, etc. yet
  4 
  5 :(before "End Types")
  6 enum {
  7   EAX,
  8   ECX,
  9   EDX,
 10   EBX,
 11   ESP,
 12   EBP,
 13   ESI,
 14   EDI,
 15   NUM_INT_REGISTERS,
 16 };
 17 union reg {
 18   int32_t i;
 19   uint32_t u;
 20 };
 21 :(before "End Globals")
 22 reg Reg[NUM_INT_REGISTERS] = { {0} };
 23 uint32_t EIP = 0;
 24 :(before "End Reset")
 25 bzero(Reg, sizeof(Reg));
 26 EIP = 0;
 27 
 28 //:: simulated flag registers; just a subset that we care about
 29 
 30 :(before "End Globals")
 31 bool SF = false;  // sign flag
 32 bool ZF = false;  // zero flag
 33 bool OF = false;  // overflow flag
 34 :(before "End Reset")
 35 SF = ZF = OF = false;
 36 
 37 //: how the flag registers are updated after each instruction
 38 
 39 :(before "End Includes")
 40 // Combine 'arg1' and 'arg2' with arithmetic operation 'op' and store the
 41 // result in 'arg1', then update flags.
 42 // beware: no side-effects in args
 43 #define BINARY_ARITHMETIC_OP(op, arg1, arg2) { \
 44   /* arg1 and arg2 must be signed */ \
 45   int64_t tmp = arg1 op arg2; \
 46   arg1 = arg1 op arg2; \
 47   trace(2, "run") << "storing 0x" << HEXWORD << arg1 << end(); \
 48   SF = (arg1 < 0); \
 49   ZF = (arg1 == 0); \
 50   OF = (arg1 != tmp); \
 51 }
 52 
 53 // Combine 'arg1' and 'arg2' with bitwise operation 'op' and store the result
 54 // in 'arg1', then update flags.
 55 #define BINARY_BITWISE_OP(op, arg1, arg2) { \
 56   /* arg1 and arg2 must be unsigned */ \
 57   arg1 = arg1 op arg2; \
 58   trace(2, "run") << "storing 0x" << HEXWORD << arg1 << end(); \
 59   SF = (arg1 >> 31); \
 60   ZF = (arg1 == 0); \
 61   OF = false; \
 62 }
 63 
 64 //:: simulated RAM
 65 
 66 :(before "End Globals")
 67 vector<uint8_t> Mem;
 68 uint32_t End_of_program = 0;
 69 :(before "End Reset")
 70 Mem.clear();
 71 Mem.resize(1024);
 72 End_of_program = 0;
 73 :(before "End Includes")
 74 // depends on Mem being laid out contiguously (so you can't use a map, etc.)
 75 // and on the host also being little-endian
 76 #define SET_WORD_IN_MEM(addr, val)  *reinterpret_cast<int32_t*>(&Mem.at(addr)) = val;
 77 
 78 //:: core interpreter loop
 79 
 80 :(scenario add_imm32_to_eax)
 81 # In scenarios, programs are a series of hex bytes, each (variable-length)
 82 # instruction on one line.
 83 #
 84 # x86 instructions consist of the following parts (see cheatsheet.pdf):
 85 #   opcode        ModR/M                    SIB                   displacement    immediate
 86 #   instruction   mod, reg, Reg/Mem bits    scale, index, base
 87 #   1-3 bytes     0/1 byte                  0/1 byte              0/1/2/4 bytes   0/1/2/4 bytes
 88     05                                                                            0a 0b 0c 0d  # add 0x0d0c0b0a to EAX
 89 # All hex bytes must be exactly 2 characters each. No '0x' prefixes.
 90 +load: 1 -> 05
 91 +load: 2 -> 0a
 92 +load: 3 -> 0b
 93 +load: 4 -> 0c
 94 +load: 5 -> 0d
 95 +run: add imm32 0x0d0c0b0a to reg EAX
 96 +run: storing 0x0d0c0b0a
 97 
 98 :(code)
 99 // helper for tests: load a program into memory from a textual representation
100 // of its bytes, and run it
101 void run(const string& text_bytes) {
102   load_program(text_bytes);
103   EIP = 1;  // preserve null pointer
104   while (EIP < End_of_program)
105     run_one_instruction();
106 }
107 
// skeleton of how x86 instructions are decoded
//
// Traces the current EIP, fetches one opcode byte with next() and dispatches
// on it. Opcodes 0x0f and 0xf3 are prefixes for multi-byte opcodes, so they
// fetch a further byte (or two, for f3 0f) and dispatch again.
// Any opcode without a case prints a message to cerr and exits the process
// with status 1.
//
// NOTE(review): the comment lines starting with 'End' inside the switches
// look like anchors that ':(before "...")' directives in other layers use to
// insert more cases -- keep them exactly as they are.
void run_one_instruction() {
  // one variable per opcode byte, in the order they're fetched
  uint8_t op=0, op2=0, op3=0;
  trace(2, "run") << "inst: 0x" << HEXWORD << EIP << end();
  switch (op = next()) {
  case 0xf4:  // hlt
    // jumping to the end of the program makes the loop in run() terminate
    EIP = End_of_program;
    break;
  // our first opcode
  case 0x05: {  // add imm32 to EAX
    // the 4 bytes after the opcode are the immediate operand
    int32_t arg2 = imm32();
    trace(2, "run") << "add imm32 0x" << HEXWORD << arg2 << " to reg EAX" << end();
    BINARY_ARITHMETIC_OP(+, Reg[EAX].i, arg2);
    break;
  }
  // End Single-Byte Opcodes
  case 0x0f:
    // two-byte opcodes: the next byte selects the instruction
    switch(op2 = next()) {
    // End Two-Byte Opcodes Starting With 0f
    default:
      cerr << "unrecognized second opcode after 0f: " << HEXBYTE << NUM(op2) << '\n';
      exit(1);
    }
    break;
  case 0xf3:
    // two- and three-byte opcodes: the next byte either selects the
    // instruction or (0x0f) calls for one more byte
    switch(op2 = next()) {
    // End Two-Byte Opcodes Starting With f3
    case 0x0f:
      switch(op3 = next()) {
      // End Three-Byte Opcodes Starting With f3 0f
      default:
        cerr << "unrecognized third opcode after f3 0f: " << HEXBYTE << NUM(op3) << '\n';
        exit(1);
      }
      break;
    default:
      cerr << "unrecognized second opcode after f3: " << HEXBYTE << NUM(op2) << '\n';
      exit(1);
    }
    break;
  default:
    cerr << "unrecognized opcode: " << HEXBYTE << NUM(op) << '\n';
    exit(1);
  }
}
153 
154 void load_program(const string& text_bytes) {
155   uint32_t addr = 1;
156   istringstream in(text_bytes);
157   in >> std::noskipws;
158   while (has_data(in)) {
159     char c1 = next_hex_byte(in);
160     if (c1 == '\0') break;
161     if (!has_data(in)) {
162       raise << "input program truncated mid-byte\n" << end();
163       return;
164     }
165     char c2 = next_hex_byte(in);
166     if (c2 == '\0') {
167       raise << "input program truncated mid-byte\n" << end();
168       return;
169     }
170     Mem.at(addr) = to_byte(c1, c2);
171     trace(99, "load") << addr << " -> " << HEXBYTE << NUM(Mem.at(addr)) << end();
172     addr++;
173   }
174   End_of_program = addr;
175 }
176 
177 char next_hex_byte(istream& in) {
178   while (has_data(in)) {
179     char c = '\0';
180     in >> c;
181     if (c == ' ' || c == '\n') continue;
182     while (c == '#') {
183       while (has_data(in)) {
184         in >> c;
185         if (c == '\n') {
186           in >> c;
187           break;
188         }
189       }
190     }
191     if (c == '\0') return c;
192     if (c >= '0' && c <= '9') return c;
193     if (c >= 'a' && c <= 'f') return c;
194     if (c >= 'A' && c <= 'F') return tolower(c);
195     // disallow any non-hex characters, including a '0x' prefix
196     if (!isspace(c)) {
197       raise << "invalid non-hex character " << NUM(c) << "\n" << end();
198       break;
199     }
200   }
201   return '\0';
202 }
203 
// Numeric value (0 to 15) of a single hex digit, in either case.
// Any other character is a bug in the caller: it trips the assertion (and
// yields 0 in builds where assertions are compiled out).
uint8_t to_hex_num(char c) {
  if (c >= '0' && c <= '9') return c - '0';
  if (c >= 'a' && c <= 'f') return c - 'a' + 10;
  if (c >= 'A' && c <= 'F') return c - 'A' + 10;
  assert(false);
  return 0;
}

// The byte spelled out by two hex digits: 'hex_byte1' provides the high 4
// bits and 'hex_byte2' the low 4 bits.
uint8_t to_byte(char hex_byte1, char hex_byte2) {
  return to_hex_num(hex_byte1)*16 + to_hex_num(hex_byte2);
}
213 
214 inline uint8_t next() {
215   return Mem.at(EIP++);
216 }
217 
218 // read a 32-bit immediate in little-endian order from the instruction stream
219 int32_t imm32() {
220   int32_t result = next();
221   result |= (next()<<8);
222   result |= (next()<<16);
223   result |= (next()<<24);
224   return result;
225 }
226 
227 string rname(uint8_t r) {
228   switch (r) {
229   case 0: return "EAX";
230   case 1: return "ECX";
231   case 2: return "EDX";
232   case 3: return "EBX";
233   case 4: return "ESP";
234   case 5: return "EBP";
235   case 6: return "ESI";
236   case 7: return "EDI";
237   default: raise << "invalid register " << r << '\n' << end();  return "";
238   }
239 }
240 
241 :(before "End Includes")
242 #include <iomanip>
// Stream manipulators for printing the next number as zero-padded hex:
// 2 digits wide for a byte, 8 digits wide for a 32-bit word.
#define HEXBYTE  std::setfill('0') << std::hex << std::setw(2)
#define HEXWORD  std::setfill('0') << std::hex << std::setw(8)
// iostream prints a uint8_t as a character; convert it to int to print a number
#define NUM(X) static_cast<int>(X)
247 #include <stdint.h>