1 //: Core data structures for simulating the SubX VM (subset of an x86 processor)
  2 //:
  3 //: At the lowest level ("level 1") of abstraction, SubX executes x86
  4 //: instructions provided in the form of an array of bytes, loaded into memory
  5 //: starting at a specific address.
  6 
  7 //:: registers
  8 //: assume segment registers are hard-coded to 0
  9 //: no floating-point, MMX, etc. yet
 10 
 11 :(before "End Types")
 12 enum {
 13   EAX,
 14   ECX,
 15   EDX,
 16   EBX,
 17   ESP,
 18   EBP,
 19   ESI,
 20   EDI,
 21   NUM_INT_REGISTERS,
 22 };
 23 union reg {
 24   int32_t i;
 25   uint32_t u;
 26 };
 27 :(before "End Globals")
 28 reg Reg[NUM_INT_REGISTERS] = { {0} };
 29 uint32_t EIP = 1;  // preserve null pointer
 30 :(before "End Reset")
 31 bzero(Reg, sizeof(Reg));
 32 EIP = 1;  // preserve null pointer
 33 
 34 :(before "End Help Contents")
 35 cerr << "  registers\n";
 36 :(before "End Help Texts")
 37 put(Help, "registers",
 38   "SubX currently supports eight 32-bit integer registers: R0 to R7.\n"
 39   "R4 (ESP) contains the top of the stack.\n"
 40   "\n"
 41   "There's also a register for the address of the currently executing\n"
 42   "instruction. It is modified by jumps.\n"
 43   "\n"
 44   "Various instructions modify one or more of three 1-bit 'flag' registers,\n"
 45   "as a side-effect:\n"
 46   "- the sign flag (SF): usually set if an arithmetic result is negative, or\n"
 47   "  reset if not.\n"
 48   "- the zero flag (ZF): usually set if a result is zero, or reset if not.\n"
 49   "- the overflow flag (OF): usually set if an arithmetic result overflows.\n"
 50   "The flag bits are read by conditional jumps.\n"
 51   "\n"
 52   "We don't support non-integer (floating-point) registers yet.\n"
 53 );
 54 
 55 :(before "End Globals")
 56 // the subset of x86 flag registers we care about
 57 bool SF = false;  // sign flag
 58 bool ZF = false;  // zero flag
 59 bool OF = false;  // overflow flag
 60 :(before "End Reset")
 61 SF = ZF = OF = false;
 62 
 63 //: how the flag registers are updated after each instruction
 64 
 65 :(before "End Includes")
 66 // Combine 'arg1' and 'arg2' with arithmetic operation 'op' and store the
 67 // result in 'arg1', then update flags.
 68 // beware: no side-effects in args
 69 #define BINARY_ARITHMETIC_OP(op, arg1, arg2) { \
 70   /* arg1 and arg2 must be signed */ \
 71   int64_t tmp = arg1 op arg2; \
 72   arg1 = arg1 op arg2; \
 73   trace(90, "run") << "storing 0x" << HEXWORD << arg1 << end(); \
 74   SF = (arg1 < 0); \
 75   ZF = (arg1 == 0); \
 76   OF = (arg1 != tmp); \
 77 }
 78 
 79 // Combine 'arg1' and 'arg2' with bitwise operation 'op' and store the result
 80 // in 'arg1', then update flags.
 81 #define BINARY_BITWISE_OP(op, arg1, arg2) { \
 82   /* arg1 and arg2 must be unsigned */ \
 83   arg1 = arg1 op arg2; \
 84   trace(90, "run") << "storing 0x" << HEXWORD << arg1 << end(); \
 85   SF = (arg1 >> 31); \
 86   ZF = (arg1 == 0); \
 87   OF = false; \
 88 }
 89 
 90 //:: simulated RAM
 91 
 92 :(before "End Types")
 93 const uint32_t INITIAL_SEGMENT_SIZE = 0x1000 - 1;
 94 // Subtract one just so we can start the first segment at address 1 without
 95 // overflowing the first segment. Other segments will learn to adjust.
 96 
 97 // Like in real-world Linux, we'll allocate RAM for our programs in disjoint
 98 // slabs called VMAs or Virtual Memory Areas.
 99 struct vma {
100   uint32_t start;  // inclusive
101   uint32_t end;  // exclusive
102   vector<uint8_t> _data;
103   vma(uint32_t s, uint32_t e) :start(s), end(e) {
104     _data.resize(end-start);
105   }
106   vma(uint32_t s) :start(s), end(s+INITIAL_SEGMENT_SIZE) {
107     _data.resize(end-start);
108   }
109   bool match(uint32_t a) {
110     return a >= start && a < end;
111   }
112   bool match32(uint32_t a) {
113     return a >= start && a+4 <= end;
114   }
115   uint8_t& data(uint32_t a) {
116     assert(match(a));
117     return _data.at(a-start);
118   }
119   void grow_until(uint32_t new_end_address) {
120     if (new_end_address < end) return;
121     // Ugly: vma knows about the global Memory list of vmas
122     void sanity_check(uint32_t start, uint32_t end);
123     sanity_check(start, new_end_address);
124     end = new_end_address;
125     _data.resize(new_end_address - start);
126   }
127   // End vma Methods
128 };
129 :(code)
130 void sanity_check(uint32_t start, uint32_t end) {
131   bool dup_found = false;
132   for (int i = 0;  i < SIZE(Mem);  ++i) {
133     const vma& curr = Mem.at(i);
134     if (curr.start == start) {
135       assert(!dup_found);
136       dup_found = true;
137     }
138     else if (curr.start > start) {
139       assert(curr.start > end);
140     }
141     else if (curr.start < start) {
142       assert(curr.end < start);
143     }
144   }
145 }
146 
147 :(before "End Globals")
148 // RAM is made of VMAs.
149 vector<vma> Mem;
150 :(code)
151 // The first 3 VMAs are special. When loading ELF binaries in later layers,
152 // we'll assume that the first VMA is for code, the second is for data
153 // (including the heap), and the third for the stack.
154 void grow_code_segment(uint32_t new_end_address) {
155   assert(!Mem.empty());
156   Mem.at(0).grow_until(new_end_address);
157 }
158 void grow_data_segment(uint32_t new_end_address) {
159   assert(SIZE(Mem) > 1);
160   Mem.at(1).grow_until(new_end_address);
161 }
162 :(before "End Globals")
163 uint32_t End_of_program = 0;  // when the program executes past this address in tests we'll stop the test
164 // The stack grows downward. Can't increase its size for now.
165 :(before "End Reset")
166 Mem.clear();
167 End_of_program = 0;
168 :(code)
169 // These helpers depend on Mem being laid out contiguously (so you can't use a
170 // map, etc.) and on the host also being little-endian.
171 inline uint8_t read_mem_u8(uint32_t addr) {
172   uint8_t* handle = mem_addr_u8(addr);  // error messages get printed here
173   return handle ? *handle : 0;
174 }
175 inline int8_t read_mem_i8(uint32_t addr) {
176   return static_cast<int8_t>(read_mem_u8(addr));
177 }
178 inline uint32_t read_mem_u32(uint32_t addr) {
179   uint32_t* handle = mem_addr_u32(addr);  // error messages get printed here
180   return handle ? *handle : 0;
181 }
182 inline int32_t read_mem_i32(uint32_t addr) {
183   return static_cast<int32_t>(read_mem_u32(addr));
184 }
185 
186 inline uint8_t* mem_addr_u8(uint32_t addr) {
187   uint8_t* result = NULL;
188   for (int i = 0;  i < SIZE(Mem);  ++i) {
189     if (Mem.at(i).match(addr)) {
190       if (result)
191         raise << "address 0x" << HEXWORD << addr << " is in two segments\n" << end();
192       result = &Mem.at(i).data(addr);
193     }
194   }
195   if (result == NULL)
196     raise << "Tried to access uninitialized memory at address 0x" << HEXWORD << addr << '\n' << end();
197   return result;
198 }
199 inline int8_t* mem_addr_i8(uint32_t addr) {
200   return reinterpret_cast<int8_t*>(mem_addr_u8(addr));
201 }
202 inline uint32_t* mem_addr_u32(uint32_t addr) {
203   uint32_t* result = NULL;
204   for (int i = 0;  i < SIZE(Mem);  ++i) {
205     if (Mem.at(i).match32(addr)) {
206       if (result)
207         raise << "address 0x" << HEXWORD << addr << " is in two segments\n" << end();
208       result = reinterpret_cast<uint32_t*>(&Mem.at(i).data(addr));
209     }
210   }
211   if (result == NULL) {
212     raise << "Tried to access uninitialized memory at address 0x" << HEXWORD << addr << '\n' << end();
213     raise << "The entire 4-byte word should be initialized and lie in a single segment.\n" << end();
214   }
215   return result;
216 }
217 inline int32_t* mem_addr_i32(uint32_t addr) {
218   return reinterpret_cast<int32_t*>(mem_addr_u32(addr));
219 }
220 // helper for some syscalls. But read-only.
221 inline const char* mem_addr_string(uint32_t addr) {
222   return reinterpret_cast<const char*>(mem_addr_u8(addr));
223 }
224 
225 inline void write_mem_u8(uint32_t addr, uint8_t val) {
226   uint8_t* handle = mem_addr_u8(addr);
227   if (handle != NULL) *handle = val;
228 }
229 inline void write_mem_i8(uint32_t addr, int8_t val) {
230   int8_t* handle = mem_addr_i8(addr);
231   if (handle != NULL) *handle = val;
232 }
233 inline void write_mem_u32(uint32_t addr, uint32_t val) {
234   uint32_t* handle = mem_addr_u32(addr);
235   if (handle != NULL) *handle = val;
236 }
237 inline void write_mem_i32(uint32_t addr, int32_t val) {
238   int32_t* handle = mem_addr_i32(addr);
239   if (handle != NULL) *handle = val;
240 }
241 
242 inline bool already_allocated(uint32_t addr) {
243   bool result = false;
244   for (int i = 0;  i < SIZE(Mem);  ++i) {
245     if (Mem.at(i).match(addr)) {
246       if (result)
247         raise << "address 0x" << HEXWORD << addr << " is in two segments\n" << end();
248       result = true;
249     }
250   }
251   return result;
252 }
253 
254 //:: core interpreter loop
255 
256 :(code)
257 // skeleton of how x86 instructions are decoded
258 void run_one_instruction() {
259   uint8_t op=0, op2=0, op3=0;
260   trace(90, "run") << "inst: 0x" << HEXWORD << EIP << end();
261 //?   dump_registers();
262 //?   cerr << "inst: 0x" << EIP << " => ";
263   op = next();
264 //?   cerr << HEXBYTE << NUM(op) << '\n';
265   switch (op) {
266   case 0xf4:  // hlt
267     EIP = End_of_program;
268     break;
269   // End Single-Byte Opcodes
270   case 0x0f:
271     switch(op2 = next()) {
272     // End Two-Byte Opcodes Starting With 0f
273     default:
274       cerr << "unrecognized second opcode after 0f: " << HEXBYTE << NUM(op2) << '\n';
275       DUMP("");
276       exit(1);
277     }
278     break;
279   case 0xf2:
280     switch(op2 = next()) {
281     // End Two-Byte Opcodes Starting With f2
282     case 0x0f:
283       switch(op3 = next()) {
284       // End Three-Byte Opcodes Starting With f2 0f
285       default:
286         cerr << "unrecognized third opcode after f2 0f: " << HEXBYTE << NUM(op3) << '\n';
287         DUMP("");
288         exit(1);
289       }
290       break;
291     default:
292       cerr << "unrecognized second opcode after f2: " << HEXBYTE << NUM(op2) << '\n';
293       DUMP("");
294       exit(1);
295     }
296     break;
297   case 0xf3:
298     switch(op2 = next()) {
299     // End Two-Byte Opcodes Starting With f3
300     case 0x0f:
301       switch(op3 = next()) {
302       // End Three-Byte Opcodes Starting With f3 0f
303       default:
304         cerr << "unrecognized third opcode after f3 0f: " << HEXBYTE << NUM(op3) << '\n';
305         DUMP("");
306         exit(1);
307       }
308       break;
309     default:
310       cerr << "unrecognized second opcode after f3: " << HEXBYTE << NUM(op2) << '\n';
311       DUMP("");
312       exit(1);
313     }
314     break;
315   default:
316     cerr << "unrecognized opcode: " << HEXBYTE << NUM(op) << '\n';
317     DUMP("");
318     exit(1);
319   }
320 }
321 
322 inline uint8_t next() {
323   return read_mem_u8(EIP++);
324 }
325 
326 void dump_registers() {
327   for (int i = 0;  i < NUM_INT_REGISTERS;  ++i) {
328     if (i > 0) cerr << "; ";
329     cerr << "  " << i << ": " << std::hex << std::setw(8) << std::setfill('_') << Reg[i].u;
330   }
331   cerr << " -- SF: " << SF << "; ZF: " << ZF << "; OF: " << OF << '\n';
332 }
333 
334 //: start tracking supported opcodes
335 :(before "End Globals")
336 map</*op*/string, string> name;
337 map</*op*/string, string> name_0f;
338 map</*op*/string, string> name_f3;
339 map</*op*/string, string> name_f3_0f;
340 :(before "End One-time Setup")
341 init_op_names();
342 :(code)
343 void init_op_names() {
344   put(name, "f4", "halt");
345   // End Initialize Op Names(name)
346 }
347 
348 :(before "End Help Special-cases(key)")
349 if (key == "opcodes") {
350   cerr << "Opcodes currently supported by SubX:\n";
351   for (map<string, string>::iterator p = name.begin();  p != name.end();  ++p)
352     cerr << "  " << p->first << ": " << p->second << '\n';
353   for (map<string, string>::iterator p = name_0f.begin();  p != name_0f.end();  ++p)
354     cerr << "  0f " << p->first << ": " << p->second << '\n';
355   for (map<string, string>::iterator p = name_f3.begin();  p != name_f3.end();  ++p)
356     cerr << "  f3 " << p->first << ": " << p->second << '\n';
357   for (map<string, string>::iterator p = name_f3_0f.begin();  p != name_f3_0f.end();  ++p)
358     cerr << "  f3 0f " << p->first << ": " << p->second << '\n';
359   cerr << "Run `subx help instructions` for details on words like 'r32' and 'disp8'.\n";
360   return 0;
361 }
362 :(before "End Help Contents")
363 cerr << "  opcodes\n";
364 
365 :(before "End Includes")
366 #include <iomanip>
367 #define HEXBYTE  std::hex << std::setw(2) << std::setfill('0')
368 #define HEXWORD  std::hex << std::setw(8) << std::setfill('0')
369 // ugly that iostream doesn't print uint8_t as an integer
370 #define NUM(X) static_cast<int>(X)
371 #include <stdint.h>