//:: simulated x86 registers; just a subset //: assume segment registers are hard-coded to 0 //: no floating-point, MMX, etc. yet :(before "End Types") enum { EAX, ECX, EDX, EBX, ESP, EBP, ESI, EDI, NUM_INT_REGISTERS, }; union reg { int32_t i; uint32_t u; }; :(before "End Globals") reg Reg[NUM_INT_REGISTERS] = { {0} }; uint32_t EIP = 0; :(before "End Reset") bzero(Reg, sizeof(Reg)); EIP = 0; //:: simulated flag registers; just a subset that we care about :(before "End Globals") bool SF = false; // sign flag bool ZF = false; // zero flag bool OF = false; // overflow flag :(before "End Reset") SF = ZF = OF = false; //: how the flag registers are updated after each instruction :(before "End Includes") // Combine 'arg1' and 'arg2' with arithmetic operation 'op' and store the // result in 'arg1', then update flags. // beware: no side-effects in args #define BINARY_ARITHMETIC_OP(op, arg1, arg2) { \ /* arg1 and arg2 must be signed */ \ int64_t tmp = arg1 op arg2; \ arg1 = arg1 op arg2; \ trace(2, "run") << "storing 0x" << HEXWORD << arg1 << end(); \ SF = (arg1 < 0); \ ZF = (arg1 == 0); \ OF = (arg1 != tmp); \ } // Combine 'arg1' and 'arg2' with bitwise operation 'op' and store the result // in 'arg1', then update flags. #define BINARY_BITWISE_OP(op, arg1, arg2) { \ /* arg1 and arg2 must be unsigned */ \ arg1 = arg1 op arg2; \ trace(2, "run") << "storing 0x" << HEXWORD << arg1 << end(); \ SF = (arg1 >> 31); \ ZF = (arg1 == 0); \ OF = false; \ } //:: simulated RAM :(before "End Globals") vector Mem; uint32_t End_of_program = 0; :(before "End Reset") Mem.clear(); Mem.resize(1024); End_of_program = 0; :(before "End Includes") // depends on Mem being laid out contiguously (so you can't use a map, etc.) // and on the host also being little-endian #define SET_WORD_IN_MEM(addr, val) *reinterpret_cast(&Mem.at(addr)) = val; //:: core interpreter loop :(scenario add_imm32_to_eax) # In scenarios, programs are a series of hex bytes, each (variable-length) # instruction on one line. # # x86 instructions consist of the following parts (see cheatsheet.pdf): # opcode ModR/M SIB displacement immediate # instruction mod, reg, Reg/Mem bits scale, index, base # 1-3 bytes 0/1 byte 0/1 byte 0/1/2/4 bytes 0/1/2/4 bytes 05 0a 0b 0c 0d # add 0x0d0c0b0a to EAX # All hex bytes must be exactly 2 characters each. No '0x' prefixes. +load: 1 -> 05 +load: 2 -> 0a +load: 3 -> 0b +load: 4 -> 0c +load: 5 -> 0d +run: add imm32 0x0d0c0b0a to reg EAX +run: storing 0x0d0c0b0a :(code) // helper for tests: load a program into memory from a textual representation // of its bytes, and run it void run(string text_bytes) { // Begin run() For Scenarios cerr << text_bytes << '\n'; load_program(text_bytes, 1); // tests always assume a starting address of 1 EIP = 1; // preserve null pointer while (EIP < End_of_program) run_one_instruction(); } // skeleton of how x86 instructions are decoded void run_one_instruction() { uint8_t op=0, op2=0, op3=0; trace(2, "run") << "inst: 0x" << HEXWORD << EIP << end(); switch (op = next()) { case 0xf4: // hlt EIP = End_of_program; break; // our first opcode case 0x05: { // add imm32 to EAX int32_t arg2 = imm32(); trace(2, "run") << "add imm32 0x" << HEXWORD << arg2 << " to reg EAX" << end(); BINARY_ARITHMETIC_OP(+, Reg[EAX].i, arg2); break; } // End Single-Byte Opcodes case 0x0f: switch(op2 = next()) { // End Two-Byte Opcodes Starting With 0f default: cerr << "unrecognized second opcode after 0f: " << HEXBYTE << NUM(op2) << '\n'; exit(1); } break; case 0xf3: switch(op2 = next()) { // End Two-Byte Opcodes Starting With f3 case 0x0f: switch(op3 = next()) { // End Three-Byte Opcodes Starting With f3 0f default: cerr << "unrecognized third opcode after f3 0f: " << HEXBYTE << NUM(op3) << '\n'; exit(1); } break; default: cerr << "unrecognized second opcode after f3: " << HEXBYTE << NUM(op2) << '\n'; exit(1); } break; default: cerr << "unrecognized opcode: " << HEXBYTE << NUM(op) << '\n'; exit(1); } } void load_program(const string& text_bytes, uint32_t addr) { istringstream in(text_bytes); load_program(in, addr); } void load_program(istream& in, uint32_t addr) { in >> std::noskipws; while (has_data(in)) { char c1 = next_hex_byte(in); if (c1 == '\0') break; if (!has_data(in)) { raise << "input program truncated mid-byte\n" << end(); return; } char c2 = next_hex_byte(in); if (c2 == '\0') { raise << "input program truncated mid-byte\n" << end(); return; } Mem.at(addr) = to_byte(c1, c2); trace(99, "load") << addr << " -> " << HEXBYTE << NUM(Mem.at(addr)) << end(); addr++; if (addr >= Mem.size()) Mem.resize(Mem.size()*2); } End_of_program = addr; } char next_hex_byte(istream& in) { while (has_data(in)) { char c = '\0'; in >> c; if (c == ' ' || c == '\n') continue; while (c == '#') { while (has_data(in)) { in >> c; if (c == '\n') { in >> c; break; } } } if (c == '\0') return c; if (c >= '0' && c <= '9') return c; if (c >= 'a' && c <= 'f') return c; if (c >= 'A' && c <= 'F') return tolower(c); // disallow any non-hex characters, including a '0x' prefix if (!isspace(c)) { raise << "invalid non-hex character " << NUM(c) << "\n" << end(); break; } } return '\0'; } uint8_t to_byte(char hex_byte1, char hex_byte2) { return to_hex_num(hex_byte1)*16 + to_hex_num(hex_byte2); } uint8_t to_hex_num(char c) { if (c >= '0' && c <= '9') return c - '0'; if (c >= 'a' && c <= 'f') return c - 'a' + 10; assert(false); return 0; } inline uint8_t next() { return Mem.at(EIP++); } // read a 32-bit immediate in little-endian order from the instruction stream int32_t imm32() { int32_t result = next(); result |= (next()<<8); result |= (next()<<16); result |= (next()<<24); return result; } string rname(uint8_t r) { switch (r) { case 0: return "EAX"; case 1: return "ECX"; case 2: return "EDX"; case 3: return "EBX"; case 4: return "ESP"; case 5: return "EBP"; case 6: return "ESI"; case 7: return "EDI"; default: raise << "invalid register " << r << '\n' << end(); return ""; } } :(before "End Includes") #include #define HEXBYTE std::hex << std::setw(2) << std::setfill('0') #define HEXWORD std::hex << std::setw(8) << std::setfill('0') // ugly that iostream doesn't print uint8_t as an integer #define NUM(X) static_cast(X) #include