From dc8790941e39efb25c40de0420fdd4bce03f2761 Mon Sep 17 00:00:00 2001 From: Kartik Agaram Date: Thu, 19 Sep 2019 15:26:24 -0700 Subject: 5670 --- 028translate.cc | 213 ------------- 029transforms.cc | 65 ---- 030---operands.cc | 539 -------------------------------- 030---translate.cc | 213 +++++++++++++ 031check_operands.cc | 691 ------------------------------------------ 031transforms.cc | 65 ++++ 032---operands.cc | 539 ++++++++++++++++++++++++++++++++ 032check_operand_bounds.cc | 143 --------- 033check_operands.cc | 691 ++++++++++++++++++++++++++++++++++++++++++ 034check_operand_bounds.cc | 143 +++++++++ 034compute_segment_address.cc | 86 ------ 035compute_segment_address.cc | 86 ++++++ 035labels.cc | 416 ------------------------- 036global_variables.cc | 305 ------------------- 036labels.cc | 416 +++++++++++++++++++++++++ 037global_variables.cc | 305 +++++++++++++++++++ 16 files changed, 2458 insertions(+), 2458 deletions(-) delete mode 100644 028translate.cc delete mode 100644 029transforms.cc delete mode 100644 030---operands.cc create mode 100644 030---translate.cc delete mode 100644 031check_operands.cc create mode 100644 031transforms.cc create mode 100644 032---operands.cc delete mode 100644 032check_operand_bounds.cc create mode 100644 033check_operands.cc create mode 100644 034check_operand_bounds.cc delete mode 100644 034compute_segment_address.cc create mode 100644 035compute_segment_address.cc delete mode 100644 035labels.cc delete mode 100644 036global_variables.cc create mode 100644 036labels.cc create mode 100644 037global_variables.cc diff --git a/028translate.cc b/028translate.cc deleted file mode 100644 index 9737834e..00000000 --- a/028translate.cc +++ /dev/null @@ -1,213 +0,0 @@ -//: The bedrock level 1 of abstraction is now done, and we're going to start -//: building levels above it that make programming in x86 machine code a -//: little more ergonomic. -//: -//: All levels will be "pass through by default". Whatever they don't -//: understand they will silently pass through to lower levels. -//: -//: Since raw hex bytes of machine code are always possible to inject, SubX is -//: not a language, and we aren't building a compiler. This is something -//: deliberately leakier. Levels are more for improving auditing, checks and -//: error messages rather than for hiding low-level details. - -//: Translator workflow: read 'source' file. Run a series of transforms on it, -//: each passing through what it doesn't understand. The final program should -//: be just machine code, suitable to write to an ELF binary. -//: -//: Higher levels usually transform code on the basis of metadata. - -:(before "End Main") -if (is_equal(argv[1], "translate")) { - // Outside of tests, traces must be explicitly requested. - if (Trace_file.is_open()) Trace_stream = new trace_stream; - reset(); - // Begin subx translate - program p; - string output_filename; - for (int i = /*skip 'subx translate'*/2; i < argc; ++i) { - if (is_equal(argv[i], "-o")) { - ++i; - if (i >= argc) { - print_translate_usage(); - cerr << "'-o' must be followed by a filename to write results to\n"; - exit(1); - } - output_filename = argv[i]; - } - else { - trace(2, "parse") << argv[i] << end(); - ifstream fin(argv[i]); - if (!fin) { - cerr << "could not open " << argv[i] << '\n'; - return 1; - } - parse(fin, p); - if (trace_contains_errors()) return 1; - } - } - if (p.segments.empty()) { - print_translate_usage(); - cerr << "nothing to do; must provide at least one file to read\n"; - exit(1); - } - if (output_filename.empty()) { - print_translate_usage(); - cerr << "must provide a filename to write to using '-o'\n"; - exit(1); - } - trace(2, "transform") << "begin" << end(); - transform(p); - if (trace_contains_errors()) return 1; - trace(2, "translate") << "begin" << end(); - save_elf(p, output_filename); - if (trace_contains_errors()) { - unlink(output_filename.c_str()); - return 1; - } - // End subx translate - return 0; -} - -:(code) -void print_translate_usage() { - cerr << "Usage: subx translate file1 file2 ... -o output\n"; -} - -// write out a program to a bare-bones ELF file -void save_elf(const program& p, const string& filename) { - ofstream out(filename.c_str(), ios::binary); - save_elf(p, out); - out.close(); -} - -void save_elf(const program& p, ostream& out) { - // validation: stay consistent with the self-hosted translator - if (p.entry == 0) { - raise << "no 'Entry' label found\n" << end(); - return; - } - if (find(p, "data") == NULL) { - raise << "must include a 'data' segment\n" << end(); - return; - } - // processing - write_elf_header(out, p); - for (size_t i = 0; i < p.segments.size(); ++i) - write_segment(p.segments.at(i), out); -} - -void write_elf_header(ostream& out, const program& p) { - char c = '\0'; -#define O(X) c = (X); out.write(&c, sizeof(c)) -// host is required to be little-endian -#define emit(X) out.write(reinterpret_cast(&X), sizeof(X)) - //// ehdr - // e_ident - O(0x7f); O(/*E*/0x45); O(/*L*/0x4c); O(/*F*/0x46); - O(0x1); // 32-bit format - O(0x1); // little-endian - O(0x1); O(0x0); - for (size_t i = 0; i < 8; ++i) { O(0x0); } - // e_type - O(0x02); O(0x00); - // e_machine - O(0x03); O(0x00); - // e_version - O(0x01); O(0x00); O(0x00); O(0x00); - // e_entry - uint32_t e_entry = p.entry; - // Override e_entry - emit(e_entry); - // e_phoff -- immediately after ELF header - uint32_t e_phoff = 0x34; - emit(e_phoff); - // e_shoff; unused - uint32_t dummy32 = 0; - emit(dummy32); - // e_flags; unused - emit(dummy32); - // e_ehsize - uint16_t e_ehsize = 0x34; - emit(e_ehsize); - // e_phentsize - uint16_t e_phentsize = 0x20; - emit(e_phentsize); - // e_phnum - uint16_t e_phnum = SIZE(p.segments); - emit(e_phnum); - // e_shentsize - uint16_t dummy16 = 0x0; - emit(dummy16); - // e_shnum - emit(dummy16); - // e_shstrndx - emit(dummy16); - - uint32_t p_offset = /*size of ehdr*/0x34 + SIZE(p.segments)*0x20/*size of each phdr*/; - for (int i = 0; i < SIZE(p.segments); ++i) { - const segment& curr = p.segments.at(i); - //// phdr - // p_type - uint32_t p_type = 0x1; - emit(p_type); - // p_offset - emit(p_offset); - // p_vaddr - uint32_t p_start = curr.start; - emit(p_start); - // p_paddr - emit(p_start); - // p_filesz - uint32_t size = num_words(curr); - assert(p_offset + size < SEGMENT_ALIGNMENT); - emit(size); - // p_memsz - emit(size); - // p_flags - uint32_t p_flags = (curr.name == "code") ? /*r-x*/0x5 : /*rw-*/0x6; - emit(p_flags); - - // p_align - // "As the system creates or augments a process image, it logically copies - // a file's segment to a virtual memory segment. When—and if— the system - // physically reads the file depends on the program's execution behavior, - // system load, and so on. A process does not require a physical page - // unless it references the logical page during execution, and processes - // commonly leave many pages unreferenced. Therefore delaying physical - // reads frequently obviates them, improving system performance. To obtain - // this efficiency in practice, executable and shared object files must - // have segment images whose file offsets and virtual addresses are - // congruent, modulo the page size." -- http://refspecs.linuxbase.org/elf/elf.pdf (page 95) - uint32_t p_align = 0x1000; // default page size on linux - emit(p_align); - if (p_offset % p_align != p_start % p_align) { - raise << "segment starting at 0x" << HEXWORD << p_start << " is improperly aligned; alignment for p_offset " << p_offset << " should be " << (p_offset % p_align) << " but is " << (p_start % p_align) << '\n' << end(); - return; - } - - // prepare for next segment - p_offset += size; - } -#undef O -#undef emit -} - -void write_segment(const segment& s, ostream& out) { - for (int i = 0; i < SIZE(s.lines); ++i) { - const vector& w = s.lines.at(i).words; - for (int j = 0; j < SIZE(w); ++j) { - uint8_t x = hex_byte(w.at(j).data); // we're done with metadata by this point - out.write(reinterpret_cast(&x), /*sizeof(byte)*/1); - } - } -} - -uint32_t num_words(const segment& s) { - uint32_t sum = 0; - for (int i = 0; i < SIZE(s.lines); ++i) - sum += SIZE(s.lines.at(i).words); - return sum; -} - -:(before "End Includes") -using std::ios; diff --git a/029transforms.cc b/029transforms.cc deleted file mode 100644 index a6e12502..00000000 --- a/029transforms.cc +++ /dev/null @@ -1,65 +0,0 @@ -//: Ordering transforms is a well-known hard problem when building compilers. -//: In our case we also have the additional notion of layers. The ordering of -//: layers can have nothing in common with the ordering of transforms when -//: SubX is tangled and run. This can be confusing for readers, particularly -//: if later layers start inserting transforms at arbitrary points between -//: transforms introduced earlier. Over time adding transforms can get harder -//: and harder, having to meet the constraints of everything that's come -//: before. It's worth thinking about organization up-front so the ordering is -//: easy to hold in our heads, and it's obvious where to add a new transform. -//: Some constraints: -//: -//: 1. Layers force us to build SubX bottom-up; since we want to be able to -//: build and run SubX after stopping loading at any layer, the overall -//: organization has to be to introduce primitives before we start using -//: them. -//: -//: 2. Transforms usually need to be run top-down, converting high-level -//: representations to low-level ones so that low-level layers can be -//: oblivious to them. -//: -//: 3. When running we'd often like new representations to be checked before -//: they are transformed away. The whole reason for new representations is -//: often to add new kinds of automatic checking for our machine code -//: programs. -//: -//: Putting these constraints together, we'll use the following broad -//: organization: -//: -//: a) We'll divide up our transforms into "levels", each level consisting -//: of multiple transforms, and dealing in some new set of representational -//: ideas. Levels will be added in reverse order to the one their transforms -//: will be run in. -//: -//: To run all transforms: -//: Load transforms for level n -//: Load transforms for level n-1 -//: ... -//: Load transforms for level 2 -//: Run code at level 1 -//: -//: b) *Within* a level we'll usually introduce transforms in the order -//: they're run in. -//: -//: To run transforms for level n: -//: Perform transform of layer l -//: Perform transform of layer l+1 -//: ... -//: -//: c) Within a level it's often most natural to introduce a new -//: representation by showing how it's transformed to the level below. To -//: make such exceptions more obvious checks usually won't be first-class -//: transforms; instead code that keeps the program unmodified will run -//: within transforms before they mutate the program. As an example: -//: -//: Layer l introduces a transform -//: Layer l+1 adds precondition checks for the transform -//: -//: This may all seem abstract, but will hopefully make sense over time. The -//: goals are basically to always have a working program after any layer, to -//: have the order of layers make narrative sense, and to order transforms -//: correctly at runtime. - -:(before "End One-time Setup") -// Begin Transforms -// End Transforms diff --git a/030---operands.cc b/030---operands.cc deleted file mode 100644 index 5203201e..00000000 --- a/030---operands.cc +++ /dev/null @@ -1,539 +0,0 @@ -//: Beginning of "level 2": tagging bytes with metadata around what field of -//: an x86 instruction they're for. -//: -//: The x86 instruction set is variable-length, and how a byte is interpreted -//: affects later instruction boundaries. A lot of the pain in programming -//: machine code stems from computer and programmer going out of sync on what -//: a byte means. The miscommunication is usually not immediately caught, and -//: metastasizes at runtime into kilobytes of misinterpreted instructions. -//: -//: To mitigate these issues, we'll start programming in terms of logical -//: operands rather than physical bytes. Some operands are smaller than a -//: byte, and others may consist of multiple bytes. This layer will correctly -//: pack and order the bytes corresponding to the operands in an instruction. - -:(before "End Help Texts") -put_new(Help, "instructions", - "Each x86 instruction consists of an instruction or opcode and some number\n" - "of operands.\n" - "Each operand has a type. An instruction won't have more than one operand of\n" - "any type.\n" - "Each instruction has some set of allowed operand types. It'll reject others.\n" - "The complete list of operand types: mod, subop, r32 (register), rm32\n" - "(register or memory), scale, index, base, disp8, disp16, disp32, imm8,\n" - "imm32.\n" - "Each of these has its own help page. Try reading 'subx help mod' next.\n" -); -:(before "End Help Contents") -cerr << " instructions\n"; - -:(code) -void test_pack_immediate_constants() { - run( - "== code 0x1\n" - "bb 0x2a/imm32\n" - ); - CHECK_TRACE_CONTENTS( - "transform: packing instruction 'bb 0x2a/imm32'\n" - "transform: instruction after packing: 'bb 2a 00 00 00'\n" - "run: copy imm32 0x0000002a to EBX\n" - ); -} - -//: complete set of valid operand types - -:(before "End Globals") -set Instruction_operands; -:(before "End One-time Setup") -Instruction_operands.insert("subop"); -Instruction_operands.insert("mod"); -Instruction_operands.insert("rm32"); -Instruction_operands.insert("base"); -Instruction_operands.insert("index"); -Instruction_operands.insert("scale"); -Instruction_operands.insert("r32"); -Instruction_operands.insert("disp8"); -Instruction_operands.insert("disp16"); -Instruction_operands.insert("disp32"); -Instruction_operands.insert("imm8"); -Instruction_operands.insert("imm32"); - -:(before "End Help Texts") -init_operand_type_help(); -:(code) -void init_operand_type_help() { - put(Help, "mod", - "2-bit operand controlling the _addressing mode_ of many instructions,\n" - "to determine how to compute the _effective address_ to look up memory at\n" - "based on the 'rm32' operand and potentially others.\n" - "\n" - "If mod = 3, just operate on the contents of the register specified by rm32\n" - " (direct mode).\n" - "If mod = 2, effective address is usually* rm32 + disp32\n" - " (indirect mode with displacement).\n" - "If mod = 1, effective address is usually* rm32 + disp8\n" - " (indirect mode with displacement).\n" - "If mod = 0, effective address is usually* rm32 (indirect mode).\n" - "(* - The exception is when rm32 is '4'. Register 4 is the stack pointer (ESP).\n" - " Using it as an address gets more involved. For more details,\n" - " try reading the help pages for 'base', 'index' and 'scale'.)\n" - "\n" - "For complete details, spend some time with two tables in the IA-32 software\n" - "developer's manual that are also included in this repo:\n" - " - modrm.pdf: volume 2, table 2-2, \"32-bit addressing with the ModR/M byte.\".\n" - " - sib.pdf: volume 2, table 2-3, \"32-bit addressing with the SIB byte.\".\n" - ); - put(Help, "subop", - "Additional 3-bit operand for determining the instruction when the opcode\n" - "is 81, 8f, d3, f7 or ff.\n" - "Can't coexist with operand of type 'r32' in a single instruction, because\n" - "the two use the same bits.\n" - ); - put(Help, "r32", - "3-bit operand specifying a register operand used directly, without any further addressing modes.\n" - ); - put(Help, "rm32", - "32-bit value in register or memory. The precise details of its construction\n" - "depend on the eponymous 3-bit 'rm32' operand, the 'mod' operand, and also\n" - "potentially the 'SIB' operands ('scale', 'index' and 'base') and a displacement\n" - "('disp8' or 'disp32').\n" - "\n" - "For complete details, spend some time with two tables in the IA-32 software\n" - "developer's manual that are also included in this repo:\n" - " - modrm.pdf: volume 2, table 2-2, \"32-bit addressing with the ModR/M byte.\".\n" - " - sib.pdf: volume 2, table 2-3, \"32-bit addressing with the SIB byte.\".\n" - ); - put(Help, "base", - "Additional 3-bit operand (when 'rm32' is 4, unless 'mod' is 3) specifying the\n" - "register containing an address to look up.\n" - "This address may be further modified by 'index' and 'scale' operands.\n" - " effective address = base + index*scale + displacement (disp8 or disp32)\n" - "For complete details, spend some time with the IA-32 software developer's manual,\n" - "volume 2, table 2-3, \"32-bit addressing with the SIB byte\".\n" - "It is included in this repository as 'sib.pdf'.\n" - ); - put(Help, "index", - "Optional 3-bit operand (when 'rm32' is 4 unless 'mod' is 3) that can be added to\n" - "the 'base' operand to compute the 'effective address' at which to look up memory.\n" - " effective address = base + index*scale + displacement (disp8 or disp32)\n" - "For complete details, spend some time with the IA-32 software developer's manual,\n" - "volume 2, table 2-3, \"32-bit addressing with the SIB byte\".\n" - "It is included in this repository as 'sib.pdf'.\n" - ); - put(Help, "scale", - "Optional 2-bit operand (when 'rm32' is 4 unless 'mod' is 3) that encodes a\n" - "power of 2 to be multiplied to the 'index' operand before adding the result to\n" - "the 'base' operand to compute the _effective address_ to operate on.\n" - " effective address = base + index * scale + displacement (disp8 or disp32)\n" - "\n" - "When scale is 0, use index unmodified.\n" - "When scale is 1, multiply index by 2.\n" - "When scale is 2, multiply index by 4.\n" - "When scale is 3, multiply index by 8.\n" - "\n" - "For complete details, spend some time with the IA-32 software developer's manual,\n" - "volume 2, table 2-3, \"32-bit addressing with the SIB byte\".\n" - "It is included in this repository as 'sib.pdf'.\n" - ); - put(Help, "disp8", - "8-bit value to be added in many instructions.\n" - ); - put(Help, "disp16", - "16-bit value to be added in many instructions.\n" - "Currently not used in any SubX instructions.\n" - ); - put(Help, "disp32", - "32-bit value to be added in many instructions.\n" - ); - put(Help, "imm8", - "8-bit value for many instructions.\n" - ); - put(Help, "imm32", - "32-bit value for many instructions.\n" - ); -} - -//:: transform packing operands into bytes in the right order - -:(after "Begin Transforms") -// Begin Level-2 Transforms -Transform.push_back(pack_operands); -// End Level-2 Transforms - -:(code) -void pack_operands(program& p) { - if (p.segments.empty()) return; - segment& code = *find(p, "code"); - // Pack Operands(segment code) - trace(3, "transform") << "-- pack operands" << end(); - for (int i = 0; i < SIZE(code.lines); ++i) { - line& inst = code.lines.at(i); - if (all_hex_bytes(inst)) continue; - trace(99, "transform") << "packing instruction '" << to_string(/*with metadata*/inst) << "'" << end(); - pack_operands(inst); - trace(99, "transform") << "instruction after packing: '" << to_string(/*without metadata*/inst.words) << "'" << end(); - } -} - -void pack_operands(line& inst) { - line new_inst; - add_opcodes(inst, new_inst); - add_modrm_byte(inst, new_inst); - add_sib_byte(inst, new_inst); - add_disp_bytes(inst, new_inst); - add_imm_bytes(inst, new_inst); - inst.words.swap(new_inst.words); -} - -void add_opcodes(const line& in, line& out) { - out.words.push_back(in.words.at(0)); - if (in.words.at(0).data == "0f" || in.words.at(0).data == "f2" || in.words.at(0).data == "f3") - out.words.push_back(in.words.at(1)); - if (in.words.at(0).data == "f3" && in.words.at(1).data == "0f") - out.words.push_back(in.words.at(2)); - if (in.words.at(0).data == "f2" && in.words.at(1).data == "0f") - out.words.push_back(in.words.at(2)); -} - -void add_modrm_byte(const line& in, line& out) { - uint8_t mod=0, reg_subop=0, rm32=0; - bool emit = false; - for (int i = 0; i < SIZE(in.words); ++i) { - const word& curr = in.words.at(i); - if (has_operand_metadata(curr, "mod")) { - mod = hex_byte(curr.data); - emit = true; - } - else if (has_operand_metadata(curr, "rm32")) { - rm32 = hex_byte(curr.data); - emit = true; - } - else if (has_operand_metadata(curr, "r32")) { - reg_subop = hex_byte(curr.data); - emit = true; - } - else if (has_operand_metadata(curr, "subop")) { - reg_subop = hex_byte(curr.data); - emit = true; - } - } - if (emit) - out.words.push_back(hex_byte_text((mod << 6) | (reg_subop << 3) | rm32)); -} - -void add_sib_byte(const line& in, line& out) { - uint8_t scale=0, index=0, base=0; - bool emit = false; - for (int i = 0; i < SIZE(in.words); ++i) { - const word& curr = in.words.at(i); - if (has_operand_metadata(curr, "scale")) { - scale = hex_byte(curr.data); - emit = true; - } - else if (has_operand_metadata(curr, "index")) { - index = hex_byte(curr.data); - emit = true; - } - else if (has_operand_metadata(curr, "base")) { - base = hex_byte(curr.data); - emit = true; - } - } - if (emit) - out.words.push_back(hex_byte_text((scale << 6) | (index << 3) | base)); -} - -void add_disp_bytes(const line& in, line& out) { - for (int i = 0; i < SIZE(in.words); ++i) { - const word& curr = in.words.at(i); - if (has_operand_metadata(curr, "disp8")) - emit_hex_bytes(out, curr, 1); - if (has_operand_metadata(curr, "disp16")) - emit_hex_bytes(out, curr, 2); - else if (has_operand_metadata(curr, "disp32")) - emit_hex_bytes(out, curr, 4); - } -} - -void add_imm_bytes(const line& in, line& out) { - for (int i = 0; i < SIZE(in.words); ++i) { - const word& curr = in.words.at(i); - if (has_operand_metadata(curr, "imm8")) - emit_hex_bytes(out, curr, 1); - else if (has_operand_metadata(curr, "imm32")) - emit_hex_bytes(out, curr, 4); - } -} - -void emit_hex_bytes(line& out, const word& w, int num) { - assert(num <= 4); - bool is_number = looks_like_hex_int(w.data); - if (num == 1 || !is_number) { - out.words.push_back(w); // preserve existing metadata - if (is_number) - out.words.back().data = hex_byte_to_string(parse_int(w.data)); - return; - } - emit_hex_bytes(out, static_cast(parse_int(w.data)), num); -} - -void emit_hex_bytes(line& out, uint32_t val, int num) { - assert(num <= 4); - for (int i = 0; i < num; ++i) { - out.words.push_back(hex_byte_text(val & 0xff)); - val = val >> 8; - } -} - -word hex_byte_text(uint8_t val) { - word result; - result.data = hex_byte_to_string(val); - result.original = result.data+"/auto"; - return result; -} - -string hex_byte_to_string(uint8_t val) { - ostringstream out; - // uint8_t prints without padding, but int8_t will expand to 32 bits again - out << HEXBYTE << NUM(val); - return out.str(); -} - -string to_string(const vector& in) { - ostringstream out; - for (int i = 0; i < SIZE(in); ++i) { - if (i > 0) out << ' '; - out << in.at(i).data; - } - return out.str(); -} - -:(before "End Unit Tests") -void test_preserve_metadata_when_emitting_single_byte() { - word in; - in.data = "f0"; - in.original = "f0/foo"; - line out; - emit_hex_bytes(out, in, 1); - CHECK_EQ(out.words.at(0).data, "f0"); - CHECK_EQ(out.words.at(0).original, "f0/foo"); -} - -:(code) -void test_pack_disp8() { - run( - "== code 0x1\n" - "74 2/disp8\n" // jump 2 bytes away if ZF is set - ); - CHECK_TRACE_CONTENTS( - "transform: packing instruction '74 2/disp8'\n" - "transform: instruction after packing: '74 02'\n" - ); -} - -void test_pack_disp8_negative() { - transform( - "== code 0x1\n" - // running this will cause an infinite loop - "74 -1/disp8\n" // jump 1 byte before if ZF is set - ); - CHECK_TRACE_CONTENTS( - "transform: packing instruction '74 -1/disp8'\n" - "transform: instruction after packing: '74 ff'\n" - ); -} - -//: helper for scenario -void transform(const string& text_bytes) { - program p; - istringstream in(text_bytes); - parse(in, p); - if (trace_contains_errors()) return; - transform(p); -} - -void test_pack_modrm_imm32() { - run( - "== code 0x1\n" - // instruction effective address operand displacement immediate\n" - // op subop mod rm32 base index scale r32\n" - // 1-3 bytes 3 bits 2 bits 3 bits 3 bits 3 bits 2 bits 2 bits 0/1/2/4 bytes 0/1/2/4 bytes\n" - " 81 0/add/subop 3/mod/direct 3/ebx/rm32 1/imm32 \n" // add 1 to EBX - ); - CHECK_TRACE_CONTENTS( - "transform: packing instruction '81 0/add/subop 3/mod/direct 3/ebx/rm32 1/imm32'\n" - "transform: instruction after packing: '81 c3 01 00 00 00'\n" - ); -} - -void test_pack_imm32_large() { - run( - "== code 0x1\n" - "b9 0x080490a7/imm32\n" - ); - CHECK_TRACE_CONTENTS( - "transform: packing instruction 'b9 0x080490a7/imm32'\n" - "transform: instruction after packing: 'b9 a7 90 04 08'\n" - ); -} - -void test_pack_immediate_constants_hex() { - run( - "== code 0x1\n" - "b9 0x2a/imm32\n" - ); - CHECK_TRACE_CONTENTS( - "transform: packing instruction 'b9 0x2a/imm32'\n" - "transform: instruction after packing: 'b9 2a 00 00 00'\n" - "run: copy imm32 0x0000002a to ECX\n" - ); -} - -void test_pack_silently_ignores_non_hex() { - Hide_errors = true; - transform( - "== code 0x1\n" - "b9 foo/imm32\n" - ); - CHECK_TRACE_CONTENTS( - "transform: packing instruction 'b9 foo/imm32'\n" - // no change (we're just not printing metadata to the trace) - "transform: instruction after packing: 'b9 foo'\n" - ); -} - -void test_pack_flags_bad_hex() { - Hide_errors = true; - run( - "== code 0x1\n" - "b9 0xfoo/imm32\n" - ); - CHECK_TRACE_CONTENTS( - "error: not a number: 0xfoo\n" - ); -} - -void test_pack_flags_uppercase_hex() { - Hide_errors = true; - run( - "== code 0x1\n" - "b9 0xAb/imm32\n" - ); - CHECK_TRACE_CONTENTS( - "error: uppercase hex not allowed: 0xAb\n" - ); -} - -//:: helpers - -bool all_hex_bytes(const line& inst) { - for (int i = 0; i < SIZE(inst.words); ++i) - if (!is_hex_byte(inst.words.at(i))) - return false; - return true; -} - -bool is_hex_byte(const word& curr) { - if (contains_any_operand_metadata(curr)) - return false; - if (SIZE(curr.data) != 2) - return false; - if (curr.data.find_first_not_of("0123456789abcdef") != string::npos) - return false; - return true; -} - -bool contains_any_operand_metadata(const word& word) { - for (int i = 0; i < SIZE(word.metadata); ++i) - if (Instruction_operands.find(word.metadata.at(i)) != Instruction_operands.end()) - return true; - return false; -} - -bool has_operand_metadata(const line& inst, const string& m) { - bool result = false; - for (int i = 0; i < SIZE(inst.words); ++i) { - if (!has_operand_metadata(inst.words.at(i), m)) continue; - if (result) { - raise << "'" << to_string(inst) << "' has conflicting " << m << " operands\n" << end(); - return false; - } - result = true; - } - return result; -} - -bool has_operand_metadata(const word& w, const string& m) { - bool result = false; - bool metadata_found = false; - for (int i = 0; i < SIZE(w.metadata); ++i) { - const string& curr = w.metadata.at(i); - if (Instruction_operands.find(curr) == Instruction_operands.end()) continue; // ignore unrecognized metadata - if (metadata_found) { - raise << "'" << w.original << "' has conflicting operand types; it should have only one\n" << end(); - return false; - } - metadata_found = true; - result = (curr == m); - } - return result; -} - -word metadata(const line& inst, const string& m) { - for (int i = 0; i < SIZE(inst.words); ++i) - if (has_operand_metadata(inst.words.at(i), m)) - return inst.words.at(i); - assert(false); -} - -bool looks_like_hex_int(const string& s) { - if (s.empty()) return false; - if (s.at(0) == '-' || s.at(0) == '+') return true; - if (isdigit(s.at(0))) return true; // includes '0x' prefix - // End looks_like_hex_int(s) Detectors - return false; -} - -string to_string(const line& inst) { - ostringstream out; - for (int i = 0; i < SIZE(inst.words); ++i) { - if (i > 0) out << ' '; - out << inst.words.at(i).original; - } - return out.str(); -} - -int32_t parse_int(const string& s) { - if (s.empty()) return 0; - if (contains_uppercase(s)) { - raise << "uppercase hex not allowed: " << s << '\n' << end(); - return 0; - } - istringstream in(s); - in >> std::hex; - if (s.at(0) == '-') { - int32_t result = 0; - in >> result; - if (!in || !in.eof()) { - raise << "not a number: " << s << '\n' << end(); - return 0; - } - return result; - } - uint32_t uresult = 0; - in >> uresult; - if (!in || !in.eof()) { - raise << "not a number: " << s << '\n' << end(); - return 0; - } - return static_cast(uresult); -} -:(before "End Unit Tests") -void test_parse_int() { - CHECK_EQ(0, parse_int("0")); - CHECK_EQ(0, parse_int("0x0")); - CHECK_EQ(0, parse_int("0x0")); - CHECK_EQ(16, parse_int("10")); // hex always - CHECK_EQ(-1, parse_int("-1")); - CHECK_EQ(-1, parse_int("0xffffffff")); -} diff --git a/030---translate.cc b/030---translate.cc new file mode 100644 index 00000000..9737834e --- /dev/null +++ b/030---translate.cc @@ -0,0 +1,213 @@ +//: The bedrock level 1 of abstraction is now done, and we're going to start +//: building levels above it that make programming in x86 machine code a +//: little more ergonomic. +//: +//: All levels will be "pass through by default". Whatever they don't +//: understand they will silently pass through to lower levels. +//: +//: Since raw hex bytes of machine code are always possible to inject, SubX is +//: not a language, and we aren't building a compiler. This is something +//: deliberately leakier. Levels are more for improving auditing, checks and +//: error messages rather than for hiding low-level details. + +//: Translator workflow: read 'source' file. Run a series of transforms on it, +//: each passing through what it doesn't understand. The final program should +//: be just machine code, suitable to write to an ELF binary. +//: +//: Higher levels usually transform code on the basis of metadata. + +:(before "End Main") +if (is_equal(argv[1], "translate")) { + // Outside of tests, traces must be explicitly requested. + if (Trace_file.is_open()) Trace_stream = new trace_stream; + reset(); + // Begin subx translate + program p; + string output_filename; + for (int i = /*skip 'subx translate'*/2; i < argc; ++i) { + if (is_equal(argv[i], "-o")) { + ++i; + if (i >= argc) { + print_translate_usage(); + cerr << "'-o' must be followed by a filename to write results to\n"; + exit(1); + } + output_filename = argv[i]; + } + else { + trace(2, "parse") << argv[i] << end(); + ifstream fin(argv[i]); + if (!fin) { + cerr << "could not open " << argv[i] << '\n'; + return 1; + } + parse(fin, p); + if (trace_contains_errors()) return 1; + } + } + if (p.segments.empty()) { + print_translate_usage(); + cerr << "nothing to do; must provide at least one file to read\n"; + exit(1); + } + if (output_filename.empty()) { + print_translate_usage(); + cerr << "must provide a filename to write to using '-o'\n"; + exit(1); + } + trace(2, "transform") << "begin" << end(); + transform(p); + if (trace_contains_errors()) return 1; + trace(2, "translate") << "begin" << end(); + save_elf(p, output_filename); + if (trace_contains_errors()) { + unlink(output_filename.c_str()); + return 1; + } + // End subx translate + return 0; +} + +:(code) +void print_translate_usage() { + cerr << "Usage: subx translate file1 file2 ... -o output\n"; +} + +// write out a program to a bare-bones ELF file +void save_elf(const program& p, const string& filename) { + ofstream out(filename.c_str(), ios::binary); + save_elf(p, out); + out.close(); +} + +void save_elf(const program& p, ostream& out) { + // validation: stay consistent with the self-hosted translator + if (p.entry == 0) { + raise << "no 'Entry' label found\n" << end(); + return; + } + if (find(p, "data") == NULL) { + raise << "must include a 'data' segment\n" << end(); + return; + } + // processing + write_elf_header(out, p); + for (size_t i = 0; i < p.segments.size(); ++i) + write_segment(p.segments.at(i), out); +} + +void write_elf_header(ostream& out, const program& p) { + char c = '\0'; +#define O(X) c = (X); out.write(&c, sizeof(c)) +// host is required to be little-endian +#define emit(X) out.write(reinterpret_cast(&X), sizeof(X)) + //// ehdr + // e_ident + O(0x7f); O(/*E*/0x45); O(/*L*/0x4c); O(/*F*/0x46); + O(0x1); // 32-bit format + O(0x1); // little-endian + O(0x1); O(0x0); + for (size_t i = 0; i < 8; ++i) { O(0x0); } + // e_type + O(0x02); O(0x00); + // e_machine + O(0x03); O(0x00); + // e_version + O(0x01); O(0x00); O(0x00); O(0x00); + // e_entry + uint32_t e_entry = p.entry; + // Override e_entry + emit(e_entry); + // e_phoff -- immediately after ELF header + uint32_t e_phoff = 0x34; + emit(e_phoff); + // e_shoff; unused + uint32_t dummy32 = 0; + emit(dummy32); + // e_flags; unused + emit(dummy32); + // e_ehsize + uint16_t e_ehsize = 0x34; + emit(e_ehsize); + // e_phentsize + uint16_t e_phentsize = 0x20; + emit(e_phentsize); + // e_phnum + uint16_t e_phnum = SIZE(p.segments); + emit(e_phnum); + // e_shentsize + uint16_t dummy16 = 0x0; + emit(dummy16); + // e_shnum + emit(dummy16); + // e_shstrndx + emit(dummy16); + + uint32_t p_offset = /*size of ehdr*/0x34 + SIZE(p.segments)*0x20/*size of each phdr*/; + for (int i = 0; i < SIZE(p.segments); ++i) { + const segment& curr = p.segments.at(i); + //// phdr + // p_type + uint32_t p_type = 0x1; + emit(p_type); + // p_offset + emit(p_offset); + // p_vaddr + uint32_t p_start = curr.start; + emit(p_start); + // p_paddr + emit(p_start); + // p_filesz + uint32_t size = num_words(curr); + assert(p_offset + size < SEGMENT_ALIGNMENT); + emit(size); + // p_memsz + emit(size); + // p_flags + uint32_t p_flags = (curr.name == "code") ? /*r-x*/0x5 : /*rw-*/0x6; + emit(p_flags); + + // p_align + // "As the system creates or augments a process image, it logically copies + // a file's segment to a virtual memory segment. When—and if— the system + // physically reads the file depends on the program's execution behavior, + // system load, and so on. A process does not require a physical page + // unless it references the logical page during execution, and processes + // commonly leave many pages unreferenced. Therefore delaying physical + // reads frequently obviates them, improving system performance. To obtain + // this efficiency in practice, executable and shared object files must + // have segment images whose file offsets and virtual addresses are + // congruent, modulo the page size." -- http://refspecs.linuxbase.org/elf/elf.pdf (page 95) + uint32_t p_align = 0x1000; // default page size on linux + emit(p_align); + if (p_offset % p_align != p_start % p_align) { + raise << "segment starting at 0x" << HEXWORD << p_start << " is improperly aligned; alignment for p_offset " << p_offset << " should be " << (p_offset % p_align) << " but is " << (p_start % p_align) << '\n' << end(); + return; + } + + // prepare for next segment + p_offset += size; + } +#undef O +#undef emit +} + +void write_segment(const segment& s, ostream& out) { + for (int i = 0; i < SIZE(s.lines); ++i) { + const vector& w = s.lines.at(i).words; + for (int j = 0; j < SIZE(w); ++j) { + uint8_t x = hex_byte(w.at(j).data); // we're done with metadata by this point + out.write(reinterpret_cast(&x), /*sizeof(byte)*/1); + } + } +} + +uint32_t num_words(const segment& s) { + uint32_t sum = 0; + for (int i = 0; i < SIZE(s.lines); ++i) + sum += SIZE(s.lines.at(i).words); + return sum; +} + +:(before "End Includes") +using std::ios; diff --git a/031check_operands.cc b/031check_operands.cc deleted file mode 100644 index bf5d3719..00000000 --- a/031check_operands.cc +++ /dev/null @@ -1,691 +0,0 @@ -//: Since we're tagging operands with their types, let's start checking these -//: operand types for each instruction. - -void test_check_missing_imm8_operand() { - Hide_errors = true; - run( - "== code 0x1\n" - "cd\n" // interrupt ?? - ); - CHECK_TRACE_CONTENTS( - "error: 'cd' (software interrupt): missing imm8 operand\n" - ); -} - -:(before "Pack Operands(segment code)") -check_operands(code); -if (trace_contains_errors()) return; - -:(code) -void check_operands(const segment& code) { - trace(3, "transform") << "-- check operands" << end(); - for (int i = 0; i < SIZE(code.lines); ++i) { - check_operands(code.lines.at(i)); - if (trace_contains_errors()) return; // stop at the first mal-formed instruction - } -} - -void check_operands(const line& inst) { - word op = preprocess_op(inst.words.at(0)); - if (op.data == "0f") { - check_operands_0f(inst); - return; - } - if (op.data == "f3") { - check_operands_f3(inst); - return; - } - check_operands(inst, op); -} - -word preprocess_op(word/*copy*/ op) { - op.data = tolower(op.data.c_str()); - // opcodes can't be negative - if (starts_with(op.data, "0x")) - op.data = op.data.substr(2); - if (SIZE(op.data) == 1) - op.data = string("0")+op.data; - return op; -} - -void test_preprocess_op() { - word w1; w1.data = "0xf"; - word w2; w2.data = "0f"; - CHECK_EQ(preprocess_op(w1).data, preprocess_op(w2).data); -} - -//: To check the operands for an opcode, we'll track the permitted operands -//: for each supported opcode in a bitvector. That way we can often compute the -//: 'received' operand bitvector for each instruction's operands and compare -//: it with the 'expected' bitvector. -//: -//: The 'expected' and 'received' bitvectors can be different; the MODRM bit -//: in the 'expected' bitvector maps to multiple 'received' operand types in -//: an instruction. We deal in expected bitvectors throughout. - -:(before "End Types") -enum expected_operand_type { - // start from the least significant bit - MODRM, // more complex, may also involve disp8 or disp32 - SUBOP, - DISP8, - DISP16, - DISP32, - IMM8, - IMM32, - NUM_OPERAND_TYPES -}; -:(before "End Globals") -vector Operand_type_name; -map Operand_type; -:(before "End One-time Setup") -init_op_types(); -:(code) -void init_op_types() { - assert(NUM_OPERAND_TYPES <= /*bits in a uint8_t*/8); - Operand_type_name.resize(NUM_OPERAND_TYPES); - #define DEF(type) Operand_type_name.at(type) = tolower(#type), put(Operand_type, tolower(#type), type); - DEF(MODRM); - DEF(SUBOP); - DEF(DISP8); - DEF(DISP16); - DEF(DISP32); - DEF(IMM8); - DEF(IMM32); - #undef DEF -} - -:(before "End Globals") -map Permitted_operands; -const uint8_t INVALID_OPERANDS = 0xff; // no instruction uses all the operand types -:(before "End One-time Setup") -init_permitted_operands(); -:(code) -void init_permitted_operands() { - //// Class A: just op, no operands - // halt - put(Permitted_operands, "f4", 0x00); - // inc - put(Permitted_operands, "40", 0x00); - put(Permitted_operands, "41", 0x00); - put(Permitted_operands, "42", 0x00); - put(Permitted_operands, "43", 0x00); - put(Permitted_operands, "44", 0x00); - put(Permitted_operands, "45", 0x00); - put(Permitted_operands, "46", 0x00); - put(Permitted_operands, "47", 0x00); - // dec - put(Permitted_operands, "48", 0x00); - put(Permitted_operands, "49", 0x00); - put(Permitted_operands, "4a", 0x00); - put(Permitted_operands, "4b", 0x00); - put(Permitted_operands, "4c", 0x00); - put(Permitted_operands, "4d", 0x00); - put(Permitted_operands, "4e", 0x00); - put(Permitted_operands, "4f", 0x00); - // push - put(Permitted_operands, "50", 0x00); - put(Permitted_operands, "51", 0x00); - put(Permitted_operands, "52", 0x00); - put(Permitted_operands, "53", 0x00); - put(Permitted_operands, "54", 0x00); - put(Permitted_operands, "55", 0x00); - put(Permitted_operands, "56", 0x00); - put(Permitted_operands, "57", 0x00); - // pop - put(Permitted_operands, "58", 0x00); - put(Permitted_operands, "59", 0x00); - put(Permitted_operands, "5a", 0x00); - put(Permitted_operands, "5b", 0x00); - put(Permitted_operands, "5c", 0x00); - put(Permitted_operands, "5d", 0x00); - put(Permitted_operands, "5e", 0x00); - put(Permitted_operands, "5f", 0x00); - // sign-extend EAX into EDX - put(Permitted_operands, "99", 0x00); - // return - put(Permitted_operands, "c3", 0x00); - - //// Class B: just op and disp8 - // imm32 imm8 disp32 |disp16 disp8 subop modrm - // 0 0 0 |0 1 0 0 - - // jump - put(Permitted_operands, "eb", 0x04); - put(Permitted_operands, "72", 0x04); - put(Permitted_operands, "73", 0x04); - put(Permitted_operands, "74", 0x04); - put(Permitted_operands, "75", 0x04); - put(Permitted_operands, "76", 0x04); - put(Permitted_operands, "77", 0x04); - put(Permitted_operands, "7c", 0x04); - put(Permitted_operands, "7d", 0x04); - put(Permitted_operands, "7e", 0x04); - put(Permitted_operands, "7f", 0x04); - - //// Class D: just op and disp32 - // imm32 imm8 disp32 |disp16 disp8 subop modrm - // 0 0 1 |0 0 0 0 - put(Permitted_operands, "e8", 0x10); // call - put(Permitted_operands, "e9", 0x10); // jump - - //// Class E: just op and imm8 - // imm32 imm8 disp32 |disp16 disp8 subop modrm - // 0 1 0 |0 0 0 0 - put(Permitted_operands, "cd", 0x20); // software interrupt - - //// Class F: just op and imm32 - // imm32 imm8 disp32 |disp16 disp8 subop modrm - // 1 0 0 |0 0 0 0 - put(Permitted_operands, "05", 0x40); // add - put(Permitted_operands, "2d", 0x40); // subtract - put(Permitted_operands, "25", 0x40); // and - put(Permitted_operands, "0d", 0x40); // or - put(Permitted_operands, "35", 0x40); // xor - put(Permitted_operands, "3d", 0x40); // compare - put(Permitted_operands, "68", 0x40); // push - // copy - put(Permitted_operands, "b8", 0x40); - put(Permitted_operands, "b9", 0x40); - put(Permitted_operands, "ba", 0x40); - put(Permitted_operands, "bb", 0x40); - put(Permitted_operands, "bc", 0x40); - put(Permitted_operands, "bd", 0x40); - put(Permitted_operands, "be", 0x40); - put(Permitted_operands, "bf", 0x40); - - //// Class M: using ModR/M byte - // imm32 imm8 disp32 |disp16 disp8 subop modrm - // 0 0 0 |0 0 0 1 - - // add - put(Permitted_operands, "01", 0x01); - put(Permitted_operands, "03", 0x01); - // subtract - put(Permitted_operands, "29", 0x01); - put(Permitted_operands, "2b", 0x01); - // and - put(Permitted_operands, "21", 0x01); - put(Permitted_operands, "23", 0x01); - // or - put(Permitted_operands, "09", 0x01); - put(Permitted_operands, "0b", 0x01); - // xor - put(Permitted_operands, "31", 0x01); - put(Permitted_operands, "33", 0x01); - // compare - put(Permitted_operands, "39", 0x01); - put(Permitted_operands, "3b", 0x01); - // copy - put(Permitted_operands, "88", 0x01); - put(Permitted_operands, "89", 0x01); - put(Permitted_operands, "8a", 0x01); - put(Permitted_operands, "8b", 0x01); - // swap - put(Permitted_operands, "87", 0x01); - // copy address (lea) - put(Permitted_operands, "8d", 0x01); - - //// Class N: op, ModR/M and subop (not r32) - // imm32 imm8 disp32 |disp16 disp8 subop modrm - // 0 0 0 |0 0 1 1 - put(Permitted_operands, "8f", 0x03); // pop - put(Permitted_operands, "d3", 0x03); // shift - put(Permitted_operands, "f7", 0x03); // test/not/mul/div - put(Permitted_operands, "ff", 0x03); // jump/push/call - - //// Class O: op, ModR/M, subop (not r32) and imm8 - // imm32 imm8 disp32 |disp16 disp8 subop modrm - // 0 1 0 |0 0 1 1 - put(Permitted_operands, "c1", 0x23); // combine - put(Permitted_operands, "c6", 0x23); // copy - - //// Class P: op, ModR/M, subop (not r32) and imm32 - // imm32 imm8 disp32 |disp16 disp8 subop modrm - // 1 0 0 |0 0 1 1 - put(Permitted_operands, "81", 0x43); // combine - put(Permitted_operands, "c7", 0x43); // copy - - // End Init Permitted Operands -} - -#define HAS(bitvector, bit) ((bitvector) & (1 << (bit))) -#define SET(bitvector, bit) ((bitvector) | (1 << (bit))) -#define CLEAR(bitvector, bit) ((bitvector) & (~(1 << (bit)))) - -void check_operands(const line& inst, const word& op) { - if (!is_hex_byte(op)) return; - uint8_t expected_bitvector = get(Permitted_operands, op.data); - if (HAS(expected_bitvector, MODRM)) { - check_operands_modrm(inst, op); - compare_bitvector_modrm(inst, expected_bitvector, op); - } - else { - compare_bitvector(inst, expected_bitvector, op); - } -} - -//: Many instructions can be checked just by comparing bitvectors. - -void compare_bitvector(const line& inst, uint8_t expected, const word& op) { - if (all_hex_bytes(inst) && has_operands(inst)) return; // deliberately programming in raw hex; we'll raise a warning elsewhere - uint8_t bitvector = compute_expected_operand_bitvector(inst); - if (trace_contains_errors()) return; // duplicate operand type - if (bitvector == expected) return; // all good with this instruction - for (int i = 0; i < NUM_OPERAND_TYPES; ++i, bitvector >>= 1, expected >>= 1) { -//? cerr << "comparing " << HEXBYTE << NUM(bitvector) << " with " << NUM(expected) << '\n'; - if ((bitvector & 0x1) == (expected & 0x1)) continue; // all good with this operand - const string& optype = Operand_type_name.at(i); - if ((bitvector & 0x1) > (expected & 0x1)) - raise << "'" << to_string(inst) << "'" << maybe_name(op) << ": unexpected " << optype << " operand\n" << end(); - else - raise << "'" << to_string(inst) << "'" << maybe_name(op) << ": missing " << optype << " operand\n" << end(); - // continue giving all errors for a single instruction - } - // ignore settings in any unused bits -} - -string maybe_name(const word& op) { - if (!is_hex_byte(op)) return ""; - if (!contains_key(Name, op.data)) return ""; - // strip stuff in parens from the name - const string& s = get(Name, op.data); - return " ("+s.substr(0, s.find(" ("))+')'; -} - -uint32_t compute_expected_operand_bitvector(const line& inst) { - set operands_found; - uint32_t bitvector = 0; - for (int i = /*skip op*/1; i < SIZE(inst.words); ++i) { - bitvector = bitvector | expected_bit_for_received_operand(inst.words.at(i), operands_found, inst); - if (trace_contains_errors()) return INVALID_OPERANDS; // duplicate operand type - } - return bitvector; -} - -bool has_operands(const line& inst) { - return SIZE(inst.words) > first_operand(inst); -} - -int first_operand(const line& inst) { - if (inst.words.at(0).data == "0f") return 2; - if (inst.words.at(0).data == "f2" || inst.words.at(0).data == "f3") { - if (inst.words.at(1).data == "0f") - return 3; - else - return 2; - } - return 1; -} - -// Scan the metadata of 'w' and return the expected bit corresponding to any operand type. -// Also raise an error if metadata contains multiple operand types. -uint32_t expected_bit_for_received_operand(const word& w, set& instruction_operands, const line& inst) { - uint32_t bv = 0; - bool found = false; - for (int i = 0; i < SIZE(w.metadata); ++i) { - string/*copy*/ curr = w.metadata.at(i); - string expected_metadata = curr; - if (curr == "mod" || curr == "rm32" || curr == "r32" || curr == "scale" || curr == "index" || curr == "base") - expected_metadata = "modrm"; - else if (!contains_key(Operand_type, curr)) continue; // ignore unrecognized metadata - if (found) { - raise << "'" << w.original << "' has conflicting operand types; it should have only one\n" << end(); - return INVALID_OPERANDS; - } - if (instruction_operands.find(curr) != instruction_operands.end()) { - raise << "'" << to_string(inst) << "': duplicate " << curr << " operand\n" << end(); - return INVALID_OPERANDS; - } - instruction_operands.insert(curr); - bv = (1 << get(Operand_type, expected_metadata)); - found = true; - } - return bv; -} - -void test_conflicting_operand_type() { - Hide_errors = true; - run( - "== code 0x1\n" - "cd/software-interrupt 80/imm8/imm32\n" - ); - CHECK_TRACE_CONTENTS( - "error: '80/imm8/imm32' has conflicting operand types; it should have only one\n" - ); -} - -//: Instructions computing effective addresses have more complex rules, so -//: we'll hard-code a common set of instruction-decoding rules. - -void test_check_missing_mod_operand() { - Hide_errors = true; - run( - "== code 0x1\n" - "81 0/add/subop 3/rm32/ebx 1/imm32\n" - ); - CHECK_TRACE_CONTENTS( - "error: '81 0/add/subop 3/rm32/ebx 1/imm32' (combine rm32 with imm32 based on subop): missing mod operand\n" - ); -} - -void check_operands_modrm(const line& inst, const word& op) { - if (all_hex_bytes(inst)) return; // deliberately programming in raw hex; we'll raise a warning elsewhere - check_operand_metadata_present(inst, "mod", op); - check_operand_metadata_present(inst, "rm32", op); - // no check for r32; some instructions don't use it; just assume it's 0 if missing - if (op.data == "81" || op.data == "8f" || op.data == "ff") { // keep sync'd with 'help subop' - check_operand_metadata_present(inst, "subop", op); - check_operand_metadata_absent(inst, "r32", op, "should be replaced by subop"); - } - if (trace_contains_errors()) return; - if (metadata(inst, "rm32").data != "4") return; - // SIB byte checks - uint8_t mod = hex_byte(metadata(inst, "mod").data); - if (mod != /*direct*/3) { - check_operand_metadata_present(inst, "base", op); - check_operand_metadata_present(inst, "index", op); // otherwise why go to SIB? - } - else { - check_operand_metadata_absent(inst, "base", op, "direct mode"); - check_operand_metadata_absent(inst, "index", op, "direct mode"); - } - // no check for scale; 0 (2**0 = 1) by default -} - -// same as compare_bitvector, with one additional exception for modrm-based -// instructions: they may use an extra displacement on occasion -void compare_bitvector_modrm(const line& inst, uint8_t expected, const word& op) { - if (all_hex_bytes(inst) && has_operands(inst)) return; // deliberately programming in raw hex; we'll raise a warning elsewhere - uint8_t bitvector = compute_expected_operand_bitvector(inst); - if (trace_contains_errors()) return; // duplicate operand type - // update 'expected' bitvector for the additional exception - if (has_operand_metadata(inst, "mod")) { - int32_t mod = parse_int(metadata(inst, "mod").data); - switch (mod) { - case 0: - if (has_operand_metadata(inst, "rm32") && parse_int(metadata(inst, "rm32").data) == 5) - expected |= (1<>= 1, expected >>= 1) { -//? cerr << "comparing for modrm " << HEXBYTE << NUM(bitvector) << " with " << NUM(expected) << '\n'; - if ((bitvector & 0x1) == (expected & 0x1)) continue; // all good with this operand - const string& optype = Operand_type_name.at(i); - if ((bitvector & 0x1) > (expected & 0x1)) - raise << "'" << to_string(inst) << "'" << maybe_name(op) << ": unexpected " << optype << " operand\n" << end(); - else - raise << "'" << to_string(inst) << "'" << maybe_name(op) << ": missing " << optype << " operand\n" << end(); - // continue giving all errors for a single instruction - } - // ignore settings in any unused bits -} - -void check_operand_metadata_present(const line& inst, const string& type, const word& op) { - if (!has_operand_metadata(inst, type)) - raise << "'" << to_string(inst) << "'" << maybe_name(op) << ": missing " << type << " operand\n" << end(); -} - -void check_operand_metadata_absent(const line& inst, const string& type, const word& op, const string& msg) { - if (has_operand_metadata(inst, type)) - raise << "'" << to_string(inst) << "'" << maybe_name(op) << ": unexpected " << type << " operand (" << msg << ")\n" << end(); -} - -void test_modrm_with_displacement() { - Reg[EAX].u = 0x1; - transform( - "== code 0x1\n" - // just avoid null pointer - "8b/copy 1/mod/lookup+disp8 0/rm32/EAX 2/r32/EDX 4/disp8\n" // copy *(EAX+4) to EDX - ); - CHECK_TRACE_COUNT("error", 0); -} - -void test_check_missing_disp8() { - Hide_errors = true; - transform( - "== code 0x1\n" - "89/copy 1/mod/lookup+disp8 0/rm32/EAX 1/r32/ECX\n" // missing disp8 - ); - CHECK_TRACE_CONTENTS( - "error: '89/copy 1/mod/lookup+disp8 0/rm32/EAX 1/r32/ECX' (copy r32 to rm32): missing disp8 operand\n" - ); -} - -void test_check_missing_disp32() { - Hide_errors = true; - transform( - "== code 0x1\n" - "8b/copy 0/mod/indirect 5/rm32/.disp32 2/r32/EDX\n" // missing disp32 - ); - CHECK_TRACE_CONTENTS( - "error: '8b/copy 0/mod/indirect 5/rm32/.disp32 2/r32/EDX' (copy rm32 to r32): missing disp32 operand\n" - ); -} - -void test_conflicting_operands_in_modrm_instruction() { - Hide_errors = true; - run( - "== code 0x1\n" - "01/add 0/mod 3/mod\n" - ); - CHECK_TRACE_CONTENTS( - "error: '01/add 0/mod 3/mod' has conflicting mod operands\n" - ); -} - -void test_conflicting_operand_type_modrm() { - Hide_errors = true; - run( - "== code 0x1\n" - "01/add 0/mod 3/rm32/r32\n" - ); - CHECK_TRACE_CONTENTS( - "error: '3/rm32/r32' has conflicting operand types; it should have only one\n" - ); -} - -void test_check_missing_rm32_operand() { - Hide_errors = true; - run( - "== code 0x1\n" - "81 0/add/subop 0/mod 1/imm32\n" - ); - CHECK_TRACE_CONTENTS( - "error: '81 0/add/subop 0/mod 1/imm32' (combine rm32 with imm32 based on subop): missing rm32 operand\n" - ); -} - -void test_check_missing_subop_operand() { - Hide_errors = true; - run( - "== code 0x1\n" - "81 0/mod 3/rm32/ebx 1/imm32\n" - ); - CHECK_TRACE_CONTENTS( - "error: '81 0/mod 3/rm32/ebx 1/imm32' (combine rm32 with imm32 based on subop): missing subop operand\n" - ); -} - -void test_check_missing_base_operand() { - Hide_errors = true; - run( - "== code 0x1\n" - "81 0/add/subop 0/mod/indirect 4/rm32/use-sib 1/imm32\n" - ); - CHECK_TRACE_CONTENTS( - "error: '81 0/add/subop 0/mod/indirect 4/rm32/use-sib 1/imm32' (combine rm32 with imm32 based on subop): missing base operand\n" - ); -} - -void test_check_missing_index_operand() { - Hide_errors = true; - run( - "== code 0x1\n" - "81 0/add/subop 0/mod/indirect 4/rm32/use-sib 0/base 1/imm32\n" - ); - CHECK_TRACE_CONTENTS( - "error: '81 0/add/subop 0/mod/indirect 4/rm32/use-sib 0/base 1/imm32' (combine rm32 with imm32 based on subop): missing index operand\n" - ); -} - -void test_check_missing_base_operand_2() { - Hide_errors = true; - run( - "== code 0x1\n" - "81 0/add/subop 0/mod/indirect 4/rm32/use-sib 2/index 3/scale 1/imm32\n" - ); - CHECK_TRACE_CONTENTS( - "error: '81 0/add/subop 0/mod/indirect 4/rm32/use-sib 2/index 3/scale 1/imm32' (combine rm32 with imm32 based on subop): missing base operand\n" - ); -} - -void test_check_extra_displacement() { - Hide_errors = true; - run( - "== code 0x1\n" - "89/copy 0/mod/indirect 0/rm32/EAX 1/r32/ECX 4/disp8\n" - ); - CHECK_TRACE_CONTENTS( - "error: '89/copy 0/mod/indirect 0/rm32/EAX 1/r32/ECX 4/disp8' (copy r32 to rm32): unexpected disp8 operand\n" - ); -} - -void test_check_duplicate_operand() { - Hide_errors = true; - run( - "== code 0x1\n" - "89/copy 0/mod/indirect 0/rm32/EAX 1/r32/ECX 1/r32\n" - ); - CHECK_TRACE_CONTENTS( - "error: '89/copy 0/mod/indirect 0/rm32/EAX 1/r32/ECX 1/r32': duplicate r32 operand\n" - ); -} - -void test_check_base_operand_not_needed_in_direct_mode() { - run( - "== code 0x1\n" - "81 0/add/subop 3/mod/indirect 4/rm32/use-sib 1/imm32\n" - ); - CHECK_TRACE_COUNT("error", 0); -} - -void test_extra_modrm() { - Hide_errors = true; - run( - "== code 0x1\n" - "59/pop-to-ECX 3/mod/direct 1/rm32/ECX 4/r32/ESP\n" - ); - CHECK_TRACE_CONTENTS( - "error: '59/pop-to-ECX 3/mod/direct 1/rm32/ECX 4/r32/ESP' (pop top of stack to ECX): unexpected modrm operand\n" - ); -} - -//:: similarly handle multi-byte opcodes - -void check_operands_0f(const line& inst) { - assert(inst.words.at(0).data == "0f"); - if (SIZE(inst.words) == 1) { - raise << "opcode '0f' requires a second opcode\n" << end(); - return; - } - word op = preprocess_op(inst.words.at(1)); - if (!contains_key(Name_0f, op.data)) { - raise << "unknown 2-byte opcode '0f " << op.data << "'\n" << end(); - return; - } - check_operands_0f(inst, op); -} - -void check_operands_f3(const line& /*unused*/) { - raise << "no supported opcodes starting with f3\n" << end(); -} - -void test_check_missing_disp32_operand() { - Hide_errors = true; - run( - "== code 0x1\n" - " 0f 84 # jmp if ZF to ??\n" - ); - CHECK_TRACE_CONTENTS( - "error: '0f 84' (jump disp32 bytes away if equal, if ZF is set): missing disp32 operand\n" - ); -} - -:(before "End Globals") -map Permitted_operands_0f; -:(before "End Init Permitted Operands") -//// Class D: just op and disp32 -// imm32 imm8 disp32 |disp16 disp8 subop modrm -// 0 0 1 |0 0 0 0 -put_new(Permitted_operands_0f, "82", 0x10); -put_new(Permitted_operands_0f, "83", 0x10); -put_new(Permitted_operands_0f, "84", 0x10); -put_new(Permitted_operands_0f, "85", 0x10); -put_new(Permitted_operands_0f, "86", 0x10); -put_new(Permitted_operands_0f, "87", 0x10); -put_new(Permitted_operands_0f, "8c", 0x10); -put_new(Permitted_operands_0f, "8d", 0x10); -put_new(Permitted_operands_0f, "8e", 0x10); -put_new(Permitted_operands_0f, "8f", 0x10); - -//// Class M: using ModR/M byte -// imm32 imm8 disp32 |disp16 disp8 subop modrm -// 0 0 0 |0 0 0 1 -put_new(Permitted_operands_0f, "af", 0x01); - -:(code) -void check_operands_0f(const line& inst, const word& op) { - uint8_t expected_bitvector = get(Permitted_operands_0f, op.data); - if (HAS(expected_bitvector, MODRM)) - check_operands_modrm(inst, op); - compare_bitvector_0f(inst, CLEAR(expected_bitvector, MODRM), op); -} - -void compare_bitvector_0f(const line& inst, uint8_t expected, const word& op) { - if (all_hex_bytes(inst) && has_operands(inst)) return; // deliberately programming in raw hex; we'll raise a warning elsewhere - uint8_t bitvector = compute_expected_operand_bitvector(inst); - if (trace_contains_errors()) return; // duplicate operand type - if (bitvector == expected) return; // all good with this instruction - for (int i = 0; i < NUM_OPERAND_TYPES; ++i, bitvector >>= 1, expected >>= 1) { -//? cerr << "comparing " << HEXBYTE << NUM(bitvector) << " with " << NUM(expected) << '\n'; - if ((bitvector & 0x1) == (expected & 0x1)) continue; // all good with this operand - const string& optype = Operand_type_name.at(i); - if ((bitvector & 0x1) > (expected & 0x1)) - raise << "'" << to_string(inst) << "'" << maybe_name_0f(op) << ": unexpected " << optype << " operand\n" << end(); - else - raise << "'" << to_string(inst) << "'" << maybe_name_0f(op) << ": missing " << optype << " operand\n" << end(); - // continue giving all errors for a single instruction - } - // ignore settings in any unused bits -} - -string maybe_name_0f(const word& op) { - if (!is_hex_byte(op)) return ""; - if (!contains_key(Name_0f, op.data)) return ""; - // strip stuff in parens from the name - const string& s = get(Name_0f, op.data); - return " ("+s.substr(0, s.find(" ("))+')'; -} - -string tolower(const char* s) { - ostringstream out; - for (/*nada*/; *s; ++s) - out << static_cast(tolower(*s)); - return out.str(); -} - -#undef HAS -#undef SET -#undef CLEAR - -:(before "End Includes") -#include diff --git a/031transforms.cc b/031transforms.cc new file mode 100644 index 00000000..a6e12502 --- /dev/null +++ b/031transforms.cc @@ -0,0 +1,65 @@ +//: Ordering transforms is a well-known hard problem when building compilers. +//: In our case we also have the additional notion of layers. The ordering of +//: layers can have nothing in common with the ordering of transforms when +//: SubX is tangled and run. This can be confusing for readers, particularly +//: if later layers start inserting transforms at arbitrary points between +//: transforms introduced earlier. Over time adding transforms can get harder +//: and harder, having to meet the constraints of everything that's come +//: before. It's worth thinking about organization up-front so the ordering is +//: easy to hold in our heads, and it's obvious where to add a new transform. +//: Some constraints: +//: +//: 1. Layers force us to build SubX bottom-up; since we want to be able to +//: build and run SubX after stopping loading at any layer, the overall +//: organization has to be to introduce primitives before we start using +//: them. +//: +//: 2. Transforms usually need to be run top-down, converting high-level +//: representations to low-level ones so that low-level layers can be +//: oblivious to them. +//: +//: 3. When running we'd often like new representations to be checked before +//: they are transformed away. The whole reason for new representations is +//: often to add new kinds of automatic checking for our machine code +//: programs. +//: +//: Putting these constraints together, we'll use the following broad +//: organization: +//: +//: a) We'll divide up our transforms into "levels", each level consisting +//: of multiple transforms, and dealing in some new set of representational +//: ideas. Levels will be added in reverse order to the one their transforms +//: will be run in. +//: +//: To run all transforms: +//: Load transforms for level n +//: Load transforms for level n-1 +//: ... +//: Load transforms for level 2 +//: Run code at level 1 +//: +//: b) *Within* a level we'll usually introduce transforms in the order +//: they're run in. +//: +//: To run transforms for level n: +//: Perform transform of layer l +//: Perform transform of layer l+1 +//: ... +//: +//: c) Within a level it's often most natural to introduce a new +//: representation by showing how it's transformed to the level below. To +//: make such exceptions more obvious checks usually won't be first-class +//: transforms; instead code that keeps the program unmodified will run +//: within transforms before they mutate the program. As an example: +//: +//: Layer l introduces a transform +//: Layer l+1 adds precondition checks for the transform +//: +//: This may all seem abstract, but will hopefully make sense over time. The +//: goals are basically to always have a working program after any layer, to +//: have the order of layers make narrative sense, and to order transforms +//: correctly at runtime. + +:(before "End One-time Setup") +// Begin Transforms +// End Transforms diff --git a/032---operands.cc b/032---operands.cc new file mode 100644 index 00000000..5203201e --- /dev/null +++ b/032---operands.cc @@ -0,0 +1,539 @@ +//: Beginning of "level 2": tagging bytes with metadata around what field of +//: an x86 instruction they're for. +//: +//: The x86 instruction set is variable-length, and how a byte is interpreted +//: affects later instruction boundaries. A lot of the pain in programming +//: machine code stems from computer and programmer going out of sync on what +//: a byte means. The miscommunication is usually not immediately caught, and +//: metastasizes at runtime into kilobytes of misinterpreted instructions. +//: +//: To mitigate these issues, we'll start programming in terms of logical +//: operands rather than physical bytes. Some operands are smaller than a +//: byte, and others may consist of multiple bytes. This layer will correctly +//: pack and order the bytes corresponding to the operands in an instruction. + +:(before "End Help Texts") +put_new(Help, "instructions", + "Each x86 instruction consists of an instruction or opcode and some number\n" + "of operands.\n" + "Each operand has a type. An instruction won't have more than one operand of\n" + "any type.\n" + "Each instruction has some set of allowed operand types. It'll reject others.\n" + "The complete list of operand types: mod, subop, r32 (register), rm32\n" + "(register or memory), scale, index, base, disp8, disp16, disp32, imm8,\n" + "imm32.\n" + "Each of these has its own help page. Try reading 'subx help mod' next.\n" +); +:(before "End Help Contents") +cerr << " instructions\n"; + +:(code) +void test_pack_immediate_constants() { + run( + "== code 0x1\n" + "bb 0x2a/imm32\n" + ); + CHECK_TRACE_CONTENTS( + "transform: packing instruction 'bb 0x2a/imm32'\n" + "transform: instruction after packing: 'bb 2a 00 00 00'\n" + "run: copy imm32 0x0000002a to EBX\n" + ); +} + +//: complete set of valid operand types + +:(before "End Globals") +set Instruction_operands; +:(before "End One-time Setup") +Instruction_operands.insert("subop"); +Instruction_operands.insert("mod"); +Instruction_operands.insert("rm32"); +Instruction_operands.insert("base"); +Instruction_operands.insert("index"); +Instruction_operands.insert("scale"); +Instruction_operands.insert("r32"); +Instruction_operands.insert("disp8"); +Instruction_operands.insert("disp16"); +Instruction_operands.insert("disp32"); +Instruction_operands.insert("imm8"); +Instruction_operands.insert("imm32"); + +:(before "End Help Texts") +init_operand_type_help(); +:(code) +void init_operand_type_help() { + put(Help, "mod", + "2-bit operand controlling the _addressing mode_ of many instructions,\n" + "to determine how to compute the _effective address_ to look up memory at\n" + "based on the 'rm32' operand and potentially others.\n" + "\n" + "If mod = 3, just operate on the contents of the register specified by rm32\n" + " (direct mode).\n" + "If mod = 2, effective address is usually* rm32 + disp32\n" + " (indirect mode with displacement).\n" + "If mod = 1, effective address is usually* rm32 + disp8\n" + " (indirect mode with displacement).\n" + "If mod = 0, effective address is usually* rm32 (indirect mode).\n" + "(* - The exception is when rm32 is '4'. Register 4 is the stack pointer (ESP).\n" + " Using it as an address gets more involved. For more details,\n" + " try reading the help pages for 'base', 'index' and 'scale'.)\n" + "\n" + "For complete details, spend some time with two tables in the IA-32 software\n" + "developer's manual that are also included in this repo:\n" + " - modrm.pdf: volume 2, table 2-2, \"32-bit addressing with the ModR/M byte.\".\n" + " - sib.pdf: volume 2, table 2-3, \"32-bit addressing with the SIB byte.\".\n" + ); + put(Help, "subop", + "Additional 3-bit operand for determining the instruction when the opcode\n" + "is 81, 8f, d3, f7 or ff.\n" + "Can't coexist with operand of type 'r32' in a single instruction, because\n" + "the two use the same bits.\n" + ); + put(Help, "r32", + "3-bit operand specifying a register operand used directly, without any further addressing modes.\n" + ); + put(Help, "rm32", + "32-bit value in register or memory. The precise details of its construction\n" + "depend on the eponymous 3-bit 'rm32' operand, the 'mod' operand, and also\n" + "potentially the 'SIB' operands ('scale', 'index' and 'base') and a displacement\n" + "('disp8' or 'disp32').\n" + "\n" + "For complete details, spend some time with two tables in the IA-32 software\n" + "developer's manual that are also included in this repo:\n" + " - modrm.pdf: volume 2, table 2-2, \"32-bit addressing with the ModR/M byte.\".\n" + " - sib.pdf: volume 2, table 2-3, \"32-bit addressing with the SIB byte.\".\n" + ); + put(Help, "base", + "Additional 3-bit operand (when 'rm32' is 4, unless 'mod' is 3) specifying the\n" + "register containing an address to look up.\n" + "This address may be further modified by 'index' and 'scale' operands.\n" + " effective address = base + index*scale + displacement (disp8 or disp32)\n" + "For complete details, spend some time with the IA-32 software developer's manual,\n" + "volume 2, table 2-3, \"32-bit addressing with the SIB byte\".\n" + "It is included in this repository as 'sib.pdf'.\n" + ); + put(Help, "index", + "Optional 3-bit operand (when 'rm32' is 4 unless 'mod' is 3) that can be added to\n" + "the 'base' operand to compute the 'effective address' at which to look up memory.\n" + " effective address = base + index*scale + displacement (disp8 or disp32)\n" + "For complete details, spend some time with the IA-32 software developer's manual,\n" + "volume 2, table 2-3, \"32-bit addressing with the SIB byte\".\n" + "It is included in this repository as 'sib.pdf'.\n" + ); + put(Help, "scale", + "Optional 2-bit operand (when 'rm32' is 4 unless 'mod' is 3) that encodes a\n" + "power of 2 to be multiplied to the 'index' operand before adding the result to\n" + "the 'base' operand to compute the _effective address_ to operate on.\n" + " effective address = base + index * scale + displacement (disp8 or disp32)\n" + "\n" + "When scale is 0, use index unmodified.\n" + "When scale is 1, multiply index by 2.\n" + "When scale is 2, multiply index by 4.\n" + "When scale is 3, multiply index by 8.\n" + "\n" + "For complete details, spend some time with the IA-32 software developer's manual,\n" + "volume 2, table 2-3, \"32-bit addressing with the SIB byte\".\n" + "It is included in this repository as 'sib.pdf'.\n" + ); + put(Help, "disp8", + "8-bit value to be added in many instructions.\n" + ); + put(Help, "disp16", + "16-bit value to be added in many instructions.\n" + "Currently not used in any SubX instructions.\n" + ); + put(Help, "disp32", + "32-bit value to be added in many instructions.\n" + ); + put(Help, "imm8", + "8-bit value for many instructions.\n" + ); + put(Help, "imm32", + "32-bit value for many instructions.\n" + ); +} + +//:: transform packing operands into bytes in the right order + +:(after "Begin Transforms") +// Begin Level-2 Transforms +Transform.push_back(pack_operands); +// End Level-2 Transforms + +:(code) +void pack_operands(program& p) { + if (p.segments.empty()) return; + segment& code = *find(p, "code"); + // Pack Operands(segment code) + trace(3, "transform") << "-- pack operands" << end(); + for (int i = 0; i < SIZE(code.lines); ++i) { + line& inst = code.lines.at(i); + if (all_hex_bytes(inst)) continue; + trace(99, "transform") << "packing instruction '" << to_string(/*with metadata*/inst) << "'" << end(); + pack_operands(inst); + trace(99, "transform") << "instruction after packing: '" << to_string(/*without metadata*/inst.words) << "'" << end(); + } +} + +void pack_operands(line& inst) { + line new_inst; + add_opcodes(inst, new_inst); + add_modrm_byte(inst, new_inst); + add_sib_byte(inst, new_inst); + add_disp_bytes(inst, new_inst); + add_imm_bytes(inst, new_inst); + inst.words.swap(new_inst.words); +} + +void add_opcodes(const line& in, line& out) { + out.words.push_back(in.words.at(0)); + if (in.words.at(0).data == "0f" || in.words.at(0).data == "f2" || in.words.at(0).data == "f3") + out.words.push_back(in.words.at(1)); + if (in.words.at(0).data == "f3" && in.words.at(1).data == "0f") + out.words.push_back(in.words.at(2)); + if (in.words.at(0).data == "f2" && in.words.at(1).data == "0f") + out.words.push_back(in.words.at(2)); +} + +void add_modrm_byte(const line& in, line& out) { + uint8_t mod=0, reg_subop=0, rm32=0; + bool emit = false; + for (int i = 0; i < SIZE(in.words); ++i) { + const word& curr = in.words.at(i); + if (has_operand_metadata(curr, "mod")) { + mod = hex_byte(curr.data); + emit = true; + } + else if (has_operand_metadata(curr, "rm32")) { + rm32 = hex_byte(curr.data); + emit = true; + } + else if (has_operand_metadata(curr, "r32")) { + reg_subop = hex_byte(curr.data); + emit = true; + } + else if (has_operand_metadata(curr, "subop")) { + reg_subop = hex_byte(curr.data); + emit = true; + } + } + if (emit) + out.words.push_back(hex_byte_text((mod << 6) | (reg_subop << 3) | rm32)); +} + +void add_sib_byte(const line& in, line& out) { + uint8_t scale=0, index=0, base=0; + bool emit = false; + for (int i = 0; i < SIZE(in.words); ++i) { + const word& curr = in.words.at(i); + if (has_operand_metadata(curr, "scale")) { + scale = hex_byte(curr.data); + emit = true; + } + else if (has_operand_metadata(curr, "index")) { + index = hex_byte(curr.data); + emit = true; + } + else if (has_operand_metadata(curr, "base")) { + base = hex_byte(curr.data); + emit = true; + } + } + if (emit) + out.words.push_back(hex_byte_text((scale << 6) | (index << 3) | base)); +} + +void add_disp_bytes(const line& in, line& out) { + for (int i = 0; i < SIZE(in.words); ++i) { + const word& curr = in.words.at(i); + if (has_operand_metadata(curr, "disp8")) + emit_hex_bytes(out, curr, 1); + if (has_operand_metadata(curr, "disp16")) + emit_hex_bytes(out, curr, 2); + else if (has_operand_metadata(curr, "disp32")) + emit_hex_bytes(out, curr, 4); + } +} + +void add_imm_bytes(const line& in, line& out) { + for (int i = 0; i < SIZE(in.words); ++i) { + const word& curr = in.words.at(i); + if (has_operand_metadata(curr, "imm8")) + emit_hex_bytes(out, curr, 1); + else if (has_operand_metadata(curr, "imm32")) + emit_hex_bytes(out, curr, 4); + } +} + +void emit_hex_bytes(line& out, const word& w, int num) { + assert(num <= 4); + bool is_number = looks_like_hex_int(w.data); + if (num == 1 || !is_number) { + out.words.push_back(w); // preserve existing metadata + if (is_number) + out.words.back().data = hex_byte_to_string(parse_int(w.data)); + return; + } + emit_hex_bytes(out, static_cast(parse_int(w.data)), num); +} + +void emit_hex_bytes(line& out, uint32_t val, int num) { + assert(num <= 4); + for (int i = 0; i < num; ++i) { + out.words.push_back(hex_byte_text(val & 0xff)); + val = val >> 8; + } +} + +word hex_byte_text(uint8_t val) { + word result; + result.data = hex_byte_to_string(val); + result.original = result.data+"/auto"; + return result; +} + +string hex_byte_to_string(uint8_t val) { + ostringstream out; + // uint8_t prints without padding, but int8_t will expand to 32 bits again + out << HEXBYTE << NUM(val); + return out.str(); +} + +string to_string(const vector& in) { + ostringstream out; + for (int i = 0; i < SIZE(in); ++i) { + if (i > 0) out << ' '; + out << in.at(i).data; + } + return out.str(); +} + +:(before "End Unit Tests") +void test_preserve_metadata_when_emitting_single_byte() { + word in; + in.data = "f0"; + in.original = "f0/foo"; + line out; + emit_hex_bytes(out, in, 1); + CHECK_EQ(out.words.at(0).data, "f0"); + CHECK_EQ(out.words.at(0).original, "f0/foo"); +} + +:(code) +void test_pack_disp8() { + run( + "== code 0x1\n" + "74 2/disp8\n" // jump 2 bytes away if ZF is set + ); + CHECK_TRACE_CONTENTS( + "transform: packing instruction '74 2/disp8'\n" + "transform: instruction after packing: '74 02'\n" + ); +} + +void test_pack_disp8_negative() { + transform( + "== code 0x1\n" + // running this will cause an infinite loop + "74 -1/disp8\n" // jump 1 byte before if ZF is set + ); + CHECK_TRACE_CONTENTS( + "transform: packing instruction '74 -1/disp8'\n" + "transform: instruction after packing: '74 ff'\n" + ); +} + +//: helper for scenario +void transform(const string& text_bytes) { + program p; + istringstream in(text_bytes); + parse(in, p); + if (trace_contains_errors()) return; + transform(p); +} + +void test_pack_modrm_imm32() { + run( + "== code 0x1\n" + // instruction effective address operand displacement immediate\n" + // op subop mod rm32 base index scale r32\n" + // 1-3 bytes 3 bits 2 bits 3 bits 3 bits 3 bits 2 bits 2 bits 0/1/2/4 bytes 0/1/2/4 bytes\n" + " 81 0/add/subop 3/mod/direct 3/ebx/rm32 1/imm32 \n" // add 1 to EBX + ); + CHECK_TRACE_CONTENTS( + "transform: packing instruction '81 0/add/subop 3/mod/direct 3/ebx/rm32 1/imm32'\n" + "transform: instruction after packing: '81 c3 01 00 00 00'\n" + ); +} + +void test_pack_imm32_large() { + run( + "== code 0x1\n" + "b9 0x080490a7/imm32\n" + ); + CHECK_TRACE_CONTENTS( + "transform: packing instruction 'b9 0x080490a7/imm32'\n" + "transform: instruction after packing: 'b9 a7 90 04 08'\n" + ); +} + +void test_pack_immediate_constants_hex() { + run( + "== code 0x1\n" + "b9 0x2a/imm32\n" + ); + CHECK_TRACE_CONTENTS( + "transform: packing instruction 'b9 0x2a/imm32'\n" + "transform: instruction after packing: 'b9 2a 00 00 00'\n" + "run: copy imm32 0x0000002a to ECX\n" + ); +} + +void test_pack_silently_ignores_non_hex() { + Hide_errors = true; + transform( + "== code 0x1\n" + "b9 foo/imm32\n" + ); + CHECK_TRACE_CONTENTS( + "transform: packing instruction 'b9 foo/imm32'\n" + // no change (we're just not printing metadata to the trace) + "transform: instruction after packing: 'b9 foo'\n" + ); +} + +void test_pack_flags_bad_hex() { + Hide_errors = true; + run( + "== code 0x1\n" + "b9 0xfoo/imm32\n" + ); + CHECK_TRACE_CONTENTS( + "error: not a number: 0xfoo\n" + ); +} + +void test_pack_flags_uppercase_hex() { + Hide_errors = true; + run( + "== code 0x1\n" + "b9 0xAb/imm32\n" + ); + CHECK_TRACE_CONTENTS( + "error: uppercase hex not allowed: 0xAb\n" + ); +} + +//:: helpers + +bool all_hex_bytes(const line& inst) { + for (int i = 0; i < SIZE(inst.words); ++i) + if (!is_hex_byte(inst.words.at(i))) + return false; + return true; +} + +bool is_hex_byte(const word& curr) { + if (contains_any_operand_metadata(curr)) + return false; + if (SIZE(curr.data) != 2) + return false; + if (curr.data.find_first_not_of("0123456789abcdef") != string::npos) + return false; + return true; +} + +bool contains_any_operand_metadata(const word& word) { + for (int i = 0; i < SIZE(word.metadata); ++i) + if (Instruction_operands.find(word.metadata.at(i)) != Instruction_operands.end()) + return true; + return false; +} + +bool has_operand_metadata(const line& inst, const string& m) { + bool result = false; + for (int i = 0; i < SIZE(inst.words); ++i) { + if (!has_operand_metadata(inst.words.at(i), m)) continue; + if (result) { + raise << "'" << to_string(inst) << "' has conflicting " << m << " operands\n" << end(); + return false; + } + result = true; + } + return result; +} + +bool has_operand_metadata(const word& w, const string& m) { + bool result = false; + bool metadata_found = false; + for (int i = 0; i < SIZE(w.metadata); ++i) { + const string& curr = w.metadata.at(i); + if (Instruction_operands.find(curr) == Instruction_operands.end()) continue; // ignore unrecognized metadata + if (metadata_found) { + raise << "'" << w.original << "' has conflicting operand types; it should have only one\n" << end(); + return false; + } + metadata_found = true; + result = (curr == m); + } + return result; +} + +word metadata(const line& inst, const string& m) { + for (int i = 0; i < SIZE(inst.words); ++i) + if (has_operand_metadata(inst.words.at(i), m)) + return inst.words.at(i); + assert(false); +} + +bool looks_like_hex_int(const string& s) { + if (s.empty()) return false; + if (s.at(0) == '-' || s.at(0) == '+') return true; + if (isdigit(s.at(0))) return true; // includes '0x' prefix + // End looks_like_hex_int(s) Detectors + return false; +} + +string to_string(const line& inst) { + ostringstream out; + for (int i = 0; i < SIZE(inst.words); ++i) { + if (i > 0) out << ' '; + out << inst.words.at(i).original; + } + return out.str(); +} + +int32_t parse_int(const string& s) { + if (s.empty()) return 0; + if (contains_uppercase(s)) { + raise << "uppercase hex not allowed: " << s << '\n' << end(); + return 0; + } + istringstream in(s); + in >> std::hex; + if (s.at(0) == '-') { + int32_t result = 0; + in >> result; + if (!in || !in.eof()) { + raise << "not a number: " << s << '\n' << end(); + return 0; + } + return result; + } + uint32_t uresult = 0; + in >> uresult; + if (!in || !in.eof()) { + raise << "not a number: " << s << '\n' << end(); + return 0; + } + return static_cast(uresult); +} +:(before "End Unit Tests") +void test_parse_int() { + CHECK_EQ(0, parse_int("0")); + CHECK_EQ(0, parse_int("0x0")); + CHECK_EQ(0, parse_int("0x0")); + CHECK_EQ(16, parse_int("10")); // hex always + CHECK_EQ(-1, parse_int("-1")); + CHECK_EQ(-1, parse_int("0xffffffff")); +} diff --git a/032check_operand_bounds.cc b/032check_operand_bounds.cc deleted file mode 100644 index 72a66e3f..00000000 --- a/032check_operand_bounds.cc +++ /dev/null @@ -1,143 +0,0 @@ -//:: Check that the different operands of an instruction aren't too large for their bitfields. - -void test_check_bitfield_sizes() { - Hide_errors = true; - run( - "== code 0x1\n" - "01/add 4/mod 3/rm32 1/r32\n" // add ECX to EBX - ); - CHECK_TRACE_CONTENTS( - "error: '4/mod' too large to fit in bitfield mod\n" - ); -} - -:(before "End Globals") -map Operand_bound; -:(before "End One-time Setup") -put_new(Operand_bound, "subop", 1<<3); -put_new(Operand_bound, "mod", 1<<2); -put_new(Operand_bound, "rm32", 1<<3); -put_new(Operand_bound, "base", 1<<3); -put_new(Operand_bound, "index", 1<<3); -put_new(Operand_bound, "scale", 1<<2); -put_new(Operand_bound, "r32", 1<<3); -put_new(Operand_bound, "disp8", 1<<8); -put_new(Operand_bound, "disp16", 1<<16); -// no bound needed for disp32 -put_new(Operand_bound, "imm8", 1<<8); -// no bound needed for imm32 - -:(before "Pack Operands(segment code)") -check_operand_bounds(code); -if (trace_contains_errors()) return; -:(code) -void check_operand_bounds(const segment& code) { - trace(3, "transform") << "-- check operand bounds" << end(); - for (int i = 0; i < SIZE(code.lines); ++i) { - const line& inst = code.lines.at(i); - for (int j = first_operand(inst); j < SIZE(inst.words); ++j) - check_operand_bounds(inst.words.at(j)); - if (trace_contains_errors()) return; // stop at the first mal-formed instruction - } -} - -void check_operand_bounds(const word& w) { - for (map::iterator p = Operand_bound.begin(); p != Operand_bound.end(); ++p) { - if (!has_operand_metadata(w, p->first)) continue; - if (!looks_like_hex_int(w.data)) continue; // later transforms are on their own to do their own bounds checking - int32_t x = parse_int(w.data); - if (x >= 0) { - if (p->first == "disp8" || p->first == "disp16") { - if (static_cast(x) >= p->second/2) - raise << "'" << w.original << "' too large to fit in signed bitfield " << p->first << '\n' << end(); - } - else { - if (static_cast(x) >= p->second) - raise << "'" << w.original << "' too large to fit in bitfield " << p->first << '\n' << end(); - } - } - else { - // hacky? assuming bound is a power of 2 - if (x < -1*static_cast(p->second/2)) - raise << "'" << w.original << "' too large to fit in bitfield " << p->first << '\n' << end(); - } - } -} - -void test_check_bitfield_sizes_for_imm8() { - run( - "== code 0x1\n" - "c1/shift 4/subop/left 3/mod/direct 1/rm32/ECX 0xff/imm8" // shift EBX left - ); - CHECK(!trace_contains_errors()); -} - -void test_check_bitfield_sizes_for_imm8_error() { - Hide_errors = true; - run( - "== code 0x1\n" - "c1/shift 4/subop/left 3/mod/direct 1/rm32/ECX 0x100/imm8" // shift EBX left - ); - CHECK_TRACE_CONTENTS( - "error: '0x100/imm8' too large to fit in bitfield imm8\n" - ); -} - -void test_check_bitfield_sizes_for_negative_imm8() { - run( - "== code 0x1\n" - "c1/shift 4/subop/left 3/mod/direct 1/rm32/ECX -0x80/imm8" // shift EBX left - ); - CHECK(!trace_contains_errors()); -} - -void test_check_bitfield_sizes_for_negative_imm8_error() { - Hide_errors = true; - run( - "== code 0x1\n" - "c1/shift 4/subop/left 3/mod/direct 1/rm32/ECX -0x81/imm8" // shift EBX left - ); - CHECK_TRACE_CONTENTS( - "error: '-0x81/imm8' too large to fit in bitfield imm8\n" - ); -} - -void test_check_bitfield_sizes_for_disp8() { - // not bothering to run - transform( - "== code 0x1\n" - "01/add 1/mod/*+disp8 3/rm32 1/r32 0x7f/disp8\n" // add ECX to *(EBX+0x7f) - ); - CHECK(!trace_contains_errors()); -} - -void test_check_bitfield_sizes_for_disp8_error() { - Hide_errors = true; - run( - "== code 0x1\n" - "01/add 1/mod/*+disp8 3/rm32 1/r32 0x80/disp8\n" // add ECX to *(EBX+0x80) - ); - CHECK_TRACE_CONTENTS( - "error: '0x80/disp8' too large to fit in signed bitfield disp8\n" - ); -} - -void test_check_bitfield_sizes_for_negative_disp8() { - // not bothering to run - transform( - "== code 0x1\n" - "01/add 1/mod/*+disp8 3/rm32 1/r32 -0x80/disp8\n" // add ECX to *(EBX-0x80) - ); - CHECK(!trace_contains_errors()); -} - -void test_check_bitfield_sizes_for_negative_disp8_error() { - Hide_errors = true; - run( - "== code 0x1\n" - "01/add 1/mod/*+disp8 3/rm32 1/r32 -0x81/disp8\n" // add ECX to *(EBX-0x81) - ); - CHECK_TRACE_CONTENTS( - "error: '-0x81/disp8' too large to fit in bitfield disp8\n" - ); -} diff --git a/033check_operands.cc b/033check_operands.cc new file mode 100644 index 00000000..bf5d3719 --- /dev/null +++ b/033check_operands.cc @@ -0,0 +1,691 @@ +//: Since we're tagging operands with their types, let's start checking these +//: operand types for each instruction. + +void test_check_missing_imm8_operand() { + Hide_errors = true; + run( + "== code 0x1\n" + "cd\n" // interrupt ?? + ); + CHECK_TRACE_CONTENTS( + "error: 'cd' (software interrupt): missing imm8 operand\n" + ); +} + +:(before "Pack Operands(segment code)") +check_operands(code); +if (trace_contains_errors()) return; + +:(code) +void check_operands(const segment& code) { + trace(3, "transform") << "-- check operands" << end(); + for (int i = 0; i < SIZE(code.lines); ++i) { + check_operands(code.lines.at(i)); + if (trace_contains_errors()) return; // stop at the first mal-formed instruction + } +} + +void check_operands(const line& inst) { + word op = preprocess_op(inst.words.at(0)); + if (op.data == "0f") { + check_operands_0f(inst); + return; + } + if (op.data == "f3") { + check_operands_f3(inst); + return; + } + check_operands(inst, op); +} + +word preprocess_op(word/*copy*/ op) { + op.data = tolower(op.data.c_str()); + // opcodes can't be negative + if (starts_with(op.data, "0x")) + op.data = op.data.substr(2); + if (SIZE(op.data) == 1) + op.data = string("0")+op.data; + return op; +} + +void test_preprocess_op() { + word w1; w1.data = "0xf"; + word w2; w2.data = "0f"; + CHECK_EQ(preprocess_op(w1).data, preprocess_op(w2).data); +} + +//: To check the operands for an opcode, we'll track the permitted operands +//: for each supported opcode in a bitvector. That way we can often compute the +//: 'received' operand bitvector for each instruction's operands and compare +//: it with the 'expected' bitvector. +//: +//: The 'expected' and 'received' bitvectors can be different; the MODRM bit +//: in the 'expected' bitvector maps to multiple 'received' operand types in +//: an instruction. We deal in expected bitvectors throughout. + +:(before "End Types") +enum expected_operand_type { + // start from the least significant bit + MODRM, // more complex, may also involve disp8 or disp32 + SUBOP, + DISP8, + DISP16, + DISP32, + IMM8, + IMM32, + NUM_OPERAND_TYPES +}; +:(before "End Globals") +vector Operand_type_name; +map Operand_type; +:(before "End One-time Setup") +init_op_types(); +:(code) +void init_op_types() { + assert(NUM_OPERAND_TYPES <= /*bits in a uint8_t*/8); + Operand_type_name.resize(NUM_OPERAND_TYPES); + #define DEF(type) Operand_type_name.at(type) = tolower(#type), put(Operand_type, tolower(#type), type); + DEF(MODRM); + DEF(SUBOP); + DEF(DISP8); + DEF(DISP16); + DEF(DISP32); + DEF(IMM8); + DEF(IMM32); + #undef DEF +} + +:(before "End Globals") +map Permitted_operands; +const uint8_t INVALID_OPERANDS = 0xff; // no instruction uses all the operand types +:(before "End One-time Setup") +init_permitted_operands(); +:(code) +void init_permitted_operands() { + //// Class A: just op, no operands + // halt + put(Permitted_operands, "f4", 0x00); + // inc + put(Permitted_operands, "40", 0x00); + put(Permitted_operands, "41", 0x00); + put(Permitted_operands, "42", 0x00); + put(Permitted_operands, "43", 0x00); + put(Permitted_operands, "44", 0x00); + put(Permitted_operands, "45", 0x00); + put(Permitted_operands, "46", 0x00); + put(Permitted_operands, "47", 0x00); + // dec + put(Permitted_operands, "48", 0x00); + put(Permitted_operands, "49", 0x00); + put(Permitted_operands, "4a", 0x00); + put(Permitted_operands, "4b", 0x00); + put(Permitted_operands, "4c", 0x00); + put(Permitted_operands, "4d", 0x00); + put(Permitted_operands, "4e", 0x00); + put(Permitted_operands, "4f", 0x00); + // push + put(Permitted_operands, "50", 0x00); + put(Permitted_operands, "51", 0x00); + put(Permitted_operands, "52", 0x00); + put(Permitted_operands, "53", 0x00); + put(Permitted_operands, "54", 0x00); + put(Permitted_operands, "55", 0x00); + put(Permitted_operands, "56", 0x00); + put(Permitted_operands, "57", 0x00); + // pop + put(Permitted_operands, "58", 0x00); + put(Permitted_operands, "59", 0x00); + put(Permitted_operands, "5a", 0x00); + put(Permitted_operands, "5b", 0x00); + put(Permitted_operands, "5c", 0x00); + put(Permitted_operands, "5d", 0x00); + put(Permitted_operands, "5e", 0x00); + put(Permitted_operands, "5f", 0x00); + // sign-extend EAX into EDX + put(Permitted_operands, "99", 0x00); + // return + put(Permitted_operands, "c3", 0x00); + + //// Class B: just op and disp8 + // imm32 imm8 disp32 |disp16 disp8 subop modrm + // 0 0 0 |0 1 0 0 + + // jump + put(Permitted_operands, "eb", 0x04); + put(Permitted_operands, "72", 0x04); + put(Permitted_operands, "73", 0x04); + put(Permitted_operands, "74", 0x04); + put(Permitted_operands, "75", 0x04); + put(Permitted_operands, "76", 0x04); + put(Permitted_operands, "77", 0x04); + put(Permitted_operands, "7c", 0x04); + put(Permitted_operands, "7d", 0x04); + put(Permitted_operands, "7e", 0x04); + put(Permitted_operands, "7f", 0x04); + + //// Class D: just op and disp32 + // imm32 imm8 disp32 |disp16 disp8 subop modrm + // 0 0 1 |0 0 0 0 + put(Permitted_operands, "e8", 0x10); // call + put(Permitted_operands, "e9", 0x10); // jump + + //// Class E: just op and imm8 + // imm32 imm8 disp32 |disp16 disp8 subop modrm + // 0 1 0 |0 0 0 0 + put(Permitted_operands, "cd", 0x20); // software interrupt + + //// Class F: just op and imm32 + // imm32 imm8 disp32 |disp16 disp8 subop modrm + // 1 0 0 |0 0 0 0 + put(Permitted_operands, "05", 0x40); // add + put(Permitted_operands, "2d", 0x40); // subtract + put(Permitted_operands, "25", 0x40); // and + put(Permitted_operands, "0d", 0x40); // or + put(Permitted_operands, "35", 0x40); // xor + put(Permitted_operands, "3d", 0x40); // compare + put(Permitted_operands, "68", 0x40); // push + // copy + put(Permitted_operands, "b8", 0x40); + put(Permitted_operands, "b9", 0x40); + put(Permitted_operands, "ba", 0x40); + put(Permitted_operands, "bb", 0x40); + put(Permitted_operands, "bc", 0x40); + put(Permitted_operands, "bd", 0x40); + put(Permitted_operands, "be", 0x40); + put(Permitted_operands, "bf", 0x40); + + //// Class M: using ModR/M byte + // imm32 imm8 disp32 |disp16 disp8 subop modrm + // 0 0 0 |0 0 0 1 + + // add + put(Permitted_operands, "01", 0x01); + put(Permitted_operands, "03", 0x01); + // subtract + put(Permitted_operands, "29", 0x01); + put(Permitted_operands, "2b", 0x01); + // and + put(Permitted_operands, "21", 0x01); + put(Permitted_operands, "23", 0x01); + // or + put(Permitted_operands, "09", 0x01); + put(Permitted_operands, "0b", 0x01); + // xor + put(Permitted_operands, "31", 0x01); + put(Permitted_operands, "33", 0x01); + // compare + put(Permitted_operands, "39", 0x01); + put(Permitted_operands, "3b", 0x01); + // copy + put(Permitted_operands, "88", 0x01); + put(Permitted_operands, "89", 0x01); + put(Permitted_operands, "8a", 0x01); + put(Permitted_operands, "8b", 0x01); + // swap + put(Permitted_operands, "87", 0x01); + // copy address (lea) + put(Permitted_operands, "8d", 0x01); + + //// Class N: op, ModR/M and subop (not r32) + // imm32 imm8 disp32 |disp16 disp8 subop modrm + // 0 0 0 |0 0 1 1 + put(Permitted_operands, "8f", 0x03); // pop + put(Permitted_operands, "d3", 0x03); // shift + put(Permitted_operands, "f7", 0x03); // test/not/mul/div + put(Permitted_operands, "ff", 0x03); // jump/push/call + + //// Class O: op, ModR/M, subop (not r32) and imm8 + // imm32 imm8 disp32 |disp16 disp8 subop modrm + // 0 1 0 |0 0 1 1 + put(Permitted_operands, "c1", 0x23); // combine + put(Permitted_operands, "c6", 0x23); // copy + + //// Class P: op, ModR/M, subop (not r32) and imm32 + // imm32 imm8 disp32 |disp16 disp8 subop modrm + // 1 0 0 |0 0 1 1 + put(Permitted_operands, "81", 0x43); // combine + put(Permitted_operands, "c7", 0x43); // copy + + // End Init Permitted Operands +} + +#define HAS(bitvector, bit) ((bitvector) & (1 << (bit))) +#define SET(bitvector, bit) ((bitvector) | (1 << (bit))) +#define CLEAR(bitvector, bit) ((bitvector) & (~(1 << (bit)))) + +void check_operands(const line& inst, const word& op) { + if (!is_hex_byte(op)) return; + uint8_t expected_bitvector = get(Permitted_operands, op.data); + if (HAS(expected_bitvector, MODRM)) { + check_operands_modrm(inst, op); + compare_bitvector_modrm(inst, expected_bitvector, op); + } + else { + compare_bitvector(inst, expected_bitvector, op); + } +} + +//: Many instructions can be checked just by comparing bitvectors. + +void compare_bitvector(const line& inst, uint8_t expected, const word& op) { + if (all_hex_bytes(inst) && has_operands(inst)) return; // deliberately programming in raw hex; we'll raise a warning elsewhere + uint8_t bitvector = compute_expected_operand_bitvector(inst); + if (trace_contains_errors()) return; // duplicate operand type + if (bitvector == expected) return; // all good with this instruction + for (int i = 0; i < NUM_OPERAND_TYPES; ++i, bitvector >>= 1, expected >>= 1) { +//? cerr << "comparing " << HEXBYTE << NUM(bitvector) << " with " << NUM(expected) << '\n'; + if ((bitvector & 0x1) == (expected & 0x1)) continue; // all good with this operand + const string& optype = Operand_type_name.at(i); + if ((bitvector & 0x1) > (expected & 0x1)) + raise << "'" << to_string(inst) << "'" << maybe_name(op) << ": unexpected " << optype << " operand\n" << end(); + else + raise << "'" << to_string(inst) << "'" << maybe_name(op) << ": missing " << optype << " operand\n" << end(); + // continue giving all errors for a single instruction + } + // ignore settings in any unused bits +} + +string maybe_name(const word& op) { + if (!is_hex_byte(op)) return ""; + if (!contains_key(Name, op.data)) return ""; + // strip stuff in parens from the name + const string& s = get(Name, op.data); + return " ("+s.substr(0, s.find(" ("))+')'; +} + +uint32_t compute_expected_operand_bitvector(const line& inst) { + set operands_found; + uint32_t bitvector = 0; + for (int i = /*skip op*/1; i < SIZE(inst.words); ++i) { + bitvector = bitvector | expected_bit_for_received_operand(inst.words.at(i), operands_found, inst); + if (trace_contains_errors()) return INVALID_OPERANDS; // duplicate operand type + } + return bitvector; +} + +bool has_operands(const line& inst) { + return SIZE(inst.words) > first_operand(inst); +} + +int first_operand(const line& inst) { + if (inst.words.at(0).data == "0f") return 2; + if (inst.words.at(0).data == "f2" || inst.words.at(0).data == "f3") { + if (inst.words.at(1).data == "0f") + return 3; + else + return 2; + } + return 1; +} + +// Scan the metadata of 'w' and return the expected bit corresponding to any operand type. +// Also raise an error if metadata contains multiple operand types. +uint32_t expected_bit_for_received_operand(const word& w, set& instruction_operands, const line& inst) { + uint32_t bv = 0; + bool found = false; + for (int i = 0; i < SIZE(w.metadata); ++i) { + string/*copy*/ curr = w.metadata.at(i); + string expected_metadata = curr; + if (curr == "mod" || curr == "rm32" || curr == "r32" || curr == "scale" || curr == "index" || curr == "base") + expected_metadata = "modrm"; + else if (!contains_key(Operand_type, curr)) continue; // ignore unrecognized metadata + if (found) { + raise << "'" << w.original << "' has conflicting operand types; it should have only one\n" << end(); + return INVALID_OPERANDS; + } + if (instruction_operands.find(curr) != instruction_operands.end()) { + raise << "'" << to_string(inst) << "': duplicate " << curr << " operand\n" << end(); + return INVALID_OPERANDS; + } + instruction_operands.insert(curr); + bv = (1 << get(Operand_type, expected_metadata)); + found = true; + } + return bv; +} + +void test_conflicting_operand_type() { + Hide_errors = true; + run( + "== code 0x1\n" + "cd/software-interrupt 80/imm8/imm32\n" + ); + CHECK_TRACE_CONTENTS( + "error: '80/imm8/imm32' has conflicting operand types; it should have only one\n" + ); +} + +//: Instructions computing effective addresses have more complex rules, so +//: we'll hard-code a common set of instruction-decoding rules. + +void test_check_missing_mod_operand() { + Hide_errors = true; + run( + "== code 0x1\n" + "81 0/add/subop 3/rm32/ebx 1/imm32\n" + ); + CHECK_TRACE_CONTENTS( + "error: '81 0/add/subop 3/rm32/ebx 1/imm32' (combine rm32 with imm32 based on subop): missing mod operand\n" + ); +} + +void check_operands_modrm(const line& inst, const word& op) { + if (all_hex_bytes(inst)) return; // deliberately programming in raw hex; we'll raise a warning elsewhere + check_operand_metadata_present(inst, "mod", op); + check_operand_metadata_present(inst, "rm32", op); + // no check for r32; some instructions don't use it; just assume it's 0 if missing + if (op.data == "81" || op.data == "8f" || op.data == "ff") { // keep sync'd with 'help subop' + check_operand_metadata_present(inst, "subop", op); + check_operand_metadata_absent(inst, "r32", op, "should be replaced by subop"); + } + if (trace_contains_errors()) return; + if (metadata(inst, "rm32").data != "4") return; + // SIB byte checks + uint8_t mod = hex_byte(metadata(inst, "mod").data); + if (mod != /*direct*/3) { + check_operand_metadata_present(inst, "base", op); + check_operand_metadata_present(inst, "index", op); // otherwise why go to SIB? + } + else { + check_operand_metadata_absent(inst, "base", op, "direct mode"); + check_operand_metadata_absent(inst, "index", op, "direct mode"); + } + // no check for scale; 0 (2**0 = 1) by default +} + +// same as compare_bitvector, with one additional exception for modrm-based +// instructions: they may use an extra displacement on occasion +void compare_bitvector_modrm(const line& inst, uint8_t expected, const word& op) { + if (all_hex_bytes(inst) && has_operands(inst)) return; // deliberately programming in raw hex; we'll raise a warning elsewhere + uint8_t bitvector = compute_expected_operand_bitvector(inst); + if (trace_contains_errors()) return; // duplicate operand type + // update 'expected' bitvector for the additional exception + if (has_operand_metadata(inst, "mod")) { + int32_t mod = parse_int(metadata(inst, "mod").data); + switch (mod) { + case 0: + if (has_operand_metadata(inst, "rm32") && parse_int(metadata(inst, "rm32").data) == 5) + expected |= (1<>= 1, expected >>= 1) { +//? cerr << "comparing for modrm " << HEXBYTE << NUM(bitvector) << " with " << NUM(expected) << '\n'; + if ((bitvector & 0x1) == (expected & 0x1)) continue; // all good with this operand + const string& optype = Operand_type_name.at(i); + if ((bitvector & 0x1) > (expected & 0x1)) + raise << "'" << to_string(inst) << "'" << maybe_name(op) << ": unexpected " << optype << " operand\n" << end(); + else + raise << "'" << to_string(inst) << "'" << maybe_name(op) << ": missing " << optype << " operand\n" << end(); + // continue giving all errors for a single instruction + } + // ignore settings in any unused bits +} + +void check_operand_metadata_present(const line& inst, const string& type, const word& op) { + if (!has_operand_metadata(inst, type)) + raise << "'" << to_string(inst) << "'" << maybe_name(op) << ": missing " << type << " operand\n" << end(); +} + +void check_operand_metadata_absent(const line& inst, const string& type, const word& op, const string& msg) { + if (has_operand_metadata(inst, type)) + raise << "'" << to_string(inst) << "'" << maybe_name(op) << ": unexpected " << type << " operand (" << msg << ")\n" << end(); +} + +void test_modrm_with_displacement() { + Reg[EAX].u = 0x1; + transform( + "== code 0x1\n" + // just avoid null pointer + "8b/copy 1/mod/lookup+disp8 0/rm32/EAX 2/r32/EDX 4/disp8\n" // copy *(EAX+4) to EDX + ); + CHECK_TRACE_COUNT("error", 0); +} + +void test_check_missing_disp8() { + Hide_errors = true; + transform( + "== code 0x1\n" + "89/copy 1/mod/lookup+disp8 0/rm32/EAX 1/r32/ECX\n" // missing disp8 + ); + CHECK_TRACE_CONTENTS( + "error: '89/copy 1/mod/lookup+disp8 0/rm32/EAX 1/r32/ECX' (copy r32 to rm32): missing disp8 operand\n" + ); +} + +void test_check_missing_disp32() { + Hide_errors = true; + transform( + "== code 0x1\n" + "8b/copy 0/mod/indirect 5/rm32/.disp32 2/r32/EDX\n" // missing disp32 + ); + CHECK_TRACE_CONTENTS( + "error: '8b/copy 0/mod/indirect 5/rm32/.disp32 2/r32/EDX' (copy rm32 to r32): missing disp32 operand\n" + ); +} + +void test_conflicting_operands_in_modrm_instruction() { + Hide_errors = true; + run( + "== code 0x1\n" + "01/add 0/mod 3/mod\n" + ); + CHECK_TRACE_CONTENTS( + "error: '01/add 0/mod 3/mod' has conflicting mod operands\n" + ); +} + +void test_conflicting_operand_type_modrm() { + Hide_errors = true; + run( + "== code 0x1\n" + "01/add 0/mod 3/rm32/r32\n" + ); + CHECK_TRACE_CONTENTS( + "error: '3/rm32/r32' has conflicting operand types; it should have only one\n" + ); +} + +void test_check_missing_rm32_operand() { + Hide_errors = true; + run( + "== code 0x1\n" + "81 0/add/subop 0/mod 1/imm32\n" + ); + CHECK_TRACE_CONTENTS( + "error: '81 0/add/subop 0/mod 1/imm32' (combine rm32 with imm32 based on subop): missing rm32 operand\n" + ); +} + +void test_check_missing_subop_operand() { + Hide_errors = true; + run( + "== code 0x1\n" + "81 0/mod 3/rm32/ebx 1/imm32\n" + ); + CHECK_TRACE_CONTENTS( + "error: '81 0/mod 3/rm32/ebx 1/imm32' (combine rm32 with imm32 based on subop): missing subop operand\n" + ); +} + +void test_check_missing_base_operand() { + Hide_errors = true; + run( + "== code 0x1\n" + "81 0/add/subop 0/mod/indirect 4/rm32/use-sib 1/imm32\n" + ); + CHECK_TRACE_CONTENTS( + "error: '81 0/add/subop 0/mod/indirect 4/rm32/use-sib 1/imm32' (combine rm32 with imm32 based on subop): missing base operand\n" + ); +} + +void test_check_missing_index_operand() { + Hide_errors = true; + run( + "== code 0x1\n" + "81 0/add/subop 0/mod/indirect 4/rm32/use-sib 0/base 1/imm32\n" + ); + CHECK_TRACE_CONTENTS( + "error: '81 0/add/subop 0/mod/indirect 4/rm32/use-sib 0/base 1/imm32' (combine rm32 with imm32 based on subop): missing index operand\n" + ); +} + +void test_check_missing_base_operand_2() { + Hide_errors = true; + run( + "== code 0x1\n" + "81 0/add/subop 0/mod/indirect 4/rm32/use-sib 2/index 3/scale 1/imm32\n" + ); + CHECK_TRACE_CONTENTS( + "error: '81 0/add/subop 0/mod/indirect 4/rm32/use-sib 2/index 3/scale 1/imm32' (combine rm32 with imm32 based on subop): missing base operand\n" + ); +} + +void test_check_extra_displacement() { + Hide_errors = true; + run( + "== code 0x1\n" + "89/copy 0/mod/indirect 0/rm32/EAX 1/r32/ECX 4/disp8\n" + ); + CHECK_TRACE_CONTENTS( + "error: '89/copy 0/mod/indirect 0/rm32/EAX 1/r32/ECX 4/disp8' (copy r32 to rm32): unexpected disp8 operand\n" + ); +} + +void test_check_duplicate_operand() { + Hide_errors = true; + run( + "== code 0x1\n" + "89/copy 0/mod/indirect 0/rm32/EAX 1/r32/ECX 1/r32\n" + ); + CHECK_TRACE_CONTENTS( + "error: '89/copy 0/mod/indirect 0/rm32/EAX 1/r32/ECX 1/r32': duplicate r32 operand\n" + ); +} + +void test_check_base_operand_not_needed_in_direct_mode() { + run( + "== code 0x1\n" + "81 0/add/subop 3/mod/indirect 4/rm32/use-sib 1/imm32\n" + ); + CHECK_TRACE_COUNT("error", 0); +} + +void test_extra_modrm() { + Hide_errors = true; + run( + "== code 0x1\n" + "59/pop-to-ECX 3/mod/direct 1/rm32/ECX 4/r32/ESP\n" + ); + CHECK_TRACE_CONTENTS( + "error: '59/pop-to-ECX 3/mod/direct 1/rm32/ECX 4/r32/ESP' (pop top of stack to ECX): unexpected modrm operand\n" + ); +} + +//:: similarly handle multi-byte opcodes + +void check_operands_0f(const line& inst) { + assert(inst.words.at(0).data == "0f"); + if (SIZE(inst.words) == 1) { + raise << "opcode '0f' requires a second opcode\n" << end(); + return; + } + word op = preprocess_op(inst.words.at(1)); + if (!contains_key(Name_0f, op.data)) { + raise << "unknown 2-byte opcode '0f " << op.data << "'\n" << end(); + return; + } + check_operands_0f(inst, op); +} + +void check_operands_f3(const line& /*unused*/) { + raise << "no supported opcodes starting with f3\n" << end(); +} + +void test_check_missing_disp32_operand() { + Hide_errors = true; + run( + "== code 0x1\n" + " 0f 84 # jmp if ZF to ??\n" + ); + CHECK_TRACE_CONTENTS( + "error: '0f 84' (jump disp32 bytes away if equal, if ZF is set): missing disp32 operand\n" + ); +} + +:(before "End Globals") +map Permitted_operands_0f; +:(before "End Init Permitted Operands") +//// Class D: just op and disp32 +// imm32 imm8 disp32 |disp16 disp8 subop modrm +// 0 0 1 |0 0 0 0 +put_new(Permitted_operands_0f, "82", 0x10); +put_new(Permitted_operands_0f, "83", 0x10); +put_new(Permitted_operands_0f, "84", 0x10); +put_new(Permitted_operands_0f, "85", 0x10); +put_new(Permitted_operands_0f, "86", 0x10); +put_new(Permitted_operands_0f, "87", 0x10); +put_new(Permitted_operands_0f, "8c", 0x10); +put_new(Permitted_operands_0f, "8d", 0x10); +put_new(Permitted_operands_0f, "8e", 0x10); +put_new(Permitted_operands_0f, "8f", 0x10); + +//// Class M: using ModR/M byte +// imm32 imm8 disp32 |disp16 disp8 subop modrm +// 0 0 0 |0 0 0 1 +put_new(Permitted_operands_0f, "af", 0x01); + +:(code) +void check_operands_0f(const line& inst, const word& op) { + uint8_t expected_bitvector = get(Permitted_operands_0f, op.data); + if (HAS(expected_bitvector, MODRM)) + check_operands_modrm(inst, op); + compare_bitvector_0f(inst, CLEAR(expected_bitvector, MODRM), op); +} + +void compare_bitvector_0f(const line& inst, uint8_t expected, const word& op) { + if (all_hex_bytes(inst) && has_operands(inst)) return; // deliberately programming in raw hex; we'll raise a warning elsewhere + uint8_t bitvector = compute_expected_operand_bitvector(inst); + if (trace_contains_errors()) return; // duplicate operand type + if (bitvector == expected) return; // all good with this instruction + for (int i = 0; i < NUM_OPERAND_TYPES; ++i, bitvector >>= 1, expected >>= 1) { +//? cerr << "comparing " << HEXBYTE << NUM(bitvector) << " with " << NUM(expected) << '\n'; + if ((bitvector & 0x1) == (expected & 0x1)) continue; // all good with this operand + const string& optype = Operand_type_name.at(i); + if ((bitvector & 0x1) > (expected & 0x1)) + raise << "'" << to_string(inst) << "'" << maybe_name_0f(op) << ": unexpected " << optype << " operand\n" << end(); + else + raise << "'" << to_string(inst) << "'" << maybe_name_0f(op) << ": missing " << optype << " operand\n" << end(); + // continue giving all errors for a single instruction + } + // ignore settings in any unused bits +} + +string maybe_name_0f(const word& op) { + if (!is_hex_byte(op)) return ""; + if (!contains_key(Name_0f, op.data)) return ""; + // strip stuff in parens from the name + const string& s = get(Name_0f, op.data); + return " ("+s.substr(0, s.find(" ("))+')'; +} + +string tolower(const char* s) { + ostringstream out; + for (/*nada*/; *s; ++s) + out << static_cast(tolower(*s)); + return out.str(); +} + +#undef HAS +#undef SET +#undef CLEAR + +:(before "End Includes") +#include diff --git a/034check_operand_bounds.cc b/034check_operand_bounds.cc new file mode 100644 index 00000000..72a66e3f --- /dev/null +++ b/034check_operand_bounds.cc @@ -0,0 +1,143 @@ +//:: Check that the different operands of an instruction aren't too large for their bitfields. + +void test_check_bitfield_sizes() { + Hide_errors = true; + run( + "== code 0x1\n" + "01/add 4/mod 3/rm32 1/r32\n" // add ECX to EBX + ); + CHECK_TRACE_CONTENTS( + "error: '4/mod' too large to fit in bitfield mod\n" + ); +} + +:(before "End Globals") +map Operand_bound; +:(before "End One-time Setup") +put_new(Operand_bound, "subop", 1<<3); +put_new(Operand_bound, "mod", 1<<2); +put_new(Operand_bound, "rm32", 1<<3); +put_new(Operand_bound, "base", 1<<3); +put_new(Operand_bound, "index", 1<<3); +put_new(Operand_bound, "scale", 1<<2); +put_new(Operand_bound, "r32", 1<<3); +put_new(Operand_bound, "disp8", 1<<8); +put_new(Operand_bound, "disp16", 1<<16); +// no bound needed for disp32 +put_new(Operand_bound, "imm8", 1<<8); +// no bound needed for imm32 + +:(before "Pack Operands(segment code)") +check_operand_bounds(code); +if (trace_contains_errors()) return; +:(code) +void check_operand_bounds(const segment& code) { + trace(3, "transform") << "-- check operand bounds" << end(); + for (int i = 0; i < SIZE(code.lines); ++i) { + const line& inst = code.lines.at(i); + for (int j = first_operand(inst); j < SIZE(inst.words); ++j) + check_operand_bounds(inst.words.at(j)); + if (trace_contains_errors()) return; // stop at the first mal-formed instruction + } +} + +void check_operand_bounds(const word& w) { + for (map::iterator p = Operand_bound.begin(); p != Operand_bound.end(); ++p) { + if (!has_operand_metadata(w, p->first)) continue; + if (!looks_like_hex_int(w.data)) continue; // later transforms are on their own to do their own bounds checking + int32_t x = parse_int(w.data); + if (x >= 0) { + if (p->first == "disp8" || p->first == "disp16") { + if (static_cast(x) >= p->second/2) + raise << "'" << w.original << "' too large to fit in signed bitfield " << p->first << '\n' << end(); + } + else { + if (static_cast(x) >= p->second) + raise << "'" << w.original << "' too large to fit in bitfield " << p->first << '\n' << end(); + } + } + else { + // hacky? assuming bound is a power of 2 + if (x < -1*static_cast(p->second/2)) + raise << "'" << w.original << "' too large to fit in bitfield " << p->first << '\n' << end(); + } + } +} + +void test_check_bitfield_sizes_for_imm8() { + run( + "== code 0x1\n" + "c1/shift 4/subop/left 3/mod/direct 1/rm32/ECX 0xff/imm8" // shift EBX left + ); + CHECK(!trace_contains_errors()); +} + +void test_check_bitfield_sizes_for_imm8_error() { + Hide_errors = true; + run( + "== code 0x1\n" + "c1/shift 4/subop/left 3/mod/direct 1/rm32/ECX 0x100/imm8" // shift EBX left + ); + CHECK_TRACE_CONTENTS( + "error: '0x100/imm8' too large to fit in bitfield imm8\n" + ); +} + +void test_check_bitfield_sizes_for_negative_imm8() { + run( + "== code 0x1\n" + "c1/shift 4/subop/left 3/mod/direct 1/rm32/ECX -0x80/imm8" // shift EBX left + ); + CHECK(!trace_contains_errors()); +} + +void test_check_bitfield_sizes_for_negative_imm8_error() { + Hide_errors = true; + run( + "== code 0x1\n" + "c1/shift 4/subop/left 3/mod/direct 1/rm32/ECX -0x81/imm8" // shift EBX left + ); + CHECK_TRACE_CONTENTS( + "error: '-0x81/imm8' too large to fit in bitfield imm8\n" + ); +} + +void test_check_bitfield_sizes_for_disp8() { + // not bothering to run + transform( + "== code 0x1\n" + "01/add 1/mod/*+disp8 3/rm32 1/r32 0x7f/disp8\n" // add ECX to *(EBX+0x7f) + ); + CHECK(!trace_contains_errors()); +} + +void test_check_bitfield_sizes_for_disp8_error() { + Hide_errors = true; + run( + "== code 0x1\n" + "01/add 1/mod/*+disp8 3/rm32 1/r32 0x80/disp8\n" // add ECX to *(EBX+0x80) + ); + CHECK_TRACE_CONTENTS( + "error: '0x80/disp8' too large to fit in signed bitfield disp8\n" + ); +} + +void test_check_bitfield_sizes_for_negative_disp8() { + // not bothering to run + transform( + "== code 0x1\n" + "01/add 1/mod/*+disp8 3/rm32 1/r32 -0x80/disp8\n" // add ECX to *(EBX-0x80) + ); + CHECK(!trace_contains_errors()); +} + +void test_check_bitfield_sizes_for_negative_disp8_error() { + Hide_errors = true; + run( + "== code 0x1\n" + "01/add 1/mod/*+disp8 3/rm32 1/r32 -0x81/disp8\n" // add ECX to *(EBX-0x81) + ); + CHECK_TRACE_CONTENTS( + "error: '-0x81/disp8' too large to fit in bitfield disp8\n" + ); +} diff --git a/034compute_segment_address.cc b/034compute_segment_address.cc deleted file mode 100644 index 61c3739a..00000000 --- a/034compute_segment_address.cc +++ /dev/null @@ -1,86 +0,0 @@ -//: ELF binaries have finicky rules about the precise alignment each segment -//: should start at. They depend on the amount of code in a program. -//: We shouldn't expect people to adjust segment addresses everytime they make -//: a change to their programs. -//: Let's start taking the given segment addresses as guidelines, and adjust -//: them as necessary. -//: This gives up a measure of control in placing code and data. - -void test_segment_name() { - run( - "== code 0x09000000\n" - "05/add-to-EAX 0x0d0c0b0a/imm32\n" - // code starts at 0x09000000 + p_offset, which is 0x54 for a single-segment binary - ); - CHECK_TRACE_CONTENTS( - "load: 0x09000054 -> 05\n" - "load: 0x09000055 -> 0a\n" - "load: 0x09000056 -> 0b\n" - "load: 0x09000057 -> 0c\n" - "load: 0x09000058 -> 0d\n" - "run: add imm32 0x0d0c0b0a to EAX\n" - "run: storing 0x0d0c0b0a\n" - ); -} - -//: compute segment address - -:(before "End Level-2 Transforms") -Transform.push_back(compute_segment_starts); - -:(code) -void compute_segment_starts(program& p) { - trace(3, "transform") << "-- compute segment addresses" << end(); - uint32_t p_offset = /*size of ehdr*/0x34 + SIZE(p.segments)*0x20/*size of each phdr*/; - for (size_t i = 0; i < p.segments.size(); ++i) { - segment& curr = p.segments.at(i); - if (curr.start >= 0x08000000) { - // valid address for user space, so assume we're creating a real ELF binary, not just running a test - curr.start &= 0xfffff000; // same number of zeros as the p_align used when emitting the ELF binary - curr.start |= (p_offset & 0xfff); - trace(99, "transform") << "segment " << i << " begins at address 0x" << HEXWORD << curr.start << end(); - } - p_offset += size_of(curr); - assert(p_offset < SEGMENT_ALIGNMENT); // for now we get less and less available space in each successive segment - } -} - -uint32_t size_of(const segment& s) { - uint32_t sum = 0; - for (int i = 0; i < SIZE(s.lines); ++i) - sum += num_bytes(s.lines.at(i)); - return sum; -} - -// Assumes all bitfields are packed. -uint32_t num_bytes(const line& inst) { - uint32_t sum = 0; - for (int i = 0; i < SIZE(inst.words); ++i) - sum += size_of(inst.words.at(i)); - return sum; -} - -int size_of(const word& w) { - if (has_operand_metadata(w, "disp32") || has_operand_metadata(w, "imm32")) - return 4; - else if (has_operand_metadata(w, "disp16")) - return 2; - // End size_of(word w) Special-cases - else - return 1; -} - -//: Dependencies: -//: - We'd like to compute segment addresses before setting up global variables, -//: because computing addresses for global variables requires knowing where -//: the data segment starts. -//: - We'd like to finish expanding labels before computing segment addresses, -//: because it would make computing the sizes of segments more self-contained -//: (num_bytes). -//: -//: Decision: compute segment addresses before expanding labels, by being -//: aware in this layer of certain operand types that will eventually occupy -//: multiple bytes. -//: -//: The layer to expand labels later hooks into num_bytes() to teach this -//: layer that labels occupy zero space in the binary. diff --git a/035compute_segment_address.cc b/035compute_segment_address.cc new file mode 100644 index 00000000..61c3739a --- /dev/null +++ b/035compute_segment_address.cc @@ -0,0 +1,86 @@ +//: ELF binaries have finicky rules about the precise alignment each segment +//: should start at. They depend on the amount of code in a program. +//: We shouldn't expect people to adjust segment addresses everytime they make +//: a change to their programs. +//: Let's start taking the given segment addresses as guidelines, and adjust +//: them as necessary. +//: This gives up a measure of control in placing code and data. + +void test_segment_name() { + run( + "== code 0x09000000\n" + "05/add-to-EAX 0x0d0c0b0a/imm32\n" + // code starts at 0x09000000 + p_offset, which is 0x54 for a single-segment binary + ); + CHECK_TRACE_CONTENTS( + "load: 0x09000054 -> 05\n" + "load: 0x09000055 -> 0a\n" + "load: 0x09000056 -> 0b\n" + "load: 0x09000057 -> 0c\n" + "load: 0x09000058 -> 0d\n" + "run: add imm32 0x0d0c0b0a to EAX\n" + "run: storing 0x0d0c0b0a\n" + ); +} + +//: compute segment address + +:(before "End Level-2 Transforms") +Transform.push_back(compute_segment_starts); + +:(code) +void compute_segment_starts(program& p) { + trace(3, "transform") << "-- compute segment addresses" << end(); + uint32_t p_offset = /*size of ehdr*/0x34 + SIZE(p.segments)*0x20/*size of each phdr*/; + for (size_t i = 0; i < p.segments.size(); ++i) { + segment& curr = p.segments.at(i); + if (curr.start >= 0x08000000) { + // valid address for user space, so assume we're creating a real ELF binary, not just running a test + curr.start &= 0xfffff000; // same number of zeros as the p_align used when emitting the ELF binary + curr.start |= (p_offset & 0xfff); + trace(99, "transform") << "segment " << i << " begins at address 0x" << HEXWORD << curr.start << end(); + } + p_offset += size_of(curr); + assert(p_offset < SEGMENT_ALIGNMENT); // for now we get less and less available space in each successive segment + } +} + +uint32_t size_of(const segment& s) { + uint32_t sum = 0; + for (int i = 0; i < SIZE(s.lines); ++i) + sum += num_bytes(s.lines.at(i)); + return sum; +} + +// Assumes all bitfields are packed. +uint32_t num_bytes(const line& inst) { + uint32_t sum = 0; + for (int i = 0; i < SIZE(inst.words); ++i) + sum += size_of(inst.words.at(i)); + return sum; +} + +int size_of(const word& w) { + if (has_operand_metadata(w, "disp32") || has_operand_metadata(w, "imm32")) + return 4; + else if (has_operand_metadata(w, "disp16")) + return 2; + // End size_of(word w) Special-cases + else + return 1; +} + +//: Dependencies: +//: - We'd like to compute segment addresses before setting up global variables, +//: because computing addresses for global variables requires knowing where +//: the data segment starts. +//: - We'd like to finish expanding labels before computing segment addresses, +//: because it would make computing the sizes of segments more self-contained +//: (num_bytes). +//: +//: Decision: compute segment addresses before expanding labels, by being +//: aware in this layer of certain operand types that will eventually occupy +//: multiple bytes. +//: +//: The layer to expand labels later hooks into num_bytes() to teach this +//: layer that labels occupy zero space in the binary. diff --git a/035labels.cc b/035labels.cc deleted file mode 100644 index 6f7fdbfe..00000000 --- a/035labels.cc +++ /dev/null @@ -1,416 +0,0 @@ -//: Labels are defined by ending names with a ':'. This layer will compute -//: displacements for labels, and compute the offset for instructions using them. -//: -//: We won't check this, but our convention will be that jump targets will -//: start with a '$', while functions will not. Function names will never be -//: jumped to, and jump targets will never be called. - -//: We're introducing non-number names for the first time, so it's worth -//: laying down some ground rules all transforms will follow, so things don't -//: get too confusing: -//: - if it starts with a digit, it's treated as a number. If it can't be -//: parsed as hex it will raise an error. -//: - if it starts with '-' it's treated as a number. -//: - if it starts with '0x' it's treated as a number. -//: - if it's two characters long, it can't be a name. Either it's a hex -//: byte, or it raises an error. -//: That's it. Names can start with any non-digit that isn't a dash. They can -//: be a single character long. 'a' is not a hex number, it's a variable. -//: Later layers may add more conventions partitioning the space of names. But -//: the above rules will remain inviolate. - -//: One special label is 'Entry', the address to start running the program at. -//: It can be non-unique; the last declaration overrides earlier ones. -//: It must exist in a program. Otherwise we don't know where to start running -//: programs. - -void test_Entry_label() { - run( - "== code 0x1\n" - "05 0x0d0c0b0a/imm32\n" - "Entry:\n" - "05 0x0d0c0b0a/imm32\n" - ); - CHECK_TRACE_CONTENTS( - "run: 0x00000006 opcode: 05\n" - ); - CHECK_TRACE_DOESNT_CONTAIN("run: 0x00000001 opcode: 05"); -} - -:(before "End looks_like_hex_int(s) Detectors") -if (SIZE(s) == 2) return true; - -:(code) -void test_pack_immediate_ignores_single_byte_nondigit_operand() { - Hide_errors = true; - transform( - "== code 0x1\n" - "b9/copy a/imm32\n" - ); - CHECK_TRACE_CONTENTS( - "transform: packing instruction 'b9/copy a/imm32'\n" - // no change (we're just not printing metadata to the trace) - "transform: instruction after packing: 'b9 a'\n" - ); -} - -void test_pack_immediate_ignores_3_hex_digit_operand() { - Hide_errors = true; - transform( - "== code 0x1\n" - "b9/copy aaa/imm32\n" - ); - CHECK_TRACE_CONTENTS( - "transform: packing instruction 'b9/copy aaa/imm32'\n" - // no change (we're just not printing metadata to the trace) - "transform: instruction after packing: 'b9 aaa'\n" - ); -} - -void test_pack_immediate_ignores_non_hex_operand() { - Hide_errors = true; - transform( - "== code 0x1\n" - "b9/copy xxx/imm32\n" - ); - CHECK_TRACE_CONTENTS( - "transform: packing instruction 'b9/copy xxx/imm32'\n" - // no change (we're just not printing metadata to the trace) - "transform: instruction after packing: 'b9 xxx'\n" - ); -} - -//: a helper we'll find handy later -void check_valid_name(const string& s) { - if (s.empty()) { - raise << "empty name!\n" << end(); - return; - } - if (s.at(0) == '-') - raise << "'" << s << "' starts with '-', which can be confused with a negative number; use a different name\n" << end(); - if (s.substr(0, 2) == "0x") { - raise << "'" << s << "' looks like a hex number; use a different name\n" << end(); - return; - } - if (isdigit(s.at(0))) - raise << "'" << s << "' starts with a digit, and so can be confused with a number; use a different name.\n" << end(); - if (SIZE(s) == 2) - raise << "'" << s << "' is two characters long, which can look like raw hex bytes at a glance; use a different name\n" << end(); -} - -//: Now that that's done, let's start using names as labels. - -void test_map_label() { - transform( - "== code 0x1\n" - "loop:\n" - " 05 0x0d0c0b0a/imm32\n" - ); - CHECK_TRACE_CONTENTS( - "transform: label 'loop' is at address 1\n" - ); -} - -:(before "End Level-2 Transforms") -Transform.push_back(rewrite_labels); -:(code) -void rewrite_labels(program& p) { - trace(3, "transform") << "-- rewrite labels" << end(); - if (p.segments.empty()) return; - segment& code = *find(p, "code"); - map byte_index; // values are unsigned, but we're going to do subtractions on them so they need to fit in 31 bits - compute_byte_indices_for_labels(code, byte_index); - if (trace_contains_errors()) return; - drop_labels(code); - if (trace_contains_errors()) return; - replace_labels_with_displacements(code, byte_index); - if (contains_key(byte_index, "Entry")) - p.entry = code.start + get(byte_index, "Entry"); -} - -void compute_byte_indices_for_labels(const segment& code, map& byte_index) { - int current_byte = 0; - for (int i = 0; i < SIZE(code.lines); ++i) { - const line& inst = code.lines.at(i); - if (Source_lines_file.is_open() && !inst.original.empty() && /*not a label*/ *inst.words.at(0).data.rbegin() != ':') - Source_lines_file << "0x" << HEXWORD << (code.start + current_byte) << ' ' << inst.original << '\n'; - for (int j = 0; j < SIZE(inst.words); ++j) { - const word& curr = inst.words.at(j); - // hack: if we have any operand metadata left after previous transforms, - // deduce its size - // Maybe we should just move this transform to before instruction - // packing, and deduce the size of *all* operands. But then we'll also - // have to deal with bitfields. - if (has_operand_metadata(curr, "disp32") || has_operand_metadata(curr, "imm32")) { - if (*curr.data.rbegin() == ':') - raise << "'" << to_string(inst) << "': don't use ':' when jumping to labels\n" << end(); - current_byte += 4; - } - else if (has_operand_metadata(curr, "disp16")) { - if (*curr.data.rbegin() == ':') - raise << "'" << to_string(inst) << "': don't use ':' when jumping to labels\n" << end(); - current_byte += 2; - } - // automatically handle /disp8 and /imm8 here - else if (*curr.data.rbegin() != ':') { - ++current_byte; - } - else { - string label = drop_last(curr.data); - // ensure labels look sufficiently different from raw hex - check_valid_name(label); - if (trace_contains_errors()) return; - if (contains_any_operand_metadata(curr)) - raise << "'" << to_string(inst) << "': label definition (':') not allowed in operand\n" << end(); - if (j > 0) - raise << "'" << to_string(inst) << "': labels can only be the first word in a line.\n" << end(); - if (Labels_file.is_open()) - Labels_file << "0x" << HEXWORD << (code.start + current_byte) << ' ' << label << '\n'; - if (contains_key(byte_index, label) && label != "Entry") { - raise << "duplicate label '" << label << "'\n" << end(); - return; - } - put(byte_index, label, current_byte); - trace(99, "transform") << "label '" << label << "' is at address " << (current_byte+code.start) << end(); - // no modifying current_byte; label definitions won't be in the final binary - } - } - } -} - -:(before "End Globals") -bool Dump_debug_info = false; // currently used only by 'subx translate' -ofstream Labels_file; -ofstream Source_lines_file; -:(before "End Commandline Options") -else if (is_equal(*arg, "--debug")) { - Dump_debug_info = true; - // End --debug Settings -} -//: wait to open "labels" for writing until we're sure we aren't trying to read it -:(after "Begin subx translate") -if (Dump_debug_info) { - cerr << "saving address->label information to 'labels'\n"; - Labels_file.open("labels"); - cerr << "saving address->source information to 'source_lines'\n"; - Source_lines_file.open("source_lines"); -} -:(before "End subx translate") -if (Dump_debug_info) { - Labels_file.close(); - Source_lines_file.close(); -} - -:(code) -void drop_labels(segment& code) { - for (int i = 0; i < SIZE(code.lines); ++i) { - line& inst = code.lines.at(i); - vector::iterator new_end = remove_if(inst.words.begin(), inst.words.end(), is_label); - inst.words.erase(new_end, inst.words.end()); - } -} - -bool is_label(const word& w) { - return *w.data.rbegin() == ':'; -} - -void replace_labels_with_displacements(segment& code, const map& byte_index) { - int32_t byte_index_next_instruction_starts_at = 0; - for (int i = 0; i < SIZE(code.lines); ++i) { - line& inst = code.lines.at(i); - byte_index_next_instruction_starts_at += num_bytes(inst); - line new_inst; - for (int j = 0; j < SIZE(inst.words); ++j) { - const word& curr = inst.words.at(j); - if (contains_key(byte_index, curr.data)) { - int32_t displacement = static_cast(get(byte_index, curr.data)) - byte_index_next_instruction_starts_at; - if (has_operand_metadata(curr, "disp8")) { - if (displacement > 0x7f || displacement < -0x7f) - raise << "'" << to_string(inst) << "': label too far away for displacement " << std::hex << displacement << " to fit in 8 signed bits\n" << end(); - else - emit_hex_bytes(new_inst, displacement, 1); - } - else if (has_operand_metadata(curr, "disp16")) { - if (displacement > 0x7fff || displacement < -0x7fff) - raise << "'" << to_string(inst) << "': label too far away for displacement " << std::hex << displacement << " to fit in 16 signed bits\n" << end(); - else - emit_hex_bytes(new_inst, displacement, 2); - } - else if (has_operand_metadata(curr, "disp32")) { - emit_hex_bytes(new_inst, displacement, 4); - } else if (has_operand_metadata(curr, "imm32")) { - emit_hex_bytes(new_inst, code.start + get(byte_index, curr.data), 4); - } - } - else { - new_inst.words.push_back(curr); - } - } - inst.words.swap(new_inst.words); - trace(99, "transform") << "instruction after transform: '" << data_to_string(inst) << "'" << end(); - } -} - -string data_to_string(const line& inst) { - ostringstream out; - for (int i = 0; i < SIZE(inst.words); ++i) { - if (i > 0) out << ' '; - out << inst.words.at(i).data; - } - return out.str(); -} - -string drop_last(const string& s) { - return string(s.begin(), --s.end()); -} - -//: Label definitions must be the first word on a line. No jumping inside -//: instructions. -//: They should also be the only word on a line. -//: However, you can absolutely have multiple labels map to the same address, -//: as long as they're on separate lines. - -void test_multiple_labels_at() { - transform( - "== code 0x1\n" - // address 1 - "loop:\n" - " $loop2:\n" - // address 1 (labels take up no space) - " 05 0x0d0c0b0a/imm32\n" - // address 6 - " eb $loop2/disp8\n" - // address 8 - " eb $loop3/disp8\n" - // address 0xa - " $loop3:\n" - ); - CHECK_TRACE_CONTENTS( - "transform: label 'loop' is at address 1\n" - "transform: label '$loop2' is at address 1\n" - "transform: label '$loop3' is at address a\n" - // first jump is to -7 - "transform: instruction after transform: 'eb f9'\n" - // second jump is to 0 (fall through) - "transform: instruction after transform: 'eb 00'\n" - ); -} - -void test_loading_label_as_imm32() { - transform( - "== code 0x1\n" - "label:\n" - " be/copy-to-ESI label/imm32\n" - ); - CHECK_TRACE_CONTENTS( - "transform: label 'label' is at address 1\n" - "transform: instruction after transform: 'be 01 00 00 00'\n" - ); -} - -void test_duplicate_label() { - Hide_errors = true; - transform( - "== code 0x1\n" - "loop:\n" - "loop:\n" - " 05 0x0d0c0b0a/imm32\n" - ); - CHECK_TRACE_CONTENTS( - "error: duplicate label 'loop'\n" - ); -} - -void test_label_too_short() { - Hide_errors = true; - transform( - "== code 0x1\n" - "xz:\n" - " 05 0x0d0c0b0a/imm32\n" - ); - CHECK_TRACE_CONTENTS( - "error: 'xz' is two characters long, which can look like raw hex bytes at a glance; use a different name\n" - ); -} - -void test_label_hex() { - Hide_errors = true; - transform( - "== code 0x1\n" - "0xab:\n" - " 05 0x0d0c0b0a/imm32\n" - ); - CHECK_TRACE_CONTENTS( - "error: '0xab' looks like a hex number; use a different name\n" - ); -} - -void test_label_negative_hex() { - Hide_errors = true; - transform( - "== code 0x1\n" - "-a:\n" - " 05 0x0d0c0b0a/imm32\n" - ); - CHECK_TRACE_CONTENTS( - "error: '-a' starts with '-', which can be confused with a negative number; use a different name\n" - ); -} - -//: As said up top, the 'Entry' label is special. -//: It can be non-unique; the last declaration overrides earlier ones. -//: It must exist in a program. Otherwise we don't know where to start running -//: programs. - -void test_duplicate_Entry_label() { - transform( - "== code 0x1\n" - "Entry:\n" - "Entry:\n" - " 05 0x0d0c0b0a/imm32\n" - ); - CHECK_TRACE_DOESNT_CONTAIN_ERRORS(); -} - -// This test could do with some refactoring. -// We're duplicating the flow inside `subx translate`, but without -// reading/writing files. -// We can't just use run(string) because most of our tests allow programs -// without 'Entry' labels, as a convenience. -void test_programs_without_Entry_label() { - Hide_errors = true; - program p; - istringstream in( - "== code 0x1\n" - "05 0x0d0c0b0a/imm32\n" - "05 0x0d0c0b0a/imm32\n" - ); - parse(in, p); - transform(p); - ostringstream dummy; - save_elf(p, dummy); - CHECK_TRACE_CONTENTS( - "error: no 'Entry' label found\n" - ); -} - -//: now that we have labels, we need to adjust segment size computation to -//: ignore them. - -void test_segment_size_ignores_labels() { - transform( - "== code 0x09000074\n" - " 05/add 0x0d0c0b0a/imm32\n" // 5 bytes - "foo:\n" // 0 bytes - "== data 0x0a000000\n" - "bar:\n" - " 00\n" - ); - CHECK_TRACE_CONTENTS( - "transform: segment 1 begins at address 0x0a000079\n" - ); -} - -:(before "End size_of(word w) Special-cases") -else if (is_label(w)) - return 0; diff --git a/036global_variables.cc b/036global_variables.cc deleted file mode 100644 index c22ac3d3..00000000 --- a/036global_variables.cc +++ /dev/null @@ -1,305 +0,0 @@ -//: Global variables. -//: -//: Global variables are just labels in the data segment. -//: However, they can only be used in imm32 and not disp32 operands. And they -//: can't be used with jump and call instructions. -//: -//: This layer has much the same structure as rewriting labels. - -:(code) -void test_global_variable() { - run( - "== code 0x1\n" - "b9 x/imm32\n" - "== data 0x2000\n" - "x:\n" - " 00 00 00 00\n" - ); - CHECK_TRACE_CONTENTS( - "transform: global variable 'x' is at address 0x00002000\n" - ); -} - -:(before "End Level-2 Transforms") -Transform.push_back(rewrite_global_variables); -:(code) -void rewrite_global_variables(program& p) { - trace(3, "transform") << "-- rewrite global variables" << end(); - // Begin rewrite_global_variables - map address; - compute_addresses_for_global_variables(p, address); - if (trace_contains_errors()) return; - drop_global_variables(p); - replace_global_variables_with_addresses(p, address); -} - -void compute_addresses_for_global_variables(const program& p, map& address) { - for (int i = 0; i < SIZE(p.segments); ++i) { - if (p.segments.at(i).name != "code") - compute_addresses_for_global_variables(p.segments.at(i), address); - } -} - -void compute_addresses_for_global_variables(const segment& s, map& address) { - int current_address = s.start; - for (int i = 0; i < SIZE(s.lines); ++i) { - const line& inst = s.lines.at(i); - for (int j = 0; j < SIZE(inst.words); ++j) { - const word& curr = inst.words.at(j); - if (*curr.data.rbegin() != ':') { - current_address += size_of(curr); - } - else { - string variable = drop_last(curr.data); - // ensure variables look sufficiently different from raw hex - check_valid_name(variable); - if (trace_contains_errors()) return; - if (j > 0) - raise << "'" << to_string(inst) << "': global variable names can only be the first word in a line.\n" << end(); - if (Labels_file.is_open()) - Labels_file << "0x" << HEXWORD << current_address << ' ' << variable << '\n'; - if (contains_key(address, variable)) { - raise << "duplicate global '" << variable << "'\n" << end(); - return; - } - put(address, variable, current_address); - trace(99, "transform") << "global variable '" << variable << "' is at address 0x" << HEXWORD << current_address << end(); - // no modifying current_address; global variable definitions won't be in the final binary - } - } - } -} - -void drop_global_variables(program& p) { - for (int i = 0; i < SIZE(p.segments); ++i) { - if (p.segments.at(i).name != "code") - drop_labels(p.segments.at(i)); - } -} - -void replace_global_variables_with_addresses(program& p, const map& address) { - if (p.segments.empty()) return; - for (int i = 0; i < SIZE(p.segments); ++i) { - segment& curr = p.segments.at(i); - if (curr.name == "code") - replace_global_variables_in_code_segment(curr, address); - else - replace_global_variables_in_data_segment(curr, address); - } -} - -void replace_global_variables_in_code_segment(segment& code, const map& address) { - for (int i = 0; i < SIZE(code.lines); ++i) { - line& inst = code.lines.at(i); - line new_inst; - for (int j = 0; j < SIZE(inst.words); ++j) { - const word& curr = inst.words.at(j); - if (!contains_key(address, curr.data)) { - if (!looks_like_hex_int(curr.data)) - raise << "missing reference to global '" << curr.data << "'\n" << end(); - new_inst.words.push_back(curr); - continue; - } - if (!valid_use_of_global_variable(curr)) { - raise << "'" << to_string(inst) << "': can't refer to global variable '" << curr.data << "'\n" << end(); - return; - } - emit_hex_bytes(new_inst, get(address, curr.data), 4); - } - inst.words.swap(new_inst.words); - trace(99, "transform") << "instruction after transform: '" << data_to_string(inst) << "'" << end(); - } -} - -void replace_global_variables_in_data_segment(segment& data, const map& address) { - for (int i = 0; i < SIZE(data.lines); ++i) { - line& l = data.lines.at(i); - line new_l; - for (int j = 0; j < SIZE(l.words); ++j) { - const word& curr = l.words.at(j); - if (!contains_key(address, curr.data)) { - if (looks_like_hex_int(curr.data)) { - if (has_operand_metadata(curr, "imm32")) - emit_hex_bytes(new_l, curr, 4); - else if (has_operand_metadata(curr, "imm16")) - emit_hex_bytes(new_l, curr, 2); - else if (has_operand_metadata(curr, "imm8")) - emit_hex_bytes(new_l, curr, 1); - else if (has_operand_metadata(curr, "disp8")) - raise << "can't use /disp8 in a non-code segment\n" << end(); - else if (has_operand_metadata(curr, "disp16")) - raise << "can't use /disp16 in a non-code segment\n" << end(); - else if (has_operand_metadata(curr, "disp32")) - raise << "can't use /disp32 in a non-code segment\n" << end(); - else - new_l.words.push_back(curr); - } - else { - raise << "missing reference to global '" << curr.data << "'\n" << end(); - new_l.words.push_back(curr); - } - continue; - } - trace(99, "transform") << curr.data << " maps to " << HEXWORD << get(address, curr.data) << end(); - emit_hex_bytes(new_l, get(address, curr.data), 4); - } - l.words.swap(new_l.words); - trace(99, "transform") << "after transform: '" << data_to_string(l) << "'" << end(); - } -} - -bool valid_use_of_global_variable(const word& curr) { - if (has_operand_metadata(curr, "imm32")) return true; - // End Valid Uses Of Global Variable(curr) - return false; -} - -//:: a more complex sanity check for how we use global variables -//: requires first saving some data early before we pack operands - -:(after "Begin Level-2 Transforms") -Transform.push_back(correlate_disp32_with_mod); -:(code) -void correlate_disp32_with_mod(program& p) { - if (p.segments.empty()) return; - segment& code = *find(p, "code"); - for (int i = 0; i < SIZE(code.lines); ++i) { - line& inst = code.lines.at(i); - for (int j = 0; j < SIZE(inst.words); ++j) { - word& curr = inst.words.at(j); - if (has_operand_metadata(curr, "disp32") - && has_operand_metadata(inst, "mod")) - curr.metadata.push_back("has_mod"); - } - } -} - -:(before "End Valid Uses Of Global Variable(curr)") -if (has_operand_metadata(curr, "disp32")) - return has_metadata(curr, "has_mod"); -// todo: more sophisticated check, to ensure we don't use global variable -// addresses as a real displacement added to other operands. - -:(code) -bool has_metadata(const word& w, const string& m) { - for (int i = 0; i < SIZE(w.metadata); ++i) - if (w.metadata.at(i) == m) return true; - return false; -} - -void test_global_variable_disallowed_in_jump() { - Hide_errors = true; - run( - "== code 0x1\n" - "eb/jump x/disp8\n" - "== data 0x2000\n" - "x:\n" - " 00 00 00 00\n" - ); - CHECK_TRACE_CONTENTS( - "error: 'eb/jump x/disp8': can't refer to global variable 'x'\n" - // sub-optimal error message; should be -//? "error: can't jump to data (variable 'x')\n" - ); -} - -void test_global_variable_disallowed_in_call() { - Hide_errors = true; - run( - "== code 0x1\n" - "e8/call x/disp32\n" - "== data 0x2000\n" - "x:\n" - " 00 00 00 00\n" - ); - CHECK_TRACE_CONTENTS( - "error: 'e8/call x/disp32': can't refer to global variable 'x'\n" - // sub-optimal error message; should be -//? "error: can't call to the data segment ('x')\n" - ); -} - -void test_global_variable_in_data_segment() { - run( - "== code 0x1\n" - "b9 x/imm32\n" - "== data 0x2000\n" - "x:\n" - " y/imm32\n" - "y:\n" - " 00 00 00 00\n" - ); - // check that we loaded 'x' with the address of 'y' - CHECK_TRACE_CONTENTS( - "load: 0x00002000 -> 04\n" - "load: 0x00002001 -> 20\n" - "load: 0x00002002 -> 00\n" - "load: 0x00002003 -> 00\n" - ); - CHECK_TRACE_COUNT("error", 0); -} - -void test_raw_number_with_imm32_in_data_segment() { - run( - "== code 0x1\n" - "b9 x/imm32\n" - "== data 0x2000\n" - "x:\n" - " 1/imm32\n" - ); - // check that we loaded 'x' with the address of 1 - CHECK_TRACE_CONTENTS( - "load: 0x00002000 -> 01\n" - "load: 0x00002001 -> 00\n" - "load: 0x00002002 -> 00\n" - "load: 0x00002003 -> 00\n" - ); - CHECK_TRACE_COUNT("error", 0); -} - -void test_duplicate_global_variable() { - Hide_errors = true; - run( - "== code 0x1\n" - "40/increment-EAX\n" - "== data 0x2000\n" - "x:\n" - "x:\n" - " 00\n" - ); - CHECK_TRACE_CONTENTS( - "error: duplicate global 'x'\n" - ); -} - -void test_global_variable_disp32_with_modrm() { - run( - "== code 0x1\n" - "8b/copy 0/mod/indirect 5/rm32/.disp32 2/r32/EDX x/disp32\n" - "== data 0x2000\n" - "x:\n" - " 00 00 00 00\n" - ); - CHECK_TRACE_COUNT("error", 0); -} - -void test_global_variable_disp32_with_call() { - transform( - "== code 0x1\n" - "foo:\n" - " e8/call bar/disp32\n" - "bar:\n" - ); - CHECK_TRACE_COUNT("error", 0); -} - -string to_full_string(const line& in) { - ostringstream out; - for (int i = 0; i < SIZE(in.words); ++i) { - if (i > 0) out << ' '; - out << in.words.at(i).data; - for (int j = 0; j < SIZE(in.words.at(i).metadata); ++j) - out << '/' << in.words.at(i).metadata.at(j); - } - return out.str(); -} diff --git a/036labels.cc b/036labels.cc new file mode 100644 index 00000000..6f7fdbfe --- /dev/null +++ b/036labels.cc @@ -0,0 +1,416 @@ +//: Labels are defined by ending names with a ':'. This layer will compute +//: displacements for labels, and compute the offset for instructions using them. +//: +//: We won't check this, but our convention will be that jump targets will +//: start with a '$', while functions will not. Function names will never be +//: jumped to, and jump targets will never be called. + +//: We're introducing non-number names for the first time, so it's worth +//: laying down some ground rules all transforms will follow, so things don't +//: get too confusing: +//: - if it starts with a digit, it's treated as a number. If it can't be +//: parsed as hex it will raise an error. +//: - if it starts with '-' it's treated as a number. +//: - if it starts with '0x' it's treated as a number. +//: - if it's two characters long, it can't be a name. Either it's a hex +//: byte, or it raises an error. +//: That's it. Names can start with any non-digit that isn't a dash. They can +//: be a single character long. 'a' is not a hex number, it's a variable. +//: Later layers may add more conventions partitioning the space of names. But +//: the above rules will remain inviolate. + +//: One special label is 'Entry', the address to start running the program at. +//: It can be non-unique; the last declaration overrides earlier ones. +//: It must exist in a program. Otherwise we don't know where to start running +//: programs. + +void test_Entry_label() { + run( + "== code 0x1\n" + "05 0x0d0c0b0a/imm32\n" + "Entry:\n" + "05 0x0d0c0b0a/imm32\n" + ); + CHECK_TRACE_CONTENTS( + "run: 0x00000006 opcode: 05\n" + ); + CHECK_TRACE_DOESNT_CONTAIN("run: 0x00000001 opcode: 05"); +} + +:(before "End looks_like_hex_int(s) Detectors") +if (SIZE(s) == 2) return true; + +:(code) +void test_pack_immediate_ignores_single_byte_nondigit_operand() { + Hide_errors = true; + transform( + "== code 0x1\n" + "b9/copy a/imm32\n" + ); + CHECK_TRACE_CONTENTS( + "transform: packing instruction 'b9/copy a/imm32'\n" + // no change (we're just not printing metadata to the trace) + "transform: instruction after packing: 'b9 a'\n" + ); +} + +void test_pack_immediate_ignores_3_hex_digit_operand() { + Hide_errors = true; + transform( + "== code 0x1\n" + "b9/copy aaa/imm32\n" + ); + CHECK_TRACE_CONTENTS( + "transform: packing instruction 'b9/copy aaa/imm32'\n" + // no change (we're just not printing metadata to the trace) + "transform: instruction after packing: 'b9 aaa'\n" + ); +} + +void test_pack_immediate_ignores_non_hex_operand() { + Hide_errors = true; + transform( + "== code 0x1\n" + "b9/copy xxx/imm32\n" + ); + CHECK_TRACE_CONTENTS( + "transform: packing instruction 'b9/copy xxx/imm32'\n" + // no change (we're just not printing metadata to the trace) + "transform: instruction after packing: 'b9 xxx'\n" + ); +} + +//: a helper we'll find handy later +void check_valid_name(const string& s) { + if (s.empty()) { + raise << "empty name!\n" << end(); + return; + } + if (s.at(0) == '-') + raise << "'" << s << "' starts with '-', which can be confused with a negative number; use a different name\n" << end(); + if (s.substr(0, 2) == "0x") { + raise << "'" << s << "' looks like a hex number; use a different name\n" << end(); + return; + } + if (isdigit(s.at(0))) + raise << "'" << s << "' starts with a digit, and so can be confused with a number; use a different name.\n" << end(); + if (SIZE(s) == 2) + raise << "'" << s << "' is two characters long, which can look like raw hex bytes at a glance; use a different name\n" << end(); +} + +//: Now that that's done, let's start using names as labels. + +void test_map_label() { + transform( + "== code 0x1\n" + "loop:\n" + " 05 0x0d0c0b0a/imm32\n" + ); + CHECK_TRACE_CONTENTS( + "transform: label 'loop' is at address 1\n" + ); +} + +:(before "End Level-2 Transforms") +Transform.push_back(rewrite_labels); +:(code) +void rewrite_labels(program& p) { + trace(3, "transform") << "-- rewrite labels" << end(); + if (p.segments.empty()) return; + segment& code = *find(p, "code"); + map byte_index; // values are unsigned, but we're going to do subtractions on them so they need to fit in 31 bits + compute_byte_indices_for_labels(code, byte_index); + if (trace_contains_errors()) return; + drop_labels(code); + if (trace_contains_errors()) return; + replace_labels_with_displacements(code, byte_index); + if (contains_key(byte_index, "Entry")) + p.entry = code.start + get(byte_index, "Entry"); +} + +void compute_byte_indices_for_labels(const segment& code, map& byte_index) { + int current_byte = 0; + for (int i = 0; i < SIZE(code.lines); ++i) { + const line& inst = code.lines.at(i); + if (Source_lines_file.is_open() && !inst.original.empty() && /*not a label*/ *inst.words.at(0).data.rbegin() != ':') + Source_lines_file << "0x" << HEXWORD << (code.start + current_byte) << ' ' << inst.original << '\n'; + for (int j = 0; j < SIZE(inst.words); ++j) { + const word& curr = inst.words.at(j); + // hack: if we have any operand metadata left after previous transforms, + // deduce its size + // Maybe we should just move this transform to before instruction + // packing, and deduce the size of *all* operands. But then we'll also + // have to deal with bitfields. + if (has_operand_metadata(curr, "disp32") || has_operand_metadata(curr, "imm32")) { + if (*curr.data.rbegin() == ':') + raise << "'" << to_string(inst) << "': don't use ':' when jumping to labels\n" << end(); + current_byte += 4; + } + else if (has_operand_metadata(curr, "disp16")) { + if (*curr.data.rbegin() == ':') + raise << "'" << to_string(inst) << "': don't use ':' when jumping to labels\n" << end(); + current_byte += 2; + } + // automatically handle /disp8 and /imm8 here + else if (*curr.data.rbegin() != ':') { + ++current_byte; + } + else { + string label = drop_last(curr.data); + // ensure labels look sufficiently different from raw hex + check_valid_name(label); + if (trace_contains_errors()) return; + if (contains_any_operand_metadata(curr)) + raise << "'" << to_string(inst) << "': label definition (':') not allowed in operand\n" << end(); + if (j > 0) + raise << "'" << to_string(inst) << "': labels can only be the first word in a line.\n" << end(); + if (Labels_file.is_open()) + Labels_file << "0x" << HEXWORD << (code.start + current_byte) << ' ' << label << '\n'; + if (contains_key(byte_index, label) && label != "Entry") { + raise << "duplicate label '" << label << "'\n" << end(); + return; + } + put(byte_index, label, current_byte); + trace(99, "transform") << "label '" << label << "' is at address " << (current_byte+code.start) << end(); + // no modifying current_byte; label definitions won't be in the final binary + } + } + } +} + +:(before "End Globals") +bool Dump_debug_info = false; // currently used only by 'subx translate' +ofstream Labels_file; +ofstream Source_lines_file; +:(before "End Commandline Options") +else if (is_equal(*arg, "--debug")) { + Dump_debug_info = true; + // End --debug Settings +} +//: wait to open "labels" for writing until we're sure we aren't trying to read it +:(after "Begin subx translate") +if (Dump_debug_info) { + cerr << "saving address->label information to 'labels'\n"; + Labels_file.open("labels"); + cerr << "saving address->source information to 'source_lines'\n"; + Source_lines_file.open("source_lines"); +} +:(before "End subx translate") +if (Dump_debug_info) { + Labels_file.close(); + Source_lines_file.close(); +} + +:(code) +void drop_labels(segment& code) { + for (int i = 0; i < SIZE(code.lines); ++i) { + line& inst = code.lines.at(i); + vector::iterator new_end = remove_if(inst.words.begin(), inst.words.end(), is_label); + inst.words.erase(new_end, inst.words.end()); + } +} + +bool is_label(const word& w) { + return *w.data.rbegin() == ':'; +} + +void replace_labels_with_displacements(segment& code, const map& byte_index) { + int32_t byte_index_next_instruction_starts_at = 0; + for (int i = 0; i < SIZE(code.lines); ++i) { + line& inst = code.lines.at(i); + byte_index_next_instruction_starts_at += num_bytes(inst); + line new_inst; + for (int j = 0; j < SIZE(inst.words); ++j) { + const word& curr = inst.words.at(j); + if (contains_key(byte_index, curr.data)) { + int32_t displacement = static_cast(get(byte_index, curr.data)) - byte_index_next_instruction_starts_at; + if (has_operand_metadata(curr, "disp8")) { + if (displacement > 0x7f || displacement < -0x7f) + raise << "'" << to_string(inst) << "': label too far away for displacement " << std::hex << displacement << " to fit in 8 signed bits\n" << end(); + else + emit_hex_bytes(new_inst, displacement, 1); + } + else if (has_operand_metadata(curr, "disp16")) { + if (displacement > 0x7fff || displacement < -0x7fff) + raise << "'" << to_string(inst) << "': label too far away for displacement " << std::hex << displacement << " to fit in 16 signed bits\n" << end(); + else + emit_hex_bytes(new_inst, displacement, 2); + } + else if (has_operand_metadata(curr, "disp32")) { + emit_hex_bytes(new_inst, displacement, 4); + } else if (has_operand_metadata(curr, "imm32")) { + emit_hex_bytes(new_inst, code.start + get(byte_index, curr.data), 4); + } + } + else { + new_inst.words.push_back(curr); + } + } + inst.words.swap(new_inst.words); + trace(99, "transform") << "instruction after transform: '" << data_to_string(inst) << "'" << end(); + } +} + +string data_to_string(const line& inst) { + ostringstream out; + for (int i = 0; i < SIZE(inst.words); ++i) { + if (i > 0) out << ' '; + out << inst.words.at(i).data; + } + return out.str(); +} + +string drop_last(const string& s) { + return string(s.begin(), --s.end()); +} + +//: Label definitions must be the first word on a line. No jumping inside +//: instructions. +//: They should also be the only word on a line. +//: However, you can absolutely have multiple labels map to the same address, +//: as long as they're on separate lines. + +void test_multiple_labels_at() { + transform( + "== code 0x1\n" + // address 1 + "loop:\n" + " $loop2:\n" + // address 1 (labels take up no space) + " 05 0x0d0c0b0a/imm32\n" + // address 6 + " eb $loop2/disp8\n" + // address 8 + " eb $loop3/disp8\n" + // address 0xa + " $loop3:\n" + ); + CHECK_TRACE_CONTENTS( + "transform: label 'loop' is at address 1\n" + "transform: label '$loop2' is at address 1\n" + "transform: label '$loop3' is at address a\n" + // first jump is to -7 + "transform: instruction after transform: 'eb f9'\n" + // second jump is to 0 (fall through) + "transform: instruction after transform: 'eb 00'\n" + ); +} + +void test_loading_label_as_imm32() { + transform( + "== code 0x1\n" + "label:\n" + " be/copy-to-ESI label/imm32\n" + ); + CHECK_TRACE_CONTENTS( + "transform: label 'label' is at address 1\n" + "transform: instruction after transform: 'be 01 00 00 00'\n" + ); +} + +void test_duplicate_label() { + Hide_errors = true; + transform( + "== code 0x1\n" + "loop:\n" + "loop:\n" + " 05 0x0d0c0b0a/imm32\n" + ); + CHECK_TRACE_CONTENTS( + "error: duplicate label 'loop'\n" + ); +} + +void test_label_too_short() { + Hide_errors = true; + transform( + "== code 0x1\n" + "xz:\n" + " 05 0x0d0c0b0a/imm32\n" + ); + CHECK_TRACE_CONTENTS( + "error: 'xz' is two characters long, which can look like raw hex bytes at a glance; use a different name\n" + ); +} + +void test_label_hex() { + Hide_errors = true; + transform( + "== code 0x1\n" + "0xab:\n" + " 05 0x0d0c0b0a/imm32\n" + ); + CHECK_TRACE_CONTENTS( + "error: '0xab' looks like a hex number; use a different name\n" + ); +} + +void test_label_negative_hex() { + Hide_errors = true; + transform( + "== code 0x1\n" + "-a:\n" + " 05 0x0d0c0b0a/imm32\n" + ); + CHECK_TRACE_CONTENTS( + "error: '-a' starts with '-', which can be confused with a negative number; use a different name\n" + ); +} + +//: As said up top, the 'Entry' label is special. +//: It can be non-unique; the last declaration overrides earlier ones. +//: It must exist in a program. Otherwise we don't know where to start running +//: programs. + +void test_duplicate_Entry_label() { + transform( + "== code 0x1\n" + "Entry:\n" + "Entry:\n" + " 05 0x0d0c0b0a/imm32\n" + ); + CHECK_TRACE_DOESNT_CONTAIN_ERRORS(); +} + +// This test could do with some refactoring. +// We're duplicating the flow inside `subx translate`, but without +// reading/writing files. +// We can't just use run(string) because most of our tests allow programs +// without 'Entry' labels, as a convenience. +void test_programs_without_Entry_label() { + Hide_errors = true; + program p; + istringstream in( + "== code 0x1\n" + "05 0x0d0c0b0a/imm32\n" + "05 0x0d0c0b0a/imm32\n" + ); + parse(in, p); + transform(p); + ostringstream dummy; + save_elf(p, dummy); + CHECK_TRACE_CONTENTS( + "error: no 'Entry' label found\n" + ); +} + +//: now that we have labels, we need to adjust segment size computation to +//: ignore them. + +void test_segment_size_ignores_labels() { + transform( + "== code 0x09000074\n" + " 05/add 0x0d0c0b0a/imm32\n" // 5 bytes + "foo:\n" // 0 bytes + "== data 0x0a000000\n" + "bar:\n" + " 00\n" + ); + CHECK_TRACE_CONTENTS( + "transform: segment 1 begins at address 0x0a000079\n" + ); +} + +:(before "End size_of(word w) Special-cases") +else if (is_label(w)) + return 0; diff --git a/037global_variables.cc b/037global_variables.cc new file mode 100644 index 00000000..c22ac3d3 --- /dev/null +++ b/037global_variables.cc @@ -0,0 +1,305 @@ +//: Global variables. +//: +//: Global variables are just labels in the data segment. +//: However, they can only be used in imm32 and not disp32 operands. And they +//: can't be used with jump and call instructions. +//: +//: This layer has much the same structure as rewriting labels. + +:(code) +void test_global_variable() { + run( + "== code 0x1\n" + "b9 x/imm32\n" + "== data 0x2000\n" + "x:\n" + " 00 00 00 00\n" + ); + CHECK_TRACE_CONTENTS( + "transform: global variable 'x' is at address 0x00002000\n" + ); +} + +:(before "End Level-2 Transforms") +Transform.push_back(rewrite_global_variables); +:(code) +void rewrite_global_variables(program& p) { + trace(3, "transform") << "-- rewrite global variables" << end(); + // Begin rewrite_global_variables + map address; + compute_addresses_for_global_variables(p, address); + if (trace_contains_errors()) return; + drop_global_variables(p); + replace_global_variables_with_addresses(p, address); +} + +void compute_addresses_for_global_variables(const program& p, map& address) { + for (int i = 0; i < SIZE(p.segments); ++i) { + if (p.segments.at(i).name != "code") + compute_addresses_for_global_variables(p.segments.at(i), address); + } +} + +void compute_addresses_for_global_variables(const segment& s, map& address) { + int current_address = s.start; + for (int i = 0; i < SIZE(s.lines); ++i) { + const line& inst = s.lines.at(i); + for (int j = 0; j < SIZE(inst.words); ++j) { + const word& curr = inst.words.at(j); + if (*curr.data.rbegin() != ':') { + current_address += size_of(curr); + } + else { + string variable = drop_last(curr.data); + // ensure variables look sufficiently different from raw hex + check_valid_name(variable); + if (trace_contains_errors()) return; + if (j > 0) + raise << "'" << to_string(inst) << "': global variable names can only be the first word in a line.\n" << end(); + if (Labels_file.is_open()) + Labels_file << "0x" << HEXWORD << current_address << ' ' << variable << '\n'; + if (contains_key(address, variable)) { + raise << "duplicate global '" << variable << "'\n" << end(); + return; + } + put(address, variable, current_address); + trace(99, "transform") << "global variable '" << variable << "' is at address 0x" << HEXWORD << current_address << end(); + // no modifying current_address; global variable definitions won't be in the final binary + } + } + } +} + +void drop_global_variables(program& p) { + for (int i = 0; i < SIZE(p.segments); ++i) { + if (p.segments.at(i).name != "code") + drop_labels(p.segments.at(i)); + } +} + +void replace_global_variables_with_addresses(program& p, const map& address) { + if (p.segments.empty()) return; + for (int i = 0; i < SIZE(p.segments); ++i) { + segment& curr = p.segments.at(i); + if (curr.name == "code") + replace_global_variables_in_code_segment(curr, address); + else + replace_global_variables_in_data_segment(curr, address); + } +} + +void replace_global_variables_in_code_segment(segment& code, const map& address) { + for (int i = 0; i < SIZE(code.lines); ++i) { + line& inst = code.lines.at(i); + line new_inst; + for (int j = 0; j < SIZE(inst.words); ++j) { + const word& curr = inst.words.at(j); + if (!contains_key(address, curr.data)) { + if (!looks_like_hex_int(curr.data)) + raise << "missing reference to global '" << curr.data << "'\n" << end(); + new_inst.words.push_back(curr); + continue; + } + if (!valid_use_of_global_variable(curr)) { + raise << "'" << to_string(inst) << "': can't refer to global variable '" << curr.data << "'\n" << end(); + return; + } + emit_hex_bytes(new_inst, get(address, curr.data), 4); + } + inst.words.swap(new_inst.words); + trace(99, "transform") << "instruction after transform: '" << data_to_string(inst) << "'" << end(); + } +} + +void replace_global_variables_in_data_segment(segment& data, const map& address) { + for (int i = 0; i < SIZE(data.lines); ++i) { + line& l = data.lines.at(i); + line new_l; + for (int j = 0; j < SIZE(l.words); ++j) { + const word& curr = l.words.at(j); + if (!contains_key(address, curr.data)) { + if (looks_like_hex_int(curr.data)) { + if (has_operand_metadata(curr, "imm32")) + emit_hex_bytes(new_l, curr, 4); + else if (has_operand_metadata(curr, "imm16")) + emit_hex_bytes(new_l, curr, 2); + else if (has_operand_metadata(curr, "imm8")) + emit_hex_bytes(new_l, curr, 1); + else if (has_operand_metadata(curr, "disp8")) + raise << "can't use /disp8 in a non-code segment\n" << end(); + else if (has_operand_metadata(curr, "disp16")) + raise << "can't use /disp16 in a non-code segment\n" << end(); + else if (has_operand_metadata(curr, "disp32")) + raise << "can't use /disp32 in a non-code segment\n" << end(); + else + new_l.words.push_back(curr); + } + else { + raise << "missing reference to global '" << curr.data << "'\n" << end(); + new_l.words.push_back(curr); + } + continue; + } + trace(99, "transform") << curr.data << " maps to " << HEXWORD << get(address, curr.data) << end(); + emit_hex_bytes(new_l, get(address, curr.data), 4); + } + l.words.swap(new_l.words); + trace(99, "transform") << "after transform: '" << data_to_string(l) << "'" << end(); + } +} + +bool valid_use_of_global_variable(const word& curr) { + if (has_operand_metadata(curr, "imm32")) return true; + // End Valid Uses Of Global Variable(curr) + return false; +} + +//:: a more complex sanity check for how we use global variables +//: requires first saving some data early before we pack operands + +:(after "Begin Level-2 Transforms") +Transform.push_back(correlate_disp32_with_mod); +:(code) +void correlate_disp32_with_mod(program& p) { + if (p.segments.empty()) return; + segment& code = *find(p, "code"); + for (int i = 0; i < SIZE(code.lines); ++i) { + line& inst = code.lines.at(i); + for (int j = 0; j < SIZE(inst.words); ++j) { + word& curr = inst.words.at(j); + if (has_operand_metadata(curr, "disp32") + && has_operand_metadata(inst, "mod")) + curr.metadata.push_back("has_mod"); + } + } +} + +:(before "End Valid Uses Of Global Variable(curr)") +if (has_operand_metadata(curr, "disp32")) + return has_metadata(curr, "has_mod"); +// todo: more sophisticated check, to ensure we don't use global variable +// addresses as a real displacement added to other operands. + +:(code) +bool has_metadata(const word& w, const string& m) { + for (int i = 0; i < SIZE(w.metadata); ++i) + if (w.metadata.at(i) == m) return true; + return false; +} + +void test_global_variable_disallowed_in_jump() { + Hide_errors = true; + run( + "== code 0x1\n" + "eb/jump x/disp8\n" + "== data 0x2000\n" + "x:\n" + " 00 00 00 00\n" + ); + CHECK_TRACE_CONTENTS( + "error: 'eb/jump x/disp8': can't refer to global variable 'x'\n" + // sub-optimal error message; should be +//? "error: can't jump to data (variable 'x')\n" + ); +} + +void test_global_variable_disallowed_in_call() { + Hide_errors = true; + run( + "== code 0x1\n" + "e8/call x/disp32\n" + "== data 0x2000\n" + "x:\n" + " 00 00 00 00\n" + ); + CHECK_TRACE_CONTENTS( + "error: 'e8/call x/disp32': can't refer to global variable 'x'\n" + // sub-optimal error message; should be +//? "error: can't call to the data segment ('x')\n" + ); +} + +void test_global_variable_in_data_segment() { + run( + "== code 0x1\n" + "b9 x/imm32\n" + "== data 0x2000\n" + "x:\n" + " y/imm32\n" + "y:\n" + " 00 00 00 00\n" + ); + // check that we loaded 'x' with the address of 'y' + CHECK_TRACE_CONTENTS( + "load: 0x00002000 -> 04\n" + "load: 0x00002001 -> 20\n" + "load: 0x00002002 -> 00\n" + "load: 0x00002003 -> 00\n" + ); + CHECK_TRACE_COUNT("error", 0); +} + +void test_raw_number_with_imm32_in_data_segment() { + run( + "== code 0x1\n" + "b9 x/imm32\n" + "== data 0x2000\n" + "x:\n" + " 1/imm32\n" + ); + // check that we loaded 'x' with the address of 1 + CHECK_TRACE_CONTENTS( + "load: 0x00002000 -> 01\n" + "load: 0x00002001 -> 00\n" + "load: 0x00002002 -> 00\n" + "load: 0x00002003 -> 00\n" + ); + CHECK_TRACE_COUNT("error", 0); +} + +void test_duplicate_global_variable() { + Hide_errors = true; + run( + "== code 0x1\n" + "40/increment-EAX\n" + "== data 0x2000\n" + "x:\n" + "x:\n" + " 00\n" + ); + CHECK_TRACE_CONTENTS( + "error: duplicate global 'x'\n" + ); +} + +void test_global_variable_disp32_with_modrm() { + run( + "== code 0x1\n" + "8b/copy 0/mod/indirect 5/rm32/.disp32 2/r32/EDX x/disp32\n" + "== data 0x2000\n" + "x:\n" + " 00 00 00 00\n" + ); + CHECK_TRACE_COUNT("error", 0); +} + +void test_global_variable_disp32_with_call() { + transform( + "== code 0x1\n" + "foo:\n" + " e8/call bar/disp32\n" + "bar:\n" + ); + CHECK_TRACE_COUNT("error", 0); +} + +string to_full_string(const line& in) { + ostringstream out; + for (int i = 0; i < SIZE(in.words); ++i) { + if (i > 0) out << ' '; + out << in.words.at(i).data; + for (int j = 0; j < SIZE(in.words.at(i).metadata); ++j) + out << '/' << in.words.at(i).metadata.at(j); + } + return out.str(); +} -- cgit 1.4.1-2-gfad0