From 6e1eeeebfb453fa7c871869c19375ce60fbd7413 Mon Sep 17 00:00:00 2001 From: Kartik Agaram Date: Sat, 27 Jul 2019 16:01:55 -0700 Subject: 5485 - promote SubX to top-level --- 011run.cc | 467 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 467 insertions(+) create mode 100644 011run.cc (limited to '011run.cc') diff --git a/011run.cc b/011run.cc new file mode 100644 index 00000000..194676d8 --- /dev/null +++ b/011run.cc @@ -0,0 +1,467 @@ +//: Running SubX programs on the VM. + +//: (Not to be confused with the 'run' subcommand for running ELF binaries on +//: the VM. That comes later.) + +:(before "End Help Texts") +put_new(Help, "syntax", + "SubX programs consist of segments, each segment in turn consisting of lines.\n" + "Line-endings are significant; each line should contain a single\n" + "instruction, macro or directive.\n" + "\n" + "Comments start with the '#' character. It should be at the start of a word\n" + "(start of line, or following a space).\n" + "\n" + "Each segment starts with a header line: a '==' delimiter followed by the name of\n" + "the segment and a (sometimes approximate) starting address in memory.\n" + "The name 'code' is special; instructions to execute should always go here.\n" + "\n" + "The resulting binary starts running code from a label called 'Entry'\n" + "in the code segment.\n" + "\n" + "Segments with the same name get merged together. This rule helps keep functions and\n" + "their data close together in .subx files.\n" + "You don't have to specify the starting address after the first time.\n" + "\n" + "Lines consist of a series of words. Words can contain arbitrary metadata\n" + "after a '/', but they can never contain whitespace. Metadata has no effect\n" + "at runtime, but can be handy when rewriting macros.\n" + "\n" + "Check out the examples in the examples/ directory.\n" +); +:(before "End Help Contents") +cerr << " syntax\n"; + +:(code) +void test_copy_imm32_to_EAX() { + // At the lowest level, SubX programs are a series of hex bytes, each + // (variable-length) instruction on one line. + run( + // Comments start with '#' and are ignored. + "# comment\n" + // Segment headers start with '==', a name and a starting hex address. + // There's usually one code and one data segment. The code segment + // always comes first. + "== code 0x1\n" // code segment + + // After the header, each segment consists of lines, and each line + // consists of words separated by whitespace. + // + // All words can have metadata after a '/'. No spaces allowed in + // metadata, of course. + // Unrecognized metadata never causes errors, so you can use it for + // documentation. + // + // Within the code segment in particular, x86 instructions consist of + // some number of the following parts and sub-parts (see the Readme and + // cheatsheet.pdf for details): + // opcodes: 1-3 bytes + // ModR/M byte + // SIB byte + // displacement: 0/1/2/4 bytes + // immediate: 0/1/2/4 bytes + // opcode ModR/M SIB displacement immediate + // instruction mod, reg, Reg/Mem bits scale, index, base + // 1-3 bytes 0/1 byte 0/1 byte 0/1/2/4 bytes 0/1/2/4 bytes + " b8 . . . 0a 0b 0c 0d\n" // copy 0x0d0c0b0a to EAX + // The periods are just to help the eye track long gaps between columns, + // and are otherwise ignored. + ); + // This program, when run, causes the following events in the trace: + CHECK_TRACE_CONTENTS( + "load: 0x00000001 -> b8\n" + "load: 0x00000002 -> 0a\n" + "load: 0x00000003 -> 0b\n" + "load: 0x00000004 -> 0c\n" + "load: 0x00000005 -> 0d\n" + "run: copy imm32 0x0d0c0b0a to EAX\n" + ); +} + +// top-level helper for scenarios: parse the input, transform any macros, load +// the final hex bytes into memory, run it +void run(const string& text_bytes) { + program p; + istringstream in(text_bytes); + parse(in, p); + if (trace_contains_errors()) return; // if any stage raises errors, stop immediately + transform(p); + if (trace_contains_errors()) return; + load(p); + if (trace_contains_errors()) return; + // convenience to keep tests concise: 'Entry' label need not be provided + // not allowed in real programs + if (p.entry) + EIP = p.entry; + else + EIP = find(p, "code")->start; + while (EIP < End_of_program) + run_one_instruction(); +} + +//:: core data structures + +:(before "End Types") +struct program { + uint32_t entry; + vector segments; + program() { entry = 0; } +}; +:(before "struct program") +struct segment { + string name; + uint32_t start; + vector lines; + // End segment Fields + segment() { + start = 0; + // End segment Constructor + } +}; +:(before "struct segment") +struct line { + vector words; + vector metadata; + string original; +}; +:(before "struct line") +struct word { + string original; + string data; + vector metadata; +}; + +//:: parse + +:(code) +void parse(istream& fin, program& out) { + segment* curr_segment = NULL; + vector l; + while (has_data(fin)) { + string line_data; + line curr; + getline(fin, line_data); + curr.original = line_data; + trace(99, "parse") << "line: " << line_data << end(); + // End Line Parsing Special-cases(line_data -> l) + istringstream lin(line_data); + while (has_data(lin)) { + string word_data; + lin >> word_data; + if (word_data.empty()) continue; + if (word_data[0] == '#') break; // comment + if (word_data == ".") continue; // comment token + if (word_data == "==") { + flush(curr_segment, l); + string segment_name; + lin >> segment_name; + curr_segment = find(out, segment_name); + if (curr_segment != NULL) { + trace(3, "parse") << "appending to segment '" << segment_name << "'" << end(); + } + else { + trace(3, "parse") << "new segment '" << segment_name << "'" << end(); + uint32_t seg_start = 0; + lin >> std::hex >> seg_start; + sanity_check_program_segment(out, seg_start); + out.segments.push_back(segment()); + curr_segment = &out.segments.back(); + curr_segment->name = segment_name; + curr_segment->start = seg_start; + if (trace_contains_errors()) continue; + trace(3, "parse") << "starts at address 0x" << HEXWORD << curr_segment->start << end(); + } + break; // skip rest of line + } + if (word_data[0] == ':') { + // todo: line metadata + break; + } + curr.words.push_back(word()); + parse_word(word_data, curr.words.back()); + trace(99, "parse") << "word: " << to_string(curr.words.back()); + } + if (!curr.words.empty()) + l.push_back(curr); + } + flush(curr_segment, l); + trace(99, "parse") << "done" << end(); +} + +segment* find(program& p, const string& segment_name) { + for (int i = 0; i < SIZE(p.segments); ++i) { + if (p.segments.at(i).name == segment_name) + return &p.segments.at(i); + } + return NULL; +} + +void flush(segment* s, vector& lines) { + if (lines.empty()) return; + if (s == NULL) { + raise << "input does not start with a '==' section header\n" << end(); + return; + } + trace(3, "parse") << "flushing segment" << end(); + s->lines.insert(s->lines.end(), lines.begin(), lines.end()); + lines.clear(); +} + +void parse_word(const string& data, word& out) { + out.original = data; + istringstream win(data); + if (getline(win, out.data, '/')) { + string m; + while (getline(win, m, '/')) + out.metadata.push_back(m); + } +} + +void sanity_check_program_segment(const program& p, uint32_t addr) { + for (int i = 0; i < SIZE(p.segments); ++i) { + if (p.segments.at(i).start == addr) + raise << "can't have multiple segments starting at address 0x" << HEXWORD << addr << '\n' << end(); + } +} + +// helper for tests +void parse(const string& text_bytes) { + program p; + istringstream in(text_bytes); + parse(in, p); +} + +void test_detect_duplicate_segments() { + Hide_errors = true; + parse( + "== segment1 0xee\n" + "ab\n" + "== segment2 0xee\n" + "cd\n" + ); + CHECK_TRACE_CONTENTS( + "error: can't have multiple segments starting at address 0x000000ee\n" + ); +} + +//:: transform + +:(before "End Types") +typedef void (*transform_fn)(program&); +:(before "End Globals") +vector Transform; + +:(code) +void transform(program& p) { + for (int t = 0; t < SIZE(Transform); ++t) + (*Transform.at(t))(p); +} + +//:: load + +void load(const program& p) { + if (find(p, "code") == NULL) { + raise << "no code to run\n" << end(); + return; + } + // Ensure segments are disjoint. + set overlap; + for (int i = 0; i < SIZE(p.segments); ++i) { + const segment& seg = p.segments.at(i); + uint32_t addr = seg.start; + if (!already_allocated(addr)) + Mem.push_back(vma(seg.start)); + trace(99, "load") << "loading segment " << i << " from " << HEXWORD << addr << end(); + for (int j = 0; j < SIZE(seg.lines); ++j) { + const line& l = seg.lines.at(j); + for (int k = 0; k < SIZE(l.words); ++k) { + const word& w = l.words.at(k); + uint8_t val = hex_byte(w.data); + if (trace_contains_errors()) return; + assert(overlap.find(addr) == overlap.end()); + write_mem_u8(addr, val); + overlap.insert(addr); + trace(99, "load") << "0x" << HEXWORD << addr << " -> " << HEXBYTE << NUM(read_mem_u8(addr)) << end(); + ++addr; + } + } + if (seg.name == "code") { + End_of_program = addr; + } + } +} + +const segment* find(const program& p, const string& segment_name) { + for (int i = 0; i < SIZE(p.segments); ++i) { + if (p.segments.at(i).name == segment_name) + return &p.segments.at(i); + } + return NULL; +} + +uint8_t hex_byte(const string& s) { + if (contains_uppercase(s)) { + raise << "uppercase hex not allowed: " << s << '\n' << end(); + return 0; + } + istringstream in(s); + int result = 0; + in >> std::hex >> result; + if (!in || !in.eof()) { + raise << "token '" << s << "' is not a hex byte\n" << end(); + return '\0'; + } + if (result > 0xff || result < -0x8f) { + raise << "token '" << s << "' is not a hex byte\n" << end(); + return '\0'; + } + return static_cast(result); +} + +void test_number_too_large() { + Hide_errors = true; + parse_and_load( + "== code 0x1\n" + "01 cab\n" + ); + CHECK_TRACE_CONTENTS( + "error: token 'cab' is not a hex byte\n" + ); +} + +void test_invalid_hex() { + Hide_errors = true; + parse_and_load( + "== code 0x1\n" + "01 cx\n" + ); + CHECK_TRACE_CONTENTS( + "error: token 'cx' is not a hex byte\n" + ); +} + +void test_negative_number() { + parse_and_load( + "== code 0x1\n" + "01 -02\n" + ); + CHECK_TRACE_COUNT("error", 0); +} + +void test_negative_number_too_small() { + Hide_errors = true; + parse_and_load( + "== code 0x1\n" + "01 -12345\n" + ); + CHECK_TRACE_CONTENTS( + "error: token '-12345' is not a hex byte\n" + ); +} + +void test_hex_prefix() { + parse_and_load( + "== code 0x1\n" + "0x01 -0x02\n" + ); + CHECK_TRACE_COUNT("error", 0); +} + +void test_repeated_segment_merges_data() { + parse_and_load( + "== code 0x1\n" + "11 22\n" + "== code\n" // again + "33 44\n" + ); + CHECK_TRACE_CONTENTS( + "parse: new segment 'code'\n" + "parse: appending to segment 'code'\n" + // first segment + "load: 0x00000001 -> 11\n" + "load: 0x00000002 -> 22\n" + // second segment + "load: 0x00000003 -> 33\n" + "load: 0x00000004 -> 44\n" + ); +} + +void test_error_on_missing_segment_header() { + Hide_errors = true; + parse_and_load( + "01 02\n" + ); + CHECK_TRACE_CONTENTS( + "error: input does not start with a '==' section header\n" + ); +} + +void test_error_on_uppercase_hex() { + Hide_errors = true; + parse_and_load( + "== code\n" + "01 Ab\n" + ); + CHECK_TRACE_CONTENTS( + "error: uppercase hex not allowed: Ab\n" + ); +} + +//: helper for tests +void parse_and_load(const string& text_bytes) { + program p; + istringstream in(text_bytes); + parse(in, p); + if (trace_contains_errors()) return; // if any stage raises errors, stop immediately + load(p); +} + +//:: run + +:(before "End Initialize Op Names") +put_new(Name, "b8", "copy imm32 to EAX (mov)"); + +//: our first opcode + +:(before "End Single-Byte Opcodes") +case 0xb8: { // copy imm32 to EAX + const int32_t src = next32(); + trace(Callstack_depth+1, "run") << "copy imm32 0x" << HEXWORD << src << " to EAX" << end(); + Reg[EAX].i = src; + break; +} + +:(code) +void test_copy_imm32_to_EAX_again() { + run( + "== code 0x1\n" // code segment + // op ModR/M SIB displacement immediate + " b8 0a 0b 0c 0d \n" // copy 0x0d0c0b0a to EAX + ); + CHECK_TRACE_CONTENTS( + "run: copy imm32 0x0d0c0b0a to EAX\n" + ); +} + +// read a 32-bit int in little-endian order from the instruction stream +int32_t next32() { + int32_t result = read_mem_i32(EIP); + EIP+=4; + return result; +} + +//:: helpers + +string to_string(const word& w) { + ostringstream out; + out << w.data; + for (int i = 0; i < SIZE(w.metadata); ++i) + out << " /" << w.metadata.at(i); + return out.str(); +} + +bool contains_uppercase(const string& s) { + for (int i = 0; i < SIZE(s); ++i) + if (isupper(s.at(i))) return true; + return false; +} -- cgit 1.4.1-2-gfad0