diff options
author | Kartik Agaram <vc@akkartik.com> | 2018-07-15 22:59:02 -0700 |
---|---|---|
committer | Kartik Agaram <vc@akkartik.com> | 2018-07-15 22:59:02 -0700 |
commit | 1f56ac6483f97ab18245c69c8c006be158c18a8d (patch) | |
tree | 85006b281492a4e5504a32cb7b4a54943c984942 /subx/011parse.cc | |
parent | e1fcc521be3d2ec9e379b3baa974cb805386496d (diff) | |
download | mu-1f56ac6483f97ab18245c69c8c006be158c18a8d.tar.gz |
4350
Reorganize layers to introduce the translation workflow right at the start. We also avoid duplicating parsing code. Programs are always parsed into the `program` data structure.
Diffstat (limited to 'subx/011parse.cc')
-rw-r--r-- | subx/011parse.cc | 209 |
1 files changed, 209 insertions, 0 deletions
diff --git a/subx/011parse.cc b/subx/011parse.cc new file mode 100644 index 00000000..4735dfd2 --- /dev/null +++ b/subx/011parse.cc @@ -0,0 +1,209 @@ +//: Loading programs into the VM. + +:(scenario add_imm32_to_eax) +# At the lowest level, SubX programs are a series of hex bytes, each +# (variable-length) instruction on one line. +# +# Later we'll make things nicer using macros. But you'll always be able to +# insert hex bytes out of instructions. +# +# As you can see, comments start with '#' and are ignored. + +# Segment headers start with '==', specifying the hex address where they +# begin. The first segment is always assumed to be code. +== 0x1 + +# We don't show it here, but all lines can have metadata after a ':'. +# All words can have metadata after a '/'. No spaces allowed in word metadata, of course. +# Metadata doesn't directly form instructions, but some macros may look at it. +# Unrecognized metadata never causes errors, so you can also use it for +# documentation. + +# Within the code segment, x86 instructions consist of the following parts (see cheatsheet.pdf): +# opcode ModR/M SIB displacement immediate +# instruction mod, reg, Reg/Mem bits scale, index, base +# 1-3 bytes 0/1 byte 0/1 byte 0/1/2/4 bytes 0/1/2/4 bytes + 05 0a 0b 0c 0d # add 0x0d0c0b0a to EAX + +# This program, when run, causes the following events in the trace: ++load: 0x00000001 -> 05 ++load: 0x00000002 -> 0a ++load: 0x00000003 -> 0b ++load: 0x00000004 -> 0c ++load: 0x00000005 -> 0d ++run: add imm32 0x0d0c0b0a to reg EAX ++run: storing 0x0d0c0b0a + +:(code) +// top-level helper for scenarios: parse the input, transform any macros, load +// the final hex bytes into memory, run it +void run(const string& text_bytes) { + program p; + istringstream in(text_bytes); + parse(in, p); + if (trace_contains_errors()) return; // if any stage raises errors, stop immediately + transform(p); + if (trace_contains_errors()) return; + load(p); + if (trace_contains_errors()) return; + if (p.segments.empty()) return; + EIP = p.segments.at(0).start; + while (EIP < End_of_program) + run_one_instruction(); +} + +//:: core data structures + +:(before "End Types") +struct program { + vector<segment> segments; + // random ideas for other things we may eventually need + //map<name, address> globals; + //vector<recipe> recipes; + //map<string, type_info> types; +}; +:(before "struct program") +struct segment { + uint32_t start; + vector<line> lines; + segment() :start(0) {} +}; +:(before "struct segment") +struct line { + vector<word> words; + vector<string> metadata; +}; +:(before "struct line") +struct word { + string original; + string data; + vector<string> metadata; +}; + +//:: parse + +:(code) +void parse(istream& fin, program& out) { + vector<line> l; + while (has_data(fin)) { + string line_data; + getline(fin, line_data); + trace(99, "parse") << "line: " << line_data << end(); + istringstream lin(line_data); + vector<word> w; + while (has_data(lin)) { + string word_data; + lin >> word_data; + if (word_data.empty()) continue; + if (word_data == "==") { + if (!l.empty()) { + assert(!out.segments.empty()); + trace(99, "parse") << "flushing to segment" << end(); + out.segments.back().lines.swap(l); + } + segment s; + lin >> std::hex >> s.start; + trace(99, "parse") << "new segment from " << HEXWORD << s.start << end(); + out.segments.push_back(s); + // todo? + break; // skip rest of line + } + if (word_data[0] == ':') { + // todo: line metadata + break; + } + if (word_data[0] == '#') { + // comment + break; + } + w.push_back(word()); + w.back().original = word_data; + istringstream win(word_data); + if (getline(win, w.back().data, '/')) { + string m; + while (getline(win, m, '/')) + w.back().metadata.push_back(m); + } + trace(99, "parse") << "new word: " << w.back().data << end(); + } + if (!w.empty()) { + l.push_back(line()); + l.back().words.swap(w); + } + } + if (!l.empty()) { + assert(!out.segments.empty()); + trace(99, "parse") << "flushing to segment" << end(); + out.segments.back().lines.swap(l); + } +} + +//:: transform + +:(before "End Types") +typedef void (*transform_fn)(program&); +:(before "End Globals") +vector<transform_fn> Transform; + +void transform(program& p) { + for (int t = 0; t < SIZE(Transform); ++t) + (*Transform.at(t))(p); +} + +//:: load + +void load(const program& p) { + for (int i = 0; i < SIZE(p.segments); ++i) { + const segment& seg = p.segments.at(i); + uint32_t addr = seg.start; + trace(99, "load") << "loading segment " << i << " from " << HEXWORD << addr << end(); + for (int j = 0; j < SIZE(seg.lines); ++j) { + const line& l = seg.lines.at(j); + for (int k = 0; k < SIZE(l.words); ++k) { + const word& w = l.words.at(k); + uint8_t val = hex_byte(w.data); + if (trace_contains_errors()) return; + write_mem_u8(addr, val); + trace(99, "load") << "0x" << HEXWORD << addr << " -> " << HEXBYTE << NUM(read_mem_u8(addr)) << end(); + ++addr; + } + } + if (i == 0) End_of_program = addr; + } +} + +uint8_t hex_byte(const string& s) { + istringstream in(s); + int result = 0; + in >> std::hex >> result; + if (!in) { + raise << "invalid hex " << s << '\n' << end(); + return '\0'; + } + if (result > 0xff) { + raise << "invalid hex byte " << std::hex << result << '\n' << end(); + return '\0'; + } + return static_cast<uint8_t>(result); +} + +//:: run + +//: our first opcode +:(before "End Single-Byte Opcodes") +case 0x05: { // add imm32 to EAX + int32_t arg2 = imm32(); + trace(2, "run") << "add imm32 0x" << HEXWORD << arg2 << " to reg EAX" << end(); + BINARY_ARITHMETIC_OP(+, Reg[EAX].i, arg2); + break; +} + +:(code) +// read a 32-bit immediate in little-endian order from the instruction stream +int32_t imm32() { + int32_t result = next(); + result |= (next()<<8); + result |= (next()<<16); + result |= (next()<<24); + return result; +} |