1 //: Running SubX programs on the VM.
  2 
  3 //: (Not to be confused with the 'run' subcommand for running ELF binaries on
  4 //: the VM. That comes later.)
  5 
  6 :(before "End Help Texts")
  7 put(Help, "syntax",
  8   "SubX programs consist of segments, each segment in turn consisting of lines.\n"
  9   "Line-endings are significant; each line should contain a single\n"
 10   "instruction, macro or directive.\n"
 11   "\n"
 12   "Comments start with the '#' character. It should be at the start of a word\n"
 13   "(start of line, or following a space).\n"
 14   "\n"
 15   "Each segment starts with a header line: a '==' delimiter followed by the\n"
 16   "starting address for the segment.\n"
 17   "\n"
 18   "The starting address for a segment has some finicky requirements. But just\n"
 19   "start with a round number, and `subx` will try to guide you to a valid\n"
 20   "configuration.\n"
 21   "A good rule of thumb is to try to start the first segment at the default\n"
 22   "address of 0x08048000, and to start each subsequent segment at least 0x1000\n"
 23   "(most common page size) bytes after the last.\n"
 24   "If a segment occupies than 0x1000 bytes you'll need to push subsequent\n"
 25   "segments further down.\n"
 26   "Currently only the first segment contains executable code (because it gets\n"
 27   "annoying to have to change addresses in later segments every time an earlier\n"
 28   "one changes length; one of those finicky requirements).\n"
 29   "\n"
 30   "Lines consist of a series of words. Words can contain arbitrary metadata\n"
 31   "after a '/', but they can never contain whitespace. Metadata has no effect\n"
 32   "at runtime, but can be handy when rewriting macros.\n"
 33   "\n"
 34   "Check out some examples in this directory (ex*.subx)\n"
 35   "Programming in machine code can be annoying, but let's see if we can make\n"
 36   "it nice enough to be able to write a compiler in it.\n"
 37 );
 38 :(before "End Help Contents")
// Print the 'syntax' topic name as one entry of the help contents listing.
cerr << "  syntax\n";
 40 
 41 :(scenario add_imm32_to_eax)
 42 # At the lowest level, SubX programs are a series of hex bytes, each
 43 # (variable-length) instruction on one line.
 44 #
 45 # Later we'll make things nicer using macros. But you'll always be able to
 46 # insert hex bytes out of instructions.
 47 #
 48 # As you can see, comments start with '#' and are ignored.
 49 
 50 # Segment headers start with '==', specifying the hex address where they
 51 # begin. The first segment is always assumed to be code.
 52 == 0x1
 53 
 54 # We don't show it here, but all lines can have metadata after a ':'.
 55 # All words can have metadata after a '/'. No spaces allowed in word metadata, of course.
 56 # Metadata doesn't directly form instructions, but some macros may look at it.
 57 # Unrecognized metadata never causes errors, so you can also use it for
 58 # documentation.
 59 
 60 # Within the code segment, x86 instructions consist of the following parts (see cheatsheet.pdf):
 61 #   opcode        ModR/M                    SIB                   displacement    immediate
 62 #   instruction   mod, reg, Reg/Mem bits    scale, index, base
 63 #   1-3 bytes     0/1 byte                  0/1 byte              0/1/2/4 bytes   0/1/2/4 bytes
 64     05            .                         .                     .               0a 0b 0c 0d  # add 0x0d0c0b0a to EAX
 65 # (The single periods are just to help the eye track long gaps between
 66 # columns, and are otherwise ignored.)
 67 
 68 # This program, when run, causes the following events in the trace:
 69 +load: 0x00000001 -> 05
 70 +load: 0x00000002 -> 0a
 71 +load: 0x00000003 -> 0b
 72 +load: 0x00000004 -> 0c
 73 +load: 0x00000005 -> 0d
 74 +run: add imm32 0x0d0c0b0a to reg EAX
 75 +run: storing 0x0d0c0b0a
 76 
 77 :(code)
 78 // top-level helper for scenarios: parse the input, transform any macros, load
 79 // the final hex bytes into memory, run it
 80 void run(const string& text_bytes) {
 81   program p;
 82   istringstream in(text_bytes);
 83   parse(in, p);
 84   if (trace_contains_errors()) return;  // if any stage raises errors, stop immediately
 85   transform(p);
 86   if (trace_contains_errors()) return;
 87   load(p);
 88   if (trace_contains_errors()) return;
 89   while (EIP < End_of_program)
 90     run_one_instruction();
 91 }
 92 
 93 //:: core data structures
 94 
 95 :(before "End Types")
// A parsed SubX program: the list of segments that parse() appends to as it
// encounters segment headers in the source text.
struct program {
  vector<segment> segments;  // in the order their '==' headers were read
  // random ideas for other things we may eventually need
  //map<name, address> globals;
  //vector<recipe> recipes;
  //map<string, type_info> types;
};
103 :(before "struct program")
// One segment of a program: a starting address plus the lines of words that
// load() writes to consecutive memory addresses beginning at that address.
struct segment {
  uint32_t start;  // address read (as hex) from the segment's '==' header
  vector<line> lines;  // parse() only stores lines containing at least one word
  segment() :start(0) {}  // a fresh segment starts at address 0 until a header overrides it
};
109 :(before "struct segment")
// One line of program text, split into words.
struct line {
  vector<word> words;  // the words on the line, minus comments and '.' filler tokens
  vector<string> metadata;  // NOTE(review): nothing visible in this layer fills this in yet (line metadata is a todo in parse())
  string original;  // the raw text of the line exactly as read from the input
};
115 :(before "struct line")
// One whitespace-delimited token of a line, split on '/' into data and metadata.
struct word {
  string original;  // the whole token as it appeared in the source
  string data;  // the part of the token before the first '/'
  vector<string> metadata;  // each '/'-separated piece following the data
};
121 
122 //:: parse
123 
124 :(code)
125 void parse(istream& fin, program& out) {
126   vector<line> l;
127   trace(99, "parse") << "begin" << end();
128   while (has_data(fin)) {
129     string line_data;
130     line curr;
131     getline(fin, line_data);
132     curr.original = line_data;
133     trace(99, "parse") << "line: " << line_data << end();
134     istringstream lin(line_data);
135     while (has_data(lin)) {
136       string word_data;
137       lin >> word_data;
138       if (word_data.empty()) continue;
139       if (word_data[0] == '#') break;  // comment
140       if (word_data == ".") continue;  // comment token
141       if (word_data == "==") {
142         if (!l.empty()) {
143           assert(!out.segments.empty());
144           trace(99, "parse") << "flushing to segment" << end();
145           out.segments.back().lines.swap(l);
146         }
147         segment s;
148         lin >> std::hex >> s.start;
149         trace(99, "parse") << "new segment from " << HEXWORD << s.start << end();
150         out.segments.push_back(s);
151         // todo?
152         break;  // skip rest of line
153       }
154       if (word_data[0] == ':') {
155         // todo: line metadata
156         break;
157       }
158       curr.words.push_back(word());
159       curr.words.back().original = word_data;
160       istringstream win(word_data);
161       if (getline(win, curr.words.back().data, '/')) {
162         string m;
163         while (getline(win, m, '/'))
164           curr.words.back().metadata.push_back(m);
165       }
166       trace(99, "parse") << "new word: " << curr.words.back().data << end();
167     }
168     if (!curr.words.empty())
169       l.push_back(curr);
170   }
171   if (!l.empty()) {
172     assert(!out.segments.empty());
173     trace(99, "parse") << "flushing to segment" << end();
174     out.segments.back().lines.swap(l);
175   }
176   trace(99, "parse") << "done" << end();
177 }
178 
179 //:: transform
180 
181 :(before "End Types")
// Signature of a transform pass: it receives the whole program by mutable
// reference.
typedef void (*transform_fn)(program&);
183 :(before "End Globals")
// The transform passes that transform() applies to a program, in order.
vector<transform_fn> Transform;
185 
186 void transform(program& p) {
187   trace(99, "transform") << "begin" << end();
188   for (int t = 0;  t < SIZE(Transform);  ++t)
189     (*Transform.at(t))(p);
190   trace(99, "transform") << "done" << end();
191 }
192 
193 //:: load
194 
195 void load(const program& p) {
196   trace(99, "load") << "begin" << end();
197   if (p.segments.empty()) {
198     raise << "no code to run\n" << end();
199     return;
200   }
201   for (int i = 0;   i < SIZE(p.segments);  ++i) {
202     const segment& seg = p.segments.at(i);
203     uint32_t addr = seg.start;
204     trace(99, "load") << "loading segment " << i << " from " << HEXWORD << addr << end();
205     for (int j = 0;  j < SIZE(seg.lines);  ++j) {
206       const line& l = seg.lines.at(j);
207       for (int k = 0;  k < SIZE(l.words);  ++k) {
208         const word& w = l.words.at(k);
209         uint8_t val = hex_byte(w.data);
210         if (trace_contains_errors()) return;
211         write_mem_u8(addr, val);
212         trace(99, "load") << "0x" << HEXWORD << addr << " -> " << HEXBYTE << NUM(read_mem_u8(addr)) << end();
213         ++addr;
214       }
215     }
216     if (i == 0) End_of_program = addr;
217   }
218   EIP = p.segments.at(0).start;
219   trace(99, "load") << "done" << end();
220 }
221 
222 uint8_t hex_byte(const string& s) {
223   istringstream in(s);
224   int result = 0;
225   in >> std::hex >> result;
226   if (!in || !in.eof()) {
227     raise << "token '" << s << "' is not a hex byte\n" << end();
228     return '\0';
229   }
230   if (result > 0xff || result < -0x8f) {
231     raise << "token '" << s << "' is not a hex byte\n" << end();
232     return '\0';
233   }
234   return static_cast<uint8_t>(result);
235 }
236 
237 :(scenarios parse_and_load)
238 :(scenario number_too_large)
239 % Hide_errors = true;
240 == 0x1
241 05 cab
242 +error: token 'cab' is not a hex byte
243 
244 :(scenario invalid_hex)
245 % Hide_errors = true;
246 == 0x1
247 05 cx
248 +error: token 'cx' is not a hex byte
249 
250 :(scenario negative_number)
251 == 0x1
252 05 -12
253 $error: 0
254 
255 :(scenario negative_number_too_small)
256 % Hide_errors = true;
257 == 0x1
258 05 -12345
259 +error: token '-12345' is not a hex byte
260 
261 :(scenario hex_prefix)
262 == 0x1
263 0x05 -0x12
264 $error: 0
265 
266 //: helper for tests
267 :(code)
268 void parse_and_load(const string& text_bytes) {
269   program p;
270   istringstream in(text_bytes);
271   parse(in, p);
272   if (trace_contains_errors()) return;  // if any stage raises errors, stop immediately
273   load(p);
274 }
275 
276 //:: run
277 
278 :(before "End Initialize Op Names(name)")
// human-readable description registered under the opcode's hex text "05"
put(name, "05", "add imm32 to R0 (EAX)");
280 
281 //: our first opcode
282 :(before "End Single-Byte Opcodes")
case 0x05: {  // add imm32 to EAX
  // the operand is the 32-bit immediate that follows the opcode byte
  int32_t arg2 = imm32();
  trace(90, "run") << "add imm32 0x" << HEXWORD << arg2 << " to reg EAX" << end();
  // NOTE(review): BINARY_ARITHMETIC_OP is defined elsewhere; presumably it
  // stores Reg[EAX].i + arg2 back into Reg[EAX].i and emits the "storing"
  // trace line the scenario expects -- confirm against its definition
  BINARY_ARITHMETIC_OP(+, Reg[EAX].i, arg2);
  break;
}
289 
290 :(code)
291 // read a 32-bit immediate in little-endian order from the instruction stream
292 int32_t imm32() {
293   int32_t result = next();
294   result |= (next()<<8);
295   result |= (next()<<16);
296   result |= (next()<<24);
297   return result;
298 }