https://github.com/akkartik/mu/blob/master/011run.cc
  1 //: Running SubX programs on the VM.
  2 
  3 //: (Not to be confused with the 'run' subcommand for running ELF binaries on
  4 //: the VM. That comes later.)
  5 
  6 :(before "End Help Texts")
  7 put_new(Help, "syntax",
  8   "SubX programs consist of segments, each segment in turn consisting of lines.\n"
  9   "Line-endings are significant; each line should contain a single\n"
 10   "instruction, macro or directive.\n"
 11   "\n"
 12   "Comments start with the '#' character. It should be at the start of a word\n"
 13   "(start of line, or following a space).\n"
 14   "\n"
 15   "Each segment starts with a header line: a '==' delimiter followed by the name of\n"
 16   "the segment and a (sometimes approximate) starting address in memory.\n"
 17   "The name 'code' is special; instructions to execute should always go here.\n"
 18   "\n"
 19   "The resulting binary starts running code from a label called 'Entry'\n"
 20   "in the code segment.\n"
 21   "\n"
 22   "Segments with the same name get merged together. This rule helps keep functions and\n"
 23   "their data close together in .subx files.\n"
 24   "You don't have to specify the starting address after the first time.\n"
 25   "\n"
 26   "Lines consist of a series of words. Words can contain arbitrary metadata\n"
 27   "after a '/', but they can never contain whitespace. Metadata has no effect\n"
 28   "at runtime, but can be handy when rewriting macros.\n"
 29   "\n"
 30   "Check out the examples in the examples/ directory.\n"
 31 );
 32 :(before "End Help Contents")
 33 cerr << "  syntax\n";
 34 
 35 :(code)
 36 void test_copy_imm32_to_EAX() {
 37   // At the lowest level, SubX programs are a series of hex bytes, each
 38   // (variable-length) instruction on one line.
 39   run(
 40       // Comments start with '#' and are ignored.
 41       "# comment\n"
 42       // Segment headers start with '==', a name and a starting hex address.
 43       // There's usually one code and one data segment. The code segment
 44       // always comes first.
 45       "== code 0x1\n"  // code segment
 46 
 47       // After the header, each segment consists of lines, and each line
 48       // consists of words separated by whitespace.
 49       //
 50       // All words can have metadata after a '/'. No spaces allowed in
 51       // metadata, of course.
 52       // Unrecognized metadata never causes errors, so you can use it for
 53       // documentation.
 54       //
 55       // Within the code segment in particular, x86 instructions consist of
 56       // some number of the following parts and sub-parts (see the Readme and
 57       // cheatsheet.pdf for details):
 58       //   opcodes: 1-3 bytes
 59       //   ModR/M byte
 60       //   SIB byte
 61       //   displacement: 0/1/2/4 bytes
 62       //   immediate: 0/1/2/4 bytes
 63       // opcode        ModR/M                    SIB                   displacement    immediate
 64       // instruction   mod, reg, Reg/Mem bits    scale, index, base
 65       // 1-3 bytes     0/1 byte                  0/1 byte              0/1/2/4 bytes   0/1/2/4 bytes
 66       "  b8            .                         .                     .               0a 0b 0c 0d\n"  // copy 0x0d0c0b0a to EAX
 67       // The periods are just to help the eye track long gaps between columns,
 68       // and are otherwise ignored.
 69   );
 70   // This program, when run, causes the following events in the trace:
 71   CHECK_TRACE_CONTENTS(
 72       "load: 0x00000001 -> b8\n"
 73       "load: 0x00000002 -> 0a\n"
 74       "load: 0x00000003 -> 0b\n"
 75       "load: 0x00000004 -> 0c\n"
 76       "load: 0x00000005 -> 0d\n"
 77       "run: copy imm32 0x0d0c0b0a to EAX\n"
 78   );
 79 }
 80 
 81 // top-level helper for scenarios: parse the input, transform any macros, load
 82 // the final hex bytes into memory, run it
 83 void run(const string& text_bytes) {
 84   program p;
 85   istringstream in(text_bytes);
 86   parse(in, p);
 87   if (trace_contains_errors()) return;  // if any stage raises errors, stop immediately
 88   transform(p);
 89   if (trace_contains_errors()) return;
 90   load(p);
 91   if (trace_contains_errors()) return;
 92   // convenience to keep tests concise: 'Entry' label need not be provided
 93   // not allowed in real programs
 94   if (p.entry)
 95     EIP = p.entry;
 96   else
 97     EIP = find(p, "code")->start;
 98   while (EIP < End_of_program)
 99     run_one_instruction();
100 }
101 
102 //:: core data structures
103 
104 :(before "End Types")
105 struct program {
106   uint32_t entry;
107   vector<segment> segments;
108   program() { entry = 0; }
109 };
110 :(before "struct program")
111 struct segment {
112   string name;
113   uint32_t start;
114   vector<line> lines;
115   // End segment Fields
116   segment() {
117     start = 0;
118     // End segment Constructor
119   }
120 };
121 :(before "struct segment")
122 struct line {
123   vector<word> words;
124   vector<string> metadata;
125   string original;
126 };
127 :(before "struct line")
128 struct word {
129   string original;
130   string data;
131   vector<string> metadata;
132 };
133 
134 //:: parse
135 
136 :(code)
137 void parse(istream& fin, program& out) {
138   segment* curr_segment = NULL;
139   vector<line> l;
140   while (has_data(fin)) {
141     string line_data;
142     line curr;
143     getline(fin, line_data);
144     curr.original = line_data;
145     trace(99, "parse") << "line: " << line_data << end();
146     // End Line Parsing Special-cases(line_data -> l)
147     istringstream lin(line_data);
148     while (has_data(lin)) {
149       string word_data;
150       lin >> word_data;
151       if (word_data.empty()) continue;
152       if (word_data[0] == '#') break;  // comment
153       if (word_data == ".") continue;  // comment token
154       if (word_data == "==") {
155         flush(curr_segment, l);
156         string segment_name;
157         lin >> segment_name;
158         curr_segment = find(out, segment_name);
159         if (curr_segment != NULL) {
160           trace(3, "parse") << "appending to segment '" << segment_name << "'" << end();
161         }
162         else {
163           trace(3, "parse") << "new segment '" << segment_name << "'" << end();
164           uint32_t seg_start = 0;
165           lin >> std::hex >> seg_start;
166           sanity_check_program_segment(out, seg_start);
167           out.segments.push_back(segment());
168           curr_segment = &out.segments.back();
169           curr_segment->name = segment_name;
170           curr_segment->start = seg_start;
171           if (trace_contains_errors()) continue;
172           trace(3, "parse") << "starts at address 0x" << HEXWORD << curr_segment->start << end();
173         }
174         break;  // skip rest of line
175       }
176       if (word_data[0] == ':') {
177         // todo: line metadata
178         break;
179       }
180       curr.words.push_back(word());
181       parse_word(word_data, curr.words.back());
182       trace(99, "parse") << "word: " << to_string(curr.words.back());
183     }
184     if (!curr.words.empty())
185       l.push_back(curr);
186   }
187   flush(curr_segment, l);
188   trace(99, "parse") << "done" << end();
189 }
190 
191 segment* find(program& p, const string& segment_name) {
192   for (int i = 0;  i < SIZE(p.segments);  ++i) {
193     if (p.segments.at(i).name == segment_name)
194       return &p.segments.at(i);
195   }
196   return NULL;
197 }
198 
199 void flush(segment* s, vector<line>& lines) {
200   if (lines.empty()) return;
201   if (s == NULL) {
202     raise << "input does not start with a '==' section header\n" << end();
203     return;
204   }
205   trace(3, "parse") << "flushing segment" << end();
206   s->lines.insert(s->lines.end(), lines.begin(), lines.end());
207   lines.clear();
208 }
209 
210 void parse_word(const string& data, word& out) {
211   out.original = data;
212   istringstream win(data);
213   if (getline(win, out.data, '/')) {
214     string m;
215     while (getline(win, m, '/'))
216       out.metadata.push_back(m);
217   }
218 }
219 
220 void sanity_check_program_segment(const program& p, uint32_t addr) {
221   for (int i = 0;  i < SIZE(p.segments);  ++i) {
222     if (p.segments.at(i).start == addr)
223       raise << "can't have multiple segments starting at address 0x" << HEXWORD << addr << '\n' << end();
224   }
225 }
226 
227 // helper for tests
228 void parse(const string& text_bytes) {
229   program p;
230   istringstream in(text_bytes);
231   parse(in, p);
232 }
233 
234 void test_detect_duplicate_segments() {
235   Hide_errors = true;
236   parse(
237       "== segment1 0xee\n"
238       "ab\n"
239       "== segment2 0xee\n"
240       "cd\n"
241   );
242   CHECK_TRACE_CONTENTS(
243       "error: can't have multiple segments starting at address 0x000000ee\n"
244   );
245 }
246 
247 //:: transform
248 
249 :(before "End Types")
250 typedef void (*transform_fn)(program&);
251 :(before "End Globals")
252 vector<transform_fn> Transform;
253 
254 :(code)
255 void transform(program& p) {
256   for (int t = 0;  t < SIZE(Transform);  ++t)
257     (*Transform.at(t))(p);
258 }
259 
260 //:: load
261 
262 void load(const program& p) {
263   if (find(p, "code") == NULL) {
264     raise << "no code to run\n" << end();
265     return;
266   }
267   // Ensure segments are disjoint.
268   set<uint32_t> overlap;
269   for (int i = 0;   i < SIZE(p.segments);  ++i) {
270     const segment& seg = p.segments.at(i);
271     uint32_t addr = seg.start;
272     if (!already_allocated(addr))
273       Mem.push_back(vma(seg.start));
274     trace(99, "load") << "loading segment " << i << " from " << HEXWORD << addr << end();
275     for (int j = 0;  j < SIZE(seg.lines);  ++j) {
276       const line& l = seg.lines.at(j);
277       for (int k = 0;  k < SIZE(l.words);  ++k) {
278         const word& w = l.words.at(k);
279         uint8_t val = hex_byte(w.data);
280         if (trace_contains_errors()) return;
281         assert(overlap.find(addr) == overlap.end());
282         write_mem_u8(addr, val);
283         overlap.insert(addr);
284         trace(99, "load") << "0x" << HEXWORD << addr << " -> " << HEXBYTE << NUM(read_mem_u8(addr)) << end();
285         ++addr;
286       }
287     }
288     if (seg.name == "code") {
289       End_of_program = addr;
290     }
291   }
292 }
293 
294 const segment* find(const program& p, const string& segment_name) {
295   for (int i = 0;  i < SIZE(p.segments);  ++i) {
296     if (p.segments.at(i).name == segment_name)
297       return &p.segments.at(i);
298   }
299   return NULL;
300 }
301 
302 uint8_t hex_byte(const string& s) {
303   if (contains_uppercase(s)) {
304     raise << "uppercase hex not allowed: " << s << '\n' << end();
305     return 0;
306   }
307   istringstream in(s);
308   int result = 0;
309   in >> std::hex >> result;
310   if (!in || !in.eof()) {
311     raise << "token '" << s << "' is not a hex byte\n" << end();
312     return '\0';
313   }
314   if (result > 0xff || result < -0x8f) {
315     raise << "token '" << s << "' is not a hex byte\n" << end();
316     return '\0';
317   }
318   return static_cast<uint8_t>(result);
319 }
320 
321 void test_number_too_large() {
322   Hide_errors = true;
323   parse_and_load(
324       "== code 0x1\n"
325       "01 cab\n"
326   );
327   CHECK_TRACE_CONTENTS(
328       "error: token 'cab' is not a hex byte\n"
329   );
330 }
331 
332 void test_invalid_hex() {
333   Hide_errors = true;
334   parse_and_load(
335       "== code 0x1\n"
336       "01 cx\n"
337   );
338   CHECK_TRACE_CONTENTS(
339       "error: token 'cx' is not a hex byte\n"
340   );
341 }
342 
343 void test_negative_number() {
344   parse_and_load(
345       "== code 0x1\n"
346       "01 -02\n"
347   );
348   CHECK_TRACE_COUNT("error", 0);
349 }
350 
351 void test_negative_number_too_small() {
352   Hide_errors = true;
353   parse_and_load(
354       "== code 0x1\n"
355       "01 -12345\n"
356   );
357   CHECK_TRACE_CONTENTS(
358       "error: token '-12345' is not a hex byte\n"
359   );
360 }
361 
362 void test_hex_prefix() {
363   parse_and_load(
364       "== code 0x1\n"
365       "0x01 -0x02\n"
366   );
367   CHECK_TRACE_COUNT("error", 0);
368 }
369 
370 void test_repeated_segment_merges_data() {
371   parse_and_load(
372       "== code 0x1\n"
373       "11 22\n"
374       "== code\n"  // again
375       "33 44\n"
376   );
377   CHECK_TRACE_CONTENTS(
378       "parse: new segment 'code'\n"
379       "parse: appending to segment 'code'\n"
380       // first segment
381       "load: 0x00000001 -> 11\n"
382       "load: 0x00000002 -> 22\n"
383       // second segment
384       "load: 0x00000003 -> 33\n"
385       "load: 0x00000004 -> 44\n"
386   );
387 }
388 
389 void test_error_on_missing_segment_header() {
390   Hide_errors = true;
391   parse_and_load(
392       "01 02\n"
393   );
394   CHECK_TRACE_CONTENTS(
395       "error: input does not start with a '==' section header\n"
396   );
397 }
398 
399 void test_error_on_uppercase_hex() {
400   Hide_errors = true;
401   parse_and_load(
402       "== code\n"
403       "01 Ab\n"
404   );
405   CHECK_TRACE_CONTENTS(
406       "error: uppercase hex not allowed: Ab\n"
407   );
408 }
409 
410 //: helper for tests
411 void parse_and_load(const string& text_bytes) {
412   program p;
413   istringstream in(text_bytes);
414   parse(in, p);
415   if (trace_contains_errors()) return;  // if any stage raises errors, stop immediately
416   load(p);
417 }
418 
419 //:: run
420 
421 :(before "End Initialize Op Names")
422 put_new(Name, "b8", "copy imm32 to EAX (mov)");
423 
424 //: our first opcode
425 
426 :(before "End Single-Byte Opcodes")
427 case 0xb8: {  // copy imm32 to EAX
428   const int32_t src = next32();
429   trace(Callstack_depth+1, "run") << "copy imm32 0x" << HEXWORD << src << " to EAX" << end();
430   Reg[EAX].i = src;
431   break;
432 }
433 
434 :(code)
435 void test_copy_imm32_to_EAX_again() {
436   run(
437       "== code 0x1\n"  // code segment
438       // op     ModR/M  SIB   displacement  immediate
439       "  b8                                 0a 0b 0c 0d \n"  // copy 0x0d0c0b0a to EAX
440   );
441   CHECK_TRACE_CONTENTS(
442       "run: copy imm32 0x0d0c0b0a to EAX\n"
443   );
444 }
445 
446 // read a 32-bit int in little-endian order from the instruction stream
447 int32_t next32() {
448   int32_t result = read_mem_i32(EIP);
449   EIP+=4;
450   return result;
451 }
452 
453 //:: helpers
454 
455 string to_string(const word& w) {
456   ostringstream out;
457   out << w.data;
458   for (int i = 0;  i < SIZE(w.metadata);  ++i)
459     out << " /" << w.metadata.at(i);
460   return out.str();
461 }
462 
463 bool contains_uppercase(const string& s) {
464   for (int i = 0;  i < SIZE(s);  ++i)
465     if (isupper(s.at(i))) return true;
466   return false;
467 }