https://github.com/akkartik/mu/blob/master/subx/011run.cc
  1 //: Running SubX programs on the VM.
  2 
  3 //: (Not to be confused with the 'run' subcommand for running ELF binaries on
  4 //: the VM. That comes later.)
  5 
  6 :(before "End Help Texts")
  7 put_new(Help, "syntax",
  8   "SubX programs consist of segments, each segment in turn consisting of lines.\n"
  9   "Line-endings are significant; each line should contain a single\n"
 10   "instruction, macro or directive.\n"
 11   "\n"
 12   "Comments start with the '#' character. It should be at the start of a word\n"
 13   "(start of line, or following a space).\n"
 14   "\n"
 15   "Each segment starts with a header line: a '==' delimiter followed by the name of\n"
 16   "the segment and a (sometimes approximate) starting address in memory.\n"
 17   "The name 'code' is special; instructions to execute should always go here.\n"
 18   "\n"
 19   "The resulting binary starts running from the start of the segment by default.\n"
 20   "To start elsewhere in the code segment, define a special label called 'Entry'.\n"
 21   "\n"
 22   "Segments with the same name get merged together. This rule helps keep functions and\n"
 23   "their data close together in .subx files.\n"
 24   "You don't have to specify the starting address after the first time.\n"
 25   "\n"
 26   "Lines consist of a series of words. Words can contain arbitrary metadata\n"
 27   "after a '/', but they can never contain whitespace. Metadata has no effect\n"
 28   "at runtime, but can be handy when rewriting macros.\n"
 29   "\n"
 30   "Check out the examples in the examples/ directory.\n"
 31   "Programming in machine code can be annoying, but let's see if we can make\n"
 32   "it nice enough to be able to write a compiler in it.\n"
 33 );
 34 :(before "End Help Contents")
 35 cerr << "  syntax\n";
 36 
 37 :(code)
 38 void test_copy_imm32_to_EAX() {
 39   // At the lowest level, SubX programs are a series of hex bytes, each
 40   // (variable-length) instruction on one line.
 41   run(
 42       // Comments start with '#' and are ignored.
 43       "# comment\n"
 44       // Segment headers start with '==', a name and a starting hex address.
 45       // There's usually one code and one data segment. The code segment
 46       // always comes first.
 47       "== code 0x1\n"  // code segment
 48 
 49       // After the header, each segment consists of lines, and each line
 50       // consists of words separated by whitespace.
 51       //
 52       // All words can have metadata after a '/'. No spaces allowed in
 53       // metadata, of course.
 54       // Unrecognized metadata never causes errors, so you can use it for
 55       // documentation.
 56       //
 57       // Within the code segment in particular, x86 instructions consist of
 58       // some number of the following parts and sub-parts (see the Readme and
 59       // cheatsheet.pdf for details):
 60       //   opcodes: 1-3 bytes
 61       //   ModR/M byte
 62       //   SIB byte
 63       //   displacement: 0/1/2/4 bytes
 64       //   immediate: 0/1/2/4 bytes
 65       // opcode        ModR/M                    SIB                   displacement    immediate
 66       // instruction   mod, reg, Reg/Mem bits    scale, index, base
 67       // 1-3 bytes     0/1 byte                  0/1 byte              0/1/2/4 bytes   0/1/2/4 bytes
 68       "  b8            .                         .                     .               0a 0b 0c 0d\n"  // copy 0x0d0c0b0a to EAX
 69       // The periods are just to help the eye track long gaps between columns,
 70       // and are otherwise ignored.
 71   );
 72   // This program, when run, causes the following events in the trace:
 73   CHECK_TRACE_CONTENTS(
 74       "load: 0x00000001 -> b8\n"
 75       "load: 0x00000002 -> 0a\n"
 76       "load: 0x00000003 -> 0b\n"
 77       "load: 0x00000004 -> 0c\n"
 78       "load: 0x00000005 -> 0d\n"
 79       "run: copy imm32 0x0d0c0b0a to EAX\n"
 80   );
 81 }
 82 
 83 // top-level helper for scenarios: parse the input, transform any macros, load
 84 // the final hex bytes into memory, run it
 85 void run(const string& text_bytes) {
 86   program p;
 87   istringstream in(text_bytes);
 88   parse(in, p);
 89   if (trace_contains_errors()) return;  // if any stage raises errors, stop immediately
 90   transform(p);
 91   if (trace_contains_errors()) return;
 92   load(p);
 93   if (trace_contains_errors()) return;
 94   while (EIP < End_of_program)
 95     run_one_instruction();
 96 }
 97 
 98 //:: core data structures
 99 
100 :(before "End Types")
101 struct program {
102   vector<segment> segments;
103   // random ideas for other things we may eventually need
104   //map<name, address> globals;
105   //vector<recipe> recipes;
106   //map<string, type_info> types;
107 };
108 :(before "struct program")
109 struct segment {
110   string name;
111   uint32_t start;
112   vector<line> lines;
113   // End segment Fields
114   segment() {
115     start = 0;
116     // End segment Constructor
117   }
118 };
119 :(before "struct segment")
120 struct line {
121   vector<word> words;
122   vector<string> metadata;
123   string original;
124 };
125 :(before "struct line")
126 struct word {
127   string original;
128   string data;
129   vector<string> metadata;
130 };
131 
132 //:: parse
133 
134 :(code)
135 void parse(istream& fin, program& out) {
136   segment* curr_segment = NULL;
137   vector<line> l;
138   while (has_data(fin)) {
139     string line_data;
140     line curr;
141     getline(fin, line_data);
142     curr.original = line_data;
143     trace(99, "parse") << "line: " << line_data << end();
144     // End Line Parsing Special-cases(line_data -> l)
145     istringstream lin(line_data);
146     while (has_data(lin)) {
147       string word_data;
148       lin >> word_data;
149       if (word_data.empty()) continue;
150       if (word_data[0] == '#') break;  // comment
151       if (word_data == ".") continue;  // comment token
152       if (word_data == "==") {
153         flush(curr_segment, l);
154         string segment_name;
155         lin >> segment_name;
156         curr_segment = find(out, segment_name);
157         if (curr_segment != NULL) {
158           trace(3, "parse") << "appending to segment '" << segment_name << "'" << end();
159         }
160         else {
161           trace(3, "parse") << "new segment '" << segment_name << "'" << end();
162           uint32_t seg_start = 0;
163           lin >> std::hex >> seg_start;
164           sanity_check_program_segment(out, seg_start);
165           out.segments.push_back(segment());
166           curr_segment = &out.segments.back();
167           curr_segment->name = segment_name;
168           curr_segment->start = seg_start;
169           if (trace_contains_errors()) continue;
170           trace(3, "parse") << "starts at address 0x" << HEXWORD << curr_segment->start << end();
171         }
172         break;  // skip rest of line
173       }
174       if (word_data[0] == ':') {
175         // todo: line metadata
176         break;
177       }
178       curr.words.push_back(word());
179       parse_word(word_data, curr.words.back());
180       trace(99, "parse") << "word: " << to_string(curr.words.back());
181     }
182     if (!curr.words.empty())
183       l.push_back(curr);
184   }
185   flush(curr_segment, l);
186   trace(99, "parse") << "done" << end();
187 }
188 
189 segment* find(program& p, const string& segment_name) {
190   for (int i = 0;  i < SIZE(p.segments);  ++i) {
191     if (p.segments.at(i).name == segment_name)
192       return &p.segments.at(i);
193   }
194   return NULL;
195 }
196 
197 void flush(segment* s, vector<line>& lines) {
198   if (lines.empty()) return;
199   if (s == NULL) {
200     raise << "input does not start with a '==' section header\n" << end();
201     return;
202   }
203   trace(3, "parse") << "flushing segment" << end();
204   s->lines.insert(s->lines.end(), lines.begin(), lines.end());
205   lines.clear();
206 }
207 
208 void parse_word(const string& data, word& out) {
209   out.original = data;
210   istringstream win(data);
211   if (getline(win, out.data, '/')) {
212     string m;
213     while (getline(win, m, '/'))
214       out.metadata.push_back(m);
215   }
216 }
217 
218 void sanity_check_program_segment(const program& p, uint32_t addr) {
219   for (int i = 0;  i < SIZE(p.segments);  ++i) {
220     if (p.segments.at(i).start == addr)
221       raise << "can't have multiple segments starting at address 0x" << HEXWORD << addr << '\n' << end();
222   }
223 }
224 
225 // helper for tests
226 void parse(const string& text_bytes) {
227   program p;
228   istringstream in(text_bytes);
229   parse(in, p);
230 }
231 
232 void test_detect_duplicate_segments() {
233   Hide_errors = true;
234   parse(
235       "== segment1 0xee\n"
236       "ab\n"
237       "== segment2 0xee\n"
238       "cd\n"
239   );
240   CHECK_TRACE_CONTENTS(
241       "error: can't have multiple segments starting at address 0x000000ee\n"
242   );
243 }
244 
245 //:: transform
246 
247 :(before "End Types")
248 typedef void (*transform_fn)(program&);
249 :(before "End Globals")
250 vector<transform_fn> Transform;
251 
252 :(code)
253 void transform(program& p) {
254   for (int t = 0;  t < SIZE(Transform);  ++t)
255     (*Transform.at(t))(p);
256 }
257 
258 //:: load
259 
260 void load(const program& p) {
261   if (find(p, "code") == NULL) {
262     raise << "no code to run\n" << end();
263     return;
264   }
265   // Ensure segments are disjoint.
266   set<uint32_t> overlap;
267   for (int i = 0;   i < SIZE(p.segments);  ++i) {
268     const segment& seg = p.segments.at(i);
269     uint32_t addr = seg.start;
270     if (!already_allocated(addr))
271       Mem.push_back(vma(seg.start));
272     trace(99, "load") << "loading segment " << i << " from " << HEXWORD << addr << end();
273     for (int j = 0;  j < SIZE(seg.lines);  ++j) {
274       const line& l = seg.lines.at(j);
275       for (int k = 0;  k < SIZE(l.words);  ++k) {
276         const word& w = l.words.at(k);
277         uint8_t val = hex_byte(w.data);
278         if (trace_contains_errors()) return;
279         assert(overlap.find(addr) == overlap.end());
280         write_mem_u8(addr, val);
281         overlap.insert(addr);
282         trace(99, "load") << "0x" << HEXWORD << addr << " -> " << HEXBYTE << NUM(read_mem_u8(addr)) << end();
283         ++addr;
284       }
285     }
286     if (seg.name == "code") {
287       End_of_program = addr;
288       EIP = seg.start;
289       // End Initialize EIP
290     }
291   }
292 }
293 
294 const segment* find(const program& p, const string& segment_name) {
295   for (int i = 0;  i < SIZE(p.segments);  ++i) {
296     if (p.segments.at(i).name == segment_name)
297       return &p.segments.at(i);
298   }
299   return NULL;
300 }
301 
302 uint8_t hex_byte(const string& s) {
303   istringstream in(s);
304   int result = 0;
305   in >> std::hex >> result;
306   if (!in || !in.eof()) {
307     raise << "token '" << s << "' is not a hex byte\n" << end();
308     return '\0';
309   }
310   if (result > 0xff || result < -0x8f) {
311     raise << "token '" << s << "' is not a hex byte\n" << end();
312     return '\0';
313   }
314   return static_cast<uint8_t>(result);
315 }
316 
317 void test_number_too_large() {
318   Hide_errors = true;
319   parse_and_load(
320       "== code 0x1\n"
321       "01 cab\n"
322   );
323   CHECK_TRACE_CONTENTS(
324       "error: token 'cab' is not a hex byte\n"
325   );
326 }
327 
328 void test_invalid_hex() {
329   Hide_errors = true;
330   parse_and_load(
331       "== code 0x1\n"
332       "01 cx\n"
333   );
334   CHECK_TRACE_CONTENTS(
335       "error: token 'cx' is not a hex byte\n"
336   );
337 }
338 
339 void test_negative_number() {
340   parse_and_load(
341       "== code 0x1\n"
342       "01 -02\n"
343   );
344   CHECK_TRACE_COUNT("error", 0);
345 }
346 
347 void test_negative_number_too_small() {
348   Hide_errors = true;
349   parse_and_load(
350       "== code 0x1\n"
351       "01 -12345\n"
352   );
353   CHECK_TRACE_CONTENTS(
354       "error: token '-12345' is not a hex byte\n"
355   );
356 }
357 
358 void test_hex_prefix() {
359   parse_and_load(
360       "== code 0x1\n"
361       "0x01 -0x02\n"
362   );
363   CHECK_TRACE_COUNT("error", 0);
364 }
365 
366 void test_repeated_segment_merges_data() {
367   parse_and_load(
368       "== code 0x1\n"
369       "11 22\n"
370       "== code\n"  // again
371       "33 44\n"
372   );
373   CHECK_TRACE_CONTENTS(
374       "parse: new segment 'code'\n"
375       "parse: appending to segment 'code'\n"
376       // first segment
377       "load: 0x00000001 -> 11\n"
378       "load: 0x00000002 -> 22\n"
379       // second segment
380       "load: 0x00000003 -> 33\n"
381       "load: 0x00000004 -> 44\n"
382   );
383 }
384 
385 void test_error_on_missing_segment_header() {
386   Hide_errors = true;
387   parse_and_load(
388       "01 02\n"
389   );
390   CHECK_TRACE_CONTENTS(
391       "error: input does not start with a '==' section header\n"
392   );
393 }
394 
395 //: helper for tests
396 void parse_and_load(const string& text_bytes) {
397   program p;
398   istringstream in(text_bytes);
399   parse(in, p);
400   if (trace_contains_errors()) return;  // if any stage raises errors, stop immediately
401   load(p);
402 }
403 
404 //:: run
405 
406 :(before "End Initialize Op Names")
407 put_new(Name, "b8", "copy imm32 to EAX (mov)");
408 
409 //: our first opcode
410 
411 :(before "End Single-Byte Opcodes")
412 case 0xb8: {  // copy imm32 to EAX
413   const int32_t src = next32();
414   trace(Callstack_depth+1, "run") << "copy imm32 0x" << HEXWORD << src << " to EAX" << end();
415   Reg[EAX].i = src;
416   break;
417 }
418 
419 :(code)
420 void test_copy_imm32_to_EAX_again() {
421   run(
422       "== code 0x1\n"  // code segment
423       // op     ModR/M  SIB   displacement  immediate
424       "  b8                                 0a 0b 0c 0d \n"  // copy 0x0d0c0b0a to EAX
425   );
426   CHECK_TRACE_CONTENTS(
427       "run: copy imm32 0x0d0c0b0a to EAX\n"
428   );
429 }
430 
431 // read a 32-bit int in little-endian order from the instruction stream
432 int32_t next32() {
433   int32_t result = read_mem_i32(EIP);
434   EIP+=4;
435   return result;
436 }
437 
438 //:: helpers
439 
440 string to_string(const word& w) {
441   ostringstream out;
442   out << w.data;
443   for (int i = 0;  i < SIZE(w.metadata);  ++i)
444     out << " /" << w.metadata.at(i);
445   return out.str();
446 }
447 
448 int32_t parse_int(const string& s) {
449   if (s.empty()) return 0;
450   istringstream in(s);
451   in >> std::hex;
452   if (s.at(0) == '-') {
453     int32_t result = 0;
454     in >> result;
455     if (!in || !in.eof()) {
456       raise << "not a number: " << s << '\n' << end();
457       return 0;
458     }
459     return result;
460   }
461   uint32_t uresult = 0;
462   in >> uresult;
463   if (!in || !in.eof()) {
464     raise << "not a number: " << s << '\n' << end();
465     return 0;
466   }
467   return static_cast<int32_t>(uresult);
468 }
469 :(before "End Unit Tests")
470 void test_parse_int() {
471   CHECK_EQ(0, parse_int("0"));
472   CHECK_EQ(0, parse_int("0x0"));
473   CHECK_EQ(0, parse_int("0x0"));
474   CHECK_EQ(16, parse_int("10"));  // hex always
475   CHECK_EQ(-1, parse_int("-1"));
476   CHECK_EQ(-1, parse_int("0xffffffff"));
477 }