https://github.com/akkartik/mu/blob/main/linux/bootstrap/011run.cc
  1 //: Running SubX programs on the VM.
  2 
  3 //: (Not to be confused with the 'run' subcommand for running ELF binaries on
  4 //: the VM. That comes later.)
  5 
  6 :(before "End Help Texts")
  7 put_new(Help, "syntax",
  8   "SubX programs consist of segments, each segment in turn consisting of lines.\n"
  9   "Line-endings are significant; each line should contain a single\n"
 10   "instruction, macro or directive.\n"
 11   "\n"
 12   "Comments start with the '#' character. It should be at the start of a word\n"
 13   "(start of line, or following a space).\n"
 14   "\n"
 15   "Each segment starts with a header line: a '==' delimiter followed by the name of\n"
 16   "the segment and a (sometimes approximate) starting address in memory.\n"
 17   "The name 'code' is special; instructions to execute should always go here.\n"
 18   "\n"
 19   "The resulting binary starts running code from a label called 'Entry'\n"
 20   "in the code segment.\n"
 21   "\n"
 22   "Segments with the same name get merged together. This rule helps keep functions\n"
 23   "and their data close together in .subx files.\n"
 24   "You don't have to specify the starting address after the first time.\n"
 25   "\n"
 26   "Lines consist of a series of words. Words can contain arbitrary metadata\n"
 27   "after a '/', but they can never contain whitespace. Metadata has no effect\n"
 28   "at runtime, but can be handy when rewriting macros.\n"
 29   "\n"
 30   "Check out the example programs linux/ex*.\n"
 31 );
 32 :(before "End Help Contents")
 33 cerr << "  syntax\n";
 34 
 35 :(code)
 36 void test_copy_imm32_to_EAX() {
 37   // At the lowest level, SubX programs are a series of hex bytes, each
 38   // (variable-length) instruction on one line.
 39   run(
 40       // Comments start with '#' and are ignored.
 41       "# comment\n"
 42       // Segment headers start with '==', a name and a starting hex address.
 43       // There's usually one code and one data segment. The code segment
 44       // always comes first.
 45       "== code 0x1\n"  // code segment
 46 
 47       // After the header, each segment consists of lines, and each line
 48       // consists of words separated by whitespace.
 49       //
 50       // All words can have metadata after a '/'. No spaces allowed in
 51       // metadata, of course.
 52       // Unrecognized metadata never causes errors, so you can use it for
 53       // documentation.
 54       //
 55       // Within the code segment in particular, x86 instructions consist of
 56       // some number of the following parts and sub-parts (see the Readme and
 57       // cheatsheet.pdf for details):
 58       //   opcodes: 1-3 bytes
 59       //   ModR/M byte
 60       //   SIB byte
 61       //   displacement: 0/1/2/4 bytes
 62       //   immediate: 0/1/2/4 bytes
 63       // opcode        ModR/M                    SIB                   displacement    immediate
 64       // instruction   mod, reg, Reg/Mem bits    scale, index, base
 65       // 1-3 bytes     0/1 byte                  0/1 byte              0/1/2/4 bytes   0/1/2/4 bytes
 66       "  b8            .                         .                     .               0a 0b 0c 0d\n"  // copy 0x0d0c0b0a to EAX
 67       // The periods are just to help the eye track long gaps between columns,
 68       // and are otherwise ignored.
 69   );
 70   // This program, when run, causes the following events in the trace:
 71   CHECK_TRACE_CONTENTS(
 72       "load: 0x00000001 -> b8\n"
 73       "load: 0x00000002 -> 0a\n"
 74       "load: 0x00000003 -> 0b\n"
 75       "load: 0x00000004 -> 0c\n"
 76       "load: 0x00000005 -> 0d\n"
 77       "run: copy imm32 0x0d0c0b0a to EAX\n"
 78   );
 79 }
 80 
 81 // top-level helper for tests: parse the input, load the hex bytes into memory, run
 82 void run(const string& text_bytes) {
 83   program p;
 84   istringstream in(text_bytes);
 85   // Loading Test Program
 86   parse(in, p);
 87   if (trace_contains_errors()) return;  // if any stage raises errors, stop immediately
 88   // Running Test Program
 89   load(p);
 90   if (trace_contains_errors()) return;
 91   // convenience to keep tests concise: 'Entry' label need not be provided
 92   // not allowed in real programs
 93   if (p.entry)
 94     EIP = p.entry;
 95   else
 96     EIP = find(p, "code")->start;
 97   while (EIP < End_of_program)
 98     run_one_instruction();
 99 }
100 
101 //:: core data structures
102 
103 :(before "End Types")
104 struct program {
105   uint32_t entry;
106   vector<segment> segments;
107   program() { entry = 0; }
108 };
109 :(before "struct program")
110 struct segment {
111   string name;
112   uint32_t start;
113   vector<line> lines;
114   // End segment Fields
115   segment() {
116     start = 0;
117     // End segment Constructor
118   }
119 };
120 :(before "struct segment")
121 struct line {
122   vector<word> words;
123   vector<string> metadata;
124   string original;
125 };
126 :(before "struct line")
127 struct word {
128   string original;
129   string data;
130   vector<string> metadata;
131 };
132 
133 //:: parse
134 
135 :(code)
136 void parse(istream& fin, program& out) {
137   segment* curr_segment = NULL;
138   vector<line> l;
139   while (has_data(fin)) {
140     string line_data;
141     line curr;
142     getline(fin, line_data);
143     curr.original = line_data;
144     trace(99, "parse") << "line: " << line_data << end();
145     // End Line Parsing Special-cases(line_data -> l)
146     istringstream lin(line_data);
147     while (has_data(lin)) {
148       string word_data;
149       lin >> word_data;
150       if (word_data.empty()) continue;
151       if (word_data[0] == '#') break;  // comment
152       if (word_data == ".") continue;  // comment token
153       if (word_data == "==") {
154         flush(curr_segment, l);
155         string segment_name;
156         lin >> segment_name;
157         curr_segment = find(out, segment_name);
158         if (curr_segment != NULL) {
159           trace(3, "parse") << "appending to segment '" << segment_name << "'" << end();
160         }
161         else {
162           trace(3, "parse") << "new segment '" << segment_name << "'" << end();
163           uint32_t seg_start = 0;
164           lin >> std::hex >> seg_start;
165           sanity_check_program_segment(out, seg_start);
166           out.segments.push_back(segment());
167           curr_segment = &out.segments.back();
168           curr_segment->name = segment_name;
169           curr_segment->start = seg_start;
170           if (trace_contains_errors()) continue;
171           trace(3, "parse") << "starts at address 0x" << HEXWORD << curr_segment->start << end();
172         }
173         break;  // skip rest of line
174       }
175       if (word_data[0] == ':') {
176         // todo: line metadata
177         break;
178       }
179       curr.words.push_back(word());
180       parse_word(word_data, curr.words.back());
181       trace(99, "parse") << "word: " << to_string(curr.words.back());
182     }
183     if (!curr.words.empty())
184       l.push_back(curr);
185   }
186   flush(curr_segment, l);
187   trace(99, "parse") << "done" << end();
188 }
189 
190 segment* find(program& p, const string& segment_name) {
191   for (int i = 0;  i < SIZE(p.segments);  ++i) {
192     if (p.segments.at(i).name == segment_name)
193       return &p.segments.at(i);
194   }
195   return NULL;
196 }
197 
198 void flush(segment* s, vector<line>& lines) {
199   if (lines.empty()) return;
200   if (s == NULL) {
201     raise << "input does not start with a '==' section header\n" << end();
202     return;
203   }
204   trace(3, "parse") << "flushing segment" << end();
205   s->lines.insert(s->lines.end(), lines.begin(), lines.end());
206   lines.clear();
207 }
208 
209 void parse_word(const string& data, word& out) {
210   out.original = data;
211   istringstream win(data);
212   if (getline(win, out.data, '/')) {
213     string m;
214     while (getline(win, m, '/'))
215       out.metadata.push_back(m);
216   }
217 }
218 
219 void sanity_check_program_segment(const program& p, uint32_t addr) {
220   for (int i = 0;  i < SIZE(p.segments);  ++i) {
221     if (p.segments.at(i).start == addr)
222       raise << "can't have multiple segments starting at address 0x" << HEXWORD << addr << '\n' << end();
223   }
224 }
225 
226 // helper for tests
227 void parse(const string& text_bytes) {
228   program p;
229   istringstream in(text_bytes);
230   parse(in, p);
231 }
232 
233 void test_detect_duplicate_segments() {
234   Hide_errors = true;
235   parse(
236       "== segment1 0xee\n"
237       "ab\n"
238       "== segment2 0xee\n"
239       "cd\n"
240   );
241   CHECK_TRACE_CONTENTS(
242       "error: can't have multiple segments starting at address 0x000000ee\n"
243   );
244 }
245 
246 //:: load
247 
248 void load(const program& p) {
249   if (find(p, "code") == NULL) {
250     raise << "no code to run\n" << end();
251     return;
252   }
253   // Ensure segments are disjoint.
254   set<uint32_t> overlap;
255   for (int i = 0;   i < SIZE(p.segments);  ++i) {
256     const segment& seg = p.segments.at(i);
257     uint32_t addr = seg.start;
258     if (!already_allocated(addr))
259       Mem.push_back(vma(seg.start));
260     trace(99, "load") << "loading segment " << i << " from " << HEXWORD << addr << end();
261     for (int j = 0;  j < SIZE(seg.lines);  ++j) {
262       const line& l = seg.lines.at(j);
263       for (int k = 0;  k < SIZE(l.words);  ++k) {
264         const word& w = l.words.at(k);
265         uint8_t val = hex_byte(w.data);
266         if (trace_contains_errors()) return;
267         assert(overlap.find(addr) == overlap.end());
268         write_mem_u8(addr, val);
269         overlap.insert(addr);
270         trace(99, "load") << "0x" << HEXWORD << addr << " -> " << HEXBYTE << NUM(read_mem_u8(addr)) << end();
271         ++addr;
272       }
273     }
274     if (seg.name == "code") {
275       End_of_program = addr;
276     }
277   }
278 }
279 
280 const segment* find(const program& p, const string& segment_name) {
281   for (int i = 0;  i < SIZE(p.segments);  ++i) {
282     if (p.segments.at(i).name == segment_name)
283       return &p.segments.at(i);
284   }
285   return NULL;
286 }
287 
288 uint8_t hex_byte(const string& s) {
289   if (contains_uppercase(s)) {
290     raise << "uppercase hex not allowed: " << s << '\n' << end();
291     return 0;
292   }
293   istringstream in(s);
294   int result = 0;
295   in >> std::hex >> result;
296   if (!in || !in.eof()) {
297     raise << "token '" << s << "' is not a hex byte\n" << end();
298     return '\0';
299   }
300   if (result > 0xff || result < -0x8f) {
301     raise << "token '" << s << "' is not a hex byte\n" << end();
302     return '\0';
303   }
304   return static_cast<uint8_t>(result);
305 }
306 
307 void test_number_too_large() {
308   Hide_errors = true;
309   parse_and_load(
310       "== code 0x1\n"
311       "01 cab\n"
312   );
313   CHECK_TRACE_CONTENTS(
314       "error: token 'cab' is not a hex byte\n"
315   );
316 }
317 
318 void test_invalid_hex() {
319   Hide_errors = true;
320   parse_and_load(
321       "== code 0x1\n"
322       "01 cx\n"
323   );
324   CHECK_TRACE_CONTENTS(
325       "error: token 'cx' is not a hex byte\n"
326   );
327 }
328 
329 void test_negative_number() {
330   parse_and_load(
331       "== code 0x1\n"
332       "01 -02\n"
333   );
334   CHECK_TRACE_COUNT("error", 0);
335 }
336 
337 void test_negative_number_too_small() {
338   Hide_errors = true;
339   parse_and_load(
340       "== code 0x1\n"
341       "01 -12345\n"
342   );
343   CHECK_TRACE_CONTENTS(
344       "error: token '-12345' is not a hex byte\n"
345   );
346 }
347 
348 void test_hex_prefix() {
349   parse_and_load(
350       "== code 0x1\n"
351       "0x01 -0x02\n"
352   );
353   CHECK_TRACE_COUNT("error", 0);
354 }
355 
356 void test_repeated_segment_merges_data() {
357   parse_and_load(
358       "== code 0x1\n"
359       "11 22\n"
360       "== code\n"  // again
361       "33 44\n"
362   );
363   CHECK_TRACE_CONTENTS(
364       "parse: new segment 'code'\n"
365       "parse: appending to segment 'code'\n"
366       // first segment
367       "load: 0x00000001 -> 11\n"
368       "load: 0x00000002 -> 22\n"
369       // second segment
370       "load: 0x00000003 -> 33\n"
371       "load: 0x00000004 -> 44\n"
372   );
373 }
374 
375 void test_error_on_missing_segment_header() {
376   Hide_errors = true;
377   parse_and_load(
378       "01 02\n"
379   );
380   CHECK_TRACE_CONTENTS(
381       "error: input does not start with a '==' section header\n"
382   );
383 }
384 
385 void test_error_on_uppercase_hex() {
386   Hide_errors = true;
387   parse_and_load(
388       "== code\n"
389       "01 Ab\n"
390   );
391   CHECK_TRACE_CONTENTS(
392       "error: uppercase hex not allowed: Ab\n"
393   );
394 }
395 
396 //: helper for tests
397 void parse_and_load(const string& text_bytes) {
398   program p;
399   istringstream in(text_bytes);
400   parse(in, p);
401   if (trace_contains_errors()) return;  // if any stage raises errors, stop immediately
402   load(p);
403 }
404 
405 //:: run
406 
407 :(before "End Initialize Op Names")
408 put_new(Name, "b8", "copy imm32 to EAX (mov)");
409 
410 //: our first opcode
411 
412 :(before "End Single-Byte Opcodes")
413 case 0xb8: {  // copy imm32 to EAX
414   const int32_t src = next32();
415   trace(Callstack_depth+1, "run") << "copy imm32 0x" << HEXWORD << src << " to EAX" << end();
416   Reg[EAX].i = src;
417   break;
418 }
419 
420 :(code)
421 void test_copy_imm32_to_EAX_again() {
422   run(
423       "== code 0x1\n"  // code segment
424       // op     ModR/M  SIB   displacement  immediate
425       "  b8                                 0a 0b 0c 0d \n"  // copy 0x0d0c0b0a to EAX
426   );
427   CHECK_TRACE_CONTENTS(
428       "run: copy imm32 0x0d0c0b0a to EAX\n"
429   );
430 }
431 
432 // read a 32-bit int in little-endian order from the instruction stream
433 int32_t next32() {
434   int32_t result = read_mem_i32(EIP);
435   EIP+=4;
436   return result;
437 }
438 
439 //:: helpers
440 
441 string to_string(const word& w) {
442   ostringstream out;
443   out << w.data;
444   for (int i = 0;  i < SIZE(w.metadata);  ++i)
445     out << " /" << w.metadata.at(i);
446   return out.str();
447 }
448 
449 bool contains_uppercase(const string& s) {
450   for (int i = 0;  i < SIZE(s);  ++i)
451     if (isupper(s.at(i))) return true;
452   return false;
453 }