Mu - linux/bootstrap/011run.cc

From 3350c34a74844e21ea69077e01efff3bae64bdcd Mon Sep 17 00:00:00 2001 From: Kartik Agaram Date: Tue, 23 Mar 2021 17:31:08 -0700 Subject: . --- html/linux/bootstrap/011run.cc.html | 515 ++++++++++++++++++++++++++++++++++++ 1 file changed, 515 insertions(+) create mode 100644 html/linux/bootstrap/011run.cc.html (limited to 'html/linux/bootstrap/011run.cc.html') diff --git a/html/linux/bootstrap/011run.cc.html b/html/linux/bootstrap/011run.cc.html new file mode 100644 index 00000000..bc7b84ae --- /dev/null +++ b/html/linux/bootstrap/011run.cc.html @@ -0,0 +1,515 @@ + + + + +Mu - linux/bootstrap/011run.cc + + + + + + + + + + +https://github.com/akkartik/mu/blob/main/linux/bootstrap/011run.cc +
+  1 //: Running SubX programs on the VM.
+  2 
+  3 //: (Not to be confused with the 'run' subcommand for running ELF binaries on
+  4 //: the VM. That comes later.)
+  5 
+  6 :(before "End Help Texts")
+  7 put_new(Help, "syntax",
+  8   "SubX programs consist of segments, each segment in turn consisting of lines.\n"
+  9   "Line-endings are significant; each line should contain a single\n"
+ 10   "instruction, macro or directive.\n"
+ 11   "\n"
+ 12   "Comments start with the '#' character. It should be at the start of a word\n"
+ 13   "(start of line, or following a space).\n"
+ 14   "\n"
+ 15   "Each segment starts with a header line: a '==' delimiter followed by the name of\n"
+ 16   "the segment and a (sometimes approximate) starting address in memory.\n"
+ 17   "The name 'code' is special; instructions to execute should always go here.\n"
+ 18   "\n"
+ 19   "The resulting binary starts running code from a label called 'Entry'\n"
+ 20   "in the code segment.\n"
+ 21   "\n"
+ 22   "Segments with the same name get merged together. This rule helps keep functions\n"
+ 23   "and their data close together in .subx files.\n"
+ 24   "You don't have to specify the starting address after the first time.\n"
+ 25   "\n"
+ 26   "Lines consist of a series of words. Words can contain arbitrary metadata\n"
+ 27   "after a '/', but they can never contain whitespace. Metadata has no effect\n"
+ 28   "at runtime, but can be handy when rewriting macros.\n"
+ 29   "\n"
+ 30   "Check out the example programs in the apps/ directory, particularly apps/ex*.\n"
+ 31 );
+ 32 :(before "End Help Contents")
+ 33 cerr << "  syntax\n";
+ 34 
+ 35 :(code)
+ 36 void test_copy_imm32_to_EAX() {
+ 37   // At the lowest level, SubX programs are a series of hex bytes, each
+ 38   // (variable-length) instruction on one line.
+ 39   run(
+ 40       // Comments start with '#' and are ignored.
+ 41       "# comment\n"
+ 42       // Segment headers start with '==', a name and a starting hex address.
+ 43       // There's usually one code and one data segment. The code segment
+ 44       // always comes first.
+ 45       "== code 0x1\n"  // code segment
+ 46 
+ 47       // After the header, each segment consists of lines, and each line
+ 48       // consists of words separated by whitespace.
+ 49       //
+ 50       // All words can have metadata after a '/'. No spaces allowed in
+ 51       // metadata, of course.
+ 52       // Unrecognized metadata never causes errors, so you can use it for
+ 53       // documentation.
+ 54       //
+ 55       // Within the code segment in particular, x86 instructions consist of
+ 56       // some number of the following parts and sub-parts (see the Readme and
+ 57       // cheatsheet.pdf for details):
+ 58       //   opcodes: 1-3 bytes
+ 59       //   ModR/M byte
+ 60       //   SIB byte
+ 61       //   displacement: 0/1/2/4 bytes
+ 62       //   immediate: 0/1/2/4 bytes
+ 63       // opcode        ModR/M                    SIB                   displacement    immediate
+ 64       // instruction   mod, reg, Reg/Mem bits    scale, index, base
+ 65       // 1-3 bytes     0/1 byte                  0/1 byte              0/1/2/4 bytes   0/1/2/4 bytes
+ 66       "  b8            .                         .                     .               0a 0b 0c 0d\n"  // copy 0x0d0c0b0a to EAX
+ 67       // The periods are just to help the eye track long gaps between columns,
+ 68       // and are otherwise ignored.
+ 69   );
+ 70   // This program, when run, causes the following events in the trace:
+ 71   CHECK_TRACE_CONTENTS(
+ 72       "load: 0x00000001 -> b8\n"
+ 73       "load: 0x00000002 -> 0a\n"
+ 74       "load: 0x00000003 -> 0b\n"
+ 75       "load: 0x00000004 -> 0c\n"
+ 76       "load: 0x00000005 -> 0d\n"
+ 77       "run: copy imm32 0x0d0c0b0a to EAX\n"
+ 78   );
+ 79 }
+ 80 
+ 81 // top-level helper for tests: parse the input, load the hex bytes into memory, run
+ 82 void run(const string& text_bytes) {
+ 83   program p;
+ 84   istringstream in(text_bytes);
+ 85   // Loading Test Program
+ 86   parse(in, p);
+ 87   if (trace_contains_errors()) return;  // if any stage raises errors, stop immediately
+ 88   // Running Test Program
+ 89   load(p);
+ 90   if (trace_contains_errors()) return;
+ 91   // convenience to keep tests concise: 'Entry' label need not be provided
+ 92   // not allowed in real programs
+ 93   if (p.entry)
+ 94     EIP = p.entry;
+ 95   else
+ 96     EIP = find(p, "code")->start;
+ 97   while (EIP < End_of_program)
+ 98     run_one_instruction();
+ 99 }
+100 
+101 //:: core data structures
+102 
+103 :(before "End Types")
+104 struct program {
+105   uint32_t entry;
+106   vector<segment> segments;
+107   program() { entry = 0; }
+108 };
+109 :(before "struct program")
+110 struct segment {
+111   string name;
+112   uint32_t start;
+113   vector<line> lines;
+114   // End segment Fields
+115   segment() {
+116     start = 0;
+117     // End segment Constructor
+118   }
+119 };
+120 :(before "struct segment")
+121 struct line {
+122   vector<word> words;
+123   vector<string> metadata;
+124   string original;
+125 };
+126 :(before "struct line")
+127 struct word {
+128   string original;
+129   string data;
+130   vector<string> metadata;
+131 };
+132 
+133 //:: parse
+134 
+135 :(code)
+136 void parse(istream& fin, program& out) {
+137   segment* curr_segment = NULL;
+138   vector<line> l;
+139   while (has_data(fin)) {
+140     string line_data;
+141     line curr;
+142     getline(fin, line_data);
+143     curr.original = line_data;
+144     trace(99, "parse") << "line: " << line_data << end();
+145     // End Line Parsing Special-cases(line_data -> l)
+146     istringstream lin(line_data);
+147     while (has_data(lin)) {
+148       string word_data;
+149       lin >> word_data;
+150       if (word_data.empty()) continue;
+151       if (word_data[0] == '#') break;  // comment
+152       if (word_data == ".") continue;  // comment token
+153       if (word_data == "==") {
+154         flush(curr_segment, l);
+155         string segment_name;
+156         lin >> segment_name;
+157         curr_segment = find(out, segment_name);
+158         if (curr_segment != NULL) {
+159           trace(3, "parse") << "appending to segment '" << segment_name << "'" << end();
+160         }
+161         else {
+162           trace(3, "parse") << "new segment '" << segment_name << "'" << end();
+163           uint32_t seg_start = 0;
+164           lin >> std::hex >> seg_start;
+165           sanity_check_program_segment(out, seg_start);
+166           out.segments.push_back(segment());
+167           curr_segment = &out.segments.back();
+168           curr_segment->name = segment_name;
+169           curr_segment->start = seg_start;
+170           if (trace_contains_errors()) continue;
+171           trace(3, "parse") << "starts at address 0x" << HEXWORD << curr_segment->start << end();
+172         }
+173         break;  // skip rest of line
+174       }
+175       if (word_data[0] == ':') {
+176         // todo: line metadata
+177         break;
+178       }
+179       curr.words.push_back(word());
+180       parse_word(word_data, curr.words.back());
+181       trace(99, "parse") << "word: " << to_string(curr.words.back());
+182     }
+183     if (!curr.words.empty())
+184       l.push_back(curr);
+185   }
+186   flush(curr_segment, l);
+187   trace(99, "parse") << "done" << end();
+188 }
+189 
+190 segment* find(program& p, const string& segment_name) {
+191   for (int i = 0;  i < SIZE(p.segments);  ++i) {
+192     if (p.segments.at(i).name == segment_name)
+193       return &p.segments.at(i);
+194   }
+195   return NULL;
+196 }
+197 
+198 void flush(segment* s, vector<line>& lines) {
+199   if (lines.empty()) return;
+200   if (s == NULL) {
+201     raise << "input does not start with a '==' section header\n" << end();
+202     return;
+203   }
+204   trace(3, "parse") << "flushing segment" << end();
+205   s->lines.insert(s->lines.end(), lines.begin(), lines.end());
+206   lines.clear();
+207 }
+208 
+209 void parse_word(const string& data, word& out) {
+210   out.original = data;
+211   istringstream win(data);
+212   if (getline(win, out.data, '/')) {
+213     string m;
+214     while (getline(win, m, '/'))
+215       out.metadata.push_back(m);
+216   }
+217 }
+218 
+219 void sanity_check_program_segment(const program& p, uint32_t addr) {
+220   for (int i = 0;  i < SIZE(p.segments);  ++i) {
+221     if (p.segments.at(i).start == addr)
+222       raise << "can't have multiple segments starting at address 0x" << HEXWORD << addr << '\n' << end();
+223   }
+224 }
+225 
+226 // helper for tests
+227 void parse(const string& text_bytes) {
+228   program p;
+229   istringstream in(text_bytes);
+230   parse(in, p);
+231 }
+232 
+233 void test_detect_duplicate_segments() {
+234   Hide_errors = true;
+235   parse(
+236       "== segment1 0xee\n"
+237       "ab\n"
+238       "== segment2 0xee\n"
+239       "cd\n"
+240   );
+241   CHECK_TRACE_CONTENTS(
+242       "error: can't have multiple segments starting at address 0x000000ee\n"
+243   );
+244 }
+245 
+246 //:: load
+247 
+248 void load(const program& p) {
+249   if (find(p, "code") == NULL) {
+250     raise << "no code to run\n" << end();
+251     return;
+252   }
+253   // Ensure segments are disjoint.
+254   set<uint32_t> overlap;
+255   for (int i = 0;   i < SIZE(p.segments);  ++i) {
+256     const segment& seg = p.segments.at(i);
+257     uint32_t addr = seg.start;
+258     if (!already_allocated(addr))
+259       Mem.push_back(vma(seg.start));
+260     trace(99, "load") << "loading segment " << i << " from " << HEXWORD << addr << end();
+261     for (int j = 0;  j < SIZE(seg.lines);  ++j) {
+262       const line& l = seg.lines.at(j);
+263       for (int k = 0;  k < SIZE(l.words);  ++k) {
+264         const word& w = l.words.at(k);
+265         uint8_t val = hex_byte(w.data);
+266         if (trace_contains_errors()) return;
+267         assert(overlap.find(addr) == overlap.end());
+268         write_mem_u8(addr, val);
+269         overlap.insert(addr);
+270         trace(99, "load") << "0x" << HEXWORD << addr << " -> " << HEXBYTE << NUM(read_mem_u8(addr)) << end();
+271         ++addr;
+272       }
+273     }
+274     if (seg.name == "code") {
+275       End_of_program = addr;
+276     }
+277   }
+278 }
+279 
+280 const segment* find(const program& p, const string& segment_name) {
+281   for (int i = 0;  i < SIZE(p.segments);  ++i) {
+282     if (p.segments.at(i).name == segment_name)
+283       return &p.segments.at(i);
+284   }
+285   return NULL;
+286 }
+287 
+288 uint8_t hex_byte(const string& s) {
+289   if (contains_uppercase(s)) {
+290     raise << "uppercase hex not allowed: " << s << '\n' << end();
+291     return 0;
+292   }
+293   istringstream in(s);
+294   int result = 0;
+295   in >> std::hex >> result;
+296   if (!in || !in.eof()) {
+297     raise << "token '" << s << "' is not a hex byte\n" << end();
+298     return '\0';
+299   }
+300   if (result > 0xff || result < -0x8f) {
+301     raise << "token '" << s << "' is not a hex byte\n" << end();
+302     return '\0';
+303   }
+304   return static_cast<uint8_t>(result);
+305 }
+306 
+307 void test_number_too_large() {
+308   Hide_errors = true;
+309   parse_and_load(
+310       "== code 0x1\n"
+311       "01 cab\n"
+312   );
+313   CHECK_TRACE_CONTENTS(
+314       "error: token 'cab' is not a hex byte\n"
+315   );
+316 }
+317 
+318 void test_invalid_hex() {
+319   Hide_errors = true;
+320   parse_and_load(
+321       "== code 0x1\n"
+322       "01 cx\n"
+323   );
+324   CHECK_TRACE_CONTENTS(
+325       "error: token 'cx' is not a hex byte\n"
+326   );
+327 }
+328 
+329 void test_negative_number() {
+330   parse_and_load(
+331       "== code 0x1\n"
+332       "01 -02\n"
+333   );
+334   CHECK_TRACE_COUNT("error", 0);
+335 }
+336 
+337 void test_negative_number_too_small() {
+338   Hide_errors = true;
+339   parse_and_load(
+340       "== code 0x1\n"
+341       "01 -12345\n"
+342   );
+343   CHECK_TRACE_CONTENTS(
+344       "error: token '-12345' is not a hex byte\n"
+345   );
+346 }
+347 
+348 void test_hex_prefix() {
+349   parse_and_load(
+350       "== code 0x1\n"
+351       "0x01 -0x02\n"
+352   );
+353   CHECK_TRACE_COUNT("error", 0);
+354 }
+355 
+356 void test_repeated_segment_merges_data() {
+357   parse_and_load(
+358       "== code 0x1\n"
+359       "11 22\n"
+360       "== code\n"  // again
+361       "33 44\n"
+362   );
+363   CHECK_TRACE_CONTENTS(
+364       "parse: new segment 'code'\n"
+365       "parse: appending to segment 'code'\n"
+366       // first segment
+367       "load: 0x00000001 -> 11\n"
+368       "load: 0x00000002 -> 22\n"
+369       // second segment
+370       "load: 0x00000003 -> 33\n"
+371       "load: 0x00000004 -> 44\n"
+372   );
+373 }
+374 
+375 void test_error_on_missing_segment_header() {
+376   Hide_errors = true;
+377   parse_and_load(
+378       "01 02\n"
+379   );
+380   CHECK_TRACE_CONTENTS(
+381       "error: input does not start with a '==' section header\n"
+382   );
+383 }
+384 
+385 void test_error_on_uppercase_hex() {
+386   Hide_errors = true;
+387   parse_and_load(
+388       "== code\n"
+389       "01 Ab\n"
+390   );
+391   CHECK_TRACE_CONTENTS(
+392       "error: uppercase hex not allowed: Ab\n"
+393   );
+394 }
+395 
+396 //: helper for tests
+397 void parse_and_load(const string& text_bytes) {
+398   program p;
+399   istringstream in(text_bytes);
+400   parse(in, p);
+401   if (trace_contains_errors()) return;  // if any stage raises errors, stop immediately
+402   load(p);
+403 }
+404 
+405 //:: run
+406 
+407 :(before "End Initialize Op Names")
+408 put_new(Name, "b8", "copy imm32 to EAX (mov)");
+409 
+410 //: our first opcode
+411 
+412 :(before "End Single-Byte Opcodes")
+413 case 0xb8: {  // copy imm32 to EAX
+414   const int32_t src = next32();
+415   trace(Callstack_depth+1, "run") << "copy imm32 0x" << HEXWORD << src << " to EAX" << end();
+416   Reg[EAX].i = src;
+417   break;
+418 }
+419 
+420 :(code)
+421 void test_copy_imm32_to_EAX_again() {
+422   run(
+423       "== code 0x1\n"  // code segment
+424       // op     ModR/M  SIB   displacement  immediate
+425       "  b8                                 0a 0b 0c 0d \n"  // copy 0x0d0c0b0a to EAX
+426   );
+427   CHECK_TRACE_CONTENTS(
+428       "run: copy imm32 0x0d0c0b0a to EAX\n"
+429   );
+430 }
+431 
+432 // read a 32-bit int in little-endian order from the instruction stream
+433 int32_t next32() {
+434   int32_t result = read_mem_i32(EIP);
+435   EIP+=4;
+436   return result;
+437 }
+438 
+439 //:: helpers
+440 
+441 string to_string(const word& w) {
+442   ostringstream out;
+443   out << w.data;
+444   for (int i = 0;  i < SIZE(w.metadata);  ++i)
+445     out << " /" << w.metadata.at(i);
+446   return out.str();
+447 }
+448 
// Does 's' contain any uppercase letter, as classified by isupper()?
// An empty string contains none.
bool contains_uppercase(const string& s) {
  for (string::size_type i = 0;  i < s.size();  ++i) {
    // isupper() is only defined for values representable as unsigned char
    // (or EOF), so don't hand it a possibly-negative plain char
    if (isupper(static_cast<unsigned char>(s[i]))) return true;
  }
  return false;
}
+
+ + + -- cgit 1.4.1-2-gfad0