https://github.com/akkartik/mu/blob/master/subx/011run.cc
  1 //: Running SubX programs on the VM.
  2 
  3 //: (Not to be confused with the 'run' subcommand for running ELF binaries on
  4 //: the VM. That comes later.)
  5 
  6 :(before "End Help Texts")
// Register the help text for the "syntax" topic: an overview of how SubX
// source is organized into segments, lines, words, comments and metadata.
// (How Help gets displayed is decided elsewhere; this only adds the entry.)
put_new(Help, "syntax",
  // overall shape of a program
  "SubX programs consist of segments, each segment in turn consisting of lines.\n"
  "Line-endings are significant; each line should contain a single\n"
  "instruction, macro or directive.\n"
  "\n"
  // comments
  "Comments start with the '#' character. It should be at the start of a word\n"
  "(start of line, or following a space).\n"
  "\n"
  // segment headers and naming conventions
  "Each segment starts with a header line: a '==' delimiter followed by the name of\n"
  "the segment.\n"
  "\n"
  "The first segment contains code and should be called 'code'.\n"
  "The second segment should be called 'data'.\n"
  "The resulting binary starts running from the start of the code segment by default.\n"
  "To start elsewhere in the code segment, define a special label called 'Entry'.\n"
  "\n"
  "Segments with the same name get merged together. This rule helps keep functions and\n"
  "their data close together in .subx files.\n"
  "\n"
  // words and metadata
  "Lines consist of a series of words. Words can contain arbitrary metadata\n"
  "after a '/', but they can never contain whitespace. Metadata has no effect\n"
  "at runtime, but can be handy when rewriting macros.\n"
  "\n"
  // pointers to further reading
  "Check out the examples in the examples/ directory.\n"
  "Programming in machine code can be annoying, but let's see if we can make\n"
  "it nice enough to be able to write a compiler in it.\n"
);
 34 :(before "End Help Contents")
// Advertise the "syntax" topic in the list of help contents printed to stderr.
cerr << "  syntax\n";
 36 
 37 :(code)
 38 void test_add_imm32_to_eax() {
 39   // At the lowest level, SubX programs are a series of hex bytes, each
 40   // (variable-length) instruction on one line.
 41   run(
 42       // Comments start with '#' and are ignored.
 43       "# comment\n"
 44       // Segment headers start with '==' and a name or starting hex address.
 45       // There's usually one code and one data segment. The code segment
 46       // always comes first.
 47       "== 0x1\n"  // code segment
 48 
 49       // After the header, each segment consists of lines, and each line
 50       // consists of words separated by whitespace.
 51       //
 52       // All words can have metadata after a '/'. No spaces allowed in
 53       // metadata, of course.
 54       // Unrecognized metadata never causes errors, so you can use it for
 55       // documentation.
 56       //
 57       // Within the code segment in particular, x86 instructions consist of
 58       // some number of the following parts and sub-parts (see the Readme and
 59       // cheatsheet.pdf for details):
 60       //   opcodes: 1-3 bytes
 61       //   ModR/M byte
 62       //   SIB byte
 63       //   displacement: 0/1/2/4 bytes
 64       //   immediate: 0/1/2/4 bytes
 65       // opcode        ModR/M                    SIB                   displacement    immediate
 66       // instruction   mod, reg, Reg/Mem bits    scale, index, base
 67       // 1-3 bytes     0/1 byte                  0/1 byte              0/1/2/4 bytes   0/1/2/4 bytes
 68       "  05            .                         .                     .               0a 0b 0c 0d\n"  // add 0x0d0c0b0a to EAX
 69       // The periods are just to help the eye track long gaps between columns,
 70       // and are otherwise ignored.
 71   );
 72   // This program, when run, causes the following events in the trace:
 73   CHECK_TRACE_CONTENTS(
 74       "load: 0x00000001 -> 05\n"
 75       "load: 0x00000002 -> 0a\n"
 76       "load: 0x00000003 -> 0b\n"
 77       "load: 0x00000004 -> 0c\n"
 78       "load: 0x00000005 -> 0d\n"
 79       "run: add imm32 0x0d0c0b0a to reg EAX\n"
 80       "run: storing 0x0d0c0b0a\n"
 81   );
 82 }
 83 
 84 // top-level helper for scenarios: parse the input, transform any macros, load
 85 // the final hex bytes into memory, run it
 86 void run(const string& text_bytes) {
 87   program p;
 88   istringstream in(text_bytes);
 89   parse(in, p);
 90   if (trace_contains_errors()) return;  // if any stage raises errors, stop immediately
 91   transform(p);
 92   if (trace_contains_errors()) return;
 93   load(p);
 94   if (trace_contains_errors()) return;
 95   while (EIP < End_of_program)
 96     run_one_instruction();
 97 }
 98 
 99 //:: core data structures
100 
101 :(before "End Types")
struct program {
  // Everything the parser produces: the segments, in the order their headers
  // were encountered.
  vector<segment> segments;
  // random ideas for other things we may eventually need
  //map<name, address> globals;
  //vector<recipe> recipes;
  //map<string, type_info> types;
};
109 :(before "struct program")
110 struct segment {
111   uint32_t start;
112   vector<line> lines;
113   // End segment Fields
114   segment() {
115     start = 0;
116     // End segment Constructor
117   }
118 };
119 :(before "struct segment")
struct line {
  // The words that survived parsing (comments and '.' spacers are dropped).
  vector<word> words;
  // Reserved for per-line metadata; the parser in this layer never fills it
  // in (see the todo in parse()).
  vector<string> metadata;
  // The raw text of the source line, exactly as read.
  string original;
};
125 :(before "struct line")
struct word {
  // The whole token exactly as written, including any '/'-separated metadata.
  string original;
  // The part of the token before the first '/'.
  string data;
  // The '/'-separated pieces following 'data', in order.
  vector<string> metadata;
};
131 
132 //:: parse
133 
134 :(code)
135 void parse(istream& fin, program& out) {
136   vector<line> l;
137   while (has_data(fin)) {
138     string line_data;
139     line curr;
140     getline(fin, line_data);
141     curr.original = line_data;
142     trace(99, "parse") << "line: " << line_data << end();
143     // End Line Parsing Special-cases(line_data -> l)
144     istringstream lin(line_data);
145     while (has_data(lin)) {
146       string word_data;
147       lin >> word_data;
148       if (word_data.empty()) continue;
149       if (word_data[0] == '#') break;  // comment
150       if (word_data == ".") continue;  // comment token
151       if (word_data == "==") {
152         flush(out, l);
153         string segment_title;
154         lin >> segment_title;
155         if (starts_with(segment_title, "0x")) {
156           segment s;
157           s.start = parse_int(segment_title);
158           sanity_check_program_segment(out, s.start);
159           if (trace_contains_errors()) continue;
160           trace(3, "parse") << "new segment from 0x" << HEXWORD << s.start << end();
161           out.segments.push_back(s);
162         }
163         // End Segment Parsing Special-cases(segment_title)
164         // todo: segment segment metadata
165         break;  // skip rest of line
166       }
167       if (word_data[0] == ':') {
168         // todo: line metadata
169         break;
170       }
171       curr.words.push_back(word());
172       parse_word(word_data, curr.words.back());
173       trace(99, "parse") << "word: " << to_string(curr.words.back());
174     }
175     if (!curr.words.empty())
176       l.push_back(curr);
177   }
178   flush(out, l);
179   trace(99, "parse") << "done" << end();
180 }
181 
182 void flush(program& p, vector<line>& lines) {
183   if (lines.empty()) return;
184   if (p.segments.empty()) {
185     raise << "input does not start with a '==' section header\n" << end();
186     return;
187   }
188   // End flush(p, lines) Special-cases
189   trace(99, "parse") << "flushing segment" << end();
190   p.segments.back().lines.swap(lines);
191 }
192 
193 void parse_word(const string& data, word& out) {
194   out.original = data;
195   istringstream win(data);
196   if (getline(win, out.data, '/')) {
197     string m;
198     while (getline(win, m, '/'))
199       out.metadata.push_back(m);
200   }
201 }
202 
203 void sanity_check_program_segment(const program& p, uint32_t addr) {
204   for (int i = 0;  i < SIZE(p.segments);  ++i) {
205     if (p.segments.at(i).start == addr)
206       raise << "can't have multiple segments starting at address 0x" << HEXWORD << addr << '\n' << end();
207   }
208 }
209 
210 // helper for tests
211 void parse(const string& text_bytes) {
212   program p;
213   istringstream in(text_bytes);
214   parse(in, p);
215 }
216 
217 void test_detect_duplicate_segments() {
218   Hide_errors = true;
219   parse(
220       "== 0xee\n"
221       "ab\n"
222       "== 0xee\n"
223       "cd\n"
224   );
225   CHECK_TRACE_CONTENTS(
226       "error: can't have multiple segments starting at address 0x000000ee\n"
227   );
228 }
229 
230 //:: transform
231 
232 :(before "End Types")
// Signature of a transform pass: it receives the program by non-const
// reference and returns nothing.
typedef void (*transform_fn)(program&);
234 :(before "End Globals")
// The passes that transform() applies, in index order. Nothing in this layer
// adds to it; presumably other layers register their passes here (confirm).
vector<transform_fn> Transform;
236 
237 :(code)
238 void transform(program& p) {
239   for (int t = 0;  t < SIZE(Transform);  ++t)
240     (*Transform.at(t))(p);
241 }
242 
243 //:: load
244 
245 void load(const program& p) {
246   if (p.segments.empty()) {
247     raise << "no code to run\n" << end();
248     return;
249   }
250   // Ensure segments are disjoint.
251   set<uint32_t> overlap;
252   for (int i = 0;   i < SIZE(p.segments);  ++i) {
253     const segment& seg = p.segments.at(i);
254     uint32_t addr = seg.start;
255     if (!already_allocated(addr))
256       Mem.push_back(vma(seg.start));
257     trace(99, "load") << "loading segment " << i << " from " << HEXWORD << addr << end();
258     for (int j = 0;  j < SIZE(seg.lines);  ++j) {
259       const line& l = seg.lines.at(j);
260       for (int k = 0;  k < SIZE(l.words);  ++k) {
261         const word& w = l.words.at(k);
262         uint8_t val = hex_byte(w.data);
263         if (trace_contains_errors()) return;
264         assert(overlap.find(addr) == overlap.end());
265         write_mem_u8(addr, val);
266         overlap.insert(addr);
267         trace(99, "load") << "0x" << HEXWORD << addr << " -> " << HEXBYTE << NUM(read_mem_u8(addr)) << end();
268         ++addr;
269       }
270     }
271     if (i == 0) End_of_program = addr;
272   }
273   EIP = p.segments.at(0).start;
274   // End Initialize EIP
275 }
276 
277 uint8_t hex_byte(const string& s) {
278   istringstream in(s);
279   int result = 0;
280   in >> std::hex >> result;
281   if (!in || !in.eof()) {
282     raise << "token '" << s << "' is not a hex byte\n" << end();
283     return '\0';
284   }
285   if (result > 0xff || result < -0x8f) {
286     raise << "token '" << s << "' is not a hex byte\n" << end();
287     return '\0';
288   }
289   return static_cast<uint8_t>(result);
290 }
291 
292 void test_number_too_large() {
293   Hide_errors = true;
294   parse_and_load(
295       "== 0x1\n"
296       "05 cab\n"
297   );
298   CHECK_TRACE_CONTENTS(
299       "error: token 'cab' is not a hex byte\n"
300   );
301 }
302 
303 void test_invalid_hex() {
304   Hide_errors = true;
305   parse_and_load(
306       "== 0x1\n"
307       "05 cx\n"
308   );
309   CHECK_TRACE_CONTENTS(
310       "error: token 'cx' is not a hex byte\n"
311   );
312 }
313 
314 void test_negative_number() {
315   parse_and_load(
316       "== 0x1\n"
317       "05 -12\n"
318   );
319   CHECK_TRACE_COUNT("error", 0);
320 }
321 
322 void test_negative_number_too_small() {
323   Hide_errors = true;
324   parse_and_load(
325       "== 0x1\n"
326       "05 -12345\n"
327   );
328   CHECK_TRACE_CONTENTS(
329       "error: token '-12345' is not a hex byte\n"
330   );
331 }
332 
333 void test_hex_prefix() {
334   parse_and_load(
335       "== 0x1\n"
336       "0x05 -0x12\n"
337   );
338   CHECK_TRACE_COUNT("error", 0);
339 }
340 
341 //: helper for tests
342 void parse_and_load(const string& text_bytes) {
343   program p;
344   istringstream in(text_bytes);
345   parse(in, p);
346   if (trace_contains_errors()) return;  // if any stage raises errors, stop immediately
347   load(p);
348 }
349 
350 //:: run
351 
352 :(before "End Initialize Op Names")
// Register a human-readable description for opcode 05.
// (Presumably consulted when describing instructions to the user -- confirm
// against the code that reads Name.)
put_new(Name, "05", "add imm32 to EAX (add)");
354 
355 //: our first opcode
356 :(before "End Single-Byte Opcodes")
// Opcode 05: the 4 bytes following the opcode are a 32-bit immediate that is
// added to EAX.
case 0x05: {  // add imm32 to EAX
  // fetch the immediate; this also advances EIP past it
  int32_t arg2 = next32();
  trace(Callstack_depth+1, "run") << "add imm32 0x" << HEXWORD << arg2 << " to reg EAX" << end();
  // The macro is defined elsewhere. Judging by the trace expected in
  // test_add_imm32_to_eax it stores the sum into its first operand, and it
  // may well do more (e.g. update flags) -- check its definition.
  BINARY_ARITHMETIC_OP(+, Reg[EAX].i, arg2);
  break;
}
363 
364 :(code)
365 // read a 32-bit int in little-endian order from the instruction stream
366 int32_t next32() {
367   int32_t result = read_mem_i32(EIP);
368   EIP+=4;
369   return result;
370 }
371 
372 //:: helpers
373 
374 string to_string(const word& w) {
375   ostringstream out;
376   out << w.data;
377   for (int i = 0;  i < SIZE(w.metadata);  ++i)
378     out << " /" << w.metadata.at(i);
379   return out.str();
380 }
381 
382 int32_t parse_int(const string& s) {
383   if (s.empty()) return 0;
384   istringstream in(s);
385   in >> std::hex;
386   if (s.at(0) == '-') {
387     int32_t result = 0;
388     in >> result;
389     if (!in || !in.eof()) {
390       raise << "not a number: " << s << '\n' << end();
391       return 0;
392     }
393     return result;
394   }
395   uint32_t uresult = 0;
396   in >> uresult;
397   if (!in || !in.eof()) {
398     raise << "not a number: " << s << '\n' << end();
399     return 0;
400   }
401   return static_cast<int32_t>(uresult);
402 }
403 :(before "End Unit Tests")
404 void test_parse_int() {
405   CHECK_EQ(0, parse_int("0"));
406   CHECK_EQ(0, parse_int("0x0"));
407   CHECK_EQ(0, parse_int("0x0"));
408   CHECK_EQ(16, parse_int("10"));  // hex always
409   CHECK_EQ(-1, parse_int("-1"));
410   CHECK_EQ(-1, parse_int("0xffffffff"));
411 }