https://github.com/akkartik/mu/blob/master/subx/011run.cc
  1 //: Running SubX programs on the VM.
  2 
  3 //: (Not to be confused with the 'run' subcommand for running ELF binaries on
  4 //: the VM. That comes later.)
  5 
  6 :(before "End Help Texts")
  7 put_new(Help, "syntax",
  8   "SubX programs consist of segments, each segment in turn consisting of lines.\n"
  9   "Line-endings are significant; each line should contain a single\n"
 10   "instruction, macro or directive.\n"
 11   "\n"
 12   "Comments start with the '#' character. It should be at the start of a word\n"
 13   "(start of line, or following a space).\n"
 14   "\n"
 15   "Each segment starts with a header line: a '==' delimiter followed by the name of\n"
 16   "the segment.\n"
 17   "\n"
 18   "Currently SubX assumes the first segment encountered contains executable code, and\n"
 19   "the second contains global variables. By convention we call them 'code' and 'data'\n"
 20   "respectively.\n"
 21   "The first instruction executed in the resulting binary is always the first\n"
 22   "instruction of the first segment.\n"
 23   "\n"
 24   "Segments with the same name get merged together. This rule helps keep functions and\n"
 25   "their data close together in .subx files.\n"
 26   "Later segments with the same name get their contents *prepended* to earlier ones.\n"
 27   "This rule helps each .subx file to put forth a different entrypoint for the binary,\n"
 28   "overriding previously loaded files.\n"
 29   "\n"
 30   "Lines consist of a series of words. Words can contain arbitrary metadata\n"
 31   "after a '/', but they can never contain whitespace. Metadata has no effect\n"
 32   "at runtime, but can be handy when rewriting macros.\n"
 33   "\n"
 34   "Check out the examples in the examples/ directory.\n"
 35   "Programming in machine code can be annoying, but let's see if we can make\n"
 36   "it nice enough to be able to write a compiler in it.\n"
 37 );
 38 :(before "End Help Contents")
 39 cerr << "  syntax\n";
 40 
 41 :(scenario add_imm32_to_eax)
 42 # At the lowest level, SubX programs are a series of hex bytes, each
 43 # (variable-length) instruction on one line.
 44 #
 45 # Later we'll make things nicer using macros. But you'll always be able to
 46 # insert hex bytes out of instructions.
 47 #
 48 # As you can see, comments start with '#' and are ignored.
 49 
 50 # Segment headers start with '==', specifying the hex address where they
 51 # begin. There's usually one code segment and one data segment. We assume the
 52 # code segment always comes first. Later when we emit ELF binaries we'll add
 53 # directives for the operating system to ensure that the code segment can't be
 54 # written to, and the data segment can't be executed as code.
 55 == 0x1
 56 
 57 # We don't show it here, but all lines can have metadata after a ':'.
 58 # All words can have metadata after a '/'. No spaces allowed in word metadata, of course.
 59 # Metadata doesn't directly form instructions, but some macros may look at it.
 60 # Unrecognized metadata never causes errors, so you can also use it for
 61 # documentation.
 62 
 63 # Within the code segment, x86 instructions consist of the following parts (see cheatsheet.pdf):
 64 #   opcode        ModR/M                    SIB                   displacement    immediate
 65 #   instruction   mod, reg, Reg/Mem bits    scale, index, base
 66 #   1-3 bytes     0/1 byte                  0/1 byte              0/1/2/4 bytes   0/1/2/4 bytes
 67     05            .                         .                     .               0a 0b 0c 0d  # add 0x0d0c0b0a to EAX
 68 # (The single periods are just to help the eye track long gaps between
 69 # columns, and are otherwise ignored.)
 70 
 71 # This program, when run, causes the following events in the trace:
 72 +load: 0x00000001 -> 05
 73 +load: 0x00000002 -> 0a
 74 +load: 0x00000003 -> 0b
 75 +load: 0x00000004 -> 0c
 76 +load: 0x00000005 -> 0d
 77 +run: add imm32 0x0d0c0b0a to reg EAX
 78 +run: storing 0x0d0c0b0a
 79 
 80 :(code)
 81 // top-level helper for scenarios: parse the input, transform any macros, load
 82 // the final hex bytes into memory, run it
 83 void run(const string& text_bytes) {
 84   program p;
 85   istringstream in(text_bytes);
 86   parse(in, p);
 87   if (trace_contains_errors()) return;  // if any stage raises errors, stop immediately
 88   transform(p);
 89   if (trace_contains_errors()) return;
 90   load(p);
 91   if (trace_contains_errors()) return;
 92   while (EIP < End_of_program)
 93     run_one_instruction();
 94 }
 95 
 96 //:: core data structures
 97 
 98 :(before "End Types")
 99 struct program {
100   vector<segment> segments;
101   // random ideas for other things we may eventually need
102   //map<name, address> globals;
103   //vector<recipe> recipes;
104   //map<string, type_info> types;
105 };
106 :(before "struct program")
107 struct segment {
108   uint32_t start;
109   vector<line> lines;
110   // End segment Fields
111   segment() {
112     start = 0;
113     // End segment Constructor
114   }
115 };
116 :(before "struct segment")
117 struct line {
118   vector<word> words;
119   vector<string> metadata;
120   string original;
121 };
122 :(before "struct line")
123 struct word {
124   string original;
125   string data;
126   vector<string> metadata;
127 };
128 
129 //:: parse
130 
131 :(code)
132 void parse(istream& fin, program& out) {
133   vector<line> l;
134   trace(99, "parse") << "begin" << end();
135   while (has_data(fin)) {
136     string line_data;
137     line curr;
138     getline(fin, line_data);
139     curr.original = line_data;
140     trace(99, "parse") << "line: " << line_data << end();
141     // End Line Parsing Special-cases(line_data -> l)
142     istringstream lin(line_data);
143     while (has_data(lin)) {
144       string word_data;
145       lin >> word_data;
146       if (word_data.empty()) continue;
147       if (word_data[0] == '#') break;  // comment
148       if (word_data == ".") continue;  // comment token
149       if (word_data == "==") {
150         flush(out, l);
151         string segment_title;
152         lin >> segment_title;
153         if (starts_with(segment_title, "0x")) {
154           segment s;
155           s.start = parse_int(segment_title);
156           sanity_check_program_segment(out, s.start);
157           if (trace_contains_errors()) continue;
158           trace(99, "parse") << "new segment from 0x" << HEXWORD << s.start << end();
159           out.segments.push_back(s);
160         }
161         // End Segment Parsing Special-cases(segment_title)
162         // todo: segment segment metadata
163         break;  // skip rest of line
164       }
165       if (word_data[0] == ':') {
166         // todo: line metadata
167         break;
168       }
169       curr.words.push_back(word());
170       parse_word(word_data, curr.words.back());
171       trace(99, "parse") << "word: " << to_string(curr.words.back());
172     }
173     if (!curr.words.empty())
174       l.push_back(curr);
175   }
176   flush(out, l);
177   trace(99, "parse") << "done" << end();
178 }
179 
180 void flush(program& p, vector<line>& lines) {
181   if (lines.empty()) return;
182   if (p.segments.empty()) {
183     raise << "input does not start with a '==' section header\n" << end();
184     return;
185   }
186   // End flush(p, lines) Special-cases
187   trace(99, "parse") << "flushing to segment" << end();
188   p.segments.back().lines.swap(lines);
189 }
190 
191 void parse_word(const string& data, word& out) {
192   out.original = data;
193   istringstream win(data);
194   if (getline(win, out.data, '/')) {
195     string m;
196     while (getline(win, m, '/'))
197       out.metadata.push_back(m);
198   }
199 }
200 
201 void sanity_check_program_segment(const program& p, uint32_t addr) {
202   for (int i = 0;  i < SIZE(p.segments);  ++i) {
203     if (p.segments.at(i).start == addr)
204       raise << "can't have multiple segments starting at address 0x" << HEXWORD << addr << '\n' << end();
205   }
206 }
207 
208 // helper for tests
209 void parse(const string& text_bytes) {
210   program p;
211   istringstream in(text_bytes);
212   parse(in, p);
213 }
214 
215 :(scenarios parse)
216 :(scenario detect_duplicate_segments)
217 % Hide_errors = true;
218 == 0xee
219 ab
220 == 0xee
221 cd
222 +error: can't have multiple segments starting at address 0x000000ee
223 
224 //:: transform
225 
226 :(before "End Types")
227 typedef void (*transform_fn)(program&);
228 :(before "End Globals")
229 vector<transform_fn> Transform;
230 
231 void transform(program& p) {
232   trace(99, "transform") << "begin" << end();
233   for (int t = 0;  t < SIZE(Transform);  ++t)
234     (*Transform.at(t))(p);
235   trace(99, "transform") << "done" << end();
236 }
237 
238 //:: load
239 
240 void load(const program& p) {
241   trace(99, "load") << "begin" << end();
242   if (p.segments.empty()) {
243     raise << "no code to run\n" << end();
244     return;
245   }
246   // Ensure segments are disjoint.
247   set<uint32_t> overlap;
248   for (int i = 0;   i < SIZE(p.segments);  ++i) {
249     const segment& seg = p.segments.at(i);
250     uint32_t addr = seg.start;
251     if (!already_allocated(addr))
252       Mem.push_back(vma(seg.start));
253     trace(99, "load") << "loading segment " << i << " from " << HEXWORD << addr << end();
254     for (int j = 0;  j < SIZE(seg.lines);  ++j) {
255       const line& l = seg.lines.at(j);
256       for (int k = 0;  k < SIZE(l.words);  ++k) {
257         const word& w = l.words.at(k);
258         uint8_t val = hex_byte(w.data);
259         if (trace_contains_errors()) return;
260         assert(overlap.find(addr) == overlap.end());
261         write_mem_u8(addr, val);
262         overlap.insert(addr);
263         trace(99, "load") << "0x" << HEXWORD << addr << " -> " << HEXBYTE << NUM(read_mem_u8(addr)) << end();
264         ++addr;
265       }
266     }
267     if (i == 0) End_of_program = addr;
268   }
269   EIP = p.segments.at(0).start;
270   trace(99, "load") << "done" << end();
271 }
272 
273 uint8_t hex_byte(const string& s) {
274   istringstream in(s);
275   int result = 0;
276   in >> std::hex >> result;
277   if (!in || !in.eof()) {
278     raise << "token '" << s << "' is not a hex byte\n" << end();
279     return '\0';
280   }
281   if (result > 0xff || result < -0x8f) {
282     raise << "token '" << s << "' is not a hex byte\n" << end();
283     return '\0';
284   }
285   return static_cast<uint8_t>(result);
286 }
287 
288 :(scenarios parse_and_load)
289 :(scenario number_too_large)
290 % Hide_errors = true;
291 == 0x1
292 05 cab
293 +error: token 'cab' is not a hex byte
294 
295 :(scenario invalid_hex)
296 % Hide_errors = true;
297 == 0x1
298 05 cx
299 +error: token 'cx' is not a hex byte
300 
301 :(scenario negative_number)
302 == 0x1
303 05 -12
304 $error: 0
305 
306 :(scenario negative_number_too_small)
307 % Hide_errors = true;
308 == 0x1
309 05 -12345
310 +error: token '-12345' is not a hex byte
311 
312 :(scenario hex_prefix)
313 == 0x1
314 0x05 -0x12
315 $error: 0
316 
317 //: helper for tests
318 :(code)
319 void parse_and_load(const string& text_bytes) {
320   program p;
321   istringstream in(text_bytes);
322   parse(in, p);
323   if (trace_contains_errors()) return;  // if any stage raises errors, stop immediately
324   load(p);
325 }
326 
327 //:: run
328 
329 :(before "End Initialize Op Names")
330 put_new(Name, "05", "add imm32 to EAX (add)");
331 
332 //: our first opcode
333 :(before "End Single-Byte Opcodes")
334 case 0x05: {  // add imm32 to EAX
335   int32_t arg2 = next32();
336   trace(90, "run") << "add imm32 0x" << HEXWORD << arg2 << " to reg EAX" << end();
337   BINARY_ARITHMETIC_OP(+, Reg[EAX].i, arg2);
338   break;
339 }
340 
341 :(code)
342 // read a 32-bit int in little-endian order from the instruction stream
343 int32_t next32() {
344   int32_t result = next();
345   result |= (next()<<8);
346   result |= (next()<<16);
347   result |= (next()<<24);
348   return result;
349 }
350 
351 //:: helpers
352 
353 :(code)
354 string to_string(const word& w) {
355   ostringstream out;
356   out << w.data;
357   for (int i = 0;  i < SIZE(w.metadata);  ++i)
358     out << " /" << w.metadata.at(i);
359   return out.str();
360 }
361 
362 int32_t parse_int(const string& s) {
363   if (s.empty()) return 0;
364   istringstream in(s);
365   in >> std::hex;
366   if (s.at(0) == '-') {
367     int32_t result = 0;
368     in >> result;
369     if (!in || !in.eof()) {
370       raise << "not a number: " << s << '\n' << end();
371       return 0;
372     }
373     return result;
374   }
375   uint32_t uresult = 0;
376   in >> uresult;
377   if (!in || !in.eof()) {
378     raise << "not a number: " << s << '\n' << end();
379     return 0;
380   }
381   return static_cast<int32_t>(uresult);
382 }
383 :(before "End Unit Tests")
384 void test_parse_int() {
385   CHECK_EQ(0, parse_int("0"));
386   CHECK_EQ(0, parse_int("0x0"));
387   CHECK_EQ(0, parse_int("0x0"));
388   CHECK_EQ(16, parse_int("10"));  // hex always
389   CHECK_EQ(-1, parse_int("-1"));
390   CHECK_EQ(-1, parse_int("0xffffffff"));
391 }