https://github.com/akkartik/mu/blob/master/subx/011run.cc
  1 //: Running SubX programs on the VM.
  2 
  3 //: (Not to be confused with the 'run' subcommand for running ELF binaries on
  4 //: the VM. That comes later.)
  5 
  6 :(before "End Help Texts")
  7 put_new(Help, "syntax",
  8   "SubX programs consist of segments, each segment in turn consisting of lines.\n"
  9   "Line-endings are significant; each line should contain a single\n"
 10   "instruction, macro or directive.\n"
 11   "\n"
 12   "Comments start with the '#' character. It should be at the start of a word\n"
 13   "(start of line, or following a space).\n"
 14   "\n"
 15   "Each segment starts with a header line: a '==' delimiter followed by the name of\n"
 16   "the segment.\n"
 17   "\n"
 18   "The first segment contains code and should be called 'code'.\n"
 19   "The second segment should be called 'data'.\n"
 20   "The resulting binary starts running from the start of the code segment by default.\n"
 21   "To start elsewhere in the code segment, define a special label called 'Entry'.\n"
 22   "\n"
 23   "Segments with the same name get merged together. This rule helps keep functions and\n"
 24   "their data close together in .subx files.\n"
 25   "\n"
 26   "Lines consist of a series of words. Words can contain arbitrary metadata\n"
 27   "after a '/', but they can never contain whitespace. Metadata has no effect\n"
 28   "at runtime, but can be handy when rewriting macros.\n"
 29   "\n"
 30   "Check out the examples in the examples/ directory.\n"
 31   "Programming in machine code can be annoying, but let's see if we can make\n"
 32   "it nice enough to be able to write a compiler in it.\n"
 33 );
 34 :(before "End Help Contents")
 35 cerr << "  syntax\n";
 36 
 37 :(scenario add_imm32_to_eax)
 38 # At the lowest level, SubX programs are a series of hex bytes, each
 39 # (variable-length) instruction on one line.
 40 #
 41 # Later we'll make things nicer using macros. But you'll always be able to
 42 # insert hex bytes out of instructions.
 43 #
 44 # As you can see, comments start with '#' and are ignored.
 45 
 46 # Segment headers start with '==', specifying the hex address where they
 47 # begin. There's usually one code segment and one data segment. We assume the
 48 # code segment always comes first. Later when we emit ELF binaries we'll add
 49 # directives for the operating system to ensure that the code segment can't be
 50 # written to, and the data segment can't be executed as code.
 51 == 0x1
 52 
 53 # We don't show it here, but all lines can have metadata after a ':'.
 54 # All words can have metadata after a '/'. No spaces allowed in word metadata, of course.
 55 # Metadata doesn't directly form instructions, but some macros may look at it.
 56 # Unrecognized metadata never causes errors, so you can also use it for
 57 # documentation.
 58 
 59 # Within the code segment, x86 instructions consist of the following parts (see cheatsheet.pdf):
 60 #   opcode        ModR/M                    SIB                   displacement    immediate
 61 #   instruction   mod, reg, Reg/Mem bits    scale, index, base
 62 #   1-3 bytes     0/1 byte                  0/1 byte              0/1/2/4 bytes   0/1/2/4 bytes
 63     05            .                         .                     .               0a 0b 0c 0d  # add 0x0d0c0b0a to EAX
 64 # (The single periods are just to help the eye track long gaps between
 65 # columns, and are otherwise ignored.)
 66 
 67 # This program, when run, causes the following events in the trace:
 68 +load: 0x00000001 -> 05
 69 +load: 0x00000002 -> 0a
 70 +load: 0x00000003 -> 0b
 71 +load: 0x00000004 -> 0c
 72 +load: 0x00000005 -> 0d
 73 +run: add imm32 0x0d0c0b0a to reg EAX
 74 +run: storing 0x0d0c0b0a
 75 
 76 :(code)
 77 // top-level helper for scenarios: parse the input, transform any macros, load
 78 // the final hex bytes into memory, run it
 79 void run(const string& text_bytes) {
 80   program p;
 81   istringstream in(text_bytes);
 82   parse(in, p);
 83   if (trace_contains_errors()) return;  // if any stage raises errors, stop immediately
 84   transform(p);
 85   if (trace_contains_errors()) return;
 86   load(p);
 87   if (trace_contains_errors()) return;
 88   while (EIP < End_of_program)
 89     run_one_instruction();
 90 }
 91 
 92 //:: core data structures
 93 
 94 :(before "End Types")
 95 struct program {
 96   vector<segment> segments;
 97   // random ideas for other things we may eventually need
 98   //map<name, address> globals;
 99   //vector<recipe> recipes;
100   //map<string, type_info> types;
101 };
102 :(before "struct program")
103 struct segment {
104   uint32_t start;
105   vector<line> lines;
106   // End segment Fields
107   segment() {
108     start = 0;
109     // End segment Constructor
110   }
111 };
112 :(before "struct segment")
113 struct line {
114   vector<word> words;
115   vector<string> metadata;
116   string original;
117 };
118 :(before "struct line")
struct word {
  // the complete token as written, metadata included
  string original;
  // the part of the token before the first '/'
  string data;
  // the '/'-separated pieces following the data, in order
  vector<string> metadata;
};
124 
125 //:: parse
126 
127 :(code)
128 void parse(istream& fin, program& out) {
129   vector<line> l;
130   while (has_data(fin)) {
131     string line_data;
132     line curr;
133     getline(fin, line_data);
134     curr.original = line_data;
135     trace(99, "parse") << "line: " << line_data << end();
136     // End Line Parsing Special-cases(line_data -> l)
137     istringstream lin(line_data);
138     while (has_data(lin)) {
139       string word_data;
140       lin >> word_data;
141       if (word_data.empty()) continue;
142       if (word_data[0] == '#') break;  // comment
143       if (word_data == ".") continue;  // comment token
144       if (word_data == "==") {
145         flush(out, l);
146         string segment_title;
147         lin >> segment_title;
148         if (starts_with(segment_title, "0x")) {
149           segment s;
150           s.start = parse_int(segment_title);
151           sanity_check_program_segment(out, s.start);
152           if (trace_contains_errors()) continue;
153           trace(3, "parse") << "new segment from 0x" << HEXWORD << s.start << end();
154           out.segments.push_back(s);
155         }
156         // End Segment Parsing Special-cases(segment_title)
157         // todo: segment segment metadata
158         break;  // skip rest of line
159       }
160       if (word_data[0] == ':') {
161         // todo: line metadata
162         break;
163       }
164       curr.words.push_back(word());
165       parse_word(word_data, curr.words.back());
166       trace(99, "parse") << "word: " << to_string(curr.words.back());
167     }
168     if (!curr.words.empty())
169       l.push_back(curr);
170   }
171   flush(out, l);
172   trace(99, "parse") << "done" << end();
173 }
174 
175 void flush(program& p, vector<line>& lines) {
176   if (lines.empty()) return;
177   if (p.segments.empty()) {
178     raise << "input does not start with a '==' section header\n" << end();
179     return;
180   }
181   // End flush(p, lines) Special-cases
182   trace(99, "parse") << "flushing segment" << end();
183   p.segments.back().lines.swap(lines);
184 }
185 
186 void parse_word(const string& data, word& out) {
187   out.original = data;
188   istringstream win(data);
189   if (getline(win, out.data, '/')) {
190     string m;
191     while (getline(win, m, '/'))
192       out.metadata.push_back(m);
193   }
194 }
195 
196 void sanity_check_program_segment(const program& p, uint32_t addr) {
197   for (int i = 0;  i < SIZE(p.segments);  ++i) {
198     if (p.segments.at(i).start == addr)
199       raise << "can't have multiple segments starting at address 0x" << HEXWORD << addr << '\n' << end();
200   }
201 }
202 
203 // helper for tests
204 void parse(const string& text_bytes) {
205   program p;
206   istringstream in(text_bytes);
207   parse(in, p);
208 }
209 
210 :(scenarios parse)
211 :(scenario detect_duplicate_segments)
212 % Hide_errors = true;
213 == 0xee
214 ab
215 == 0xee
216 cd
217 +error: can't have multiple segments starting at address 0x000000ee
218 
219 //:: transform
220 
221 :(before "End Types")
222 typedef void (*transform_fn)(program&);
223 :(before "End Globals")
224 vector<transform_fn> Transform;
225 
226 :(code)
227 void transform(program& p) {
228   for (int t = 0;  t < SIZE(Transform);  ++t)
229     (*Transform.at(t))(p);
230 }
231 
232 //:: load
233 
234 void load(const program& p) {
235   if (p.segments.empty()) {
236     raise << "no code to run\n" << end();
237     return;
238   }
239   // Ensure segments are disjoint.
240   set<uint32_t> overlap;
241   for (int i = 0;   i < SIZE(p.segments);  ++i) {
242     const segment& seg = p.segments.at(i);
243     uint32_t addr = seg.start;
244     if (!already_allocated(addr))
245       Mem.push_back(vma(seg.start));
246     trace(99, "load") << "loading segment " << i << " from " << HEXWORD << addr << end();
247     for (int j = 0;  j < SIZE(seg.lines);  ++j) {
248       const line& l = seg.lines.at(j);
249       for (int k = 0;  k < SIZE(l.words);  ++k) {
250         const word& w = l.words.at(k);
251         uint8_t val = hex_byte(w.data);
252         if (trace_contains_errors()) return;
253         assert(overlap.find(addr) == overlap.end());
254         write_mem_u8(addr, val);
255         overlap.insert(addr);
256         trace(99, "load") << "0x" << HEXWORD << addr << " -> " << HEXBYTE << NUM(read_mem_u8(addr)) << end();
257         ++addr;
258       }
259     }
260     if (i == 0) End_of_program = addr;
261   }
262   EIP = p.segments.at(0).start;
263   // End Initialize EIP
264 }
265 
266 uint8_t hex_byte(const string& s) {
267   istringstream in(s);
268   int result = 0;
269   in >> std::hex >> result;
270   if (!in || !in.eof()) {
271     raise << "token '" << s << "' is not a hex byte\n" << end();
272     return '\0';
273   }
274   if (result > 0xff || result < -0x8f) {
275     raise << "token '" << s << "' is not a hex byte\n" << end();
276     return '\0';
277   }
278   return static_cast<uint8_t>(result);
279 }
280 
281 :(scenarios parse_and_load)
282 :(scenario number_too_large)
283 % Hide_errors = true;
284 == 0x1
285 05 cab
286 +error: token 'cab' is not a hex byte
287 
288 :(scenario invalid_hex)
289 % Hide_errors = true;
290 == 0x1
291 05 cx
292 +error: token 'cx' is not a hex byte
293 
294 :(scenario negative_number)
295 == 0x1
296 05 -12
297 $error: 0
298 
299 :(scenario negative_number_too_small)
300 % Hide_errors = true;
301 == 0x1
302 05 -12345
303 +error: token '-12345' is not a hex byte
304 
305 :(scenario hex_prefix)
306 == 0x1
307 0x05 -0x12
308 $error: 0
309 
310 //: helper for tests
311 :(code)
312 void parse_and_load(const string& text_bytes) {
313   program p;
314   istringstream in(text_bytes);
315   parse(in, p);
316   if (trace_contains_errors()) return;  // if any stage raises errors, stop immediately
317   load(p);
318 }
319 
320 //:: run
321 
322 :(before "End Initialize Op Names")
323 put_new(Name, "05", "add imm32 to EAX (add)");
324 
325 //: our first opcode
326 :(before "End Single-Byte Opcodes")
327 case 0x05: {  // add imm32 to EAX
328   int32_t arg2 = next32();
329   trace(Callstack_depth+1, "run") << "add imm32 0x" << HEXWORD << arg2 << " to reg EAX" << end();
330   BINARY_ARITHMETIC_OP(+, Reg[EAX].i, arg2);
331   break;
332 }
333 
334 :(code)
335 // read a 32-bit int in little-endian order from the instruction stream
336 int32_t next32() {
337   int32_t result = next();
338   result |= (next()<<8);
339   result |= (next()<<16);
340   result |= (next()<<24);
341   return result;
342 }
343 
344 //:: helpers
345 
346 :(code)
347 string to_string(const word& w) {
348   ostringstream out;
349   out << w.data;
350   for (int i = 0;  i < SIZE(w.metadata);  ++i)
351     out << " /" << w.metadata.at(i);
352   return out.str();
353 }
354 
355 int32_t parse_int(const string& s) {
356   if (s.empty()) return 0;
357   istringstream in(s);
358   in >> std::hex;
359   if (s.at(0) == '-') {
360     int32_t result = 0;
361     in >> result;
362     if (!in || !in.eof()) {
363       raise << "not a number: " << s << '\n' << end();
364       return 0;
365     }
366     return result;
367   }
368   uint32_t uresult = 0;
369   in >> uresult;
370   if (!in || !in.eof()) {
371     raise << "not a number: " << s << '\n' << end();
372     return 0;
373   }
374   return static_cast<int32_t>(uresult);
375 }
376 :(before "End Unit Tests")
377 void test_parse_int() {
378   CHECK_EQ(0, parse_int("0"));
379   CHECK_EQ(0, parse_int("0x0"));
380   CHECK_EQ(0, parse_int("0x0"));
381   CHECK_EQ(16, parse_int("10"));  // hex always
382   CHECK_EQ(-1, parse_int("-1"));
383   CHECK_EQ(-1, parse_int("0xffffffff"));
384 }