1 //: Running SubX programs on the VM.
  2 
  3 //: (Not to be confused with the 'run' subcommand for running ELF binaries on
  4 //: the VM. That comes later.)
  5 
  6 :(before "End Help Texts")
  7 put(Help, "syntax",
  8   "SubX programs consist of segments, each segment in turn consisting of lines.\n"
  9   "Line-endings are significant; each line should contain a single\n"
 10   "instruction, macro or directive.\n"
 11   "\n"
 12   "Comments start with the '#' character. It should be at the start of a word\n"
 13   "(start of line, or following a space).\n"
 14   "\n"
 15   "Each segment starts with a header line: a '==' delimiter followed by the\n"
 16   "starting address for the segment.\n"
 17   "\n"
 18   "The starting address for a segment has some finicky requirements. But just\n"
 19   "start with a round number, and `subx` will try to guide you to a valid\n"
 20   "configuration.\n"
 21   "A good rule of thumb is to try to start the first segment at the default\n"
 22   "address of 0x08048000, and to start each subsequent segment at least 0x1000\n"
 23   "(most common page size) bytes after the last.\n"
 24   "If a segment occupies than 0x1000 bytes you'll need to push subsequent\n"
 25   "segments further down.\n"
 26   "Currently only the first segment contains executable code (because it gets\n"
 27   "annoying to have to change addresses in later segments every time an earlier\n"
 28   "one changes length; one of those finicky requirements).\n"
 29   "\n"
 30   "Lines consist of a series of words. Words can contain arbitrary metadata\n"
 31   "after a '/', but they can never contain whitespace. Metadata has no effect\n"
 32   "at runtime, but can be handy when rewriting macros.\n"
 33   "\n"
 34   "Check out some examples in this directory (ex*.subx)\n"
 35   "Programming in machine code can be annoying, but let's see if we can make\n"
 36   "it nice enough to be able to write a compiler in it.\n"
 37 );
:(before "End Help Contents")
// print the name of the 'syntax' help topic at the "End Help Contents"
// waypoint (presumably the listing of available topics)
cerr << "  syntax\n";
 40 
//: First end-to-end scenario: a single instruction (opcode 05 followed by a
//: 4-byte immediate) placed at address 0x1. The expected trace checks that
//: each byte lands at consecutive addresses and that the immediate is read
//: back least-significant byte first.
:(scenario add_imm32_to_eax)
# At the lowest level, SubX programs are a series of hex bytes, each
# (variable-length) instruction on one line.
#
# Later we'll make things nicer using macros. But you'll always be able to
# insert hex bytes out of instructions.
#
# As you can see, comments start with '#' and are ignored.

# Segment headers start with '==', specifying the hex address where they
# begin. There's usually one code segment and one data segment. We assume the
# code segment always comes first. Later when we emit ELF binaries we'll add
# directives for the operating system to ensure that the code segment can't be
# written to, and the data segment can't be executed as code.
== 0x1

# We don't show it here, but all lines can have metadata after a ':'.
# All words can have metadata after a '/'. No spaces allowed in word metadata, of course.
# Metadata doesn't directly form instructions, but some macros may look at it.
# Unrecognized metadata never causes errors, so you can also use it for
# documentation.

# Within the code segment, x86 instructions consist of the following parts (see cheatsheet.pdf):
#   opcode        ModR/M                    SIB                   displacement    immediate
#   instruction   mod, reg, Reg/Mem bits    scale, index, base
#   1-3 bytes     0/1 byte                  0/1 byte              0/1/2/4 bytes   0/1/2/4 bytes
    05            .                         .                     .               0a 0b 0c 0d  # add 0x0d0c0b0a to EAX
# (The single periods are just to help the eye track long gaps between
# columns, and are otherwise ignored.)

# This program, when run, causes the following events in the trace:
+load: 0x00000001 -> 05
+load: 0x00000002 -> 0a
+load: 0x00000003 -> 0b
+load: 0x00000004 -> 0c
+load: 0x00000005 -> 0d
+run: add imm32 0x0d0c0b0a to reg EAX
+run: storing 0x0d0c0b0a
 79 
 80 :(code)
 81 // top-level helper for scenarios: parse the input, transform any macros, load
 82 // the final hex bytes into memory, run it
 83 void run(const string& text_bytes) {
 84   program p;
 85   istringstream in(text_bytes);
 86   parse(in, p);
 87   if (trace_contains_errors()) return;  // if any stage raises errors, stop immediately
 88   transform(p);
 89   if (trace_contains_errors()) return;
 90   load(p);
 91   if (trace_contains_errors()) return;
 92   while (EIP < End_of_program)
 93     run_one_instruction();
 94 }
 95 
 96 //:: core data structures
 97 
 98 :(before "End Types")
// A parsed SubX program: its segments, in the order their '==' headers
// appeared in the source.
struct program {
  vector<segment> segments;  // load() treats element 0 as the code segment
  // random ideas for other things we may eventually need
  //map<name, address> globals;
  //vector<recipe> recipes;
  //map<string, type_info> types;
};
106 :(before "struct program")
107 struct segment {
108   uint32_t start;
109   vector<line> lines;
110   // End segment Fields
111   segment() {
112     start = 0;
113     // End segment Constructor
114   }
115 };
116 :(before "struct segment")
// One line of a segment, after comments and '.' fillers have been dropped.
struct line {
  vector<word> words;  // the words on the line, in source order
  vector<string> metadata;  // line-level metadata; parse() doesn't populate it yet
  string original;  // the raw text of the line, exactly as read from the input
};
122 :(before "struct line")
// One whitespace-delimited token of a line, split on '/' by parse_word().
struct word {
  string original;  // the token exactly as it appeared in the source
  string data;  // the part before the first '/'
  vector<string> metadata;  // the '/'-separated pieces following 'data'
};
128 
129 //:: parse
130 
131 :(code)
132 void parse(istream& fin, program& out) {
133   vector<line> l;
134   trace(99, "parse") << "begin" << end();
135   while (has_data(fin)) {
136     string line_data;
137     line curr;
138     getline(fin, line_data);
139     curr.original = line_data;
140     trace(99, "parse") << "line: " << line_data << end();
141     // End Line Parsing Special-cases(line_data -> l)
142     istringstream lin(line_data);
143     while (has_data(lin)) {
144       string word_data;
145       lin >> word_data;
146       if (word_data.empty()) continue;
147       if (word_data[0] == '#') break;  // comment
148       if (word_data == ".") continue;  // comment token
149       if (word_data == "==") {
150         if (!l.empty()) {
151           assert(!out.segments.empty());
152           trace(99, "parse") << "flushing to segment" << end();
153           out.segments.back().lines.swap(l);
154         }
155         segment s;
156         string segment_title;
157         lin >> segment_title;
158         if (starts_with(segment_title, "0x"))
159           s.start = parse_int(segment_title);
160         trace(99, "parse") << "new segment from " << HEXWORD << s.start << end();
161         out.segments.push_back(s);
162         // todo?
163         break;  // skip rest of line
164       }
165       if (word_data[0] == ':') {
166         // todo: line metadata
167         break;
168       }
169       curr.words.push_back(word());
170       parse_word(word_data, curr.words.back());
171       trace(99, "parse") << "word: " << to_string(curr.words.back());
172     }
173     if (!curr.words.empty())
174       l.push_back(curr);
175   }
176   if (!l.empty()) {
177     assert(!out.segments.empty());
178     trace(99, "parse") << "flushing to segment" << end();
179     out.segments.back().lines.swap(l);
180   }
181   trace(99, "parse") << "done" << end();
182 }
183 
184 void parse_word(const string& data, word& out) {
185   out.original = data;
186   istringstream win(data);
187   if (getline(win, out.data, '/')) {
188     string m;
189     while (getline(win, m, '/'))
190       out.metadata.push_back(m);
191   }
192 }
193 
194 string to_string(const word& w) {
195   ostringstream out;
196   out << w.data;
197   for (int i = 0;  i < SIZE(w.metadata);  ++i)
198     out << " /" << w.metadata.at(i);
199   return out.str();
200 }
201 
202 //:: transform
203 
:(before "End Types")
// signature of a transform: takes the whole program by mutable reference
typedef void (*transform_fn)(program&);
:(before "End Globals")
// the transforms that transform() applies to a program, in order
vector<transform_fn> Transform;
208 
209 void transform(program& p) {
210   trace(99, "transform") << "begin" << end();
211   for (int t = 0;  t < SIZE(Transform);  ++t)
212     (*Transform.at(t))(p);
213   trace(99, "transform") << "done" << end();
214 }
215 
216 //:: load
217 
218 void load(const program& p) {
219   trace(99, "load") << "begin" << end();
220   if (p.segments.empty()) {
221     raise << "no code to run\n" << end();
222     return;
223   }
224   for (int i = 0;   i < SIZE(p.segments);  ++i) {
225     const segment& seg = p.segments.at(i);
226     uint32_t addr = seg.start;
227     trace(99, "load") << "loading segment " << i << " from " << HEXWORD << addr << end();
228     for (int j = 0;  j < SIZE(seg.lines);  ++j) {
229       const line& l = seg.lines.at(j);
230       for (int k = 0;  k < SIZE(l.words);  ++k) {
231         const word& w = l.words.at(k);
232         uint8_t val = hex_byte(w.data);
233         if (trace_contains_errors()) return;
234         write_mem_u8(addr, val);
235         trace(99, "load") << "0x" << HEXWORD << addr << " -> " << HEXBYTE << NUM(read_mem_u8(addr)) << end();
236         ++addr;
237       }
238     }
239     if (i == 0) End_of_program = addr;
240   }
241   EIP = p.segments.at(0).start;
242   trace(99, "load") << "done" << end();
243 }
244 
245 uint8_t hex_byte(const string& s) {
246   istringstream in(s);
247   int result = 0;
248   in >> std::hex >> result;
249   if (!in || !in.eof()) {
250     raise << "token '" << s << "' is not a hex byte\n" << end();
251     return '\0';
252   }
253   if (result > 0xff || result < -0x8f) {
254     raise << "token '" << s << "' is not a hex byte\n" << end();
255     return '\0';
256   }
257   return static_cast<uint8_t>(result);
258 }
259 
//: The scenarios below check which tokens are accepted as hex bytes: values
//: that are too large or too negative and non-hex characters must raise an
//: error, while small negative numbers and a '0x' prefix must not.
//: (Presumably the directive below makes them go through parse_and_load(), so
//: nothing is executed.)
:(scenarios parse_and_load)
:(scenario number_too_large)
% Hide_errors = true;
== 0x1
05 cab
+error: token 'cab' is not a hex byte

:(scenario invalid_hex)
% Hide_errors = true;
== 0x1
05 cx
+error: token 'cx' is not a hex byte

:(scenario negative_number)
== 0x1
05 -12
$error: 0

:(scenario negative_number_too_small)
% Hide_errors = true;
== 0x1
05 -12345
+error: token '-12345' is not a hex byte

:(scenario hex_prefix)
== 0x1
0x05 -0x12
$error: 0
288 
289 //: helper for tests
290 :(code)
291 void parse_and_load(const string& text_bytes) {
292   program p;
293   istringstream in(text_bytes);
294   parse(in, p);
295   if (trace_contains_errors()) return;  // if any stage raises errors, stop immediately
296   load(p);
297 }
298 
299 //:: run
300 
:(before "End Initialize Op Names(name)")
// human-readable description for opcode 05
put(name, "05", "add imm32 to R0 (EAX)");
303 
//: our first opcode
:(before "End Single-Byte Opcodes")
case 0x05: {  // add imm32 to EAX
  // the operand is the 4 bytes following the opcode, read little-endian
  int32_t arg2 = next32();
  trace(90, "run") << "add imm32 0x" << HEXWORD << arg2 << " to reg EAX" << end();
  // NOTE(review): presumably adds arg2 into EAX and traces the stored result;
  // see the macro's definition for its exact side effects
  BINARY_ARITHMETIC_OP(+, Reg[EAX].i, arg2);
  break;
}
312 
313 :(code)
314 // read a 32-bit int in little-endian order from the instruction stream
315 int32_t next32() {
316   int32_t result = next();
317   result |= (next()<<8);
318   result |= (next()<<16);
319   result |= (next()<<24);
320   return result;
321 }
322 
323 :(code)
324 int32_t parse_int(const string& s) {
325   if (s.empty()) return 0;
326   istringstream in(s);
327   in >> std::hex;
328   if (s.at(0) == '-') {
329     int32_t result = 0;
330     in >> result;
331     if (!in || !in.eof()) {
332       raise << "not a number: " << s << '\n' << end();
333       return 0;
334     }
335     return result;
336   }
337   uint32_t uresult = 0;
338   in >> uresult;
339   if (!in || !in.eof()) {
340     raise << "not a number: " << s << '\n' << end();
341     return 0;
342   }
343   return static_cast<int32_t>(uresult);
344 }
345 :(before "End Unit Tests")
346 void test_parse_int() {
347   CHECK_EQ(0, parse_int("0"));
348   CHECK_EQ(0, parse_int("0x0"));
349   CHECK_EQ(0, parse_int("0x0"));
350   CHECK_EQ(16, parse_int("10"));  // hex always
351   CHECK_EQ(-1, parse_int("-1"));
352   CHECK_EQ(-1, parse_int("0xffffffff"));
353 }