1 //: Running SubX programs on the VM.
  2 
  3 //: (Not to be confused with the 'run' subcommand for running ELF binaries on
  4 //: the VM. That comes later.)
  5 
  6 :(before "End Help Texts")
  7 put(Help, "syntax",
  8   "SubX programs consist of segments, each segment in turn consisting of lines.\n"
  9   "Line-endings are significant; each line should contain a single\n"
 10   "instruction, macro or directive.\n"
 11   "\n"
 12   "Comments start with the '#' character. It should be at the start of a word\n"
 13   "(start of line, or following a space).\n"
 14   "\n"
 15   "Each segment starts with a header line: a '==' delimiter followed by the\n"
 16   "starting address for the segment.\n"
 17   "\n"
 18   "The starting address for a segment has some finicky requirements. But just\n"
 19   "start with a round number, and `subx` will try to guide you to a valid\n"
 20   "configuration.\n"
 21   "A good rule of thumb is to try to start the first segment at the default\n"
 22   "address of 0x08048000, and to start each subsequent segment at least 0x1000\n"
 23   "(most common page size) bytes after the last.\n"
 24   "If a segment occupies than 0x1000 bytes you'll need to push subsequent\n"
 25   "segments further down.\n"
 26   "Currently only the first segment contains executable code (because it gets\n"
 27   "annoying to have to change addresses in later segments every time an earlier\n"
 28   "one changes length; one of those finicky requirements).\n"
 29   "\n"
 30   "Lines consist of a series of words. Words can contain arbitrary metadata\n"
 31   "after a '/', but they can never contain whitespace. Metadata has no effect\n"
 32   "at runtime, but can be handy when rewriting macros.\n"
 33   "\n"
 34   "Check out some examples in this directory (ex*.subx)\n"
 35   "Programming in machine code can be annoying, but let's see if we can make\n"
 36   "it nice enough to be able to write a compiler in it.\n"
 37 );
 38 :(before "End Help Contents")
 39 cerr << "  syntax\n";  // prints the 'syntax' topic name; this line is spliced in before "End Help Contents"
 40 
 41 :(scenario add_imm32_to_eax)
 42 # At the lowest level, SubX programs are a series of hex bytes, each
 43 # (variable-length) instruction on one line.
 44 #
 45 # Later we'll make things nicer using macros. But you'll always be able to
 46 # insert hex bytes out of instructions.
 47 #
 48 # As you can see, comments start with '#' and are ignored.
 49 
 50 # Segment headers start with '==', specifying the hex address where they
 51 # begin. The first segment is always assumed to be code.
 52 == 0x1
 53 
 54 # We don't show it here, but all lines can have metadata after a ':'.
 55 # All words can have metadata after a '/'. No spaces allowed in word metadata, of course.
 56 # Metadata doesn't directly form instructions, but some macros may look at it.
 57 # Unrecognized metadata never causes errors, so you can also use it for
 58 # documentation.
 59 
 60 # Within the code segment, x86 instructions consist of the following parts (see cheatsheet.pdf):
 61 #   opcode        ModR/M                    SIB                   displacement    immediate
 62 #   instruction   mod, reg, Reg/Mem bits    scale, index, base
 63 #   1-3 bytes     0/1 byte                  0/1 byte              0/1/2/4 bytes   0/1/2/4 bytes
 64     05            .                         .                     .               0a 0b 0c 0d  # add 0x0d0c0b0a to EAX
 65 # (The single periods are just to help the eye track long gaps between
 66 # columns, and are otherwise ignored.)
 67 
 68 # This program, when run, causes the following events in the trace:
 69 +load: 0x00000001 -> 05
 70 +load: 0x00000002 -> 0a
 71 +load: 0x00000003 -> 0b
 72 +load: 0x00000004 -> 0c
 73 +load: 0x00000005 -> 0d
 74 +run: add imm32 0x0d0c0b0a to reg EAX
 75 +run: storing 0x0d0c0b0a
 76 
 77 :(code)
 78 // top-level helper for scenarios: parse the input, transform any macros, load
 79 // the final hex bytes into memory, run it
 80 void run(const string& text_bytes) {
 81   program p;
 82   istringstream in(text_bytes);
 83   parse(in, p);
 84   if (trace_contains_errors()) return;  // if any stage raises errors, stop immediately
 85   transform(p);
 86   if (trace_contains_errors()) return;
 87   load(p);
 88   if (trace_contains_errors()) return;
 89   while (EIP < End_of_program)
 90     run_one_instruction();
 91 }
 92 
 93 //:: core data structures
 94 
 95 :(before "End Types")
 96 struct program {
      // The result of parsing SubX source text: parse() fills this in, the
      // transforms rewrite it, and load() copies it into memory.
 97   vector<segment> segments;  // in the order parse() encountered their '==' headers
 98   // random ideas for other things we may eventually need
 99   //map<name, address> globals;
100   //vector<recipe> recipes;
101   //map<string, type_info> types;
102 };
103 :(before "struct program")
104 struct segment {
      // One '=='-delimited section of a program.
105   uint32_t start;  // address of the segment's first byte; parse() sets it from a '0x...' word after '=='
106   vector<line> lines;  // the lines that followed this segment's header, in source order
107   // End segment Fields
108   segment() {
109     start = 0;  // remains 0 when the header gives no '0x...' address
110     // End segment Constructor
111   }
112 };
113 :(before "struct segment")
114 struct line {
      // One source line that contained at least one word.
115   vector<word> words;  // whitespace-separated tokens, minus comments and lone '.' fillers
116   vector<string> metadata;  // line-level metadata; parse() does not fill this in yet (see its todo)
117   string original;  // the raw text of the line exactly as read
118 };
119 :(before "struct line")
120 struct word {
      // One whitespace-delimited token of a line.
121   string original;  // the whole token as written, including any '/metadata'
122   string data;  // the part before the first '/'; load() interprets it as a hex byte
123   vector<string> metadata;  // the '/'-separated pieces after the data, in order
124 };
125 
126 //:: parse
127 
128 :(code)
129 void parse(istream& fin, program& out) {
130   vector<line> l;
131   trace(99, "parse") << "begin" << end();
132   while (has_data(fin)) {
133     string line_data;
134     line curr;
135     getline(fin, line_data);
136     curr.original = line_data;
137     trace(99, "parse") << "line: " << line_data << end();
138     istringstream lin(line_data);
139     while (has_data(lin)) {
140       string word_data;
141       lin >> word_data;
142       if (word_data.empty()) continue;
143       if (word_data[0] == '#') break;  // comment
144       if (word_data == ".") continue;  // comment token
145       if (word_data == "==") {
146         if (!l.empty()) {
147           assert(!out.segments.empty());
148           trace(99, "parse") << "flushing to segment" << end();
149           out.segments.back().lines.swap(l);
150         }
151         segment s;
152         string segment_title;
153         lin >> segment_title;
154         if (starts_with(segment_title, "0x"))
155           s.start = parse_int(segment_title);
156         trace(99, "parse") << "new segment from " << HEXWORD << s.start << end();
157         out.segments.push_back(s);
158         // todo?
159         break;  // skip rest of line
160       }
161       if (word_data[0] == ':') {
162         // todo: line metadata
163         break;
164       }
165       curr.words.push_back(word());
166       curr.words.back().original = word_data;
167       istringstream win(word_data);
168       if (getline(win, curr.words.back().data, '/')) {
169         string m;
170         while (getline(win, m, '/'))
171           curr.words.back().metadata.push_back(m);
172       }
173       trace(99, "parse") << "new word: " << curr.words.back().data << end();
174     }
175     if (!curr.words.empty())
176       l.push_back(curr);
177   }
178   if (!l.empty()) {
179     assert(!out.segments.empty());
180     trace(99, "parse") << "flushing to segment" << end();
181     out.segments.back().lines.swap(l);
182   }
183   trace(99, "parse") << "done" << end();
184 }
185 
186 //:: transform
187 
188 :(before "End Types")
189 typedef void (*transform_fn)(program&);  // a pass over a parsed program; receives it by non-const reference
190 :(before "End Globals")
191 vector<transform_fn> Transform;  // the passes transform() calls, in this order
192 
193 void transform(program& p) {
194   trace(99, "transform") << "begin" << end();
195   for (int t = 0;  t < SIZE(Transform);  ++t)
196     (*Transform.at(t))(p);
197   trace(99, "transform") << "done" << end();
198 }
199 
200 //:: load
201 
202 void load(const program& p) {
203   trace(99, "load") << "begin" << end();
204   if (p.segments.empty()) {
205     raise << "no code to run\n" << end();
206     return;
207   }
208   for (int i = 0;   i < SIZE(p.segments);  ++i) {
209     const segment& seg = p.segments.at(i);
210     uint32_t addr = seg.start;
211     trace(99, "load") << "loading segment " << i << " from " << HEXWORD << addr << end();
212     for (int j = 0;  j < SIZE(seg.lines);  ++j) {
213       const line& l = seg.lines.at(j);
214       for (int k = 0;  k < SIZE(l.words);  ++k) {
215         const word& w = l.words.at(k);
216         uint8_t val = hex_byte(w.data);
217         if (trace_contains_errors()) return;
218         write_mem_u8(addr, val);
219         trace(99, "load") << "0x" << HEXWORD << addr << " -> " << HEXBYTE << NUM(read_mem_u8(addr)) << end();
220         ++addr;
221       }
222     }
223     if (i == 0) End_of_program = addr;
224   }
225   EIP = p.segments.at(0).start;
226   trace(99, "load") << "done" << end();
227 }
228 
229 uint8_t hex_byte(const string& s) {
230   istringstream in(s);
231   int result = 0;
232   in >> std::hex >> result;
233   if (!in || !in.eof()) {
234     raise << "token '" << s << "' is not a hex byte\n" << end();
235     return '\0';
236   }
237   if (result > 0xff || result < -0x8f) {
238     raise << "token '" << s << "' is not a hex byte\n" << end();
239     return '\0';
240   }
241   return static_cast<uint8_t>(result);
242 }
243 
244 :(scenarios parse_and_load)
245 :(scenario number_too_large)
246 % Hide_errors = true;
247 == 0x1
248 05 cab
249 +error: token 'cab' is not a hex byte
250 
251 :(scenario invalid_hex)
252 % Hide_errors = true;
253 == 0x1
254 05 cx
255 +error: token 'cx' is not a hex byte
256 
257 :(scenario negative_number)
258 == 0x1
259 05 -12
260 $error: 0
261 
262 :(scenario negative_number_too_small)
263 % Hide_errors = true;
264 == 0x1
265 05 -12345
266 +error: token '-12345' is not a hex byte
267 
268 :(scenario hex_prefix)
269 == 0x1
270 0x05 -0x12
271 $error: 0
272 
273 //: helper for tests
274 :(code)
275 void parse_and_load(const string& text_bytes) {
276   program p;
277   istringstream in(text_bytes);
278   parse(in, p);
279   if (trace_contains_errors()) return;  // if any stage raises errors, stop immediately
280   load(p);
281 }
282 
283 //:: run
284 
285 :(before "End Initialize Op Names(name)")
286 put(name, "05", "add imm32 to R0 (EAX)");  // description registered under the key "05" (the opcode handled by 'case 0x05')
287 
288 //: our first opcode
289 :(before "End Single-Byte Opcodes")
290 case 0x05: {  // add imm32 to EAX
291   int32_t arg2 = imm32();  // the operand: the 4 bytes that follow the opcode
292   trace(90, "run") << "add imm32 0x" << HEXWORD << arg2 << " to reg EAX" << end();
      // macro defined elsewhere; the add_imm32_to_eax scenario expects it to
      // trace the value stored. NOTE(review): presumably also updates the
      // flags -- confirm against its definition.
293   BINARY_ARITHMETIC_OP(+, Reg[EAX].i, arg2);
294   break;
295 }
296 
297 :(code)
298 // read a 32-bit immediate in little-endian order from the instruction stream
299 int32_t imm32() {
300   int32_t result = next();
301   result |= (next()<<8);
302   result |= (next()<<16);
303   result |= (next()<<24);
304   return result;
305 }
306 
307 :(code)
308 int32_t parse_int(const string& s) {
309   if (s.empty()) return 0;
310   istringstream in(s);
311   in >> std::hex;
312   if (s.at(0) == '-') {
313     int32_t result = 0;
314     in >> result;
315     if (!in || !in.eof()) {
316       raise << "not a number: " << s << '\n' << end();
317       return 0;
318     }
319     return result;
320   }
321   uint32_t uresult = 0;
322   in >> uresult;
323   if (!in || !in.eof()) {
324     raise << "not a number: " << s << '\n' << end();
325     return 0;
326   }
327   return static_cast<int32_t>(uresult);
328 }
329 :(before "End Unit Tests")
330 void test_parse_int() {
331   CHECK_EQ(0, parse_int("0"));
332   CHECK_EQ(0, parse_int("0x0"));
333   CHECK_EQ(0, parse_int("0x0"));
334   CHECK_EQ(16, parse_int("10"));  // hex always
335   CHECK_EQ(-1, parse_int("-1"));
336   CHECK_EQ(-1, parse_int("0xffffffff"));
337 }