1 //: Running SubX programs on the VM.
  2 
  3 //: (Not to be confused with the 'run' subcommand for running ELF binaries on
  4 //: the VM. That comes later.)
  5 
  6 :(before "End Help Texts")
  7 put(Help, "syntax",
  8   "SubX programs consist of segments, each segment in turn consisting of lines.\n"
  9   "Line-endings are significant; each line should contain a single\n"
 10   "instruction, macro or directive.\n"
 11   "\n"
 12   "Comments start with the '#' character. It should be at the start of a word\n"
 13   "(start of line, or following a space).\n"
 14   "\n"
 15   "Each segment starts with a header line: a '==' delimiter followed by the\n"
 16   "starting address for the segment.\n"
 17   "\n"
 18   "The starting address for a segment has some finicky requirements. But just\n"
 19   "start with a round number, and `subx` will try to guide you to a valid\n"
 20   "configuration.\n"
 21   "A good rule of thumb is to try to start the first segment at the default\n"
 22   "address of 0x08048000, and to start each subsequent segment at least 0x1000\n"
 23   "(most common page size) bytes after the last.\n"
 24   "If a segment occupies than 0x1000 bytes you'll need to push subsequent\n"
 25   "segments further down.\n"
 26   "Currently only the first segment contains executable code (because it gets\n"
 27   "annoying to have to change addresses in later segments every time an earlier\n"
 28   "one changes length; one of those finicky requirements).\n"
 29   "\n"
 30   "Lines consist of a series of words. Words can contain arbitrary metadata\n"
 31   "after a '/', but they can never contain whitespace. Metadata has no effect\n"
 32   "at runtime, but can be handy when rewriting macros.\n"
 33   "\n"
 34   "Check out some examples in this directory (ex*.subx)\n"
 35   "Programming in machine code can be annoying, but let's see if we can make\n"
 36   "it nice enough to be able to write a compiler in it.\n"
 37 );
 38 :(before "End Help Contents")
// add the 'syntax' topic to the list printed at the "Help Contents" waypoint
cerr << "  syntax\n";
 40 
 41 :(scenario add_imm32_to_eax)
 42 # At the lowest level, SubX programs are a series of hex bytes, each
 43 # (variable-length) instruction on one line.
 44 #
 45 # Later we'll make things nicer using macros. But you'll always be able to
 46 # insert hex bytes out of instructions.
 47 #
 48 # As you can see, comments start with '#' and are ignored.
 49 
 50 # Segment headers start with '==', specifying the hex address where they
 51 # begin. There's usually one code segment and one data segment. We assume the
 52 # code segment always comes first. Later when we emit ELF binaries we'll add
 53 # directives for the operating system to ensure that the code segment can't be
 54 # written to, and the data segment can't be executed as code.
 55 == 0x1
 56 
 57 # We don't show it here, but all lines can have metadata after a ':'.
 58 # All words can have metadata after a '/'. No spaces allowed in word metadata, of course.
 59 # Metadata doesn't directly form instructions, but some macros may look at it.
 60 # Unrecognized metadata never causes errors, so you can also use it for
 61 # documentation.
 62 
 63 # Within the code segment, x86 instructions consist of the following parts (see cheatsheet.pdf):
 64 #   opcode        ModR/M                    SIB                   displacement    immediate
 65 #   instruction   mod, reg, Reg/Mem bits    scale, index, base
 66 #   1-3 bytes     0/1 byte                  0/1 byte              0/1/2/4 bytes   0/1/2/4 bytes
 67     05            .                         .                     .               0a 0b 0c 0d  # add 0x0d0c0b0a to EAX
 68 # (The single periods are just to help the eye track long gaps between
 69 # columns, and are otherwise ignored.)
 70 
 71 # This program, when run, causes the following events in the trace:
 72 +load: 0x00000001 -> 05
 73 +load: 0x00000002 -> 0a
 74 +load: 0x00000003 -> 0b
 75 +load: 0x00000004 -> 0c
 76 +load: 0x00000005 -> 0d
 77 +run: add imm32 0x0d0c0b0a to reg EAX
 78 +run: storing 0x0d0c0b0a
 79 
 80 :(code)
 81 // top-level helper for scenarios: parse the input, transform any macros, load
 82 // the final hex bytes into memory, run it
 83 void run(const string& text_bytes) {
 84   program p;
 85   istringstream in(text_bytes);
 86   parse(in, p);
 87   if (trace_contains_errors()) return;  // if any stage raises errors, stop immediately
 88   transform(p);
 89   if (trace_contains_errors()) return;
 90   load(p);
 91   if (trace_contains_errors()) return;
 92   while (EIP < End_of_program)
 93     run_one_instruction();
 94 }
 95 
 96 //:: core data structures
 97 
 98 :(before "End Types")
// A parsed SubX program: its segments, in the order their '==' headers
// appeared in the source. load() treats segment 0 as the code to run.
struct program {
  vector<segment> segments;
  // random ideas for other things we may eventually need
  //map<name, address> globals;
  //vector<recipe> recipes;
  //map<string, type_info> types;
};
106 :(before "struct program")
// One '=='-delimited region of a program.
struct segment {
  // address the segment is loaded at; stays 0 unless the header supplied a
  // "0x"-prefixed address (see parse())
  uint32_t start;
  // the non-empty lines that followed the header, in source order
  vector<line> lines;
  // End segment Fields
  segment() {
    start = 0;
    // End segment Constructor
  }
};
116 :(before "struct segment")
// One source line that contributed at least one word.
struct line {
  // the words on the line, minus comments and '.' filler tokens
  vector<word> words;
  // line-level metadata; parse() as shown here never fills this in (see its todo)
  vector<string> metadata;
  // the raw text of the line, exactly as read
  string original;
};
122 :(before "struct line")
// One whitespace-delimited token of a line.
struct word {
  // the whole token as written, metadata included
  string original;
  // the part before the first '/'
  string data;
  // the '/'-separated pieces that follow data
  vector<string> metadata;
};
128 
129 //:: parse
130 
131 :(code)
132 void parse(istream& fin, program& out) {
133   vector<line> l;
134   trace(99, "parse") << "begin" << end();
135   while (has_data(fin)) {
136     string line_data;
137     line curr;
138     getline(fin, line_data);
139     curr.original = line_data;
140     trace(99, "parse") << "line: " << line_data << end();
141     istringstream lin(line_data);
142     while (has_data(lin)) {
143       string word_data;
144       lin >> word_data;
145       if (word_data.empty()) continue;
146       if (word_data[0] == '#') break;  // comment
147       if (word_data == ".") continue;  // comment token
148       if (word_data == "==") {
149         if (!l.empty()) {
150           assert(!out.segments.empty());
151           trace(99, "parse") << "flushing to segment" << end();
152           out.segments.back().lines.swap(l);
153         }
154         segment s;
155         string segment_title;
156         lin >> segment_title;
157         if (starts_with(segment_title, "0x"))
158           s.start = parse_int(segment_title);
159         trace(99, "parse") << "new segment from " << HEXWORD << s.start << end();
160         out.segments.push_back(s);
161         // todo?
162         break;  // skip rest of line
163       }
164       if (word_data[0] == ':') {
165         // todo: line metadata
166         break;
167       }
168       curr.words.push_back(word());
169       curr.words.back().original = word_data;
170       istringstream win(word_data);
171       if (getline(win, curr.words.back().data, '/')) {
172         string m;
173         while (getline(win, m, '/'))
174           curr.words.back().metadata.push_back(m);
175       }
176       trace(99, "parse") << "new word: " << curr.words.back().data << end();
177     }
178     if (!curr.words.empty())
179       l.push_back(curr);
180   }
181   if (!l.empty()) {
182     assert(!out.segments.empty());
183     trace(99, "parse") << "flushing to segment" << end();
184     out.segments.back().lines.swap(l);
185   }
186   trace(99, "parse") << "done" << end();
187 }
188 
189 //:: transform
190 
191 :(before "End Types")
// signature of a transform pass: receives the whole program by mutable reference
typedef void (*transform_fn)(program&);
193 :(before "End Globals")
// the passes that transform() applies, in order of their position in this list
vector<transform_fn> Transform;
195 
196 void transform(program& p) {
197   trace(99, "transform") << "begin" << end();
198   for (int t = 0;  t < SIZE(Transform);  ++t)
199     (*Transform.at(t))(p);
200   trace(99, "transform") << "done" << end();
201 }
202 
203 //:: load
204 
205 void load(const program& p) {
206   trace(99, "load") << "begin" << end();
207   if (p.segments.empty()) {
208     raise << "no code to run\n" << end();
209     return;
210   }
211   for (int i = 0;   i < SIZE(p.segments);  ++i) {
212     const segment& seg = p.segments.at(i);
213     uint32_t addr = seg.start;
214     trace(99, "load") << "loading segment " << i << " from " << HEXWORD << addr << end();
215     for (int j = 0;  j < SIZE(seg.lines);  ++j) {
216       const line& l = seg.lines.at(j);
217       for (int k = 0;  k < SIZE(l.words);  ++k) {
218         const word& w = l.words.at(k);
219         uint8_t val = hex_byte(w.data);
220         if (trace_contains_errors()) return;
221         write_mem_u8(addr, val);
222         trace(99, "load") << "0x" << HEXWORD << addr << " -> " << HEXBYTE << NUM(read_mem_u8(addr)) << end();
223         ++addr;
224       }
225     }
226     if (i == 0) End_of_program = addr;
227   }
228   EIP = p.segments.at(0).start;
229   trace(99, "load") << "done" << end();
230 }
231 
232 uint8_t hex_byte(const string& s) {
233   istringstream in(s);
234   int result = 0;
235   in >> std::hex >> result;
236   if (!in || !in.eof()) {
237     raise << "token '" << s << "' is not a hex byte\n" << end();
238     return '\0';
239   }
240   if (result > 0xff || result < -0x8f) {
241     raise << "token '" << s << "' is not a hex byte\n" << end();
242     return '\0';
243   }
244   return static_cast<uint8_t>(result);
245 }
246 
247 :(scenarios parse_and_load)
248 :(scenario number_too_large)
249 % Hide_errors = true;
250 == 0x1
251 05 cab
252 +error: token 'cab' is not a hex byte
253 
254 :(scenario invalid_hex)
255 % Hide_errors = true;
256 == 0x1
257 05 cx
258 +error: token 'cx' is not a hex byte
259 
260 :(scenario negative_number)
261 == 0x1
262 05 -12
263 $error: 0
264 
265 :(scenario negative_number_too_small)
266 % Hide_errors = true;
267 == 0x1
268 05 -12345
269 +error: token '-12345' is not a hex byte
270 
271 :(scenario hex_prefix)
272 == 0x1
273 0x05 -0x12
274 $error: 0
275 
276 //: helper for tests
277 :(code)
278 void parse_and_load(const string& text_bytes) {
279   program p;
280   istringstream in(text_bytes);
281   parse(in, p);
282   if (trace_contains_errors()) return;  // if any stage raises errors, stop immediately
283   load(p);
284 }
285 
286 //:: run
287 
288 :(before "End Initialize Op Names(name)")
// human-readable description for opcode 05
// NOTE(review): where 'name' is consumed is not visible in this layer -- confirm
put(name, "05", "add imm32 to R0 (EAX)");
290 
291 //: our first opcode
292 :(before "End Single-Byte Opcodes")
case 0x05: {  // add imm32 to EAX
  // the operand is the 32-bit immediate that follows the opcode byte
  int32_t arg2 = next32();
  trace(90, "run") << "add imm32 0x" << HEXWORD << arg2 << " to reg EAX" << end();
  // NOTE(review): BINARY_ARITHMETIC_OP is defined in another layer; judging by
  // the scenario's trace it also emits the "storing 0x..." line -- confirm
  BINARY_ARITHMETIC_OP(+, Reg[EAX].i, arg2);
  break;
}
299 
300 :(code)
301 // read a 32-bit int in little-endian order from the instruction stream
302 int32_t next32() {
303   int32_t result = next();
304   result |= (next()<<8);
305   result |= (next()<<16);
306   result |= (next()<<24);
307   return result;
308 }
309 
310 :(code)
311 int32_t parse_int(const string& s) {
312   if (s.empty()) return 0;
313   istringstream in(s);
314   in >> std::hex;
315   if (s.at(0) == '-') {
316     int32_t result = 0;
317     in >> result;
318     if (!in || !in.eof()) {
319       raise << "not a number: " << s << '\n' << end();
320       return 0;
321     }
322     return result;
323   }
324   uint32_t uresult = 0;
325   in >> uresult;
326   if (!in || !in.eof()) {
327     raise << "not a number: " << s << '\n' << end();
328     return 0;
329   }
330   return static_cast<int32_t>(uresult);
331 }
332 :(before "End Unit Tests")
333 void test_parse_int() {
334   CHECK_EQ(0, parse_int("0"));
335   CHECK_EQ(0, parse_int("0x0"));
336   CHECK_EQ(0, parse_int("0x0"));
337   CHECK_EQ(16, parse_int("10"));  // hex always
338   CHECK_EQ(-1, parse_int("-1"));
339   CHECK_EQ(-1, parse_int("0xffffffff"));
340 }