diff options
author | Kartik Agaram <vc@akkartik.com> | 2018-09-17 22:57:10 -0700 |
---|---|---|
committer | Kartik Agaram <vc@akkartik.com> | 2018-09-17 22:57:58 -0700 |
commit | f09280141f18fbe8cef0ed576cf932e12e315666 (patch) | |
tree | d00962b07cb013f89d4fdb2fcf19c392afb62b5c | |
parent | 0a7b03727a736f73c16d37b22afef8496c60d657 (diff) | |
download | mu-f09280141f18fbe8cef0ed576cf932e12e315666.tar.gz |
4548: start of a compiler for a new experimental low-level language
-rw-r--r-- | subx/apps/factorial.k2 | 21 | ||||
-rw-r--r-- | subx/examples/ex3.k2 | 20 | ||||
-rw-r--r-- | subx/examples/ex4.k2 | 17 | ||||
-rw-r--r-- | subx/examples/ex5.k2 | 12 | ||||
-rw-r--r-- | subx/examples/ex6.k2 | 13 | ||||
-rw-r--r-- | subx/examples/ex7.k2 | 24 | ||||
-rw-r--r-- | subx/examples/ex8.k2 | 41 | ||||
-rw-r--r-- | transect/000organization.cc | 136 | ||||
-rw-r--r-- | transect/001help.cc | 261 | ||||
-rw-r--r-- | transect/002test.cc | 104 | ||||
-rw-r--r-- | transect/003trace.cc | 409 | ||||
-rw-r--r-- | transect/003trace.test.cc | 124 | ||||
-rw-r--r-- | transect/010vm.cc | 230 | ||||
-rw-r--r-- | transect/011load.cc | 228 | ||||
-rwxr-xr-x | transect/build | 109 | ||||
-rwxr-xr-x | transect/build_and_test_until | 18 | ||||
-rwxr-xr-x | transect/clean | 8 | ||||
-rw-r--r-- | transect/compiler10 | 301 | ||||
-rw-r--r-- | transect/compiler2 | 27 | ||||
-rw-r--r-- | transect/compiler3 | 73 | ||||
-rw-r--r-- | transect/compiler4 | 84 | ||||
-rw-r--r-- | transect/compiler5 | 32 | ||||
-rw-r--r-- | transect/compiler6 | 36 | ||||
-rw-r--r-- | transect/compiler7 | 46 | ||||
-rw-r--r-- | transect/compiler8 | 53 | ||||
-rw-r--r-- | transect/compiler9 | 254 | ||||
-rw-r--r-- | transect/vimrc.vim | 36 |
27 files changed, 2717 insertions, 0 deletions
diff --git a/subx/apps/factorial.k2 b/subx/apps/factorial.k2 new file mode 100644 index 00000000..82c44352 --- /dev/null +++ b/subx/apps/factorial.k2 @@ -0,0 +1,21 @@ +fn factorial n : int -> result/EAX : int [ + result/EAX <- copy 1 + { + compare n, 1 + break-if <= + var tmp/EBX : int + tmp/EBX <- copy n + tmp/EBX <- subtract 1 + var tmp2/EAX : int + tmp2/EAX <- call factorial, tmp/EBX + result/EAX <- multiply tmp2/EAX, n + } + return result/EAX +] + +data structures: + +add entry for "factorial" into the Types table, with value (fn int -> int) +add entry for "factorial" into the address table, with value next available address +add entry for "factorial" into the size table, with value size of "0b 0a bb ..." +increase next available address by size of "factorial" diff --git a/subx/examples/ex3.k2 b/subx/examples/ex3.k2 new file mode 100644 index 00000000..aa17ac7b --- /dev/null +++ b/subx/examples/ex3.k2 @@ -0,0 +1,20 @@ +fn main [ + var result/EBX : int + result/EBX <- copy 0 + var counter/ECX : int + counter/ECX <- copy 1 + { + compare counter/ECX, 10 + break-if > + result/EBX <- add counter/ECX + counter/ECX <- add 1 + loop + } + call exit, 1 +] + +fn exit x : int [ + code/EBX <- copy x + code/EAX <- copy 1/exit + syscall +] diff --git a/subx/examples/ex4.k2 b/subx/examples/ex4.k2 new file mode 100644 index 00000000..ad968eaf --- /dev/null +++ b/subx/examples/ex4.k2 @@ -0,0 +1,17 @@ +# variables are always references +# read their address with their names: x (can't write to their address) +# read/write their contents with a lookup: *x +var x : char + +fn main [ + call read 0/stdin, x, 1/size # watch out; reading a global may not be possible in all instructions + # but the address is more easily obtained + result/EAX <- call write 1/stdout, x, 1/size + call exit, result/EAX +] + +fn exit x : int [ + code/EBX <- copy x + code/EAX <- copy 1/exit + syscall +] diff --git a/subx/examples/ex5.k2 b/subx/examples/ex5.k2 new file mode 100644 index 00000000..79920614 --- /dev/null +++ b/subx/examples/ex5.k2 @@ -0,0 +1,12 @@ +fn main [ + var x : char + call read 0/stdin, x, 1/size + result/EBX <- call write 1/stdout, x, 1/size + call exit-EBX +] + +# like exit, but assumes the code is already in EBX +fn exit-EBX [ + code/EAX <- copy 1/exit + syscall +] diff --git a/subx/examples/ex6.k2 b/subx/examples/ex6.k2 new file mode 100644 index 00000000..d9b54dd4 --- /dev/null +++ b/subx/examples/ex6.k2 @@ -0,0 +1,13 @@ +var size : int = 14 +var x : (array character) = "hello, world!" + +fn main [ + call write 1/stdout, x, size + call exit, 0 +] + +fn exit x : int [ + code/EBX <- copy x + code/EAX <- copy 1/exit + syscall +] diff --git a/subx/examples/ex7.k2 b/subx/examples/ex7.k2 new file mode 100644 index 00000000..515c5c12 --- /dev/null +++ b/subx/examples/ex7.k2 @@ -0,0 +1,24 @@ +var stream : int = 0 +var a : char = 97 +var b : char = 0 +var filename : (array char) = ".foo" + +fn main [ + call create, filename + *stream <- call open, filename, 1/wronly + call write, *stream, a, 1 + call close, *stream + stream <- call open, filename, 0/rdonly + call read, *stream, b, 1 + call close, *stream + call unlink, filename + var result/EBX : char + result/EBX <- copy b # TODO: copy char to int? + call exit-EBX +] + +# like exit, but assumes the code is already in EBX +fn exit-EBX [ + code/EAX <- copy 1/exit + syscall +] diff --git a/subx/examples/ex8.k2 b/subx/examples/ex8.k2 new file mode 100644 index 00000000..f6c359f7 --- /dev/null +++ b/subx/examples/ex8.k2 @@ -0,0 +1,41 @@ +fn main argc : int, argv : (array (ref array char)) -> [ + var tmp : (index char) + tmp <- index 1, %size(ref array char) + var tmp2 : (address (ref array char)) + tmp2 <- advance argv, tmp + var s/EBX : (ref array char) + s/EBX <- copy *tmp2 + var result/EAX : int + result/EAX <- ascii_length s/EBX + call exit, result/EAX +] + +# must +fn ascii_length s : (ref array char) -> result : int [ + var result/EBX : int + result/EBX <- copy 0 + { + var tmp0/EDI : (offset char) + tmp0/EDI <- index result/EBX, %size(char) + var tmp/EDX : (address char) + tmp/EDX <- advance *s, tmp0/EDI + => compare (ESP+4), *(ESP+8) ; '*' from compiler2 + jump-unless-equal panic + EDX <- add ESP, 8 + EDX <- copy *EDX + EDX <- add EDX, 4 + EDX <- 8d/lea EDX + result + var c/ECX : char + c/ECX <- copy *tmp + compare c/ECX, 0 + break-if-equal + loop + } + return result/EBX +] + +fn exit x : int [ + code/EBX <- copy x + code/EAX <- copy 1/exit + syscall +] diff --git a/transect/000organization.cc b/transect/000organization.cc new file mode 100644 index 00000000..9a1938ff --- /dev/null +++ b/transect/000organization.cc @@ -0,0 +1,136 @@ +//: You guessed right: the '000' prefix means you should start reading here. +//: +//: This project is set up to load all files with a numeric prefix. Just +//: create a new file and start hacking. +//: +//: The first few files (00*) are independent of what this program does, an +//: experimental skeleton that will hopefully make it both easier for others to +//: understand and more malleable, easier to rewrite and remould into radically +//: different shapes without breaking in subtle corner cases. The premise is +//: that understandability and rewrite-friendliness are related in a virtuous +//: cycle. Doing one well makes it easier to do the other. +//: +//: Lower down, this file contains a legal, bare-bones C++ program. It doesn't +//: do anything yet; subsequent files will contain :(...) directives to insert +//: lines into it. For example: +//: :(after "more events") +//: This directive means: insert the following lines after a line in the +//: program containing the words "more events". +//: +//: A simple tool is included to 'tangle' all the files together in sequence +//: according to their directives into a single source file containing all the +//: code for the project, and then feed the source file to the compiler. +//: (It'll drop these comments starting with a '//:' prefix that only make +//: sense before tangling.) +//: +//: Directives free up the programmer to order code for others to read rather +//: than as forced by the computer or compiler. Each individual feature can be +//: organized in a self-contained 'layer' that adds code to many different data +//: structures and functions all over the program. The right decomposition into +//: layers will let each layer make sense in isolation. +//: +//: "If I look at any small part of it, I can see what is going on -- I don't +//: need to refer to other parts to understand what something is doing. +//: +//: If I look at any large part in overview, I can see what is going on -- I +//: don't need to know all the details to get it. +//: +//: Every level of detail is as locally coherent and as well thought-out as +//: any other level." +//: +//: -- Richard Gabriel, "The Quality Without A Name" +//: (http://dreamsongs.com/Files/PatternsOfSoftware.pdf, page 42) +//: +//: Directives are powerful; they permit inserting or modifying any point in +//: the program. Using them tastefully requires mapping out specific lines as +//: waypoints for future layers to hook into. Often such waypoints will be in +//: comments, capitalized to hint that other layers rely on their presence. +//: +//: A single waypoint might have many different code fragments hooking into +//: it from all over the codebase. Use 'before' directives to insert +//: code at a location in order, top to bottom, and 'after' directives to +//: insert code in reverse order. By convention waypoints intended for insertion +//: before begin with 'End'. Notice below how the layers line up above the "End +//: Foo" waypoint. +//: +//: File 001 File 002 File 003 +//: ============ =================== =================== +//: // Foo +//: ------------ +//: <---- :(before "End Foo") +//: .... +//: ... +//: ------------ +//: <---------------------------- :(before "End Foo") +//: .... +//: ... +//: // End Foo +//: ============ +//: +//: Here's part of a layer in color: http://i.imgur.com/0eONnyX.png. Directives +//: are shaded dark. +//: +//: Layers do more than just shuffle code around. In a well-organized codebase +//: it should be possible to stop loading after any file/layer, build and run +//: the program, and pass all tests for loaded features. (Relevant is +//: http://youtube.com/watch?v=c8N72t7aScY, a scene from "2001: A Space +//: Odyssey".) Get into the habit of running the included script called +//: 'test_layers' before you commit any changes. +//: +//: This 'subsetting guarantee' ensures that this directory contains a +//: cleaned-up narrative of the evolution of this codebase. Organizing +//: autobiographically allows newcomers to rapidly orient themselves, reading +//: the first few files to understand a simple gestalt of a program's core +//: purpose and features, and later gradually working their way through other +//: features as the need arises. +//: +//: Programmers shouldn't need to understand everything about a program to +//: hack on it. But they shouldn't be prevented from a thorough understanding +//: of each aspect either. The goal of layers is to reward curiosity. + +// Includes +// End Includes + +// Types +// End Types + +// Function prototypes are auto-generated in the 'build*' scripts; define your +// functions in any order. Just be sure to declare each function header all on +// one line, ending with the '{'. Our auto-generation scripts are too minimal +// and simple-minded to handle anything else. +#include "function_list" // by convention, files ending with '_list' are auto-generated + +// Globals +// +// All statements in this section should always define a single variable on a +// single line. The 'build*' scripts will simple-mindedly auto-generate extern +// declarations for them. Remember to define (not just declare) constants with +// extern linkage in this section, since C++ global constants have internal +// linkage by default. +// +// End Globals + +int main(int argc, char* argv[]) { + atexit(reset); + + // End One-time Setup + + // Commandline Parsing + // End Commandline Parsing + + return 0; // End Main +} + +// Unit Tests +// End Unit Tests + +//: our first directive; insert the following header at the start of the program +:(before "End Includes") +#include <stdlib.h> + +//: Without directives or with the :(code) directive, lines get added at the +//: end. +:(code) +void reset() { + // End Reset +} diff --git a/transect/001help.cc b/transect/001help.cc new file mode 100644 index 00000000..3cab06d9 --- /dev/null +++ b/transect/001help.cc @@ -0,0 +1,261 @@ +//: Everything this project/binary supports. +//: This should give you a sense for what to look forward to in later layers. + +:(before "End Commandline Parsing") +if (argc <= 1 || is_equal(argv[1], "--help")) { + //: this is the functionality later layers will provide + // currently no automated tests for commandline arg parsing + if (argc <= 1) { + cerr << "Please provide a Mu program to run.\n" + << "\n"; + } + cerr << "Usage:\n" + << " mu [options] [test] [files]\n" + << "or:\n" + << " mu [options] [test] [files] -- [ingredients for function/recipe 'main']\n" + << "Square brackets surround optional arguments.\n" + << "\n" + << "Examples:\n" + << " To load files and run 'main':\n" + << " mu file1.mu file2.mu ...\n" + << " To run 'main' and dump a trace of all operations at the end:\n" + << " mu --trace file1.mu file2.mu ...\n" + << " To run all tests:\n" + << " mu test\n" + << " To load files and then run all tests:\n" + << " mu test file1.mu file2.mu ...\n" + << " To run a single Mu scenario:\n" + << " mu test file1.mu file2.mu ... scenario\n" + << " To run a single Mu scenario and dump a trace at the end:\n" + << " mu --trace test file1.mu file2.mu ... scenario\n" + << " To load files and run only the tests in explicitly loaded files (for apps):\n" + << " mu --test-only-app test file1.mu file2.mu ...\n" + << " To load all files with a numeric prefix in a directory:\n" + << " mu directory1 directory2 ...\n" + << " You can test directories just like files.\n" + << " mu test directory1 directory2 ...\n" + << " To pass ingredients to a mu program, provide them after '--':\n" + << " mu file_or_dir1 file_or_dir2 ... -- ingredient1 ingredient2 ...\n" + << " To see where a mu program is spending its time:\n" + << " mu --profile file_or_dir1 file_or_dir2 ...\n" + << " this slices and dices time spent in various profile.* output files\n" + << "\n" + << " To browse a trace generated by a previous run:\n" + << " mu browse-trace file\n" + ; + return 0; +} + +//: Support for option parsing. +//: Options always begin with '--' and are always the first arguments. An +//: option will never follow a non-option. +:(before "End Commandline Parsing") +char** arg = &argv[1]; +while (argc > 1 && starts_with(*arg, "--")) { + if (false) + ; // no-op branch just so any further additions can consistently always start with 'else' + // End Commandline Options(*arg) + else + cerr << "skipping unknown option " << *arg << '\n'; + --argc; ++argv; ++arg; +} + +//:: Helper function used by the above fragment of code (and later layers too, +//:: who knows?). +//: The :(code) directive appends function definitions to the end of the +//: project. Regardless of where functions are defined, we can call them +//: anywhere we like as long as we format the function header in a specific +//: way: put it all on a single line without indent, end the line with ') {' +//: and no trailing whitespace. As long as functions uniformly start this +//: way, our 'build*' scripts contain a little command to automatically +//: generate declarations for them. +:(code) +bool is_equal(char* s, const char* lit) { + return strncmp(s, lit, strlen(lit)) == 0; +} + +bool starts_with(const string& s, const string& pat) { + string::const_iterator a=s.begin(), b=pat.begin(); + for (/*nada*/; a!=s.end() && b!=pat.end(); ++a, ++b) + if (*a != *b) return false; + return b == pat.end(); +} + +//: I'll throw some style conventions here for want of a better place for them. +//: As a rule I hate style guides. Do what you want, that's my motto. But since +//: we're dealing with C/C++, the one big thing we want to avoid is undefined +//: behavior. If a compiler ever encounters undefined behavior it can make +//: your program do anything it wants. +//: +//: For reference, my checklist of undefined behaviors to watch out for: +//: out-of-bounds access +//: uninitialized variables +//: use after free +//: dereferencing invalid pointers: null, a new of size 0, others +//: +//: casting a large number to a type too small to hold it +//: +//: integer overflow +//: division by zero and other undefined expressions +//: left-shift by negative count +//: shifting values by more than or equal to the number of bits they contain +//: bitwise operations on signed numbers +//: +//: Converting pointers to types of different alignment requirements +//: T* -> void* -> T*: defined +//: T* -> U* -> T*: defined if non-function pointers and alignment requirements are same +//: function pointers may be cast to other function pointers +//: +//: Casting a numeric value into a value that can't be represented by the target type (either directly or via static_cast) +//: +//: To guard against these, some conventions: +//: +//: 0. Initialize all primitive variables in functions and constructors. +//: +//: 1. Minimize use of pointers and pointer arithmetic. Avoid 'new' and +//: 'delete' as far as possible. Rely on STL to perform memory management to +//: avoid use-after-free issues (and memory leaks). +//: +//: 2. Avoid naked arrays to avoid out-of-bounds access. Never use operator[] +//: except with map. Use at() with STL vectors and so on. +//: +//: 3. Valgrind all the things. +//: +//: 4. Avoid unsigned numbers. Not strictly an undefined-behavior issue, but +//: the extra range doesn't matter, and it's one less confusing category of +//: interaction gotchas to worry about. +//: +//: Corollary: don't use the size() method on containers, since it returns an +//: unsigned and that'll cause warnings about mixing signed and unsigned, +//: yadda-yadda. Instead use this macro below to perform an unsafe cast to +//: signed. We'll just give up immediately if a container's ever too large. +//: Basically, Mu is not concerned about this being a little slower than it +//: could be. (https://gist.github.com/rygorous/e0f055bfb74e3d5f0af20690759de5a7) +//: +//: Addendum to corollary: We're going to uniformly use int everywhere, to +//: indicate that we're oblivious to number size, and since Clang on 32-bit +//: platforms doesn't yet support multiplication over 64-bit integers, and +//: since multiplying two integers seems like a more common situation to end +//: up in than integer overflow. +:(before "End Includes") +#define SIZE(X) (assert((X).size() < (1LL<<(sizeof(int)*8-2))), static_cast<int>((X).size())) + +//: 5. Integer overflow is guarded against at runtime using the -ftrapv flag +//: to the compiler, supported by Clang (GCC version only works sometimes: +//: http://stackoverflow.com/questions/20851061/how-to-make-gcc-ftrapv-work). +:(before "atexit(reset)") +initialize_signal_handlers(); // not always necessary, but doesn't hurt +//? cerr << INT_MAX+1 << '\n'; // test overflow +//? assert(false); // test SIGABRT +:(code) +// based on https://spin.atomicobject.com/2013/01/13/exceptions-stack-traces-c +void initialize_signal_handlers() { + struct sigaction action; + bzero(&action, sizeof(action)); + action.sa_sigaction = dump_and_exit; + sigemptyset(&action.sa_mask); + sigaction(SIGABRT, &action, NULL); // assert() failure or integer overflow on linux (with -ftrapv) + sigaction(SIGILL, &action, NULL); // integer overflow on OS X (with -ftrapv) +} +void dump_and_exit(int sig, siginfo_t* /*unused*/, void* /*unused*/) { + switch (sig) { + case SIGABRT: + #ifndef __APPLE__ + cerr << "SIGABRT: might be an integer overflow if it wasn't an assert() failure or exception\n"; + _Exit(1); + #endif + break; + case SIGILL: + #ifdef __APPLE__ + cerr << "SIGILL: most likely caused by integer overflow\n"; + _Exit(1); + #endif + break; + default: + break; + } +} +:(before "End Includes") +#include <signal.h> + +//: For good measure we'll also enable SIGFPE. +:(before "atexit(reset)") +feenableexcept(FE_OVERFLOW | FE_UNDERFLOW); +//? assert(sizeof(int) == 4 && sizeof(float) == 4); +//? // | exp | mantissa +//? int smallest_subnormal = 0b00000000000000000000000000000001; +//? float smallest_subnormal_f = *reinterpret_cast<float*>(&smallest_subnormal); +//? cerr << "ε: " << smallest_subnormal_f << '\n'; +//? cerr << "ε/2: " << smallest_subnormal_f/2 << " (underflow)\n"; // test SIGFPE +:(before "End Includes") +#include <fenv.h> +:(code) +#ifdef __APPLE__ +// Public domain polyfill for feenableexcept on OS X +// http://www-personal.umich.edu/~williams/archive/computation/fe-handling-example.c +int feenableexcept(unsigned int excepts) { + static fenv_t fenv; + unsigned int new_excepts = excepts & FE_ALL_EXCEPT; + unsigned int old_excepts; + if (fegetenv(&fenv)) return -1; + old_excepts = fenv.__control & FE_ALL_EXCEPT; + fenv.__control &= ~new_excepts; + fenv.__mxcsr &= ~(new_excepts << 7); + return fesetenv(&fenv) ? -1 : old_excepts; +} +#endif + +//: 6. Map's operator[] being non-const is fucking evil. +:(before "Globals") // can't generate prototypes for these +// from http://stackoverflow.com/questions/152643/idiomatic-c-for-reading-from-a-const-map +template<typename T> typename T::mapped_type& get(T& map, typename T::key_type const& key) { + typename T::iterator iter(map.find(key)); + assert(iter != map.end()); + return iter->second; +} +template<typename T> typename T::mapped_type const& get(const T& map, typename T::key_type const& key) { + typename T::const_iterator iter(map.find(key)); + assert(iter != map.end()); + return iter->second; +} +template<typename T> typename T::mapped_type const& put(T& map, typename T::key_type const& key, typename T::mapped_type const& value) { + // map[key] requires mapped_type to have a zero-arg (default) constructor + map.insert(std::make_pair(key, value)).first->second = value; + return value; +} +template<typename T> bool contains_key(T& map, typename T::key_type const& key) { + return map.find(key) != map.end(); +} +template<typename T> typename T::mapped_type& get_or_insert(T& map, typename T::key_type const& key) { + return map[key]; +} +//: The contract: any container that relies on get_or_insert should never call +//: contains_key. + +//: 7. istreams are a royal pain in the arse. You have to be careful about +//: what subclass you try to putback into. You have to watch out for the pesky +//: failbit and badbit. Just avoid eof() and use this helper instead. +:(code) +bool has_data(istream& in) { + return in && !in.eof(); +} + +:(before "End Includes") +#include <assert.h> + +#include <iostream> +using std::istream; +using std::ostream; +using std::iostream; +using std::cin; +using std::cout; +using std::cerr; +#include <iomanip> + +#include <string.h> +#include <string> +using std::string; + +#include <algorithm> +using std::min; +using std::max; diff --git a/transect/002test.cc b/transect/002test.cc new file mode 100644 index 00000000..817b0d47 --- /dev/null +++ b/transect/002test.cc @@ -0,0 +1,104 @@ +//: A simple test harness. To create new tests, define functions starting with +//: 'test_'. To run all tests so defined, run: +//: $ ./mu test +//: +//: Every layer should include tests, and can reach into previous layers. +//: However, it seems like a good idea never to reach into tests from previous +//: layers. Every test should be a contract that always passes as originally +//: written, regardless of any later layers. Avoid writing 'temporary' tests +//: that are only meant to work until some layer. + +:(before "End Types") +typedef void (*test_fn)(void); +:(before "Globals") +// move a global ahead into types that we can't generate an extern declaration for +const test_fn Tests[] = { + #include "test_list" // auto-generated; see 'build*' scripts +}; + +:(before "End Globals") +bool Run_tests = false; +bool Passed = true; // set this to false inside any test to indicate failure + +:(before "End Includes") +#define CHECK(X) \ + if (Passed && !(X)) { \ + cerr << "\nF - " << __FUNCTION__ << "(" << __FILE__ << ":" << __LINE__ << "): " << #X << '\n'; \ + Passed = false; \ + return; /* Currently we stop at the very first failure. */ \ + } + +#define CHECK_EQ(X, Y) \ + if (Passed && (X) != (Y)) { \ + cerr << "\nF - " << __FUNCTION__ << "(" << __FILE__ << ":" << __LINE__ << "): " << #X << " == " << #Y << '\n'; \ + cerr << " got " << (X) << '\n'; /* BEWARE: multiple eval */ \ + Passed = false; \ + return; /* Currently we stop at the very first failure. */ \ + } + +:(before "End Reset") +Passed = true; + +:(before "End Commandline Parsing") +if (argc > 1 && is_equal(argv[1], "test")) { + Run_tests = true; --argc; ++argv; // shift 'test' out of commandline args +} + +:(before "End Main") +if (Run_tests) { + // Test Runs + // we run some tests and then exit; assume no state need be maintained afterward + + long num_failures = 0; + // End Test Run Initialization + time_t t; time(&t); + cerr << "C tests: " << ctime(&t); + for (size_t i=0; i < sizeof(Tests)/sizeof(Tests[0]); ++i) { +//? cerr << "running " << Test_names[i] << '\n'; + run_test(i); + if (Passed) cerr << '.'; + else ++num_failures; + } + cerr << '\n'; + // End Tests + if (num_failures > 0) { + cerr << num_failures << " failure" + << (num_failures > 1 ? "s" : "") + << '\n'; + return 1; + } + return 0; +} + +:(code) +void run_test(size_t i) { + if (i >= sizeof(Tests)/sizeof(Tests[0])) { + cerr << "no test " << i << '\n'; + return; + } + reset(); + // End Test Setup + (*Tests[i])(); + // End Test Teardown +} + +//: Convenience: run a single test +:(before "Globals") +// Names for each element of the 'Tests' global, respectively. +const string Test_names[] = { + #include "test_name_list" // auto-generated; see 'build*' scripts +}; +:(after "Test Runs") +string maybe_single_test_to_run = argv[argc-1]; +if (!starts_with(maybe_single_test_to_run, "test_")) + maybe_single_test_to_run.insert(0, "test_"); +for (size_t i=0; i < sizeof(Tests)/sizeof(Tests[0]); ++i) { + if (Test_names[i] == maybe_single_test_to_run) { + run_test(i); + if (Passed) cerr << ".\n"; + return 0; + } +} + +:(before "End Includes") +#include <stdlib.h> diff --git a/transect/003trace.cc b/transect/003trace.cc new file mode 100644 index 00000000..35db0169 --- /dev/null +++ b/transect/003trace.cc @@ -0,0 +1,409 @@ +//: The goal of layers is to make programs more easy to understand and more +//: malleable, easy to rewrite in radical ways without accidentally breaking +//: some corner case. Tests further both goals. They help understandability by +//: letting one make small changes and get feedback. What if I wrote this line +//: like so? What if I removed this function call, is it really necessary? +//: Just try it, see if the tests pass. Want to explore rewriting this bit in +//: this way? Tests put many refactorings on a firmer footing. +//: +//: But the usual way we write tests seems incomplete. Refactorings tend to +//: work in the small, but don't help with changes to function boundaries. If +//: you want to extract a new function you have to manually test-drive it to +//: create tests for it. If you want to inline a function its tests are no +//: longer valid. In both cases you end up having to reorganize code as well as +//: tests, an error-prone activity. +//: +//: In response, this layer introduces the notion of *domain-driven* testing. +//: We focus on the domain of inputs the whole program needs to handle rather +//: than the correctness of individual functions. All tests invoke the program +//: in a single way: by calling run() with some input. As the program operates +//: on the input, it traces out a list of _facts_ deduced about the domain: +//: trace("label") << "fact 1: " << val; +//: +//: Tests can now check these facts: +//: :(scenario foo) +//: 34 # call run() with this input +//: +label: fact 1: 34 # 'run' should have deduced this fact +//: -label: fact 1: 35 # the trace should not contain such a fact +//: +//: Since we never call anything but the run() function directly, we never have +//: to rewrite the tests when we reorganize the internals of the program. We +//: just have to make sure our rewrite deduces the same facts about the domain, +//: and that's something we're going to have to do anyway. +//: +//: To avoid the combinatorial explosion of integration tests, each layer +//: mainly logs facts to the trace with a common *label*. All tests in a layer +//: tend to check facts with this label. Validating the facts logged with a +//: specific label is like calling functions of that layer directly. +//: +//: To build robust tests, trace facts about your domain rather than details of +//: how you computed them. +//: +//: More details: http://akkartik.name/blog/tracing-tests +//: +//: --- +//: +//: Between layers and domain-driven testing, programming starts to look like a +//: fundamentally different activity. Instead of a) superficial, b) local rules +//: on c) code [like say http://blog.bbv.ch/2013/06/05/clean-code-cheat-sheet], +//: we allow programmers to engage with the a) deep, b) global structure of the +//: c) domain. If you can systematically track discontinuities in the domain, +//: you don't care if the code used gotos as long as it passed the tests. If +//: tests become more robust to run it becomes easier to try out radically +//: different implementations for the same program. If code is super-easy to +//: rewrite, it becomes less important what indentation style it uses, or that +//: the objects are appropriately encapsulated, or that the functions are +//: referentially transparent. +//: +//: Instead of plumbing, programming becomes building and gradually refining a +//: map of the environment the program must operate under. Whether a program is +//: 'correct' at a given point in time is a red herring; what matters is +//: avoiding regression by monotonically nailing down the more 'eventful' parts +//: of the terrain. It helps readers new and old, and rewards curiosity, to +//: organize large programs in self-similar hierarchies of example scenarios +//: colocated with the code that makes them work. +//: +//: "Programming properly should be regarded as an activity by which +//: programmers form a mental model, rather than as production of a program." +//: -- Peter Naur (http://alistair.cockburn.us/ASD+book+extract%3A+%22Naur,+Ehn,+Musashi%22) + +:(before "End Types") +struct trace_line { + int depth; // optional field just to help browse traces later + string label; + string contents; + trace_line(string l, string c) :depth(0), label(l), contents(c) {} + trace_line(int d, string l, string c) :depth(d), label(l), contents(c) {} +}; + +//: Support for tracing an entire run. +//: Traces can have a lot of overhead, so only turn them on when asked. +:(before "End Commandline Options(*arg)") +else if (is_equal(*arg, "--trace")) { + Save_trace = true; +} +:(before "End Commandline Parsing") +if (Save_trace) { + cerr << "initializing trace\n"; + Trace_stream = new trace_stream; +} +:(code) +void cleanup_main() { + if (!Trace_stream) return; + if (Save_trace) + Trace_stream->save(); + delete Trace_stream; + Trace_stream = NULL; +} +:(before "End One-time Setup") +atexit(cleanup_main); + +:(before "End Types") +// Pre-define some global constants that trace_stream needs to know about. +// Since they're in the Types section, they'll be included in any cleaved +// compilation units. So no extern linkage. +const int Max_depth = 9999; +const int Error_depth = 0; // definitely always print errors + +struct trace_stream { + vector<trace_line> past_lines; + // accumulator for current line + ostringstream* curr_stream; + string curr_label; + int curr_depth; + int collect_depth; + ofstream null_stream; // never opens a file, so writes silently fail + trace_stream() :curr_stream(NULL), curr_depth(Max_depth), collect_depth(Max_depth) {} + ~trace_stream() { if (curr_stream) delete curr_stream; } + + ostream& stream(string label) { + return stream(Max_depth, label); + } + + ostream& stream(int depth, string label) { + if (depth > collect_depth) return null_stream; + curr_stream = new ostringstream; + curr_label = label; + curr_depth = depth; + (*curr_stream) << std::hex; + return *curr_stream; + } + + void save() { + cerr << "saving trace to 'last_run'\n"; + ofstream fout("last_run"); + fout << readable_contents(""); + fout.close(); + } + + // be sure to call this before messing with curr_stream or curr_label + void newline(); + // useful for debugging + string readable_contents(string label); // empty label = show everything +}; + +:(code) +void trace_stream::newline() { + if (!curr_stream) return; + string curr_contents = curr_stream->str(); + if (!curr_contents.empty()) { + past_lines.push_back(trace_line(curr_depth, trim(curr_label), curr_contents)); // preserve indent in contents + if ((!Hide_errors && curr_depth == Error_depth) + || Dump_trace + || (!Dump_label.empty() && curr_label == Dump_label)) + cerr << curr_label << ": " << curr_contents << '\n'; + } + delete curr_stream; + curr_stream = NULL; + curr_label.clear(); + curr_depth = Max_depth; +} + +string trace_stream::readable_contents(string label) { + ostringstream output; + label = trim(label); + for (vector<trace_line>::iterator p = past_lines.begin(); p != past_lines.end(); ++p) + if (label.empty() || label == p->label) { + output << std::setw(4) << p->depth << ' ' << p->label << ": " << p->contents << '\n'; + } + return output.str(); +} + +:(before "End Globals") +trace_stream* Trace_stream = NULL; +int Trace_errors = 0; // used only when Trace_stream is NULL + +:(before "End Globals") +bool Hide_errors = false; // if set, don't print even error trace lines to screen +bool Dump_trace = false; // if set, print trace lines to screen +string Dump_label = ""; // if set, print trace lines matching a single label to screen +:(before "End Reset") +Hide_errors = false; +Dump_trace = false; +Dump_label = ""; + +:(before "End Includes") +#define CLEAR_TRACE delete Trace_stream, Trace_stream = new trace_stream; + +// Top-level helper. IMPORTANT: can't nest +#define trace(...) !Trace_stream ? cerr /*print nothing*/ : Trace_stream->stream(__VA_ARGS__) + +// Just for debugging; 'git log' should never show any calls to 'dbg'. +#define dbg trace(0, "a") +#define DUMP(label) if (Trace_stream) cerr << Trace_stream->readable_contents(label); + +// Errors are a special layer. +#define raise (!Trace_stream ? (++Trace_errors,cerr) /*do print*/ : Trace_stream->stream(Error_depth, "error")) +// If we aren't yet sure how to deal with some corner case, use assert_for_now +// to indicate that it isn't an inviolable invariant. +#define assert_for_now assert + +// Inside tests, fail any tests that displayed (unexpected) errors. +// Expected errors in tests should always be hidden and silently checked for. +:(before "End Test Teardown") +if (Passed && !Hide_errors && trace_contains_errors()) { + Passed = false; +} +:(code) +bool trace_contains_errors() { + return Trace_errors > 0 || trace_count("error") > 0; +} + +:(before "End Types") +struct end {}; +:(code) +ostream& operator<<(ostream& os, end /*unused*/) { + if (Trace_stream) Trace_stream->newline(); + return os; +} + +:(before "End Globals") +bool Save_trace = false; // if set, write out trace to disk + +// Trace_stream is a resource, lease_tracer uses RAII to manage it. +:(before "End Types") +struct lease_tracer { + lease_tracer(); + ~lease_tracer(); +}; +:(code) +lease_tracer::lease_tracer() { Trace_stream = new trace_stream; } +lease_tracer::~lease_tracer() { + if (Save_trace) Trace_stream->save(); + delete Trace_stream, Trace_stream = NULL; +} +:(before "End Includes") +#define START_TRACING_UNTIL_END_OF_SCOPE lease_tracer leased_tracer; +:(before "End Test Setup") +START_TRACING_UNTIL_END_OF_SCOPE + +:(before "End Includes") +#define CHECK_TRACE_CONTENTS(...) check_trace_contents(__FUNCTION__, __FILE__, __LINE__, __VA_ARGS__) + +#define CHECK_TRACE_CONTAINS_ERRORS() CHECK(trace_contains_errors()) +#define CHECK_TRACE_DOESNT_CONTAIN_ERRORS() \ + if (Passed && trace_contains_errors()) { \ + cerr << "\nF - " << __FUNCTION__ << "(" << __FILE__ << ":" << __LINE__ << "): unexpected errors\n"; \ + DUMP("error"); \ + Passed = false; \ + return; \ + } + +#define CHECK_TRACE_COUNT(label, count) \ + if (Passed && trace_count(label) != (count)) { \ + cerr << "\nF - " << __FUNCTION__ << "(" << __FILE__ << ":" << __LINE__ << "): trace_count of " << label << " should be " << count << '\n'; \ + cerr << " got " << trace_count(label) << '\n'; /* multiple eval */ \ + DUMP(label); \ + Passed = false; \ + return; /* Currently we stop at the very first failure. */ \ + } + +#define CHECK_TRACE_DOESNT_CONTAIN(...) CHECK(trace_doesnt_contain(__VA_ARGS__)) + +:(code) +bool check_trace_contents(string FUNCTION, string FILE, int LINE, string expected) { + if (!Passed) return false; + if (!Trace_stream) return false; + vector<string> expected_lines = split(expected, ""); + int curr_expected_line = 0; + while (curr_expected_line < SIZE(expected_lines) && expected_lines.at(curr_expected_line).empty()) + ++curr_expected_line; + if (curr_expected_line == SIZE(expected_lines)) return true; + string label, contents; + split_label_contents(expected_lines.at(curr_expected_line), &label, &contents); + for (vector<trace_line>::iterator p = Trace_stream->past_lines.begin(); p != Trace_stream->past_lines.end(); ++p) { + if (label != p->label) continue; + if (contents != trim(p->contents)) continue; + ++curr_expected_line; + while (curr_expected_line < SIZE(expected_lines) && expected_lines.at(curr_expected_line).empty()) + ++curr_expected_line; + if (curr_expected_line == SIZE(expected_lines)) return true; + split_label_contents(expected_lines.at(curr_expected_line), &label, &contents); + } + + if (line_exists_anywhere(label, contents)) { + cerr << "\nF - " << FUNCTION << "(" << FILE << ":" << LINE << "): line [" << label << ": " << contents << "] out of order in trace:\n"; + DUMP(""); + } + else { + cerr << "\nF - " << FUNCTION << "(" << FILE << ":" << LINE << "): missing [" << contents << "] in trace:\n"; + DUMP(label); + } + Passed = false; + return false; +} + +void split_label_contents(const string& s, string* label, string* contents) { + static const string delim(": "); + size_t pos = s.find(delim); + if (pos == string::npos) { + *label = ""; + *contents = trim(s); + } + else { + *label = trim(s.substr(0, pos)); + *contents = trim(s.substr(pos+SIZE(delim))); + } +} + +bool line_exists_anywhere(const string& label, const string& contents) { + for (vector<trace_line>::iterator p = Trace_stream->past_lines.begin(); p != Trace_stream->past_lines.end(); ++p) { + if (label != p->label) continue; + if (contents == trim(p->contents)) return true; + } + return false; +} + +int trace_count(string label) { + return trace_count(label, ""); +} + +int trace_count(string label, string line) { + if (!Trace_stream) return 0; + long result = 0; + for (vector<trace_line>::iterator p = Trace_stream->past_lines.begin(); p != Trace_stream->past_lines.end(); ++p) { + if (label == p->label) { + if (line == "" || trim(line) == trim(p->contents)) + ++result; + } + } + return result; +} + +int trace_count_prefix(string label, string prefix) { + if (!Trace_stream) return 0; + long result = 0; + for (vector<trace_line>::iterator p = Trace_stream->past_lines.begin(); p != Trace_stream->past_lines.end(); ++p) { + if (label == p->label) { + if (starts_with(trim(p->contents), trim(prefix))) + ++result; + } + } + return result; +} + +bool trace_doesnt_contain(string label, string line) { + return trace_count(label, line) == 0; +} + +bool trace_doesnt_contain(string expected) { + vector<string> tmp = split_first(expected, ": "); + if (SIZE(tmp) == 1) { + raise << expected << ": missing label or contents in trace line\n" << end(); + assert(false); + } + return trace_doesnt_contain(tmp.at(0), tmp.at(1)); +} + +vector<string> split(string s, string delim) { + vector<string> result; + size_t begin=0, end=s.find(delim); + while (true) { + if (end == string::npos) { + result.push_back(string(s, begin, string::npos)); + break; + } + result.push_back(string(s, begin, end-begin)); + begin = end+SIZE(delim); + end = s.find(delim, begin); + } + return result; +} + +vector<string> split_first(string s, string delim) { + vector<string> result; + size_t end=s.find(delim); + result.push_back(string(s, 0, end)); + if (end != string::npos) + result.push_back(string(s, end+SIZE(delim), string::npos)); + return result; +} + +string trim(const string& s) { + string::const_iterator first = s.begin(); + while (first != s.end() && isspace(*first)) + ++first; + if (first == s.end()) return ""; + + string::const_iterator last = --s.end(); + while (last != s.begin() && isspace(*last)) + --last; + ++last; + return string(first, last); +} + +:(before "End Includes") +#include <vector> +using std::vector; +#include <list> +using std::list; +#include <set> +using std::set; + +#include <sstream> +using std::istringstream; +using std::ostringstream; + +#include <fstream> +using std::ifstream; +using std::ofstream; diff --git a/transect/003trace.test.cc b/transect/003trace.test.cc new file mode 100644 index 00000000..67b4c345 --- /dev/null +++ b/transect/003trace.test.cc @@ -0,0 +1,124 @@ +void test_trace_check_compares() { + trace("test layer") << "foo" << end(); + CHECK_TRACE_CONTENTS("test layer: foo"); +} + +void test_trace_check_ignores_other_layers() { + trace("test layer 1") << "foo" << end(); + trace("test layer 2") << "bar" << end(); + CHECK_TRACE_CONTENTS("test layer 1: foo"); + CHECK_TRACE_DOESNT_CONTAIN("test layer 2: foo"); +} + +void test_trace_check_ignores_leading_whitespace() { + trace("test layer 1") << " foo" << end(); + CHECK_EQ(trace_count("test layer 1", /*too little whitespace*/"foo"), 1); + CHECK_EQ(trace_count("test layer 1", /*too much whitespace*/" foo"), 1); +} + +void test_trace_check_ignores_other_lines() { + trace("test layer 1") << "foo" << end(); + trace("test layer 1") << "bar" << end(); + CHECK_TRACE_CONTENTS("test layer 1: foo"); +} + +void test_trace_check_ignores_other_lines2() { + trace("test layer 1") << "foo" << end(); + trace("test layer 1") << "bar" << end(); + CHECK_TRACE_CONTENTS("test layer 1: bar"); +} + +void test_trace_ignores_trailing_whitespace() { + trace("test layer 1") << "foo\n" << end(); + CHECK_TRACE_CONTENTS("test layer 1: foo"); +} + +void test_trace_ignores_trailing_whitespace2() { + trace("test layer 1") << "foo " << end(); + CHECK_TRACE_CONTENTS("test layer 1: foo"); +} + +void test_trace_orders_across_layers() { + trace("test layer 1") << "foo" << end(); + trace("test layer 2") << "bar" << end(); + trace("test layer 1") << "qux" << end(); + CHECK_TRACE_CONTENTS("test layer 1: footest layer 2: bartest layer 1: qux"); +} + +void test_trace_supports_count() { + trace("test layer 1") << "foo" << end(); + trace("test layer 1") << "foo" << end(); + CHECK_EQ(trace_count("test layer 1", "foo"), 2); +} + +void test_trace_supports_count2() { + trace("test layer 1") << "foo" << end(); + trace("test layer 1") << "bar" << end(); + CHECK_EQ(trace_count("test layer 1"), 2); +} + +void test_trace_count_ignores_trailing_whitespace() { + trace("test layer 1") << "foo\n" << end(); + CHECK_EQ(trace_count("test layer 1", "foo"), 1); +} + +// pending: DUMP tests +// pending: readable_contents() adds newline if necessary. +// pending: raise also prints to stderr. +// pending: raise doesn't print to stderr if Hide_errors is set. +// pending: raise doesn't have to be saved if Hide_errors is set, just printed. +// pending: raise prints to stderr if Trace_stream is NULL. +// pending: raise prints to stderr if Trace_stream is NULL even if Hide_errors is set. + +// can't check trace because trace methods call 'split' + +void test_split_returns_at_least_one_elem() { + vector<string> result = split("", ","); + CHECK_EQ(result.size(), 1); + CHECK_EQ(result.at(0), ""); +} + +void test_split_returns_entire_input_when_no_delim() { + vector<string> result = split("abc", ","); + CHECK_EQ(result.size(), 1); + CHECK_EQ(result.at(0), "abc"); +} + +void test_split_works() { + vector<string> result = split("abc,def", ","); + CHECK_EQ(result.size(), 2); + CHECK_EQ(result.at(0), "abc"); + CHECK_EQ(result.at(1), "def"); +} + +void test_split_works2() { + vector<string> result = split("abc,def,ghi", ","); + CHECK_EQ(result.size(), 3); + CHECK_EQ(result.at(0), "abc"); + CHECK_EQ(result.at(1), "def"); + CHECK_EQ(result.at(2), "ghi"); +} + +void test_split_handles_multichar_delim() { + vector<string> result = split("abc,,def,,ghi", ",,"); + CHECK_EQ(result.size(), 3); + CHECK_EQ(result.at(0), "abc"); + CHECK_EQ(result.at(1), "def"); + CHECK_EQ(result.at(2), "ghi"); +} + +void test_trim() { + CHECK_EQ(trim(""), ""); + CHECK_EQ(trim(" "), ""); + CHECK_EQ(trim(" "), ""); + CHECK_EQ(trim("a"), "a"); + CHECK_EQ(trim(" a"), "a"); + CHECK_EQ(trim(" a"), "a"); + CHECK_EQ(trim(" ab"), "ab"); + CHECK_EQ(trim("a "), "a"); + CHECK_EQ(trim("a "), "a"); + CHECK_EQ(trim("ab "), "ab"); + CHECK_EQ(trim(" a "), "a"); + CHECK_EQ(trim(" a "), "a"); + CHECK_EQ(trim(" ab "), "ab"); +} diff --git a/transect/010vm.cc b/transect/010vm.cc new file mode 100644 index 00000000..fad1ee77 --- /dev/null +++ b/transect/010vm.cc @@ -0,0 +1,230 @@ +//: type definitions start with either 'record' or 'choice' + +:(before "End Types") +typedef int type_id; +:(before "End Globals") +map</*name*/string, type_id> Type_id; +map<type_id, type_info> Type_info; +type_id Next_type_id = 1; +// primitive types +type_id Literal_type_id = 0, Int_type_id = 0, Byte_type_id = 0, Address_type_id = 0, Array_type_id = 0, Ref_type_id = 0; +:(before "End Types") +struct type_info { + type_id id; + string name; + kind_of_type kind; + int size; // in bytes + vector<type_declaration> elements; + type_info() :kind(PRIMITIVE), size(0) {} +}; +:(before "struct type_info") +enum kind_of_type { + PRIMITIVE, + RECORD, + CHOICE, +}; +struct type_declaration { + string name; + vector<type_id> type; +}; + +//: global definitions start with 'var' + +:(before "End Types") +typedef int global_id; +:(before "End Globals") +map</*name*/string, global_id> Global_id; +map<global_id, global_info> Global_info; +global_id Next_global_id = 1; +:(before "End Types") +struct global_info { + global_id id; + vector<type_id> type; + int address; + global_info() :address(0) {} +}; + +//: function definitions start with 'fn' + +:(before "End Types") +typedef int function_id; +:(before "End Globals") +map</*name*/string, function_id> Function_id; +map<function_id, function_info> Function_info; +function_id Next_function_id = 1; +:(before "End Types") +struct function_info { + function_id id; + string name; + vector<operand> in; + vector<operand> in_out; + vector<instruction> instructions; + map</*local variable name*/string, int> stack_offset; + function_info() :id(0) {} +}; +:(before "struct function_info") +// operands have form name/property1/property2/... : (type1 type2 ...) +struct operand { + string name; + vector<type_id> type; + vector<string> properties; + operand(string); + void set_type(istream&); +}; + +struct instruction { + function_id id; + string name; + vector<operand> in; + vector<operand> in_out; +}; +:(code) +operand::operand(string s) { + istringstream in(s); + name = slurp_until(in, '/'); + while (has_data(in)) + properties.push_back(slurp_until(in, '/')); +} + +// extremely hacky; assumes a single-level list of words in parens, with no nesting +void operand::set_type(istream& in) { + assert(has_data(in)); + string curr; + in >> curr; +//? cerr << "2: " << curr << '\n'; + if (curr.at(0) != '(') { + type.push_back(get(Type_id, curr)); + return; + } + curr = curr.substr(/*skip '('*/1); + while (!ends_with(curr, ")")) { + if (curr.empty()) continue; + assert(curr.at(0) != '('); + type.push_back(get(Type_id, curr)); + // update + assert(has_data(in)); + in >> curr; + } + assert(ends_with(curr, ")")); + curr = curr.substr(0, SIZE(curr)-1); + if (!curr.empty()) { + /*'(' or ')' isn't a token by itself*/ + type.push_back(get(Type_id, curr)); + } +} + +string to_string(const operand& o) { + ostringstream out; + out << o.name; + if (o.type.empty()) return out.str(); + out << " : "; + if (SIZE(o.type) == 1) { + out << get(Type_info, o.type.at(0)).name; + return out.str(); + } + out << "("; + for (int i = 0; i < SIZE(o.type); ++i) { + if (i > 0) out << ", "; + out << get(Type_info, o.type.at(i)).name; + } + out << ")"; + return out.str(); +} + +string slurp_until(istream& in, char delim) { + ostringstream out; + char c; + while (in >> c) { + if (c == delim) { + // drop the delim + break; + } + out << c; + } + return out.str(); +} + +bool ends_with(const string& s, const string& pat) { + for (string::const_reverse_iterator p = s.rbegin(), q = pat.rbegin(); q != pat.rend(); ++p, ++q) { + if (p == s.rend()) return false; // pat too long + if (*p != *q) return false; + } + return true; +} + +:(before "End One-time Setup") +init_primitive_types(); +:(code) +void init_primitive_types() { + Literal_type_id = new_type("literal", PRIMITIVE, 0); + Int_type_id = new_type("int", PRIMITIVE, 4); + Byte_type_id = new_type("byte", PRIMITIVE, 1); + Address_type_id = new_type("address", PRIMITIVE, 4); + Array_type_id = new_type("array", PRIMITIVE, 0); // size will depend on length + Ref_type_id = new_type("ref", PRIMITIVE, 8); // address + alloc id +} + +type_id new_type(string name, kind_of_type kind, int size) { + assert(!contains_key(Type_id, name)); + int result = Next_type_id++; + put(Type_id, name, result); + assert(!contains_key(Type_info, result)); + type_info& curr = Type_info[result]; // insert + curr.id = result; + curr.name = name; + curr.kind = kind; + curr.size = size; + return result; +} + +//: Start each test by undoing the previous test's types, globals and functions + +:(before "End One-time Setup") +save_snapshots(); +:(before "End Reset") +restore_snapshots(); + +:(before "End Globals") +map<string, type_id> Type_id_snapshot; +map<type_id, type_info> Type_info_snapshot; +type_id Next_type_id_snapshot = 0; + +map<string, global_id> Global_id_snapshot; +map<global_id, global_info> Global_info_snapshot; +global_id Next_global_id_snapshot = 0; + +map<string, function_id> Function_id_snapshot; +map<function_id, function_info> Function_info_snapshot; +function_id Next_function_id_snapshot = 0; +:(code) +void save_snapshots() { + Type_id_snapshot = Type_id; + Type_info_snapshot = Type_info; + Next_type_id_snapshot = Next_type_id; + + Global_id_snapshot = Global_id; + Global_info_snapshot = Global_info; + Next_global_id_snapshot = Next_global_id; + + Function_id_snapshot = Function_id; + Function_info_snapshot = Function_info; + Next_function_id_snapshot = Next_function_id; +} + +void restore_snapshots() { + Type_id = Type_id_snapshot; + Type_info = Type_info_snapshot; + Next_type_id = Next_type_id_snapshot; + + Global_id = Global_id_snapshot; + Global_info = Global_info_snapshot; + Next_global_id = Next_global_id_snapshot; + + Function_id = Function_id_snapshot; + Function_info = Function_info_snapshot; + Next_function_id = Next_function_id_snapshot; +} + +:(before "End Includes") +#include <map> +using std::map; diff --git a/transect/011load.cc b/transect/011load.cc new file mode 100644 index 00000000..f8cf96e8 --- /dev/null +++ b/transect/011load.cc @@ -0,0 +1,228 @@ +//: Phase 1 of translating Mu code: load it from a textual representation. +//: +//: The process of translating Mu code: +//: load -> check types -> convert + +:(scenarios load) // use 'load' instead of 'run' in all scenarios in this layer +:(scenario single_function) +fn foo [ + 1 : int <- copy 23 +] ++parse: function: foo ++parse: 0 in operands ++parse: 0 in_out operands ++parse: instruction: copy ++parse: in => 23 : literal ++parse: in_out => 1 : int + +:(code) +void load(string form) { + istringstream in(form); + load(in); +} + +void load(istream& in) { + while (has_data(in)) { + string line_data; + getline(in, line_data); + if (line_data.empty()) continue; // maybe eof + char c = first_non_whitespace(line_data); + if (c == '\0') continue; // only whitespace + if (c == '#') continue; // only comment + trace(99, "parse") << "line: " << line_data << end(); + istringstream lin(line_data); + while (has_data(lin)) { + string word_data; + lin >> word_data; + if (word_data.empty()) continue; // maybe eof + if (word_data[0] == '#') break; // comment; ignore rest of line + if (word_data == "record") + load_record(lin, in); + else if (word_data == "choice") + load_choice(lin, in); + else if (word_data == "var") + load_global(lin, in); + else if (word_data == "fn") + load_function(lin, in); + else + raise << "unrecognized top-level keyword '" << word_data << "'; should be one of 'record', 'choice', 'var' or 'fn'\n" << end(); + break; + } + // nothing here, because we'll be at the next top-level declaration + } +} + +void load_record(istream& first_line, istream& in) { +} + +void load_choice(istream& first_line, istream& in) { +} + +void load_global(istream& first_line, istream& in) { +} + +void load_function(istream& first_line, istream& in) { + string name; + assert(has_data(first_line)); + first_line >> name; + trace(99, "parse") << "function: " << name << end(); + function_info& curr = new_function(name); + string tmp; + // read in parameters + while (has_data(first_line)) { + // read operand name + first_line >> tmp; +//? cerr << "0: " << tmp << '\n'; + if (tmp == "[") break; + if (tmp == "->") break; + assert(tmp != ":"); + curr.in.push_back(operand(tmp)); + + // skip ':' + assert(has_data(first_line)); + first_line >> tmp; +//? cerr << "1: " << tmp << '\n'; + assert(tmp == ":"); // types are required in function headers + + // read operand type + assert(has_data(first_line)); + curr.in.back().set_type(first_line); + } + // read in-out parameters + while (tmp != "[" && has_data(first_line)) { + // read operand name + first_line >> tmp; +//? cerr << "inout 0: " << tmp << '\n'; + if (tmp == "[") break; + assert(tmp != "->"); + assert(tmp != ":"); // types are required in function headers + curr.in_out.push_back(operand(tmp)); + + // skip ':' + assert(has_data(first_line)); + first_line >> tmp; +//? cerr << "inout 1: " << tmp << '\n'; + assert(tmp == ":"); + + // read operand type + assert(has_data(first_line)); + curr.in.back().set_type(first_line); + } + trace(99, "parse") << " " << SIZE(curr.in) << " in operands" << end(); + trace(99, "parse") << " " << SIZE(curr.in_out) << " in_out operands" << end(); + // not bothering checking for tokens past '[' in first_line + + // read instructions + while (has_data(in)) { + string line_data; + getline(in, line_data); + if (first_non_whitespace(line_data) == ']') break; +//? bool has_in_out = (line_data.find("<-") != string::npos); + istringstream line(line_data); + vector<string> words; + bool has_in_out = false; + while (has_data(line)) { + string w; + line >> w; + words.push_back(w); + if (w == "<-") + has_in_out = true; + } + instruction inst; + int i = 0; + assert(i < SIZE(words)); + if (has_in_out) { + while (i < SIZE(words)) { +//? cerr << "in-out operand: " << i << ' ' << words.at(i) << '\n'; + inst.in_out.push_back(operand(words.at(i))); + ++i; + assert(i < SIZE(words)); + if (words.at(i) == ":") { + ++i; // skip ':' + assert(i < SIZE(words)); + assert(words.at(i) != "<-"); + assert(words.at(i) != ":"); + istringstream tmp(words.at(i)); +//? cerr << "setting type to " << i << ' ' << words.at(i) << '\n'; + inst.in_out.back().set_type(tmp); +//? cerr << "done\n"; + ++i; + assert(i < SIZE(words)); + } + if (words.at(i) == "<-") break; + } + assert(i < SIZE(words)); + assert(words.at(i) == "<-"); + ++i; + } + assert(i < SIZE(words)); + assert(words.at(i) != "<-"); + assert(words.at(i) != ":"); + inst.name = words.at(i); + ++i; + while (i < SIZE(words)) { + inst.in.push_back(operand(words.at(i))); + ++i; + if (i < SIZE(words) && words.at(i) == ":") { + ++i; // skip ':' + assert(i < SIZE(words)); + assert(words.at(i) != "<-"); + assert(words.at(i) != ":"); + istringstream tmp(words.at(i)); + inst.in.back().set_type(tmp); + ++i; + } + else if (is_integer(inst.in.back().name)) { + inst.in.back().type.push_back(Literal_type_id); + } + } + trace(99, "parse") << "instruction: " << inst.name << end(); + for (int i = 0; i < SIZE(inst.in); ++i) + trace(99, "parse") << " in => " << to_string(inst.in.at(i)) << end(); + for (int i = 0; i < SIZE(inst.in_out); ++i) + trace(99, "parse") << " in_out => " << to_string(inst.in_out.at(i)) << end(); + curr.instructions.push_back(inst); + } +} + +function_info& new_function(string name) { + assert(!contains_key(Function_id, name)); + int id = Next_function_id++; + put(Function_id, name, id); + assert(!contains_key(Function_info, id)); + function_info& result = Function_info[id]; // insert + result.id = id; + result.name = name; + return result; +} + +char first_non_whitespace(string in) { + for (int i = 0; i < SIZE(in); ++i) + if (!isspace(in.at(i))) return in.at(i); + return '\0'; +} + +bool is_integer(const string& s) { + return s.find_first_not_of("0123456789-") == string::npos // no other characters + && s.find_first_of("0123456789") != string::npos // at least one digit + && s.find('-', 1) == string::npos; // '-' only at first position +} + +int to_integer(string n) { + char* end = NULL; + // safe because string.c_str() is guaranteed to be null-terminated + int result = strtoll(n.c_str(), &end, /*any base*/0); + if (*end != '\0') cerr << "tried to convert " << n << " to number\n"; + assert(*end == '\0'); + return result; +} + +void test_is_integer() { + CHECK(is_integer("1234")); + CHECK(is_integer("-1")); + CHECK(!is_integer("234.0")); + CHECK(is_integer("-567")); + CHECK(!is_integer("89-0")); + CHECK(!is_integer("-")); + CHECK(!is_integer("1e3")); // not supported +} diff --git a/transect/build b/transect/build new file mode 100755 index 00000000..150c6b4d --- /dev/null +++ b/transect/build @@ -0,0 +1,109 @@ +#!/bin/sh +# returns 0 on successful build or nothing to build +# non-zero exit status only on error during building +set -e # stop immediately on error + +# [0-9]*.cc -> transect.cc -> transect_bin +# (layers) | | +# tangle $CXX + +# can also be called with a layer to only build until +# $ ./build --until 050 +UNTIL_LAYER=${2:-zzz} + +# there's two mechanisms for fast builds here: +# - if a command is quick to run, always run it but update the result only on any change +# - otherwise run it only if the output is 'older_than' the inputs +# +# avoid combining both mechanisms for a single file +# otherwise you'll see spurious messages about files being updated +# risk: a file may unnecessarily update without changes, causing unnecessary work downstream + +test "$CXX" || export CXX=c++ +test "$CC" || export CC=cc +test "$CFLAGS" || export CFLAGS="-g -O3" +export CFLAGS="$CFLAGS -Wall -Wextra -ftrapv -fno-strict-aliasing" + +# return 1 if $1 is older than _any_ of the remaining args +older_than() { + local target=$1 + shift + if [ ! -e $target ] + then +#? echo "$target doesn't exist" + echo "updating $target" >&2 + return 0 # success + fi + local f + for f in $* + do + if [ $f -nt $target ] + then + echo "updating $target" >&2 + return 0 # success + fi + done + return 1 # failure +} + +# redirect to $1, unless it's already identical +update() { + if [ ! -e $1 ] + then + cat > $1 + else + cat > $1.tmp + diff -q $1 $1.tmp >/dev/null && rm $1.tmp || mv $1.tmp $1 + fi +} + +update_cp() { + if [ ! -e $2/$1 ] + then + cp $1 $2 + elif [ $1 -nt $2/$1 ] + then + cp $1 $2 + fi +} + +noisy_cd() { + cd $1 + echo "-- `pwd`" >&2 +} + +older_than ../enumerate/enumerate ../enumerate/enumerate.cc && { + $CXX $CFLAGS ../enumerate/enumerate.cc -o ../enumerate/enumerate +} + +older_than ../tangle/tangle ../tangle/*.cc && { + noisy_cd ../tangle + { + grep -h "^struct .* {" [0-9]*.cc |sed 's/\(struct *[^ ]*\).*/\1;/' + grep -h "^typedef " [0-9]*.cc + } |update type_list + grep -h "^[^ #].*) {" [0-9]*.cc |sed 's/ {.*/;/' |update function_list + ls [0-9]*.cc |grep -v "\.test\.cc$" |sed 's/.*/#include "&"/' |update file_list + ls [0-9]*.test.cc |sed 's/.*/#include "&"/' |update test_file_list + grep -h "^[[:space:]]*void test_" [0-9]*.cc |sed 's/^\s*void \(.*\)() {$/\1,/' |update test_list + grep -h "^\s*void test_" [0-9]*.cc |sed 's/^\s*void \(.*\)() {.*/"\1",/' |update test_name_list + $CXX $CFLAGS boot.cc -o tangle + ./tangle test + noisy_cd ../transect # no effect; just to show us returning to the parent directory +} + +LAYERS=$(../enumerate/enumerate --until $UNTIL_LAYER |grep '.cc$') +older_than transect.cc $LAYERS ../enumerate/enumerate ../tangle/tangle && { + # no update here; rely on 'update' calls downstream + ../tangle/tangle $LAYERS > transect.cc +} + +grep -h "^[^[:space:]#].*) {$" transect.cc |grep -v ":.*(" |sed 's/ {.*/;/' |update function_list +grep -h "^\s*void test_" transect.cc |sed 's/^\s*void \(.*\)() {.*/\1,/' |update test_list +grep -h "^\s*void test_" transect.cc |sed 's/^\s*void \(.*\)() {.*/"\1",/' |update test_name_list + +older_than transect_bin transect.cc *_list && { + $CXX $CFLAGS transect.cc -o transect_bin +} + +exit 0 diff --git a/transect/build_and_test_until b/transect/build_and_test_until new file mode 100755 index 00000000..385558be --- /dev/null +++ b/transect/build_and_test_until @@ -0,0 +1,18 @@ +#!/bin/sh +# Run tests for just a subset of layers. +# +# Usage: +# build_and_test_until [file prefix] [test name] +# Provide the second arg to run just a single test. +set -e + +# clean previous builds if they were building until a different layer +touch .until +PREV_UNTIL=`cat .until` +if [ "$PREV_UNTIL" != $1 ] +then + ./clean top-level + echo $1 > .until +fi + +./build --until $1 && ./transect_bin test $2 diff --git a/transect/clean b/transect/clean new file mode 100755 index 00000000..2c49d7d7 --- /dev/null +++ b/transect/clean @@ -0,0 +1,8 @@ +#!/bin/sh +set -e + +set -v +rm -rf transect.cc transect_bin* *_list +test $# -gt 0 && exit 0 # convenience: 'clean top-level' to leave subsidiary tools alone +rm -rf ../enumerate/enumerate ../tangle/tangle ../tangle/*_list ../*/*.dSYM +rm -rf .until diff --git a/transect/compiler10 b/transect/compiler10 new file mode 100644 index 00000000..37cf67ab --- /dev/null +++ b/transect/compiler10 @@ -0,0 +1,301 @@ +=== Goal + +A memory-safe language with a simple translator to x86 that can be feasibly written without itself needing a translator/compiler. + +Memory-safe: it should be impossible to: + a) create a pointer out of arbitrary data, or + b) to access heap memory after it's been freed. + +Simple: do all the work in a 2-pass translator: + Pass 1: check each statement's types in isolation. + Pass 2: emit code for each statement in isolation. + +=== Language summary + +Program organization is going to be fairly conventional and in the spirit of C: programs will consist of a series of type, global and function declarations. More details below. Functions will consist of a list of statements, each containing a single operation. Since we try to map directly to x86 instructions, combinations of operations and operands will not be orthogonal. You won't be able to operate at once on two memory locations, for example, since no single x86 instruction can do that. + +Statement operands will be tagged with where they lie. This mostly follows C: local variables are on the stack, and variables not on the stack are in the global segment. The one addition is that you can lay out (only word-size) variables on registers. This is kinda like C's `register` keyword, but not quite: if you don't place a variable on a register, you are *guaranteed* it won't be allocated a register. Programmers do register allocation in this language. + +The other memorable feature of the language is two kinds of pointers: a 'ref' is a fat pointer manually allocated on the heap, and an 'address' is a far more ephemeral thing described below. + +--- Ref + +Refs are used to manage heap allocations. They are fat pointers that augment the address of a payload with an allocation id. On x86 a ref requires 8 bytes: 4 for the address, and 4 for the alloc id. Refs can only ever point to the start of a heap allocation. Never within a heap allocation, and *certainly* never to the stack or global segment. + +How alloc ids work: Every heap allocation allocates an additional word of space for an alloc id in the payload, and stores a unique alloc id in the payload as well as the pointer returned to the caller. Reclaiming an allocation resets the payload's alloc id. As long as alloc ids are always unique, and as long as refs can never point to within a heap allocation, we can be guaranteed that a stale pointer whose payload has been reclaimed will end up with a mismatch between pointer alloc id and payload alloc id. + + x <- alloc # x's alloc id and *x's alloc id will be the same, say A + y <- copy x # y also has alloc id A + free x # x's alloc id is now 0, as is *x's alloc id + ..*y.. # y's alloc id is A, but *y's alloc id is 0, so we can signal an error + z <- alloc # say z reuses the same address, but now with a new alloc id A' + ..*y.. # y's alloc id is A, but *y's alloc id is A', so we can signal an error + +--- Address + +Since our statements are really simple, many operations may take multiple statements. To stitch a more complex computation like `A[i].f = 34` across multiple statements, we need addresses. + +Addresses can be used to manage any memory address. They can point inside objects, on the stack, heap or global segment. Since they are so powerful we greatly restrict their use. Addresses can only be stored in a register, never in memory on the stack or global segment. Since user-defined types will usually not fit on a register, we forbid addresses in any user-defined types. Since an address may point inside a heap allocation that can be freed, and since `free` will be a function call, addresses will not persist across function calls. Analyzing control flow to find intervening function calls can be complex, so addresses will not persist across basic block boundaries. + +The key open question with this language: can we find *clear* rules of address use that *don't complicate* programs, and that keep the type system *sound*? + +=== Language syntax + +The type system basically follows Hindley-Milner with product and (tagged) sum types. In addition we have address and ref types. Type declarations have the following syntax: + + # product type + type foo [ + x : int + y : (ref int) + z : bar + ] + + # sum type + choice bar [ + x : int + y : point + ] + +Functions have a header and a series of statements in the body: + + fn f a : int b : int -> b : int [ + ... + ] + +Statements have the following format: + + io1, io2, ... <- operation i1, i2, ... + +i1, i2 operands on the right hand side are immutable. io1, io2 are in-out operands. They're written to, and may also be read. + +Two example programs: + + i) Factorial: + + fn factorial n : int -> result/EAX : int [ + result/EAX <- copy 1 + { + compare n, 1 + break-if <= + var tmp/EBX : int + tmp/EBX <- copy n + tmp/EBX <- subtract 1 + var tmp2/EAX : int + tmp2/EAX <- call factorial, tmp/EBX + result/EAX <- multiply tmp2/EAX, n + } + return result/EAX + ] + + ii) Writing to a global variable: + + var x : char + + fn main [ + call read, 0/stdin, x, 1/size + result/EAX <- call write, 1/stdout, x, 1/size + call exit, result/EAX + ] + +One thing to note: variables refer to addresses (not to be confused with the `address` type) just like in Assembly. We'll uniformly use '*' to indicate getting at the value in an address. This will also provide a consistent hint of the addressing mode. + +=== Compilation strategy + +--- User-defined statements + +User-defined functions will be called with the same syntax as primitives. They'll translate to a sequence of push instructions (one per operand, both in and in-out), a call instruction, and a sequence of pop instructions, either to a black hole (in operands) or a location (in-out operands). This follows the standard Unix calling convention: + + push EBP + copy ESP to EBP + push arg 1 + push arg 2 + ... + call + pop arg n + ... + pop arg 1 + copy EBP to ESP + pop ESP + +Implication: each function argument needs to be something push/pop can accept. It can't be an address, so arrays and structs will either have to be passed by value, necessitating copies, or allocated on the heap. We may end up allocating members of structs in separate heap allocations just so we can pass them piecemeal to helper functions. (Mu has explored this trade-off in the past.) + +--- Primitive statements + +Operands may be: + in code (literals) + in registers + on the stack + on the global segment + +Operands are always scalar. Variables on the stack or global segment are immutable references. + + - Variables on the stack are stored at addresses like *(EBP+n) + - Global variables are stored at addresses like *disp32, where disp32 is a statically known constant + +Since the language will not be orthogonal, compilation proceeds by pattern matching over a statement along with knowledge about the types of its operands, as well as where they're stored (register/stack/global). We now enumerate mappings for various categories of statements, based on the type and location of their operands. + +Many statements will end up encoding to the exact same x86 instructions. But the types differ, and they get type-checked differently along the way. + +A. x : int <- add y + + Requires y to be scalar (32 bits). Result will always be an int. No pointer arithmetic. + + reg <- add literal => 81 0/subop 3/mod ...(0) + reg <- add reg => 01 3/mod ...(1) + reg <- add stack => 03 1/mod 4/rm32/SIB 5/base/EBP 4/index/none 0/scale n/disp8 reg/r32 ...(2) + reg <- add global => 03 0/mod 5/rm32/include-disp32 global/disp32 reg/r32 ...(3) + stack <- add literal => 81 0/subop 1/mod 4/rm32/SIB 5/base/EBP 4/index/none 0/scale n/disp8 literal/imm32 ...(4) + stack <- add reg => 01 1/mod 4/rm32/SIB 5/base/EBP 4/index/none 0/scale n/disp8 reg/r32 ...(5) + stack <- add stack => disallowed + stack <- add global => disallowed + global <- add literal => 81 0/subop 0/mod 5/rm32/include-disp32 global/disp32 literal/imm32 ...(6) + global <- add reg => 01 0/mod 5/rm32/include-disp32 global/disp32 reg/r32 ...(7) + global <- add stack => disallowed + global <- add global => disallowed + +Similarly for sub, and, or, xor and even copy. Replace the opcodes above with corresponding ones from this table: + + add sub and or xor copy/mov + reg <- op literal 81 0/subop 81 5/subop 81 4/subop 81 1/subop 81 6/subop c7 + reg <- op reg 01 or 03 29 or 2b 21 or 23 09 or 0b 31 or 33 89 or 8b + reg <- op stack 03 2b 23 0b 33 8b + reg <- op global 03 2b 23 0b 33 8b + stack <- op literal 81 0/subop 81 5/subop 81 4/subop 81 1/subop 81 6/subop c7 + stack <- op reg 01 29 21 09 31 89 + global <- op literal 81 0/subop 81 5/subop 81 4/subop 81 1/subop 81 6/subop c7 + global <- op reg 01 29 21 09 31 89 + +B. x/reg : int <- mul y + + Requires y to be scalar. + x must be in a register. Multiplies can't write to memory. + + reg <- mul literal => 69 ...(8) + reg <- mul reg => 0f af 3/mod ...(9) + reg <- mul stack => 0f af 1/mod 4/rm32/SIB 5/base/EBP 4/index/none 0/scale n/disp8 reg/r32 ...(10) + reg <- mul global => 0f af 0/mod 5/rm32/include-disp32 global/disp32 reg/r32 ...(11) + +C. x/EAX/quotient : int, y/EDX/remainder : int <- idiv z # divide EAX by z; store results in EAX and EDX + + Requires source x and z to both be scalar. + x must be in EAX and y must be in EDX. Divides can't write anywhere else. + + First clear EDX (we don't support ints larger than 32 bits): + 31/xor 3/mod 2/rm32/EDX 2/r32/EDX + + then: + EAX, EDX <- idiv literal => disallowed + EAX, EDX <- idiv reg => f7 7/subop 3/mod ...(12) + EAX, EDX <- idiv stack => f7 7/subop 1/mod 4/rm32/SIB 5/base/EBP 4/index/none 0/scale n/disp8 ...(13) + EAX, EDX <- idiv global => f7 7/subop 0/mod 5/rm32/include-disp32 global/disp32 reg/r32 ...(14) + +D. x : int <- not (weird syntax, but we'll ignore that) + + Requires x to be an int. + + reg <- not => f7 3/mod ...(15) + stack <- not => f7 1/mod 4/rm32/SIB 5/base/EBP 4/index/none 0/scale n/disp8 ...(16) + global <- not => f7 0/mod 5/rm32/include-disp32 global/disp32 reg/r32 ...(17) + +E. x : (address t) <- get o : T, %f + + (Assumes T.f has type t.) + + o can't be on a register since it's a non-primitive (likely larger than a word) + f is a literal + x must be in a register (by definition for an address) + + reg1 <- get reg2, literal => 8d/lea 1/mod reg2/rm32 literal/disp8 reg1/r32 ...(18) + reg <- get stack, literal => 8d/lea 1/mod 4/rm32/SIB 5/base/EBP 4/index/none 0/scale n+literal/disp8 reg/r32 ...(19) + (simplifying assumption: stack frames can't be larger than 256 bytes) + reg <- get global, literal => 8d/lea 0/mod 5/rm32/include-disp32 global+literal/disp32, reg/r32 ...(20) + +F. x : (offset T) <- index i : int, %size(T) + + This statement is used to translate an array index (denominated in the type of array elements) into an offset (denominated in bytes). It's just a multiply but with a new type for the result so that we can keep the type system sound. + + Since index statements translate to multiplies, 'x' must be a register. + The %size(T) argument is statically known, so will always be a literal. + + reg1 <- index reg2, literal => 69/mul 3/mod reg2/rm32 literal/imm32 -> reg1/r32 + or 68/mul 3/mod reg2/rm32 literal/imm8 -> reg1/r32 ...(21) + reg1 <- index stack, literal => 69/mul 1/mod 4/rm32/SIB 5/base/EBP 4/index/none 0/scale n/disp8 literal/imm32 -> reg1/r32 ...(22) + reg1 <- index global, literal => 69/mul 0/mod 5/rm32/include-disp32 global/disp32 literal/imm32 -> reg1/r32 ...(23) + +G. x : (address T) <- advance a : (array T), idx : (offset T) + + reg <- advance a/reg, idx/reg => 8d/lea 0/mod 4/rm32/SIB a/base idx/index 0/scale reg/r32 ...(24) + reg <- advance stack, literal => 8d/lea 1/mod 4/rm32/SIB 5/base/EBP 4/index/none 0/scale n+literal/disp8 reg/r32 ...(25) + reg <- advance stack, reg2 => 8d/lea 1/mod 4/rm32/SIB 5/base/EBP reg2/index 0/scale n/disp8 reg/r32 ...(26) + reg <- advance global, literal => 8d/lea 0/mod 5/rm32/include-disp32 global+literal/disp32, reg/r32 ...(27) + +=== Example + +Putting it all together: code generation for `a[i].y = 4` where a is an array of 2-d points with x, y coordinates. + +If a is allocated on the stack, say of type (array point 6): + + offset/EAX : (offset point) <- index i, 8 # (22) + tmp/EBX : (address point) <- advance a : (array point 6), offset/EAX # (26) + tmp2/ECX : (address number) <- get tmp/EBX : (address point), 4/y # (18) + *tmp2/ECX <- copy 4 # (5 for copy/mov with 0 disp8) + +=== More complex statements + +A couple of statement types expand to multiple instructions: + Function calls. We've already seen these above. + Bounds checking against array length in 'advance' + Dereferencing 'ref' types (see type list up top). Requires an alloc id check. + +G'. Bounds checking the 'advance' statement begins with a few extra instructions. For example: + + x/EAX : (address T) <- advance a : (array T), literal + +Suppose array 'a' lies on the stack starting at EBP+4. Its length will be at EBP+4, and the actual contents of the array will start from EBP+8. + + compare *(EBP+4), literal + jump-if-greater panic # rudimentary error handling + +Now we're ready to perform the actual 'lea': + + lea EBP+8 + literal, reg # line 25 above + +H. Dereferencing a 'ref' needs to be its own statement, yielding an address. This statement has two valid forms: + + reg : (address T) <- deref stack : (ref T) + reg : (address T) <- deref global : (ref T) + +Since refs need 8 bytes they can't be in a register. And of course the output is an address so it must be in a register. + +Compiling 'deref' will take a few instructions. Consider the following example where 's' is on the stack, say starting at EBP+4: + + EDX : (address T) <- deref s : (ref T) + +The alloc id of 's' is at *(EBP+4) and the actual address is at *(EBP+8). The above statement will compile down to the following: + + EDX/s <- copy *(EBP+8) # the address stored in s + EDX/alloc-id <- copy *EDX # alloc id of payload *s + compare EDX, *(EBP+4) # compare with alloc id of pointer + jump-unless-equal panic # rudimentary error handling + # compute *(EBP+8) + 4 + EDX <- copy *(EBP+8) # recompute the address in s because we can't save the value anywhere) + EDX <- add EDX, 4 # skip alloc id this time + +Subtleties: + a) if the alloc id of the payload is 0, then the payload is reclaimed + b) looking up the payload's alloc id *could* cause a segfault. What to do? + +=== More speculative ideas + +Initialize data segment with special extensible syntax for literals. All literals except numbers and strings start with %. Global variable declarations would now look like: + + var s : (array character) = "abc" # exception to the '%' convention + var p : point = %point(3, 4) + +=== Credits + +Forth +C +Rust +Lisp +qhasm diff --git a/transect/compiler2 b/transect/compiler2 new file mode 100644 index 00000000..5c06cc4f --- /dev/null +++ b/transect/compiler2 @@ -0,0 +1,27 @@ +to dereference a heap allocation + copy handle to stack + perform lookup to stack + +lookup x in *(ESP+4) of type (handle T) + + reg <- copy *(ESP+5) : (address T stack) + payload alloc id <- copy *reg + address alloc id <- copy *(ESP+4) + compare payload alloc id, address alloc id + jump if not equal to print stack trace and panic + address <- add reg, 1 + +types: + + address T reg + address T stack + address T heap + address T global + +copy down this spectrum is not permitted, but up is. + +addresses aren't allowed in types, globals and on the heap. Only handles. +addresses are only for temporary manipulations. + + +*(address T) <- copy T diff --git a/transect/compiler3 b/transect/compiler3 new file mode 100644 index 00000000..6bc6bf85 --- /dev/null +++ b/transect/compiler3 @@ -0,0 +1,73 @@ +== Goal + +A memory-safe language with a simple translator to x86 that can be feasibly written in x86. + +== Definitions of terms + +Memory-safe: it should be impossible to: + a) create a pointer out of arbitrary data, or + b) to access heap memory after it's been freed. + +Simple: do all the work in a 2-pass translator: + Pass 1: check each instruction's types in isolation. + Pass 2: emit code for each instruction in isolation. + +== Implications + +=> Each instruction matches a pattern and yields a template to emit. +=> There's a 1-to-1 mapping between instructions in the source language and x86 machine code. + Zero runtime. +=> Programmers have to decide how to use registers. +=> Translator can't insert any instructions that write to registers. (We don't know if a register is in use.) + +== Lessons from Mu + +1. For easy bounds checking, never advance pointers to arrays or heap allocations. No pointer arithmetic. +2. Store the array length with the array. +3. Store an allocation id with heap allocations. Allocation id goes monotonically up, never gets reused. When it wraps around to zero the program panics. +4. Heap pointers also carry around allocation id. +5. When dereferencing a heap pointer, first ensure its alloc id matches the alloc id of the payload. This ensures some other copy of the pointer didn't get freed (and potentially reused) + +== Problem 1 + +How to index into an array? + + The array has a length that needs to be checked. + Its elements have a type T. + The base will be in memory, either on the stack or the heap. + The index may be in the register, stack or heap. + +That's too much work to do in a single instruction. + +So arrays have to take multiple steps. And we have to guard against the steps +being misused in unsafe ways. + +To index into an array with elements of type T, starting with the size of the +array in bytes: + + step 1: get the offset the index is at + <reg offset> : (index T) <- index <reg/mem idx> : int, <literal> : (size T) + step 2: convert the array to address-of-element + <reg x> : (address T) <- advance <reg/mem A> : (array T), <reg offset> : (index T) + implicitly compares the offset with the size, panic if greater + => + compare <reg offset> : (index T), <reg/mem> : (array T) + jge panic + step 3: use the address to the element + ... + +(index T) is a special type. You can do only two things with it: + - pass it to the advance instruction + - convert it to a number (but no converting back) + +(address T) is a short-term pointer. You can't store addresses in structs, you +can't define global variables of that type, and you can't pass the type to the +memory allocator to save to the heap. Only place you can store an (address T) +is on the stack or a register. + +[But you can still be holding an address in a long-lived stack frame after +it's been freed?!] + +== Problem 2 + +How to dereference a heap allocation? diff --git a/transect/compiler4 b/transect/compiler4 new file mode 100644 index 00000000..8dfb8ccd --- /dev/null +++ b/transect/compiler4 @@ -0,0 +1,84 @@ +== Goal + +A memory-safe language with a simple translator to x86 that can be feasibly written in x86. + +== Definitions of terms + +Memory-safe: it should be impossible to: + a) create a pointer out of arbitrary data, or + b) to access heap memory after it's been freed. + +Simple: do all the work in a 2-pass translator: + Pass 1: check each instruction's types in isolation. + Pass 2: emit code for each instruction in isolation. + +== Implications + +=> Each instruction matches a pattern and yields a template to emit. +=> There's a 1-to-1 mapping between instructions in the source language and x86 machine code. + Zero runtime. +=> Programmers have to decide how to use registers. +=> Translator can't insert any instructions that write to registers. (We don't know if a register is in use.) + +== Lessons from Mu + +1. For easy bounds checking, never advance pointers to arrays or heap allocations. No pointer arithmetic. +2. Store the array length with the array. +3. Store an allocation id with heap allocations. Allocation id goes monotonically up, never gets reused. When it wraps around to zero the program panics. +4. Heap pointers also carry around allocation id. +5. When dereferencing a heap pointer, first ensure its alloc id matches the alloc id of the payload. This ensures some other copy of the pointer didn't get freed (and potentially reused) + +== Problem 1 + +How to index into an array? + + The array has a length that needs to be checked. + Its elements have a type T. + The base will be in memory, either on the stack or the heap. + The index may be in the register, stack or heap. + +That's too much work to do in a single instruction. + +So arrays have to take multiple steps. And we have to guard against the steps +being misused in unsafe ways. + +To index into an array with elements of type T, starting with the size of the +array in bytes: + + step 1: get the offset the index is at + <reg offset> : (index T) <- index <reg/mem idx> : int, <literal> : (size T) + step 2: convert the array to address-of-element + <reg x> : (address T) <- advance <reg/mem A> : (array T), <reg offset> : (index T) + implicitly compares the offset with the size, panic if greater + => + compare <reg offset> : (index T), <reg/mem> : (array T) + jge panic + step 3: use the address to the element + ... + +(index T) is a special type. You can do only two things with it: + - pass it to the advance instruction + - convert it to a number (but no converting back) + +(address T) is a short-term pointer. You can't store addresses in structs, you +can't define global variables of that type, and you can't pass the type to the +memory allocator to save to the heap. You also can't store addresses in the +stack, because you may encounter a free before you end the function. + +Maybe we'll also forbid any sort of copy of address types. Only place you can +store an (address T) is the register you saved to. To copy you need a handle +to a heap allocation. + +Still not entirely protected against temporal issues. But pretty close. + +== Problem 2 + +How to dereference a heap allocation? + +== List of types + +int +char +(address _) X +(array _) +(handle _) diff --git a/transect/compiler5 b/transect/compiler5 new file mode 100644 index 00000000..aeb857f4 --- /dev/null +++ b/transect/compiler5 @@ -0,0 +1,32 @@ +== Goal + +A memory-safe language with a simple translator to x86 that can be feasibly written in x86. + +== Definitions of terms + +Memory-safe: it should be impossible to: + a) create a pointer out of arbitrary data, or + b) to access heap memory after it's been freed. + +Simple: do all the work in a 2-pass translator: + Pass 1: check each instruction's types in isolation. + Pass 2: emit code for each instruction in isolation. + +== types + +int +char +(address _ t), t ∋ {stack, heap, global} +(array _ t), t ∋ {stack, heap, global} + +stack addresses can't be copied to heap or global +heap addresses can't be copied [1] +global addresses you're free to use anywhere + +[1] (address _ heap) can't be copied or stored, can't be part of a type or +choice. Only thing you can do with it is access it from the register you wrote +it to. And even that not past a call instruction. Important detail: `free()` +is a call. So an address to something on the heap can never be invalid if the +program type-checks. + +<reg x> : (address T m) <- advance <reg/mem> : (array T m), <reg offset> : (index T) diff --git a/transect/compiler6 b/transect/compiler6 new file mode 100644 index 00000000..48a7030f --- /dev/null +++ b/transect/compiler6 @@ -0,0 +1,36 @@ +== Goal + +A memory-safe language with a simple translator to x86 that can be feasibly written in x86. + +== Definitions of terms + +Memory-safe: it should be impossible to: + a) create a pointer out of arbitrary data, or + b) to access heap memory after it's been freed. + +Simple: do all the work in a 2-pass translator: + Pass 1: check each instruction's types in isolation. + Pass 2: emit code for each instruction in isolation. + +== types + +int +char +(address _) +(array _ n) +(ref _) + +addresses can't be saved to stack or global, + or included in compound types + or used across a call (to eliminate possibility of free) + +<reg x> : (address T) <- advance <reg/mem> : (array T), <reg offset> : (index T) + +arrays require a size +(ref array _) may not include a size + +== open questions +Is argv an address? +Global variables are easiest to map to addresses. +Ideally we'd represent 'indirect' as a '*' and we could just count to make +sure that an instruction never has more than one '*'. diff --git a/transect/compiler7 b/transect/compiler7 new file mode 100644 index 00000000..cf7d454f --- /dev/null +++ b/transect/compiler7 @@ -0,0 +1,46 @@ +== Goal + +A memory-safe language with a simple translator to x86 that can be feasibly written in x86. + +== Definitions of terms + +Memory-safe: it should be impossible to: + a) create a pointer out of arbitrary data, or + b) to access heap memory after it's been freed. + +Simple: do all the work in a 2-pass translator: + Pass 1: check each instruction's types in isolation. + Pass 2: emit code for each instruction in isolation. + +== types + +int +char +(address _ stack|heap|global) +(array _ n) # on stack or global +(ref _) +(ref array _) # by definition always on the heap + +addresses to global can be saved and manipulated as usual +addresses on stack can't be saved to heap +addresses on heap can't be saved to global (use ref) + +addresses to stack or heap can't be included in compound types + or used across a call + or used across a label + +<reg x> : (address T stack|global) <- advance <reg/mem> : (array T), <reg offset> : (index T) +<reg x> : (address T heap) <- advance *<mem> : (ref array T), <reg offset> : (index T) + +arrays require a size +(ref array _) may not include a size + +Arguments of type 'address' are required to be on the stack or global. Can't +be on the heap. + +So we need duplication for address and ref arguments? + +Argv has type (array (address (array char) global)) + +variables on stack, heap and global are references. The name points at the +address. Use '*' to get at the value. diff --git a/transect/compiler8 b/transect/compiler8 new file mode 100644 index 00000000..b3b35271 --- /dev/null +++ b/transect/compiler8 @@ -0,0 +1,53 @@ +== Goal + +A memory-safe language with a simple translator to x86 that can be feasibly written in x86. + +== Definitions of terms + +Memory-safe: it should be impossible to: + a) create a pointer out of arbitrary data, or + b) to access heap memory after it's been freed. + +Simple: do all the work in a 2-pass translator: + Pass 1: check each instruction's types in isolation. + Pass 2: emit code for each instruction in isolation. + +== types + +int +char +(address _) +(array _ n) +(ref _) + +== implications + +addresses can't be saved to stack or global, + or included in compound types + or used across a call (to eliminate possibility of free) + +<reg x> : (address T) <- advance <reg/mem> : (array T), <reg offset> : (index T) + +arrays require a size +(ref array _) may not include a size + +argv has type (array (ref array char)) + +variables on stack, heap and global are references. The name points at the +address. Use '*' to get at the value. + +instructions performing lookups write to register, so that we can reuse the register for temporaries +instructions performing lookups can't read from the register they write to. + But most instructions read from the register they write to?! (in-out params) + +== open questions + +If bounds checks can take multiple instructions, why not perform array +indexing in a single statement in the language? + +But we want addresses as intermediate points to combine instructions with. + +Maybe disallow addresses to function calls, but allow addresses to non-heap +structures to be used spanning function calls and labels. + +That's just for ergonomics. Doesn't add new capability. diff --git a/transect/compiler9 b/transect/compiler9 new file mode 100644 index 00000000..26becf48 --- /dev/null +++ b/transect/compiler9 @@ -0,0 +1,254 @@ +=== Goal + +A memory-safe language with a simple translator to x86 that can be feasibly +written without itself needing a translator. + +Memory-safe: it should be impossible to: + a) create a pointer out of arbitrary data, or + b) to access heap memory after it's been freed. + +Simple: do all the work in a 2-pass translator: + Pass 1: check each instruction's types in isolation. + Pass 2: emit code for each instruction in isolation. + +=== Overview of the language + +A program consists of a series of type, function and global variable declarations. +(Also constants and tests, but let's focus on these.) + +Type declarations basically follow Hindley-Milner with product and (tagged) sum +types. Types are written in s-expression form. There's a `ref` type that's a +type-safe fat pointer, with an alloc id that gets incremented after each +allocation. Memory allocation and reclamation is manual. Dereferencing a ref +after its underlying memory is reclaimed (pointer alloc id no longer matches +payload alloc id) is guaranteed to immediately kill the program (like a +segfault). + + # product type + type foo [ + x : int + y : (ref int) + z : bar + ] + + # sum type + choice bar [ + x : int + y : point + ] + +Functions have a header and a series of instructions in the body: + + fn f a : int -> b : int [ + ... + ] + +Instructions have the following format: + + io1, io2, ... <- operation i1, i2, ... + +i1, i2 operands on the right hand side are immutable. io1, io2 are in-out +operands. They're written to, and may also be read. + +User-defined functions will be called with the same syntax. They'll translate +to a sequence of push instructions (one per operand, both in and in-out), a +call instruction, and a sequence of pop instructions, either to a black hole +(in operands) or a location (in-out operands). This follows the standard Unix +calling convention. Each operand needs to be something push/pop can accept. + +Primitive operations depend on the underlying processor. We'd like each primitive +operation supported by the language to map to a single instruction in the ISA. +Sometimes we have to violate that (see below), but we definitely won't be +writing to any temporary locations behind the scenes. The language affords +control over registers, and tracking unused registers gets complex, and +besides we may have no unused registers at a specific point. Instructions only +modify their operands. + +In most ISAs, instructions operate on at most a word of data at a time. They +also tend to not have more than 2-3 operands, and not modify more than 2 +locations in memory. + +Since the number of reads from memory is limited, we break up complex high-level +operations using a special type called `address`. Addresses are strictly +short-term entities. They can't be stored in a compound type, and they can't +be passed into or returned from a user-defined function. They also can't be +used after a function call (because it could free the underlying memory) or +label (because it gets complex to check control flow, and we want to translate +each instruction simply and in isolation). + +=== Compilation to 32-bit x86 + +Values can be stored: + in code (literals) + in registers + on the stack + on the global segment + +Variables on the stack are stored at *(ESP+n) +Global variables are stored at *disp32, where disp32 is statically known + +Address variables have to be in a register. + - You need them in a register to do a lookup, and + - Saving them to even the stack increases the complexity of checks needed on + function calls or labels. + +Compilation proceeds by pattern matching over an instruction along with +knowledge about the types of its operands, as well as where they're stored +(register/stack/global). We now enumerate mappings for various categories of +instructions, based on the type and location of their operands. + +Where types of operands aren't mentioned below, all operands of an instruction +should have the same (word-length) type. + +Lots of special cases because of limitations of the x86 ISA. Beware. + +A. x : int <- add y + + Requires y to be scalar. Result will always be an int. No pointer arithmetic. + + reg <- add literal => 81 0/subop 3/mod ...(0) + reg <- add reg => 01 3/mod ...(1) + reg <- add stack => 03 1/mod 4/rm32/SIB 4/base/ESP 4/index/none 0/scale n/disp8 reg/r32 ...(2) + reg <- add global => 03 0/mod 5/rm32/include-disp32 global/disp32 reg/r32 ...(3) + stack <- add literal => 81 0/subop 1/mod 4/rm32/SIB 4/base/ESP 4/index/none 0/scale n/disp8 literal/imm32 ...(4) + stack <- add reg => 01 1/mod 4/rm32/SIB 4/base/ESP 4/index/none 0/scale n/disp8 reg/r32 ...(5) + stack <- add stack => disallowed + stack <- add global => disallowed + global <- add literal => 81 0/subop 0/mod 5/rm32/include-disp32 global/disp32 literal/imm32 ...(6) + global <- add reg => 01 0/mod 5/rm32/include-disp32 global/disp32 reg/r32 ...(7) + global <- add stack => disallowed + global <- add global => disallowed + +Similarly for sub, and, or, xor and even copy. Replace the opcodes above with corresponding ones from this table: + + add sub and or xor copy/mov + reg <- op literal 81 0/subop 81 5/subop 81 4/subop 81 1/subop 81 6/subop c7 + reg <- op reg 01 or 03 29 or 2b 21 or 23 09 or 0b 31 or 33 89 or 8b + reg <- op stack 03 2b 23 0b 33 8b + reg <- op global 03 2b 23 0b 33 8b + stack <- op literal 81 0/subop 81 5/subop 81 4/subop 81 1/subop 81 6/subop c7 + stack <- op reg 01 29 21 09 31 89 + global <- op literal 81 0/subop 81 5/subop 81 4/subop 81 1/subop 81 6/subop c7 + global <- op reg 01 29 21 09 31 89 + +B. x/reg : int <- mul y + + Requires both y to be scalar. + x must be in a register. Multiplies can't write to memory. + + reg <- mul literal => 69 ...(8) + reg <- mul reg => 0f af 3/mod ...(9) + reg <- mul stack => 0f af 1/mod 4/rm32/SIB 4/base/ESP 4/index/none 0/scale n/disp8 reg/r32 ...(10) + reg <- mul global => 0f af 0/mod 5/rm32/include-disp32 global/disp32 reg/r32 ...(11) + +C. x/EAX/quotient : int, y/EDX/remainder : int <- idiv z # divide EAX by z; store the result in EAX and EDX + + Requires source x and z to both be scalar. + x must be in EAX and y must be in EDX. Divides can't write anywhere else. + + First clear EDX (we don't support ints larger than 32 bits): + 31/xor 3/mod 2/rm32/EDX 2/r32/EDX + + then: + EAX, EDX <- idiv literal => disallowed + EAX, EDX <- idiv reg => f7 7/subop 3/mod ...(12) + EAX, EDX <- idiv stack => f7 7/subop 1/mod 4/rm32/SIB 4/base/ESP 4/index/none 0/scale n/disp8 ...(13) + EAX, EDX <- idiv global => f7 7/subop 0/mod 5/rm32/include-disp32 global/disp32 reg/r32 ...(14) + +D. x : int <- not + + Requires x to be an int. + + reg <- not => f7 3/mod ...(15) + stack <- not => f7 1/mod 4/rm32/SIB 4/base/ESP 4/index/none 0/scale n/disp8 ...(16) + global <- not => f7 0/mod 5/rm32/include-disp32 global/disp32 reg/r32 ...(17) + +E. x : (address t) <- get o : T, %f + + (Assumes T.f has type t.) + + o can't be on a register since it's a non-primitive (likely larger than a word) + f is a literal + x must be in a register (by definition for an address) + + below '*' works on either address or ref types + + For raw stack values we want to read *(ESP+n) + For raw global values we want to read *disp32 + For address stack values we want to read *(ESP+n)+ + *(ESP+n) contains an address + so we want to compute *(ESP+n) + literal + + reg1 <- get reg2, literal => 8d/lea 1/mod reg2/rm32 literal/disp8 reg1/r32 ...(18) + reg <- get stack, literal => 8d/lea 1/mod 4/rm32/SIB 4/base/ESP 4/index/none 0/scale n+literal/disp8 reg/r32 ...(19) + (simplifying assumption: stack frames can't be larger than 256 bytes) + reg <- get global, literal => 8d/lea 0/mod 5/rm32/include-disp32 global+literal/disp32, reg/r32 ...(20) + +F. x : (offset T) <- index i : int, %size(T) + + reg1 <- index reg2, literal => 69/mul 3/mod reg2/rm32 literal/imm32 -> reg1/r32 + or 68/mul 3/mod reg2/rm32 literal/imm8 -> reg1/r32 ...(21) + reg1 <- index stack, literal => 69/mul 1/mod 4/rm32/SIB 4/base/ESP 4/index/none 0/scale n/disp8 literal/imm32 -> reg1/r32 ...(22) + reg1 <- index global, literal => 69/mul 0/mod 5/rm32/include-disp32 global/disp32 literal/imm32 -> reg1/r32 ...(23) + + optimization: avoid multiply if literal is a power of 2 + use SIB byte if literal is 2, 4 or 8 + or left shift + +G. x : (address T) <- advance o : (array T), idx : (offset T) + + reg <- advance a/reg, idx/reg => 8d/lea 0/mod 4/rm32/SIB a/base idx/index 0/scale reg/r32 ...(24) + reg <- advance stack, literal => 8d/lea 1/mod 4/rm32/SIB 4/base/ESP 4/index/none 0/scale n+literal/disp8 reg/r32 ...(25) + reg <- advance stack, reg2 => 8d/lea 1/mod 4/rm32/SIB 4/base/ESP reg2/index 0/scale n/disp8 reg/r32 ...(26) + reg <- advance global, literal => 8d/lea 0/mod 5/rm32/include-disp32 global+literal/disp32, reg/r32 ...(27) + + also instructions for runtime bounds checking + +=== Example + +Putting it all together: code generation for `a[i].y = 4` where a is an array +of 2-d points with x, y coordinates. + +If a is allocated on the stack, say of type (array point 6) at (ESP+4): + + offset/EAX : (offset point) <- index i, 8 # (22) + tmp/EBX : (address point) <- advance a : (array point 6), offset/EAX # (26) + tmp2/ECX : (address number) <- get tmp/EBX : (address point), 4/y # (18) + *tmp2/ECX <- copy 4 # (5 for copy/mov with 0 disp8) + +Many instructions, particularly variants of 'get' and 'advance' -- end up encoding the exact same instructions. +But the types differ, and the type-checker checks them differently. + +=== Advanced checks + +Couple of items require inserting mapping to multiple instructions: + bounds checking against array length in 'advance' + dereferencing 'ref' types (see type list up top) + +A. Dereferencing a ref + + tmp/EDX <- advance *s, tmp0/EDI + => compare (ESP+4), *(ESP+8) ; '*' from compiler2 + jump-unless-equal panic + EDX <- add ESP, 8 + EDX <- copy *EDX + EDX <- add EDX, 4 + EDX <- 8d/lea EDX + result + +=== More speculative ideas + +Initialize data segment with special extensible syntax for literals. All +literals except numbers and strings start with %. + + %size(type) => compiler replaces with size of type + %point(3, 4) => two words + +and so on. + +=== Credits + +Forth +C +Rust +Lisp +qhasm diff --git a/transect/vimrc.vim b/transect/vimrc.vim new file mode 100644 index 00000000..d8b70fbc --- /dev/null +++ b/transect/vimrc.vim @@ -0,0 +1,36 @@ +" Highlighting literate directives in C++ sources. +function! HighlightTangledFile() + " Tangled comments only make sense in the sources and are stripped out of + " the generated .cc file. They're highlighted same as regular comments. + syntax match tangledComment /\/\/:.*/ | highlight link tangledComment Comment + syntax match tangledSalientComment /\/\/::.*/ | highlight link tangledSalientComment SalientComment + set comments-=:// + set comments-=n:// + set comments+=n://:,n:// + + " Inside tangle scenarios. + syntax region tangleDirective start=+:(+ skip=+".*"+ end=+)+ + highlight link tangleDirective Delimiter + syntax match traceContains /^+.*/ + highlight traceContains ctermfg=darkgreen + syntax match traceAbsent /^-.*/ + highlight traceAbsent ctermfg=darkred + syntax match tangleScenarioSetup /^\s*% .*/ | highlight link tangleScenarioSetup SpecialChar + + " Our C++ files can have Mu code in scenarios, so highlight Mu comments like + " regular comments. + syntax match muComment /#.*$/ + highlight link muComment Comment + syntax match muSalientComment /##.*$/ | highlight link muSalientComment SalientComment + syntax match muCommentedCode /#? .*$/ | highlight link muCommentedCode CommentedCode + set comments+=n:# +endfunction +augroup LocalVimrc + autocmd BufRead,BufNewFile *.cc call HighlightTangledFile() +augroup END + +" Scenarios considered: +" opening or starting vim with a new or existing file without an extension (should interpret as C++) +" starting vim or opening a buffer without a file name (ok to do nothing) +" opening a second file in a new or existing window (shouldn't mess up existing highlighting) +" reloading an existing file (shouldn't mess up existing highlighting) |