about summary refs log tree commit diff stats
path: root/archive/2.transect/compiler4
blob: 8dfb8ccdb991c31bdec5ba9f8f5711ba407cb8bd (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
== Goal

A memory-safe language with a simple translator to x86 that can be feasibly written in x86.

== Definitions of terms

Memory-safe: it should be impossible to:
  a) create a pointer out of arbitrary data, or
  b) to access heap memory after it's been freed.

Simple: do all the work in a 2-pass translator:
  Pass 1: check each instruction's types in isolation.
  Pass 2: emit code for each instruction in isolation.

== Implications

=> Each instruction matches a pattern and yields a template to emit.
=> There's a 1-to-1 mapping between instructions in the source language and x86 machine code.
  Zero runtime.
=> Programmers have to decide how to use registers.
=> Translator can't insert any instructions that write to registers. (We don't know if a register is in use.)

== Lessons from Mu

1. For easy bounds checking, never advance pointers to arrays or heap allocations. No pointer arithmetic.
2. Store the array length with the array.
3. Store an allocation id with heap allocations. Allocation id goes monotonically up, never gets reused. When it wraps around to zero the program panics.
4. Heap pointers also carry around allocation id.
5. When dereferencing a heap pointer, first ensure its alloc id matches the alloc id of the payload. This ensures some other copy of the pointer didn't get freed (and potentially reused)

== Problem 1

How to index into an array?

  The array has a length that needs to be checked.
  Its elements have a type T.
  The base will be in memory, either on the stack or the heap.
  The index may be in the register, stack or heap.

That's too much work to do in a single instruction.

So arrays have to take multiple steps. And we have to guard against the steps
being misused in unsafe ways.

To index into an array with elements of type T, starting with the size of the
array in bytes:

  step 1: get the offset the index is at
    <reg offset> : (index T) <- index <reg/mem idx> : int, <literal> : (size T)
  step 2: convert the array to address-of-element
    <reg x> : (address T) <- advance <reg/mem A> : (array T), <reg offset> : (index T)
    implicitly compares the offset with the size, panic if greater
    =>
      compare <reg offset> : (index T), <reg/mem> : (array T)
      jge panic
  step 3: use the address to the element
    ...

(index T) is a special type. You can do only two things with it:
  - pass it to the advance instruction
  - convert it to a number (but no converting back)

(address T) is a short-term pointer. You can't store addresses in structs, you
can't define global variables of that type, and you can't pass the type to the
memory allocator to save to the heap. You also can't store addresses in the
stack, because you may encounter a free before you end the function.

Maybe we'll also forbid any sort of copy of address types. Only place you can
store an (address T) is the register you saved to. To copy you need a handle
to a heap allocation.

Still not entirely protected against temporal issues. But pretty close.

== Problem 2

How to dereference a heap allocation?

== List of types

int 
char
(address _)   X  
(array _)
(handle _)
*/ .highlight .dl { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Delimiter */ .highlight .sd { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Doc */ .highlight .s2 { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Double */ .highlight .se { color: #0044dd; background-color: #fff0f0 } /* Literal.String.Escape */ .highlight .sh { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Heredoc */ .highlight .si { color: #3333bb; background-color: #fff0f0 } /* Literal.String.Interpol */ .highlight .sx { color: #22bb22; background-color: #f0fff0 } /* Literal.String.Other */ .highlight .sr { color: #008800; background-color: #fff0ff } /* Literal.String.Regex */ .highlight .s1 { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Single */ .highlight .ss { color: #aa6600; background-color: #fff0f0 } /* Literal.String.Symbol */ .highlight .bp { color: #003388 } /* Name.Builtin.Pseudo */ .highlight .fm { color: #0066bb; font-weight: bold } /* Name.Function.Magic */ .highlight .vc { color: #336699 } /* Name.Variable.Class */ .highlight .vg { color: #dd7700 } /* Name.Variable.Global */ .highlight .vi { color: #3333bb } /* Name.Variable.Instance */ .highlight .vm { color: #336699 } /* Name.Variable.Magic */ .highlight .il { color: #0000DD; font-weight: bold } /* Literal.Number.Integer.Long */
# read up to 'len' graphemes after skipping the first 'start' ones
fn substring in: (addr array byte), start: int, len: int, out-ah: (addr handle array byte) {
  var in-stream: (stream byte 0x100)
  var in-stream-addr/esi: (addr stream byte) <- address in-stream
  write in-stream-addr, in
  var out-stream: (stream byte 0x100)
  var out-stream-addr/edi: (addr stream byte) <- address out-stream
  $substring:core: {
    # skip 'start' graphemes
    var i/eax: int <- copy 0
    {
      compare i, start
      break-if->=
      {
        var dummy/eax: grapheme <- read-grapheme in-stream-addr
        compare dummy, 0xffffffff/end-of-file
        break-if-= $substring:core
      }
      i <- increment
      loop
    }
    # copy 'len' graphemes
    i <- copy 0
    {
      compare i, len
      break-if->=
      {
        var g/eax: grapheme <- read-grapheme in-stream-addr
        compare g, 0xffffffff/end-of-file
        break-if-= $substring:core
        write-grapheme out-stream-addr, g
      }
      i <- increment
      loop
    }
  }
  stream-to-array out-stream-addr, out-ah
}

fn test-substring {
  var out-h: (handle array byte)
  var out-ah/edi: (addr handle array byte) <- address out-h
  # prefix substrings
  substring 0, 0, 3, out-ah
  var out/eax: (addr array byte) <- lookup *out-ah
  check-strings-equal out, "", "F - test-substring/null"
  substring "", 0, 3, out-ah
  var out/eax: (addr array byte) <- lookup *out-ah
#?   print-string-to-real-screen out
#?   print-string-to-real-screen "\n"
  check-strings-equal out, "", "F - test-substring/empty"
  #
  substring "abcde", 0, 3, out-ah
  var out/eax: (addr array byte) <- lookup *out-ah
#?   print-string-to-real-screen out
#?   print-string-to-real-screen "\n"
  check-strings-equal out, "abc", "F - test-substring/truncate"
  #
  substring "abcde", 0, 5, out-ah
  var out/eax: (addr array byte) <- lookup *out-ah
  check-strings-equal out, "abcde", "F - test-substring/all"
  #
  substring "abcde", 0, 7, out-ah
  var out/eax: (addr array byte) <- lookup *out-ah
  check-strings-equal out, "abcde", "F - test-substring/too-small"
  # substrings outside string
  substring "abcde", 6, 1, out-ah
  var out/eax: (addr array byte) <- lookup *out-ah
  check-strings-equal out, "", "F - test-substring/start-too-large"
  # trim prefix
  substring "", 2, 3, out-ah
  var out/eax: (addr array byte) <- lookup *out-ah
  check-strings-equal out, "", "F - test-substring/middle-empty"
  #
  substring "abcde", 1, 2, out-ah
  var out/eax: (addr array byte) <- lookup *out-ah
  check-strings-equal out, "bc", "F - test-substring/middle-truncate"
  #
  substring "abcde", 1, 4, out-ah
  var out/eax: (addr array byte) <- lookup *out-ah
  check-strings-equal out, "bcde", "F - test-substring/middle-all"
  #
  substring "abcde", 1, 5, out-ah
  var out/eax: (addr array byte) <- lookup *out-ah
  check-strings-equal out, "bcde", "F - test-substring/middle-too-small"
}

fn split-string in: (addr array byte), delim: grapheme, out: (addr handle array (handle array byte)) {
  var in-stream: (stream byte 0x100)
  var in-stream-addr/esi: (addr stream byte) <- address in-stream
  write in-stream-addr, in
  var tokens-stream: (stream (handle array byte) 0x100)
  var tokens-stream-addr/edi: (addr stream (handle array byte)) <- address tokens-stream
  var curr-stream: (stream byte 0x100)
  var curr-stream-addr/ecx: (addr stream byte) <- address curr-stream
  $split-string:core: {
    var g/eax: grapheme <- read-grapheme in-stream-addr
    compare g, 0xffffffff
    break-if-=
#?     print-grapheme-to-real-screen g
#?     print-string-to-real-screen "\n"
    compare g, delim
    {
      break-if-!=
      # token complete; flush
      var token: (handle array byte)
      var token-ah/eax: (addr handle array byte) <- address token
      stream-to-array curr-stream-addr, token-ah
      write-to-stream tokens-stream-addr, token-ah
      clear-stream curr-stream-addr
      loop $split-string:core
    }
    write-grapheme curr-stream-addr, g
    loop
  }
  stream-to-array tokens-stream-addr, out
}

fn test-split-string {
  var out-h: (handle array (handle array byte))
  var out-ah/edi: (addr handle array (handle array byte)) <- address out-h
  # prefix substrings
  split-string "bab", 0x61, out-ah
  # no crash
}