4614 - redo simulated RAM

Now simulated 'Memory' isn't just a single flat array. Instead it knows about segments and VMAs. The code segment will always be first, and the data/heap segment will always be second. The brk() syscall knows about the data segment. One nice side-effect is that I no longer need to mess with Memory initialization regardless of where I place my segments.
author: Kartik Agaram <vc@akkartik.com> 2018-09-28 23:08:27 -0700
committer: Kartik Agaram <vc@akkartik.com> 2018-09-29 10:20:13 -0700
commit: 630433cd9cb97cf71d24bfc8fab6fb54ce40382a (patch)
tree: cf4cffae8599489e5efcbc18b965f804d5e3a8e8 /subx
parent: fd0cf1cd07ce01c3d6fe709d55b60ff9d1d5d44f (diff)
download: mu-630433cd9cb97cf71d24bfc8fab6fb54ce40382a.tar.gz
9 files changed, 101 insertions, 54 deletions
diff --git a/subx/010---vm.cc b/subx/010---vm.cc
index 11ea60eb..c862ed99 100644
--- a/subx/010---vm.cc
+++ b/subx/010---vm.cc
@@ -89,26 +89,74 @@ SF = ZF = OF = false;
 
 //:: simulated RAM
 
+:(before "End Types")
+const uint32_t INITIAL_SEGMENT_SIZE = 0x1000 - 1;
+// Subtract one just so we can start the first segment at address 1 without
+// overflowing the first segment. Other segments will learn to adjust.
+
+// Like in real-world Linux, we'll allocate RAM for our programs in slabs
+// called VMAs or Virtual Memory Areas.
+struct vma {  // one contiguous range of simulated addresses plus the bytes backing it
+  uint32_t start;  // first address in the range (inclusive)
+  uint32_t end;  // one past the last address in the range (exclusive)
+  vector<uint8_t> _data;  // backing store; _data[i] holds the byte at address start+i
+  vma(uint32_t s, uint32_t e) :start(s), end(e) {  // cover addresses [s, e)
+    _data.resize(end-start);
+  }
+  vma(uint32_t s) :start(s), end(s+INITIAL_SEGMENT_SIZE) {  // default-sized range beginning at s
+    _data.resize(end-start);
+  }
+  bool match(uint32_t a) {  // does address a fall inside [start, end)?
+    return a >= start && a < end;
+  }
+  bool match32(uint32_t a) {  // do all 4 bytes starting at a fall inside [start, end)?
+    return a >= start && a+4 <= end;  // NOTE(review): a+4 wraps around when a > 0xfffffffb
+  }
+  uint8_t& data(uint32_t a) {  // reference to the byte stored at address a; asserts a is in range
+    assert(match(a));
+    return _data.at(a-start);  // translate the absolute address to an offset within this range
+  }
+  void grow_until(uint32_t new_end_address) {  // move 'end' up to new_end_address; never shrinks
+    if (new_end_address < end) return;  // requests below the current end are silently ignored
+    end = new_end_address;
+    _data.resize(new_end_address - start);  // resizing may relocate storage, invalidating earlier pointers into _data
+  }
+  // End vma Methods
+};
+
+:(before "End Globals")
+// RAM is made of VMAs.
+vector<vma> Mem;
+:(code)
+// The first 3 VMAs are special. When loading ELF binaries in later layers,
+// we'll assume that the first VMA is for code, the second is for data
+// (including the heap), and the third for the stack.
+void grow_code_segment(uint32_t new_end_address) {  // extend Mem[0] (the code VMA) to end at new_end_address
+  assert(!Mem.empty());  // the first VMA must already exist
+  Mem.at(0).grow_until(new_end_address);  // does nothing if new_end_address is below the current end
+}
+void grow_data_segment(uint32_t new_end_address) {  // extend Mem[1] (the data/heap VMA) to end at new_end_address
+  assert(SIZE(Mem) > 1);  // the second VMA must already exist
+  Mem.at(1).grow_until(new_end_address);  // does nothing if new_end_address is below the current end
+}
 :(before "End Globals")
-vector<uint8_t> Mem;
-uint32_t Mem_offset = 0;
-uint32_t End_of_program = 0;
+uint32_t End_of_program = 0;  // when the program executes past this address in tests we'll stop the test
+// The stack grows downward. Can't increase its size for now.
 :(before "End Reset")
 Mem.clear();
-Mem.resize(1024);
-Mem_offset = 0;
 End_of_program = 0;
 :(code)
 // These helpers depend on Mem being laid out contiguously (so you can't use a
 // map, etc.) and on the host also being little-endian.
 inline uint8_t read_mem_u8(uint32_t addr) {
-  return Mem.at(addr-Mem_offset);
+  uint8_t* handle = mem_addr_u8(addr);  // error messages get printed here
+  return handle ? *handle : 0;  // a NULL handle (no VMA contains addr) reads as 0
 }
 inline int8_t read_mem_i8(uint32_t addr) {
   return static_cast<int8_t>(read_mem_u8(addr));
 }
 inline uint32_t read_mem_u32(uint32_t addr) {
-  uint32_t* handle = mem_addr_u32(addr);
+  uint32_t* handle = mem_addr_u32(addr);  // error messages get printed here
   return handle ? *handle : 0;
 }
 inline int32_t read_mem_i32(uint32_t addr) {
@@ -116,16 +164,25 @@ inline int32_t read_mem_i32(uint32_t addr) {
 }
 
 inline uint8_t* mem_addr_u8(uint32_t addr) {
-  return &Mem.at(addr-Mem_offset);
+  for (int i = 0;  i < SIZE(Mem);  ++i)  // linear scan for the first VMA containing addr
+    if (Mem.at(i).match(addr))
+      return &Mem.at(i).data(addr);  // points into that VMA's vector storage; may dangle once the VMA grows
+  raise << "Tried to access uninitialized memory at address 0x" << HEXWORD << addr << '\n' << end();
+  return NULL;  // no VMA contains addr
 }
 inline int8_t* mem_addr_i8(uint32_t addr) {
   return reinterpret_cast<int8_t*>(mem_addr_u8(addr));
 }
 inline uint32_t* mem_addr_u32(uint32_t addr) {
-  return reinterpret_cast<uint32_t*>(mem_addr_u8(addr));
+  for (int i = 0;  i < SIZE(Mem);  ++i)  // linear scan for the first VMA holding all 4 bytes of the word
+    if (Mem.at(i).match32(addr))
+      return reinterpret_cast<uint32_t*>(&Mem.at(i).data(addr));  // views 4 bytes of the VMA's storage in place
+  raise << "Tried to access uninitialized memory at address 0x" << HEXWORD << addr << '\n' << end();
+  raise << "The entire 4-byte word should be initialized and lie in a single segment.\n" << end();
+  return NULL;  // no single VMA holds the whole word
 }
 inline int32_t* mem_addr_i32(uint32_t addr) {
-  return reinterpret_cast<int32_t*>(mem_addr_u8(addr));
+  return reinterpret_cast<int32_t*>(mem_addr_u32(addr));
 }
 // helper for some syscalls. But read-only.
 inline const char* mem_addr_string(uint32_t addr) {
@@ -149,6 +206,13 @@ inline void write_mem_i32(uint32_t addr, int32_t val) {
   if (handle != NULL) *handle = val;
 }
 
+inline bool already_allocated(uint32_t addr) {  // is addr covered by any existing VMA?
+  for (int i = 0;  i < SIZE(Mem);  ++i)  // linear scan over all VMAs
+    if (Mem.at(i).match(addr))
+      return true;
+  return false;  // no VMA contains addr
+}
+
 //:: core interpreter loop
 
 :(code)
diff --git a/subx/011run.cc b/subx/011run.cc
index d3963e3e..22eaad9d 100644
--- a/subx/011run.cc
+++ b/subx/011run.cc
@@ -224,6 +224,8 @@ void load(const program& p) {
   for (int i = 0;   i < SIZE(p.segments);  ++i) {
     const segment& seg = p.segments.at(i);
     uint32_t addr = seg.start;
+    if (!already_allocated(addr))
+      Mem.push_back(vma(seg.start));
     trace(99, "load") << "loading segment " << i << " from " << HEXWORD << addr << end();
     for (int j = 0;  j < SIZE(seg.lines);  ++j) {
       const line& l = seg.lines.at(j);
diff --git a/subx/012elf.cc b/subx/012elf.cc
index 787d914c..7bdbc548 100644
--- a/subx/012elf.cc
+++ b/subx/012elf.cc
@@ -9,8 +9,6 @@ if (is_equal(argv[1], "run")) {
   assert(argc > 2);
   reset();
   cerr << std::hex;
-  initialize_mem();
-  Mem_offset = CODE_START;
   load_elf(argv[2], argc, argv);
   while (EIP < End_of_program)  // weak final-gasp termination check
     run_one_instruction();
@@ -60,9 +58,10 @@ void load_elf_contents(uint8_t* elf_contents, size_t size, int argc, char* argv[
   // unused: e_shstrndx
 
   for (size_t i = 0;  i < e_phnum;  ++i)
-    load_segment_from_program_header(elf_contents, size, e_phoff + i*e_phentsize, e_ehsize);
+    load_segment_from_program_header(elf_contents, i, size, e_phoff + i*e_phentsize, e_ehsize);
 
   // initialize code and stack
+  Mem.push_back(vma(STACK_SEGMENT));
   Reg[ESP].u = AFTER_STACK;
   Reg[EBP].u = 0;
   EIP = e_entry;
@@ -70,6 +69,7 @@ void load_elf_contents(uint8_t* elf_contents, size_t size, int argc, char* argv[
   // initialize args on stack
   // no envp for now
   // we wastefully use a separate page of memory for argv
+  Mem.push_back(vma(ARGV_DATA_SEGMENT));
   uint32_t argv_data = ARGV_DATA_SEGMENT;
   for (int i = argc-1;  i >= /*skip 'subx_bin' and 'run'*/2;  --i) {
     push(argv_data);
@@ -89,7 +89,7 @@ void push(uint32_t val) {
   write_mem_u32(Reg[ESP].u, val);
 }
 
-void load_segment_from_program_header(uint8_t* elf_contents, size_t size, uint32_t offset, uint32_t e_ehsize) {
+void load_segment_from_program_header(uint8_t* elf_contents, int segment_index, size_t size, uint32_t offset, uint32_t e_ehsize) {
   uint32_t p_type = u32_in(&elf_contents[offset]);
   trace(90, "load") << "program header at offset " << offset << ": type " << p_type << end();
   if (p_type != 1) {
@@ -103,35 +103,36 @@ void load_segment_from_program_header(uint8_t* elf_contents, size_t size, uint32
   uint32_t p_filesz = u32_in(&elf_contents[offset + 16]);
   uint32_t p_memsz = u32_in(&elf_contents[offset + 20]);
   if (p_filesz != p_memsz)
-    raise << "Can't handle segments where p_filesz != p_memsz (see http://refspecs.linuxbase.org/elf/elf.pdf)\n" << die();
+    raise << "Can't yet handle segments where p_filesz != p_memsz (see http://refspecs.linuxbase.org/elf/elf.pdf)\n" << die();
 
   if (p_offset + p_filesz > size)
     raise << "Invalid binary; segment at offset " << offset << " is too large: wants to end at " << p_offset+p_filesz << " but the file ends at " << size << '\n' << die();
-  if (Mem.size() < p_vaddr + p_memsz)
-    Mem.resize(p_vaddr + p_memsz);
-  if (size > p_memsz) size = p_memsz;
+  if (p_memsz > INITIAL_SEGMENT_SIZE) {
+    raise << "Code segment too small for SubX; for now please manually increase INITIAL_SEGMENT_SIZE.\n" << end();
+    return;
+  }
   trace(90, "load") << "blitting file offsets (" << p_offset << ", " << (p_offset+p_filesz) << ") to addresses (" << p_vaddr << ", " << (p_vaddr+p_memsz) << ')' << end();
+  if (size > p_memsz) size = p_memsz;
+  Mem.push_back(vma(p_vaddr));
   for (size_t i = 0;  i < p_filesz;  ++i)
     write_mem_u8(p_vaddr+i, elf_contents[p_offset+i]);
-  if (End_of_program < p_vaddr+p_memsz)
+  if (segment_index == 0 && End_of_program < p_vaddr+p_memsz)
     End_of_program = p_vaddr+p_memsz;
 }
 
 :(before "End Includes")
 // Very primitive/fixed/insecure ELF segments for now.
 //   code: 0x08048000 -> 0x08048fff
-//   data: 0x08049000 -> 0x08049fff
-//   heap: 0x0804a000 -> 0x0804afff
-//   stack: 0x0804bfff -> 0x0804b000 (downward)
-const int CODE_START = 0x08048000;
+//   data/heap: 0x08050000 -> 0x08050fff
+//   stack: 0x08060fff -> 0x08060000 (downward)
 const int SEGMENT_SIZE = 0x1000;
-const int AFTER_STACK = 0x0804c000;
-const int ARGV_DATA_SEGMENT = 0x0804e000;
+const int CODE_START = 0x08048000;
+const int DATA_SEGMENT = 0x08050000;
+const int HEAP_SEGMENT = DATA_SEGMENT;
+const int STACK_SEGMENT = 0x08060000;
+const int AFTER_STACK = 0x08060ffc;  // forget final word because of the off-by-one with INITIAL_SEGMENT_SIZE;
+const int ARGV_DATA_SEGMENT = 0x08070000;
 :(code)
-void initialize_mem() {
-  Mem.resize(AFTER_STACK - CODE_START);
-}
-
 inline uint32_t u32_in(uint8_t* p) {
   return p[0] | p[1] << 8 | p[2] << 16 | p[3] << 24;
 }
diff --git a/subx/013direct_addressing.cc b/subx/013direct_addressing.cc
index 7b265a44..45e034ed 100644
--- a/subx/013direct_addressing.cc
+++ b/subx/013direct_addressing.cc
@@ -555,7 +555,8 @@ put(name, "5f", "pop top of stack to R7 (EDI)");
 
 :(scenario pop_r32)
 % Reg[ESP].u = 0x60;
-% write_mem_i32(0x60, 0x0000000a);
+% Mem.push_back(vma(0x1));  // manually allocate memory
+% write_mem_i32(0x60, 0x0000000a);  // ..before this write
 == 0x1  # code segment
 # op  ModR/M  SIB   displacement  immediate
   5b                                          # pop stack to EBX
diff --git a/subx/020syscalls.cc b/subx/020syscalls.cc
index c7e3fa47..2940b06c 100644
--- a/subx/020syscalls.cc
+++ b/subx/020syscalls.cc
@@ -75,7 +75,7 @@ void process_int80() {
     break;
   case 45:  // brk: modify size of data segment
     trace(91, "run") << "grow data segment to " << Reg[EBX].u << end();
-    resize_mem(/*new end address*/Reg[EBX].u);
+    grow_data_segment(/*new end address*/Reg[EBX].u);
     break;
   default:
     raise << HEXWORD << EIP << ": unimplemented syscall " << Reg[EAX].u << '\n' << end();
@@ -102,16 +102,3 @@ void check_mode(int reg) {
     exit(1);
   }
 }
-
-void resize_mem(uint32_t new_end_address) {
-  if (new_end_address < Mem_offset) {
-    raise << HEXWORD << EIP << ": can't shrink data segment to before code segment\n" << end();
-    return;
-  }
-  int32_t new_size = new_end_address - Mem_offset;
-  if (new_size < SIZE(Mem)) {
-    raise << HEXWORD << EIP << ": shrinking data segment is not supported.\n" << end();
-    return;
-  }
-  Mem.resize(new_size);  // will throw exception on failure
-}
diff --git a/subx/034compute_segment_address.cc b/subx/034compute_segment_address.cc
index f5f383b6..71a18452 100644
--- a/subx/034compute_segment_address.cc
+++ b/subx/034compute_segment_address.cc
@@ -2,9 +2,7 @@
 //: segment.
 //: This gives up a measure of control in placing code and data.
 
-//: segment address computation requires setting Mem_offset in test mode to what it'll be in run mode
 :(scenario segment_name)
-% Mem_offset = CODE_START;
 == code
 05/add 0x0d0c0b0a/imm32  # add 0x0d0c0b0a to EAX
 # code starts at 0x08048000 + p_offset, which is 0x54 for a single-segment binary
diff --git a/subx/035labels.cc b/subx/035labels.cc
index 207b09b1..96668075 100644
--- a/subx/035labels.cc
+++ b/subx/035labels.cc
@@ -231,7 +231,6 @@ xz:
 //: ignore them.
 
 :(scenario segment_size_ignores_labels)
-% Mem_offset = CODE_START;
 == code  # 0x08048074
 05/add 0x0d0c0b0a/imm32  # 5 bytes
 foo:                     # 0 bytes
diff --git a/subx/036global_variables.cc b/subx/036global_variables.cc
index 42790c0c..c565014f 100644
--- a/subx/036global_variables.cc
+++ b/subx/036global_variables.cc
@@ -7,8 +7,6 @@
 //: This layer much the same structure as rewriting labels.
 
 :(scenario global_variable)
-% Mem_offset = CODE_START;
-% Mem.resize(0x2000);
 == code
 b9/copy x/imm32  # copy to ECX
 == data
@@ -147,11 +145,9 @@ x:
 #? +error: can't call to the data segment ('x')
 
 :(scenario disp32_data_with_modrm)
-% Mem_offset = CODE_START;
-% Mem.resize(0x2000);
 == code
 8b/copy 0/mod/indirect 5/rm32/.disp32 2/r32/EDX x/disp32
-==
+== data
 x:
 00 00 00 00
 $error: 0
diff --git a/subx/038---literal_strings.cc b/subx/038---literal_strings.cc
index 97542f43..b17947c0 100644
--- a/subx/038---literal_strings.cc
+++ b/subx/038---literal_strings.cc
@@ -4,10 +4,9 @@
 //: always be the second segment).
 
 :(scenario transform_literal_string)
-% Mem_offset = CODE_START;
-% Mem.resize(AFTER_STACK - CODE_START);
 == code
   b8/copy "test"/imm32  # copy to EAX
+== data  # need to manually create this for now
 +transform: -- move literal strings to data segment
 +transform: adding global variable '__subx_global_1' containing "test"
 +transform: instruction after transform: 'b8 __subx_global_1'
author	Kartik Agaram <vc@akkartik.com>	2018-09-28 23:08:27 -0700
committer	Kartik Agaram <vc@akkartik.com>	2018-09-29 10:20:13 -0700
commit	630433cd9cb97cf71d24bfc8fab6fb54ce40382a (patch)
tree	cf4cffae8599489e5efcbc18b965f804d5e3a8e8 /subx
parent	fd0cf1cd07ce01c3d6fe709d55b60ff9d1d5d44f (diff)
download	mu-630433cd9cb97cf71d24bfc8fab6fb54ce40382a.tar.gz