5 files changed, 198 insertions, 1 deletions
diff --git a/build b/build
index 9e95bb34..9028bfa3 100755
--- a/build
+++ b/build
@@ -106,4 +106,9 @@ older_than subx_bin subx.cc *_list && {
   $CXX $CFLAGS subx.cc -o subx_bin
 }
 
+older_than treeshake treeshake.cc && {
+  echo $CXX $CFLAGS treeshake.cc -o treeshake
+  $CXX $CFLAGS treeshake.cc -o treeshake
+}
+
 exit 0
diff --git a/clean b/clean
index ef0eb481..aeb8f520 100755
--- a/clean
+++ b/clean
@@ -5,7 +5,7 @@ set -v
 rm -rf subx.cc subx_bin* *_list
 rm -rf .until
 test $# -gt 0 && exit 0  # convenience: 'clean top-level' to leave subsidiary tools alone
-rm -rf enumerate/enumerate tangle/tangle tangle/*_list */*.dSYM
+rm -rf enumerate/enumerate tangle/tangle tangle/*_list */*.dSYM treeshake
 rm -rf browse_trace/browse_trace_bin browse_trace/*_list
 rm -rf tmp mu-linux.iso outfs initrd.fat mu-soso.iso
 ( cd kernel.soso  &&  make clean; )
diff --git a/treeshake.cc b/treeshake.cc
new file mode 100644
index 00000000..db8b5135
--- /dev/null
+++ b/treeshake.cc
@@ -0,0 +1,104 @@
+// Read a set of lines on stdin of the following form:
+//  definition:
+//    ...
+//    ...
+//
+// Delete all 'dead' definitions with following indented lines that aren't
+// used outside their bodies.
+//
+// This can be transitive; deleting one definition may cause other definitions
+// to become dead.
+//
+// Also assorts segments as a side-effect.
+//
+// Like linkify, treeshake is a hack.
+
+#include<assert.h>
+
+#include<map>
+using std::map;
+#include<vector>
+using std::vector;
+#define SIZE(X) (assert((X).size() < (1LL<<(sizeof(int)*8-2))), static_cast<int>((X).size()))
+
+#include<string>
+using std::string;
+
+#include<iostream>
+using std::cin;
+using std::cout;
+
+#include<sstream>
+using std::istringstream;
+
+bool starts_with(const string& s, const string& pat) {
+  string::const_iterator a=s.begin(), b=pat.begin();
+  for (/*nada*/;  a!=s.end() && b!=pat.end();  ++a, ++b)
+    if (*a != *b) return false;
+  return b == pat.end();
+}
+
+void read_body(string name, string definition_line, map<string, vector<string> >& segment) {
+  // last definition wins; this only matters for the 'Entry' label in the code segment
+  segment[name] = vector<string>();
+  segment[name].push_back(definition_line);
+  while (!cin.eof()) {
+    if (cin.peek() != ' ' && cin.peek() != '$') break;  // assumes: no whitespace but spaces; internal labels start with '$'
+    string line;
+    getline(cin, line);
+    segment[name].push_back(line);
+  }
+}
+
+void read_lines(string segment_header, map<string, vector<string> >& segment) {
+  // first segment header wins
+  if (segment.empty())
+    segment["=="].push_back(segment_header);  // '==' is a special key containing the segment header
+  while (!cin.eof()) {
+    if (cin.peek() == '=') break;  // assumes: no line can start with '=' except a segment header
+    assert(cin.peek() != ' ');  // assumes: no whitespace but spaces
+    string line;
+    getline(cin, line);
+    istringstream lstream(line);
+    string name;
+    getline(lstream, name, ' ');
+    assert(name[SIZE(name)-1] == ':');
+    name.erase(--name.end());
+    read_body(name, line, segment);
+  }
+}
+
+void read_lines(map<string, vector<string> >& code, map<string, vector<string> >& data) {
+  while (!cin.eof()) {
+    string line;
+    getline(cin, line);
+    assert(starts_with(line, "== "));
+    map<string, vector<string> >& curr = (line.substr(3, 4) == "code") ? code : data;  // HACK: doesn't support segments except 'code' and 'data'
+    read_lines(line, curr);
+  }
+}
+
+void treeshake(const map<string, vector<string> >& code, map<string, vector<string> >& data) {
+}
+
+void dump(const map<string, vector<string> > definitions) {
+  // nothing special needed for segment headers, since '=' precedes all alphabet in ASCII
+  for (map<string, vector<string> >::const_iterator p = definitions.begin();  p != definitions.end();  ++p) {
+    const vector<string>& lines = p->second;
+    for (int i = 0;  i < SIZE(lines);  ++i)
+      cout << lines[i] << '\n';
+  }
+}
+
+int main(int argc, const char* argv[]) {
+  map<string, vector<string> > code, data;
+  read_lines(code, data);
+  while (true) {
+    int old_csize = SIZE(code), old_dsize = SIZE(data);
+    treeshake(code, data);
+    if (SIZE(code) == old_csize && SIZE(data) == old_dsize) break;
+  }
+  dump(code);
+  dump(data);
+  return 0;
+}
diff --git a/treeshake_all b/treeshake_all
new file mode 100755
index 00000000..2a13f50a
--- /dev/null
+++ b/treeshake_all
@@ -0,0 +1,46 @@
+#!/bin/sh
+# Build minimal-size versions of all apps.
+# Hacky; only intended for some stats at the moment.
+
+set -e
+cd `dirname $0`
+
+./build
+
+export OS=${OS:-linux}
+
+for app in factorial crenshaw2-1 crenshaw2-1b
+do
+  echo $app
+  ./treeshake_translate init.$OS 0*.subx apps/$app.subx
+  mv a.treeshake apps/$app.treeshake
+  mv a.elf apps/$app.treeshake.bin
+  apps/$app test
+  echo
+  apps/$app.treeshake.bin test
+  echo
+done
+
+# Phases of the self-hosted SubX translator.
+
+for app in hex survey pack assort dquotes tests sigils calls braces
+do
+  echo $app
+  ./treeshake_translate init.$OS 0*.subx apps/subx-params.subx apps/$app.subx
+  mv a.treeshake apps/$app.treeshake
+  mv a.elf apps/$app.treeshake.bin
+  apps/$app test
+  echo
+  apps/$app.treeshake.bin test
+  echo
+done
+
+# Mu translator
+echo mu.subx
+./treeshake_translate init.$OS 0*.subx apps/mu.subx
+mv a.treeshake apps/mu.treeshake
+mv a.elf apps/mu.treeshake.bin
+apps/mu test
+echo
+apps/mu.treeshake.bin test
+echo
diff --git a/treeshake_translate b/treeshake_translate
new file mode 100755
index 00000000..beb19aa5
--- /dev/null
+++ b/treeshake_translate
@@ -0,0 +1,42 @@
+#!/bin/sh
+# Translate SubX into minimal ELF binaries for Linux.
+# This script is a hack; see the other *translate scripts instead.
+
+set -e
+
+./build
+
+grep -vh '^\s*#\|^\s*$' $* |./treeshake > a.treeshake
+
+cat a.treeshake |apps/braces   > a.braces
+
+cat a.braces    |apps/calls    > a.calls
+
+cat a.calls     |apps/sigils   > a.sigils
+
+cat a.sigils    |apps/tests    > a.tests
+
+cat a.tests     |apps/assort   > a.assort
+
+cat a.assort    |apps/dquotes  > a.dquotes
+
+# A little hack. We want ntranslate to always emit identical binaries to the
+# C++ translator. The C++ translator assorts segments before it processes
+# string literals, so we follow the same order above.
+#
+# However, dquotes currently emits a separate data segment for string literals.
+# So we need to run assort a second time to clean up after it.
+#
+# Potential solutions:
+#   a) modify C++ translator to process string literals before assorting.
+#   b) clean up dquotes to assume assorted segments, and append to the
+#   existing data segment.
+cat a.dquotes   |apps/assort   > a.assort2
+
+cat a.assort2   |apps/pack     > a.pack
+
+cat a.pack      |apps/survey   > a.survey
+
+cat a.survey    |apps/hex      > a.elf
+
+chmod +x a.elf