about summary refs log tree commit diff stats
path: root/cleave
diff options
context:
space:
mode:
authorKartik K. Agaram <vc@akkartik.com>2016-08-29 21:42:41 -0700
committerKartik K. Agaram <vc@akkartik.com>2016-08-29 21:47:13 -0700
commita800400c360c302a06c4127a34023b92244bcbf6 (patch)
tree8bbd071e3f365caec6dcd2e641d2e7667a77ee62 /cleave
parent45912cf8c7ffa429eab1dfb66b8851752ae7ff27 (diff)
downloadmu-a800400c360c302a06c4127a34023b92244bcbf6.tar.gz
3281 - faster incremental builds for layers
Before:

  layers -> tangle -> g++

  All changes to (C++) layers triggered recompilation of everything,
  taking 35s on my laptop, and over 4 minutes on a puny server with just
  512MB of RAM.

After:

  layers -> tangle -> cleave -> g++

  Now a tiny edit takes just 5s to recompile on my laptop.

My initial approach was to turn each function into a separate
compilation unit under the .build/ directory. That blew up the time for
a full/initial compilation to almost 6 minutes on my laptop. Trial and
error showed 4 compilation units to be close to the sweet spot. Full
compilation is still slightly slower (43s) but not by much.

I could speed things up further by building multiple of the compilation
units in parallel (the recursive invocation in 'makefile'). But that
would put more pressure on a puny server, so I'm going to avoid getting
too aggressive.

--- Other considerations

I spent some time manually testing the dependency structure to the
makefile, making sure that files aren't unnecessarily written to disk,
modifying their timestamp and triggering dependent work; that changes to
layers don't unnecessarily modify the common headers or list of globals;
that changes to the cleave/ tool itself rebuild the entire project; that
the old auto-generated '_list' files plug in at the right stage in the
pipeline; that changes to common headers trigger recompilation of
everything; etc. Too bad it's not easy to write some tests for all this.

I spent some time trying to make sure the makefile was not too opaque to
a newcomer. The targets mostly flow from top to bottom. There's a little
diagram at the top that is hopefully illuminating. When I had 700
compilation units for 700 functions I stopped printing each one of those
compilation commands, but when I backed off to just 4 compilation units
I decided to err on the side of making the build steps easy to see.
Diffstat (limited to 'cleave')
-rw-r--r--cleave/cleave.cc244
-rw-r--r--cleave/makefile7
2 files changed, 251 insertions, 0 deletions
diff --git a/cleave/cleave.cc b/cleave/cleave.cc
new file mode 100644
index 00000000..ee83bbf2
--- /dev/null
+++ b/cleave/cleave.cc
@@ -0,0 +1,244 @@
+// Read a single-file C++ program having a very specific structure, and split
+// it up into multiple separate compilation units to reduce the work needed to
+// rebuild after a small change. Write each compilation unit only if it has
+// changed from what was on disk before.
+//
+// This tool is tightly coupled with the build system for this project. The
+// makefile already auto-generates various things; we only do here what
+// standard unix tools can't easily do.
+//
+// Usage:
+//  cleave [input C++ file] [existing output directory]
+//
+// The input C++ file has the following structure:
+//   [#includes]
+//   [type definitions]
+//   // Globals
+//   [global variable definitions]
+//   // End Globals
+//   [function definitions]
+//
+// Afterwards, the output directory contains:
+//   header -- everything before the '// Globals' delimiter
+//   global_definitions_list -- everything between '// Globals' and '// End Globals' delimiters
+//   [.cc files partitioning function definitions]
+//
+// Each output function definition file contains:
+//   #include "header"
+//   #include "global_declarations_list"
+//   [function definitions]
+//
+// We'll chunk the files at boundaries where we encounter a '#line ' directive
+// between functions.
+//
+// One exception: the first file emitted #includes "global_definitions_list" instead
+// of "global_declarations_list"
+
+// Tune this parameter to balance time for initial vs incremental build.
+//
+//   Larger numbers -> larger/fewer compilation units -> faster initial build
+//   Smaller numbers -> smaller compilation units -> faster incremental build
+int Compilation_unit_size = 200;
+
+#include<assert.h>
+#include<cstdlib>
+#include<cstring>
+
+#include<vector>
+using std::vector;
+#include<list>
+using std::list;
+#include<utility>
+using std::pair;
+
+#include<string>
+using std::string;
+
+#include<iostream>
+using std::istream;
+using std::ostream;
+using std::cin;
+using std::cout;
+using std::cerr;
+
+#include<sstream>
+using std::istringstream;
+using std::ostringstream;
+
+#include<fstream>
+using std::ifstream;
+using std::ofstream;
+
+#include <locale>
+using std::isspace;  // unicode-aware
+
+string trim(const string& s) {
+  string::const_iterator first = s.begin();
+  while (first != s.end() && isspace(*first))
+    ++first;
+  if (first == s.end()) return "";
+
+  string::const_iterator last = --s.end();
+  while (last != s.begin() && isspace(*last))
+    --last;
+  ++last;
+  return string(first, last);
+}
+
+bool starts_with(const string& s, const string& pat) {
+  return s.substr(0, pat.size()) == pat;
+}
+
+bool has_data(istream& in) {
+  return in && !in.eof();
+}
+
+void slurp(const string/*copy*/ filename, vector<string>& lines) {
+  lines.clear();
+  ifstream in(filename.c_str());
+  while (has_data(in)) {
+    string curr_line;
+    getline(in, curr_line);
+    lines.push_back(curr_line);
+  }
+}
+
+size_t slurp_some_functions(const vector<string>& in, size_t start, vector<string>& out, bool first) {
+  out.clear();
+  if (start >= in.size()) return start;
+  out.push_back("#include \"header\"");
+  if (first)
+    out.push_back("#include \"global_definitions_list\"");
+  else
+    out.push_back("#include \"global_declarations_list\"");
+  out.push_back("");
+  size_t curr = start;
+  for (int i = 0; i < Compilation_unit_size; ++i) {
+    while (curr < in.size()) {
+      // read functions -- lines until unindented '}'
+      while (curr < in.size()) {
+        const string& line = in.at(curr);
+//?         cerr << curr << ": adding to function: " << line << '\n';
+        out.push_back(line);  ++curr;
+        if (!line.empty() && line.at(0) == '}') break;
+      }
+      // now look for a '#line' directive before the next non-comment non-empty
+      // line
+      while (curr < in.size()) {
+        const string& line = in.at(curr);
+        if (starts_with(line, "#line ")) goto try_return;
+        out.push_back(line);   ++curr;
+        if (trim(line).empty()) continue;
+        if (starts_with(trim(line), "//")) continue;
+        break;
+      }
+    }
+    try_return:;
+  }
+
+  // Idea: Increase the number of functions to include in the next call to
+  // slurp_some_functions.
+  // Early functions are more likely to be larger because later layers added
+  // to them.
+//?   Compilation_unit_size *= 1.5;
+  return curr;
+}
+
+// compare contents of a file with a list of lines, ignoring #line directives
+// on both sides
+bool no_change(const vector<string>& lines, const string& output_filename) {
+  vector<string> old_lines;
+  slurp(output_filename, old_lines);
+  size_t l=0, o=0;
+  while (true) {
+    while (l < lines.size() &&
+            (lines.at(l).empty() || starts_with(lines.at(l), "#line "))) {
+      ++l;
+    }
+    while (o < old_lines.size() &&
+            (old_lines.at(o).empty() || starts_with(old_lines.at(o), "#line "))) {
+      ++o;
+    }
+    if (l >= lines.size() && o >= old_lines.size()) return true;  // no change
+    if (l >= lines.size() || o >= old_lines.size()) return false;  // contents changed
+//?     cerr << "comparing\n";
+//?     cerr << o << ": " << old_lines.at(o) << '\n';
+//?     cerr << l << ": " << lines.at(l) << '\n';
+    if (lines.at(l) != old_lines.at(o)) return false;  // contents changed
+    ++l;  ++o;
+  }
+  assert(false);
+}
+
+string next_output_filename(const string& output_directory) {
+  static int file_count = 0;
+  ostringstream out;
+  out << output_directory << "/mu_" << file_count << ".cc";
+  file_count++;
+  return out.str();
+}
+
+void emit_file(const vector<string>& lines, const string& output_filename) {
+  if (no_change(lines, output_filename)) return;
+  cerr << "  updating " << output_filename << '\n';
+  ofstream out(output_filename.c_str());
+  for (size_t i = 0; i < lines.size(); ++i)
+    out << lines.at(i) << '\n';
+}
+
+void emit_compilation_unit(const vector<string>& lines, const string& output_directory) {
+  string output_filename = next_output_filename(output_directory);
+  emit_file(lines, output_filename);
+}
+
+int main(int argc, const char* argv[]) {
+  if (argc != 3) {
+    cerr << "usage: cleave [input .cc file] [output directory]\n";
+    exit(0);
+  }
+
+  vector<string> lines;
+
+  // read input
+  slurp(argv[1], lines);
+
+  string output_directory = argv[2];
+
+  // write header until but excluding '// Global' delimiter
+  size_t line_num = 0;
+  {
+    vector<string> out;
+    while (line_num < lines.size()) {
+      const string& line = lines.at(line_num);
+      if (trim(line) == "// Globals") break;  // todo: #line directive for delimiters
+      out.push_back(line);
+      ++line_num;
+    }
+    emit_file(out, output_directory+"/header");
+  }
+
+  // write global_definitions_list (including delimiters)
+  {
+    vector<string> out;
+    while (line_num < lines.size()) {
+      const string& line = lines.at(line_num);
+      out.push_back(line);
+      ++line_num;
+      if (trim(line) == "// End Globals") break;
+    }
+    emit_file(out, output_directory+"/global_definitions_list");
+  }
+
+  // segment functions
+  // first one is special
+  if (line_num < lines.size()) {
+    vector<string> function;
+    line_num = slurp_some_functions(lines, line_num, function, true);
+    emit_compilation_unit(function, output_directory);
+  }
+  while (line_num < lines.size()) {
+    vector<string> function;
+    line_num = slurp_some_functions(lines, line_num, function, false);
+    emit_compilation_unit(function, output_directory);
+  }
+}
diff --git a/cleave/makefile b/cleave/makefile
new file mode 100644
index 00000000..33665b5c
--- /dev/null
+++ b/cleave/makefile
@@ -0,0 +1,7 @@
+cleave: cleave.cc makefile
+	c++ -O3 -Wall -Wextra -fno-strict-aliasing cleave.cc -o cleave
+
+.PHONY: clean
+
+clean:
+	-rm cleave