3 files changed, 344 insertions, 1 deletions
diff --git a/tools/Readme.md b/tools/Readme.md
index 2649ef72..29a1a6ef 100644
--- a/tools/Readme.md
+++ b/tools/Readme.md
@@ -12,8 +12,11 @@ These are built automatically.
 
 These are built lazily.
 
+* `linkify`: inserts hyperlinks from variables to definitions in Mu's html
+  sources. Hacky; just see the number of tests. Invoked by `update_html`.
+
 * `treeshake_all`: rebuild SubX binaries without tests and unused functions.
-  Pretty hacky; just helps estimate the code needed to perform various tasks.
+  Hacky; just helps estimate the code needed to perform various tasks.
   ```
   tools/treeshake_all
   ```
diff --git a/tools/linkify.cc b/tools/linkify.cc
new file mode 100644
index 00000000..ece50748
--- /dev/null
+++ b/tools/linkify.cc
@@ -0,0 +1,267 @@
+// Read a tabular cross-reference file generated by ctags, then read a list of
+// html files generated by Vim's TOhtml command on C++ code. Link words
+// in the html files to cross-references from ctags.
+
+// Usage:
+//    linkify [tags file] [html files]...
+
+// Still plenty of holes:
+// - unnecessarily linking definition location to itself
+//   - except SubX definitions, which start at start of line
+// - can't detect strings in spite of attempt to support them below, because
+//   Vim's generated html turns quotes into html entities
+// - distinguishing function and variable names
+// - distinguishing Mu code in C++ files
+// - distinguishing between function overloads
+//   - if there are duplicate tags we aren't smart enough to distinguish between
+//     them yet, so we simply don't add any link at all
+//   - but even that's not perfect, because sometimes the tags file has a
+//     single definition but there are still multiple overloads (say I defined
+//     'clear()' on some type, and it's already defined on STL classes)
+// - ctags misses some symbols in layered code
+
+#include<assert.h>
+
+#include<map>
+using std::map;
+
+#include<string>
+using std::string;
+
+#include<iostream>
+using std::istream;
+using std::cout;
+using std::cerr;
+
+#include<sstream>
+using std::istringstream;
+using std::ostringstream;
+
+#include<fstream>
+using std::ifstream;
+using std::ofstream;
+
+#include <locale>
+using std::isspace;  // NOTE: one-argument calls resolve to the C isspace: byte-oriented, not unicode-aware
+
+struct syminfo {  // where a symbol is defined, as recorded by read_tags
+  string filename;  // left empty when the symbol has more than one definition
+  int line_num;  // set to -1 when the symbol has more than one definition
+  syminfo() :line_num(0) {}
+};
+
+// True while 'in' still has unread characters.
+bool has_data(istream& in) {
+  in.peek();
+  if (!in.eof()) assert(in);  // any failure other than end of input is a bug
+  return !in.eof();
+}
+
+// Is 'pat' a prefix of 's'? Every string starts with the empty pattern.
+bool starts_with(const string& s, const string& pat) {
+  // a pattern longer than the string can never match
+  if (pat.size() > s.size()) return false;
+  return s.compare(0, pat.size(), pat) == 0;
+}
+
+// Is 'pat' a suffix of 's'? Every string ends with the empty pattern.
+bool ends_with(const string& s, const string& pat) {
+  // a pattern longer than the string can never match
+  if (pat.size() > s.size()) return false;
+  return s.compare(s.size()-pat.size(), pat.size(), pat) == 0;
+}
+
+// Rewrite 's' in place, spelling '<' as "&lt;" and '>' as "&gt;".
+// Every other character (including '&') is left alone.
+void encode_some_html_entities(string& s) {
+  string result;
+  for (string::const_iterator p = s.begin();  p != s.end();  ++p) {
+    if (*p == '<')
+      result += "&lt;";
+    else if (*p == '>')
+      result += "&gt;";
+    else
+      result += *p;
+  }
+  s.swap(result);
+}
+
+// Load the cross-reference table in 'filename' into 'info'. Each row is read
+// as: symbol, one ignored column, line number, file name; anything after that
+// is discarded. A symbol seen on more than one row is marked ambiguous
+// (line_num -1, empty filename).
+void read_tags(const string& filename, map<string, syminfo>& info) {
+  ifstream in(filename.c_str());
+  string ignored;
+  while (has_data(in)) {
+    string symbol;
+    in >> symbol;
+    if (symbol == "operator") {  // unsupported; skip the whole row
+      getline(in, ignored);
+      continue;
+    }
+    encode_some_html_entities(symbol);
+    map<string, syminfo>::iterator seen = info.find(symbol);
+    if (seen == info.end()) {
+      syminfo& fresh = info[symbol];
+      in >> ignored >> fresh.line_num >> fresh.filename;
+    }
+    else {
+      seen->second.line_num = -1;
+      seen->second.filename.clear();
+    }
+    getline(in, ignored);  // drop whatever is left of the row
+  }
+}
+
+// Copy the html file 'filename' to 'filename'.out, turning words that have an
+// unambiguous entry in 'info' into links to '<file>.html#L<line>'. Only lines
+// starting with '<span ' are scanned; html tags, quoted text and comments are
+// copied through untouched.
+void replace_tags_in_file(const string& filename, const map<string, syminfo>& info) {
+  ifstream in(filename.c_str());
+  ofstream out((filename+".out").c_str());
+  while (has_data(in)) {
+    // send lines that don't start with '<span' straight through
+    string line;
+    getline(in, line);
+    if (!starts_with(line, "<span ")) {
+      out << line << '\n';
+    }
+    else {
+      static int span_size = string("</span>").size();
+      // if there's no '</span>', npos+7 wraps around to 6: just past '<span '
+      int skip_first_span = line.find("</span>") + span_size;
+      out << line.substr(0, skip_first_span);
+      istringstream in2(line.substr(skip_first_span));
+      in2 >> std::noskipws;
+      // only in .subx files, refuse to linkify the first word on a line
+      bool at_start_of_line = ends_with(filename, ".subx.html");
+      while (has_data(in2)) {
+        if (isspace(in2.peek())) {
+          char c;  in2 >> c;
+          out << c;
+          at_start_of_line = false;
+        }
+        // within a line, send straight through all characters inside '<..>'
+        else if (in2.peek() == '<') {
+          char c = '\0';
+          while (in2 >> c) {
+            out << c;
+            if (c == '>') break;
+          }
+          // don't include initial tag when computing 'at_start_of_line'
+//?           cerr << "end tag\n";
+        }
+        else {
+          // send straight through all characters inside strings (handling escapes)
+          char c = in2.get();
+          if (c == '"') {
+//?             cerr << "string\n";
+            out << c;
+            while (in2 >> c) {
+              out << c;
+              if (c == '\\') {
+                if (in2 >> c) out << c;  // line may end right after the backslash; don't emit it twice
+              }
+              else if (c == '"') {
+                break;
+              }
+            }
+            at_start_of_line = false;
+          }
+          else if (c == '\'') {
+//?             cerr << "character\n";
+            out << c;
+            while (in2 >> c) {
+              out << c;
+              if (c == '\\') {
+                if (in2 >> c) out << c;  // line may end right after the backslash; don't emit it twice
+              }
+              else if (c == '\'') {
+                break;
+              }
+            }
+            at_start_of_line = false;
+          }
+          // send straight through any characters after '#' (comments)
+          else if (c == '#') {
+//?             cerr << "comment\n";
+            out << c;
+            while (in2 >> c) out << c;
+            at_start_of_line = false;
+          }
+          // send straight through any characters after '//' (comments)
+          else if (c == '/' && in2.peek() == '/') {
+//?             cerr << "comment\n";
+            out << c;
+            while (in2 >> c) out << c;
+            at_start_of_line = false;
+          }
+          // send through open parens at start of line
+          else if (c == '(') {
+            out << c;
+            at_start_of_line = false;
+          }
+          else if (c == ')') {
+            out << c;
+            at_start_of_line = false;
+          }
+          else {
+//?             cerr << "rest\n";
+            if (c == ',' || c == ':') {
+              out << c;
+              at_start_of_line = false;
+              continue;
+            }
+            ostringstream out2;
+            out2 << c;
+            while (in2 >> c) {
+              if (isspace(static_cast<unsigned char>(c)) || c == '<' || c == '"' || c == '\'' || c == '/' || c == ',' || c == ':' || c == '(' || c == ')') {  // keep sync'd with other clauses above; cast because isspace() is undefined for negative chars (bytes >= 0x80)
+                in2.putback(c);
+                break;
+              }
+              out2 << c;
+            }
+            string symbol = out2.str();
+            if (symbol == "equal" || symbol == "index" || symbol == "put-index" || symbol == "length") {
+//?               cerr << "  blacklisted\n";
+              out << symbol;
+            }
+            else if (info.find(symbol) == info.end()) {
+//?               cerr << "  no info\n";
+              out << symbol;
+            }
+            else {
+              const syminfo& s = info.find(symbol)->second;
+              if (s.filename.empty()) {
+//?                 cerr << "  empty info\n";
+                out << symbol;
+              }
+              else {
+                if (at_start_of_line) {
+//?                   cerr << "  at start of line; refusing to linkify " << symbol << "\n";
+                  out << symbol;
+                }
+                else {
+//?                   cerr << "  link\n";
+                  out << "<a href='" << s.filename << ".html#L" << s.line_num << "'>" << symbol << "</a>";
+                }
+              }
+            }
+          }  // end rest
+        }
+      }  // done parsing line
+      out << '\n';
+    }
+  }
+  in.close();  out.close();
+}
+
+// Link words in each html file to their definitions; see usage at top of file.
+int main(int argc, const char* argv[]) {
+  if (argc < 2) { cerr << "usage: linkify [tags file] [html files]...\n";  return 1; }  // argv[1] would be null
+  map<string, syminfo> info;  read_tags(argv[1], info);
+  for (int i = 2;  i < argc;  ++i) replace_tags_in_file(argv[i], info);  // writes argv[i].out
+  return 0;
+}
diff --git a/tools/update_html b/tools/update_html
new file mode 100755
index 00000000..478d5d6c
--- /dev/null
+++ b/tools/update_html
@@ -0,0 +1,73 @@
+#!/bin/bash
+# Regenerate html files.
+# If given a single argument, regenerate just that file.
+# Run from the top-level directory: tools/ and html/ are reached by relative path.
+set -e  # stop at the first failing command
+
+( cd tools; c++ -g linkify.cc -o linkify; )  # rebuild the linkifier; the subshell keeps the cd local
+
+# generate html/$1.html using /tmp/tags (arguments quoted so paths never get word-split or globbed)
+process() {
+  rm -f "html/$1.html"
+  convert_html "$1"
+  tools/linkify /tmp/tags "html/$1.html"
+  mv "html/$1.html.out" "html/$1.html"
+}
+
+URL_BASE='https://github.com/akkartik/mu/blob/master'
+
+# render $1 with Vim's TOhtml, touch up the result, and move it to html/$1.html
+convert_html() {
+  vim -c "set number" -c TOhtml -c write -c qall "$1"
+  sed -i 's,<title>.*/mu/,<title>Mu - ,' "$1.html"
+  sed -i 's,\.html</title>,</title>,' "$1.html"
+
+  sed -i "/^<body/a <a href='$URL_BASE/$1'>$URL_BASE/$1</a>" "$1.html"
+
+  sed -i 's/^\* { \(.*\) }/* { font-size:12pt; \1 }/g' "$1.html"
+  sed -i 's/^body { \(.*\) }/body { font-size:12pt; \1 }/g' "$1.html"
+
+  sed -i '/^body {/a a { color:inherit; }' "$1.html"
+
+  # switch unicode characters around in the rendered html
+  #   the ones we have in the source files render double-wide in html
+  #   the ones we want in the html cause iTerm2 to slow down in alt-tabbing for some reason
+  # the following commands give us the best of both worlds
+  sed -i -e 's/┈/╌/g' -e 's/┊/╎/g' "$1.html"
+
+  mv -i "$1.html" "html/$(dirname "$1")"
+}
+
+ctags -x *.cc  |grep -v '^. '  > /tmp/tags  # don't hyperlink every 'i' to the integer register variant
+for f in *.cc
+do
+  test $# -gt 0  &&  test "$1" != "$f"  &&  continue  # when an argument is given, skip every other file
+  process "$f"
+done
+
+for f in examples/*.subx
+do
+  test $# -gt 0  &&  test "$1" != "$f"  &&  continue
+  ( cd examples
+    ctags -x "$(basename "$f")" > /tmp/tags  # tags from this one file only
+  )
+  process "$f"
+done
+
+ctags -x *.subx  > /tmp/tags
+for f in *.subx
+do
+  test $# -gt 0  &&  test "$1" != "$f"  &&  continue
+  process "$f"
+done
+
+for f in apps/*.subx
+do
+  test $# -gt 0  &&  test "$1" != "$f"  &&  continue
+  ( cd apps
+    ctags -x ../*.subx "$(basename "$f")" > /tmp/tags  # top-level .subx files plus this one
+  )
+  process "$f"
+done
+
+rm /tmp/tags