tools/linkify.cc


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105pre { line-height: 125%; }
td.linenos .normal { color: inherit; background-color: transparent; padding-left: 5px; padding-right: 5px; }
span.linenos { color: inherit; background-color: transparent; padding-left: 5px; padding-right: 5px; }
td.linenos .special { color: #000000; background-color: #ffffc0; padding-left: 5px; padding-right: 5px; }
span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: 5px; padding-right: 5px; }
.highlight .hll { background-color: #ffffcc }
.highlight .c { color: #888888 } /* Comment */
.highlight .err { color: #a61717; background-color: #e3d2d2 } /* Error */
.highlight .k { color: #008800; font-weight: bold } /* Keyword */
.highlight .ch { color: #888888 } /* Comment.Hashbang */
.highlight .cm { color: #888888 } /* Comment.Multiline */
.highlight .cp { color: #cc0000; font-weight: bold } /* Comment.Preproc */
.highlight .cpf { color: #888888 } /* Comment.PreprocFile */
.highlight .c1 { color: #888888 } /* Comment.Single */
.highlight .cs { color: #cc0000; font-weight: bold; background-color: #fff0f0 } /* Comment.Special */
.highlight .gd { color: #000000; background-color: #ffdddd } /* Generic.Deleted */
.highlight .ge { font-style: italic } /* Generic.Emph */
.highlight .ges { font-weight: bold; font-style: italic } /* Generic.EmphStrong */
.highlight .gr { color: #aa0000 } /* Generic.Error */
.highlight .gh { color: #333333 } /* Generic.Heading */
.highlight .gi { color: #000000; background-color: #ddffdd } /* Generic.Inserted */
.highlight .go { color: #888888 } /* Generic.Output */
.highlight .gp { color: #555555 } /* Generic.Prompt */
.highlight .gs { font-weight: bold } /* Generic.Strong */
.highlight .gu { color: #666666 } /* Generic.Subheading */
.highlight .gt { color: #aa0000 } /* Generic.Traceback */
.highlight .kc { color: #008800; font-weight: bold } /* Keyword.Constant */
.highlight .kd { color: #008800; font-weight: bold } /* Keyword.Declaration */
.highlight .kn { color: #008800; font-weight: bold } /* Keyword.Namespace */
.highlight .kp { color: #008800 } /* Keyword.Pseudo */
.highlight .kr { color: #008800; font-weight: bold } /* Keyword.Reserved */
.highlight .kt { color: #888888; font-weight: bold } /* Keyword.Type */
.highlight .m { color: #0000DD; font-weight: bold } /* Literal.Number */
.highlight .s { color: #dd2200; background-color: #fff0f0 } /* Literal.String */
.highlight .na { color: #336699 } /* Name.Attribute */
.highlight .nb { color: #003388 } /* Name.Builtin */
.highlight .nc { color: #bb0066; font-weight: bold } /* Name.Class */
.highlight .no { color: #003366; font-weight: bold } /* Name.Constant */
.highlight .nd { color: #555555 } /* Name.Decorator */
.highlight .ne { color: #bb0066; font-weight: bold } /* Name.Exception */
.highlight .nf { color: #0066bb; font-weight: bold } /* Name.Function */
.highlight .nl { color: #336699; font-style: italic } /* Name.Label */
.highlight .nn { color: #bb0066; font-weight: bold } /* Name.Namespace */
.highlight .py { color: #336699; font-weight: bold } /* Name.Property */
.highlight .nt { color: #bb0066; font-weight: bold } /* Name.Tag */
.highlight .nv { color: #336699 } /* Name.Variable */
.highlight .ow { color: #008800 } /* Operator.Word */
.highlight .w { color: #bbbbbb } /* Text.Whitespace */
.highlight .mb { color: #0000DD; font-weight: bold } /* Literal.Number.Bin */
.highlight .mf { color: #0000DD; font-weight: bold } /* Literal.Number.Float */
.highlight .mh { color: #0000DD; font-weight: bold } /* Literal.Number.Hex */
.highlight .mi { color: #0000DD; font-weight: bold } /* Literal.Number.Integer */
.highlight .mo { color: #0000DD; font-weight: bold } /* Literal.Number.Oct */
.highlight .sa { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Affix */
.highlight .sb { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Backtick */
.highlight .sc { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Char */
.highlight .dl { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Delimiter */
.highlight .sd { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Doc */
.highlight .s2 { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Double */
.highlight .se { color: #0044dd; background-color: #fff0f0 } /* Literal.String.Escape */
.highlight .sh { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Heredoc */
.highlight .si { color: #3333bb; background-color: #fff0f0 } /* Literal.String.Interpol */
.highlight .sx { color: #22bb22; background-color: #f0fff0 } /* Literal.String.Other */
.highlight .sr { color: #008800; background-color: #fff0ff } /* Literal.String.Regex */
.highlight .s1 { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Single */
.highlight .ss { color: #aa6600; background-color: #fff0f0 } /* Literal.String.Symbol */
.highlight .bp { color: #003388 } /* Name.Builtin.Pseudo */
.highlight .fm { color: #0066bb; font-weight: bold } /* Name.Function.Magic */
.highlight .vc { color: #336699 } /* Name.Variable.Class */
.highlight .vg { color: #dd7700 } /* Name.Variable.Global */
.highlight .vi { color: #3333bb } /* Name.Variable.Instance */
.highlight .vm { color: #336699 } /* Name.Variable.Magic */
.highlight .il { color: #0000DD; font-weight: bold } /* Literal.Number.Integer.Long */# Copyright (C) 2009, 2010  Roman Zimbelmann <romanz@lavabit.com>
#
// Read a tabular cross-reference file generated by ctags, then read a list of
// html files generated by Vim's TOhtml command on C++ code. Link words
// in the html files to cross-references from ctags.

// Usage:
//    linkify [tags file] [html files]...

// Still plenty of holes:
// - unnecessarily linking definition location to itself
//   - except SubX definitions, which start at start of line
// - can't detect strings in spite of attempt to support them below, because
//   Vim's generated html turns quotes into html entities
// - distinguishing function and variable names
// - distinguishing Mu code in C++ files
// - distinguishing between function overloads
//   - if there's duplicate tags we aren't smart enough to distinguish between
//     them yet, so we simply don't add any link at all
//   - but even that's not perfect, because sometimes the tags file has a
//     single definition but there's still multiple overloads (say I defined
//     'clear()' on some type, and it's already defined on STL classes)
// - ctags misses some symbols in layered code

#include<assert.h>

#include<map>
using std::map;

#include<string>
using std::string;

#include<iostream>
using std::istream;
using std::cout;
using std::cerr;

#include<sstream>
using std::istringstream;
using std::ostringstream;

#include<fstream>
using std::ifstream;
using std::ofstream;

#include <locale>
using std::isspace;  // unicode-aware

// Where a symbol is defined, as recorded in the tags file.
//   filename: file containing the definition; left empty when the symbol
//             can't be linked (read_tags clears it for duplicate symbols)
//   line_num: line of the definition; 0 until filled in, -1 for duplicates
struct syminfo {
  string filename;
  int line_num;
  syminfo() : filename(), line_num(0) {}
};

// Return true if at least one more character can be read from 'in'.
// Peeks at the stream so that the eof flag gets set once nothing is left.
// Any stream error other than end-of-file trips the assert.
bool has_data(istream& in) {
  in.peek();
  const bool more = !in.eof();
  if (more) assert(in);
  return more;
}

// Return true if 's' begins with 'pat'. An empty 'pat' matches any string.
bool starts_with(const string& s, const string& pat) {
  if (pat.size() > s.size()) return false;
  return s.compare(0, pat.size(), pat) == 0;
}

// Return true if 's' finishes with 'pat'. An empty 'pat' matches any string.
bool ends_with(const string& s, const string& pat) {
  if (pat.size() > s.size()) return false;
  const string::size_type tail_start = s.size() - pat.size();
  return s.compare(tail_start, pat.size(), pat) == 0;
}

// Rewrite 's' in place, turning every '<' into "&lt;" and every '>' into
// "&gt;". All other characters (including '&') are left untouched.
void encode_some_html_entities(string& s) {
  string encoded;
  encoded.reserve(s.size());
  for (string::const_iterator p = s.begin();  p != s.end();  ++p) {
    if (*p == '<')
      encoded += "&lt;";
    else if (*p == '>')
      encoded += "&gt;";
    else
      encoded += *p;
  }
  s.swap(encoded);
}

void read_tags(const string& filename, map<string, syminfo>& info) {
  ifstream in(filename.c_str());
//?   cerr << "reading " << filename << '\n';
  string dummy;
  while (has_data(in)) {
    string symbol;  in >> symbol;
    if (symbol == "operator") {
      // unsupported
      getline(in, dummy);  // skip
      continue;
    }
    encode_some_html_entities(symbol);
//?     cerr << symbol << '\n';
    if (info.find(symbol) != info.end()) {
      info[symbol].line_num = -1;
      info[symbol].filename.clear();
    }
    else {
      in >> dummy;
      in >> info[symbol].line_num;
      in >> info[symbol].filename;
    }
    getline(in, dummy);  // skip rest of line
//?     cerr << symbol << ": " << info[symbol].filename << ':' << info[symbol].line_num << '\n';
  }
  in.close();
}

void replace_tags_in_file(const string& filename, const map<string, syminfo>& info) {
//?   cerr << info.size() << " symbols\n";
  ifstream in(filename.c_str());
  ofstream out((filename+".out").c_str());
  while (has_data(in)) {
    // send lines that don't start with '<span' straight through
    string line;
    getline(in, line);
    if (!starts_with(line, "<span ")) {
      out << line << '\n';
    }
    else {
      static int span_size = string("</span>").size();
      int skip_first_span = line.find("</span>") + span_size;
      out << line.substr(0, skip_first_span);
      istringstream in2(line.substr(skip_first_span));
      in2 >> std::noskipws;
      // only in .subx files, refuse to linkify the first word on a line
      bool at_start_of_line = ends_with(filename, ".subx.html");
//?       cerr << filename << ": " << at_start_of_line << '\n';
      while (has_data(in2)) {
        if (isspace(in2.peek())) {
//?           cerr << "space\n";
          char c;  in2 >> c;
          out << c;
          at_start_of_line = false;
        }
        // within a line, send straight through all characters inside '<..>'
        else if (in2.peek() == '<') {
//?           cerr << "tag\n";
          char c = '\0';
          while (in2 >> c) {
//?             cerr << "span: " << c << '\n';
            out << c;
            if (c == '>') break;
          }
          // don't include initial tag when computing 'at_start_of_line'
//?           cerr << "end tag\n";
        }
        else {
          // send straight through all characters inside strings (handling escapes)
          char c = in2.get();
          if (c == '"') {
//?             cerr << "string\n";
            out << c;
            while (in2 >> c) {
              out << c;
              if (c == '\\') {
                in2 >> c;  out << c;
              }
              else if (c == '"') {
                break;
              }
            }
            at_start_of_line = false;
          }
          else if (c == '\'') {
//?             cerr << "character\n";
            out << c;
            while (in2 >> c) {
              out << c;
              if (c == '\\') {
                in2 >> c;  out << c;
              }
              else if (c == '\'') {
                break;
              }
            }
            at_start_of_line = false;
          }
          // send straight through any characters after '#' (comments)
          else if (c == '#') {
//?             cerr << "comment\n";
            out << c;
            while (in2 >> c) out << c;
            at_start_of_line = false;
          }
          // send straight through any characters after '//' (comments)
          else if (c == '/' && in2.peek() == '/') {
//?             cerr << "comment\n";
            out << c;
            while (in2 >> c) out << c;
            at_start_of_line = false;
          }
          // send through open parens at start of line
          else if (c == '(') {
            out << c;
            at_start_of_line = false;
          }
          else if (c == ')') {
            out << c;
            at_start_of_line = false;
          }
          else {
//?             cerr << "rest\n";
            if (c == ',' || c == ':') {
              out << c;
              at_start_of_line = false;
              continue;
            }
            ostringstream out2;
            out2 << c;
            while (in2 >> c) {
              if (isspace(c) || c == '<' || c == '"' || c == '\'' || c == '/' || c == ',' || c == ':' || c == '(' || c == ')') {  // keep sync'd with other clauses above
                in2.putback(c);
                break;
              }
              out2 << c;
            }
            string symbol = out2.str();
            if (symbol == "equal" || symbol == "index" || symbol == "put-index" || symbol == "length") {
//?               cerr << "  blacklisted\n";
              out << symbol;
            }
            else if (info.find(symbol) == info.end()) {
//?               cerr << "  no info\n";
              out << symbol;
            }
            else {
              const syminfo& s = info.find(symbol)->second;
              if (s.filename.empty()) {
//?                 cerr << "  empty info\n";
                out << symbol;
              }
              else {
                if (at_start_of_line) {
//?                   cerr << "  at start of line; refusing to linkify " << symbol << "\n";
                  out << symbol;
                }
                else {
//?                   cerr << "  link\n";
                  out << "<a href='" << s.filename << ".html#L" << s.line_num << "'>" << symbol << "</a>";
                }
              }
            }
          }  // end rest
        }
      }  // done parsing line
      out << '\n';
    }
  }
  in.close();  out.close();
}

// Entry point: linkify [tags file] [html files]...
// Loads the tags file named by the first argument, then processes each
// remaining argument as an html file.
// Returns 1 with a usage message if the tags file argument is missing.
int main(int argc, const char* argv[]) {
  // Without this check argv[1] is a null pointer when no arguments are
  // given, and building a string from it is undefined behavior.
  if (argc < 2) {
    cerr << "usage: linkify [tags file] [html files]...\n";
    return 1;
  }
  map<string, syminfo> info;
  read_tags(argv[1], info);
  for (int i = 2;  i < argc;  ++i)
    replace_tags_in_file(argv[i], info);
  return 0;
}