// Read a tabular cross-reference file generated by ctags, then read a list of // html files generated by Vim's TOhtml command on C++ code. Link words // in the html files to cross-references from ctags. // Usage: // linkify [tags file] [html files]... // Still plenty of holes: // - unnecessarily linking definition location to itself // - except SubX definitions, which start at start of line // - can't detect strings in spite of attempt to support them below, because // Vim's generated html turns quotes into html entities // - distinguishing function and variable names // - distinguishing Mu code in C++ files // - distinguishing between function overloads // - if there's duplicate tags we aren't smart enough to distinguish between // them yet, so we simply don't add any link at all // - but even that's not perfect, because sometimes the tags file has a // single definition but there's still multiple overloads (say I defined // 'clear()' on some type, and it's already defined on STL classes) // - ctags misses some symbols in layered code #include #include using std::map; #include using std::string; #include using std::istream; using std::cout; using std::cerr; #include using std::istringstream; using std::ostringstream; #include using std::ifstream; using std::ofstream; #include using std::isspace; // unicode-aware struct syminfo { string filename; int line_num; syminfo() :line_num(0) {} }; bool has_data(istream& in) { in.peek(); if (in.eof()) return false; assert(in); return true; } bool starts_with(const string& s, const string& pat) { string::const_iterator a=s.begin(), b=pat.begin(); for (/*nada*/; a!=s.end() && b!=pat.end(); ++a, ++b) if (*a != *b) return false; return b == pat.end(); } bool ends_with(const string& s, const string& pat) { string::const_reverse_iterator a=s.rbegin(), b=pat.rbegin(); for (/*nada*/; a!=s.rend() && b!=pat.rend(); ++a, ++b) if (*a != *b) return false; return b == pat.rend(); } void encode_some_html_entities(string& s) { std::string::size_type pos = 0; while (true) { pos = s.find_first_of("<>", pos); if (pos == std::string::npos) break; std::string replacement; switch (s.at(pos)) { case '<': replacement = "<"; break; case '>': replacement = ">"; break; } s.replace(pos, 1, replacement); pos += replacement.size(); }; } void read_tags(const string& filename, map& info) { ifstream in(filename.c_str()); //? cerr << "reading " << filename << '\n'; string dummy; while (has_data(in)) { string symbol; in >> symbol; if (symbol == "operator") { // unsupported getline(in, dummy); // skip continue; } encode_some_html_entities(symbol); //? cerr << symbol << '\n'; if (info.find(symbol) != info.end()) { info[symbol].line_num = -1; info[symbol].filename.clear(); } else { in >> dummy; in >> info[symbol].line_num; in >> info[symbol].filename; } getline(in, dummy); // skip rest of line //? cerr << symbol << ": " << info[symbol].filename << ':' << info[symbol].line_num << '\n'; } in.close(); } void replace_tags_in_file(const string& filename, const map& info) { //? cerr << info.size() << " symbols\n"; ifstream in(filename.c_str()); ofstream out((filename+".out").c_str()); while (has_data(in)) { // send lines that don't start with '").size(); int skip_first_span = line.find("") + span_size; out << line.substr(0, skip_first_span); istringstream in2(line.substr(skip_first_span)); in2 >> std::noskipws; // only in .subx files, refuse to linkify the first word on a line bool at_start_of_line = ends_with(filename, ".subx.html"); //? cerr << filename << ": " << at_start_of_line << '\n'; while (has_data(in2)) { if (isspace(in2.peek())) { //? cerr << "space\n"; char c; in2 >> c; out << c; at_start_of_line = false; } // within a line, send straight through all characters inside '<..>' else if (in2.peek() == '<') { //? cerr << "tag\n"; char c = '\0'; while (in2 >> c) { //? cerr << "span: " << c << '\n'; out << c; if (c == '>') break; } // don't include initial tag when computing 'at_start_of_line' //? cerr << "end tag\n"; } else { // send straight through all characters inside strings (handling escapes) char c = in2.get(); if (c == '"') { //? cerr << "string\n"; out << c; while (in2 >> c) { out << c; if (c == '\\') { in2 >> c; out << c; } else if (c == '"') { break; } } at_start_of_line = false; } else if (c == '\'') { //? cerr << "character\n"; out << c; while (in2 >> c) { out << c; if (c == '\\') { in2 >> c; out << c; } else if (c == '\'') { break; } } at_start_of_line = false; } // send straight through any characters after '#' (comments) else if (c == '#') { //? cerr << "comment\n"; out << c; while (in2 >> c) out << c; at_start_of_line = false; } // send straight through any characters after '//' (comments) else if (c == '/' && in2.peek() == '/') { //? cerr << "comment\n"; out << c; while (in2 >> c) out << c; at_start_of_line = false; } else { //? cerr << "rest\n"; if (c == ',' || c == ':') { out << c; at_start_of_line = false; continue; } ostringstream out2; out2 << c; while (in2 >> c) { if (isspace(c) || c == '<' || c == '"' || c == '\'' || c == '/' || c == ',' || c == ':') { // keep sync'd with other clauses above in2.putback(c); break; } out2 << c; } string symbol = out2.str(); if (symbol == "equal" || symbol == "index" || symbol == "put-index" || symbol == "length") { //? cerr << " blacklisted\n"; out << symbol; } else if (info.find(symbol) == info.end()) { //? cerr << " no info\n"; out << symbol;