about summary refs log tree commit diff stats
path: root/ascii2txt.pl
diff options
context:
space:
mode:
Diffstat (limited to 'ascii2txt.pl')
-rw-r--r-- ascii2txt.pl 192
1 files changed, 192 insertions, 0 deletions
diff --git a/ascii2txt.pl b/ascii2txt.pl
new file mode 100644
index 0000000..cc949e8
--- /dev/null
+++ b/ascii2txt.pl
@@ -0,0 +1,192 @@
+#!/bin/perl
+
+# Copyright (c) 2010,2011,2012 Todd T. Fries <todd@fries.net>
+#
+# Permission to use, copy, modify, and distribute this software for any
+# purpose with or without fee is hereby granted, provided that the above
+# copyright notice and this permission notice appear in all copies.
+#
+# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+# read in 'mandoc -Tascii' formatted man pages, spit out txt useful for further
+# processing by other utilities
+
+use strict;
+use warnings;
+
+# First command-line argument; stored here, not referenced again in this script.
+our $fileinfo = $ARGV[0];
+
+# Debug level: trace output goes to STDERR whenever this is greater than zero.
+our $verbose = 0;
+
+# Loop variable of the per-line pass further down.
+my $line;
+# All of standard input, one element per line, line endings kept.
+my @lines = <STDIN>;
+my $oline = "";
+my $fmtline = "%s";
+# $oline remembers the previous value of the line in the repeat-until-stable
+# loops below; $fmtline is the printf format used for each output line.
+#
+# Main pass over every input line: collapse overstrikes, merge adjacent
+# macros, unwrap URLs, TeX-quote macro arguments, then print the line.
+foreach $line (@lines) {
+
+	# Overstrike triples: "c\bc" (presumably bold) and "_\bc" (presumably
+	# underline) are reduced to the visible character c.  The match result is
+	# tested directly here and below: a failed match leaves $1/$2 from an
+	# earlier successful match in place, so defined($1) cannot tell whether
+	# the current segment matched.
+	my $newline = "";
+	# the capturing split returns each triple as a segment of its own
+	foreach my $seg (split(/(.\x08.)/,$line)) {
+		if ($seg =~ m/^(.)\x08(.)$/ && ($1 eq $2 || $1 eq "_")) {
+			$newline .= $2;
+			next;
+		}
+		# plain text and any other overstrike pass through unchanged
+		$newline .= $seg;
+	}
+	if ($verbose > 0) {
+		printf STDERR "==> text{bf,it}\n   line: <%s>\nnewline: <%s>\n",$line,$newline;
+	}
+	$line = $newline;
+	# report the first leftover overstrike, then drop all of them
+	if ($line =~ m/(.)\x08/) {
+		printf STDERR "Removing %s\\x08\n",$1;
+	}
+	$line =~ s/.\x08//g;
+
+	# macros appear in the text as \xab\name{...}\xbb
+	# combine adjacent entries; repeat until the line stops changing so that
+	# runs of three or more collapse as well
+	foreach my $macro (("textbf", "textit")) {
+		$oline = "";
+		while ($oline ne $line) {
+			$oline = $line;
+			$line =~ s/\xab\\${macro}\{([^\}]*)\}\xbb\xab\\${macro}\{([^\}]*)\}\xbb/\xab\\${macro}\{$1$2\}\xbb/g;
+		}
+	}
+	# combine space separated
+	foreach my $macro (("textbf")) {
+		$oline = "";
+		while ($oline ne $line) {
+			$oline = $line;
+			$line =~ s/\xab\\${macro}\{([^\}]*)\}\xbb[ ]+\xab\\${macro}\{([^\}]*)\}\xbb/\xab\\${macro}\{$1 $2\}\xbb/g;
+		}
+	}
+
+	# do the substitution one at a time to be sure to add all man pages, not just the last ones per line.
+	# XXX provide an exceptions list, audio(9) has mono(1) and stereo(2)
+	# XXX references, which are _not_ man pages
+	$oline = "";
+	while ($oline ne $line) {
+		$oline=$line;
+		# turn {scheme://...} into the bare URL with a space on either side
+		$line =~ s/\{(http|ftp|https):\/\/(.*)\}/ $1:\/\/$2 /;
+		# man page cross-reference markup, currently switched off
+		if (0) {
+		if ($line =~ m/ ([a-z][a-z0-9\.\-\_]*)\(([1-9])\)([,\.\) ])/) {
+			my $quote = texquote($1);
+			$line =~ s/ ([a-z][a-z0-9\.\-\_]*)\(([1-9])\)([,\.\) ])/ \xab\\man{$quote}{$2}\xbb$3/;
+		}
+
+		if ($line =~ m/ ([a-z][a-z0-9\.\-\_]*)\(([1-9])\)$/) {
+			my $quote = texquote($1);
+			$line =~ s/ ([a-z][a-z0-9\.\-\_]*)\(([1-9])\)$/ \xab\\man{$quote}{$2}\xbb/;
+		}
+		}
+	}
+	my @macros = ("textbf","textit","man","href");
+	# quote arguments for tex
+	foreach my $macro (@macros) {
+		my $newline = "";
+		# the capturing split isolates every \xab\macro{...}\xbb occurrence
+		SEG: foreach my $seg (split(/(\xab\\${macro}\{[^\xbb]*\}\xbb)/,$line)) {
+			# check for nesting first; we only want to escape the
+			# innermost argument(s) of a macro wrapped in this one, since
+			# the plain one and two argument cases below would not catch it
+			foreach my $nest (@macros) {
+				next if ($macro eq $nest);
+
+				# nested macro with two arguments
+				if ($seg =~ m/^\xab\\${macro}\{[ ]*\\${nest}\{([^\xbb]*)\}\{([^\xbb]*)\}\}\xbb$/) {
+					# copy the captures before texquote() runs its own regexes
+					my ($arg1, $arg2) = ($1, $2);
+					$newline .= "\xab\\${macro}\{\\${nest}\{".texquote($arg1)."\}\{".texquote($arg2)."\}\}\xbb";
+					next SEG;
+				}
+				# nested macro with one argument
+				if ($seg =~ m/^\xab\\${macro}\{[ ]*\\${nest}\{([^\xbb]*)\}\}\xbb$/) {
+					my $arg1 = $1;
+					$newline .= "\xab\\${macro}\{\\${nest}\{".texquote($arg1)."\}\}\xbb";
+					next SEG;
+				}
+			}
+
+			# check for 2 args first
+			if ($seg =~ m/^\xab\\${macro}\{([^\xbb]*)\}\{([^\xbb]*)\}\xbb$/) {
+				my ($arg1, $arg2) = ($1, $2);
+				$newline .= "\xab\\${macro}\{".texquote($arg1)."\}\{".texquote($arg2)."\}\xbb";
+				next;
+			}
+			# then a single argument
+			if ($seg =~ m/^\xab\\${macro}\{([^\xbb]*)\}\xbb$/) {
+				my $arg1 = $1;
+				$newline .= "\xab\\${macro}\{".texquote($arg1)."\}\xbb";
+				next;
+			}
+			# not an occurrence of this macro: copy through untouched
+			$newline .= $seg;
+		}
+		$line = $newline;
+	}
+	# input lines were never chomped, so "%s" reproduces their line endings
+	printf $fmtline,$line;
+}
+
+1;
+
+# texquote(TEXT): return TEXT with every character that is special to TeX
+# replaced by markup that typesets the literal character:
+#   \              ->  $\backslash$
+#   % { } _ # & $  ->  the same character preceded by a backslash
+#   ^              ->  \^{}   (a bare \^ is the circumflex accent command and
+#                             would land on whatever character follows)
+#   < > | *        ->  the character in math mode, e.g. {$<$}
+#   ~              ->  {$\sim$}  (~ still means "unbreakable space" inside $...$)
+# The replacement happens in a single pass, so markup emitted for one character
+# is never escaped again and no placeholder for backslashes is needed.
+# When the global $verbose is greater than zero the text is traced to STDERR
+# before and after quoting.
+sub texquote {
+	my ($text) = @_;
+	our $verbose;	# package global set at the top of the script
+	my $trace = defined($verbose) && $verbose > 0;
+
+	# single-quoted on purpose: '\\' is one backslash, '$' stays literal
+	my %escape = (
+		'\\' => '$\\backslash$',
+		'%' => '\\%', '{' => '\\{', '}' => '\\}', '_' => '\\_',
+		'#' => '\\#', '&' => '\\&', '$' => '\\$',
+		'^' => '\\^{}',
+		'<' => '{$<$}', '>' => '{$>$}', '|' => '{$|$}', '*' => '{$*$}',
+		'~' => '{$\\sim$}',
+	);
+
+	if ($trace) {
+		printf STDERR "\ntexquote: '%s' -> ",$text;
+	}
+	$text =~ s/([\\%\{\}_#\&\$\^<>\|\*~])/$escape{$1}/g;
+	if ($trace) {
+		printf STDERR "'%s'\n",$text;
+	}
+	return $text;
+}