about summary refs log blame commit diff stats
path: root/ascii2txt.pl
blob: cc949e856162438dd1fc8c56d45e7788fc9ac123 (plain) (tree)































































































































































































                                                                                                                                  
#!/bin/perl

# Copyright (c) 2010,2011,2012 Todd T. Fries <todd@fries.net>
#
# Permission to use, copy, modify, and distribute this software for any
# purpose with or without fee is hereby granted, provided that the above
# copyright notice and this permission notice appear in all copies.
#
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

# read in 'mandoc -Tascii' formatted man pages, spit out txt useful for further
# processing by other utilities

use strict;
use warnings;

our $fileinfo = $ARGV[0];

our $verbose = 0;

my $line;
my @lines;
while(<STDIN>) {
	$line = $_;
	push @lines,$line;
}
my $oline = "";
my $fmtline = "%s";
foreach $line (@lines) {

	my $newline = "";
	foreach my $seg (split(/(.\x08.)/,$line)) {
		my $newseg = $seg;
		$newseg =~ m/^(.)\x08(.)$/;
		if (!defined($1) || !defined($2)) {
			$newline .= $seg;
			next;
		}
		if ($1 eq $2) {
			$newline .= "${2}";
			next;
		}
		if ($1 eq "_") {
			$newline .= "${2}";
			next;
		}
		$newline .= $seg;
		next;
	}
	if ($verbose > 0) {
		printf STDERR "==> text{bf,it}\n   line: <%s>\nnewline: <%s>\n",$line,$newline;
	}
	$line = $newline;
	$line =~ m/(.)\x08/;
	if (defined($1)) {
		printf STDERR "Removing %s\\x08\n",$1;
	}
	$line =~ s/.\x08//g;

	# combine adjacent entries
	foreach my $macro (("textbf", "textit")) {
		$oline = "";
		while ($oline ne $line) {
			#printf STDERR "combine adjacent\n";
			$oline = $line;
			$line =~ s/\xab\\${macro}\{([^\}]*)\}\xbb\xab\\${macro}\{([^\}]*)\}\xbb/\xab\\${macro}\{$1$2\}\xbb/g;
		}
	}
	# combine space separated
	foreach my $macro (("textbf")) {
		#printf STDERR "combine space\n";
		$oline = "";
		while ($oline ne $line) {
			$oline = $line;
			$line =~ s/\xab\\${macro}\{([^\}]*)\}\xbb[ ]+\xab\\${macro}\{([^\}]*)\}\xbb/\xab\\${macro}\{$1 $2\}\xbb/g;
		}
	}

	# do the substitution one at a time to be sure to add all man pages, not just the last ones per line.
	# XXX provide an exceptions list, audio(9) has mono(1) and stereo(2)
	# XXX references, which are _not_ man pages
	$oline = "";
	while ($oline ne $line) {
		$oline=$line;
		$line =~ s/\{(http|ftp|https):\/\/(.*)\}/ $1:\/\/$2 /;
		if (0) {
		if ($line =~ m/ ([a-z][a-z0-9\.\-\_]*)\(([1-9])\)([,\.\) ])/) {
			my $quote = texquote($1);
			$line =~ s/ ([a-z][a-z0-9\.\-\_]*)\(([1-9])\)([,\.\) ])/ \xab\\man{$quote}{$2}\xbb$3/;
		}
		
		if ($line =~ m/ ([a-z][a-z0-9\.\-\_]*)\(([1-9])\)$/) {
			my $quote = texquote($1);
			$line =~ s/ ([a-z][a-z0-9\.\-\_]*)\(([1-9])\)$/ \xab\\man{$quote}{$2}\xbb/;
		}
		}
	}
	my @macros = ("textbf","textit","man","href");
	# quote arguments for tex
	foreach my $macro (@macros) {
		my $newline = "";
		foreach my $seg (split(/(\xab\\${macro}\{[^\xbb]*\}\xbb)/,$line)) {
			#printf STDERR "quote args\n";
			my $newseg = $seg;
			# check for nesting first; we only want to escape the
			# inner most argument, process nested macro if it has a nested macro
			# since the nested macro won't catch in the other regex cases
			my $foundnest = 0;
			foreach my $nest (@macros) {
				if ($macro eq $nest) {
					next;
				}
				
				$newseg =~ m/^\xab\\${macro}\{[ ]*\\${nest}\{([^\xbb]*)\}\{([^\xbb]*)\}\}\xbb$/;
				if (defined($2)) {
					$foundnest = 1;
					$newline .= "\xab\\${macro}\{\\${nest}\{".texquote($1)."\}\{".texquote(${2})."\}\}\xbb";
					last;
				}
				$newseg =~ m/^\xab\\${macro}\{[ ]*\\${nest}\{([^\xbb]*)\}\}\xbb$/;
				if (defined($1)) {
					$foundnest = 1;
					$newline .= "\xab\\${macro}\{\\${nest}\{".texquote($1)."\}\}\xbb";
					last;
				}
			}
			if ($foundnest > 0) {
				next;
			}
				
			# check for 2 args first
			$newseg =~ m/^\xab\\${macro}\{([^\xbb]*)\}\{([^\xbb]*)\}\xbb$/;
			if (defined($2)) {
				$newline .= "\xab\\${macro}\{".texquote($1)."\}\{".texquote(${2})."\}\xbb";
				next;
			}
			$newseg =~ m/^\xab\\${macro}\{([^\xbb]*)\}\xbb$/;
			if (defined($1)) {
				$newline .= "\xab\\${macro}\{".texquote($1)."\}\xbb";
				next;
			}
			$newline .= $seg;
		}
		$line = $newline;
	}
	printf $fmtline,$line;
}

1;

sub texquote {
        my ($text) = @_;
        my ($ret) = "";
        my ($esctest) = "";
        my ($escbase) = "BaCkSlAsH";
        my ($esccount) = 0;

	#$verbose++;
	if ($verbose > 0) {
        	printf STDERR "\ntexquote: '%s' -> ",$text;
	}

        if ($text =~ m/\\/) {
                $esctest=sprintf "%s%d",$escbase,$esccount++;
                while ($text =~ m/$esctest/) {
                        $esctest=sprintf "%s%d",$escbase,$esccount++;
                }
                $text =~ s/\\/$esctest/g;
		if ($verbose > 0) {
                	printf STDERR "'%s' -> ",$text;
		}
        }

        $text =~ s/([%\{\}_#\&\$\^])/\\$1/g;
        $text =~ s/([<>\|\*~])/\{\$$1\$\}/g;

        if ($esccount > 0) {
                $text =~ s/$esctest/\$\\backslash\$/g;
        }
	if ($verbose > 0) {
        	printf STDERR "'%s'\n",$text;
	}
	#$verbose--;

        return $text;
}