diff options
Diffstat (limited to 'ascii2txt.pl')
-rw-r--r-- | ascii2txt.pl | 192 |
1 files changed, 192 insertions, 0 deletions
#!/bin/perl

# Copyright (c) 2010,2011,2012 Todd T. Fries <todd@fries.net>
#
# Permission to use, copy, modify, and distribute this software for any
# purpose with or without fee is hereby granted, provided that the above
# copyright notice and this permission notice appear in all copies.
#
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

# read in 'mandoc -Tascii' formatted man pages, spit out txt useful for further
# processing by other utilities

use strict;
use warnings;

our $fileinfo = $ARGV[0];

our $verbose = 0;

# Filter STDIN to STDOUT, one line at a time.  Each line is converted
# independently of every other line, so there is no need to buffer the
# whole input first.
while (my $line = <STDIN>) {
	print convert_line($line);
}

1;

# convert_line($line)
#
# Run one input line through every conversion step, in order, and return
# the converted line:
#   1. resolve backspace overstrike sequences,
#   2. merge neighbouring \textbf / \textit macro invocations,
#   3. strip the braces from {http://...}, {https://...} and {ftp://...},
#   4. TeX-quote the arguments of the known macros.
sub convert_line {
	my ($line) = @_;

	$line = _strip_overstrike($line);
	$line = _merge_macros($line);
	$line = _unbrace_urls($line);
	$line = _quote_macro_args($line);

	return $line;
}

# Resolve "X<backspace>Y" sequences.  "c\x08c" (a character struck over
# itself) and "_\x08c" (an underscore struck over by a character) both
# collapse to the plain character "c".  Any other "X\x08" left on the line
# is reported on STDERR and removed.
sub _strip_overstrike {
	my ($line) = @_;

	my $newline = "";
	foreach my $seg (split(/(.\x08.)/, $line)) {
		# Test the match itself: after a failed match $1 and $2 still
		# hold whatever the previous successful match captured.
		if ($seg =~ m/^(.)\x08(.)$/ && ($1 eq $2 || $1 eq "_")) {
			$newline .= $2;
		} else {
			$newline .= $seg;
		}
	}
	if ($verbose > 0) {
		printf STDERR "==> text{bf,it}\n line: <%s>\nnewline: <%s>\n",$line,$newline;
	}

	if ($newline =~ m/(.)\x08/) {
		printf STDERR "Removing %s\\x08\n",$1;
	}
	$newline =~ s/.\x08//g;

	return $newline;
}

# Merge neighbouring macro invocations.  An invocation is written as
# \xab\NAME{...}\xbb; nothing in this script inserts those markers, so
# this step only has an effect when the input already carries them.
sub _merge_macros {
	my ($line) = @_;

	# combine adjacent entries; repeat until nothing is left to merge so
	# that runs of three or more collapse into a single invocation
	foreach my $macro ("textbf", "textit") {
		1 while $line =~ s/\xab\\${macro}\{([^\}]*)\}\xbb\xab\\${macro}\{([^\}]*)\}\xbb/\xab\\${macro}\{$1$2\}\xbb/g;
	}

	# combine space separated; the run of spaces becomes a single space
	foreach my $macro ("textbf") {
		1 while $line =~ s/\xab\\${macro}\{([^\}]*)\}\xbb[ ]+\xab\\${macro}\{([^\}]*)\}\xbb/\xab\\${macro}\{$1 $2\}\xbb/g;
	}

	return $line;
}

# Replace each "{scheme://rest}" (scheme http, https or ftp) with
# " scheme://rest ".
#
# The substitution is done one at a time, and the match is non-greedy so
# that it stops at the first closing brace; a greedy match would run on to
# the last "}" of the line and fuse several URLs into one.
#
# XXX marking up man page references such as name(1) was disabled in the
# XXX original (guarded by "if (0)") and has been left out here.  It needs
# XXX an exceptions list first: audio(9) has mono(1) and stereo(2)
# XXX references, which are _not_ man pages.
sub _unbrace_urls {
	my ($line) = @_;

	1 while $line =~ s/\{(http|ftp|https):\/\/(.*?)\}/ $1:\/\/$2 /;

	return $line;
}

# quote arguments for tex: every \xab\NAME{...}\xbb invocation of a known
# macro gets its argument(s) passed through texquote().
sub _quote_macro_args {
	my ($line) = @_;

	my @macros = ("textbf","textit","man","href");
	foreach my $macro (@macros) {
		my $newline = "";
		# The capturing group makes split() return the invocations
		# themselves in between the surrounding plain text.
		foreach my $seg (split(/(\xab\\${macro}\{[^\xbb]*\}\xbb)/, $line)) {
			$newline .= _quote_segment($macro, $seg, @macros);
		}
		$line = $newline;
	}

	return $line;
}

# Return $seg with its macro argument(s) TeX-quoted when $seg is exactly
# one invocation of $macro; otherwise return $seg unchanged.
sub _quote_segment {
	my ($macro, $seg, @macros) = @_;

	# check for nesting first; we only want to escape the inner most
	# argument, since a nested macro won't be caught by the plain cases
	# further down
	foreach my $nest (grep { $_ ne $macro } @macros) {
		if (my ($first, $second) = $seg =~ m/^\xab\\${macro}\{[ ]*\\${nest}\{([^\xbb]*)\}\{([^\xbb]*)\}\}\xbb$/) {
			return "\xab\\${macro}\{\\${nest}\{".texquote($first)."\}\{".texquote($second)."\}\}\xbb";
		}
		if (my ($only) = $seg =~ m/^\xab\\${macro}\{[ ]*\\${nest}\{([^\xbb]*)\}\}\xbb$/) {
			return "\xab\\${macro}\{\\${nest}\{".texquote($only)."\}\}\xbb";
		}
	}

	# check for 2 args first
	if (my ($first, $second) = $seg =~ m/^\xab\\${macro}\{([^\xbb]*)\}\{([^\xbb]*)\}\xbb$/) {
		return "\xab\\${macro}\{".texquote($first)."\}\{".texquote($second)."\}\xbb";
	}
	if (my ($only) = $seg =~ m/^\xab\\${macro}\{([^\xbb]*)\}\xbb$/) {
		return "\xab\\${macro}\{".texquote($only)."\}\xbb";
	}

	# plain text, or something that is not an invocation of $macro
	return $seg;
}

# texquote($text)
#
# Return $text escaped for use as a TeX macro argument:
#   % { } _ # & $ ^   get a leading backslash,
#   < > | * ~         are wrapped as {$c$},
#   \                 becomes $\backslash$.
sub texquote {
	my ($text) = @_;
	my ($esctest) = "";
	my ($escbase) = "BaCkSlAsH";
	my ($esccount) = 0;

	#$verbose++;
	if ($verbose > 0) {
		printf STDERR "\ntexquote: '%s' -> ",$text;
	}

	# Park the backslashes behind a placeholder so that the escaping below
	# neither touches them nor has its own backslashes mistaken for input.
	# Keep counting up until the placeholder does not occur in the text.
	if ($text =~ m/\\/) {
		$esctest=sprintf "%s%d",$escbase,$esccount++;
		while ($text =~ m/$esctest/) {
			$esctest=sprintf "%s%d",$escbase,$esccount++;
		}
		$text =~ s/\\/$esctest/g;
		if ($verbose > 0) {
			printf STDERR "'%s' -> ",$text;
		}
	}

	$text =~ s/([%\{\}_#\&\$\^])/\\$1/g;
	$text =~ s/([<>\|\*~])/\{\$$1\$\}/g;

	# $esccount is only non-zero when a placeholder was substituted above
	if ($esccount > 0) {
		$text =~ s/$esctest/\$\\backslash\$/g;
	}
	if ($verbose > 0) {
		printf STDERR "'%s'\n",$text;
	}
	#$verbose--;

	return $text;
}