#! /usr/bin/perl -w

# cstr-to-text.pl: change a cstr html file to a text file.
# Gordon Paynter (gwp@cs.waikato.ac.nz)

# On the lucy/rose/borg change first line to:
#! /usr/local/bin/perl -w

# Version 1    1998 Oct 23  First distributed, give or take a version.
# Version 1.01 1998 Nov 17  New page number form: "<p>..."
# Version 1.02 1998 Nov 17  Bug: <p> at end of line.

# ---------------------------------------------------------------------------
# Main script.
#
# Usage: cstr-to-text.pl <infile> <outfile>
#
# Reads the cstr HTML file <infile> line by line, drops the page furniture
# that prescript leaves behind (page-number and end-of-page comments, bare
# page-number paragraphs), suppresses the paragraph marker that such a page
# break -- or a hyphenated line end -- would otherwise introduce, and writes
# the surviving lines to a scratch file.  The scratch file is then rendered
# to plain text with "lynx -force_html -dump" and the result is stored in
# <outfile>.
#
# NOTE(review): this section had been damaged by an HTML tag stripper.
#   * Every literal "<p>" in the patterns had been replaced by a blank; the
#     patterns below put "<p>" back, which is what the inline comments and
#     the name $ignore_next_paragraph_marker describe.  Lower case is
#     assumed -- TODO confirm against real prescript output.
#   * Everything between the usage message and the first page-marker test
#     was lost (argument names, open calls, loop header).  Those lines are a
#     reconstruction; in particular the original scratch-file name is
#     unknown, so a File::Temp file is used instead.
# ---------------------------------------------------------------------------
use strict;
use warnings;
use File::Temp ();

if (@ARGV < 2) {
    die "Usage: cstr-to-text.pl <infile> <outfile>\n";
}
my ($infile, $outfile) = @ARGV;

open(my $in, '<', $infile)
    or die "cstr-to-text.pl: cannot read $infile: $!\n";

# Scratch file for the filtered HTML.  File::Temp removes it automatically
# when $tmp is destroyed, which replaces the old explicit "rm".
my $tmp     = File::Temp->new(SUFFIX => '.html');
my $tmpfile = $tmp->filename;

# True while the next line-initial "<p>" must be dropped because it was
# produced by a page break or a hyphenated line end, not by a real paragraph.
my $ignore_next_paragraph_marker = 0;

LINE:
while (defined(my $line = <$in>)) {

    # Page number (prescript 2.1) or end-of-page (prescript up to and
    # including 2.1) comment.  The line is dropped; the following paragraph
    # marker is kept only when this line itself ended with one.
    if ($line =~ m{^<!--(?:Page No|End Of Page)-->}) {
        $ignore_next_paragraph_marker = ($line =~ m{<p>\s*$}) ? 0 : 1;
        next LINE;
    }

    # Page number written as a paragraph mark followed by a single number.
    if ($line =~ m{^<p>\d+$}) {
        $ignore_next_paragraph_marker = 1;
        next LINE;
    }

    # Page number in yet another form: "<p>- 12 -".
    if ($line =~ m{^<p>- \d+ -$}) {
        $ignore_next_paragraph_marker = 1;
        next LINE;
    }

    # Swallow the one paragraph marker we were asked to ignore.
    if ($ignore_next_paragraph_marker && $line =~ m{^<p>}) {
        $line =~ s{^<p>}{};
        $ignore_next_paragraph_marker = 0;
    }

    # A line ending in a hyphenated word continues on the next line, so a
    # paragraph marker opening that next line is spurious.
    if ($line =~ m{[A-Za-z0-9]-$}) {
        $ignore_next_paragraph_marker = 1;
    }

    print {$tmp} $line
        or die "cstr-to-text.pl: cannot write $tmpfile: $!\n";
}

close($in);
close($tmp)
    or die "cstr-to-text.pl: cannot finish writing $tmpfile: $!\n";

# Render the filtered HTML as text.  The list form of the pipe open runs
# lynx directly, without a shell, so file names containing blanks or shell
# metacharacters are passed through untouched.
open(my $lynx, '-|', 'lynx', '-force_html', '-dump', $tmpfile)
    or die "cstr-to-text.pl: cannot run lynx: $!\n";
open(my $out, '>', $outfile)
    or die "cstr-to-text.pl: cannot write $outfile: $!\n";

while (defined(my $text = <$lynx>)) {
    print {$out} $text
        or die "cstr-to-text.pl: cannot write $outfile: $!\n";
}

close($out)
    or die "cstr-to-text.pl: cannot finish writing $outfile: $!\n";
close($lynx)
    or die "cstr-to-text.pl: lynx failed (wait status $?)\n";