#! /usr/bin/perl -w

# cstr-to-text.pl: change a cstr html file to a text file.
# Gordon Paynter (gwp@cs.waikato.ac.nz)
#
# On the lucy/rose/borg change first line to:
#   #! /usr/local/bin/perl -w
#
# Version 1     1998 Oct 23  First distributed, give or take a version.
# Version 1.01  1998 Nov 17  New page number form: "<p>..."
# Version 1.02  1998 Nov 17  Bug: <p> at end of line.
#
# Usage: cstr-to-text.pl <infile> <outfile>
#
# The script works in two stages:
#   1. Copy <infile> to a temporary file, dropping the page-number and
#      end-of-page lines and removing the paragraph marker that follows
#      them (or that follows a line ending in a hyphen), so that text
#      split across a page break or a hyphenated line break is not broken
#      into two paragraphs.
#   2. Run "lynx -force_html -dump" on the temporary file and store its
#      output in <outfile>.
#
# NOTE(review): this copy of the script had been through an HTML-to-text
# pass that stripped every angle-bracket token.  The paragraph marker
# ("<p>"), the input read ("<IN>") and the usage arguments were
# reconstructed from the changelog and the inline comments -- confirm the
# exact spelling/case of the paragraph tag against real prescript output.

use strict;
use warnings;
use File::Temp ();

# Paragraph marker emitted by prescript.  Kept in one place so it can be
# corrected easily (presumably lower-case "<p>"; verify).
my $PARA = qr/<p>/;

my ($infile, $outfile) = @ARGV;
if (   !defined $infile
    || !defined $outfile
    || !length $infile
    || !length $outfile)
{
    die "Usage: cstr-to-text.pl <infile> <outfile>\n";
}

# print STDERR "Preparing text file $infile in $outfile\n";

open(my $in, '<', $infile)
    or die "cstr-to-text.pl: cannot read $infile: $!\n";

# An unpredictable temporary file, created safely and removed automatically
# when the script exits (replaces the guessable /tmp/c2t.<pid> name).
my ($tmp, $tmpfile) = File::Temp::tempfile(
    'c2t.XXXXXX',
    SUFFIX => '.html',
    TMPDIR => 1,
    UNLINK => 1,
);

# True while the next leading paragraph marker should be removed because
# the text before it continues across a page break or hyphenation.
my $ignore_next_paragraph_marker = 0;

while (my $line = <$in>) {

    # Page number generated by prescript before 2.1
    if ($line =~ /^<!--Page No-->/) {
        $ignore_next_paragraph_marker = 1;
        next;
    }

    # Page number generated by prescript 2.1.  If the line itself ends in
    # a paragraph marker, the paragraph break is genuine: keep the next one.
    if ($line =~ /^$PARA<!--Page No-->/) {
        $ignore_next_paragraph_marker = ($line =~ /$PARA\s*$/) ? 0 : 1;
        next;
    }

    # End of Page generated by prescript (up to and including 2.1)
    if ($line =~ /^<!--End Of Page-->/) {
        $ignore_next_paragraph_marker = ($line =~ /$PARA\s*$/) ? 0 : 1;
        next;
    }

    # Page number as paragraph mark and single number
    if ($line =~ /^$PARA\d+$/) {
        $ignore_next_paragraph_marker = 1;
        next;
    }

    # Page number in yet another form: "- 12 -"
    if ($line =~ /^$PARA- \d+ -$/) {
        $ignore_next_paragraph_marker = 1;
        next;
    }

    # Drop the paragraph marker that merely resumes interrupted text.
    if ($ignore_next_paragraph_marker && $line =~ /^$PARA/) {
        $line =~ s/^$PARA//;
        $ignore_next_paragraph_marker = 0;
    }

    # Hyphenation: a word split at the end of the line continues on the
    # next one, so a following paragraph marker is not a real break.
    if ($line =~ /[A-Za-z0-9]-$/) {
        $ignore_next_paragraph_marker = 1;
    }

    print {$tmp} $line
        or die "cstr-to-text.pl: cannot write $tmpfile: $!\n";
}

close($in)
    or die "cstr-to-text.pl: error reading $infile: $!\n";
close($tmp)
    or die "cstr-to-text.pl: cannot write $tmpfile: $!\n";

# Render the cleaned HTML as text.  The list form of the pipe open runs
# lynx directly, without a shell, so file names containing spaces or shell
# metacharacters are passed through safely.
open(my $out, '>', $outfile)
    or die "cstr-to-text.pl: cannot write $outfile: $!\n";
open(my $lynx, '-|', 'lynx', '-force_html', '-dump', $tmpfile)
    or die "cstr-to-text.pl: cannot run lynx: $!\n";

while (my $text = <$lynx>) {
    print {$out} $text
        or die "cstr-to-text.pl: cannot write $outfile: $!\n";
}

# close() on a pipe returns false when the child exits non-zero; in that
# case $! is 0 and the exit status is in $?.
close($lynx)
    or die $!
        ? "cstr-to-text.pl: error closing lynx pipe: $!\n"
        : "cstr-to-text.pl: lynx exited with status " . ($? >> 8) . "\n";
close($out)
    or die "cstr-to-text.pl: cannot write $outfile: $!\n";

# Remove the temporary file now (UNLINK => 1 is only a safety net).
unlink $tmpfile;