[1972] | 1 | #! /usr/bin/perl -w
|
---|
| 2 |
|
---|
| 3 | # cstr-to-text.pl: change a cstr html file to a text file.
|
---|
| 4 | # Gordon Paynter ([email protected])
|
---|
| 5 |
|
---|
| 6 | # On the lucy/rose/borg change first line to: #! /usr/local/bin/perl -w
|
---|
| 7 |
|
---|
| 8 | # Version 1 1998 Oct 23 First distributed, give or take a version.
|
---|
| 9 | # Version 1.01 1998 Nov 17 New page number form: "<p><!--Page No-->..."
|
---|
| 10 | # Version 1.02 1998 Nov 17 Bug: <p> at end of line.
|
---|
| 11 |
|
---|
use strict;
use warnings;
use File::Temp ();

# Main script: strip page-number / end-of-page markers from a CSTR HTML
# file, write the cleaned HTML to a temporary file, then render that file
# to plain text with "lynx -force_html -dump".
#
# Arguments (from @ARGV):
#   $ARGV[0]  input CSTR HTML file
#   $ARGV[1]  output text file (created or overwritten)
#
# Dies with a message on a usage error, on any file open/write/close
# failure, or when lynx cannot be run or exits with a non-zero status.

# Test the argument count and length rather than truthiness, so that a file
# literally named "0" is still accepted.
if (@ARGV < 2 || !length $ARGV[0] || !length $ARGV[1]) {
    die "Usage: cstr-to-text.pl <input-cstr-file> <output-text-file>\n";
}
my ($infile, $outfile) = @ARGV;

# print STDERR "Preparing text file $infile in $outfile\n";

open my $in, '<', $infile
    or die "Cannot open input file '$infile': $!\n";

# File::Temp picks an unpredictable name and creates the file exclusively,
# avoiding the race inherent in a fixed "/tmp/c2t.<pid>" path.  The file is
# removed automatically when $tmp goes out of scope, including after a die.
my $tmp = File::Temp->new(TEMPLATE => 'c2t.XXXXXX', TMPDIR => 1, UNLINK => 1);
my $tmpfile = $tmp->filename;

# When true, the next line that starts with "<p>" has that marker removed.
# The flag stays set until such a line is seen (or a marker line resets it).
my $ignore_next_paragraph_marker = 0;

LINE:
while (my $line = <$in>) {

    # Page number generated by prescript before 2.1.
    if ($line =~ /^<!--Page No-->/) {
        $ignore_next_paragraph_marker = 1;
        next LINE;
    }

    # Page number generated by prescript 2.1 ("<p><!--Page No-->...") and
    # end-of-page marker (prescript up to and including 2.1).  Both are
    # dropped; if the dropped line itself ends in "<p>", the following
    # paragraph marker is kept, otherwise it is suppressed.
    if ($line =~ /^<p><!--Page No-->/ || $line =~ /^<!--End Of Page-->/) {
        $ignore_next_paragraph_marker = ($line =~ /<p>\s*$/) ? 0 : 1;
        next LINE;
    }

    # Page number written as a paragraph mark followed by a bare number
    # ("<p>12") or by a dashed number ("<p>- 12 -").
    if ($line =~ /^<p>\d+$/ || $line =~ /^<p>- \d+ -$/) {
        $ignore_next_paragraph_marker = 1;
        next LINE;
    }

    # Remove the paragraph marker that a preceding marker line asked us to
    # ignore.
    if ($ignore_next_paragraph_marker && $line =~ /^<p>/) {
        $line =~ s/^<p>//;
        $ignore_next_paragraph_marker = 0;
    }

    # Hyphenation: a line ending in "<alnum>-" suppresses the next
    # paragraph marker.
    if ($line =~ /[A-Za-z0-9]-$/) {
        $ignore_next_paragraph_marker = 1;
    }

    print {$tmp} $line
        or die "Cannot write to temporary file '$tmpfile': $!\n";
}

close $in
    or die "Error reading input file '$infile': $!\n";
# Closing flushes the buffered HTML to disk before lynx reads the file.
close $tmp
    or die "Cannot close temporary file '$tmpfile': $!\n";

# The output file is opened only after the input has been fully consumed,
# so (as before) the input and output may name the same file.
open my $out, '>', $outfile
    or die "Cannot open output file '$outfile': $!\n";

# List-form pipe open runs lynx directly, without a shell, so unusual
# characters in file names cannot be interpreted as shell syntax.
open my $lynx, '-|', 'lynx', '-force_html', '-dump', $tmpfile
    or die "Cannot run lynx: $!\n";

while (my $text = <$lynx>) {
    print {$out} $text
        or die "Cannot write to output file '$outfile': $!\n";
}

# For a pipe, close() returns false if the child failed; $! is 0 when the
# failure is only a non-zero exit status, which is then found in $?.
close $lynx
    or die($! ? "Error closing lynx pipe: $!\n"
              : 'lynx exited with status ' . ($? >> 8) . "\n");
close $out
    or die "Cannot close output file '$outfile': $!\n";
|
---|
| 89 |
|
---|