#! /usr/bin/perl -w

# cstr-to-text.pl: change a cstr html file to a text file.
# Gordon Paynter ([email protected])

# On the lucy/rose/borg change first line to: #! /usr/local/bin/perl -w

# Version 1    1998 Oct 23  First distributed, give or take a version.
# Version 1.01 1998 Nov 17  New page number form: "<p><!--Page No-->..."
# Version 1.02 1998 Nov 17  Bug: <p> at end of line.

# Usage: cstr-to-text.pl <input-cstr-file> <output-text-file>
#
# The script works in two stages:
#   1. Copy the input to a temporary file, dropping page-number and
#      end-of-page marker lines and removing the "<p>" that would otherwise
#      split a paragraph across a page break or a hyphenated line end.
#   2. Render the temporary file with "lynx -force_html -dump" and store
#      lynx's output in the output file.
# Any I/O failure, or a non-zero exit from lynx, aborts the script.

use strict;
use warnings;
use File::Temp qw(tempfile);

my ($infile, $outfile) = @ARGV;

# Test length rather than truth so that a file literally named "0" works.
if (   !defined $infile
    || !length $infile
    || !defined $outfile
    || !length $outfile)
{
    die "Usage: cstr-to-text.pl <input-cstr-file> <output-text-file>\n";
}

# print STDERR "Preparing text file $infile in $outfile\n";

open(my $in, '<', $infile)
    or die "cstr-to-text.pl: cannot read $infile: $!\n";

# File::Temp creates the file exclusively under an unpredictable name (the
# old fixed "/tmp/c2t.<pid>" name could be pre-created or symlinked by
# another user) and UNLINK removes it at exit, including after a die.
my ($tmp_fh, $tmpfile) = tempfile('c2tXXXXXX', TMPDIR => 1, UNLINK => 1);

# When true, a "<p>" at the start of the next output line is removed.
my $ignore_next_paragraph_marker = 0;

LINE:
while (my $line = <$in>) {

    # Page number generated by prescript before 2.1.
    if ($line =~ /^<!--Page No-->/) {
        $ignore_next_paragraph_marker = 1;
        next LINE;
    }

    # Page number generated by prescript 2.1, or an end-of-page marker
    # (prescript up to and including 2.1).  If the marker line itself ends
    # in "<p>", the paragraph break is genuine and the next "<p>" is kept.
    if ($line =~ /^(?:<p><!--Page No-->|<!--End Of Page-->)/) {
        $ignore_next_paragraph_marker = ($line =~ /<p>\s*$/) ? 0 : 1;
        next LINE;
    }

    # Page number as a paragraph mark followed by a bare number ("<p>12"),
    # or in the form "<p>- 12 -".
    if ($line =~ /^<p>(?:\d+|- \d+ -)$/) {
        $ignore_next_paragraph_marker = 1;
        next LINE;
    }

    # Drop the paragraph marker that a preceding page marker or hyphenated
    # line asked us to ignore.
    if ($ignore_next_paragraph_marker && $line =~ s/^<p>//) {
        $ignore_next_paragraph_marker = 0;
    }

    # Hyphenation: a line ending in "<alphanumeric>-" continues on the next
    # line, so a "<p>" there is not a real paragraph break.
    if ($line =~ /[A-Za-z0-9]-$/) {
        $ignore_next_paragraph_marker = 1;
    }

    print {$tmp_fh} $line
        or die "cstr-to-text.pl: cannot write $tmpfile: $!\n";
}

close($in)
    or die "cstr-to-text.pl: error reading $infile: $!\n";

# Must be closed (flushed) before lynx reads it.
close($tmp_fh)
    or die "cstr-to-text.pl: cannot write $tmpfile: $!\n";

# Run lynx through a list-form pipe: no shell is involved, so file names
# containing spaces or shell metacharacters are passed through untouched.
open(my $lynx, '-|', 'lynx', '-force_html', '-dump', $tmpfile)
    or die "cstr-to-text.pl: cannot run lynx: $!\n";

# The output file is opened only now, after the input has been fully
# consumed, so using the same path for input and output still works.
open(my $out, '>', $outfile)
    or die "cstr-to-text.pl: cannot write $outfile: $!\n";

while (my $text = <$lynx>) {
    print {$out} $text
        or die "cstr-to-text.pl: cannot write $outfile: $!\n";
}

close($out)
    or die "cstr-to-text.pl: cannot write $outfile: $!\n";

# For a pipe, close fails with $! == 0 when the child exited non-zero.
close($lynx)
    or die $!
        ? "cstr-to-text.pl: error reading from lynx: $!\n"
        : "cstr-to-text.pl: lynx exited with status " . ($? >> 8) . "\n";