Last change
on this file since 2308 was 1972, checked in by jmt14, 23 years ago |
* empty log message *
|
-
Property svn:executable
set to
*
-
Property svn:keywords
set to
Author Date Id Revision
|
File size:
953 bytes
|
Line | |
---|
#!/usr/bin/perl -w

# convert an html file to a text file
#
# Usage: pass the path of one HTML file as the first argument.
#
# The file is rendered with "lynx -dump" and the resulting text is printed
# to standard output, with each [INLINE], [IMAGE] and [LINK] placeholder
# that lynx emits replaced by ". ".

# Version
# 1   1999 Aug 24  First version. Compensates for files Lynx cannot parse.
# 1.1 1999 Aug 24  Instead of special cases, put a time limit on lynx.

use strict;
use warnings;

# CPU-time limit (seconds) handed to "ulimit -t" so that a file lynx cannot
# parse does not keep it running forever.
my $CPU_LIMIT_SECONDS = 300;

die "usage: $0 HTML_FILE\n" unless @ARGV;
my $filename = $ARGV[0];
die "$0: no such file: $filename\n" unless -e $filename;

# The special cases below are disabled; the time limit on lynx replaces them
# (see version 1.1 above).

# Lynx can't handle framesets.  Sorry.
#$frameset = `grep "<FRAMESET" $filename`;
#exit if ($frameset =~ /./);

# Lynx can't handle files with no body.  Sorry.
#$size = `wc $filename`;
#($lines, $words, $chars) = $size =~ /^\s+(\d+)\s+(\d+)\s+(\d+)/;
#exit unless ($lines && $words && $chars);
#exit if ($lines < 10);
#exit if ($words < 10);
#exit if ($chars < 10);

# convert the html file to text with lynx
#
# A shell is still needed for "ulimit", but the file name is handed to it as
# the positional parameter "$1" instead of being pasted into the command
# text, so spaces and shell metacharacters in the name are harmless.
# "exec" replaces the shell with lynx; the CPU limit set by ulimit stays in
# force for the exec'd process.
my $shell_script = 'ulimit -t ' . $CPU_LIMIT_SECONDS
                 . '; exec lynx -force_html -nolist -dump "$1"';

# Reading lynx's output through a pipe avoids creating (and having to clean
# up) a scratch file next to the input file.
open(my $lynx_out, '-|', '/bin/sh', '-c', $shell_script, 'sh', $filename)
    or die "$0: cannot run lynx on $filename: $!\n";

while (my $line = <$lynx_out>) {

    # remove the [IMAGE], [LINK], and [INLINE] markers
    $line =~ s/\[(?:INLINE|IMAGE|LINK)\]/. /g;

    print $line;
}

# close() on a pipe waits for the child and returns false if it exited
# non-zero or was killed (e.g. by the CPU limit).  Whatever text was produced
# has already been printed, so only report the problem; do not abort.
close($lynx_out)
    or warn "$0: lynx did not finish cleanly on $filename ("
          . ($! ? "close failed: $!" : "wait status $?") . ")\n";
|
---|
Note:
See
TracBrowser
for help on using the repository browser.