source: trunk/gsdl/perllib/plugins/HTMLPlug.pm@ 1435

Last change on this file since 1435 was 1435, checked in by davidb, 24 years ago

Rearrangement of ConvertTo inheritence so HTMLPlug and TextPlug do not need
to know anything about the conversion process.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 15.3 KB
Line 
1###########################################################################
2#
3# HTMLPlug.pm -- basic html plugin
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 1999 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27#
28# Note that this plugin handles frames only in a very simple way
29# i.e. each frame is treated as a separate document. This means
30# search results will contain links to individual frames rather
31# than linking to the top level frameset.
32# There may also be some problems caused by the _parent target
33# (it's removed by this plugin)
34# To use frames properly you'll need to use the WebPlug plugin.
35#
36
37
38package HTMLPlug;
39
40use BasPlug;
41use ghtml;
42use util;
43use parsargv;
44
45sub BEGIN {
46 @ISA = ('BasPlug');
47}
48
49sub print_usage {
50 print STDERR "\n usage: plugin HTMLPlug [options]\n\n";
51 print STDERR " options:\n";
52 print STDERR " -nolinks Don't make any attempt to trap links (setting this flag may\n";
53 print STDERR " improve speed of building/importing but any relative links within\n";
54 print STDERR " documents will be broken).\n";
55 print STDERR " -keep_head Don't remove headers from html files.\n";
56 print STDERR " -no_metadata Don't attempt to extract any metadata from files.\n";
57 print STDERR " -metadata_fields Comma separated list of metadata fields to attempt to extract.\n";
58 print STDERR " Defaults to 'Title'.\n";
59 print STDERR " Use `first200` to get the first 200 characters of the body.\n";
60 print STDERR " Use `H1` to get the text inside the first <H1> and </H1> tags in the text.\n";
61 print STDERR " -w3mir Set if w3mir was used to generate input file structure.\n";
62 print STDERR " -assoc_files Perl regular expression of file extensions to associate with\n";
63 print STDERR " html documents. Defaults to '(?i)\.(jpe?g|gif|png|css|pdf)\$'\n";
64 print STDERR " -rename_assoc_files Renames files associated with documents (e.g. images). Also\n";
65 print STDERR " creates much shallower directory structure (useful when creating\n";
66 print STDERR " collections to go on cd-rom).\n\n";
67 print STDERR " -title_sub Substitution expression to modify string stored as Title.\n";
68 print STDERR " Used by, for example, PDFHtml to remove Page 1 etc from text\n";
69 print STDERR " chosen to be used as the title.\n";
70}
71
72sub new {
73 my $class = shift (@_);
74 my $self = new BasPlug ($class, @_);
75
76 if (!parsargv::parse(\@_,
77 q^nolinks^, \$self->{'nolinks'},
78 q^keep_head^, \$self->{'keep_head'},
79 q^no_metadata^, \$self->{'no_metadata'},
80 q^metadata_fields/.*/Title^, \$self->{'metadata_fields'},
81 q^w3mir^, \$self->{'w3mir'},
82 q^assoc_files/.*/(?i)\.(jpe?g|gif|png|css|pdf)$^, \$self->{'assoc_files'},
83 q^rename_assoc_files^, \$self->{'rename_assoc_files'},
84 q^title_sub/.*/^, \$self->{'title_sub'},
85 "allow_extra_options")) {
86
87 print STDERR "\nIncorrect options passed to HTMLPlug, check your collect.cfg configuration file\n";
88 &print_usage();
89 die "\n";
90 }
91
92 $self->{'aux_files'} = {};
93 $self->{'dir_num'} = 0;
94 $self->{'file_num'} = 0;
95
96 return bless $self, $class;
97}
98
99
100sub get_default_block_exp {
101 my $self = shift (@_);
102
103 return q^(?i)\.(gif|jpe?g|png|pdf|rtf|css)$^;
104}
105
106sub get_default_process_exp {
107 my $self = shift (@_);
108
109 # the last option is an attempt to encode the concept of an html query ...
110 return q^(?i)(\.html?|\.shtml|\.shm|\.asp|\.php|\.cgi|.+\?.+=.*)$^;
111}
112
113
114# do plugin specific processing of doc_obj
115sub process {
116 my $self = shift (@_);
117 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
118 my $outhandle = $self->{'outhandle'};
119
120 print $outhandle "HTMLPlug: processing $file\n"
121 if $self->{'verbosity'} > 1;
122
123 my $cursection = $doc_obj->get_top_section();
124
125 $self->extract_metadata ($textref, $metadata, $doc_obj, $cursection)
126 unless $self->{'no_metadata'};
127
128 # Store URL for page as metadata - this can be used for an
129 # altavista style search interface. The URL won't be valid
130 # unless the file structure contains the domain name (i.e.
131 # like when w3mir is used to download a website).
132 my $web_url = "http://$file";
133 $web_url =~ s/\\/\//g; # for windows
134 $doc_obj->add_utf8_metadata($cursection, "URL", $web_url);
135
136 # remove header and footer
137 if (!$self->{'keep_head'}) {
138 $$textref =~ s/^.*?<body[^>]*>//is;
139 $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
140 }
141
142 # trap links
143 if (!$self->{'nolinks'}) {
144
145 # usemap="./#index" not handled correctly => change to "#index"
146 $$textref =~ s/(<img[^>]*?usemap\s*=\s*\"?)([^\">\s]+)(\"?[^>]*>)/
147 $self->replace_usemap_links($1, $2, $3)/isge;
148
149 $$textref =~ s/(<(?:a|area|frame|link)\s+[^>]*?\s*(?:href|src)\s*=\s*\"?)([^\">\s]+)(\"?[^>]*>)/
150 $self->replace_href_links ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
151 }
152
153 # trap images
154 $$textref =~ s/(<img[^>]*? src\s*=\s*\"?)([^\">\s]+)(\"?[^>]*>)/
155 $self->replace_images ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
156
157 # add text to document object
158 $doc_obj->add_utf8_text($cursection, $$textref);
159
160 return 1;
161}
162
163sub replace_images {
164 my $self = shift (@_);
165 my ($front, $link, $back, $base_dir,
166 $file, $doc_obj, $section) = @_;
167
168 $link =~ s/\n/ /g;
169
170 my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
171 return $front . $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section) . $back;
172}
173
174sub replace_href_links {
175 my $self = shift (@_);
176 my ($front, $link, $back, $base_dir, $file, $doc_obj, $section) = @_;
177
178 # attempt to sort out targets - frames are not handled
179 # well in this plugin and some cases will screw things
180 # up - e.g. the _parent target (so we'll just remove
181 # them all ;-)
182 $front =~ s/(target=\"?)_top(\"?)/$1_gsdltop_$2/is;
183 $back =~ s/(target=\"?)_top(\"?)/$1_gsdltop_$2/is;
184 $front =~ s/target=\"?_parent\"?//is;
185 $back =~ s/target=\"?_parent\"?//is;
186
187 return $front . $link . $back if $link =~ /^\#/s;
188 $link =~ s/\n/ /g;
189
190 my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
191 # href may use '\'s where '/'s should be on Windows
192 $href =~ s/\\/\//g;
193
194 my ($filename) = $href =~ /^(?:.*?):(?:\/\/)?(.*)/;
195
196 ##### leave all these links alone (they won't be picked up by intermediate
197 ##### pages). I think that's safest when dealing with frames, targets etc.
198 ##### (at least until I think of a better way to do it). Problems occur with
199 ##### mailto links from within small frames, the intermediate page is displayed
200 ##### within that frame and can't be seen. There is still potential for this to
201 ##### happen even with html pages - the solution seems to be to somehow tell
202 ##### the browser from the server side to display the page being sent (i.e.
203 ##### the intermediate page) in the top level window - I'm not sure if that's
204 ##### possible - the following line should probably be deleted if that can be done
205 return $front . $link . $back if $href =~ /^(mailto|news|gopher|nntp|telnet|javascript):/is;
206
207
208 if (($rl == 0) || ($filename =~ /$self->{'process_exp'}/) ||
209 ($href =~ /\/$/) || ($href =~ /^(mailto|news|gopher|nntp|telnet|javascript):/i)) {
210 &ghtml::urlsafe ($href);
211 return $front . "_httpextlink_&rl=" . $rl . "&href=" . $href . $hash_part . $back;
212
213 } else {
214 # link is to some other type of file (image, pdf etc.) so we'll
215 # need to associate that file
216 return $front . $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section) . $back;
217 }
218}
219
220sub add_file {
221 my $self = shift (@_);
222 my ($href, $rl, $hash_part, $base_dir, $doc_obj, $section) = @_;
223 my ($newname);
224
225 my $filename = $href;
226 $filename =~ s/^[^:]*:\/\///;
227 $filename = &util::filename_cat($base_dir, $filename);
228
229 my ($ext) = $filename =~ /(\.[^\.]*)$/;
230
231 if ((!defined $ext) || ($ext !~ /$self->{'assoc_files'}/)) {
232 return "_httpextlink_&rl=" . $rl . "&href=" . $href . $hash_part;
233 }
234
235 if ($self->{'rename_assoc_files'}) {
236 if (defined $self->{'aux_files'}->{$href}) {
237 $newname = $self->{'aux_files'}->{$href}->{'dir_num'} . "/" .
238 $self->{'aux_files'}->{$href}->{'file_num'} . $ext;
239 } else {
240 $newname = $self->{'dir_num'} . "/" . $self->{'file_num'} . $ext;
241 $self->{'aux_files'}->{$href} = {'dir_num' => $self->{'dir_num'}, 'file_num' => $self->{'file_num'}};
242 $self->inc_filecount ();
243 }
244 $doc_obj->associate_file($filename, $newname, undef, $section);
245 return "_httpcollimg_/$newname";
246
247 } else {
248 ($newname) = $filename =~ /([^\/\\]*)$/;
249 $doc_obj->associate_file($filename, $newname, undef, $section);
250 return "_httpdocimg_/$newname";
251 }
252}
253
254
255sub format_link {
256 my $self = shift (@_);
257 my ($link, $base_dir, $file) = @_;
258
259 my ($before_hash, $hash_part) = $link =~ /^([^\#]*)(\#?.*)$/;
260 $hash_part = "" if !defined $hash_part;
261 if (!defined $before_hash || $before_hash !~ /[\w\.\/]/) {
262 my $outhandle = $self->{'outhandle'};
263 print $outhandle "HTMLPlug: ERROR - badly formatted tag ignored ($link)\n"
264 if $self->{'verbosity'};
265 return ($link, "", 0);
266 }
267
268 if ($before_hash =~ s/^((?:http|ftp|file):\/\/)//i) {
269 my $type = $1;
270
271 if ($link =~ /^(http|ftp):/i) {
272 # Turn url (using /) into file name (possibly using \ on windows)
273 my @http_dir_split = split('/', $before_hash);
274 $before_hash = &util::filename_cat(@http_dir_split);
275 }
276
277 $before_hash = $self->eval_dir_dots($before_hash);
278
279 my $linkfilename = &util::filename_cat ($base_dir, $before_hash);
280
281 my $rl = 0;
282 $rl = 1 if (-e $linkfilename);
283
284 # make sure there's a slash on the end if it's a directory
285 if ($before_hash !~ /\/$/) {
286 $before_hash .= "/" if (-d $linkfilename);
287 }
288
289 return ($type . $before_hash, $hash_part, $rl);
290
291 } elsif ($link !~ /^(mailto|news|gopher|nntp|telnet|javascript):/i) {
292
293 if ($before_hash =~ s/^\///) {
294 # the first directory will be the domain name if w3mir was used
295 # to generate archives, otherwise we'll assume all files are
296 # from the same site and base_dir is the root
297 if ($self->{'w3mir'}) {
298 my @dirs = split /[\/\\]/, $file;
299 my $domname = shift (@dirs);
300 $before_hash = &util::filename_cat($domname, $before_hash);
301 $before_hash =~ s/\\/\//g; # for windows
302 }
303 else
304 {
305 # see if link shares directory with source document
306 # => turn into relative link if this is so!
307 my $before_hash_fix = &util::filename_cat("",$before_hash);
308 $before_hash_fix =~ s/^$base_dir(\\|\/)//;
309 $before_hash = $before_hash_fix;
310
311 }
312
313
314 } else {
315 # Turn relative file path into full path
316 my $dirname = &File::Basename::dirname($file);
317 $before_hash = &util::filename_cat($dirname, $before_hash);
318 $before_hash = $self->eval_dir_dots($before_hash);
319 }
320
321
322 my $linkfilename = &util::filename_cat ($base_dir, $before_hash);
323
324 # make sure there's a slash on the end if it's a directory
325 if ($before_hash !~ /\/$/) {
326 $before_hash .= "/" if (-d $linkfilename);
327 }
328
329 return ("http://" . $before_hash, $hash_part, 1);
330
331 } else {
332 # mailto, news, nntp, telnet, javascript or gopher link
333 return ($before_hash, "", 0);
334 }
335}
336
337sub extract_metadata {
338 my $self = shift (@_);
339 my ($textref, $metadata, $doc_obj, $section) = @_;
340
341 return if (!defined $self->{'metadata_fields'});
342
343 foreach my $field (split /,/, $self->{'metadata_fields'}) {
344
345 # don't need to extract field if it was passed in from a previous
346 # (recursive) plugin
347 next if defined $metadata->{$field};
348
349 # see if there's a <meta> tag for this field
350 if ($$textref =~ /<meta(.*?)(?:name|http-equiv)\s*=\s*\"?$field\"?([^>]*)/is) {
351 my $content = $1 . $2;
352 if ($content =~ /content\s*=\s*\"?(.*?)\"?/is) {
353 if (defined $1) {
354 my $value = $1;
355 $value =~ s/\s+/ /gs;
356 $doc_obj->add_utf8_metadata($section, $field, $value);
357 next;
358 }
359 }
360 }
361
362 # TITLE: extract the document title
363
364 if ($field =~ /^title$/i) {
365
366 # see if there's a <title> tag
367 if ($$textref =~ /<title[^>]*>([^<]*)<\/title[^>]*>/is) {
368 if (defined $1) {
369 my $title = $1;
370 if ($title =~ /\w/) {
371 $title =~ s/\s+/ /gs;
372 $title =~ s/^\s+//;
373 $title =~ s/\s+$//;
374 $doc_obj->add_utf8_metadata ($section, $field, $title);
375 next;
376 }
377 }
378 }
379
380 # if no title use first 100 characters
381 my $tmptext = $$textref;
382 $tmptext =~ s/\s+/ /gs;
383 $tmptext =~ s/$self->{'title_sub'}// if (defined $self->{'title_sub'});
384 $tmptext =~ s/<[^>]*>//g;
385 $tmptext = substr ($tmptext, 0, 100);
386 $tmptext =~ s/^\s+//;
387 $tmptext =~ s/\s+$//;
388 $tmptext =~ s/\s\S*$/.../;
389 $doc_obj->add_utf8_metadata ($section, $field, $tmptext);
390 next;
391 }
392
393 # FIRST200: extract the first 200 characters as metadata
394
395 if ($field =~ /^first200$/i) {
396 my $tmptext = $$textref;
397 $tmptext =~ s/\s+/ /gs;
398 $tmptext =~ s/.*<body[^>]*>//i;
399 $tmptext =~ s/$self->{'title_sub'}// if (defined $self->{'title_sub'});
400 $tmptext =~ s/<[^>]*>//g;
401 $tmptext = substr ($tmptext, 0, 200);
402 $tmptext =~ s/^\s+//;
403 $tmptext =~ s/\s+$//;
404 $tmptext =~ s/\s\S*$/.../;
405 $doc_obj->add_utf8_metadata ($section, $field, $tmptext);
406 next;
407 }
408
409 # H1: extract the text between the first <H1> and </H1> tags
410 if ($field =~ /^H1$/i) {
411 my $tmptext = $$textref;
412 $tmptext =~ s/\s+/ /gs;
413 if ($tmptext =~ /<H1[^>]*>/i) {
414 $tmptext =~ s/.*<H1[^>]*>//i;
415 $tmptext =~ s/<\/H1[^>]*>.*//i;
416 $tmptext =~ s/^\s+//;
417 $tmptext =~ s/\s+$//;
418 $doc_obj->add_utf8_metadata ($section, $field, $tmptext);
419 }
420 next;
421 }
422 }
423}
424
425
426# evaluate any "../" to next directory up
427# evaluate any "./" as here
428sub eval_dir_dots {
429 my $self = shift (@_);
430 my ($filename) = @_;
431
432 my $dirsep_os = &util::get_os_dirsep();
433 my @dirsep = split(/$dirsep_os/,$filename);
434
435 my @eval_dirs = ();
436 foreach my $d (@dirsep) {
437 if ($d eq "..") {
438 pop(@eval_dirs);
439
440 } elsif ($d eq ".") {
441 # do nothing!
442
443 } else {
444 push(@eval_dirs,$d);
445 }
446 }
447
448 return &util::filename_cat(@eval_dirs);
449}
450
451sub replace_usemap_links {
452 my $self = shift (@_);
453 my ($front, $link, $back) = @_;
454
455 $link =~ s/^\.\///;
456 return $front . $link . $back;
457}
458
459sub inc_filecount {
460 my $self = shift (@_);
461
462 if ($self->{'file_num'} == 1000) {
463 $self->{'dir_num'} ++;
464 $self->{'file_num'} = 0;
465 } else {
466 $self->{'file_num'} ++;
467 }
468}
469
4701;
Note: See TracBrowser for help on using the repository browser.