source: trunk/gsdl/perllib/plugins/HTMLPlug.pm@1206

Last change on this file since 1206 was 1190, checked in by gwp, 24 years ago

The first 200 chars of body text can now be extracted as metadata
by adding 'first200' to the -metadata_fields argument. A potential
problem extracting the title was resolved.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 15.3 KB
###########################################################################
#
# HTMLPlug.pm -- basic html plugin
#
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the
# University of Waikato, New Zealand.
#
# Copyright (C) 1999 New Zealand Digital Library Project
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################

#
# Note that this plugin handles frames only in a very simple way,
# i.e. each frame is treated as a separate document. This means
# search results will contain links to individual frames rather
# than linking to the top-level frameset.
# There may also be some problems caused by the _parent target
# (it is removed by this plugin).
# To use frames properly you'll need to use the WebPlug plugin.
#

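# A hedged configuration sketch (option values are illustrative, not taken
# from this file): in a collection's collect.cfg this plugin might be
# enabled with a line such as
#
#   plugin HTMLPlug -metadata_fields Title,first200
#
# which processes .htm/.html files and also stores the first 200 characters
# of each document's body as 'first200' metadata.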

package HTMLPlug;

use BasPlug;
use ghtml;
use util;
use parsargv;

sub BEGIN {
    @ISA = ('BasPlug');
}

use strict;

sub print_usage {
    print STDERR "\nIncorrect options passed to HTMLPlug, check your collect.cfg configuration file\n";

    print STDERR "\n usage: plugin HTMLPlug [options]\n\n";
    print STDERR " options:\n";
    print STDERR "   -process_exp         A perl regular expression to match against filenames.\n";
    print STDERR "                        Matching filenames will be processed by this plugin.\n";
    print STDERR "                        Defaults to '(?i)\.html?\$' i.e. all documents ending in\n";
    print STDERR "                        .htm or .html (case-insensitive).\n";
    print STDERR "   -nolinks             Don't make any attempt to trap links (setting this flag may\n";
    print STDERR "                        improve speed of building/importing but any relative links within\n";
    print STDERR "                        documents will be broken).\n";
    print STDERR "   -block_exp           Files matching this regular expression will be blocked from\n";
    print STDERR "                        being passed to any further plugins in the list. By default\n";
    print STDERR "                        HTMLPlug blocks any files with .gif, .jpg, .jpeg, .png, .pdf,\n";
    print STDERR "                        .rtf or .css file extensions.\n";
    print STDERR "   -keep_head           Don't remove headers from html files.\n";
    print STDERR "   -no_metadata         Don't attempt to extract any metadata from files.\n";
    print STDERR "   -metadata_fields     Comma-separated list of metadata fields to attempt to extract.\n";
    print STDERR "                        Defaults to 'Title'.\n";
    print STDERR "                        Use 'first200' to get the first 200 characters of the body.\n";
    print STDERR "   -w3mir               Set if w3mir was used to generate the input file structure.\n";
    print STDERR "   -assoc_files         Perl regular expression of file extensions to associate with\n";
    print STDERR "                        html documents. Defaults to '(?i)\.(jpe?g|gif|png|css|pdf)\$'\n";
    print STDERR "   -rename_assoc_files  Renames files associated with documents (e.g. images). Also\n";
    print STDERR "                        creates a much shallower directory structure (useful when creating\n";
    print STDERR "                        collections to go on cd-rom).\n\n";
}

sub new {
    my $class = shift (@_);
    my $self = new BasPlug ();

    if (!parsargv::parse(\@_,
                         q^process_exp/.*/(?i)\.html?$^, \$self->{'process_exp'},
                         q^nolinks^, \$self->{'nolinks'},
                         q^block_exp/.*/(?i)\.(gif|jpe?g|png|pdf|rtf|css)$^, \$self->{'block_exp'},
                         q^keep_head^, \$self->{'keep_head'},
                         q^no_metadata^, \$self->{'no_metadata'},
                         q^metadata_fields/.*/Title^, \$self->{'metadata_fields'},
                         q^w3mir^, \$self->{'w3mir'},
                         q^assoc_files/.*/(?i)\.(jpe?g|gif|png|css|pdf)$^, \$self->{'assoc_files'},
                         q^rename_assoc_files^, \$self->{'rename_assoc_files'})) {
        &print_usage();
        die "\n";
    }

    $self->{'aux_files'} = {};
    $self->{'dir_num'} = 0;
    $self->{'file_num'} = 0;

    return bless $self, $class;
}

sub is_recursive {
    my $self = shift (@_);

    return 0; # this is not a recursive plugin
}

# return number of files processed, undef if can't process
# Note that $base_dir might be "" and that $file might
# include directories
sub read {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $metadata, $processor) = @_;

    my $filename = &util::filename_cat($base_dir, $file);
    return 0 if $filename =~ /$self->{'block_exp'}/;
    if ($filename !~ /$self->{'process_exp'}/ || !-f $filename) {
        return undef;
    }
    $file =~ s/^[\/\\]+//;

    $self->{'verbosity'} = $processor->{'verbosity'};
    print STDERR "HTMLPlug: processing $file\n"
        if $self->{'verbosity'} > 1;

    # create a new document
    my $doc_obj = new doc ($file, "indexed_doc");
    my $cursection = $doc_obj->get_top_section();

    # read in the HTML file
    open (FILE, $filename) || die "HTMLPlug::read - can't open $filename\n";
    undef $/;
    my $text = <FILE>;
    $/ = "\n";
    close FILE;
    if (!defined $text || $text !~ /\w/) {
        print STDERR "HTMLPlug: ERROR: $file contains no text\n" if $self->{'verbosity'};
        return 0;
    }

    $self->extra_metadata ($doc_obj, $cursection, $metadata);
    $self->extract_metadata (\$text, $metadata, $doc_obj, $cursection)
        unless $self->{'no_metadata'};

    # Store the URL for the page as metadata - this can be used for an
    # altavista-style search interface. The URL won't be valid
    # unless the file structure contains the domain name (i.e.
    # like when w3mir is used to download a website).
    my $web_url = "http://$file";
    $web_url =~ s/\\/\//g; # for windows
    $doc_obj->add_metadata($cursection, "URL", $web_url);

    # remove header and footer
    if (!$self->{'keep_head'}) {
        $text =~ s/^.*?<body[^>]*>//is;
        $text =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
    }

    # trap links
    if (!$self->{'nolinks'}) {

        # usemap="./#index" not handled correctly => change to "#index"
        $text =~ s/(<img[^>]*?usemap\s*=\s*\"?)([^\">\s]+)(\"?[^>]*>)/
            $self->replace_usemap_links($1, $2, $3)/isge;

        $text =~ s/(<(?:a|area|frame|link)\s+[^>]*?(?:href|src)\s*=\s*\"?)([^\">\s]+)(\"?[^>]*>)/
            $self->replace_href_links ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
    }

    # trap images
    $text =~ s/(<img[^>]*?src\s*=\s*\"?)([^\">\s]+)(\"?[^>]*>)/
        $self->replace_images ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;

    $doc_obj->add_text ($cursection, $text);

    # add an OID
    $doc_obj->set_OID();

    # process the document
    $processor->process($doc_obj);

    return 1; # processed the file
}

sub replace_images {
    my $self = shift (@_);
    my ($front, $link, $back, $base_dir,
        $file, $doc_obj, $section) = @_;

    $link =~ s/\n/ /g;

    my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
    return $front . $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section) . $back;
}

sub replace_href_links {
    my $self = shift (@_);
    my ($front, $link, $back, $base_dir, $file, $doc_obj, $section) = @_;

    # attempt to sort out targets - frames are not handled
    # well in this plugin and some cases will screw things
    # up - e.g. the _parent target (so we'll just remove
    # them all ;-)
    $front =~ s/(target=\"?)_top(\"?)/$1_gsdltop_$2/is;
    $back =~ s/(target=\"?)_top(\"?)/$1_gsdltop_$2/is;
    $front =~ s/target=\"?_parent\"?//is;
    $back =~ s/target=\"?_parent\"?//is;

    return $front . $link . $back if $link =~ /^\#/s;
    $link =~ s/\n/ /g;

    my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);

    my ($filename) = $href =~ /^(?:.*?):(?:\/\/)?(.*)/;

    ##### leave all these links alone (they won't be picked up by intermediate
    ##### pages). I think that's safest when dealing with frames, targets etc.
    ##### (at least until I think of a better way to do it). Problems occur with
    ##### mailto links from within small frames: the intermediate page is displayed
    ##### within that frame and can't be seen. There is still potential for this to
    ##### happen even with html pages - the solution seems to be to somehow tell
    ##### the browser from the server side to display the page being sent (i.e.
    ##### the intermediate page) in the top-level window - I'm not sure if that's
    ##### possible - the following line should probably be deleted if that can be done.
    return $front . $link . $back if $href =~ /^(mailto|news|gopher|nntp|telnet|javascript):/is;

    if (($rl == 0) || ($filename =~ /$self->{'process_exp'}/) ||
        ($href =~ /\/$/) || ($href =~ /^(mailto|news|gopher|nntp|telnet|javascript):/i)) {
        &ghtml::urlsafe ($href);
        return $front . "_httpextlink_&rl=" . $rl . "&href=" . $href . $hash_part . $back;

    } else {
        # link is to some other type of file (image, pdf etc.) so we'll
        # need to associate that file
        return $front . $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section) . $back;
    }
}
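
# Illustrative note (behaviour inferred from the macro use above): links that
# stay as links are rewritten to a "_httpextlink_&rl=...&href=..." macro,
# which the Greenstone runtime expands back into a real link when the page
# is served; everything else is handed to add_file below for association.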

sub add_file {
    my $self = shift (@_);
    my ($href, $rl, $hash_part, $base_dir, $doc_obj, $section) = @_;
    my ($newname);

    my $filename = $href;
    $filename =~ s/^[^:]*:\/\///;
    $filename = &util::filename_cat ($base_dir, $filename);
    my ($ext) = $filename =~ /(\.[^\.]*)$/;

    if ((!defined $ext) || ($ext !~ /$self->{'assoc_files'}/)) {
        return "_httpextlink_&rl=" . $rl . "&href=" . $href . $hash_part;
    }

    if ($self->{'rename_assoc_files'}) {
        if (defined $self->{'aux_files'}->{$href}) {
            $newname = $self->{'aux_files'}->{$href}->{'dir_num'} . "/" .
                $self->{'aux_files'}->{$href}->{'file_num'} . $ext;
        } else {
            $newname = $self->{'dir_num'} . "/" . $self->{'file_num'} . $ext;
            $self->{'aux_files'}->{$href} = {'dir_num' => $self->{'dir_num'}, 'file_num' => $self->{'file_num'}};
            $self->inc_filecount ();
        }
        $doc_obj->associate_file($filename, $newname, undef, $section);
        return "_httpcollimg_/$newname";

    } else {
        ($newname) = $filename =~ /([^\/\\]*)$/;
        $doc_obj->associate_file($filename, $newname, undef, $section);
        return "_httpdocimg_/$newname";
    }
}
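
# Illustrative sketch (hypothetical filenames): with -rename_assoc_files set,
# the first associated image, say "images/logo.gif", is stored as "0/0.gif",
# the next as "0/1.gif", and so on; inc_filecount below starts a new numbered
# directory once the per-directory file counter wraps.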

sub format_link {
    my $self = shift (@_);
    my ($link, $base_dir, $file) = @_;

    my ($before_hash, $hash_part) = $link =~ /^([^\#]*)(\#?.*)$/;
    $hash_part = "" if !defined $hash_part;
    if (!defined $before_hash || $before_hash !~ /[\w\.\/]/) {
        print STDERR "HTMLPlug: ERROR - badly formatted tag ignored ($link)\n"
            if $self->{'verbosity'};
        return ($link, "", 0);
    }

    if ($before_hash =~ s/^((?:http|ftp|file):\/\/)//i) {
        my $type = $1;

        if ($link =~ /^(http|ftp):/i) {
            # Turn the url (using /) into a file name (possibly using \ on windows)
            my @http_dir_split = split('/', $before_hash);
            $before_hash = &util::filename_cat(@http_dir_split);
        }

        $before_hash = $self->eval_dir_dots($before_hash);

        my $linkfilename = &util::filename_cat ($base_dir, $before_hash);

        my $rl = 0;
        $rl = 1 if (-e $linkfilename);

        # make sure there's a slash on the end if it's a directory
        if ($before_hash !~ /\/$/) {
            $before_hash .= "/" if (-d $linkfilename);
        }

        return ($type . $before_hash, $hash_part, $rl);

    } elsif ($link !~ /^(mailto|news|gopher|nntp|telnet|javascript):/i) {

        if ($before_hash =~ s/^\///) {
            # the first directory will be the domain name if w3mir was used
            # to generate archives, otherwise we'll assume all files are
            # from the same site and base_dir is the root
            if ($self->{'w3mir'}) {
                my @dirs = split /[\/\\]/, $file;
                my $domname = shift (@dirs);
                $before_hash = &util::filename_cat($domname, $before_hash);
                $before_hash =~ s/\\/\//g; # for windows
            }

        } else {
            # Turn the relative file path into a full path
            my $dirname = &File::Basename::dirname($file);
            $before_hash = &util::filename_cat($dirname, $before_hash);
            $before_hash = $self->eval_dir_dots($before_hash);
        }

        # make sure there's a slash on the end if it's a directory
        my $linkfilename = &util::filename_cat ($base_dir, $before_hash);
        if ($before_hash !~ /\/$/) {
            $before_hash .= "/" if (-d $linkfilename);
        }

        return ("http://" . $before_hash, $hash_part, 1);

    } else {
        # mailto, news, nntp, telnet, javascript or gopher link
        return ($before_hash, "", 0);
    }
}
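
# Illustrative examples (hypothetical inputs): a relative link
# "foo/bar.html#sec2" inside "docs/index.html" comes back as
# ("http://docs/foo/bar.html", "#sec2", 1), while an absolute
# "http://elsewhere.org/x.pdf" with no local copy under $base_dir
# returns rl == 0 and is later treated as an external link.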

sub extract_metadata {
    my $self = shift (@_);
    my ($textref, $metadata, $doc_obj, $section) = @_;

    foreach my $field (split /,/, $self->{'metadata_fields'}) {

        # no need to extract the field if it was passed in from a previous
        # (recursive) plugin
        next if defined $metadata->{$field};

        # see if there's a <meta> tag for this field
        if ($$textref =~ /<meta(.*?)(?:name|http-equiv)\s*=\s*\"?$field\"?([^>]*)/is) {
            my $content = $1 . $2;
            if ($content =~ /content\s*=\s*\"?([^\"]*)\"?/is) {
                if (defined $1) {
                    my $value = $1;
                    $value =~ s/\s+/ /gs;
                    $doc_obj->add_metadata($section, $field, $value);
                    next;
                }
            }
        }

        # special case for Title metadata - try <title> tags,
        # then the first 100 characters of text
        if ($field =~ /^title$/i) {

            # see if there's a <title> tag
            if ($$textref =~ /<title[^>]*>([^<]*)<\/title[^>]*>/is) {
                if (defined $1) {
                    my $title = $1;
                    if ($title =~ /\w/) {
                        $title =~ s/\s+/ /gs;
                        $doc_obj->add_metadata ($section, $field, $title);
                        next;
                    }
                }
            }

            # if there's no title use the first 100 characters
            my $tmptext = $$textref;
            $tmptext =~ s/\s+/ /gs;
            $tmptext =~ s/<[^>]*>//g;
            my $title = substr ($tmptext, 0, 100);
            $doc_obj->add_metadata ($section, $field, $title);
        }

        # if the user requests the first 200 characters as metadata then extract them
        if ($field =~ /^first200$/i) {
            my $tmptext = $$textref;
            $tmptext =~ s/\s+/ /gs;
            $tmptext =~ s/.*<body[^>]*>//i;
            $tmptext =~ s/<[^>]*>//g;
            $tmptext = substr ($tmptext, 0, 200);
            $tmptext =~ s/\s\S*$/.../;
            $doc_obj->add_metadata ($section, $field, $tmptext);
        }
    }
}
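
# Illustrative sketch (hypothetical input): given a page containing
#   <meta name="Title" content="Annual Report 1999">
# with -metadata_fields Title, the loop above adds "Annual Report 1999"
# as Title metadata for the section.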

# evaluate any "../" to the next directory up
# evaluate any "./" as here
sub eval_dir_dots {
    my $self = shift (@_);
    my ($filename) = @_;

    my $dirsep_os = &util::get_os_dirsep();
    my @dirsep = split(/$dirsep_os/,$filename);

    my @eval_dirs = ();
    foreach my $d (@dirsep) {
        if ($d eq "..") {
            pop(@eval_dirs);

        } elsif ($d eq ".") {
            # do nothing!

        } else {
            push(@eval_dirs,$d);
        }
    }

    return &util::filename_cat(@eval_dirs);
}
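
# For example (illustrative input): "a/b/../c/./d.html" evaluates to "a/c/d.html".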

sub replace_usemap_links {
    my $self = shift (@_);
    my ($front, $link, $back) = @_;

    $link =~ s/^\.\///;
    return $front . $link . $back;
}

sub inc_filecount {
    my $self = shift (@_);

    if ($self->{'file_num'} == 1000) {
        $self->{'dir_num'} ++;
        $self->{'file_num'} = 0;
    } else {
        $self->{'file_num'} ++;
    }
}

1;