source: trunk/gsdl/perllib/plugins/HTMLPlug.pm@1206

Last change on this file since 1206 was 1190, checked in by gwp, 24 years ago

The first 200 chars of body text can now be extracted as metadata
by adding 'first200' to the -metadata_fields argument. A potential
problem extracting the title was resolved.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 15.3 KB
###########################################################################
#
# HTMLPlug.pm -- basic html plugin
#
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the
# University of Waikato, New Zealand.
#
# Copyright (C) 1999 New Zealand Digital Library Project
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################

#
# Note that this plugin handles frames only in a very simple way,
# i.e. each frame is treated as a separate document. This means
# search results will contain links to individual frames rather
# than linking to the top-level frameset.
# There may also be some problems caused by the _parent target
# (it is removed by this plugin).
# To use frames properly you'll need to use the WebPlug plugin.
#

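# A hedged configuration sketch (option values are illustrative, not taken
# from this file): in a collection's collect.cfg this plugin might be
# enabled with a line such as
#
#   plugin HTMLPlug -metadata_fields Title,first200
#
# which processes .htm/.html files and also stores the first 200 characters
# of each document's body as 'first200' metadata.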

package HTMLPlug;

use BasPlug;
use ghtml;
use util;
use parsargv;

sub BEGIN {
    @ISA = ('BasPlug');
}

use strict;

sub print_usage {
    print STDERR "\nIncorrect options passed to HTMLPlug, check your collect.cfg configuration file\n";

    print STDERR "\n usage: plugin HTMLPlug [options]\n\n";
    print STDERR " options:\n";
    print STDERR "   -process_exp         A perl regular expression to match against filenames.\n";
    print STDERR "                        Matching filenames will be processed by this plugin.\n";
    print STDERR "                        Defaults to '(?i)\.html?\$' i.e. all documents ending in\n";
    print STDERR "                        .htm or .html (case-insensitive).\n";
    print STDERR "   -nolinks             Don't make any attempt to trap links (setting this flag may\n";
    print STDERR "                        improve speed of building/importing but any relative links within\n";
    print STDERR "                        documents will be broken).\n";
    print STDERR "   -block_exp           Files matching this regular expression will be blocked from\n";
    print STDERR "                        being passed to any further plugins in the list. By default\n";
    print STDERR "                        HTMLPlug blocks any files with .gif, .jpg, .jpeg, .png, .pdf,\n";
    print STDERR "                        .rtf or .css file extensions.\n";
    print STDERR "   -keep_head           Don't remove headers from html files.\n";
    print STDERR "   -no_metadata         Don't attempt to extract any metadata from files.\n";
    print STDERR "   -metadata_fields     Comma-separated list of metadata fields to attempt to extract.\n";
    print STDERR "                        Defaults to 'Title'.\n";
    print STDERR "                        Use 'first200' to get the first 200 characters of the body.\n";
    print STDERR "   -w3mir               Set if w3mir was used to generate the input file structure.\n";
    print STDERR "   -assoc_files         Perl regular expression of file extensions to associate with\n";
    print STDERR "                        html documents. Defaults to '(?i)\.(jpe?g|gif|png|css|pdf)\$'\n";
    print STDERR "   -rename_assoc_files  Renames files associated with documents (e.g. images). Also\n";
    print STDERR "                        creates a much shallower directory structure (useful when creating\n";
    print STDERR "                        collections to go on cd-rom).\n\n";
}

sub new {
    my $class = shift (@_);
    my $self = new BasPlug ();

    if (!parsargv::parse(\@_,
                         q^process_exp/.*/(?i)\.html?$^, \$self->{'process_exp'},
                         q^nolinks^, \$self->{'nolinks'},
                         q^block_exp/.*/(?i)\.(gif|jpe?g|png|pdf|rtf|css)$^, \$self->{'block_exp'},
                         q^keep_head^, \$self->{'keep_head'},
                         q^no_metadata^, \$self->{'no_metadata'},
                         q^metadata_fields/.*/Title^, \$self->{'metadata_fields'},
                         q^w3mir^, \$self->{'w3mir'},
                         q^assoc_files/.*/(?i)\.(jpe?g|gif|png|css|pdf)$^, \$self->{'assoc_files'},
                         q^rename_assoc_files^, \$self->{'rename_assoc_files'})) {
        &print_usage();
        die "\n";
    }

    $self->{'aux_files'} = {};
    $self->{'dir_num'} = 0;
    $self->{'file_num'} = 0;

    return bless $self, $class;
}

sub is_recursive {
    my $self = shift (@_);

    return 0; # this is not a recursive plugin
}

# return number of files processed, undef if can't process
# Note that $base_dir might be "" and that $file might
# include directories
sub read {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $metadata, $processor) = @_;

    my $filename = &util::filename_cat($base_dir, $file);
    return 0 if $filename =~ /$self->{'block_exp'}/;
    if ($filename !~ /$self->{'process_exp'}/ || !-f $filename) {
        return undef;
    }
    $file =~ s/^[\/\\]+//;

    $self->{'verbosity'} = $processor->{'verbosity'};
    print STDERR "HTMLPlug: processing $file\n"
        if $self->{'verbosity'} > 1;

    # create a new document
    my $doc_obj = new doc ($file, "indexed_doc");
    my $cursection = $doc_obj->get_top_section();

    # read in the HTML file
    open (FILE, $filename) || die "HTMLPlug::read - can't open $filename\n";
    undef $/;
    my $text = <FILE>;
    $/ = "\n";
    close FILE;
    if (!defined $text || $text !~ /\w/) {
        print STDERR "HTMLPlug: ERROR: $file contains no text\n" if $self->{'verbosity'};
        return 0;
    }

    $self->extra_metadata ($doc_obj, $cursection, $metadata);
    $self->extract_metadata (\$text, $metadata, $doc_obj, $cursection)
        unless $self->{'no_metadata'};

    # Store the URL for the page as metadata - this can be used for an
    # altavista-style search interface. The URL won't be valid
    # unless the file structure contains the domain name (i.e.
    # like when w3mir is used to download a website).
    my $web_url = "http://$file";
    $web_url =~ s/\\/\//g; # for windows
    $doc_obj->add_metadata($cursection, "URL", $web_url);

    # remove header and footer
    if (!$self->{'keep_head'}) {
        $text =~ s/^.*?<body[^>]*>//is;
        $text =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
    }

    # trap links
    if (!$self->{'nolinks'}) {

        # usemap="./#index" not handled correctly => change to "#index"
        $text =~ s/(<img[^>]*?usemap\s*=\s*\"?)([^\">\s]+)(\"?[^>]*>)/
            $self->replace_usemap_links($1, $2, $3)/isge;

        $text =~ s/(<(?:a|area|frame|link)\s+[^>]*?(?:href|src)\s*=\s*\"?)([^\">\s]+)(\"?[^>]*>)/
            $self->replace_href_links ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
    }

    # trap images
    $text =~ s/(<img[^>]*?src\s*=\s*\"?)([^\">\s]+)(\"?[^>]*>)/
        $self->replace_images ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;

    $doc_obj->add_text ($cursection, $text);

    # add an OID
    $doc_obj->set_OID();

    # process the document
    $processor->process($doc_obj);

    return 1; # processed the file
}

sub replace_images {
    my $self = shift (@_);
    my ($front, $link, $back, $base_dir,
        $file, $doc_obj, $section) = @_;

    $link =~ s/\n/ /g;

    my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
    return $front . $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section) . $back;
}

sub replace_href_links {
    my $self = shift (@_);
    my ($front, $link, $back, $base_dir, $file, $doc_obj, $section) = @_;

    # attempt to sort out targets - frames are not handled
    # well in this plugin and some cases will screw things
    # up - e.g. the _parent target (so we'll just remove
    # them all ;-)
    $front =~ s/(target=\"?)_top(\"?)/$1_gsdltop_$2/is;
    $back =~ s/(target=\"?)_top(\"?)/$1_gsdltop_$2/is;
    $front =~ s/target=\"?_parent\"?//is;
    $back =~ s/target=\"?_parent\"?//is;

    return $front . $link . $back if $link =~ /^\#/s;
    $link =~ s/\n/ /g;

    my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);

    my ($filename) = $href =~ /^(?:.*?):(?:\/\/)?(.*)/;

    ##### leave all these links alone (they won't be picked up by intermediate
    ##### pages). I think that's safest when dealing with frames, targets etc.
    ##### (at least until I think of a better way to do it). Problems occur with
    ##### mailto links from within small frames: the intermediate page is displayed
    ##### within that frame and can't be seen. There is still potential for this to
    ##### happen even with html pages - the solution seems to be to somehow tell
    ##### the browser from the server side to display the page being sent (i.e.
    ##### the intermediate page) in the top-level window - I'm not sure if that's
    ##### possible - the following line should probably be deleted if that can be done.
    return $front . $link . $back if $href =~ /^(mailto|news|gopher|nntp|telnet|javascript):/is;

    if (($rl == 0) || ($filename =~ /$self->{'process_exp'}/) ||
        ($href =~ /\/$/) || ($href =~ /^(mailto|news|gopher|nntp|telnet|javascript):/i)) {
        &ghtml::urlsafe ($href);
        return $front . "_httpextlink_&rl=" . $rl . "&href=" . $href . $hash_part . $back;

    } else {
        # link is to some other type of file (image, pdf etc.) so we'll
        # need to associate that file
        return $front . $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section) . $back;
    }
}
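
# Illustrative note (behaviour inferred from the macro use above): links that
# stay as links are rewritten to a "_httpextlink_&rl=...&href=..." macro,
# which the Greenstone runtime expands back into a real link when the page
# is served; everything else is handed to add_file below for association.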

sub add_file {
    my $self = shift (@_);
    my ($href, $rl, $hash_part, $base_dir, $doc_obj, $section) = @_;
    my ($newname);

    my $filename = $href;
    $filename =~ s/^[^:]*:\/\///;
    $filename = &util::filename_cat ($base_dir, $filename);
    my ($ext) = $filename =~ /(\.[^\.]*)$/;

    if ((!defined $ext) || ($ext !~ /$self->{'assoc_files'}/)) {
        return "_httpextlink_&rl=" . $rl . "&href=" . $href . $hash_part;
    }

    if ($self->{'rename_assoc_files'}) {
        if (defined $self->{'aux_files'}->{$href}) {
            $newname = $self->{'aux_files'}->{$href}->{'dir_num'} . "/" .
                $self->{'aux_files'}->{$href}->{'file_num'} . $ext;
        } else {
            $newname = $self->{'dir_num'} . "/" . $self->{'file_num'} . $ext;
            $self->{'aux_files'}->{$href} = {'dir_num' => $self->{'dir_num'}, 'file_num' => $self->{'file_num'}};
            $self->inc_filecount ();
        }
        $doc_obj->associate_file($filename, $newname, undef, $section);
        return "_httpcollimg_/$newname";

    } else {
        ($newname) = $filename =~ /([^\/\\]*)$/;
        $doc_obj->associate_file($filename, $newname, undef, $section);
        return "_httpdocimg_/$newname";
    }
}
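
# Illustrative sketch (hypothetical filenames): with -rename_assoc_files set,
# the first associated image, say "images/logo.gif", is stored as "0/0.gif",
# the next as "0/1.gif", and so on; inc_filecount below starts a new numbered
# directory once the per-directory file counter wraps.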

sub format_link {
    my $self = shift (@_);
    my ($link, $base_dir, $file) = @_;

    my ($before_hash, $hash_part) = $link =~ /^([^\#]*)(\#?.*)$/;
    $hash_part = "" if !defined $hash_part;
    if (!defined $before_hash || $before_hash !~ /[\w\.\/]/) {
        print STDERR "HTMLPlug: ERROR - badly formatted tag ignored ($link)\n"
            if $self->{'verbosity'};
        return ($link, "", 0);
    }

    if ($before_hash =~ s/^((?:http|ftp|file):\/\/)//i) {
        my $type = $1;

        if ($link =~ /^(http|ftp):/i) {
            # Turn the url (using /) into a file name (possibly using \ on windows)
            my @http_dir_split = split('/', $before_hash);
            $before_hash = &util::filename_cat(@http_dir_split);
        }

        $before_hash = $self->eval_dir_dots($before_hash);

        my $linkfilename = &util::filename_cat ($base_dir, $before_hash);

        my $rl = 0;
        $rl = 1 if (-e $linkfilename);

        # make sure there's a slash on the end if it's a directory
        if ($before_hash !~ /\/$/) {
            $before_hash .= "/" if (-d $linkfilename);
        }

        return ($type . $before_hash, $hash_part, $rl);

    } elsif ($link !~ /^(mailto|news|gopher|nntp|telnet|javascript):/i) {

        if ($before_hash =~ s/^\///) {
            # the first directory will be the domain name if w3mir was used
            # to generate archives, otherwise we'll assume all files are
            # from the same site and base_dir is the root
            if ($self->{'w3mir'}) {
                my @dirs = split /[\/\\]/, $file;
                my $domname = shift (@dirs);
                $before_hash = &util::filename_cat($domname, $before_hash);
                $before_hash =~ s/\\/\//g; # for windows
            }

        } else {
            # Turn the relative file path into a full path
            my $dirname = &File::Basename::dirname($file);
            $before_hash = &util::filename_cat($dirname, $before_hash);
            $before_hash = $self->eval_dir_dots($before_hash);
        }

        # make sure there's a slash on the end if it's a directory
        my $linkfilename = &util::filename_cat ($base_dir, $before_hash);
        if ($before_hash !~ /\/$/) {
            $before_hash .= "/" if (-d $linkfilename);
        }

        return ("http://" . $before_hash, $hash_part, 1);

    } else {
        # mailto, news, nntp, telnet, javascript or gopher link
        return ($before_hash, "", 0);
    }
}
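
# Illustrative examples (hypothetical inputs): a relative link
# "foo/bar.html#sec2" inside "docs/index.html" comes back as
# ("http://docs/foo/bar.html", "#sec2", 1), while an absolute
# "http://elsewhere.org/x.pdf" with no local copy under $base_dir
# returns rl == 0 and is later treated as an external link.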

sub extract_metadata {
    my $self = shift (@_);
    my ($textref, $metadata, $doc_obj, $section) = @_;

    foreach my $field (split /,/, $self->{'metadata_fields'}) {

        # no need to extract the field if it was passed in from a previous
        # (recursive) plugin
        next if defined $metadata->{$field};

        # see if there's a <meta> tag for this field
        if ($$textref =~ /<meta(.*?)(?:name|http-equiv)\s*=\s*\"?$field\"?([^>]*)/is) {
            my $content = $1 . $2;
            if ($content =~ /content\s*=\s*\"?([^\"]*)\"?/is) {
                if (defined $1) {
                    my $value = $1;
                    $value =~ s/\s+/ /gs;
                    $doc_obj->add_metadata($section, $field, $value);
                    next;
                }
            }
        }

        # special case for Title metadata - try <title> tags,
        # then the first 100 characters of text
        if ($field =~ /^title$/i) {

            # see if there's a <title> tag
            if ($$textref =~ /<title[^>]*>([^<]*)<\/title[^>]*>/is) {
                if (defined $1) {
                    my $title = $1;
                    if ($title =~ /\w/) {
                        $title =~ s/\s+/ /gs;
                        $doc_obj->add_metadata ($section, $field, $title);
                        next;
                    }
                }
            }

            # if there's no title use the first 100 characters
            my $tmptext = $$textref;
            $tmptext =~ s/\s+/ /gs;
            $tmptext =~ s/<[^>]*>//g;
            my $title = substr ($tmptext, 0, 100);
            $doc_obj->add_metadata ($section, $field, $title);
        }

        # if the user requests the first 200 characters as metadata then extract them
        if ($field =~ /^first200$/i) {
            my $tmptext = $$textref;
            $tmptext =~ s/\s+/ /gs;
            $tmptext =~ s/.*<body[^>]*>//i;
            $tmptext =~ s/<[^>]*>//g;
            $tmptext = substr ($tmptext, 0, 200);
            $tmptext =~ s/\s\S*$/.../;
            $doc_obj->add_metadata ($section, $field, $tmptext);
        }
    }
}
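
# Illustrative sketch (hypothetical input): given a page containing
#   <meta name="Title" content="Annual Report 1999">
# with -metadata_fields Title, the loop above adds "Annual Report 1999"
# as Title metadata for the section.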

# evaluate any "../" to the next directory up
# evaluate any "./" as here
sub eval_dir_dots {
    my $self = shift (@_);
    my ($filename) = @_;

    my $dirsep_os = &util::get_os_dirsep();
    my @dirsep = split(/$dirsep_os/,$filename);

    my @eval_dirs = ();
    foreach my $d (@dirsep) {
        if ($d eq "..") {
            pop(@eval_dirs);

        } elsif ($d eq ".") {
            # do nothing!

        } else {
            push(@eval_dirs,$d);
        }
    }

    return &util::filename_cat(@eval_dirs);
}
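
# For example (illustrative input): "a/b/../c/./d.html" evaluates to "a/c/d.html".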

sub replace_usemap_links {
    my $self = shift (@_);
    my ($front, $link, $back) = @_;

    $link =~ s/^\.\///;
    return $front . $link . $back;
}

sub inc_filecount {
    my $self = shift (@_);

    if ($self->{'file_num'} == 1000) {
        $self->{'dir_num'} ++;
        $self->{'file_num'} = 0;
    } else {
        $self->{'file_num'} ++;
    }
}

1;