Context Navigation

source: trunk/gsdl/perllib/plugins/HTMLPlug.pm@ 1403

Last change on this file since 1403 was 1403, checked in by say1, 24 years ago
taught HTMLPlug about shtml, asp, cgi, php and html query files ...
Property svn:executable set to `*`; property svn:keywords set to `Author Date Id Revision`
File size: 14.5 KB

Line
1	###########################################################################
2	#
3	# HTMLPlug.pm -- basic html plugin
4	#
5	# A component of the Greenstone digital library software
6	# from the New Zealand Digital Library Project at the
7	# University of Waikato, New Zealand.
8	#
9	# Copyright (C) 1999 New Zealand Digital Library Project
10	#
11	# This program is free software; you can redistribute it and/or modify
12	# it under the terms of the GNU General Public License as published by
13	# the Free Software Foundation; either version 2 of the License, or
14	# (at your option) any later version.
15	#
16	# This program is distributed in the hope that it will be useful,
17	# but WITHOUT ANY WARRANTY; without even the implied warranty of
18	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	# GNU General Public License for more details.
20	#
21	# You should have received a copy of the GNU General Public License
22	# along with this program; if not, write to the Free Software
23	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24	#
25	###########################################################################
26
27	#
28	# Note that this plugin handles frames only in a very simple way
29	# i.e. each frame is treated as a separate document. This means
30	# search results will contain links to individual frames rather
31	# than linking to the top level frameset.
32	# There may also be some problems caused by the _parent target
33	# (it's removed by this plugin)
34	# To use frames properly you'll need to use the WebPlug plugin.
35	#
36
37
38	package HTMLPlug;
39
40	use BasPlug;
41	use ghtml;
42	use util;
43	use parsargv;
44
# Establish inheritance at compile time: this package derives from BasPlug.
# (Runs before `use strict' takes effect further down the file, so the
# package variable @ISA is assigned unqualified.)
BEGIN {
    @ISA = qw(BasPlug);
}
48
49	use strict;
50
# Print the option summary for this plugin to STDERR.
# Takes no arguments; the return value is not meaningful.
sub print_usage {
    print STDERR "\n usage: plugin HTMLPlug [options]\n\n";
    print STDERR " options:\n";
    print STDERR " -nolinks Don't make any attempt to trap links (setting this flag may\n";
    print STDERR " improve speed of building/importing but any relative links within\n";
    print STDERR " documents will be broken).\n";
    print STDERR " -keep_head Don't remove headers from html files.\n";
    print STDERR " -no_metadata Don't attempt to extract any metadata from files.\n";
    print STDERR " -metadata_fields Comma separated list of metadata fields to attempt to extract.\n";
    print STDERR " Defaults to 'Title'.\n";
    print STDERR " Use `first200` to get the first 200 characters of the body.\n";
    print STDERR " Use `H1` to get the text inside the first <H1> and </H1> tags in the text.\n";
    print STDERR " -w3mir Set if w3mir was used to generate input file structure.\n";
    print STDERR " -assoc_files Perl regular expression of file extensions to associate with\n";
    # The backslash is doubled so the printed default really reads "\." --
    # inside a double-quoted string a lone "\." collapses to ".", which made
    # the help text disagree with the default used in new().
    print STDERR " html documents. Defaults to '(?i)\\.(jpe?g|gif|png|css|pdf)\$'\n";
    print STDERR " -rename_assoc_files Renames files associated with documents (e.g. images). Also\n";
    print STDERR " creates much shallower directory structure (useful when creating\n";
    print STDERR " collections to go on cd-rom).\n\n";
}
70
# Constructor.
#
# Builds the underlying BasPlug object, then parses the plugin options left
# in @_ into fields on $self.  Dies (after printing the usage text) when
# parsargv::parse reports failure.
#
# Fields initialised here:
#   nolinks, keep_head, no_metadata, w3mir, rename_assoc_files -- flags
#   metadata_fields -- comma separated list, default 'Title'
#   assoc_files     -- regex of extensions to associate with documents
#   aux_files       -- href => {dir_num, file_num} map used by add_file
#   dir_num/file_num -- counters used when renaming associated files
sub new {
    my $class = shift (@_);
    my $self = BasPlug->new("HTMLPlug", @_);

    # Each spec is "name[/validation-regex/default]" followed by the place
    # to store the value (format presumed from usage here -- see parsargv).
    # NOTE: q^...^ does not process "\|", so alternation must be written as
    # a bare "|" or the default would only match a literal pipe character.
    if (!parsargv::parse(\@_,
                         q^nolinks^, \$self->{'nolinks'},
                         q^keep_head^, \$self->{'keep_head'},
                         q^no_metadata^, \$self->{'no_metadata'},
                         q^metadata_fields/.*/Title^, \$self->{'metadata_fields'},
                         q^w3mir^, \$self->{'w3mir'},
                         q^assoc_files/.*/(?i)\.(jpe?g|gif|png|css|pdf)$^, \$self->{'assoc_files'},
                         q^rename_assoc_files^, \$self->{'rename_assoc_files'},
                         "allow_extra_options")) {

        print STDERR "\nIncorrect options passed to HTMLPlug, check your collect.cfg configuration file\n";
        &print_usage();
        die "\n";
    }

    $self->{'aux_files'} = {};
    $self->{'dir_num'} = 0;
    $self->{'file_num'} = 0;

    return bless $self, $class;
}
96
97
# Return the default "block" expression for this plugin: a case-insensitive
# regex (as a string) matching the image/pdf/rtf/css file extensions.
# How the caller applies it is decided outside this file.
sub get_default_block_exp {
    my $self = shift (@_);

    # Alternation uses a bare "|": q^...^ leaves "\|" untouched, which the
    # regex engine would treat as a literal pipe character.
    return q^(?i)\.(gif|jpe?g|png|pdf|rtf|css)$^;
}
103
# Return the default "process" expression for this plugin: a
# case-insensitive regex (as a string) matching html-like file names
# (.htm/.html, .shtml, .shm, .asp, .php, .cgi) and query-style names.
sub get_default_process_exp {
    my $self = shift (@_);

    # the last option is an attempt to encode the concept of an html query ...
    # (something?key=value).  Alternation must be a bare "|" inside q^...^.
    return q^(?i)(\.html?|\.shtml|\.shm|\.asp|\.php|\.cgi|.+\?.+=.*)$^;
}
110
111
# do plugin specific processing of doc_obj
#
# Arguments:
#   $textref  - reference to the html text; modified in place
#   $pluginfo - not used in this routine
#   $base_dir - base directory, passed on to the link/image handlers
#   $file     - file name of the document (also used to build the URL)
#   $metadata - hash ref of metadata already known for this file
#   $doc_obj  - document object receiving metadata, text and files
#
# Steps: extract metadata (unless -no_metadata), store a "URL" metadata
# value, strip everything up to <body> plus the closing body/html tags
# (unless -keep_head), rewrite links (unless -nolinks), rewrite image
# sources, and finally add the resulting text to the top section.
#
# Always returns 1.
sub process {
    my $self = shift (@_);
    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;

    print STDERR "HTMLPlug: processing $file\n"
        if $self->{'verbosity'} > 1;

    my $cursection = $doc_obj->get_top_section();

    $self->extract_metadata ($textref, $metadata, $doc_obj, $cursection)
        unless $self->{'no_metadata'};

    # Store URL for page as metadata - this can be used for an
    # altavista style search interface. The URL won't be valid
    # unless the file structure contains the domain name (i.e.
    # like when w3mir is used to download a website).
    my $web_url = "http://$file";
    $web_url =~ s/\\/\//g; # for windows
    $doc_obj->add_utf8_metadata($cursection, "URL", $web_url);

    # remove header and footer
    if (!$self->{'keep_head'}) {
        # everything up to and including the first <body ...> tag
        $$textref =~ s/^.*?<body[^>]*>//is;
        # every closing </body> and </html> tag
        $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
    }

    # trap links
    if (!$self->{'nolinks'}) {

        # usemap="./#index" not handled correctly => change to "#index"
        # $1 = tag up to the attribute value, $2 = value, $3 = rest of tag
        $$textref =~ s/(<img[^>]*?usemap\s*=\s*\"?)([^\">\s]+)(\"?[^>]*>)/
            $self->replace_usemap_links($1, $2, $3)/isge;

        # href/src attributes of <a>, <area>, <frame> and <link> tags
        $$textref =~ s/(<(?:a|area|frame|link)\s+[^>]*?\s*(?:href|src)\s*=\s*\"?)([^\">\s]+)(\"?[^>]*>)/
            $self->replace_href_links ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
    }

    # trap images
    $$textref =~ s/(<img[^>]*? src\s*=\s*\"?)([^\">\s]+)(\"?[^>]*>)/
        $self->replace_images ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;

    # add text to document object
    $doc_obj->add_utf8_text($cursection, $$textref);

    return 1;
}
159
# Rewrite the source of one <img> tag.
#
# $front / $back are the parts of the tag before and after the attribute
# value, $link is the value itself.  The link is normalised by format_link
# and handed to add_file; whatever add_file returns becomes the new value.
# Returns the reassembled tag text.
sub replace_images {
    my ($self, $front, $link, $back, $base_dir, $file, $doc_obj, $section) = @_;

    # a link broken over several lines is treated as space separated
    $link =~ s/\n/ /g;

    my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
    my $new_src = $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section);

    return $front . $new_src . $back;
}
170
# Rewrite one href/src link found by process().
#
# $front / $back are the parts of the tag before and after the attribute
# value, $link is the value itself; the remaining arguments are passed on
# to format_link / add_file.  Returns the reassembled tag text:
#   - links starting with '#' and mailto/news/gopher/nntp/telnet/javascript
#     links keep their original value;
#   - links with $rl == 0, links whose file part matches process_exp and
#     links ending in '/' become "_httpextlink_&rl=..&href=.." links;
#   - anything else is treated as a file to associate (see add_file).
sub replace_href_links {
    my $self = shift (@_);
    my ($front, $link, $back, $base_dir, $file, $doc_obj, $section) = @_;

    # attempt to sort out targets - frames are not handled
    # well in this plugin and some cases will screw things
    # up - e.g. the _parent target (so we'll just remove
    # them all ;-)
    $front =~ s/(target=\"?)_top(\"?)/${1}_gsdltop_${2}/is;
    $back =~ s/(target=\"?)_top(\"?)/${1}_gsdltop_${2}/is;
    $front =~ s/target=\"?_parent\"?//is;
    $back =~ s/target=\"?_parent\"?//is;

    # links within the same page are left untouched
    return $front . $link . $back if $link =~ /^\#/s;
    $link =~ s/\n/ /g;

    my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
    # href may use '\'s where '/'s should be on Windows
    $href =~ s/\\/\//g;

    # everything after the "scheme:" (and optional "//") prefix
    my ($filename) = $href =~ /^(?:.*?):(?:\/\/)?(.*)/;

    ##### leave all these links alone (they won't be picked up by intermediate
    ##### pages). I think that's safest when dealing with frames, targets etc.
    ##### (at least until I think of a better way to do it). Problems occur with
    ##### mailto links from within small frames, the intermediate page is displayed
    ##### within that frame and can't be seen. There is still potential for this to
    ##### happen even with html pages - the solution seems to be to somehow tell
    ##### the browser from the server side to display the page being sent (i.e.
    ##### the intermediate page) in the top level window - I'm not sure if that's
    ##### possible - the following line should probably be deleted if that can be done
    return $front . $link . $back if $href =~ /^(mailto|news|gopher|nntp|telnet|javascript):/is;

    # The scheme test below can only fire if the early return above is ever
    # removed; it is kept so that removal stays safe.  $filename is guarded
    # because the capture is undefined when $href has no "scheme:" prefix.
    if (($rl == 0) || (defined $filename && $filename =~ /$self->{'process_exp'}/) ||
        ($href =~ /\/$/) || ($href =~ /^(mailto|news|gopher|nntp|telnet|javascript):/i)) {
        &ghtml::urlsafe ($href);
        return $front . "_httpextlink_&rl=" . $rl . "&href=" . $href . $hash_part . $back;

    } else {
        # link is to some other type of file (image, pdf etc.) so we'll
        # need to associate that file
        return $front . $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section) . $back;
    }
}
216
# Associate the file behind $href with the document and return the text to
# use as the new link value.
#
#   $href      - link as produced by format_link ("scheme://path")
#   $rl        - value echoed into the "_httpextlink_" form (set by format_link)
#   $hash_part - "#..." suffix of the original link, or ""
#   $base_dir  - directory the path is resolved against
#   $doc_obj   - document object; associate_file is called on it
#   $section   - section the file is associated with
#
# Returns "_httpextlink_&rl=..&href=.." when the extension is missing or
# does not match the assoc_files option; otherwise "_httpcollimg_/<d>/<n><ext>"
# (with -rename_assoc_files) or "_httpdocimg_/<basename>".
sub add_file {
    my $self = shift (@_);
    my ($href, $rl, $hash_part, $base_dir, $doc_obj, $section) = @_;
    my ($newname);

    # strip the "scheme://" prefix and resolve against the base directory
    my $filename = $href;
    $filename =~ s/^[^:]*:\/\///;
    $filename = &util::filename_cat ($base_dir, $filename);
    my ($ext) = $filename =~ /(\.[^\.]*)$/;

    if ((!defined $ext) || ($ext !~ /$self->{'assoc_files'}/)) {
        return "_httpextlink_&rl=" . $rl . "&href=" . $href . $hash_part;
    }

    if ($self->{'rename_assoc_files'}) {
        if (defined $self->{'aux_files'}->{$href}) {
            # seen before: reuse the numbers handed out the first time
            $newname = $self->{'aux_files'}->{$href}->{'dir_num'} . "/" .
                $self->{'aux_files'}->{$href}->{'file_num'} . $ext;
        } else {
            $newname = $self->{'dir_num'} . "/" . $self->{'file_num'} . $ext;
            $self->{'aux_files'}->{$href} = {'dir_num' => $self->{'dir_num'}, 'file_num' => $self->{'file_num'}};
            $self->inc_filecount ();
        }
        $doc_obj->associate_file($filename, $newname, undef, $section);
        return "_httpcollimg_/$newname";

    } else {
        # keep the original base name (text after the last / or \)
        ($newname) = $filename =~ /([^\/\\]*)$/;
        $doc_obj->associate_file($filename, $newname, undef, $section);
        return "_httpdocimg_/$newname";
    }
}
249
250
# Normalise a link found in a document.
#
#   $link     - raw attribute value
#   $base_dir - directory used for the existence / directory checks
#   $file     - name of the file containing the link
#
# Returns the list ($href, $hash_part, $rl):
#   - badly formed link:            ($link, "", 0), with a message on
#                                   STDERR when verbosity is set
#   - http://, ftp://, file:// link: (scheme . path, "#...", $rl) where $rl
#                                   is 1 if the path exists under $base_dir
#   - mailto/news/gopher/nntp/telnet/javascript: ($before_hash, "", 0)
#   - any other link:               ("http://" . path, "#...", 1)
# A trailing "/" is appended when the path is a directory under $base_dir.
sub format_link {
    my $self = shift (@_);
    my ($link, $base_dir, $file) = @_;

    # split into the part before the first '#' and the '#...' remainder
    my ($before_hash, $hash_part) = $link =~ /^([^\#]*)(\#?.*)$/;
    $hash_part = "" if !defined $hash_part;
    if (!defined $before_hash || $before_hash !~ /[\w\.\/]/) {
        print STDERR "HTMLPlug: ERROR - badly formatted tag ignored ($link)\n"
            if $self->{'verbosity'};
        return ($link, "", 0);
    }

    if ($before_hash =~ s/^((?:http|ftp|file):\/\/)//i) {
        my $type = $1;

        if ($link =~ /^(http|ftp):/i) {
            # Turn url (using /) into file name (possibly using \ on windows)
            my @http_dir_split = split('/', $before_hash);
            $before_hash = &util::filename_cat(@http_dir_split);
        }

        $before_hash = $self->eval_dir_dots($before_hash);

        my $linkfilename = &util::filename_cat ($base_dir, $before_hash);

        my $rl = 0;
        $rl = 1 if (-e $linkfilename);

        # make sure there's a slash on the end if it's a directory
        if ($before_hash !~ /\/$/) {
            $before_hash .= "/" if (-d $linkfilename);
        }

        return ($type . $before_hash, $hash_part, $rl);

    } elsif ($link !~ /^(mailto|news|gopher|nntp|telnet|javascript):/i) {

        if ($before_hash =~ s/^\///) {
            # the first directory will be the domain name if w3mir was used
            # to generate archives, otherwise we'll assume all files are
            # from the same site and base_dir is the root
            if ($self->{'w3mir'}) {
                my @dirs = split /[\/\\]/, $file;
                my $domname = shift (@dirs);
                $before_hash = &util::filename_cat($domname, $before_hash);
                $before_hash =~ s/\\/\//g; # for windows
            }

        } else {
            # Turn relative file path into full path.
            # File::Basename is loaded here because nothing in this file
            # loads it; previously this relied on another module doing so.
            require File::Basename;
            my $dirname = &File::Basename::dirname($file);
            $before_hash = &util::filename_cat($dirname, $before_hash);
            $before_hash = $self->eval_dir_dots($before_hash);
        }

        # make sure there's a slash on the end if it's a directory
        my $linkfilename = &util::filename_cat ($base_dir, $before_hash);
        if ($before_hash !~ /\/$/) {
            $before_hash .= "/" if (-d $linkfilename);
        }

        return ("http://" . $before_hash, $hash_part, 1);

    } else {
        # mailto, news, nntp, telnet, javascript or gopher link
        return ($before_hash, "", 0);
    }
}
319
# Extract metadata from the html text and add it to the document.
#
#   $textref  - reference to the html text (not modified)
#   $metadata - hash ref of metadata already supplied for this file;
#               fields present in it are skipped
#   $doc_obj  - document object; add_utf8_metadata is called on it
#   $section  - section the metadata is attached to
#
# For every field in the comma separated metadata_fields option:
#   1. a <meta name=FIELD content=...> (or http-equiv) tag wins if it has a
#      non-blank content value;
#   2. "Title" falls back to the <title> tag, then to the first 100
#      characters of the text with tags removed;
#   3. "first200" is the first 200 characters of the body with tags removed;
#   4. "H1" is the text between the first <H1> and the following </H1>.
# Returns nothing useful.
sub extract_metadata {
    my $self = shift (@_);
    my ($textref, $metadata, $doc_obj, $section) = @_;

    return if (!defined $self->{'metadata_fields'});

    foreach my $field (split /,/, $self->{'metadata_fields'}) {

        # don't need to extract field if it was passed in from a previous
        # (recursive) plugin
        next if defined $metadata->{$field};

        # see if there's a <meta> tag for this field.
        # [^>]*? keeps the match inside a single tag, and \Q..\E stops the
        # field name from being interpreted as a regular expression.
        if ($$textref =~ /<meta([^>]*?)(?:name|http-equiv)\s*=\s*\"?\Q$field\E\"?([^>]*)/is) {
            my $content = $1 . $2;
            # value may be double quoted, single quoted or a bare word; a
            # lazy (.*?) with nothing after it would always capture ""
            if ($content =~ /content\s*=\s*(?:\"([^\"]*)\"|\'([^\']*)\'|([^\s\"\'>]+))/is) {
                my ($value) = grep { defined $_ } ($1, $2, $3);
                if (defined $value && $value =~ /\S/) {
                    $value =~ s/\s+/ /gs;
                    $doc_obj->add_utf8_metadata($section, $field, $value);
                    next;
                }
            }
        }

        # TITLE: extract the document title

        if ($field =~ /^title$/i) {

            # see if there's a <title> tag
            if ($$textref =~ /<title[^>]*>([^<]*)<\/title[^>]*>/is) {
                if (defined $1) {
                    my $title = $1;
                    if ($title =~ /\w/) {
                        $title =~ s/\s+/ /gs;
                        $title =~ s/^\s+//;
                        $title =~ s/\s+$//;
                        $doc_obj->add_utf8_metadata ($section, $field, $title);
                        next;
                    }
                }
            }

            # if no title use first 100 characters
            my $tmptext = $$textref;
            $tmptext =~ s/\s+/ /gs;
            $tmptext =~ s/<[^>]*>//g;
            $tmptext = substr ($tmptext, 0, 100);
            $tmptext =~ s/^\s+//;
            $tmptext =~ s/\s+$//;
            # drop the (possibly cut off) last word and mark the truncation
            $tmptext =~ s/\s\S*$/.../;
            $doc_obj->add_utf8_metadata ($section, $field, $tmptext);
            next;
        }

        # FIRST200: extract the first 200 characters as metadata

        if ($field =~ /^first200$/i) {
            my $tmptext = $$textref;
            $tmptext =~ s/\s+/ /gs;
            $tmptext =~ s/.*?<body[^>]*>//i;
            $tmptext =~ s/<[^>]*>//g;
            $tmptext = substr ($tmptext, 0, 200);
            $tmptext =~ s/^\s+//;
            $tmptext =~ s/\s+$//;
            $tmptext =~ s/\s\S*$/.../;
            $doc_obj->add_utf8_metadata ($section, $field, $tmptext);
            next;
        }

        # H1: extract the text between the first <H1> and </H1> tags
        if ($field =~ /^H1$/i) {
            my $tmptext = $$textref;
            $tmptext =~ s/\s+/ /gs;
            if ($tmptext =~ /<H1[^>]*>/i) {
                # non-greedy, so the FIRST <H1> is used even when the
                # document contains several headings
                $tmptext =~ s/.*?<H1[^>]*>//i;
                $tmptext =~ s/<\/H1[^>]*>.*//i;
                $tmptext =~ s/^\s+//;
                $tmptext =~ s/\s+$//;
                $doc_obj->add_utf8_metadata ($section, $field, $tmptext);
            }
            next;
        }
    }
}
405
406
# Collapse relative path components in $filename:
#   ".." removes the previously kept component (moves one directory up)
#   "."  is dropped (stays in the current directory)
# The name is split on the separator reported by util::get_os_dirsep and
# the surviving components are rejoined with util::filename_cat.
sub eval_dir_dots {
    my ($self, $filename) = @_;

    my $dirsep_os = &util::get_os_dirsep();

    my @kept = ();
    foreach my $part (split(/$dirsep_os/, $filename)) {
        next if $part eq ".";

        if ($part eq "..") {
            pop(@kept);
            next;
        }

        push(@kept, $part);
    }

    return &util::filename_cat(@kept);
}
431
# Rebuild an <img ... usemap=...> tag with a leading "./" removed from the
# usemap value (so "./#index" becomes "#index").  $front and $back are the
# parts of the tag before and after the value; returns the rebuilt text.
sub replace_usemap_links {
    my ($self, $front, $link, $back) = @_;

    $link =~ s{^\./}{};

    return join('', $front, $link, $back);
}
439
# Advance the counters used to name renamed associated files: file_num is
# incremented until it has reached 1000, at which point dir_num is
# incremented and file_num starts again from 0.
sub inc_filecount {
    my ($self) = @_;

    unless ($self->{'file_num'} == 1000) {
        return $self->{'file_num'} ++;
    }

    $self->{'dir_num'} ++;
    return $self->{'file_num'} = 0;
}
450
451	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: