Context Navigation

source: trunk/gsdl/perllib/plugins/HTMLPlug.pm@ 1435

Last change on this file since 1435 was 1435, checked in by davidb, 24 years ago
Rearrangement of ConvertTo inheritence so HTMLPlug and TextPlug do not need to know anything about the conversion process.
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 15.3 KB

Line
1	###########################################################################
2	#
3	# HTMLPlug.pm -- basic html plugin
4	#
5	# A component of the Greenstone digital library software
6	# from the New Zealand Digital Library Project at the
7	# University of Waikato, New Zealand.
8	#
9	# Copyright (C) 1999 New Zealand Digital Library Project
10	#
11	# This program is free software; you can redistribute it and/or modify
12	# it under the terms of the GNU General Public License as published by
13	# the Free Software Foundation; either version 2 of the License, or
14	# (at your option) any later version.
15	#
16	# This program is distributed in the hope that it will be useful,
17	# but WITHOUT ANY WARRANTY; without even the implied warranty of
18	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	# GNU General Public License for more details.
20	#
21	# You should have received a copy of the GNU General Public License
22	# along with this program; if not, write to the Free Software
23	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24	#
25	###########################################################################
26
27	#
28	# Note that this plugin handles frames only in a very simple way
29	# i.e. each frame is treated as a separate document. This means
30	# search results will contain links to individual frames rather
31	# than linking to the top level frameset.
32	# There may also be some problems caused by the _parent target
33	# (it's removed by this plugin)
34	# To use frames properly you'll need to use the WebPlug plugin.
35	#
36
37
38	package HTMLPlug;
39
40	use BasPlug;
41	use ghtml;
42	use util;
43	use parsargv;
44
45	sub BEGIN {
46	@ISA = ('BasPlug');
47	}
48
49	sub print_usage {
50	print STDERR "\n usage: plugin HTMLPlug [options]\n\n";
51	print STDERR " options:\n";
52	print STDERR " -nolinks Don't make any attempt to trap links (setting this flag may\n";
53	print STDERR " improve speed of building/importing but any relative links within\n";
54	print STDERR " documents will be broken).\n";
55	print STDERR " -keep_head Don't remove headers from html files.\n";
56	print STDERR " -no_metadata Don't attempt to extract any metadata from files.\n";
57	print STDERR " -metadata_fields Comma separated list of metadata fields to attempt to extract.\n";
58	print STDERR " Defaults to 'Title'.\n";
59	print STDERR " Use `first200` to get the first 200 characters of the body.\n";
60	print STDERR " Use `H1` to get the text inside the first <H1> and </H1> tags in the text.\n";
61	print STDERR " -w3mir Set if w3mir was used to generate input file structure.\n";
62	print STDERR " -assoc_files Perl regular expression of file extensions to associate with\n";
63	print STDERR " html documents. Defaults to '(?i)\.(jpe?g\|gif\|png\|css\|pdf)\$'\n";
64	print STDERR " -rename_assoc_files Renames files associated with documents (e.g. images). Also\n";
65	print STDERR " creates much shallower directory structure (useful when creating\n";
66	print STDERR " collections to go on cd-rom).\n\n";
67	print STDERR " -title_sub Substitution expression to modify string stored as Title.\n";
68	print STDERR " Used by, for example, PDFHtml to remove Page 1 etc from text\n";
69	print STDERR " chosen to be used as the title.\n";
70	}
71
72	sub new {
73	my $class = shift (@_);
74	my $self = new BasPlug ($class, @_);
75
76	if (!parsargv::parse(\@_,
77	q^nolinks^, \$self->{'nolinks'},
78	q^keep_head^, \$self->{'keep_head'},
79	q^no_metadata^, \$self->{'no_metadata'},
80	q^metadata_fields/.*/Title^, \$self->{'metadata_fields'},
81	q^w3mir^, \$self->{'w3mir'},
82	q^assoc_files/.*/(?i)\.(jpe?g\|gif\|png\|css\|pdf)$^, \$self->{'assoc_files'},
83	q^rename_assoc_files^, \$self->{'rename_assoc_files'},
84	q^title_sub/.*/^, \$self->{'title_sub'},
85	"allow_extra_options")) {
86
87	print STDERR "\nIncorrect options passed to HTMLPlug, check your collect.cfg configuration file\n";
88	&print_usage();
89	die "\n";
90	}
91
92	$self->{'aux_files'} = {};
93	$self->{'dir_num'} = 0;
94	$self->{'file_num'} = 0;
95
96	return bless $self, $class;
97	}
98
99
100	sub get_default_block_exp {
101	my $self = shift (@_);
102
103	return q^(?i)\.(gif\|jpe?g\|png\|pdf\|rtf\|css)$^;
104	}
105
106	sub get_default_process_exp {
107	my $self = shift (@_);
108
109	# the last option is an attempt to encode the concept of an html query ...
110	return q^(?i)(\.html?\|\.shtml\|\.shm\|\.asp\|\.php\|\.cgi\|.+\?.+=.*)$^;
111	}
112
113
114	# do plugin specific processing of doc_obj
115	sub process {
116	my $self = shift (@_);
117	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
118	my $outhandle = $self->{'outhandle'};
119
120	print $outhandle "HTMLPlug: processing $file\n"
121	if $self->{'verbosity'} > 1;
122
123	my $cursection = $doc_obj->get_top_section();
124
125	$self->extract_metadata ($textref, $metadata, $doc_obj, $cursection)
126	unless $self->{'no_metadata'};
127
128	# Store URL for page as metadata - this can be used for an
129	# altavista style search interface. The URL won't be valid
130	# unless the file structure contains the domain name (i.e.
131	# like when w3mir is used to download a website).
132	my $web_url = "http://$file";
133	$web_url =~ s/\\/\//g; # for windows
134	$doc_obj->add_utf8_metadata($cursection, "URL", $web_url);
135
136	# remove header and footer
137	if (!$self->{'keep_head'}) {
138	$$textref =~ s/^.?<body[^>]>//is;
139	$$textref =~ s/(<\/body[^>]>\|<\/html[^>]>)//isg;
140	}
141
142	# trap links
143	if (!$self->{'nolinks'}) {
144
145	# usemap="./#index" not handled correctly => change to "#index"
146	$$textref =~ s/(<img[^>]?usemap\s=\s\"?)([^\">\s]+)(\"?[^>]>)/
147	$self->replace_usemap_links($1, $2, $3)/isge;
148
149	$$textref =~ s/(<(?:a\|area\|frame\|link)\s+[^>]?\s(?:href\|src)\s=\s\"?)([^\">\s]+)(\"?[^>]*>)/
150	$self->replace_href_links ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
151	}
152
153	# trap images
154	$$textref =~ s/(<img[^>]? src\s=\s\"?)([^\">\s]+)(\"?[^>]>)/
155	$self->replace_images ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
156
157	# add text to document object
158	$doc_obj->add_utf8_text($cursection, $$textref);
159
160	return 1;
161	}
162
163	sub replace_images {
164	my $self = shift (@_);
165	my ($front, $link, $back, $base_dir,
166	$file, $doc_obj, $section) = @_;
167
168	$link =~ s/\n/ /g;
169
170	my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
171	return $front . $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section) . $back;
172	}
173
174	sub replace_href_links {
175	my $self = shift (@_);
176	my ($front, $link, $back, $base_dir, $file, $doc_obj, $section) = @_;
177
178	# attempt to sort out targets - frames are not handled
179	# well in this plugin and some cases will screw things
180	# up - e.g. the _parent target (so we'll just remove
181	# them all ;-)
182	$front =~ s/(target=\"?)_top(\"?)/$1_gsdltop_$2/is;
183	$back =~ s/(target=\"?)_top(\"?)/$1_gsdltop_$2/is;
184	$front =~ s/target=\"?_parent\"?//is;
185	$back =~ s/target=\"?_parent\"?//is;
186
187	return $front . $link . $back if $link =~ /^\#/s;
188	$link =~ s/\n/ /g;
189
190	my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
191	# href may use '\'s where '/'s should be on Windows
192	$href =~ s/\\/\//g;
193
194	my ($filename) = $href =~ /^(?:.?):(?:\/\/)?(.)/;
195
196	##### leave all these links alone (they won't be picked up by intermediate
197	##### pages). I think that's safest when dealing with frames, targets etc.
198	##### (at least until I think of a better way to do it). Problems occur with
199	##### mailto links from within small frames, the intermediate page is displayed
200	##### within that frame and can't be seen. There is still potential for this to
201	##### happen even with html pages - the solution seems to be to somehow tell
202	##### the browser from the server side to display the page being sent (i.e.
203	##### the intermediate page) in the top level window - I'm not sure if that's
204	##### possible - the following line should probably be deleted if that can be done
205	return $front . $link . $back if $href =~ /^(mailto\|news\|gopher\|nntp\|telnet\|javascript):/is;
206
207
208	if (($rl == 0) \|\| ($filename =~ /$self->{'process_exp'}/) \|\|
209	($href =~ /\/$/) \|\| ($href =~ /^(mailto\|news\|gopher\|nntp\|telnet\|javascript):/i)) {
210	&ghtml::urlsafe ($href);
211	return $front . "_httpextlink_&rl=" . $rl . "&href=" . $href . $hash_part . $back;
212
213	} else {
214	# link is to some other type of file (image, pdf etc.) so we'll
215	# need to associate that file
216	return $front . $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section) . $back;
217	}
218	}
219
220	sub add_file {
221	my $self = shift (@_);
222	my ($href, $rl, $hash_part, $base_dir, $doc_obj, $section) = @_;
223	my ($newname);
224
225	my $filename = $href;
226	$filename =~ s/^[^:]*:\/\///;
227	$filename = &util::filename_cat($base_dir, $filename);
228
229	my ($ext) = $filename =~ /(\.[^\.]*)$/;
230
231	if ((!defined $ext) \|\| ($ext !~ /$self->{'assoc_files'}/)) {
232	return "_httpextlink_&rl=" . $rl . "&href=" . $href . $hash_part;
233	}
234
235	if ($self->{'rename_assoc_files'}) {
236	if (defined $self->{'aux_files'}->{$href}) {
237	$newname = $self->{'aux_files'}->{$href}->{'dir_num'} . "/" .
238	$self->{'aux_files'}->{$href}->{'file_num'} . $ext;
239	} else {
240	$newname = $self->{'dir_num'} . "/" . $self->{'file_num'} . $ext;
241	$self->{'aux_files'}->{$href} = {'dir_num' => $self->{'dir_num'}, 'file_num' => $self->{'file_num'}};
242	$self->inc_filecount ();
243	}
244	$doc_obj->associate_file($filename, $newname, undef, $section);
245	return "_httpcollimg_/$newname";
246
247	} else {
248	($newname) = $filename =~ /([^\/\\]*)$/;
249	$doc_obj->associate_file($filename, $newname, undef, $section);
250	return "_httpdocimg_/$newname";
251	}
252	}
253
254
255	sub format_link {
256	my $self = shift (@_);
257	my ($link, $base_dir, $file) = @_;
258
259	my ($before_hash, $hash_part) = $link =~ /^([^\#])(\#?.)$/;
260	$hash_part = "" if !defined $hash_part;
261	if (!defined $before_hash \|\| $before_hash !~ /[\w\.\/]/) {
262	my $outhandle = $self->{'outhandle'};
263	print $outhandle "HTMLPlug: ERROR - badly formatted tag ignored ($link)\n"
264	if $self->{'verbosity'};
265	return ($link, "", 0);
266	}
267
268	if ($before_hash =~ s/^((?:http\|ftp\|file):\/\/)//i) {
269	my $type = $1;
270
271	if ($link =~ /^(http\|ftp):/i) {
272	# Turn url (using /) into file name (possibly using \ on windows)
273	my @http_dir_split = split('/', $before_hash);
274	$before_hash = &util::filename_cat(@http_dir_split);
275	}
276
277	$before_hash = $self->eval_dir_dots($before_hash);
278
279	my $linkfilename = &util::filename_cat ($base_dir, $before_hash);
280
281	my $rl = 0;
282	$rl = 1 if (-e $linkfilename);
283
284	# make sure there's a slash on the end if it's a directory
285	if ($before_hash !~ /\/$/) {
286	$before_hash .= "/" if (-d $linkfilename);
287	}
288
289	return ($type . $before_hash, $hash_part, $rl);
290
291	} elsif ($link !~ /^(mailto\|news\|gopher\|nntp\|telnet\|javascript):/i) {
292
293	if ($before_hash =~ s/^\///) {
294	# the first directory will be the domain name if w3mir was used
295	# to generate archives, otherwise we'll assume all files are
296	# from the same site and base_dir is the root
297	if ($self->{'w3mir'}) {
298	my @dirs = split /[\/\\]/, $file;
299	my $domname = shift (@dirs);
300	$before_hash = &util::filename_cat($domname, $before_hash);
301	$before_hash =~ s/\\/\//g; # for windows
302	}
303	else
304	{
305	# see if link shares directory with source document
306	# => turn into relative link if this is so!
307	my $before_hash_fix = &util::filename_cat("",$before_hash);
308	$before_hash_fix =~ s/^$base_dir(\\\|\/)//;
309	$before_hash = $before_hash_fix;
310
311	}
312
313
314	} else {
315	# Turn relative file path into full path
316	my $dirname = &File::Basename::dirname($file);
317	$before_hash = &util::filename_cat($dirname, $before_hash);
318	$before_hash = $self->eval_dir_dots($before_hash);
319	}
320
321
322	my $linkfilename = &util::filename_cat ($base_dir, $before_hash);
323
324	# make sure there's a slash on the end if it's a directory
325	if ($before_hash !~ /\/$/) {
326	$before_hash .= "/" if (-d $linkfilename);
327	}
328
329	return ("http://" . $before_hash, $hash_part, 1);
330
331	} else {
332	# mailto, news, nntp, telnet, javascript or gopher link
333	return ($before_hash, "", 0);
334	}
335	}
336
337	sub extract_metadata {
338	my $self = shift (@_);
339	my ($textref, $metadata, $doc_obj, $section) = @_;
340
341	return if (!defined $self->{'metadata_fields'});
342
343	foreach my $field (split /,/, $self->{'metadata_fields'}) {
344
345	# don't need to extract field if it was passed in from a previous
346	# (recursive) plugin
347	next if defined $metadata->{$field};
348
349	# see if there's a <meta> tag for this field
350	if ($$textref =~ /<meta(.?)(?:name\|http-equiv)\s=\s\"?$field\"?([^>])/is) {
351	my $content = $1 . $2;
352	if ($content =~ /content\s=\s\"?(.*?)\"?/is) {
353	if (defined $1) {
354	my $value = $1;
355	$value =~ s/\s+/ /gs;
356	$doc_obj->add_utf8_metadata($section, $field, $value);
357	next;
358	}
359	}
360	}
361
362	# TITLE: extract the document title
363
364	if ($field =~ /^title$/i) {
365
366	# see if there's a <title> tag
367	if ($$textref =~ /<title[^>]>([^<])<\/title[^>]*>/is) {
368	if (defined $1) {
369	my $title = $1;
370	if ($title =~ /\w/) {
371	$title =~ s/\s+/ /gs;
372	$title =~ s/^\s+//;
373	$title =~ s/\s+$//;
374	$doc_obj->add_utf8_metadata ($section, $field, $title);
375	next;
376	}
377	}
378	}
379
380	# if no title use first 100 characters
381	my $tmptext = $$textref;
382	$tmptext =~ s/\s+/ /gs;
383	$tmptext =~ s/$self->{'title_sub'}// if (defined $self->{'title_sub'});
384	$tmptext =~ s/<[^>]*>//g;
385	$tmptext = substr ($tmptext, 0, 100);
386	$tmptext =~ s/^\s+//;
387	$tmptext =~ s/\s+$//;
388	$tmptext =~ s/\s\S*$/.../;
389	$doc_obj->add_utf8_metadata ($section, $field, $tmptext);
390	next;
391	}
392
393	# FIRST200: extract the first 200 characters as metadata
394
395	if ($field =~ /^first200$/i) {
396	my $tmptext = $$textref;
397	$tmptext =~ s/\s+/ /gs;
398	$tmptext =~ s/.<body[^>]>//i;
399	$tmptext =~ s/$self->{'title_sub'}// if (defined $self->{'title_sub'});
400	$tmptext =~ s/<[^>]*>//g;
401	$tmptext = substr ($tmptext, 0, 200);
402	$tmptext =~ s/^\s+//;
403	$tmptext =~ s/\s+$//;
404	$tmptext =~ s/\s\S*$/.../;
405	$doc_obj->add_utf8_metadata ($section, $field, $tmptext);
406	next;
407	}
408
409	# H1: extract the text between the first <H1> and </H1> tags
410	if ($field =~ /^H1$/i) {
411	my $tmptext = $$textref;
412	$tmptext =~ s/\s+/ /gs;
413	if ($tmptext =~ /<H1[^>]*>/i) {
414	$tmptext =~ s/.<H1[^>]>//i;
415	$tmptext =~ s/<\/H1[^>]>.//i;
416	$tmptext =~ s/^\s+//;
417	$tmptext =~ s/\s+$//;
418	$doc_obj->add_utf8_metadata ($section, $field, $tmptext);
419	}
420	next;
421	}
422	}
423	}
424
425
426	# evaluate any "../" to next directory up
427	# evaluate any "./" as here
428	sub eval_dir_dots {
429	my $self = shift (@_);
430	my ($filename) = @_;
431
432	my $dirsep_os = &util::get_os_dirsep();
433	my @dirsep = split(/$dirsep_os/,$filename);
434
435	my @eval_dirs = ();
436	foreach my $d (@dirsep) {
437	if ($d eq "..") {
438	pop(@eval_dirs);
439
440	} elsif ($d eq ".") {
441	# do nothing!
442
443	} else {
444	push(@eval_dirs,$d);
445	}
446	}
447
448	return &util::filename_cat(@eval_dirs);
449	}
450
451	sub replace_usemap_links {
452	my $self = shift (@_);
453	my ($front, $link, $back) = @_;
454
455	$link =~ s/^\.\///;
456	return $front . $link . $back;
457	}
458
459	sub inc_filecount {
460	my $self = shift (@_);
461
462	if ($self->{'file_num'} == 1000) {
463	$self->{'dir_num'} ++;
464	$self->{'file_num'} = 0;
465	} else {
466	$self->{'file_num'} ++;
467	}
468	}
469
470	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: