Context Navigation

source: trunk/gsdl/perllib/plugins/HTMLPlug.pm@ 9961

Last change on this file since 9961 was 9747, checked in by davidb, 19 years ago
Encountered new circumstance -- table -- for HTML tags that reference image. These can be in either <table> <tr> or <td> and use the attribute 'background'. Seems to be informally sorted by Netscape and IE but have my doubts over whether or not it is in the standard. Have added addtional RE processing to HTMLPlug that caters for these tags with this attribute.
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 29.9 KB

Line
1	###########################################################################
2	#
3	# HTMLPlug.pm -- basic html plugin
4	#
5	# A component of the Greenstone digital library software
6	# from the New Zealand Digital Library Project at the
7	# University of Waikato, New Zealand.
8	#
9	# Copyright (C) 1999 New Zealand Digital Library Project
10	#
11	# This program is free software; you can redistribute it and/or modify
12	# it under the terms of the GNU General Public License as published by
13	# the Free Software Foundation; either version 2 of the License, or
14	# (at your option) any later version.
15	#
16	# This program is distributed in the hope that it will be useful,
17	# but WITHOUT ANY WARRANTY; without even the implied warranty of
18	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	# GNU General Public License for more details.
20	#
21	# You should have received a copy of the GNU General Public License
22	# along with this program; if not, write to the Free Software
23	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24	#
25	###########################################################################
26
27	#
28	# Note that this plugin handles frames only in a very simple way
29	# i.e. each frame is treated as a separate document. This means
30	# search results will contain links to individual frames rather
31	# than linking to the top level frameset.
32	# There may also be some problems caused by the _parent target
33	# (it's removed by this plugin)
34	#
35
36	package HTMLPlug;
37
38	use BasPlug;
39	use ghtml;
40	use unicode;
41	use util;
42	use parsargv;
43	use XMLParser;
44
45	sub BEGIN {
46	@HTMLPlug::ISA = ('BasPlug');
47	}
48
49	use strict; # every perl program should have this!
50	no strict 'refs'; # make an exception so we can use variables as filehandles
51
52	my $arguments =
53	[ { 'name' => "process_exp",
54	'desc' => "{BasPlug.process_exp}",
55	'type' => "regexp",
56	'deft' => &get_default_process_exp() },
57	{ 'name' => "block_exp",
58	'desc' => "{BasPlug.block_exp}",
59	'type' => 'regexp',
60	'deft' => &get_default_block_exp() },
61	{ 'name' => "nolinks",
62	'desc' => "{HTMLPlug.nolinks}",
63	'type' => "flag" },
64	{ 'name' => "keep_head",
65	'desc' => "{HTMLPlug.keep_head}",
66	'type' => "flag" },
67	{ 'name' => "no_metadata",
68	'desc' => "{HTMLPlug.no_metadata}",
69	'type' => "flag" },
70	{ 'name' => "metadata_fields",
71	'desc' => "{HTMLPlug.metadata_fields}",
72	'type' => "string",
73	'deft' => "Title" },
74	{ 'name' => "hunt_creator_metadata",
75	'desc' => "{HTMLPlug.hunt_creator_metadata}",
76	'type' => "flag" },
77	{ 'name' => "file_is_url",
78	'desc' => "{HTMLPlug.file_is_url}",
79	'type' => "flag" },
80	{ 'name' => "assoc_files",
81	'desc' => "{HTMLPlug.assoc_files}",
82	'type' => "regexp",
83	'deft' => &get_default_block_exp() },
84	{ 'name' => "rename_assoc_files",
85	'desc' => "{HTMLPlug.rename_assoc_files}",
86	'type' => "flag" },
87	{ 'name' => "title_sub",
88	'desc' => "{HTMLPlug.title_sub}",
89	'type' => "string",
90	'deft' => "" },
91	{ 'name' => "description_tags",
92	'desc' => "{HTMLPlug.description_tags}",
93	'type' => "flag" },
94	{ 'name' => "no_strip_metadata_html",
95	'desc' => "{HTMLPlug.no_strip_metadata_html}",
96	'type' => "string",
97	'deft' => "",
98	'reqd' => "no"}
99	];
100
101	my $options = { 'name' => "HTMLPlug",
102	'desc' => "{HTMLPlug.desc}",
103	'abstract' => "no",
104	'inherits' => "yes",
105	'args' => $arguments };
106
107	sub new {
108	my $class = shift (@_);
109	my $self = new BasPlug ($class, @_);
110	$self->{'plugin_type'} = "HTMLPlug";
111	# 14-05-02 To allow for proper inheritance of arguments - John Thompson
112	my $option_list = $self->{'option_list'};
113	push( @{$option_list}, $options );
114
115	if (!parsargv::parse(\@_,
116	q^nolinks^, \$self->{'nolinks'},
117	q^keep_head^, \$self->{'keep_head'},
118	q^no_metadata^, \$self->{'no_metadata'},
119	q^metadata_fields/.*/Title^, \$self->{'metadata_fields'},
120	q^hunt_creator_metadata^, \$self->{'hunt_creator_metadata'},
121	q^w3mir^, \$self->{'w3mir'},
122	q^file_is_url^, \$self->{'file_is_url'},
123	q^assoc_files/.*/(?i)\.(jpe?g\|jpe\|gif\|png\|css)$^, \$self->{'assoc_files'},
124	q^rename_assoc_files^, \$self->{'rename_assoc_files'},
125	q^title_sub/.*/^, \$self->{'title_sub'},
126	q^description_tags^, \$self->{'description_tags'},
127	q^no_strip_metadata_html/.*/^, \$self->{'no_strip_metadata_html'},
128	"allow_extra_options")) {
129
130	print STDERR "\nIncorrect options passed to HTMLPlug, check your collect.cfg configuration file\n";
131	$self->print_txt_usage(""); # Use default resource bundle
132	die "\n";
133	}
134
135	# retain this for backward compatibility (w3mir option was replaced by
136	# file_is_url)
137	if ($self->{'w3mir'}) {
138	$self->{'file_is_url'} = 1;
139	}
140
141	$self->{'aux_files'} = {};
142	$self->{'dir_num'} = 0;
143	$self->{'file_num'} = 0;
144
145	return bless $self, $class;
146	}
147
148	# may want to use (?i)\.(gif\|jpe?g\|jpe\|png\|css\|js(?:@.*)?)$
149	# if have eg <script language="javascript" src="img/lib.js@123">
150	sub get_default_block_exp {
151	my $self = shift (@_);
152
153	return q^(?i)\.(gif\|jpe?g\|jpe\|jpg\|png\|css)$^;
154	}
155
156	sub get_default_process_exp {
157	my $self = shift (@_);
158
159	# the last option is an attempt to encode the concept of an html query ...
160	return q^(?i)(\.html?\|\.shtml\|\.shm\|\.asp\|\.php\d?\|\.cgi\|.+\?.+=.*)$^;
161	}
162
163	sub store_block_files
164	{
165	my $self =shift (@_);
166	my ($filename) = @_;
167	my $html_fname = $filename;
168	my @file_blocks;
169
170	my ($language, $encoding) = $self->textcat_get_language_encoding ($filename);
171
172	# read in file ($text will be in utf8)
173	my $text = "";
174	$self->read_file ($filename, $encoding, $language, \$text);
175	my $textref = \$text;
176	my $opencom = '(?:<!--\|<!(?:—\|\|--))';
177	my $closecom = '(?:-->\|(?:—\|\|--)>)';
178	$$textref =~ s/$opencom(.*?)$closecom//gs;
179
180	my $attval = "\\\"[^\\\"]+\\\"\|[^\\s>]+";
181	my @img_matches = ($$textref =~ m/<img[^>]?src\s=\s($attval)[^>]>/igs);
182	my @usemap_matches = ($$textref =~ m/<img[^>]?usemap\s=\s($attval)[^>]>/igs);
183	my @link_matches = ($$textref =~ m/<link[^>]?href\s=\s($attval)[^>]>/igs);
184	my @embed_matches = ($$textref =~ m/<embed[^>]?src\s=\s($attval)[^>]>/igs);
185	my @tabbg_matches = ($$textref =~ m/<(?:table\|tr\|td)[^>]?background\s=\s($attval)[^>]>/igs);
186
187	foreach my $link (@img_matches, @usemap_matches, @link_matches, @embed_matches, @tabbg_matches) {
188
189	# remove quotes from link at start and end if necessary
190	if ($link=~/^\"/) {
191	$link=~s/^\"//;
192	$link=~s/\"$//;
193	}
194
195	$link =~ s/\#.*$//s; # remove any anchor names, e.g. foo.html#name becomes foo.html
196
197	if ($link !~ m@^/@ && $link !~ m/^([A-Z]:?)\\/) {
198	# Turn relative file path into full path
199	my $dirname = &File::Basename::dirname($filename);
200	$link = &util::filename_cat($dirname, $link);
201	}
202	$link = $self->eval_dir_dots($link);
203
204	$self->{'file_blocks'}->{$link} = 1;
205	}
206	}
207
208
209	# do plugin specific processing of doc_obj
210	sub process {
211	my $self = shift (@_);
212	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
213	my $outhandle = $self->{'outhandle'};
214
215	print STDERR "<Processing n='$file' p='HTMLPlug'>\n" if ($gli);
216
217	print $outhandle "HTMLPlug: processing $file\n"
218	if $self->{'verbosity'} > 1;
219
220	if ($ENV{'GSDLOS'} =~ /^windows/i) {
221	# this makes life so much easier... perl can cope with unix-style '/'s.
222	$base_dir =~ s@(\\)+@/@g;
223	$file =~ s@(\\)+@/@g;
224	}
225
226	# reset per-doc stuff...
227	$self->{'aux_files'} = {};
228	$self->{'dir_num'} = 0;
229	$self->{'file_num'} = 0;
230
231	my $cursection = $doc_obj->get_top_section();
232
233	$self->extract_metadata ($textref, $metadata, $doc_obj, $cursection)
234	unless $self->{'no_metadata'} \|\| $self->{'description_tags'};
235
236	# Store URL for page as metadata - this can be used for an
237	# altavista style search interface. The URL won't be valid
238	# unless the file structure contains the domain name (i.e.
239	# like when w3mir is used to download a website).
240
241	# URL metadata (even invalid ones) are used to support internal
242	# links, so even if 'file_is_url' is off, still need to store info
243
244	my $web_url = "http://$file";
245	$doc_obj->add_metadata($cursection, "URL", $web_url);
246
247	if ($self->{'file_is_url'}) {
248	$doc_obj->add_metadata($cursection, "weblink", "<a href=\"$web_url\">");
249	$doc_obj->add_metadata($cursection, "webicon", "_iconworld_");
250	$doc_obj->add_metadata($cursection, "/weblink", "</a>");
251	}
252
253	if ($self->{'description_tags'}) {
254	# remove the html header - note that doing this here means any
255	# sections defined within the header will be lost (so all <Section>
256	# tags must appear within the body of the HTML)
257	my ($head_keep) = ($$textref =~ m/^(.?)<body[^>]>/is);
258
259	$$textref =~ s/^.?<body[^>]>//is;
260	$$textref =~ s/(<\/body[^>]>\|<\/html[^>]>)//isg;
261
262	my $opencom = '(?:<!--\|<!(?:—\|\|--))';
263	my $closecom = '(?:-->\|(?:—\|\|--)>)';
264
265	my $lt = '(?:<\|<)';
266	my $gt = '(?:>\|>)';
267	my $quot = '(?:"\|"\|”\|“)';
268
269	my $dont_strip = '';
270	if ($self->{'no_strip_metadata_html'}) {
271	($dont_strip = $self->{'no_strip_metadata_html'}) =~ s{,}{\|}g;
272	}
273
274	my $found_something = 0; my $top = 1;
275	while ($$textref =~ s/^(.?)$opencom(.?)$closecom//s) {
276	my $text = $1;
277	my $comment = $2;
278	if (defined $text) {
279	# text before a comment - note that getting to here
280	# doesn't necessarily mean there are Section tags in
281	# the document
282	$self->process_section(\$text, $base_dir, $file, $doc_obj, $cursection);
283	}
284	while ($comment =~ s/$lt(.*?)$gt//s) {
285	my $tag = $1;
286	if ($tag eq "Section") {
287	$found_something = 1;
288	$cursection = $doc_obj->insert_section($doc_obj->get_end_child($cursection)) unless $top;
289	$top = 0;
290	} elsif ($tag eq "/Section") {
291	$found_something = 1;
292	$cursection = $doc_obj->get_parent_section ($cursection);
293	} elsif ($tag =~ /^Metadata name=$quot(.*?)$quot/s) {
294	my $metaname = $1;
295	my $accumulate = $tag =~ /mode=${quot}accumulate${quot}/ ? 1 : 0;
296	$comment =~ s/^(.*?)$lt\/Metadata$gt//s;
297	my $metavalue = $1;
298	$metavalue =~ s/^\s+//;
299	$metavalue =~ s/\s+$//;
300	# assume that no metadata value intentionally includes
301	# carriage returns or HTML tags (if they're there they
302	# were probably introduced when converting to HTML from
303	# some other format).
304	# actually some people want to have html tags in their
305	# metadata.
306	$metavalue =~ s/[\cJ\cM]/ /sg;
307	$metavalue =~ s/<[^>]+>//sg
308	unless $dont_strip && ($dont_strip eq 'all' \|\| $metaname =~ /^($dont_strip)$/);
309	$metavalue =~ s/\s+/ /sg;
310	if ($accumulate) {
311	$doc_obj->add_utf8_metadata($cursection, $metaname, $metavalue);
312	} else {
313	$doc_obj->set_utf8_metadata_element($cursection, $metaname, $metavalue);
314	}
315	} elsif ($tag eq "Description" \|\| $tag eq "/Description") {
316	# do nothing with containing Description tags
317	} else {
318	# simple HTML tag (probably created by the conversion
319	# to HTML from some other format) - we'll ignore it and
320	# hope for the best ;-)
321	}
322	}
323	}
324	if ($cursection ne "") {
325	print $outhandle "HTMLPlug: WARNING: $file contains unmatched <Section></Section> tags\n";
326	}
327
328	$$textref =~ s/^.?<body[^>]>//is;
329	$$textref =~ s/(<\/body[^>]>\|<\/html[^>]>)//isg;
330	if ($$textref =~ /\S/) {
331	if (!$found_something) {
332	if ($self->{'verbosity'} > 2) {
333	print $outhandle "HTMLPlug: WARNING: $file appears to contain no Section tags so\n";
334	print $outhandle " will be processed as a single section document\n";
335	}
336
337	# go ahead and process single-section document
338	$self->process_section($textref, $base_dir, $file, $doc_obj, $cursection);
339
340	# if document contains no Section tags we'll go ahead
341	# and extract metadata (this won't have been done
342	# above as the -description_tags option prevents it)
343	my $complete_text = $head_keep.$doc_obj->get_text($cursection);
344	$self->extract_metadata (\$complete_text, $metadata, $doc_obj, $cursection)
345	unless $self->{'no_metadata'};
346
347	} else {
348	print $outhandle "HTMLPlug: WARNING: $file contains the following text outside\n";
349	print $outhandle " of the final closing </Section> tag. This text will\n";
350	print $outhandle " be ignored.";
351
352	my ($text);
353	if (length($$textref) > 30) {
354	$text = substr($$textref, 0, 30) . "...";
355	} else {
356	$text = $$textref;
357	}
358	$text =~ s/\n/ /isg;
359	print $outhandle " ($text)\n";
360	}
361	} elsif (!$found_something) {
362
363	if ($self->{'verbosity'} > 2) {
364	# may get to here if document contained no valid Section
365	# tags but did contain some comments. The text will have
366	# been processed already but we should print the warning
367	# as above and extract metadata
368	print $outhandle "HTMLPlug: WARNING: $file appears to contain no Section tags and\n";
369	print $outhandle " is blank or empty. Metadata will be assigned if present.\n";
370	}
371
372	my $complete_text = $head_keep.$doc_obj->get_text($cursection);
373	$self->extract_metadata (\$complete_text, $metadata, $doc_obj, $cursection)
374	unless $self->{'no_metadata'};
375	}
376
377	} else {
378
379	# remove header and footer
380	if (!$self->{'keep_head'} \|\| $self->{'description_tags'}) {
381	$$textref =~ s/^.?<body[^>]>//is;
382	$$textref =~ s/(<\/body[^>]>\|<\/html[^>]>)//isg;
383	}
384
385	# single section document
386	$self->process_section($textref, $base_dir, $file, $doc_obj, $cursection);
387	}
388	return 1;
389	}
390
391	# note that process_section may be called multiple times for a single
392	# section (relying on the fact that add_utf8_text appends the text to any
393	# that may exist already).
394	sub process_section {
395	my $self = shift (@_);
396	my ($textref, $base_dir, $file, $doc_obj, $cursection) = @_;
397	# trap links
398	if (!$self->{'nolinks'}) {
399
400	# usemap="./#index" not handled correctly => change to "#index"
401	$$textref =~ s/(<img[^>]?usemap\s=\s\"?)([^\">\s]+)(\"?[^>]>)/
402	$self->replace_usemap_links($1, $2, $3)/isge;
403
404	$$textref =~ s/(<(?:a\|area\|frame\|link\|script)\s+[^>]?\s(?:href\|src)\s=\s\"?)([^\">\s]+)(\"?[^>]*>)/
405	$self->replace_href_links ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
406	}
407
408	# trap images
409
410	# allow spaces if inside quotes - jrm21
411	$$textref =~ s/(<(?:img\|embed\|table\|tr\|td)[^>]?(?:src\|background)\s=\s)(\"[^\"]+\"\|[^\s>]+)([^>]>)/
412	$self->replace_images ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
413
414	# add text to document object
415	# turn \ into \\ so that the rest of greenstone doesn't think there
416	# is an escape code following. (Macro parsing loses them...)
417	$$textref =~ s/\\/\\\\/go;
418	$doc_obj->add_utf8_text($cursection, $$textref);
419	}
420
421	sub replace_images {
422	my $self = shift (@_);
423	my ($front, $link, $back, $base_dir,
424	$file, $doc_obj, $section) = @_;
425
426	# remove quotes from link at start and end if necessary
427	if ($link=~/^\"/) {
428	$link=~s/^\"//;$link=~s/\"$//;
429	$front.='"';
430	$back="\"$back";
431	}
432
433	$link =~ s/\n/ /g;
434
435	# Hack to overcome Windows wv 0.7.1 bug that causes embedded images to be broken
436	# If the Word file path has spaces in it, wv messes up and you end up with
437	# absolute paths for the images, and without the "file://" prefix
438	# So check for this special case and massage the data to be correct
439	if ($ENV{'GSDLOS'} =~ /^windows/i && $self->{'plugin_type'} eq "WordPlug" && $link =~ /^[A-Za-z]\:\\/) {
440	$link =~ s/^.*\\([^\\]+)$/$1/;
441	}
442
443	my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
444
445	my $img_file = $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section);
446
447	my $anchor_name = $img_file;
448	$anchor_name =~ s/^.*\///;
449	$anchor_name = "<a name=\"$anchor_name\">";
450
451	return $front . $img_file . $back . $anchor_name;
452	}
453
454	sub replace_href_links {
455	my $self = shift (@_);
456	my ($front, $link, $back, $base_dir, $file, $doc_obj, $section) = @_;
457
458	# attempt to sort out targets - frames are not handled
459	# well in this plugin and some cases will screw things
460	# up - e.g. the _parent target (so we'll just remove
461	# them all ;-)
462	$front =~ s/(target=\"?)_top(\"?)/$1_gsdltop_$2/is;
463	$back =~ s/(target=\"?)_top(\"?)/$1_gsdltop_$2/is;
464	$front =~ s/target=\"?_parent\"?//is;
465	$back =~ s/target=\"?_parent\"?//is;
466
467	return $front . $link . $back if $link =~ /^\#/s;
468	$link =~ s/\n/ /g;
469
470	my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
471	# href may use '\'s where '/'s should be on Windows
472	$href =~ s/\\/\//g;
473
474	my ($filename) = $href =~ /^(?:.?):(?:\/\/)?(.)/;
475
476
477	##### leave all these links alone (they won't be picked up by intermediate
478	##### pages). I think that's safest when dealing with frames, targets etc.
479	##### (at least until I think of a better way to do it). Problems occur with
480	##### mailto links from within small frames, the intermediate page is displayed
481	##### within that frame and can't be seen. There is still potential for this to
482	##### happen even with html pages - the solution seems to be to somehow tell
483	##### the browser from the server side to display the page being sent (i.e.
484	##### the intermediate page) in the top level window - I'm not sure if that's
485	##### possible - the following line should probably be deleted if that can be done
486	return $front . $link . $back if $href =~ /^(mailto\|news\|gopher\|nntp\|telnet\|javascript):/is;
487
488
489	if (($rl == 0) \|\| ($filename =~ /$self->{'process_exp'}/) \|\|
490	($href =~ /\/$/) \|\| ($href =~ /^(mailto\|news\|gopher\|nntp\|telnet\|javascript):/i)) {
491	&ghtml::urlsafe ($href);
492	return $front . "_httpextlink_&rl=" . $rl . "&href=" . $href . $hash_part . $back;
493	} else {
494	# link is to some other type of file (eg image) so we'll
495	# need to associate that file
496	return $front . $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section) . $back;
497	}
498	}
499
500	sub add_file {
501	my $self = shift (@_);
502	my ($href, $rl, $hash_part, $base_dir, $doc_obj, $section) = @_;
503	my ($newname);
504
505	my $filename = $href;
506	$filename =~ s/^[^:]*:\/\///;
507	$filename = &util::filename_cat($base_dir, $filename);
508
509	# Replace %20's in URL with a space if required. Note that the filename
510	# may include the %20 in some situations
511	if ($filename =~ /\%20/) {
512	if (!-e $filename) {
513	$filename =~ s/\%20/ /g;
514	}
515	}
516
517	my ($ext) = $filename =~ /(\.[^\.]*)$/;
518
519	if ($rl == 0) {
520	if ((!defined $ext) \|\| ($ext !~ /$self->{'assoc_files'}/)) {
521	return "_httpextlink_&rl=0&el=prompt&href=" . $href . $hash_part;
522	}
523	else {
524	return "_httpextlink_&rl=0&el=direct&href=" . $href . $hash_part;
525	}
526	}
527
528	if ((!defined $ext) \|\| ($ext !~ /$self->{'assoc_files'}/)) {
529	return "_httpextlink_&rl=" . $rl . "&href=" . $href . $hash_part;
530	}
531	if ($self->{'rename_assoc_files'}) {
532	if (defined $self->{'aux_files'}->{$href}) {
533	$newname = $self->{'aux_files'}->{$href}->{'dir_num'} . "/" .
534	$self->{'aux_files'}->{$href}->{'file_num'} . $ext;
535	} else {
536	$newname = $self->{'dir_num'} . "/" . $self->{'file_num'} . $ext;
537	$self->{'aux_files'}->{$href} = {'dir_num' => $self->{'dir_num'}, 'file_num' => $self->{'file_num'}};
538	$self->inc_filecount ();
539	}
540	$doc_obj->associate_file($filename, $newname, undef, $section);
541	return "_httpdocimg_/$newname";
542	} else {
543	($newname) = $filename =~ /([^\/\\]*)$/;
544	$doc_obj->associate_file($filename, $newname, undef, $section);
545	return "_httpdocimg_/$newname";
546	}
547	}
548
549
550	sub format_link {
551	my $self = shift (@_);
552	my ($link, $base_dir, $file) = @_;
553
554	my ($before_hash, $hash_part) = $link =~ /^([^\#])(\#?.)$/;
555
556	$hash_part = "" if !defined $hash_part;
557	if (!defined $before_hash \|\| $before_hash !~ /[\w\.\/]/) {
558	my $outhandle = $self->{'outhandle'};
559	print $outhandle "HTMLPlug: ERROR - badly formatted tag ignored ($link)\n"
560	if $self->{'verbosity'};
561	return ($link, "", 0);
562	}
563
564	if ($before_hash =~ s@^((?:http\|ftp\|file)://)@@i) {
565	my $type = $1;
566
567	if ($link =~ /^(http\|ftp):/i) {
568	# Turn url (using /) into file name (possibly using \ on windows)
569	my @http_dir_split = split('/', $before_hash);
570	$before_hash = &util::filename_cat(@http_dir_split);
571	}
572
573	$before_hash = $self->eval_dir_dots($before_hash);
574
575	my $linkfilename = &util::filename_cat ($base_dir, $before_hash);
576
577	my $rl = 0;
578	$rl = 1 if (-e $linkfilename);
579
580	# make sure there's a slash on the end if it's a directory
581	if ($before_hash !~ /\/$/) {
582	$before_hash .= "/" if (-d $linkfilename);
583	}
584
585	return ($type . $before_hash, $hash_part, $rl);
586
587	} elsif ($link !~ /^(mailto\|news\|gopher\|nntp\|telnet\|javascript):/i) {
588	if ($before_hash =~ s@^/@@ \|\| $before_hash =~ /\\/) {
589
590	# the first directory will be the domain name if file_is_url
591	# to generate archives, otherwise we'll assume all files are
592	# from the same site and base_dir is the root
593
594	if ($self->{'file_is_url'}) {
595	my @dirs = split /[\/\\]/, $file;
596	my $domname = shift (@dirs);
597	$before_hash = &util::filename_cat($domname, $before_hash);
598	$before_hash =~ s@\\@/@g; # for windows
599	}
600	else
601	{
602	# see if link shares directory with source document
603	# => turn into relative link if this is so!
604
605	if ($ENV{'GSDLOS'} =~ /^windows/i) {
606	# too difficult doing a pattern match with embedded '\'s...
607	my $win_before_hash=$before_hash;
608	$win_before_hash =~ s@(\\)+@/@g;
609	# $base_dir is already similarly "converted" on windows.
610	if ($win_before_hash =~ s@^$base_dir/@@o) {
611	# if this is true, we removed a prefix
612	$before_hash=$win_before_hash;
613	}
614	}
615	else {
616	# before_hash has lost leading slash by this point,
617	# -> add back in prior to substitution with $base_dir
618	$before_hash = "/$before_hash";
619
620	$before_hash = &util::filename_cat("",$before_hash);
621	$before_hash =~ s@^$base_dir/@@;
622	}
623	}
624	} else {
625	# Turn relative file path into full path
626	my $dirname = &File::Basename::dirname($file);
627	$before_hash = &util::filename_cat($dirname, $before_hash);
628	$before_hash = $self->eval_dir_dots($before_hash);
629	}
630
631	my $linkfilename = &util::filename_cat ($base_dir, $before_hash);
632	# make sure there's a slash on the end if it's a directory
633	if ($before_hash !~ /\/$/) {
634	$before_hash .= "/" if (-d $linkfilename);
635	}
636	return ("http://" . $before_hash, $hash_part, 1);
637	} else {
638	# mailto, news, nntp, telnet, javascript or gopher link
639	return ($before_hash, "", 0);
640	}
641	}
642
643	sub extract_first_NNNN_characters {
644	my $self = shift (@_);
645	my ($textref, $doc_obj, $thissection) = @_;
646
647	foreach my $size (split /,/, $self->{'first'}) {
648	my $tmptext = $$textref;
649	# skip to the body
650	$tmptext =~ s/.<body[^>]>//i;
651	# remove javascript
652	$tmptext =~ s@<script.*?</script>@ @sig;
653	$tmptext =~ s/<[^>]*>/ /g;
654	$tmptext =~ s/ / /g;
655	$tmptext =~ s/^\s+//;
656	$tmptext =~ s/\s+$//;
657	$tmptext =~ s/\s+/ /gs;
658	$tmptext = &unicode::substr ($tmptext, 0, $size);
659	$tmptext =~ s/\s\S*$/…/; # adds an ellipse (...)
660	$doc_obj->add_utf8_metadata ($thissection, "First$size", $tmptext);
661	}
662	}
663
664
665	sub extract_metadata {
666	my $self = shift (@_);
667	my ($textref, $metadata, $doc_obj, $section) = @_;
668	my $outhandle = $self->{'outhandle'};
669	# if we don't want metadata, we may as well not be here ...
670	return if (!defined $self->{'metadata_fields'});
671
672	# metadata fields to extract/save. 'key' is the (lowercase) name of the
673	# html meta, 'value' is the metadata name for greenstone to use
674	my %find_fields = ();
675
676	my %creator_fields = (); # short-cut for lookups
677
678
679	foreach my $field (split /,/, $self->{'metadata_fields'}) {
680	# support tag<tagname>
681	if ($field =~ /^(.?)<(.?)>$/) {
682	# "$2" is the user's preferred gs metadata name
683	$find_fields{lc($1)}=$2; # lc = lowercase
684	} else { # no <tagname> for mapping
685	# "$field" is the user's preferred gs metadata name
686	$find_fields{lc($field)}=$field; # lc = lowercase
687	}
688	}
689
690	if (defined $self->{'hunt_creator_metadata'} &&
691	$self->{'hunt_creator_metadata'} == 1 ) {
692	my @extra_fields =
693	(
694	'author',
695	'author.email',
696	'creator',
697	'dc.creator',
698	'dc.creator.corporatename',
699	);
700
701	# add the creator_metadata fields to search for
702	foreach my $field (@extra_fields) {
703	$creator_fields{$field}=0; # add to lookup hash
704	}
705	}
706
707
708	# find the header in the html file, which has the meta tags
709	$$textref =~ m@<head>(.*?)</head>@si;
710
711	my $html_header=$1;
712	# go through every <meta... tag defined in the html and see if it is
713	# one of the tags we want to match.
714
715	# special case for title - we want to remember if its been found
716	my $found_title = 0;
717	# this assumes that ">" won't appear. (I don't think it's allowed to...)
718	$html_header =~ /^/; # match the start of the string, for \G assertion
719
720	while ($html_header =~ m/\G.?<meta(.?)>/sig) {
721	my $metatag=$1;
722	my ($tag, $value);
723
724	# find the tag name
725	$metatag =~ /(?:name\|http-equiv)\s=\s([\"\'])?(.*?)\1/is;
726	$tag=$2;
727	# in case they're not using " or ', but they should...
728	if (! $tag) {
729	$metatag =~ /(?:name\|http-equiv)\s=\s([^\s\>]+)/is;
730	$tag=$1;
731	}
732
733	if (!defined $tag) {
734	print $outhandle "HTMLPlug: can't find NAME in \"$metatag\"\n";
735	next;
736	}
737
738	# don't need to assign this field if it was passed in from a previous
739	# (recursive) plugin
740	if (defined $metadata->{$tag}) {next}
741
742	# find the tag content
743	$metatag =~ /content\s=\s([\"\'])?(.*?)\1/is;
744	$value=$2;
745	if (! $value) {
746	$metatag =~ /(?:name\|http-equiv)\s=\s([^\s\>]+)/is;
747	$value=$1;
748	}
749	if (!defined $value) {
750	print $outhandle "HTMLPlug: can't find VALUE in \"$metatag\"\n";
751	next;
752	}
753
754	# clean up and add
755	$value =~ s/\s+/ /gs;
756	chomp($value); # remove trailing \n, if any
757	if (exists $creator_fields{lc($tag)}) {
758	# map this value onto greenstone's "Creator" metadata
759	$tag='Creator';
760	} elsif (!exists $find_fields{lc($tag)}) {
761	next; # don't want this tag
762	} else {
763	# get the user's preferred capitalisation
764	$tag = $find_fields{lc($tag)};
765	}
766	if (lc($tag) eq "title") {
767	$found_title = 1;
768	}
769	print $outhandle " extracted \"$tag\" metadata \"$value\"\n"
770	if ($self->{'verbosity'} > 2);
771	$doc_obj->add_utf8_metadata($section, $tag, $value);
772
773	}
774
775	# TITLE: extract the document title
776	if (exists $find_fields{'title'} && !$found_title) {
777	# we want a title, and didn't find one in the meta tags
778	# see if there's a <title> tag
779	my $title;
780	my $from = ""; # for debugging output only
781	if ($html_header =~ /<title[^>]>([^<]+)<\/title[^>]>/is) {
782	$title = $1;
783	$from = "<title> tags";
784	}
785
786	if (!defined $title) {
787	$from = "first 100 chars";
788	# if no title use first 100 or so characters
789	$title = $$textref;
790	$title =~ s/^\xFE\xFF//; # Remove unicode byte order mark
791	$title =~ s/^.*?<body>//si;
792	# ignore javascript!
793	$title =~ s@<script.*?</script>@ @sig;
794	$title =~ s/<\/([^>]+)><\1>//g; # (eg) </b><b> - no space
795	$title =~ s/<[^>]*>/ /g; # remove all HTML tags
796	$title = substr ($title, 0, 100);
797	$title =~ s/\s\S*$/.../;
798	}
799	$title =~ s/<[^>]*>/ /g; # remove html tags
800	$title =~ s/ / /g;
801	$title =~ s/(?: \|\xc2\xa0)/ /g; # utf-8 for nbsp...
802	$title =~ s/\s+/ /gs; # collapse multiple spaces
803	$title =~ s/^\s*//; # remove leading spaces
804	$title =~ s/\s*$//; # remove trailing spaces
805
806	$title =~ s/^$self->{'title_sub'}// if ($self->{'title_sub'});
807	$title =~ s/^\s+//s; # in case title_sub introduced any...
808	$doc_obj->add_utf8_metadata ($section, 'Title', $title);
809	print $outhandle " extracted Title metadata \"$title\" from $from\n"
810	if ($self->{'verbosity'} > 2);
811	}
812
813	# add FileFormat metadata
814	$doc_obj->add_metadata($section,"FileFormat", "HTML");
815
816	# Special, for metadata names such as tagH1 - extracts
817	# the text between the first <H1> and </H1> tags into "H1" metadata.
818
819	foreach my $field (keys %find_fields) {
820	if ($field !~ /^tag([a-z0-9]+)$/i) {next}
821	my $tag = $1;
822	if ($$textref =~ m@<$tag[^>]>(.?)</$tag[^>]*>@g) {
823	my $content = $1;
824	$content =~ s/ / /g;
825	$content =~ s/<[^>]*>/ /g;
826	$content =~ s/^\s+//;
827	$content =~ s/\s+$//;
828	$content =~ s/\s+/ /gs;
829	if ($content) {
830	$tag=$find_fields{"tag$tag"}; # get the user's capitalisation
831	$tag =~ s/^tag//i;
832	$doc_obj->add_utf8_metadata ($section, $tag, $content);
833	print $outhandle " extracted \"$tag\" metadata \"$content\"\n"
834	if ($self->{'verbosity'} > 2);
835	}
836	}
837	}
838	}
839
840
841	# evaluate any "../" to next directory up
842	# evaluate any "./" as here
843	sub eval_dir_dots {
844	my $self = shift (@_);
845	my ($filename) = @_;
846	my $dirsep_os = &util::get_os_dirsep();
847	my @dirsep = split(/$dirsep_os/,$filename);
848
849	my @eval_dirs = ();
850	foreach my $d (@dirsep) {
851	if ($d eq "..") {
852	pop(@eval_dirs);
853
854	} elsif ($d eq ".") {
855	# do nothing!
856
857	} else {
858	push(@eval_dirs,$d);
859	}
860	}
861
862	# Need to fiddle with number of elements in @eval_dirs if the
863	# first one is the empty string. This is because of a
864	# modification to util::filename_cat that supresses the addition
865	# of a leading '/' character (or \ if windows) (intended to help
866	# filename cat with relative paths) if the first entry in the
867	# array is the empty string. Making the array start with two
868	# empty strings is a way to defeat this "smart" option.
869	#
870	if (scalar(@eval_dirs) > 0) {
871	if ($eval_dirs[0] eq ""){
872	unshift(@eval_dirs,"");
873	}
874	}
875	return &util::filename_cat(@eval_dirs);
876	}
877
878	sub replace_usemap_links {
879	my $self = shift (@_);
880	my ($front, $link, $back) = @_;
881
882	$link =~ s/^\.\///;
883	return $front . $link . $back;
884	}
885
886	sub inc_filecount {
887	my $self = shift (@_);
888
889	if ($self->{'file_num'} == 1000) {
890	$self->{'dir_num'} ++;
891	$self->{'file_num'} = 0;
892	} else {
893	$self->{'file_num'} ++;
894	}
895	}
896
897
898	# Extend the BasPlug read_file so that strings like é are
899	# converted to UTF8 internally.
900	#
901	# We don't convert < or > or & or " in case
902	# they interfere with the GML files
903
904	sub read_file {
905	my ($self, $filename, $encoding, $language, $textref) = @_;
906
907	&BasPlug::read_file($self, $filename, $encoding, $language, $textref);
908
909	# Convert entities to their UTF8 equivalents
910	$$textref =~ s/&(lt\|gt\|amp\|quot\|nbsp);/&z$1;/go;
911	$$textref =~ s/&([^;]+);/&ghtml::getcharequiv($1,1)/gseo;
912	$$textref =~ s/&z(lt\|gt\|amp\|quot\|nbsp);/&$1;/go;
913	}
914
915	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: