Context Navigation

source: trunk/gsdl/perllib/plugins/HTMLPlug.pm@ 9056

Last change on this file since 9056 was 9056, checked in by kjdon, 19 years ago
added an option to not strip html tags from metadata in description tags. contributed by jens wille
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 29.6 KB

Line
1	###########################################################################
2	#
3	# HTMLPlug.pm -- basic html plugin
4	#
5	# A component of the Greenstone digital library software
6	# from the New Zealand Digital Library Project at the
7	# University of Waikato, New Zealand.
8	#
9	# Copyright (C) 1999 New Zealand Digital Library Project
10	#
11	# This program is free software; you can redistribute it and/or modify
12	# it under the terms of the GNU General Public License as published by
13	# the Free Software Foundation; either version 2 of the License, or
14	# (at your option) any later version.
15	#
16	# This program is distributed in the hope that it will be useful,
17	# but WITHOUT ANY WARRANTY; without even the implied warranty of
18	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	# GNU General Public License for more details.
20	#
21	# You should have received a copy of the GNU General Public License
22	# along with this program; if not, write to the Free Software
23	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24	#
25	###########################################################################
26
27	#
28	# Note that this plugin handles frames only in a very simple way
29	# i.e. each frame is treated as a separate document. This means
30	# search results will contain links to individual frames rather
31	# than linking to the top level frameset.
32	# There may also be some problems caused by the _parent target
33	# (it's removed by this plugin)
34	#
35
36	package HTMLPlug;
37
38	use BasPlug;
39	use ghtml;
40	use unicode;
41	use util;
42	use parsargv;
43	use XMLParser;
44
45	sub BEGIN {
46	@HTMLPlug::ISA = ('BasPlug');
47	}
48
49	use strict; # every perl program should have this!
50	no strict 'refs'; # make an exception so we can use variables as filehandles
51
52	my $arguments =
53	[ { 'name' => "process_exp",
54	'desc' => "{BasPlug.process_exp}",
55	'type' => "regexp",
56	'deft' => &get_default_process_exp() },
57	{ 'name' => "block_exp",
58	'desc' => "{BasPlug.block_exp}",
59	'type' => 'regexp',
60	'deft' => &get_default_block_exp() },
61	{ 'name' => "nolinks",
62	'desc' => "{HTMLPlug.nolinks}",
63	'type' => "flag" },
64	{ 'name' => "keep_head",
65	'desc' => "{HTMLPlug.keep_head}",
66	'type' => "flag" },
67	{ 'name' => "no_metadata",
68	'desc' => "{HTMLPlug.no_metadata}",
69	'type' => "flag" },
70	{ 'name' => "metadata_fields",
71	'desc' => "{HTMLPlug.metadata_fields}",
72	'type' => "string",
73	'deft' => "Title" },
74	{ 'name' => "hunt_creator_metadata",
75	'desc' => "{HTMLPlug.hunt_creator_metadata}",
76	'type' => "flag" },
77	{ 'name' => "file_is_url",
78	'desc' => "{HTMLPlug.file_is_url}",
79	'type' => "flag" },
80	{ 'name' => "assoc_files",
81	'desc' => "{HTMLPlug.assoc_files}",
82	'type' => "regexp",
83	'deft' => &get_default_block_exp() },
84	{ 'name' => "rename_assoc_files",
85	'desc' => "{HTMLPlug.rename_assoc_files}",
86	'type' => "flag" },
87	{ 'name' => "title_sub",
88	'desc' => "{HTMLPlug.title_sub}",
89	'type' => "string",
90	'deft' => "" },
91	{ 'name' => "description_tags",
92	'desc' => "{HTMLPlug.description_tags}",
93	'type' => "flag" } ,
94	{ 'name' => "no_strip_metadata_html",
95	'desc' => "{HTMLPlug.no_strip_metadata_html}",
96	'type' => "string",
97	'deft' => "",
98	'reqd' => "no"}
99	];
100
101	my $options = { 'name' => "HTMLPlug",
102	'desc' => "{HTMLPlug.desc}",
103	'abstract' => "no",
104	'inherits' => "yes",
105	'args' => $arguments };
106
107	sub new {
108	my $class = shift (@_);
109	my $self = new BasPlug ($class, @_);
110	$self->{'plugin_type'} = "HTMLPlug";
111	# 14-05-02 To allow for proper inheritance of arguments - John Thompson
112	my $option_list = $self->{'option_list'};
113	push( @{$option_list}, $options );
114
115	if (!parsargv::parse(\@_,
116	q^nolinks^, \$self->{'nolinks'},
117	q^keep_head^, \$self->{'keep_head'},
118	q^no_metadata^, \$self->{'no_metadata'},
119	q^metadata_fields/.*/Title^, \$self->{'metadata_fields'},
120	q^hunt_creator_metadata^, \$self->{'hunt_creator_metadata'},
121	q^w3mir^, \$self->{'w3mir'},
122	q^file_is_url^, \$self->{'file_is_url'},
123	q^assoc_files/.*/(?i)\.(jpe?g\|jpe\|gif\|png\|css)$^, \$self->{'assoc_files'},
124	q^rename_assoc_files^, \$self->{'rename_assoc_files'},
125	q^title_sub/.*/^, \$self->{'title_sub'},
126	q^description_tags^, \$self->{'description_tags'},
127	q^no_strip_metadata_html/.*/^, \$self->{'no_strip_metadata_html'},
128	"allow_extra_options")) {
129
130	print STDERR "\nIncorrect options passed to HTMLPlug, check your collect.cfg configuration file\n";
131	$self->print_txt_usage(""); # Use default resource bundle
132	die "\n";
133	}
134
135	# retain this for backward compatibility (w3mir option was replaced by
136	# file_is_url)
137	if ($self->{'w3mir'}) {
138	$self->{'file_is_url'} = 1;
139	}
140
141	$self->{'aux_files'} = {};
142	$self->{'dir_num'} = 0;
143	$self->{'file_num'} = 0;
144
145	return bless $self, $class;
146	}
147
148	# may want to use (?i)\.(gif\|jpe?g\|jpe\|png\|css\|js(?:@.*)?)$
149	# if have eg <script language="javascript" src="img/lib.js@123">
150	sub get_default_block_exp {
151	my $self = shift (@_);
152
153	return q^(?i)\.(gif\|jpe?g\|jpe\|jpg\|png\|css)$^;
154	}
155
156	sub get_default_process_exp {
157	my $self = shift (@_);
158
159	# the last option is an attempt to encode the concept of an html query ...
160	return q^(?i)(\.html?\|\.shtml\|\.shm\|\.asp\|\.php\d?\|\.cgi\|.+\?.+=.*)$^;
161	}
162
163
164	sub metadata_read {
165	my $self = shift (@_);
166	my ($pluginfo, $base_dir, $file, $metadata, $extrametakeys, $extrametadata, $processor, $maxdocs, $gli) = @_;
167
168	my $outhandle = $self->{'outhandle'};
169
170	my $filename = $file;
171	$filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;
172
173	my ($dir) = $filename =~ /^(.?)[^\/\\]$/;
174
175	if ($filename !~ /$self->{'process_exp'}/ \|\| !-f $filename) {
176	return undef; # can't recognise
177	}
178
179	if ($self->{'smart_block'}) {
180	# Do encoding stuff
181	my ($language, $encoding) = $self->textcat_get_language_encoding ($filename);
182
183	# read in file ($text will be in utf8)
184	my $text = "";
185	$self->read_file ($filename, $encoding, $language, \$text);
186	$self->store_block_files (\$text, $filename);
187	}
188	return 1;
189	}
190
191	sub store_block_files
192	{
193	my $self =shift (@_);
194	my ($textref, $filename) = @_;
195
196	my $html_fname = $filename;
197	my @file_blocks;
198
199	my $opencom = '(?:<!--\|<!(?:—\|\|--))';
200	my $closecom = '(?:-->\|(?:—\|\|--)>)';
201	$$textref =~ s/$opencom(.*?)$closecom//gs;
202
203	my $attval = "\\\"[^\\\"]+\\\"\|[^\\s>]+";
204	my @img_matches = ($$textref =~ m/<img[^>]?src\s=\s($attval)[^>]>/igs);
205	my @usemap_matches = ($$textref =~ m/<img[^>]?usemap\s=\s($attval)[^>]>/igs);
206	my @link_matches = ($$textref =~ m/<link[^>]?href\s=\s($attval)[^>]>/igs);
207
208
209	foreach my $link (@img_matches, @usemap_matches, @link_matches) {
210
211	# remove quotes from link at start and end if necessary
212	if ($link=~/^\"/) {
213	$link=~s/^\"//;
214	$link=~s/\"$//;
215	}
216
217	$link =~ s/\#.*$//s; # remove any anchor names, e.g. foo.html#name becomes foo.html
218
219	if ($link !~ s@^/@@ && $link !~ /^([A-Z]:?)\\/) {
220	# Turn relative file path into full path
221	my $dirname = &File::Basename::dirname($filename);
222	$link = &util::filename_cat($dirname, $link);
223	}
224	$link = $self->eval_dir_dots($link);
225	$self->{'file_blocks'}->{$link} = 1;
226	}
227	}
228
229	# do plugin specific processing of doc_obj
230	sub process {
231	my $self = shift (@_);
232	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
233	my $outhandle = $self->{'outhandle'};
234
235	print STDERR "<Processing n='$file' p='HTMLPlug'>\n" if ($gli);
236
237	print $outhandle "HTMLPlug: processing $file\n"
238	if $self->{'verbosity'} > 1;
239
240	if ($ENV{'GSDLOS'} =~ /^windows/i) {
241	# this makes life so much easier... perl can cope with unix-style '/'s.
242	$base_dir =~ s@(\\)+@/@g;
243	$file =~ s@(\\)+@/@g;
244	}
245
246	# reset per-doc stuff...
247	$self->{'aux_files'} = {};
248	$self->{'dir_num'} = 0;
249	$self->{'file_num'} = 0;
250
251	my $cursection = $doc_obj->get_top_section();
252
253	$self->extract_metadata ($textref, $metadata, $doc_obj, $cursection)
254	unless $self->{'no_metadata'} \|\| $self->{'description_tags'};
255
256	# Store URL for page as metadata - this can be used for an
257	# altavista style search interface. The URL won't be valid
258	# unless the file structure contains the domain name (i.e.
259	# like when w3mir is used to download a website).
260	my $web_url = "http://$file";
261	$doc_obj->add_metadata($cursection, "URL", $web_url);
262
263	if ($self->{'description_tags'}) {
264	# remove the html header - note that doing this here means any
265	# sections defined within the header will be lost (so all <Section>
266	# tags must appear within the body of the HTML)
267	my ($head_keep) = ($$textref =~ m/^(.?)<body[^>]>/is);
268
269	$$textref =~ s/^.?<body[^>]>//is;
270	$$textref =~ s/(<\/body[^>]>\|<\/html[^>]>)//isg;
271
272	my $opencom = '(?:<!--\|<!(?:—\|\|--))';
273	my $closecom = '(?:-->\|(?:—\|\|--)>)';
274
275	my $lt = '(?:<\|<)';
276	my $gt = '(?:>\|>)';
277	my $quot = '(?:"\|"\|”\|“)';
278
279	my $found_something = 0; my $top = 1;
280	while ($$textref =~ s/^(.?)$opencom(.?)$closecom//s) {
281	my $text = $1;
282	my $comment = $2;
283	if (defined $text) {
284	# text before a comment - note that getting to here
285	# doesn't necessarily mean there are Section tags in
286	# the document
287	$self->process_section(\$text, $base_dir, $file, $doc_obj, $cursection);
288	}
289	while ($comment =~ s/$lt(.*?)$gt//s) {
290	my $tag = $1;
291	if ($tag eq "Section") {
292	$found_something = 1;
293	$cursection = $doc_obj->insert_section($doc_obj->get_end_child($cursection)) unless $top;
294	$top = 0;
295	} elsif ($tag eq "/Section") {
296	$found_something = 1;
297	$cursection = $doc_obj->get_parent_section ($cursection);
298	} elsif ($tag =~ /^Metadata name=$quot(.*?)$quot/s) {
299	my $metaname = $1;
300	my $accumulate = $tag =~ /mode=${quot}accumulate${quot}/ ? 1 : 0;
301	$comment =~ s/^(.*?)$lt\/Metadata$gt//s;
302	my $metavalue = $1;
303	$metavalue =~ s/^\s+//;
304	$metavalue =~ s/\s+$//;
305	# assume that no metadata value intentionally includes
306	# carriage returns or HTML tags (if they're there they
307	# were probably introduced when converting to HTML from
308	# some other format).
309	# actually some people want to have html tags in their
310	# metadata.
311	$metavalue =~ s/[\cJ\cM]/ /sg;
312	my $dont = '';
313	if ($self->{'no_strip_metadata_html'}) {
314	($dont = $self->{'no_strip_metadata_html'}) =~ s{,}{\|}g;
315	}
316	$metavalue =~ s/<[^>]+>//sg
317	unless $dont && ($dont eq 'all' \|\| $metaname =~ /^($dont)$/);
318	if ($dont && ($dont eq 'all' \|\| $metaname =~ /^($dont)$/)) {
319	}
320	$metavalue =~ s/\s+/ /sg;
321	if ($accumulate) {
322	$doc_obj->add_utf8_metadata($cursection, $metaname, $metavalue);
323	} else {
324	$doc_obj->set_utf8_metadata_element($cursection, $metaname, $metavalue);
325	}
326	} elsif ($tag eq "Description" \|\| $tag eq "/Description") {
327	# do nothing with containing Description tags
328	} else {
329	# simple HTML tag (probably created by the conversion
330	# to HTML from some other format) - we'll ignore it and
331	# hope for the best ;-)
332	}
333	}
334	}
335	if ($cursection ne "") {
336	print $outhandle "HTMLPlug: WARNING: $file contains unmatched <Section></Section> tags\n";
337	}
338
339	$$textref =~ s/^.?<body[^>]>//is;
340	$$textref =~ s/(<\/body[^>]>\|<\/html[^>]>)//isg;
341	if ($$textref =~ /\S/) {
342	if (!$found_something) {
343	if ($self->{'verbosity'} > 2) {
344	print $outhandle "HTMLPlug: WARNING: $file appears to contain no Section tags so\n";
345	print $outhandle " will be processed as a single section document\n";
346	}
347
348	# go ahead and process single-section document
349	$self->process_section($textref, $base_dir, $file, $doc_obj, $cursection);
350
351	# if document contains no Section tags we'll go ahead
352	# and extract metadata (this won't have been done
353	# above as the -description_tags option prevents it)
354	my $complete_text = $head_keep.$doc_obj->get_text($cursection);
355	$self->extract_metadata (\$complete_text, $metadata, $doc_obj, $cursection)
356	unless $self->{'no_metadata'};
357
358	} else {
359	print $outhandle "HTMLPlug: WARNING: $file contains the following text outside\n";
360	print $outhandle " of the final closing </Section> tag. This text will\n";
361	print $outhandle " be ignored.";
362
363	my ($text);
364	if (length($$textref) > 30) {
365	$text = substr($$textref, 0, 30) . "...";
366	} else {
367	$text = $$textref;
368	}
369	$text =~ s/\n/ /isg;
370	print $outhandle " ($text)\n";
371	}
372	} elsif (!$found_something) {
373
374	if ($self->{'verbosity'} > 2) {
375	# may get to here if document contained no valid Section
376	# tags but did contain some comments. The text will have
377	# been processed already but we should print the warning
378	# as above and extract metadata
379	print $outhandle "HTMLPlug: WARNING: $file appears to contain no Section tags and\n";
380	print $outhandle " is blank or empty. Metadata will be assigned if present.\n";
381	}
382
383	my $complete_text = $head_keep.$doc_obj->get_text($cursection);
384	$self->extract_metadata (\$complete_text, $metadata, $doc_obj, $cursection)
385	unless $self->{'no_metadata'};
386	}
387
388	} else {
389
390	# remove header and footer
391	if (!$self->{'keep_head'} \|\| $self->{'description_tags'}) {
392	$$textref =~ s/^.?<body[^>]>//is;
393	$$textref =~ s/(<\/body[^>]>\|<\/html[^>]>)//isg;
394	}
395
396	# single section document
397	$self->process_section($textref, $base_dir, $file, $doc_obj, $cursection);
398	}
399	return 1;
400	}
401
402	# note that process_section may be called multiple times for a single
403	# section (relying on the fact that add_utf8_text appends the text to any
404	# that may exist already).
405	sub process_section {
406	my $self = shift (@_);
407	my ($textref, $base_dir, $file, $doc_obj, $cursection) = @_;
408	# trap links
409	if (!$self->{'nolinks'}) {
410
411	# usemap="./#index" not handled correctly => change to "#index"
412	$$textref =~ s/(<img[^>]?usemap\s=\s\"?)([^\">\s]+)(\"?[^>]>)/
413	$self->replace_usemap_links($1, $2, $3)/isge;
414
415	$$textref =~ s/(<(?:a\|area\|frame\|link\|script)\s+[^>]?\s(?:href\|src)\s=\s\"?)([^\">\s]+)(\"?[^>]*>)/
416	$self->replace_href_links ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
417	}
418
419	# trap images
420
421	# allow spaces if inside quotes - jrm21
422	$$textref =~ s/(<img[^>]?src\s=\s)(\"[^\"]+\"\|[^\s>]+)([^>]>)/
423	$self->replace_images ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
424
425	# add text to document object
426	# turn \ into \\ so that the rest of greenstone doesn't think there
427	# is an escape code following. (Macro parsing loses them...)
428	$$textref =~ s/\\/\\\\/go;
429	$doc_obj->add_utf8_text($cursection, $$textref);
430	}
431
432	sub replace_images {
433	my $self = shift (@_);
434	my ($front, $link, $back, $base_dir,
435	$file, $doc_obj, $section) = @_;
436
437	# remove quotes from link at start and end if necessary
438	if ($link=~/^\"/) {
439	$link=~s/^\"//;$link=~s/\"$//;
440	$front.='"';
441	$back="\"$back";
442	}
443
444	$link =~ s/\n/ /g;
445
446	# Hack to overcome Windows wv 0.7.1 bug that causes embedded images to be broken
447	# If the Word file path has spaces in it, wv messes up and you end up with
448	# absolute paths for the images, and without the "file://" prefix
449	# So check for this special case and massage the data to be correct
450	if ($ENV{'GSDLOS'} =~ /^windows/i && $self->{'plugin_type'} eq "WordPlug" && $link =~ /^[A-Za-z]\:\\/) {
451	$link =~ s/^.*\\([^\\]+)$/$1/;
452	}
453
454	my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
455
456	my $img_file = $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section);
457	my $anchor_name = $img_file;
458	$anchor_name =~ s/^.*\///;
459	$anchor_name = "<a name=\"$anchor_name\">";
460
461	return $front . $img_file . $back . $anchor_name;
462	}
463
464	sub replace_href_links {
465	my $self = shift (@_);
466	my ($front, $link, $back, $base_dir, $file, $doc_obj, $section) = @_;
467
468	# attempt to sort out targets - frames are not handled
469	# well in this plugin and some cases will screw things
470	# up - e.g. the _parent target (so we'll just remove
471	# them all ;-)
472	$front =~ s/(target=\"?)_top(\"?)/$1_gsdltop_$2/is;
473	$back =~ s/(target=\"?)_top(\"?)/$1_gsdltop_$2/is;
474	$front =~ s/target=\"?_parent\"?//is;
475	$back =~ s/target=\"?_parent\"?//is;
476
477	return $front . $link . $back if $link =~ /^\#/s;
478	$link =~ s/\n/ /g;
479
480	my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
481	# href may use '\'s where '/'s should be on Windows
482	$href =~ s/\\/\//g;
483
484	my ($filename) = $href =~ /^(?:.?):(?:\/\/)?(.)/;
485
486
487	##### leave all these links alone (they won't be picked up by intermediate
488	##### pages). I think that's safest when dealing with frames, targets etc.
489	##### (at least until I think of a better way to do it). Problems occur with
490	##### mailto links from within small frames, the intermediate page is displayed
491	##### within that frame and can't be seen. There is still potential for this to
492	##### happen even with html pages - the solution seems to be to somehow tell
493	##### the browser from the server side to display the page being sent (i.e.
494	##### the intermediate page) in the top level window - I'm not sure if that's
495	##### possible - the following line should probably be deleted if that can be done
496	return $front . $link . $back if $href =~ /^(mailto\|news\|gopher\|nntp\|telnet\|javascript):/is;
497
498
499	if (($rl == 0) \|\| ($filename =~ /$self->{'process_exp'}/) \|\|
500	($href =~ /\/$/) \|\| ($href =~ /^(mailto\|news\|gopher\|nntp\|telnet\|javascript):/i)) {
501	&ghtml::urlsafe ($href);
502	return $front . "_httpextlink_&rl=" . $rl . "&href=" . $href . $hash_part . $back;
503	} else {
504	# link is to some other type of file (eg image) so we'll
505	# need to associate that file
506	return $front . $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section) . $back;
507	}
508	}
509
510	sub add_file {
511	my $self = shift (@_);
512	my ($href, $rl, $hash_part, $base_dir, $doc_obj, $section) = @_;
513	my ($newname);
514
515	my $filename = $href;
516	$filename =~ s/^[^:]*:\/\///;
517	$filename = &util::filename_cat($base_dir, $filename);
518
519	# Replace %20's in URL with a space if required. Note that the filename
520	# may include the %20 in some situations
521	if ($filename =~ /\%20/) {
522	if (!-e $filename) {
523	$filename =~ s/\%20/ /g;
524	}
525	}
526
527	my ($ext) = $filename =~ /(\.[^\.]*)$/;
528
529	if ((!defined $ext) \|\| ($ext !~ /$self->{'assoc_files'}/)) {
530	return "_httpextlink_&rl=" . $rl . "&href=" . $href . $hash_part;
531	}
532	if ($self->{'rename_assoc_files'}) {
533	if (defined $self->{'aux_files'}->{$href}) {
534	$newname = $self->{'aux_files'}->{$href}->{'dir_num'} . "/" .
535	$self->{'aux_files'}->{$href}->{'file_num'} . $ext;
536	} else {
537	$newname = $self->{'dir_num'} . "/" . $self->{'file_num'} . $ext;
538	$self->{'aux_files'}->{$href} = {'dir_num' => $self->{'dir_num'}, 'file_num' => $self->{'file_num'}};
539	$self->inc_filecount ();
540	}
541	$doc_obj->associate_file($filename, $newname, undef, $section);
542	return "_httpdocimg_/$newname";
543	} else {
544	($newname) = $filename =~ /([^\/\\]*)$/;
545	$doc_obj->associate_file($filename, $newname, undef, $section);
546	return "_httpdocimg_/$newname";
547	}
548	}
549
550
551	sub format_link {
552	my $self = shift (@_);
553	my ($link, $base_dir, $file) = @_;
554
555	my ($before_hash, $hash_part) = $link =~ /^([^\#])(\#?.)$/;
556
557	$hash_part = "" if !defined $hash_part;
558	if (!defined $before_hash \|\| $before_hash !~ /[\w\.\/]/) {
559	my $outhandle = $self->{'outhandle'};
560	print $outhandle "HTMLPlug: ERROR - badly formatted tag ignored ($link)\n"
561	if $self->{'verbosity'};
562	return ($link, "", 0);
563	}
564
565	if ($before_hash =~ s@^((?:http\|ftp\|file)://)@@i) {
566	my $type = $1;
567
568	if ($link =~ /^(http\|ftp):/i) {
569	# Turn url (using /) into file name (possibly using \ on windows)
570	my @http_dir_split = split('/', $before_hash);
571	$before_hash = &util::filename_cat(@http_dir_split);
572	}
573
574	$before_hash = $self->eval_dir_dots($before_hash);
575
576	my $linkfilename = &util::filename_cat ($base_dir, $before_hash);
577
578	my $rl = 0;
579	$rl = 1 if (-e $linkfilename);
580
581	# make sure there's a slash on the end if it's a directory
582	if ($before_hash !~ /\/$/) {
583	$before_hash .= "/" if (-d $linkfilename);
584	}
585
586	return ($type . $before_hash, $hash_part, $rl);
587
588	} elsif ($link !~ /^(mailto\|news\|gopher\|nntp\|telnet\|javascript):/i) {
589	if ($before_hash =~ s@^/@@ \|\| $before_hash =~ /\\/) {
590
591	# the first directory will be the domain name if file_is_url
592	# to generate archives, otherwise we'll assume all files are
593	# from the same site and base_dir is the root
594
595	if ($self->{'file_is_url'}) {
596	my @dirs = split /[\/\\]/, $file;
597	my $domname = shift (@dirs);
598	$before_hash = &util::filename_cat($domname, $before_hash);
599	$before_hash =~ s@\\@/@g; # for windows
600	}
601	else
602	{
603	# see if link shares directory with source document
604	# => turn into relative link if this is so!
605
606	if ($ENV{'GSDLOS'} =~ /^windows/i) {
607	# too difficult doing a pattern match with embedded '\'s...
608	my $win_before_hash=$before_hash;
609	$win_before_hash =~ s@(\\)+@/@g;
610	# $base_dir is already similarly "converted" on windows.
611	if ($win_before_hash =~ s@^$base_dir/@@o) {
612	# if this is true, we removed a prefix
613	$before_hash=$win_before_hash;
614	}
615	}
616	else {
617	$before_hash = &util::filename_cat("",$before_hash);
618	$before_hash =~ s@^$base_dir/@@;
619	}
620	}
621	} else {
622	# Turn relative file path into full path
623	my $dirname = &File::Basename::dirname($file);
624	$before_hash = &util::filename_cat($dirname, $before_hash);
625	$before_hash = $self->eval_dir_dots($before_hash);
626	}
627
628	my $linkfilename = &util::filename_cat ($base_dir, $before_hash);
629	# make sure there's a slash on the end if it's a directory
630	if ($before_hash !~ /\/$/) {
631	$before_hash .= "/" if (-d $linkfilename);
632	}
633	return ("http://" . $before_hash, $hash_part, 1);
634	} else {
635	# mailto, news, nntp, telnet, javascript or gopher link
636	return ($before_hash, "", 0);
637	}
638	}
639
640	sub extract_first_NNNN_characters {
641	my $self = shift (@_);
642	my ($textref, $doc_obj, $thissection) = @_;
643
644	foreach my $size (split /,/, $self->{'first'}) {
645	my $tmptext = $$textref;
646	# skip to the body
647	$tmptext =~ s/.<body[^>]>//i;
648	# remove javascript
649	$tmptext =~ s@<script.*?</script>@ @sig;
650	$tmptext =~ s/<[^>]*>/ /g;
651	$tmptext =~ s/ / /g;
652	$tmptext =~ s/^\s+//;
653	$tmptext =~ s/\s+$//;
654	$tmptext =~ s/\s+/ /gs;
655	# with perl 5.6 at least, substr might segment at a multi-byte char...
656	use utf8;
657	$tmptext = substr ($tmptext, 0, $size);
658	$tmptext =~ s/\s\S*$/…/; # adds an ellipse (...)
659	$doc_obj->add_utf8_metadata ($thissection, "First$size", $tmptext);
660	}
661	}
662
663
664	sub extract_metadata {
665	my $self = shift (@_);
666	my ($textref, $metadata, $doc_obj, $section) = @_;
667	my $outhandle = $self->{'outhandle'};
668	# if we don't want metadata, we may as well not be here ...
669	return if (!defined $self->{'metadata_fields'});
670
671	# metadata fields to extract/save. 'key' is the (lowercase) name of the
672	# html meta, 'value' is the metadata name for greenstone to use
673	my %find_fields = ();
674
675	my %creator_fields = (); # short-cut for lookups
676
677
678	foreach my $field (split /,/, $self->{'metadata_fields'}) {
679	# support tag<tagname>
680	if ($field =~ /^(.?)<(.?)>$/) {
681	# "$2" is the user's preferred gs metadata name
682	$find_fields{lc($1)}=$2; # lc = lowercase
683	} else { # no <tagname> for mapping
684	# "$field" is the user's preferred gs metadata name
685	$find_fields{lc($field)}=$field; # lc = lowercase
686	}
687	}
688
689	if (defined $self->{'hunt_creator_metadata'} &&
690	$self->{'hunt_creator_metadata'} == 1 ) {
691	my @extra_fields =
692	(
693	'author',
694	'author.email',
695	'creator',
696	'dc.creator',
697	'dc.creator.corporatename',
698	);
699
700	# add the creator_metadata fields to search for
701	foreach my $field (@extra_fields) {
702	$creator_fields{$field}=0; # add to lookup hash
703	}
704	}
705
706
707	# find the header in the html file, which has the meta tags
708	$$textref =~ m@<head>(.*?)</head>@si;
709
710	my $html_header=$1;
711	# go through every <meta... tag defined in the html and see if it is
712	# one of the tags we want to match.
713
714	# special case for title - we want to remember if its been found
715	my $found_title = 0;
716	# this assumes that ">" won't appear. (I don't think it's allowed to...)
717	$html_header =~ /^/; # match the start of the string, for \G assertion
718
719	while ($html_header =~ m/\G.?<meta(.?)>/sig) {
720	my $metatag=$1;
721	my ($tag, $value);
722
723	# find the tag name
724	$metatag =~ /(?:name\|http-equiv)\s=\s([\"\'])?(.*?)\1/is;
725	$tag=$2;
726	# in case they're not using " or ', but they should...
727	if (! $tag) {
728	$metatag =~ /(?:name\|http-equiv)\s=\s([^\s\>]+)/is;
729	$tag=$1;
730	}
731
732	if (!defined $tag) {
733	print $outhandle "HTMLPlug: can't find NAME in \"$metatag\"\n";
734	next;
735	}
736
737	# don't need to assign this field if it was passed in from a previous
738	# (recursive) plugin
739	if (defined $metadata->{$tag}) {next}
740
741	# find the tag content
742	$metatag =~ /content\s=\s([\"\'])?(.*?)\1/is;
743	$value=$2;
744	if (! $value) {
745	$metatag =~ /(?:name\|http-equiv)\s=\s([^\s\>]+)/is;
746	$value=$1;
747	}
748	if (!defined $value) {
749	print $outhandle "HTMLPlug: can't find VALUE in \"$metatag\"\n";
750	next;
751	}
752
753	# clean up and add
754	$value =~ s/\s+/ /gs;
755	chomp($value); # remove trailing \n, if any
756	if (exists $creator_fields{lc($tag)}) {
757	# map this value onto greenstone's "Creator" metadata
758	$tag='Creator';
759	} elsif (!exists $find_fields{lc($tag)}) {
760	next; # don't want this tag
761	} else {
762	# get the user's preferred capitalisation
763	$tag = $find_fields{lc($tag)};
764	}
765	if (lc($tag) eq "title") {
766	$found_title = 1;
767	}
768	print $outhandle " extracted \"$tag\" metadata \"$value\"\n"
769	if ($self->{'verbosity'} > 2);
770	$doc_obj->add_utf8_metadata($section, $tag, $value);
771
772	}
773
774	# TITLE: extract the document title
775	if (exists $find_fields{'title'} && !$found_title) {
776	# we want a title, and didn't find one in the meta tags
777	# see if there's a <title> tag
778	my $title;
779	my $from = ""; # for debugging output only
780	if ($html_header =~ /<title[^>]>([^<]+)<\/title[^>]>/is) {
781	$title = $1;
782	$from = "<title> tags";
783	}
784
785	if (!defined $title) {
786	$from = "first 100 chars";
787	# if no title use first 100 or so characters
788	$title = $$textref;
789	$title =~ s/^\xFE\xFF//; # Remove unicode byte order mark
790	$title =~ s/^.*?<body>//si;
791	# ignore javascript!
792	$title =~ s@<script.*?</script>@ @sig;
793	$title =~ s/<\/([^>]+)><\1>//g; # (eg) </b><b> - no space
794	$title =~ s/<[^>]*>/ /g; # remove all HTML tags
795	$title = substr ($title, 0, 100);
796	$title =~ s/\s\S*$/.../;
797	}
798	$title =~ s/<[^>]*>/ /g; # remove html tags
799	$title =~ s/ / /g;
800	$title =~ s/(?: \|\xc2\xa0)/ /g; # utf-8 for nbsp...
801	$title =~ s/\s+/ /gs; # collapse multiple spaces
802	$title =~ s/^\s*//; # remove leading spaces
803	$title =~ s/\s*$//; # remove trailing spaces
804
805	$title =~ s/^$self->{'title_sub'}// if ($self->{'title_sub'});
806	$title =~ s/^\s+//s; # in case title_sub introduced any...
807	$doc_obj->add_utf8_metadata ($section, 'Title', $title);
808	print $outhandle " extracted Title metadata \"$title\" from $from\n"
809	if ($self->{'verbosity'} > 2);
810	}
811
812	# add FileFormat metadata
813	$doc_obj->add_metadata($section,"FileFormat", "HTML");
814
815	# Special, for metadata names such as tagH1 - extracts
816	# the text between the first <H1> and </H1> tags into "H1" metadata.
817
818	foreach my $field (keys %find_fields) {
819	if ($field !~ /^tag([a-z0-9]+)$/i) {next}
820	my $tag = $1;
821	if ($$textref =~ m@<$tag[^>]>(.?)</$tag[^>]*>@g) {
822	my $content = $1;
823	$content =~ s/ / /g;
824	$content =~ s/<[^>]*>/ /g;
825	$content =~ s/^\s+//;
826	$content =~ s/\s+$//;
827	$content =~ s/\s+/ /gs;
828	if ($content) {
829	$tag=$find_fields{"tag$tag"}; # get the user's capitalisation
830	$tag =~ s/^tag//i;
831	$doc_obj->add_utf8_metadata ($section, $tag, $content);
832	print $outhandle " extracted \"$tag\" metadata \"$content\"\n"
833	if ($self->{'verbosity'} > 2);
834	}
835	}
836	}
837	}
838
839
840	# evaluate any "../" to next directory up
841	# evaluate any "./" as here
842	sub eval_dir_dots {
843	my $self = shift (@_);
844	my ($filename) = @_;
845	my $dirsep_os = &util::get_os_dirsep();
846	my @dirsep = split(/$dirsep_os/,$filename);
847
848	my @eval_dirs = ();
849	foreach my $d (@dirsep) {
850	if ($d eq "..") {
851	pop(@eval_dirs);
852
853	} elsif ($d eq ".") {
854	# do nothing!
855
856	} else {
857	push(@eval_dirs,$d);
858	}
859	}
860
861	# Need to fiddle with number of elements in @eval_dirs if the
862	# first one is the empty string. This is because of a
863	# modification to util::filename_cat that supresses the addition
864	# of a leading '/' character (or \ if windows) (intended to help
865	# filename cat with relative paths) if the first entry in the
866	# array is the empty string. Making the array start with two
867	# empty strings is a way to defeat this "smart" option.
868	#
869	if (scalar(@eval_dirs) > 0) {
870	if ($eval_dirs[0] eq ""){
871	unshift(@eval_dirs,"");
872	}
873	}
874	return &util::filename_cat(@eval_dirs);
875	}
876
877	sub replace_usemap_links {
878	my $self = shift (@_);
879	my ($front, $link, $back) = @_;
880
881	$link =~ s/^\.\///;
882	return $front . $link . $back;
883	}
884
885	sub inc_filecount {
886	my $self = shift (@_);
887
888	if ($self->{'file_num'} == 1000) {
889	$self->{'dir_num'} ++;
890	$self->{'file_num'} = 0;
891	} else {
892	$self->{'file_num'} ++;
893	}
894	}
895
896
897	# Extend the BasPlug read_file so that strings like é are
898	# converted to UTF8 internally.
899	#
900	# We don't convert < or > or & or " in case
901	# they interfere with the GML files
902
903	sub read_file {
904	my ($self, $filename, $encoding, $language, $textref) = @_;
905
906	&BasPlug::read_file($self, $filename, $encoding, $language, $textref);
907
908	# Convert entities to their UTF8 equivalents
909	$$textref =~ s/&(lt\|gt\|amp\|quot\|nbsp);/&z$1;/go;
910	$$textref =~ s/&([^;]+);/&ghtml::getcharequiv($1,1)/gseo;
911	$$textref =~ s/&z(lt\|gt\|amp\|quot\|nbsp);/&$1;/go;
912	}
913
914	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: