Context Navigation

source: trunk/gsdl/perllib/plugins/HTMLPlug.pm@ 11090

Last change on this file since 11090 was 10725, checked in by chi, 19 years ago
For some reasons, to change the date format to "yyymmdd" used "date" and can't recognise "Date". Therefore, adding some code to convert the metadata "Date" to "date" in the early stage.
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 31.8 KB

Line
1	###########################################################################
2	#
3	# HTMLPlug.pm -- basic html plugin
4	#
5	# A component of the Greenstone digital library software
6	# from the New Zealand Digital Library Project at the
7	# University of Waikato, New Zealand.
8	#
9	# Copyright (C) 1999 New Zealand Digital Library Project
10	#
11	# This program is free software; you can redistribute it and/or modify
12	# it under the terms of the GNU General Public License as published by
13	# the Free Software Foundation; either version 2 of the License, or
14	# (at your option) any later version.
15	#
16	# This program is distributed in the hope that it will be useful,
17	# but WITHOUT ANY WARRANTY; without even the implied warranty of
18	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	# GNU General Public License for more details.
20	#
21	# You should have received a copy of the GNU General Public License
22	# along with this program; if not, write to the Free Software
23	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24	#
25	###########################################################################
26
27	#
28	# Note that this plugin handles frames only in a very simple way
29	# i.e. each frame is treated as a separate document. This means
30	# search results will contain links to individual frames rather
31	# than linking to the top level frameset.
32	# There may also be some problems caused by the _parent target
33	# (it's removed by this plugin)
34	#
35
36	package HTMLPlug;
37
38	use BasPlug;
39	use ghtml;
40	use unicode;
41	use util;
42	use XMLParser;
43
44	sub BEGIN {
45	@HTMLPlug::ISA = ('BasPlug');
46	}
47
48	use strict; # every perl program should have this!
49	no strict 'refs'; # make an exception so we can use variables as filehandles
50
51	my $arguments =
52	[ { 'name' => "process_exp",
53	'desc' => "{BasPlug.process_exp}",
54	'type' => "regexp",
55	'deft' => &get_default_process_exp() },
56	{ 'name' => "block_exp",
57	'desc' => "{BasPlug.block_exp}",
58	'type' => 'regexp',
59	'deft' => &get_default_block_exp() },
60	{ 'name' => "nolinks",
61	'desc' => "{HTMLPlug.nolinks}",
62	'type' => "flag" },
63	{ 'name' => "keep_head",
64	'desc' => "{HTMLPlug.keep_head}",
65	'type' => "flag" },
66	{ 'name' => "no_metadata",
67	'desc' => "{HTMLPlug.no_metadata}",
68	'type' => "flag" },
69	{ 'name' => "metadata_fields",
70	'desc' => "{HTMLPlug.metadata_fields}",
71	'type' => "string",
72	'deft' => "Title" },
73	{ 'name' => "hunt_creator_metadata",
74	'desc' => "{HTMLPlug.hunt_creator_metadata}",
75	'type' => "flag" },
76	{ 'name' => "file_is_url",
77	'desc' => "{HTMLPlug.file_is_url}",
78	'type' => "flag" },
79	{ 'name' => "assoc_files",
80	'desc' => "{HTMLPlug.assoc_files}",
81	'type' => "regexp",
82	'deft' => &get_default_block_exp() },
83	{ 'name' => "rename_assoc_files",
84	'desc' => "{HTMLPlug.rename_assoc_files}",
85	'type' => "flag" },
86	{ 'name' => "title_sub",
87	'desc' => "{HTMLPlug.title_sub}",
88	'type' => "string",
89	'deft' => "" },
90	{ 'name' => "description_tags",
91	'desc' => "{HTMLPlug.description_tags}",
92	'type' => "flag" },
93	# retain this for backward compatibility (w3mir option was replaced by
94	# file_is_url)
95	{ 'name' => "w3mir",
96	# 'desc' => "{HTMLPlug.w3mir}",
97	'type' => "flag",
98	'hiddengli' => "yes"},
99	{ 'name' => "no_strip_metadata_html",
100	'desc' => "{HTMLPlug.no_strip_metadata_html}",
101	'type' => "string",
102	'deft' => "",
103	'reqd' => "no"},
104	{ 'name' => "sectionalise_using_h_tags",
105	'desc' => "{HTMLPlug.sectionalise_using_h_tags}",
106	'type' => "flag" }
107	];
108
109	my $options = { 'name' => "HTMLPlug",
110	'desc' => "{HTMLPlug.desc}",
111	'abstract' => "no",
112	'inherits' => "yes",
113	'args' => $arguments };
114
115	sub new {
116	my ($class) = shift (@_);
117	my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
118	push(@$pluginlist, $class);
119
120	if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}
121	if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};
122
123
124	my $self = (defined $hashArgOptLists)? new BasPlug($pluginlist,$inputargs,$hashArgOptLists): new BasPlug($pluginlist,$inputargs);
125
126	if ($self->{'w3mir'}) {
127	$self->{'file_is_url'} = 1;
128	}
129	$self->{'aux_files'} = {};
130	$self->{'dir_num'} = 0;
131	$self->{'file_num'} = 0;
132
133	return bless $self, $class;
134	}
135
136	# may want to use (?i)\.(gif\|jpe?g\|jpe\|png\|css\|js(?:@.*)?)$
137	# if have eg <script language="javascript" src="img/lib.js@123">
138	sub get_default_block_exp {
139	my $self = shift (@_);
140
141	return q^(?i)\.(gif\|jpe?g\|jpe\|jpg\|png\|css)$^;
142	}
143
144	sub get_default_process_exp {
145	my $self = shift (@_);
146
147	# the last option is an attempt to encode the concept of an html query ...
148	return q^(?i)(\.html?\|\.shtml\|\.shm\|\.asp\|\.php\d?\|\.cgi\|.+\?.+=.*)$^;
149	}
150
151	sub store_block_files
152	{
153	my $self =shift (@_);
154	my ($filename) = @_;
155	my $html_fname = $filename;
156	my @file_blocks;
157
158	my ($language, $encoding) = $self->textcat_get_language_encoding ($filename);
159
160	# read in file ($text will be in utf8)
161	my $text = "";
162	$self->read_file ($filename, $encoding, $language, \$text);
163	my $textref = \$text;
164	my $opencom = '(?:<!--\|<!(?:—\|\|--))';
165	my $closecom = '(?:-->\|(?:—\|\|--)>)';
166	$$textref =~ s/$opencom(.*?)$closecom//gs;
167
168	my $attval = "\\\"[^\\\"]+\\\"\|[^\\s>]+";
169	my @img_matches = ($$textref =~ m/<img[^>]?src\s=\s($attval)[^>]>/igs);
170	my @usemap_matches = ($$textref =~ m/<img[^>]?usemap\s=\s($attval)[^>]>/igs);
171	my @link_matches = ($$textref =~ m/<link[^>]?href\s=\s($attval)[^>]>/igs);
172	my @embed_matches = ($$textref =~ m/<embed[^>]?src\s=\s($attval)[^>]>/igs);
173	my @tabbg_matches = ($$textref =~ m/<(?:table\|tr\|td)[^>]?background\s=\s($attval)[^>]>/igs);
174
175	foreach my $link (@img_matches, @usemap_matches, @link_matches, @embed_matches, @tabbg_matches) {
176
177	# remove quotes from link at start and end if necessary
178	if ($link=~/^\"/) {
179	$link=~s/^\"//;
180	$link=~s/\"$//;
181	}
182
183	$link =~ s/\#.*$//s; # remove any anchor names, e.g. foo.html#name becomes foo.html
184
185	if ($link !~ m@^/@ && $link !~ m/^([A-Z]:?)\\/) {
186	# Turn relative file path into full path
187	my $dirname = &File::Basename::dirname($filename);
188	$link = &util::filename_cat($dirname, $link);
189	}
190	$link = $self->eval_dir_dots($link);
191
192	$self->{'file_blocks'}->{$link} = 1;
193	}
194	}
195
196
197	# do plugin specific processing of doc_obj
198	sub process {
199	my $self = shift (@_);
200	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
201	my $outhandle = $self->{'outhandle'};
202
203	print STDERR "<Processing n='$file' p='HTMLPlug'>\n" if ($gli);
204
205	print $outhandle "HTMLPlug: processing $file\n"
206	if $self->{'verbosity'} > 1;
207
208	if ($ENV{'GSDLOS'} =~ /^windows/i) {
209	# this makes life so much easier... perl can cope with unix-style '/'s.
210	$base_dir =~ s@(\\)+@/@g;
211	$file =~ s@(\\)+@/@g;
212	}
213
214	# reset per-doc stuff...
215	$self->{'aux_files'} = {};
216	$self->{'dir_num'} = 0;
217	$self->{'file_num'} = 0;
218
219	# process an HTML file where sections are divided by headings tags (H1, H2 ...)
220	# you can also include metadata in the format (X can be any number)
221	# <hX>Title<!--gsdl-metadata
222	# <Metadata name="name1">value1</Metadata>
223	# ...
224	# <Metadata name="nameN">valueN</Metadata>
225	#--></hX>
226	if ($self->{'sectionalise_using_h_tags'}) {
227	# description_tags should allways be activated because we convert headings to description tags
228	$self->{'description_tags'} = 1;
229
230	my $arrSections = [];
231	$$textref =~ s/<h([0-9]+)[^>]>(.?)<\/h[0-9]+>/$self->process_heading($1, $2, $arrSections, $file)/isge;
232
233	if (scalar(@$arrSections)) {
234	my $strMetadata = $self->update_section_data($arrSections, -1);
235	if (length($strMetadata)) {
236	$strMetadata = '<!--' . $strMetadata . "\n-->\n</body>";
237	$$textref =~ s/<\/body>/$strMetadata/ig;
238	}
239	}
240	}
241
242	my $cursection = $doc_obj->get_top_section();
243
244	$self->extract_metadata ($textref, $metadata, $doc_obj, $cursection)
245	unless $self->{'no_metadata'} \|\| $self->{'description_tags'};
246
247	# Store URL for page as metadata - this can be used for an
248	# altavista style search interface. The URL won't be valid
249	# unless the file structure contains the domain name (i.e.
250	# like when w3mir is used to download a website).
251
252	# URL metadata (even invalid ones) are used to support internal
253	# links, so even if 'file_is_url' is off, still need to store info
254
255	my $web_url = "http://$file";
256	$doc_obj->add_metadata($cursection, "URL", $web_url);
257
258	if ($self->{'file_is_url'}) {
259	$doc_obj->add_metadata($cursection, "weblink", "<a href=\"$web_url\">");
260	$doc_obj->add_metadata($cursection, "webicon", "_iconworld_");
261	$doc_obj->add_metadata($cursection, "/weblink", "</a>");
262	}
263
264	if ($self->{'description_tags'}) {
265	# remove the html header - note that doing this here means any
266	# sections defined within the header will be lost (so all <Section>
267	# tags must appear within the body of the HTML)
268	my ($head_keep) = ($$textref =~ m/^(.?)<body[^>]>/is);
269
270	$$textref =~ s/^.?<body[^>]>//is;
271	$$textref =~ s/(<\/body[^>]>\|<\/html[^>]>)//isg;
272
273	my $opencom = '(?:<!--\|<!(?:—\|\|--))';
274	my $closecom = '(?:-->\|(?:—\|\|--)>)';
275
276	my $lt = '(?:<\|<)';
277	my $gt = '(?:>\|>)';
278	my $quot = '(?:"\|"\|”\|“)';
279
280	my $dont_strip = '';
281	if ($self->{'no_strip_metadata_html'}) {
282	($dont_strip = $self->{'no_strip_metadata_html'}) =~ s{,}{\|}g;
283	}
284
285	my $found_something = 0; my $top = 1;
286	while ($$textref =~ s/^(.?)$opencom(.?)$closecom//s) {
287	my $text = $1;
288	my $comment = $2;
289	if (defined $text) {
290	# text before a comment - note that getting to here
291	# doesn't necessarily mean there are Section tags in
292	# the document
293	$self->process_section(\$text, $base_dir, $file, $doc_obj, $cursection);
294	}
295	while ($comment =~ s/$lt(.*?)$gt//s) {
296	my $tag = $1;
297	if ($tag eq "Section") {
298	$found_something = 1;
299	$cursection = $doc_obj->insert_section($doc_obj->get_end_child($cursection)) unless $top;
300	$top = 0;
301	} elsif ($tag eq "/Section") {
302	$found_something = 1;
303	$cursection = $doc_obj->get_parent_section ($cursection);
304	} elsif ($tag =~ /^Metadata name=$quot(.*?)$quot/s) {
305	my $metaname = $1;
306	my $accumulate = $tag =~ /mode=${quot}accumulate${quot}/ ? 1 : 0;
307	$comment =~ s/^(.*?)$lt\/Metadata$gt//s;
308	my $metavalue = $1;
309	$metavalue =~ s/^\s+//;
310	$metavalue =~ s/\s+$//;
311	# assume that no metadata value intentionally includes
312	# carriage returns or HTML tags (if they're there they
313	# were probably introduced when converting to HTML from
314	# some other format).
315	# actually some people want to have html tags in their
316	# metadata.
317	$metavalue =~ s/[\cJ\cM]/ /sg;
318	$metavalue =~ s/<[^>]+>//sg
319	unless $dont_strip && ($dont_strip eq 'all' \|\| $metaname =~ /^($dont_strip)$/);
320	$metavalue =~ s/\s+/ /sg;
321	if ($accumulate) {
322	$doc_obj->add_utf8_metadata($cursection, $metaname, $metavalue);
323	} else {
324	$doc_obj->set_utf8_metadata_element($cursection, $metaname, $metavalue);
325	}
326	} elsif ($tag eq "Description" \|\| $tag eq "/Description") {
327	# do nothing with containing Description tags
328	} else {
329	# simple HTML tag (probably created by the conversion
330	# to HTML from some other format) - we'll ignore it and
331	# hope for the best ;-)
332	}
333	}
334	}
335	if ($cursection ne "") {
336	print $outhandle "HTMLPlug: WARNING: $file contains unmatched <Section></Section> tags\n";
337	}
338
339	$$textref =~ s/^.?<body[^>]>//is;
340	$$textref =~ s/(<\/body[^>]>\|<\/html[^>]>)//isg;
341	if ($$textref =~ /\S/) {
342	if (!$found_something) {
343	if ($self->{'verbosity'} > 2) {
344	print $outhandle "HTMLPlug: WARNING: $file appears to contain no Section tags so\n";
345	print $outhandle " will be processed as a single section document\n";
346	}
347
348	# go ahead and process single-section document
349	$self->process_section($textref, $base_dir, $file, $doc_obj, $cursection);
350
351	# if document contains no Section tags we'll go ahead
352	# and extract metadata (this won't have been done
353	# above as the -description_tags option prevents it)
354	my $complete_text = $head_keep.$doc_obj->get_text($cursection);
355	$self->extract_metadata (\$complete_text, $metadata, $doc_obj, $cursection)
356	unless $self->{'no_metadata'};
357
358	} else {
359	print $outhandle "HTMLPlug: WARNING: $file contains the following text outside\n";
360	print $outhandle " of the final closing </Section> tag. This text will\n";
361	print $outhandle " be ignored.";
362
363	my ($text);
364	if (length($$textref) > 30) {
365	$text = substr($$textref, 0, 30) . "...";
366	} else {
367	$text = $$textref;
368	}
369	$text =~ s/\n/ /isg;
370	print $outhandle " ($text)\n";
371	}
372	} elsif (!$found_something) {
373
374	if ($self->{'verbosity'} > 2) {
375	# may get to here if document contained no valid Section
376	# tags but did contain some comments. The text will have
377	# been processed already but we should print the warning
378	# as above and extract metadata
379	print $outhandle "HTMLPlug: WARNING: $file appears to contain no Section tags and\n";
380	print $outhandle " is blank or empty. Metadata will be assigned if present.\n";
381	}
382
383	my $complete_text = $head_keep.$doc_obj->get_text($cursection);
384	$self->extract_metadata (\$complete_text, $metadata, $doc_obj, $cursection)
385	unless $self->{'no_metadata'};
386	}
387
388	} else {
389
390	# remove header and footer
391	if (!$self->{'keep_head'} \|\| $self->{'description_tags'}) {
392	$$textref =~ s/^.?<body[^>]>//is;
393	$$textref =~ s/(<\/body[^>]>\|<\/html[^>]>)//isg;
394	}
395
396	# single section document
397	$self->process_section($textref, $base_dir, $file, $doc_obj, $cursection);
398	}
399	return 1;
400	}
401
402
403	sub process_heading
404	{
405	my ($self, $nHeadNo, $strHeadingText, $arrSections, $file) = @_;
406	$strHeadingText = '' if (!defined($strHeadingText));
407
408	my $strMetadata = $self->update_section_data($arrSections, int($nHeadNo));
409
410	my $strSecMetadata = '';
411	while ($strHeadingText =~ s/<!--gsdl-metadata(.*?)-->//is)
412	{
413	$strSecMetadata .= $1;
414	}
415
416	$strHeadingText =~ s/^\s+//g;
417	$strHeadingText =~ s/\s+$//g;
418	$strSecMetadata =~ s/^\s+//g;
419	$strSecMetadata =~ s/\s+$//g;
420
421	$strMetadata .= "\n<Section>\n\t<Description>\n\t\t<Metadata name=\"Title\">" . $strHeadingText . "</Metadata>\n";
422
423	if (length($strSecMetadata)) {
424	$strMetadata .= "\t\t" . $strSecMetadata . "\n";
425	}
426
427	$strMetadata .= "\t</Description>\n";
428
429	return "<!--" . $strMetadata . "-->";
430	}
431
432
433	sub update_section_data
434	{
435	my ($self, $arrSections, $nCurTocNo) = @_;
436	my ($strBuffer, $nLast, $nSections) = ('', 0, scalar(@$arrSections));
437
438	if ($nSections == 0) {
439	push @$arrSections, $nCurTocNo;
440	return $strBuffer;
441	}
442	$nLast = $arrSections->[$nSections - 1];
443	if ($nCurTocNo > $nLast) {
444	push @$arrSections, $nCurTocNo;
445	return $strBuffer;
446	}
447	for(my $i = $nSections - 1; $i >= 0; $i--) {
448	if ($nCurTocNo <= $arrSections->[$i]) {
449	$strBuffer .= "\n</Section>";
450	pop @$arrSections;
451	}
452	}
453	push @$arrSections, $nCurTocNo;
454	return $strBuffer;
455	}
456
457
458	# note that process_section may be called multiple times for a single
459	# section (relying on the fact that add_utf8_text appends the text to any
460	# that may exist already).
461	sub process_section {
462	my $self = shift (@_);
463	my ($textref, $base_dir, $file, $doc_obj, $cursection) = @_;
464	# trap links
465	if (!$self->{'nolinks'}) {
466
467	# usemap="./#index" not handled correctly => change to "#index"
468	$$textref =~ s/(<img[^>]?usemap\s=\s\"?)([^\">\s]+)(\"?[^>]>)/
469	$self->replace_usemap_links($1, $2, $3)/isge;
470
471	$$textref =~ s/(<(?:a\|area\|frame\|link\|script)\s+[^>]?\s(?:href\|src)\s=\s\"?)([^\">\s]+)(\"?[^>]*>)/
472	$self->replace_href_links ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
473	}
474
475	# trap images
476
477	# allow spaces if inside quotes - jrm21
478	$$textref =~ s/(<(?:img\|embed\|table\|tr\|td)[^>]?(?:src\|background)\s=\s)(\"[^\"]+\"\|[^\s>]+)([^>]>)/
479	$self->replace_images ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
480
481	# add text to document object
482	# turn \ into \\ so that the rest of greenstone doesn't think there
483	# is an escape code following. (Macro parsing loses them...)
484	$$textref =~ s/\\/\\\\/go;
485
486	$doc_obj->add_utf8_text($cursection, $$textref);
487	}
488
489	sub replace_images {
490	my $self = shift (@_);
491	my ($front, $link, $back, $base_dir,
492	$file, $doc_obj, $section) = @_;
493
494	# remove quotes from link at start and end if necessary
495	if ($link=~/^\"/) {
496	$link=~s/^\"//;$link=~s/\"$//;
497	$front.='"';
498	$back="\"$back";
499	}
500
501	$link =~ s/\n/ /g;
502
503	# Hack to overcome Windows wv 0.7.1 bug that causes embedded images to be broken
504	# If the Word file path has spaces in it, wv messes up and you end up with
505	# absolute paths for the images, and without the "file://" prefix
506	# So check for this special case and massage the data to be correct
507	if ($ENV{'GSDLOS'} =~ /^windows/i && $self->{'plugin_type'} eq "WordPlug" && $link =~ /^[A-Za-z]\:\\/) {
508	$link =~ s/^.*\\([^\\]+)$/$1/;
509	}
510
511	my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
512
513	my $img_file = $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section);
514
515	my $anchor_name = $img_file;
516	$anchor_name =~ s/^.*\///;
517	$anchor_name = "<a name=\"$anchor_name\">";
518
519	return $front . $img_file . $back . $anchor_name;
520	}
521
522	sub replace_href_links {
523	my $self = shift (@_);
524	my ($front, $link, $back, $base_dir, $file, $doc_obj, $section) = @_;
525
526	# attempt to sort out targets - frames are not handled
527	# well in this plugin and some cases will screw things
528	# up - e.g. the _parent target (so we'll just remove
529	# them all ;-)
530	$front =~ s/(target=\"?)_top(\"?)/$1_gsdltop_$2/is;
531	$back =~ s/(target=\"?)_top(\"?)/$1_gsdltop_$2/is;
532	$front =~ s/target=\"?_parent\"?//is;
533	$back =~ s/target=\"?_parent\"?//is;
534
535	return $front . $link . $back if $link =~ /^\#/s;
536	$link =~ s/\n/ /g;
537
538	my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
539	# href may use '\'s where '/'s should be on Windows
540	$href =~ s/\\/\//g;
541
542	my ($filename) = $href =~ /^(?:.?):(?:\/\/)?(.)/;
543
544
545	##### leave all these links alone (they won't be picked up by intermediate
546	##### pages). I think that's safest when dealing with frames, targets etc.
547	##### (at least until I think of a better way to do it). Problems occur with
548	##### mailto links from within small frames, the intermediate page is displayed
549	##### within that frame and can't be seen. There is still potential for this to
550	##### happen even with html pages - the solution seems to be to somehow tell
551	##### the browser from the server side to display the page being sent (i.e.
552	##### the intermediate page) in the top level window - I'm not sure if that's
553	##### possible - the following line should probably be deleted if that can be done
554	return $front . $link . $back if $href =~ /^(mailto\|news\|gopher\|nntp\|telnet\|javascript):/is;
555
556
557	if (($rl == 0) \|\| ($filename =~ /$self->{'process_exp'}/) \|\|
558	($href =~ /\/$/) \|\| ($href =~ /^(mailto\|news\|gopher\|nntp\|telnet\|javascript):/i)) {
559	&ghtml::urlsafe ($href);
560	return $front . "_httpextlink_&rl=" . $rl . "&href=" . $href . $hash_part . $back;
561	} else {
562	# link is to some other type of file (eg image) so we'll
563	# need to associate that file
564	return $front . $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section) . $back;
565	}
566	}
567
568	sub add_file {
569	my $self = shift (@_);
570	my ($href, $rl, $hash_part, $base_dir, $doc_obj, $section) = @_;
571	my ($newname);
572
573	my $filename = $href;
574	if ($base_dir eq "") {
575	# remove http:/ thereby leaving one slash at the start
576	$filename =~ s/^[^:]*:\///;
577	}
578	else {
579	# remove http://
580	$filename =~ s/^[^:]*:\/\///;
581	}
582
583	$filename = &util::filename_cat($base_dir, $filename);
584
585	# Replace %20's in URL with a space if required. Note that the filename
586	# may include the %20 in some situations
587	if ($filename =~ /\%20/) {
588	if (!-e $filename) {
589	$filename =~ s/\%20/ /g;
590	}
591	}
592
593	my ($ext) = $filename =~ /(\.[^\.]*)$/;
594
595	if ($rl == 0) {
596	if ((!defined $ext) \|\| ($ext !~ /$self->{'assoc_files'}/)) {
597	return "_httpextlink_&rl=0&el=prompt&href=" . $href . $hash_part;
598	}
599	else {
600	return "_httpextlink_&rl=0&el=direct&href=" . $href . $hash_part;
601	}
602	}
603
604	if ((!defined $ext) \|\| ($ext !~ /$self->{'assoc_files'}/)) {
605	return "_httpextlink_&rl=" . $rl . "&href=" . $href . $hash_part;
606	}
607	if ($self->{'rename_assoc_files'}) {
608	if (defined $self->{'aux_files'}->{$href}) {
609	$newname = $self->{'aux_files'}->{$href}->{'dir_num'} . "/" .
610	$self->{'aux_files'}->{$href}->{'file_num'} . $ext;
611	} else {
612	$newname = $self->{'dir_num'} . "/" . $self->{'file_num'} . $ext;
613	$self->{'aux_files'}->{$href} = {'dir_num' => $self->{'dir_num'}, 'file_num' => $self->{'file_num'}};
614	$self->inc_filecount ();
615	}
616	$doc_obj->associate_file($filename, $newname, undef, $section);
617	return "_httpdocimg_/$newname";
618	} else {
619	($newname) = $filename =~ /([^\/\\]*)$/;
620	$doc_obj->associate_file($filename, $newname, undef, $section);
621	return "_httpdocimg_/$newname";
622	}
623	}
624
625
626	sub format_link {
627	my $self = shift (@_);
628	my ($link, $base_dir, $file) = @_;
629
630	my ($before_hash, $hash_part) = $link =~ /^([^\#])(\#?.)$/;
631
632	$hash_part = "" if !defined $hash_part;
633	if (!defined $before_hash \|\| $before_hash !~ /[\w\.\/]/) {
634	my $outhandle = $self->{'outhandle'};
635	print $outhandle "HTMLPlug: ERROR - badly formatted tag ignored ($link)\n"
636	if $self->{'verbosity'};
637	return ($link, "", 0);
638	}
639
640	if ($before_hash =~ s@^((?:http\|ftp\|file)://)@@i) {
641	my $type = $1;
642
643	if ($link =~ /^(http\|ftp):/i) {
644	# Turn url (using /) into file name (possibly using \ on windows)
645	my @http_dir_split = split('/', $before_hash);
646	$before_hash = &util::filename_cat(@http_dir_split);
647	}
648
649	$before_hash = $self->eval_dir_dots($before_hash);
650
651	my $linkfilename = &util::filename_cat ($base_dir, $before_hash);
652
653	my $rl = 0;
654	$rl = 1 if (-e $linkfilename);
655
656	# make sure there's a slash on the end if it's a directory
657	if ($before_hash !~ /\/$/) {
658	$before_hash .= "/" if (-d $linkfilename);
659	}
660
661	return ($type . $before_hash, $hash_part, $rl);
662
663	} elsif ($link !~ /^(mailto\|news\|gopher\|nntp\|telnet\|javascript):/i && $link !~ /^\//) {
664	if ($before_hash =~ s@^/@@ \|\| $before_hash =~ /\\/) {
665
666	# the first directory will be the domain name if file_is_url
667	# to generate archives, otherwise we'll assume all files are
668	# from the same site and base_dir is the root
669
670	if ($self->{'file_is_url'}) {
671	my @dirs = split /[\/\\]/, $file;
672	my $domname = shift (@dirs);
673	$before_hash = &util::filename_cat($domname, $before_hash);
674	$before_hash =~ s@\\@/@g; # for windows
675	}
676	else
677	{
678	# see if link shares directory with source document
679	# => turn into relative link if this is so!
680
681	if ($ENV{'GSDLOS'} =~ /^windows/i) {
682	# too difficult doing a pattern match with embedded '\'s...
683	my $win_before_hash=$before_hash;
684	$win_before_hash =~ s@(\\)+@/@g;
685	# $base_dir is already similarly "converted" on windows.
686	if ($win_before_hash =~ s@^$base_dir/@@o) {
687	# if this is true, we removed a prefix
688	$before_hash=$win_before_hash;
689	}
690	}
691	else {
692	# before_hash has lost leading slash by this point,
693	# -> add back in prior to substitution with $base_dir
694	$before_hash = "/$before_hash";
695
696	$before_hash = &util::filename_cat("",$before_hash);
697	$before_hash =~ s@^$base_dir/@@;
698	}
699	}
700	} else {
701	# Turn relative file path into full path
702	my $dirname = &File::Basename::dirname($file);
703	$before_hash = &util::filename_cat($dirname, $before_hash);
704	$before_hash = $self->eval_dir_dots($before_hash);
705	}
706
707	my $linkfilename = &util::filename_cat ($base_dir, $before_hash);
708	# make sure there's a slash on the end if it's a directory
709	if ($before_hash !~ /\/$/) {
710	$before_hash .= "/" if (-d $linkfilename);
711	}
712	return ("http://" . $before_hash, $hash_part, 1);
713	} else {
714	# mailto, news, nntp, telnet, javascript or gopher link
715	return ($before_hash, "", 0);
716	}
717	}
718
719	sub extract_first_NNNN_characters {
720	my $self = shift (@_);
721	my ($textref, $doc_obj, $thissection) = @_;
722
723	foreach my $size (split /,/, $self->{'first'}) {
724	my $tmptext = $$textref;
725	# skip to the body
726	$tmptext =~ s/.<body[^>]>//i;
727	# remove javascript
728	$tmptext =~ s@<script.*?</script>@ @sig;
729	$tmptext =~ s/<[^>]*>/ /g;
730	$tmptext =~ s/ / /g;
731	$tmptext =~ s/^\s+//;
732	$tmptext =~ s/\s+$//;
733	$tmptext =~ s/\s+/ /gs;
734	$tmptext = &unicode::substr ($tmptext, 0, $size);
735	$tmptext =~ s/\s\S*$/…/; # adds an ellipse (...)
736	$doc_obj->add_utf8_metadata ($thissection, "First$size", $tmptext);
737	}
738	}
739
740
741	sub extract_metadata {
742	my $self = shift (@_);
743	my ($textref, $metadata, $doc_obj, $section) = @_;
744	my $outhandle = $self->{'outhandle'};
745	# if we don't want metadata, we may as well not be here ...
746	return if (!defined $self->{'metadata_fields'});
747
748	# metadata fields to extract/save. 'key' is the (lowercase) name of the
749	# html meta, 'value' is the metadata name for greenstone to use
750	my %find_fields = ();
751
752	my %creator_fields = (); # short-cut for lookups
753
754
755	foreach my $field (split /,/, $self->{'metadata_fields'}) {
756	# support tag<tagname>
757	if ($field =~ /^(.?)<(.?)>$/) {
758	# "$2" is the user's preferred gs metadata name
759	$find_fields{lc($1)}=$2; # lc = lowercase
760	} else { # no <tagname> for mapping
761	# "$field" is the user's preferred gs metadata name
762	$find_fields{lc($field)}=$field; # lc = lowercase
763	}
764	}
765
766	if (defined $self->{'hunt_creator_metadata'} &&
767	$self->{'hunt_creator_metadata'} == 1 ) {
768	my @extra_fields =
769	(
770	'author',
771	'author.email',
772	'creator',
773	'dc.creator',
774	'dc.creator.corporatename',
775	);
776
777	# add the creator_metadata fields to search for
778	foreach my $field (@extra_fields) {
779	$creator_fields{$field}=0; # add to lookup hash
780	}
781	}
782
783
784	# find the header in the html file, which has the meta tags
785	$$textref =~ m@<head>(.*?)</head>@si;
786
787	my $html_header=$1;
788
789	# go through every <meta... tag defined in the html and see if it is
790	# one of the tags we want to match.
791
792	# special case for title - we want to remember if its been found
793	my $found_title = 0;
794	# this assumes that ">" won't appear. (I don't think it's allowed to...)
795	$html_header =~ /^/; # match the start of the string, for \G assertion
796
797	while ($html_header =~ m/\G.?<meta(.?)>/sig) {
798	my $metatag=$1;
799	my ($tag, $value);
800
801	# find the tag name
802	$metatag =~ /(?:name\|http-equiv)\s=\s([\"\'])?(.*?)\1/is;
803	$tag=$2;
804	# in case they're not using " or ', but they should...
805	if (! $tag) {
806	$metatag =~ /(?:name\|http-equiv)\s=\s([^\s\>]+)/is;
807	$tag=$1;
808	}
809
810	if (!defined $tag) {
811	print $outhandle "HTMLPlug: can't find NAME in \"$metatag\"\n";
812	next;
813	}
814
815	# don't need to assign this field if it was passed in from a previous
816	# (recursive) plugin
817	if (defined $metadata->{$tag}) {next}
818
819	# find the tag content
820	$metatag =~ /content\s=\s([\"\'])?(.*?)\1/is;
821	$value=$2;
822
823	if (! $value) {
824	$metatag =~ /(?:name\|http-equiv)\s=\s([^\s\>]+)/is;
825	$value=$1;
826	}
827	if (!defined $value) {
828	print $outhandle "HTMLPlug: can't find VALUE in \"$metatag\"\n";
829	next;
830	}
831
832	# clean up and add
833	$value =~ s/\s+/ /gs;
834	chomp($value); # remove trailing \n, if any
835	if (exists $creator_fields{lc($tag)}) {
836	# map this value onto greenstone's "Creator" metadata
837	$tag='Creator';
838	} elsif (!exists $find_fields{lc($tag)}) {
839	next; # don't want this tag
840	} else {
841	# get the user's preferred capitalisation
842	$tag = $find_fields{lc($tag)};
843	}
844	if (lc($tag) eq "title") {
845	$found_title = 1;
846	}
847	print $outhandle " extracted \"$tag\" metadata \"$value\"\n"
848	if ($self->{'verbosity'} > 2);
849	if ($tag =~ /date.*/i){
850	$tag = lc($tag);
851	}
852	$doc_obj->add_utf8_metadata($section, $tag, $value);
853
854	}
855
856	# TITLE: extract the document title
857	if (exists $find_fields{'title'} && !$found_title) {
858	# we want a title, and didn't find one in the meta tags
859	# see if there's a <title> tag
860	my $title;
861	my $from = ""; # for debugging output only
862	if ($html_header =~ /<title[^>]>([^<]+)<\/title[^>]>/is) {
863	$title = $1;
864	$from = "<title> tags";
865	}
866
867	if (!defined $title) {
868	$from = "first 100 chars";
869	# if no title use first 100 or so characters
870	$title = $$textref;
871	$title =~ s/^\xFE\xFF//; # Remove unicode byte order mark
872	$title =~ s/^.*?<body>//si;
873	# ignore javascript!
874	$title =~ s@<script.*?</script>@ @sig;
875	$title =~ s/<\/([^>]+)><\1>//g; # (eg) </b><b> - no space
876	$title =~ s/<[^>]*>/ /g; # remove all HTML tags
877	$title = substr ($title, 0, 100);
878	$title =~ s/\s\S*$/.../;
879	}
880	$title =~ s/<[^>]*>/ /g; # remove html tags
881	$title =~ s/ / /g;
882	$title =~ s/(?: \|\xc2\xa0)/ /g; # utf-8 for nbsp...
883	$title =~ s/\s+/ /gs; # collapse multiple spaces
884	$title =~ s/^\s*//; # remove leading spaces
885	$title =~ s/\s*$//; # remove trailing spaces
886
887	$title =~ s/^$self->{'title_sub'}// if ($self->{'title_sub'});
888	$title =~ s/^\s+//s; # in case title_sub introduced any...
889	$doc_obj->add_utf8_metadata ($section, 'Title', $title);
890	print $outhandle " extracted Title metadata \"$title\" from $from\n"
891	if ($self->{'verbosity'} > 2);
892	}
893
894	# add FileFormat metadata
895	$doc_obj->add_metadata($section,"FileFormat", "HTML");
896
897	# Special, for metadata names such as tagH1 - extracts
898	# the text between the first <H1> and </H1> tags into "H1" metadata.
899
900	foreach my $field (keys %find_fields) {
901	if ($field !~ /^tag([a-z0-9]+)$/i) {next}
902	my $tag = $1;
903	if ($$textref =~ m@<$tag[^>]>(.?)</$tag[^>]*>@g) {
904	my $content = $1;
905	$content =~ s/ / /g;
906	$content =~ s/<[^>]*>/ /g;
907	$content =~ s/^\s+//;
908	$content =~ s/\s+$//;
909	$content =~ s/\s+/ /gs;
910	if ($content) {
911	$tag=$find_fields{"tag$tag"}; # get the user's capitalisation
912	$tag =~ s/^tag//i;
913	$doc_obj->add_utf8_metadata ($section, $tag, $content);
914	print $outhandle " extracted \"$tag\" metadata \"$content\"\n"
915	if ($self->{'verbosity'} > 2);
916	}
917	}
918	}
919	}
920
921
922	# evaluate any "../" to next directory up
923	# evaluate any "./" as here
924	sub eval_dir_dots {
925	my $self = shift (@_);
926	my ($filename) = @_;
927	my $dirsep_os = &util::get_os_dirsep();
928	my @dirsep = split(/$dirsep_os/,$filename);
929
930	my @eval_dirs = ();
931	foreach my $d (@dirsep) {
932	if ($d eq "..") {
933	pop(@eval_dirs);
934
935	} elsif ($d eq ".") {
936	# do nothing!
937
938	} else {
939	push(@eval_dirs,$d);
940	}
941	}
942
943	# Need to fiddle with number of elements in @eval_dirs if the
944	# first one is the empty string. This is because of a
945	# modification to util::filename_cat that supresses the addition
946	# of a leading '/' character (or \ if windows) (intended to help
947	# filename cat with relative paths) if the first entry in the
948	# array is the empty string. Making the array start with two
949	# empty strings is a way to defeat this "smart" option.
950	#
951	if (scalar(@eval_dirs) > 0) {
952	if ($eval_dirs[0] eq ""){
953	unshift(@eval_dirs,"");
954	}
955	}
956	return &util::filename_cat(@eval_dirs);
957	}
958
959	sub replace_usemap_links {
960	my $self = shift (@_);
961	my ($front, $link, $back) = @_;
962
963	$link =~ s/^\.\///;
964	return $front . $link . $back;
965	}
966
967	sub inc_filecount {
968	my $self = shift (@_);
969
970	if ($self->{'file_num'} == 1000) {
971	$self->{'dir_num'} ++;
972	$self->{'file_num'} = 0;
973	} else {
974	$self->{'file_num'} ++;
975	}
976	}
977
978
979	# Extend the BasPlug read_file so that strings like é are
980	# converted to UTF8 internally.
981	#
982	# We don't convert < or > or & or " in case
983	# they interfere with the GML files
984
985	sub read_file {
986	my ($self, $filename, $encoding, $language, $textref) = @_;
987
988	&BasPlug::read_file($self, $filename, $encoding, $language, $textref);
989
990	# Convert entities to their UTF8 equivalents
991	$$textref =~ s/&(lt\|gt\|amp\|quot\|nbsp);/&z$1;/go;
992	$$textref =~ s/&([^;]+);/&ghtml::getcharequiv($1,1)/gseo;
993	$$textref =~ s/&z(lt\|gt\|amp\|quot\|nbsp);/&$1;/go;
994	}
995
996	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: