Context Navigation

source: trunk/gsdl/perllib/plugins/HTMLPlug.pm@ 8814

Last change on this file since 8814 was 8794, checked in by jrm21, 20 years ago
remove trailing \n from meta tags (bug reported by Tim Finney, 13 Dec 2004)
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 28.6 KB

Line
1	###########################################################################
2	#
3	# HTMLPlug.pm -- basic html plugin
4	#
5	# A component of the Greenstone digital library software
6	# from the New Zealand Digital Library Project at the
7	# University of Waikato, New Zealand.
8	#
9	# Copyright (C) 1999 New Zealand Digital Library Project
10	#
11	# This program is free software; you can redistribute it and/or modify
12	# it under the terms of the GNU General Public License as published by
13	# the Free Software Foundation; either version 2 of the License, or
14	# (at your option) any later version.
15	#
16	# This program is distributed in the hope that it will be useful,
17	# but WITHOUT ANY WARRANTY; without even the implied warranty of
18	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	# GNU General Public License for more details.
20	#
21	# You should have received a copy of the GNU General Public License
22	# along with this program; if not, write to the Free Software
23	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24	#
25	###########################################################################
26
27	#
28	# Note that this plugin handles frames only in a very simple way
29	# i.e. each frame is treated as a separate document. This means
30	# search results will contain links to individual frames rather
31	# than linking to the top level frameset.
32	# There may also be some problems caused by the _parent target
33	# (it's removed by this plugin)
34	#
35
36
37	package HTMLPlug;
38
39	use BasPlug;
40	use ghtml;
41	use unicode;
42	use util;
43	use parsargv;
44	use XMLParser;
45
46	sub BEGIN {
47	@HTMLPlug::ISA = ('BasPlug');
48	}
49
50	use strict; # every perl program should have this!
51	no strict 'refs'; # make an exception so we can use variables as filehandles
52
53	my $arguments =
54	[ { 'name' => "process_exp",
55	'desc' => "{BasPlug.process_exp}",
56	'type' => "regexp",
57	'deft' => &get_default_process_exp() },
58	{ 'name' => "block_exp",
59	'desc' => "{BasPlug.block_exp}",
60	'type' => 'regexp',
61	'deft' => &get_default_block_exp() },
62	{ 'name' => "nolinks",
63	'desc' => "{HTMLPlug.nolinks}",
64	'type' => "flag" },
65	{ 'name' => "keep_head",
66	'desc' => "{HTMLPlug.keep_head}",
67	'type' => "flag" },
68	{ 'name' => "no_metadata",
69	'desc' => "{HTMLPlug.no_metadata}",
70	'type' => "flag" },
71	{ 'name' => "metadata_fields",
72	'desc' => "{HTMLPlug.metadata_fields}",
73	'type' => "string",
74	'deft' => "Title" },
75	{ 'name' => "hunt_creator_metadata",
76	'desc' => "{HTMLPlug.hunt_creator_metadata}",
77	'type' => "flag" },
78	{ 'name' => "file_is_url",
79	'desc' => "{HTMLPlug.file_is_url}",
80	'type' => "flag" },
81	{ 'name' => "assoc_files",
82	'desc' => "{HTMLPlug.assoc_files}",
83	'type' => "regexp",
84	'deft' => &get_default_block_exp() },
85	{ 'name' => "rename_assoc_files",
86	'desc' => "{HTMLPlug.rename_assoc_files}",
87	'type' => "flag" },
88	{ 'name' => "title_sub",
89	'desc' => "{HTMLPlug.title_sub}",
90	'type' => "string",
91	'deft' => "" },
92	{ 'name' => "description_tags",
93	'desc' => "{HTMLPlug.description_tags}",
94	'type' => "flag" } ];
95
96	my $options = { 'name' => "HTMLPlug",
97	'desc' => "{HTMLPlug.desc}",
98	'abstract' => "no",
99	'inherits' => "yes",
100	'args' => $arguments };
101
102	sub new {
103	my $class = shift (@_);
104	my $self = new BasPlug ($class, @_);
105	$self->{'plugin_type'} = "HTMLPlug";
106	# 14-05-02 To allow for proper inheritance of arguments - John Thompson
107	my $option_list = $self->{'option_list'};
108	push( @{$option_list}, $options );
109
110	if (!parsargv::parse(\@_,
111	q^nolinks^, \$self->{'nolinks'},
112	q^keep_head^, \$self->{'keep_head'},
113	q^no_metadata^, \$self->{'no_metadata'},
114	q^metadata_fields/.*/Title^, \$self->{'metadata_fields'},
115	q^hunt_creator_metadata^, \$self->{'hunt_creator_metadata'},
116	q^w3mir^, \$self->{'w3mir'},
117	q^file_is_url^, \$self->{'file_is_url'},
118	q^assoc_files/.*/(?i)\.(jpe?g\|jpe\|gif\|png\|css)$^, \$self->{'assoc_files'},
119	q^rename_assoc_files^, \$self->{'rename_assoc_files'},
120	q^title_sub/.*/^, \$self->{'title_sub'},
121	q^description_tags^, \$self->{'description_tags'},
122	"allow_extra_options")) {
123
124	print STDERR "\nIncorrect options passed to HTMLPlug, check your collect.cfg configuration file\n";
125	$self->print_txt_usage(""); # Use default resource bundle
126	die "\n";
127	}
128
129	# retain this for backward compatibility (w3mir option was replaced by
130	# file_is_url)
131	if ($self->{'w3mir'}) {
132	$self->{'file_is_url'} = 1;
133	}
134
135	$self->{'aux_files'} = {};
136	$self->{'dir_num'} = 0;
137	$self->{'file_num'} = 0;
138
139	return bless $self, $class;
140	}
141
142	# may want to use (?i)\.(gif\|jpe?g\|jpe\|png\|css\|js(?:@.*)?)$
143	# if have eg <script language="javascript" src="img/lib.js@123">
144	sub get_default_block_exp {
145	my $self = shift (@_);
146
147	return q^(?i)\.(gif\|jpe?g\|jpe\|jpg\|png\|css)$^;
148	}
149
150	sub get_default_process_exp {
151	my $self = shift (@_);
152
153	# the last option is an attempt to encode the concept of an html query ...
154	return q^(?i)(\.html?\|\.shtml\|\.shm\|\.asp\|\.php\d?\|\.cgi\|.+\?.+=.*)$^;
155	}
156
157	sub metadata_read {
158	my $self = shift (@_);
159	my ($pluginfo, $base_dir, $file, $metadata, $extrametakeys, $extrametadata, $processor, $maxdocs, $gli) = @_;
160
161	my $outhandle = $self->{'outhandle'};
162
163	my $filename = $file;
164	$filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;
165
166	my ($dir) = $filename =~ /^(.?)[^\/\\]$/;
167
168	if ($filename !~ /$self->{'process_exp'}/ \|\| !-f $filename) {
169	return undef; # can't recognise
170	}
171
172	# Do encoding stuff
173	my ($language, $encoding) = $self->textcat_get_language_encoding ($filename);
174
175	# read in file ($text will be in utf8)
176	my $text = "";
177	$self->read_file ($filename, $encoding, $language, \$text);
178
179	$self->store_block_files (\$text, $filename);
180
181	return 1;
182	}
183
184	sub store_block_files
185	{
186	my $self =shift (@_);
187	my ($textref, $filename) = @_;
188
189	my $html_fname = $filename;
190	my @file_blocks;
191
192	my $opencom = '(?:<!--\|<!(?:—\|\|--))';
193	my $closecom = '(?:-->\|(?:—\|\|--)>)';
194	$$textref =~ s/$opencom(.*?)$closecom//gs;
195
196	my $attval = "\\\"[^\\\"]+\\\"\|[^\\s>]+";
197	my @img_matches = ($$textref =~ m/<img[^>]?src\s=\s($attval)[^>]>/igs);
198	my @usemap_matches = ($$textref =~ m/<img[^>]?usemap\s=\s($attval)[^>]>/igs);
199	my @link_matches = ($$textref =~ m/<link[^>]?href\s=\s($attval)[^>]>/igs);
200
201
202	foreach my $link (@img_matches, @usemap_matches, @link_matches) {
203
204	# remove quotes from link at start and end if necessary
205	if ($link=~/^\"/) {
206	$link=~s/^\"//;
207	$link=~s/\"$//;
208	}
209
210	$link =~ s/\#.*$//s; # remove any anchor names, e.g. foo.html#name becomes foo.html
211
212	if ($link !~ s@^/@@ && $link !~ /^([A-Z]:?)\\/) {
213	# Turn relative file path into full path
214	my $dirname = &File::Basename::dirname($filename);
215	$link = &util::filename_cat($dirname, $link);
216	}
217	$link = $self->eval_dir_dots($link);
218	$self->{'file_blocks'}->{$link} = 1;
219	}
220	}
221
222	# do plugin specific processing of doc_obj
223	sub process {
224	my $self = shift (@_);
225	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
226	my $outhandle = $self->{'outhandle'};
227
228	print STDERR "<Processing n='$file' p='HTMLPlug'>\n" if ($gli);
229
230	print $outhandle "HTMLPlug: processing $file\n"
231	if $self->{'verbosity'} > 1;
232
233	if ($ENV{'GSDLOS'} =~ /^windows/i) {
234	# this makes life so much easier... perl can cope with unix-style '/'s.
235	$base_dir =~ s@(\\)+@/@g;
236	$file =~ s@(\\)+@/@g;
237	}
238
239	# reset per-doc stuff...
240	$self->{'aux_files'} = {};
241	$self->{'dir_num'} = 0;
242	$self->{'file_num'} = 0;
243
244	my $cursection = $doc_obj->get_top_section();
245
246	$self->extract_metadata ($textref, $metadata, $doc_obj, $cursection)
247	unless $self->{'no_metadata'} \|\| $self->{'description_tags'};
248
249	# Store URL for page as metadata - this can be used for an
250	# altavista style search interface. The URL won't be valid
251	# unless the file structure contains the domain name (i.e.
252	# like when w3mir is used to download a website).
253	my $web_url = "http://$file";
254	$doc_obj->add_metadata($cursection, "URL", $web_url);
255
256	if ($self->{'description_tags'}) {
257	# remove the html header - note that doing this here means any
258	# sections defined within the header will be lost (so all <Section>
259	# tags must appear within the body of the HTML)
260	my ($head_keep) = ($$textref =~ m/^(.?)<body[^>]>/is);
261
262	$$textref =~ s/^.?<body[^>]>//is;
263	$$textref =~ s/(<\/body[^>]>\|<\/html[^>]>)//isg;
264
265	my $opencom = '(?:<!--\|<!(?:—\|\|--))';
266	my $closecom = '(?:-->\|(?:—\|\|--)>)';
267
268	my $lt = '(?:<\|<)';
269	my $gt = '(?:>\|>)';
270	my $quot = '(?:"\|"\|”\|“)';
271
272	my $found_something = 0; my $top = 1;
273	while ($$textref =~ s/^(.?)$opencom(.?)$closecom//s) {
274	my $text = $1;
275	my $comment = $2;
276	if (defined $text) {
277	# text before a comment - note that getting to here
278	# doesn't necessarily mean there are Section tags in
279	# the document
280	$self->process_section(\$text, $base_dir, $file, $doc_obj, $cursection);
281	}
282	while ($comment =~ s/$lt(.*?)$gt//s) {
283	my $tag = $1;
284	if ($tag eq "Section") {
285	$found_something = 1;
286	$cursection = $doc_obj->insert_section($doc_obj->get_end_child($cursection)) unless $top;
287	$top = 0;
288	} elsif ($tag eq "/Section") {
289	$found_something = 1;
290	$cursection = $doc_obj->get_parent_section ($cursection);
291	} elsif ($tag =~ /^Metadata name=$quot(.*?)$quot/s) {
292	my $metaname = $1;
293	$comment =~ s/^(.*?)$lt\/Metadata$gt//s;
294	my $metavalue = $1;
295	$metavalue =~ s/^\s+//;
296	$metavalue =~ s/\s+$//;
297	# assume that no metadata value intentionally includes
298	# carriage returns or HTML tags (if they're there they
299	# were probably introduced when converting to HTML from
300	# some other format).
301	$metavalue =~ s/[\cJ\cM]/ /sg;
302	$metavalue =~ s/<[^>]+>//sg;
303	$metavalue =~ s/\s+/ /sg;
304	$doc_obj->add_utf8_metadata($cursection, $metaname, $metavalue);
305	} elsif ($tag eq "Description" \|\| $tag eq "/Description") {
306	# do nothing with containing Description tags
307	} else {
308	# simple HTML tag (probably created by the conversion
309	# to HTML from some other format) - we'll ignore it and
310	# hope for the best ;-)
311	}
312	}
313	}
314	if ($cursection ne "") {
315	print $outhandle "HTMLPlug: WARNING: $file contains unmatched <Section></Section> tags\n";
316	}
317
318	$$textref =~ s/^.?<body[^>]>//is;
319	$$textref =~ s/(<\/body[^>]>\|<\/html[^>]>)//isg;
320	if ($$textref =~ /\S/) {
321	if (!$found_something) {
322	if ($self->{'verbosity'} > 2) {
323	print $outhandle "HTMLPlug: WARNING: $file appears to contain no Section tags so\n";
324	print $outhandle " will be processed as a single section document\n";
325	}
326
327	# go ahead and process single-section document
328	$self->process_section($textref, $base_dir, $file, $doc_obj, $cursection);
329
330	# if document contains no Section tags we'll go ahead
331	# and extract metadata (this won't have been done
332	# above as the -description_tags option prevents it)
333	my $complete_text = $head_keep.$doc_obj->get_text($cursection);
334	$self->extract_metadata (\$complete_text, $metadata, $doc_obj, $cursection)
335	unless $self->{'no_metadata'};
336
337	} else {
338	print $outhandle "HTMLPlug: WARNING: $file contains the following text outside\n";
339	print $outhandle " of the final closing </Section> tag. This text will\n";
340	print $outhandle " be ignored.";
341
342	my ($text);
343	if (length($$textref) > 30) {
344	$text = substr($$textref, 0, 30) . "...";
345	} else {
346	$text = $$textref;
347	}
348	$text =~ s/\n/ /isg;
349	print $outhandle " ($text)\n";
350	}
351	} elsif (!$found_something) {
352
353	if ($self->{'verbosity'} > 2) {
354	# may get to here if document contained no valid Section
355	# tags but did contain some comments. The text will have
356	# been processed already but we should print the warning
357	# as above and extract metadata
358	print $outhandle "HTMLPlug: WARNING: $file appears to contain no Section tags and\n";
359	print $outhandle " is blank or empty. Metadata will be assigned if present.\n";
360	}
361
362	my $complete_text = $head_keep.$doc_obj->get_text($cursection);
363	$self->extract_metadata (\$complete_text, $metadata, $doc_obj, $cursection)
364	unless $self->{'no_metadata'};
365	}
366
367	} else {
368
369	# remove header and footer
370	if (!$self->{'keep_head'} \|\| $self->{'description_tags'}) {
371	$$textref =~ s/^.?<body[^>]>//is;
372	$$textref =~ s/(<\/body[^>]>\|<\/html[^>]>)//isg;
373	}
374
375	# single section document
376	$self->process_section($textref, $base_dir, $file, $doc_obj, $cursection);
377	}
378	return 1;
379	}
380
381	# note that process_section may be called multiple times for a single
382	# section (relying on the fact that add_utf8_text appends the text to any
383	# that may exist already).
384	sub process_section {
385	my $self = shift (@_);
386	my ($textref, $base_dir, $file, $doc_obj, $cursection) = @_;
387	# trap links
388	if (!$self->{'nolinks'}) {
389
390	# usemap="./#index" not handled correctly => change to "#index"
391	$$textref =~ s/(<img[^>]?usemap\s=\s\"?)([^\">\s]+)(\"?[^>]>)/
392	$self->replace_usemap_links($1, $2, $3)/isge;
393
394	$$textref =~ s/(<(?:a\|area\|frame\|link\|script)\s+[^>]?\s(?:href\|src)\s=\s\"?)([^\">\s]+)(\"?[^>]*>)/
395	$self->replace_href_links ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
396	}
397
398	# trap images
399
400	# allow spaces if inside quotes - jrm21
401	$$textref =~ s/(<img[^>]?src\s=\s)(\"[^\"]+\"\|[^\s>]+)([^>]>)/
402	$self->replace_images ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
403
404	# add text to document object
405	# turn \ into \\ so that the rest of greenstone doesn't think there
406	# is an escape code following. (Macro parsing loses them...)
407	$$textref =~ s/\\/\\\\/go;
408	$doc_obj->add_utf8_text($cursection, $$textref);
409	}
410
411	sub replace_images {
412	my $self = shift (@_);
413	my ($front, $link, $back, $base_dir,
414	$file, $doc_obj, $section) = @_;
415
416	# remove quotes from link at start and end if necessary
417	if ($link=~/^\"/) {
418	$link=~s/^\"//;$link=~s/\"$//;
419	$front.='"';
420	$back="\"$back";
421	}
422
423	$link =~ s/\n/ /g;
424
425	# Hack to overcome Windows wv 0.7.1 bug that causes embedded images to be broken
426	# If the Word file path has spaces in it, wv messes up and you end up with
427	# absolute paths for the images, and without the "file://" prefix
428	# So check for this special case and massage the data to be correct
429	if ($ENV{'GSDLOS'} =~ /^windows/i && $self->{'plugin_type'} eq "WordPlug" && $link =~ /^[A-Za-z]\:\\/) {
430	$link =~ s/^.*\\([^\\]+)$/$1/;
431	}
432
433	my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
434
435	my $img_file = $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section);
436	my $anchor_name = $img_file;
437	$anchor_name =~ s/^.*\///;
438	$anchor_name = "<a name=\"$anchor_name\">";
439
440	return $front . $img_file . $back . $anchor_name;
441	}
442
443	sub replace_href_links {
444	my $self = shift (@_);
445	my ($front, $link, $back, $base_dir, $file, $doc_obj, $section) = @_;
446
447	# attempt to sort out targets - frames are not handled
448	# well in this plugin and some cases will screw things
449	# up - e.g. the _parent target (so we'll just remove
450	# them all ;-)
451	$front =~ s/(target=\"?)_top(\"?)/$1_gsdltop_$2/is;
452	$back =~ s/(target=\"?)_top(\"?)/$1_gsdltop_$2/is;
453	$front =~ s/target=\"?_parent\"?//is;
454	$back =~ s/target=\"?_parent\"?//is;
455
456	return $front . $link . $back if $link =~ /^\#/s;
457	$link =~ s/\n/ /g;
458
459	my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
460	# href may use '\'s where '/'s should be on Windows
461	$href =~ s/\\/\//g;
462
463	my ($filename) = $href =~ /^(?:.?):(?:\/\/)?(.)/;
464
465	##### leave all these links alone (they won't be picked up by intermediate
466	##### pages). I think that's safest when dealing with frames, targets etc.
467	##### (at least until I think of a better way to do it). Problems occur with
468	##### mailto links from within small frames, the intermediate page is displayed
469	##### within that frame and can't be seen. There is still potential for this to
470	##### happen even with html pages - the solution seems to be to somehow tell
471	##### the browser from the server side to display the page being sent (i.e.
472	##### the intermediate page) in the top level window - I'm not sure if that's
473	##### possible - the following line should probably be deleted if that can be done
474	return $front . $link . $back if $href =~ /^(mailto\|news\|gopher\|nntp\|telnet\|javascript):/is;
475
476
477	if (($rl == 0) \|\| ($filename =~ /$self->{'process_exp'}/) \|\|
478	($href =~ /\/$/) \|\| ($href =~ /^(mailto\|news\|gopher\|nntp\|telnet\|javascript):/i)) {
479	&ghtml::urlsafe ($href);
480	return $front . "_httpextlink_&rl=" . $rl . "&href=" . $href . $hash_part . $back;
481	} else {
482	# link is to some other type of file (eg image) so we'll
483	# need to associate that file
484	return $front . $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section) . $back;
485	}
486	}
487
488	sub add_file {
489	my $self = shift (@_);
490	my ($href, $rl, $hash_part, $base_dir, $doc_obj, $section) = @_;
491	my ($newname);
492
493	my $filename = $href;
494	$filename =~ s/^[^:]*:\/\///;
495	$filename = &util::filename_cat($base_dir, $filename);
496
497	# Replace %20's in URL with a space if required. Note that the filename
498	# may include the %20 in some situations
499	if ($filename =~ /\%20/) {
500	if (!-e $filename) {
501	$filename =~ s/\%20/ /g;
502	}
503	}
504
505	my ($ext) = $filename =~ /(\.[^\.]*)$/;
506
507	if ((!defined $ext) \|\| ($ext !~ /$self->{'assoc_files'}/)) {
508	return "_httpextlink_&rl=" . $rl . "&href=" . $href . $hash_part;
509	}
510	if ($self->{'rename_assoc_files'}) {
511	if (defined $self->{'aux_files'}->{$href}) {
512	$newname = $self->{'aux_files'}->{$href}->{'dir_num'} . "/" .
513	$self->{'aux_files'}->{$href}->{'file_num'} . $ext;
514	} else {
515	$newname = $self->{'dir_num'} . "/" . $self->{'file_num'} . $ext;
516	$self->{'aux_files'}->{$href} = {'dir_num' => $self->{'dir_num'}, 'file_num' => $self->{'file_num'}};
517	$self->inc_filecount ();
518	}
519	$doc_obj->associate_file($filename, $newname, undef, $section);
520	return "_httpdocimg_/$newname";
521	} else {
522	($newname) = $filename =~ /([^\/\\]*)$/;
523	$doc_obj->associate_file($filename, $newname, undef, $section);
524	return "_httpdocimg_/$newname";
525	}
526	}
527
528
529	sub format_link {
530	my $self = shift (@_);
531	my ($link, $base_dir, $file) = @_;
532
533	my ($before_hash, $hash_part) = $link =~ /^([^\#])(\#?.)$/;
534
535	$hash_part = "" if !defined $hash_part;
536	if (!defined $before_hash \|\| $before_hash !~ /[\w\.\/]/) {
537	my $outhandle = $self->{'outhandle'};
538	print $outhandle "HTMLPlug: ERROR - badly formatted tag ignored ($link)\n"
539	if $self->{'verbosity'};
540	return ($link, "", 0);
541	}
542
543	if ($before_hash =~ s@^((?:http\|ftp\|file)://)@@i) {
544	my $type = $1;
545
546	if ($link =~ /^(http\|ftp):/i) {
547	# Turn url (using /) into file name (possibly using \ on windows)
548	my @http_dir_split = split('/', $before_hash);
549	$before_hash = &util::filename_cat(@http_dir_split);
550	}
551
552	$before_hash = $self->eval_dir_dots($before_hash);
553
554	my $linkfilename = &util::filename_cat ($base_dir, $before_hash);
555
556	my $rl = 0;
557	$rl = 1 if (-e $linkfilename);
558
559	# make sure there's a slash on the end if it's a directory
560	if ($before_hash !~ /\/$/) {
561	$before_hash .= "/" if (-d $linkfilename);
562	}
563
564	return ($type . $before_hash, $hash_part, $rl);
565
566	} elsif ($link !~ /^(mailto\|news\|gopher\|nntp\|telnet\|javascript):/i) {
567	if ($before_hash =~ s@^/@@ \|\| $before_hash =~ /\\/) {
568
569	# the first directory will be the domain name if file_is_url
570	# to generate archives, otherwise we'll assume all files are
571	# from the same site and base_dir is the root
572
573	if ($self->{'file_is_url'}) {
574	my @dirs = split /[\/\\]/, $file;
575	my $domname = shift (@dirs);
576	$before_hash = &util::filename_cat($domname, $before_hash);
577	$before_hash =~ s@\\@/@g; # for windows
578	}
579	else
580	{
581	# see if link shares directory with source document
582	# => turn into relative link if this is so!
583
584	if ($ENV{'GSDLOS'} =~ /^windows/i) {
585	# too difficult doing a pattern match with embedded '\'s...
586	my $win_before_hash=$before_hash;
587	$win_before_hash =~ s@(\\)+@/@g;
588	# $base_dir is already similarly "converted" on windows.
589	if ($win_before_hash =~ s@^$base_dir/@@o) {
590	# if this is true, we removed a prefix
591	$before_hash=$win_before_hash;
592	}
593	}
594	else {
595	$before_hash = &util::filename_cat("",$before_hash);
596	$before_hash =~ s@^$base_dir/@@;
597	}
598	}
599	} else {
600	# Turn relative file path into full path
601	my $dirname = &File::Basename::dirname($file);
602	$before_hash = &util::filename_cat($dirname, $before_hash);
603	$before_hash = $self->eval_dir_dots($before_hash);
604	}
605
606	my $linkfilename = &util::filename_cat ($base_dir, $before_hash);
607	# make sure there's a slash on the end if it's a directory
608	if ($before_hash !~ /\/$/) {
609	$before_hash .= "/" if (-d $linkfilename);
610	}
611
612	return ("http://" . $before_hash, $hash_part, 1);
613	} else {
614	# mailto, news, nntp, telnet, javascript or gopher link
615	return ($before_hash, "", 0);
616	}
617	}
618
619	sub extract_first_NNNN_characters {
620	my $self = shift (@_);
621	my ($textref, $doc_obj, $thissection) = @_;
622
623	foreach my $size (split /,/, $self->{'first'}) {
624	my $tmptext = $$textref;
625	# skip to the body
626	$tmptext =~ s/.<body[^>]>//i;
627	# remove javascript
628	$tmptext =~ s@<script.*?</script>@ @sig;
629	$tmptext =~ s/<[^>]*>/ /g;
630	$tmptext =~ s/ / /g;
631	$tmptext =~ s/^\s+//;
632	$tmptext =~ s/\s+$//;
633	$tmptext =~ s/\s+/ /gs;
634	# with perl 5.6 at least, substr might segment at a multi-byte char...
635	use utf8;
636	$tmptext = substr ($tmptext, 0, $size);
637	$tmptext =~ s/\s\S*$/…/; # adds an ellipse (...)
638	$doc_obj->add_utf8_metadata ($thissection, "First$size", $tmptext);
639	}
640	}
641
642
643	sub extract_metadata {
644	my $self = shift (@_);
645	my ($textref, $metadata, $doc_obj, $section) = @_;
646	my $outhandle = $self->{'outhandle'};
647	# if we don't want metadata, we may as well not be here ...
648	return if (!defined $self->{'metadata_fields'});
649
650	my %find_fields = (); # metadata fields to extract/save
651
652	my %creator_fields = (); # short-cut for lookups
653
654
655	foreach my $field (split /,/, $self->{'metadata_fields'}) {
656	# support tag<tagname>
657	if ($field =~ /^(.?)<(.?)>$/) {
658	# "$2" is the user's preferred gs metadata name
659	$find_fields{lc($1)}=$2; # lc = lowercase
660	}
661	# "$field" is the user's preferred gs metadata name
662	$find_fields{lc($field)}=$field; # lc = lowercase
663	}
664
665	if (defined $self->{'hunt_creator_metadata'} &&
666	$self->{'hunt_creator_metadata'} == 1 ) {
667	my @extra_fields =
668	(
669	'author',
670	'author.email',
671	'creator',
672	'dc.creator',
673	'dc.creator.corporatename',
674	);
675
676	# add the creator_metadata fields to search for
677	foreach my $field (@extra_fields) {
678	$creator_fields{$field}=0; # add to lookup hash
679	}
680	}
681
682
683	# find the header in the html file, which has the meta tags
684	$$textref =~ m@<head>(.*?)</head>@si;
685
686	my $html_header=$1;
687	# go through every <meta... tag defined in the html and see if it is
688	# one of the tags we want to match.
689
690	# special case for title - we want to remember if its been found
691	my $found_title = 0;
692	# this assumes that ">" won't appear. (I don't think it's allowed to...)
693	$html_header =~ /^/; # match the start of the string, for \G assertion
694
695	while ($html_header =~ m/\G.?<meta(.?)>/sig) {
696	my $metatag=$1;
697	my ($tag, $value);
698
699	# find the tag name
700	$metatag =~ /(?:name\|http-equiv)\s=\s([\"\'])?(.*?)\1/is;
701	$tag=$2;
702	# in case they're not using " or ', but they should...
703	if (! $tag) {
704	$metatag =~ /(?:name\|http-equiv)\s=\s(.*?)(?!\w)/is;
705	$tag=$1;
706	}
707
708	if (!defined $tag) {
709	print $outhandle "HTMLPlug: can't find NAME in \"$metatag\"\n";
710	next;
711	}
712
713	# don't need to assign this field if it was passed in from a previous
714	# (recursive) plugin
715	if (defined $metadata->{$tag}) {next}
716
717	# find the tag content
718	$metatag =~ /content\s=\s([\"\'])?(.*?)\1/is;
719	$value=$2;
720	if (! $value) {
721	$metatag =~ /(?:name\|http-equiv)\s=\s(.*?)(?!\w)/is;
722	$value=$1;
723	}
724	if (!defined $value) {
725	print $outhandle "HTMLPlug: can't find VALUE in \"$metatag\"\n";
726	next;
727	}
728
729	# clean up and add
730	$value =~ s/\s+/ /gs;
731	chomp($value); # remove trailing \n, if any
732	if (exists $creator_fields{lc($tag)}) {
733	# map this value onto greenstone's "Creator" metadata
734	$tag='Creator';
735	} elsif (!exists $find_fields{lc($tag)}) {
736	next; # don't want this tag
737	} else {
738	# get the user's preferred capitalisation
739	$tag = $find_fields{lc($tag)};
740	}
741	if (lc($tag) eq "title") {
742	$found_title = 1;
743	}
744	print $outhandle " extracted \"$tag\" metadata \"$value\"\n"
745	if ($self->{'verbosity'} > 2);
746	$doc_obj->add_utf8_metadata($section, $tag, $value);
747
748	}
749
750	# TITLE: extract the document title
751	if (exists $find_fields{'title'} && !$found_title) {
752	# we want a title, and didn't find one in the meta tags
753	# see if there's a <title> tag
754	my $title;
755	my $from = "";
756	if ($html_header =~ /<title[^>]>([^<]+)<\/title[^>]>/is) {
757	$title = $1;
758	$from = "<title> tags";
759	}
760
761	if (!defined $title) {
762	$from = "first 100 chars";
763	# if no title use first 100 or so characters
764	$title = $$textref;
765	$title =~ s/^\xFE\xFF//; # Remove unicode byte order mark
766	$title =~ s/^.*?<body>//si;
767	# ignore javascript!
768	$title =~ s@<script.*?</script>@ @sig;
769	$title =~ s/<\/([^>]+)><\1>//g; # (eg) </b><b> - no space
770	$title =~ s/<[^>]*>/ /g; # remove all HTML tags
771	$title = substr ($title, 0, 100);
772	$title =~ s/\s\S*$/.../;
773	}
774	$title =~ s/<[^>]*>/ /g; # remove html tags
775	$title =~ s/ / /g;
776	$title =~ s/(?: \|\xc2\xa0)/ /g; # utf-8 for nbsp...
777	$title =~ s/\s+/ /gs; # collapse multiple spaces
778	$title =~ s/^\s*//; # remove leading spaces
779	$title =~ s/\s*$//; # remove trailing spaces
780
781	$title =~ s/^$self->{'title_sub'}// if ($self->{'title_sub'});
782	$title =~ s/^\s+//s; # in case title_sub introduced any...
783	$doc_obj->add_utf8_metadata ($section, 'Title', $title);
784	print $outhandle " extracted Title metadata \"$title\" from $from\n"
785	if ($self->{'verbosity'} > 2);
786	}
787
788	# add FileFormat metadata
789	$doc_obj->add_metadata($section,"FileFormat", "HTML");
790
791	# Special, for metadata names such as tagH1 - extracts
792	# the text between the first <H1> and </H1> tags into "H1" metadata.
793
794	foreach my $field (keys %find_fields) {
795	if ($field !~ /^tag([a-z0-9]+)$/i) {next}
796	my $tag = $1;
797	if ($$textref =~ m@<$tag[^>]>(.?)</$tag[^>]*>@g) {
798	my $content = $1;
799	$content =~ s/ / /g;
800	$content =~ s/<[^>]*>/ /g;
801	$content =~ s/^\s+//;
802	$content =~ s/\s+$//;
803	$content =~ s/\s+/ /gs;
804	if ($content) {
805	$tag=$find_fields{"tag$tag"}; # get the user's capitalisation
806	$tag =~ s/^tag//i;
807	$doc_obj->add_utf8_metadata ($section, $tag, $content);
808	print $outhandle " extracted \"$tag\" metadata \"$content\"\n"
809	if ($self->{'verbosity'} > 2);
810	}
811	}
812	}
813	}
814
815
816	# evaluate any "../" to next directory up
817	# evaluate any "./" as here
818	sub eval_dir_dots {
819	my $self = shift (@_);
820	my ($filename) = @_;
821	my $dirsep_os = &util::get_os_dirsep();
822	my @dirsep = split(/$dirsep_os/,$filename);
823
824	my @eval_dirs = ();
825	foreach my $d (@dirsep) {
826	if ($d eq "..") {
827	pop(@eval_dirs);
828
829	} elsif ($d eq ".") {
830	# do nothing!
831
832	} else {
833	push(@eval_dirs,$d);
834	}
835	}
836
837	# Need to fiddle with number of elements in @eval_dirs if the
838	# first one is the empty string. This is because of a
839	# modification to util::filename_cat that supresses the addition
840	# of a leading '/' character (or \ if windows) (intended to help
841	# filename cat with relative paths) if the first entry in the
842	# array is the empty string. Making the array start with two
843	# empty strings is a way to defeat this "smart" option.
844	#
845	if (scalar(@eval_dirs) > 0) {
846	if ($eval_dirs[0] eq ""){
847	unshift(@eval_dirs,"");
848	}
849	}
850	return &util::filename_cat(@eval_dirs);
851	}
852
853	sub replace_usemap_links {
854	my $self = shift (@_);
855	my ($front, $link, $back) = @_;
856
857	$link =~ s/^\.\///;
858	return $front . $link . $back;
859	}
860
861	sub inc_filecount {
862	my $self = shift (@_);
863
864	if ($self->{'file_num'} == 1000) {
865	$self->{'dir_num'} ++;
866	$self->{'file_num'} = 0;
867	} else {
868	$self->{'file_num'} ++;
869	}
870	}
871
872
873	# Extend the BasPlug read_file so that strings like é are
874	# converted to UTF8 internally.
875	#
876	# We don't convert < or > or & or " in case
877	# they interfere with the GML files
878
879	sub read_file {
880	my ($self, $filename, $encoding, $language, $textref) = @_;
881
882	&BasPlug::read_file($self, $filename, $encoding, $language, $textref);
883
884	# Convert entities to their UTF8 equivalents
885	$$textref =~ s/&(lt\|gt\|amp\|quot\|nbsp);/&z$1;/go;
886	$$textref =~ s/&([^;]+);/&ghtml::getcharequiv($1,1)/gseo;
887	$$textref =~ s/&z(lt\|gt\|amp\|quot\|nbsp);/&$1;/go;
888	}
889
890	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: