Context Navigation

source: gsdl/trunk/perllib/plugins/HTMLPlugin.pm@ 16799

Last change on this file since 16799 was 16799, checked in by ak19, 16 years ago
URL encoding href links for internal links since the database lookup has trouble with keys that are UTF8
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 52.2 KB

Line
1	###########################################################################
2	#
3	# HTMLPlugin.pm -- basic html plugin
4	#
5	# A component of the Greenstone digital library software
6	# from the New Zealand Digital Library Project at the
7	# University of Waikato, New Zealand.
8	#
9	# Copyright (C) 1999 New Zealand Digital Library Project
10	#
11	# This program is free software; you can redistribute it and/or modify
12	# it under the terms of the GNU General Public License as published by
13	# the Free Software Foundation; either version 2 of the License, or
14	# (at your option) any later version.
15	#
16	# This program is distributed in the hope that it will be useful,
17	# but WITHOUT ANY WARRANTY; without even the implied warranty of
18	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	# GNU General Public License for more details.
20	#
21	# You should have received a copy of the GNU General Public License
22	# along with this program; if not, write to the Free Software
23	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24	#
25	###########################################################################
26
27	#
28	# Note that this plugin handles frames only in a very simple way
29	# i.e. each frame is treated as a separate document. This means
30	# search results will contain links to individual frames rather
31	# than linking to the top level frameset.
32	# There may also be some problems caused by the _parent target
33	# (it's removed by this plugin)
34	#
35
36	package HTMLPlugin;
37
38	use ReadTextFile;
39	use HBPlugin;
40	use ghtml;
41	use unicode;
42	use util;
43	use XMLParser;
44
45	use Image::Size;
46	use File::Copy;
47
48	sub BEGIN {
49	@HTMLPlugin::ISA = ('ReadTextFile', 'HBPlugin');
50	}
51
52	use strict; # every perl program should have this!
53	no strict 'refs'; # make an exception so we can use variables as filehandles
54
55	my $arguments =
56	[ { 'name' => "process_exp",
57	'desc' => "{BasePlugin.process_exp}",
58	'type' => "regexp",
59	'deft' => &get_default_process_exp() },
60	{ 'name' => "block_exp",
61	'desc' => "{BasePlugin.block_exp}",
62	'type' => 'regexp',
63	'deft' => &get_default_block_exp() },
64	{ 'name' => "nolinks",
65	'desc' => "{HTMLPlugin.nolinks}",
66	'type' => "flag" },
67	{ 'name' => "keep_head",
68	'desc' => "{HTMLPlugin.keep_head}",
69	'type' => "flag" },
70	{ 'name' => "no_metadata",
71	'desc' => "{HTMLPlugin.no_metadata}",
72	'type' => "flag" },
73	{ 'name' => "metadata_fields",
74	'desc' => "{HTMLPlugin.metadata_fields}",
75	'type' => "string",
76	'deft' => "Title" },
77	{ 'name' => "hunt_creator_metadata",
78	'desc' => "{HTMLPlugin.hunt_creator_metadata}",
79	'type' => "flag" },
80	{ 'name' => "file_is_url",
81	'desc' => "{HTMLPlugin.file_is_url}",
82	'type' => "flag" },
83	{ 'name' => "assoc_files",
84	'desc' => "{HTMLPlugin.assoc_files}",
85	'type' => "regexp",
86	'deft' => &get_default_block_exp() },
87	{ 'name' => "rename_assoc_files",
88	'desc' => "{HTMLPlugin.rename_assoc_files}",
89	'type' => "flag" },
90	{ 'name' => "title_sub",
91	'desc' => "{HTMLPlugin.title_sub}",
92	'type' => "string",
93	'deft' => "" },
94	{ 'name' => "description_tags",
95	'desc' => "{HTMLPlugin.description_tags}",
96	'type' => "flag" },
97	# retain this for backward compatibility (w3mir option was replaced by
98	# file_is_url)
99	{ 'name' => "w3mir",
100	# 'desc' => "{HTMLPlugin.w3mir}",
101	'type' => "flag",
102	'hiddengli' => "yes"},
103	{ 'name' => "no_strip_metadata_html",
104	'desc' => "{HTMLPlugin.no_strip_metadata_html}",
105	'type' => "string",
106	'deft' => "",
107	'reqd' => "no"},
108	{ 'name' => "sectionalise_using_h_tags",
109	'desc' => "{HTMLPlugin.sectionalise_using_h_tags}",
110	'type' => "flag" },
111	{ 'name' => "use_realistic_book",
112	'desc' => "{HTMLPlugin.tidy_html}",
113	'type' => "flag"},
114	{ 'name' => "old_style_HDL",
115	'desc' => "{HTMLPlugin.old_style_HDL}",
116	'type' => "flag"}
117	];
118
119	my $options = { 'name' => "HTMLPlugin",
120	'desc' => "{HTMLPlugin.desc}",
121	'abstract' => "no",
122	'inherits' => "yes",
123	'args' => $arguments };
124
125
126	sub HB_read_html_file {
127	my $self = shift (@_);
128	my ($htmlfile, $text) = @_;
129
130	# load in the file
131	if (!open (FILE, $htmlfile)) {
132	print STDERR "ERROR - could not open $htmlfile\n";
133	return;
134	}
135
136	my $foundbody = 0;
137	$self->HB_gettext (\$foundbody, $text, "FILE");
138	close FILE;
139
140	# just in case there was no <body> tag
141	if (!$foundbody) {
142	$foundbody = 1;
143	open (FILE, $htmlfile) \|\| return;
144	$self->HB_gettext (\$foundbody, $text, "FILE");
145	close FILE;
146	}
147	# text is in utf8
148	}
149
150	# converts the text to utf8, as ghtml does that for é etc.
151	sub HB_gettext {
152	my $self = shift (@_);
153	my ($foundbody, $text, $handle) = @_;
154
155	my $line = "";
156	while (defined ($line = <$handle>)) {
157	# look for body tag
158	if (!$$foundbody) {
159	if ($line =~ s/^.<body[^>]>//i) {
160	$$foundbody = 1;
161	} else {
162	next;
163	}
164	}
165
166	# check for symbol fonts
167	if ($line =~ m/<font [^>]?face\s=\s*\"?(\w+)\"?/i) {
168	my $font = $1;
169	print STDERR "HBPlug::HB_gettext - warning removed font $font\n"
170	if ($font !~ m/^arial$/i);
171	}
172
173	$$text .= $line;
174	}
175
176	if ($self->{'input_encoding'} eq "iso_8859_1") {
177	# convert to utf-8
178	$$text=&unicode::unicode2utf8(&unicode::convert2unicode("iso_8859_1", $text));
179	}
180	# convert any alphanumeric character entities to their utf-8
181	# equivalent for indexing purposes
182	#&ghtml::convertcharentities ($$text);
183
184	$$text =~ s/\s+/ /g; # remove \n's
185	}
186
187	sub HB_clean_section {
188	my $self = shift (@_);
189	my ($section) = @_;
190
191	# remove tags without a starting tag from the section
192	my ($tag, $tagstart);
193	while ($section =~ m/<\/([^>]{1,10})>/) {
194	$tag = $1;
195	$tagstart = index($section, "<$tag");
196	last if (($tagstart >= 0) && ($tagstart < index($section, "<\/$tag")));
197	$section =~ s/<\/$tag>//;
198	}
199
200	# remove extra paragraph tags
201	while ($section =~ s/<p\b[^>]>\s<p\b/<p/ig) {}
202
203	# remove extra stuff at the end of the section
204	while ($section =~ s/(<u>\|<i>\|<b>\|<p\b[^>]*>\| \|\s)$//i) {}
205
206	# add a newline at the beginning of each paragraph
207	$section =~ s/(.)\s*<p\b/$1\n\n<p/gi;
208
209	# add a newline every 80 characters at a word boundary
210	# Note: this regular expression puts a line feed before
211	# the last word in each section, even when it is not
212	# needed.
213	$section =~ s/(.{1,80})\s/$1\n/g;
214
215	# fix up the image links
216	$section =~ s/<img[^>]?src=\"?([^\">]+)\"?[^>]>/
217	<center><img src=\"$1\" \/><\/center><br\/>/ig;
218	$section =~ s/<<I>>\s*([^\.]+\.(png\|jpg\|gif))/
219	<center><img src=\"$1\" \/><\/center><br\/>/ig;
220
221	return $section;
222	}
223
224	# Will convert the oldHDL format to the new HDL format (using the Section tag)
225	sub convert_to_newHDLformat
226	{
227	my $self = shift (@_);
228	my ($file,$cnfile) = @_;
229	my $input_filename = $file;
230	my $tmp_filename = $cnfile;
231
232	# write HTML tmp file with new HDL format
233	open (PROD, ">$tmp_filename") \|\| die("Error Writing to File: $tmp_filename $!");
234
235	# read in the file and do basic html cleaning (removing header etc)
236	my $html = "";
237	$self->HB_read_html_file ($input_filename, \$html);
238
239	# process the file one section at a time
240	my $curtoclevel = 1;
241	my $firstsection = 1;
242	my $toclevel = 0;
243	while (length ($html) > 0) {
244	if ($html =~ s/^.?(?:<p\b[^>]>)?((<b>\|<i>\|<u>\|\s))<<TOC(\d+)>>\s(.*?)<p\b/<p/i) {
245	$toclevel = $3;
246	my $title = $4;
247	my $sectiontext = "";
248	if ($html =~ s/^(.?)((?:<p\b[^>]>)?((<b>\|<i>\|<u>\|\s)*)<<TOC\d+>>)/$2/i) {
249	$sectiontext = $1;
250	} else {
251	$sectiontext = $html;
252	$html = "";
253	}
254
255	# remove tags and extra spaces from the title
256	$title =~ s/<\/?[^>]+>//g;
257	$title =~ s/^\s+\|\s+$//g;
258
259	# close any sections below the current level and
260	# create a new section (special case for the firstsection)
261	print PROD "<!--\n";
262	while (($curtoclevel > $toclevel) \|\|
263	(!$firstsection && $curtoclevel == $toclevel)) {
264	$curtoclevel--;
265	print PROD "</Section>\n";
266	}
267	if ($curtoclevel+1 < $toclevel) {
268	print STDERR "WARNING - jump in toc levels in $input_filename " .
269	"from $curtoclevel to $toclevel\n";
270	}
271	while ($curtoclevel < $toclevel) {
272	$curtoclevel++;
273	}
274
275	if ($curtoclevel == 1) {
276	# add the header tag
277	print PROD "-->\n";
278	print PROD "<HTML>\n<HEAD>\n<TITLE>$title</TITLE>\n</HEAD>\n<BODY>\n";
279	print PROD "<!--\n";
280	}
281
282	print PROD "<Section>\n\t<Description>\n\t\t<Metadata name=\"Title\">$title</Metadata>\n\t</Description>\n";
283
284	print PROD "-->\n";
285
286	# clean up the section html
287	$sectiontext = $self->HB_clean_section($sectiontext);
288
289	print PROD "$sectiontext\n";
290
291	} else {
292	print STDERR "WARNING - leftover text\n" , $self->shorten($html),
293	"\nin $input_filename\n";
294	last;
295	}
296	$firstsection = 0;
297	}
298
299	print PROD "<!--\n";
300	while ($curtoclevel > 0) {
301	$curtoclevel--;
302	print PROD "</Section>\n";
303	}
304	print PROD "-->\n";
305
306	close (PROD) \|\| die("Error Closing File: $tmp_filename $!");
307
308	return $tmp_filename;
309	}
310
311	sub shorten {
312	my $self = shift (@_);
313	my ($text) = @_;
314
315	return "\"$text\"" if (length($text) < 100);
316
317	return "\"" . substr ($text, 0, 50) . "\" ... \"" .
318	substr ($text, length($text)-50) . "\"";
319	}
320
321	sub convert_tidy_or_oldHDL_file
322	{
323	my $self = shift (@_);
324	my ($file) = @_;
325	my $input_filename = $file;
326
327	if (-d $input_filename)
328	{
329	return $input_filename;
330	}
331
332	# get the input filename
333	my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
334	my $base_dirname = $dirname;
335	$suffix = lc($suffix);
336
337	# derive tmp filename from input filename
338	# Remove any white space from filename -- no risk of name collision, and
339	# makes later conversion by utils simpler. Leave spaces in path...
340	# tidy up the filename with space, dot, hyphen between
341	$tailname =~ s/\s+//g;
342	$tailname =~ s/\.+//g;
343	$tailname =~ s/\-+//g;
344	# convert to utf-8 otherwise we have problems with the doc.xml file
345	# later on
346	&unicode::ensure_utf8(\$tailname);
347
348	# softlink to collection tmp dir
349	my $tmp_dirname = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "tidytmp");
350	&util::mk_dir($tmp_dirname) if (!-e $tmp_dirname);
351
352	my $test_dirname = "";
353	my $f_separator = &util::get_os_dirsep();
354
355	if ($dirname =~ m/import$f_separator/)
356	{
357	$test_dirname = $'; #'
358
359	#print STDERR "init $'\n";
360
361	while ($test_dirname =~ m/[$f_separator]/)
362	{
363	my $folderdirname = $`;
364	$tmp_dirname = &util::filename_cat($tmp_dirname,$folderdirname);
365	&util::mk_dir($tmp_dirname) if (!-e $tmp_dirname);
366	$test_dirname = $'; #'
367	}
368	}
369
370	my $tmp_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix");
371
372	# tidy or convert the input file if it is a HTML-like file or it is accepted by the process_exp
373	if (($suffix eq ".htm") \|\| ($suffix eq ".html") \|\| ($suffix eq ".shtml"))
374	{
375	#convert the input file to a new style HDL
376	my $hdl_output_filename = $input_filename;
377	if ($self->{'old_style_HDL'})
378	{
379	$hdl_output_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix");
380	$hdl_output_filename = $self->convert_to_newHDLformat($input_filename,$hdl_output_filename);
381	}
382
383	#just for checking copy all other file from the base dir to tmp dir if it is not exists
384	opendir(DIR,$base_dirname) or die "Can't open base directory : $base_dirname!";
385	my @files = grep {!/^\.+$/} readdir(DIR);
386	close(DIR);
387
388	foreach my $file (@files)
389	{
390	my $src_file = &util::filename_cat($base_dirname,$file);
391	my $dest_file = &util::filename_cat($tmp_dirname,$file);
392	if ((!-e $dest_file) && (!-d $src_file))
393	{
394	# just copy the original file back to the tmp directory
395	copy($src_file,$dest_file) or die "Can't copy file $src_file to $dest_file $!";
396	}
397	}
398
399	# tidy the input file
400	my $tidy_output_filename = $hdl_output_filename;
401	if ($self->{'tidy_html'})
402	{
403	$tidy_output_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix");
404	$tidy_output_filename = $self->tmp_tidy_file($hdl_output_filename,$tidy_output_filename);
405	}
406	$tmp_filename = $tidy_output_filename;
407	}
408	else
409	{
410	if (!-e $tmp_filename)
411	{
412	# just copy the original file back to the tmp directory
413	copy($input_filename,$tmp_filename) or die "Can't copy file $input_filename to $tmp_filename $!";
414	}
415	}
416
417	return $tmp_filename;
418	}
419
420
421	# Will make the html input file as a proper XML file with removed font tag and
422	# image size added to the img tag.
423	# The tidying process takes place in a collection specific 'tmp' directory so
424	# that we don't accidentally damage the input.
425	sub tmp_tidy_file
426	{
427	my $self = shift (@_);
428	my ($file,$cnfile) = @_;
429	my $input_filename = $file;
430	my $tmp_filename = $cnfile;
431
432	# get the input filename
433	my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
434
435	require HTML::TokeParser::Simple;
436
437	# create HTML parser to decode the input file
438	my $parser = HTML::TokeParser::Simple->new($input_filename);
439
440	# write HTML tmp file without the font tag and image size are added to the img tag
441	open (PROD, ">$tmp_filename") \|\| die("Error Writing to File: $tmp_filename $!");
442	while (my $token = $parser->get_token())
443	{
444	# is it an img tag
445	if ($token->is_start_tag('img'))
446	{
447	# get the attributes
448	my $attr = $token->return_attr;
449
450	# get the full path to the image
451	my $img_file = &util::filename_cat($dirname,$attr->{src});
452
453	# set the width and height attribute
454	($attr->{width}, $attr->{height}) = imgsize($img_file);
455
456	# recreate the tag
457	print PROD "<img";
458	print PROD map { qq { $_="$attr->{$_}"} } keys %$attr;
459	print PROD ">";
460	}
461	# is it a font tag
462	else
463	{
464	if (($token->is_start_tag('font')) \|\| ($token->is_end_tag('font')))
465	{
466	# remove font tag
467	print PROD "";
468	}
469	else
470	{
471	# print without changes
472	print PROD $token->as_is;
473	}
474	}
475	}
476	close (PROD) \|\| die("Error Closing File: $tmp_filename $!");
477
478	# run html-tidy on the tmp file to make it a proper XML file
479	my $tidyfile = `tidy -utf8 -wrap 0 -asxml $tmp_filename`;
480
481	# write result back to the tmp file
482	open (PROD, ">$tmp_filename") \|\| die("Error Writing to File: $tmp_filename $!");
483	print PROD $tidyfile;
484	close (PROD) \|\| die("Error Closing File: $tmp_filename $!");
485
486	# return the output filename
487	return $tmp_filename;
488	}
489
490	sub read_into_doc_obj
491	{
492	my $self = shift (@_);
493	my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
494
495	# get the input file
496	my $input_filename = $file;
497	my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
498	$suffix = lc($suffix);
499
500	if (($self->{'tidy_html'}) \|\| ($self->{'old_style_HDL'}))
501	{
502	# because the document has to be sectionalized set the description tags
503	$self->{'description_tags'} = 1;
504
505	# set the file to be tidied
506	$input_filename = &util::filename_cat($base_dir,$file) if $base_dir =~ m/\w/;
507
508	# get the tidied file
509	#my $tidy_filename = $self->tmp_tidy_file($input_filename);
510	my $tidy_filename = $self->convert_tidy_or_oldHDL_file($input_filename);
511
512	# derive tmp filename from input filename
513	my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($tidy_filename, "\\.[^\\.]+\$");
514
515	# set the new input file and base_dir to be from the tidied file
516	$file = "$tailname$suffix";
517	$base_dir = $dirname;
518	}
519
520	# call the parent read_into_doc_obj
521	my ($process_status,$doc_obj) = $self->SUPER::read_into_doc_obj($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli);
522
523	return ($process_status,$doc_obj);
524	}
525
526	sub new {
527	my ($class) = shift (@_);
528	my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
529	push(@$pluginlist, $class);
530
531	push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
532	push(@{$hashArgOptLists->{"OptList"}},$options);
533
534
535	my $self = new ReadTextFile($pluginlist,$inputargs,$hashArgOptLists);
536
537	if ($self->{'w3mir'}) {
538	$self->{'file_is_url'} = 1;
539	}
540	$self->{'aux_files'} = {};
541	$self->{'dir_num'} = 0;
542	$self->{'file_num'} = 0;
543
544	return bless $self, $class;
545	}
546
547	# may want to use (?i)\.(gif\|jpe?g\|jpe\|png\|css\|js(?:@.*)?)$
548	# if have eg <script language="javascript" src="img/lib.js@123">
549	sub get_default_block_exp {
550	my $self = shift (@_);
551
552	#return q^(?i)\.(gif\|jpe?g\|jpe\|jpg\|png\|css)$^;
553	return "";
554	}
555
556	sub get_default_process_exp {
557	my $self = shift (@_);
558
559	# the last option is an attempt to encode the concept of an html query ...
560	return q^(?i)(\.html?\|\.shtml\|\.shm\|\.asp\|\.php\d?\|\.cgi\|.+\?.+=.*)$^;
561	}
562
563	sub store_block_files
564	{
565	my $self =shift (@_);
566	my ($filename_full_path, $block_hash) = @_;
567
568	my $html_fname = $filename_full_path;
569	my @file_blocks;
570
571	my ($language, $encoding) = $self->textcat_get_language_encoding ($filename_full_path);
572
573	# read in file ($text will be in utf8)
574	my $raw_text = "";
575	$self->read_file_no_decoding ($filename_full_path, \$raw_text);
576
577	my $textref = \$raw_text;
578	my $opencom = '(?:<!--\|<!(?:—\|\|--))';
579	my $closecom = '(?:-->\|(?:—\|\|--)>)';
580	$$textref =~ s/$opencom(.*?)$closecom//gs;
581
582	my $attval = "\\\"[^\\\"]+\\\"\|[^\\s>]+";
583	my @img_matches = ($$textref =~ m/<img[^>]?src\s=\s($attval)[^>]>/igs);
584	my @usemap_matches = ($$textref =~ m/<img[^>]?usemap\s=\s($attval)[^>]>/igs);
585	my @link_matches = ($$textref =~ m/<link[^>]?href\s=\s($attval)[^>]>/igs);
586	my @embed_matches = ($$textref =~ m/<embed[^>]?src\s=\s($attval)[^>]>/igs);
587	my @tabbg_matches = ($$textref =~ m/<(?:table\|tr\|td)[^>]?background\s=\s($attval)[^>]>/igs);
588	my @script_matches = ($$textref =~ m/<script[^>]?src\s=\s($attval)[^>]>/igs);
589
590	if(!defined $self->{'utf8_to_original_filename'}) {
591	# maps from utf8 converted link name -> original filename referrred to by (possibly URL-encoded) src url
592	$self->{'utf8_to_original_filename'} = {};
593	}
594
595	foreach my $link (@img_matches, @usemap_matches, @link_matches, @embed_matches, @tabbg_matches, @script_matches) {
596
597	# remove quotes from link at start and end if necessary
598	if ($link=~/^\"/) {
599	$link=~s/^\"//;
600	$link=~s/\"$//;
601	}
602
603	$link =~ s/\#.*$//s; # remove any anchor names, e.g. foo.html#name becomes foo.html
604	# some links may just be anchor names
605	next unless ($link =~ /\S+/);
606
607	if ($link !~ m@^/@ && $link !~ m/^([A-Z]:?)\\/) {
608	# Turn relative file path into full path
609	my $dirname = &File::Basename::dirname($filename_full_path);
610	$link = &util::filename_cat($dirname, $link);
611	}
612	$link = $self->eval_dir_dots($link);
613
614
615	# this is the actual filename on the filesystem (that the link refers to)
616	my $url_original_filename = $self->opt_url_decode($link);
617
618	# Convert the url_original_filename into its utf8 version. Store the utf8 link along with the url_original_filename
619	my $utf8_link = "";
620	$self->decode_text($link,$encoding,$language,\$utf8_link);
621	### my $utf8_url_encoded_link = &unicode::url_encode($utf8_link);
622
623	$self->{'utf8_to_original_filename'}->{$utf8_link} = $url_original_filename;
624	### $self->{'utf8_to_original_filename'}->{$utf8_url_encoded_link} = $url_original_filename;
625
626	# print STDERR "**** Storing block link: $link\n";
627	# print STDERR "**** URL original filename: $url_original_filename\n";
628	print STDERR "**** utf8_encoded_link to original src filename:\n\t$utf8_link\n\t".$self->{'utf8_to_original_filename'}->{$utf8_link}."\n";
629
630	$block_hash->{'file_blocks'}->{$url_original_filename} = 1;
631	}
632	}
633
634	# Given a filename in any encoding, will URL decode it to get back the original filename
635	# in the original encoding. Because this method is intended to work out the original
636	# filename, it not URL decode any filename if a file by the name of the URL-encoded*
637	# string already exists in the local folder.
638	# Return the original filename corresponding to the parameter URL-encoded filename, and
639	# a decoded flag that is set to true iff URL-decoding had to be applied.
640	sub opt_url_decode {
641	my $self = shift (@_);
642	my ($link) = @_;
643
644	# Replace %XX's in URL with decoded value if required.
645	# Note that the filename may include the %XX in some situations
646	if ($link =~ m/\%[A-F0-9]{2}/i) {
647	if (!-e $link) {
648	$link = &unicode::url_decode($link);
649	}
650	}
651
652	return $link;
653	}
654
655
656	# do plugin specific processing of doc_obj
657	sub process {
658	my $self = shift (@_);
659	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
660	my $outhandle = $self->{'outhandle'};
661
662	if ($ENV{'GSDLOS'} =~ m/^windows/i) {
663	# this makes life so much easier... perl can cope with unix-style '/'s.
664	$base_dir =~ s@(\\)+@/@g;
665	$file =~ s@(\\)+@/@g;
666	}
667
668	# reset per-doc stuff...
669	$self->{'aux_files'} = {};
670	$self->{'dir_num'} = 0;
671	$self->{'file_num'} = 0;
672
673	# process an HTML file where sections are divided by headings tags (H1, H2 ...)
674	# you can also include metadata in the format (X can be any number)
675	# <hX>Title<!--gsdl-metadata
676	# <Metadata name="name1">value1</Metadata>
677	# ...
678	# <Metadata name="nameN">valueN</Metadata>
679	#--></hX>
680	if ($self->{'sectionalise_using_h_tags'}) {
681	# description_tags should allways be activated because we convert headings to description tags
682	$self->{'description_tags'} = 1;
683
684	my $arrSections = [];
685	$$textref =~ s/<h([0-9]+)[^>]>(.?)<\/h[0-9]+>/$self->process_heading($1, $2, $arrSections, $file)/isge;
686
687	if (scalar(@$arrSections)) {
688	my $strMetadata = $self->update_section_data($arrSections, -1);
689	if (length($strMetadata)) {
690	$strMetadata = '<!--' . $strMetadata . "\n-->\n</body>";
691	$$textref =~ s/<\/body>/$strMetadata/ig;
692	}
693	}
694	}
695
696	my $cursection = $doc_obj->get_top_section();
697
698	$self->extract_metadata ($textref, $metadata, $doc_obj, $cursection)
699	unless $self->{'no_metadata'} \|\| $self->{'description_tags'};
700
701	# Store URL for page as metadata - this can be used for an
702	# altavista style search interface. The URL won't be valid
703	# unless the file structure contains the domain name (i.e.
704	# like when w3mir is used to download a website).
705
706	# URL metadata (even invalid ones) are used to support internal
707	# links, so even if 'file_is_url' is off, still need to store info
708
709	# print STDERR "#### file: $file\n";
710	my ($tailname,$dirname,$suffix) = &File::Basename::fileparse($file, "\\.[^\\.]+\$");
711	my $utf8_file = $self->filename_to_utf8_metadata($file);
712
713	my $web_url = "http://";
714	if(defined $dirname) { # local directory
715	$web_url = $web_url.$dirname.$utf8_file;
716	} else {
717	$web_url = $web_url.$utf8_file;
718	}
719	# URL encode this in unison with web_url since database lookup via Java code does not handle utf8-encoded keys
720	$web_url = &unicode::url_encode($web_url);
721	# print STDERR "#### weburl: $web_url\n";
722
723	$doc_obj->add_utf8_metadata($cursection, "URL", $web_url);
724
725	if ($self->{'file_is_url'}) {
726	$doc_obj->add_metadata($cursection, "weblink", "<a href=\"$web_url\">");
727	$doc_obj->add_metadata($cursection, "webicon", "_iconworld_");
728	$doc_obj->add_metadata($cursection, "/weblink", "</a>");
729	}
730
731	if ($self->{'description_tags'}) {
732	# remove the html header - note that doing this here means any
733	# sections defined within the header will be lost (so all <Section>
734	# tags must appear within the body of the HTML)
735	my ($head_keep) = ($$textref =~ m/^(.?)<body[^>]>/is);
736
737	$$textref =~ s/^.?<body[^>]>//is;
738	$$textref =~ s/(<\/body[^>]>\|<\/html[^>]>)//isg;
739
740	my $opencom = '(?:<!--\|<!(?:—\|\|--))';
741	my $closecom = '(?:-->\|(?:—\|\|--)>)';
742
743	my $lt = '(?:<\|<)';
744	my $gt = '(?:>\|>)';
745	my $quot = '(?:"\|"\|”\|“)';
746
747	my $dont_strip = '';
748	if ($self->{'no_strip_metadata_html'}) {
749	($dont_strip = $self->{'no_strip_metadata_html'}) =~ s{,}{\|}g;
750	}
751
752	my $found_something = 0; my $top = 1;
753	while ($$textref =~ s/^(.?)$opencom(.?)$closecom//s) {
754	my $text = $1;
755	my $comment = $2;
756	if (defined $text) {
757	# text before a comment - note that getting to here
758	# doesn't necessarily mean there are Section tags in
759	# the document
760	$self->process_section(\$text, $base_dir, $file, $doc_obj, $cursection);
761	}
762	while ($comment =~ s/$lt(.*?)$gt//s) {
763	my $tag = $1;
764	if ($tag eq "Section") {
765	$found_something = 1;
766	$cursection = $doc_obj->insert_section($doc_obj->get_end_child($cursection)) unless $top;
767	$top = 0;
768	} elsif ($tag eq "/Section") {
769	$found_something = 1;
770	$cursection = $doc_obj->get_parent_section ($cursection);
771	} elsif ($tag =~ m/^Metadata name=$quot(.*?)$quot/s) {
772	my $metaname = $1;
773	my $accumulate = $tag =~ m/mode=${quot}accumulate${quot}/ ? 1 : 0;
774	$comment =~ s/^(.*?)$lt\/Metadata$gt//s;
775	my $metavalue = $1;
776	$metavalue =~ s/^\s+//;
777	$metavalue =~ s/\s+$//;
778	# assume that no metadata value intentionally includes
779	# carriage returns or HTML tags (if they're there they
780	# were probably introduced when converting to HTML from
781	# some other format).
782	# actually some people want to have html tags in their
783	# metadata.
784	$metavalue =~ s/[\cJ\cM]/ /sg;
785	$metavalue =~ s/<[^>]+>//sg
786	unless $dont_strip && ($dont_strip eq 'all' \|\| $metaname =~ m/^($dont_strip)$/);
787	$metavalue =~ s/\s+/ /sg;
788	if ($accumulate) {
789	$doc_obj->add_utf8_metadata($cursection, $metaname, $metavalue);
790	} else {
791	$doc_obj->set_utf8_metadata_element($cursection, $metaname, $metavalue);
792	}
793	} elsif ($tag eq "Description" \|\| $tag eq "/Description") {
794	# do nothing with containing Description tags
795	} else {
796	# simple HTML tag (probably created by the conversion
797	# to HTML from some other format) - we'll ignore it and
798	# hope for the best ;-)
799	}
800	}
801	}
802	if ($cursection ne "") {
803	print $outhandle "HTMLPlugin: WARNING: $file contains unmatched <Section></Section> tags\n";
804	}
805
806	$$textref =~ s/^.?<body[^>]>//is;
807	$$textref =~ s/(<\/body[^>]>\|<\/html[^>]>)//isg;
808	if ($$textref =~ m/\S/) {
809	if (!$found_something) {
810	if ($self->{'verbosity'} > 2) {
811	print $outhandle "HTMLPlugin: WARNING: $file appears to contain no Section tags so\n";
812	print $outhandle " will be processed as a single section document\n";
813	}
814
815	# go ahead and process single-section document
816	$self->process_section($textref, $base_dir, $file, $doc_obj, $cursection);
817
818	# if document contains no Section tags we'll go ahead
819	# and extract metadata (this won't have been done
820	# above as the -description_tags option prevents it)
821	my $complete_text = $head_keep.$doc_obj->get_text($cursection);
822	$self->extract_metadata (\$complete_text, $metadata, $doc_obj, $cursection)
823	unless $self->{'no_metadata'};
824
825	} else {
826	print $outhandle "HTMLPlugin: WARNING: $file contains the following text outside\n";
827	print $outhandle " of the final closing </Section> tag. This text will\n";
828	print $outhandle " be ignored.";
829
830	my ($text);
831	if (length($$textref) > 30) {
832	$text = substr($$textref, 0, 30) . "...";
833	} else {
834	$text = $$textref;
835	}
836	$text =~ s/\n/ /isg;
837	print $outhandle " ($text)\n";
838	}
839	} elsif (!$found_something) {
840
841	if ($self->{'verbosity'} > 2) {
842	# may get to here if document contained no valid Section
843	# tags but did contain some comments. The text will have
844	# been processed already but we should print the warning
845	# as above and extract metadata
846	print $outhandle "HTMLPlugin: WARNING: $file appears to contain no Section tags and\n";
847	print $outhandle " is blank or empty. Metadata will be assigned if present.\n";
848	}
849
850	my $complete_text = $head_keep.$doc_obj->get_text($cursection);
851	$self->extract_metadata (\$complete_text, $metadata, $doc_obj, $cursection)
852	unless $self->{'no_metadata'};
853	}
854
855	} else {
856
857	# remove header and footer
858	if (!$self->{'keep_head'} \|\| $self->{'description_tags'}) {
859	$$textref =~ s/^.?<body[^>]>//is;
860	$$textref =~ s/(<\/body[^>]>\|<\/html[^>]>)//isg;
861	}
862
863	# single section document
864	$self->process_section($textref, $base_dir, $file, $doc_obj, $cursection);
865	}
866	return 1;
867	}
868
869
870	sub process_heading
871	{
872	my ($self, $nHeadNo, $strHeadingText, $arrSections, $file) = @_;
873	$strHeadingText = '' if (!defined($strHeadingText));
874
875	my $strMetadata = $self->update_section_data($arrSections, int($nHeadNo));
876
877	my $strSecMetadata = '';
878	while ($strHeadingText =~ s/<!--gsdl-metadata(.*?)-->//is)
879	{
880	$strSecMetadata .= $1;
881	}
882
883	$strHeadingText =~ s/^\s+//g;
884	$strHeadingText =~ s/\s+$//g;
885	$strSecMetadata =~ s/^\s+//g;
886	$strSecMetadata =~ s/\s+$//g;
887
888	$strMetadata .= "\n<Section>\n\t<Description>\n\t\t<Metadata name=\"Title\">" . $strHeadingText . "</Metadata>\n";
889
890	if (length($strSecMetadata)) {
891	$strMetadata .= "\t\t" . $strSecMetadata . "\n";
892	}
893
894	$strMetadata .= "\t</Description>\n";
895
896	return "<!--" . $strMetadata . "-->";
897	}
898
899
900	sub update_section_data
901	{
902	my ($self, $arrSections, $nCurTocNo) = @_;
903	my ($strBuffer, $nLast, $nSections) = ('', 0, scalar(@$arrSections));
904
905	if ($nSections == 0) {
906	push @$arrSections, $nCurTocNo;
907	return $strBuffer;
908	}
909	$nLast = $arrSections->[$nSections - 1];
910	if ($nCurTocNo > $nLast) {
911	push @$arrSections, $nCurTocNo;
912	return $strBuffer;
913	}
914	for(my $i = $nSections - 1; $i >= 0; $i--) {
915	if ($nCurTocNo <= $arrSections->[$i]) {
916	$strBuffer .= "\n</Section>";
917	pop @$arrSections;
918	}
919	}
920	push @$arrSections, $nCurTocNo;
921	return $strBuffer;
922	}
923
924
925	# note that process_section may be called multiple times for a single
926	# section (relying on the fact that add_utf8_text appends the text to any
927	# that may exist already).
928	sub process_section {
929	my $self = shift (@_);
930	my ($textref, $base_dir, $file, $doc_obj, $cursection) = @_;
931	# trap links
932	if (!$self->{'nolinks'}) {
933	# usemap="./#index" not handled correctly => change to "#index"
934	## $$textref =~ s/(<img[^>]?usemap\s=\s[\"\']?)([^\"\'>\s]+)([\"\']?[^>]>)/
935
936	$$textref =~ s/(<img[^>]?usemap\s=\s)((?:[\"][^\"]+[\"])\|(?:[\'][^\']+[\'])\|(?:[^\s\/>]+))([^>]>)/
937	$self->replace_usemap_links($1, $2, $3)/isge;
938
939	## $$textref =~ s/(<(?:a\|area\|frame\|link\|script)\s+[^>]?\s(?:href\|src)\s=\s[\"\']?)([^\"\'>\s]+)([\"\']?[^>]*>)/
940
941	$$textref =~ s/(<(?:a\|area\|frame\|link\|script)\s+[^>]?\s(?:href\|src)\s=\s)((?:[\"][^\"]+[\"])\|(?:[\'][^\']+[\'])\|(?:[^\s\/>]+))([^>]*>)/
942	$self->replace_href_links ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
943	}
944
945	# trap images
946
947	# Previously, by default, HTMLPlugin would embed <img> tags inside anchor tags
948	# i.e. <a href="image><img src="image"></a> in order to overcome a problem that
949	# turned regular text succeeding images into links. That is, by embedding <imgs>
950	# inside <a href=""></a>, the text following images were no longer misbehaving.
951	# However, there would be many occasions whereby images were not meant to link
952	# to their source images but where the images would link to another web page.
953	# To allow this, the no_image_links option was introduced: it would prevent
954	# the behaviour of embedding images into links that referenced the source images.
955
956	# Somewhere along the line, the problem of normal text turning into links when
957	# such text followed images which were not embedded in <a href=""></a> ceased
958	# to occur. This is why the following lines have been commented out (as well as
959	# two lines in replace_images). They appear to no longer apply.
960
961	# If at any time, there is a need for having images embedded in <a> anchor tags,
962	# then it might be better to turn that into an HTMLPlugin option rather than make
963	# it the default behaviour. Also, eventually, no_image_links needs to become
964	# a deprecated option for HTMLPlugin as it has now become the default behaviour.
965
966	#if(!$self->{'no_image_links'}){
967	$$textref =~ s/(<(?:img\|embed\|table\|tr\|td)[^>]?(?:src\|background)\s=\s)((?:[\"][^\"]+[\"])\|(?:[\'][^\']+[\'])\|(?:[^\s\/>]+))([^>]>)/
968	$self->replace_images ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
969	#}
970
971	# add text to document object
972	# turn \ into \\ so that the rest of greenstone doesn't think there
973	# is an escape code following. (Macro parsing loses them...)
974	$$textref =~ s/\\/\\\\/go;
975
976	$doc_obj->add_utf8_text($cursection, $$textref);
977	}
978
979	sub replace_images {
980	my $self = shift (@_);
981	my ($front, $link, $back, $base_dir,
982	$file, $doc_obj, $section) = @_;
983
984	# remove quotes from link at start and end if necessary
985	if ($link=~/^[\"\']/) {
986	$link=~s/^[\"\']//;
987	$link=~s/[\"\']$//;
988	$front.='"';
989	$back="\"$back";
990	}
991
992	$link =~ s/\n/ /g;
993
994	# Hack to overcome Windows wv 0.7.1 bug that causes embedded images to be broken
995	# If the Word file path has spaces in it, wv messes up and you end up with
996	# absolute paths for the images, and without the "file://" prefix
997	# So check for this special case and massage the data to be correct
998	if ($ENV{'GSDLOS'} =~ m/^windows/i && $self->{'plugin_type'} eq "WordPlug" && $link =~ m/^[A-Za-z]\:\\/) {
999	$link =~ s/^.*\\([^\\]+)$/$1/;
1000	}
1001
1002	my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
1003
1004	my $img_file = $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section);
1005
1006	# print STDERR "**** link = $link\n";
1007	# print STDERR "**** href = $href\n";
1008	# print STDERR "**** img_file = $img_file\n";
1009
1010	my $anchor_name = $img_file;
1011	#$anchor_name =~ s/^.*\///;
1012	#$anchor_name = "<a name=\"$anchor_name\" ></a>";
1013
1014	my $image_link = $front . $img_file .$back;
1015	return $image_link;
1016
1017	# The reasons for why the following two lines are no longer necessary can be
1018	# found in subroutine process_section
1019	#my $anchor_link = "<a href=\"$img_file\" >".$image_link."</a>";
1020	#return $anchor_link;
1021
1022	#return $front . $img_file . $back . $anchor_name;
1023	}
1024
1025	sub replace_href_links {
1026	my $self = shift (@_);
1027	my ($front, $link, $back, $base_dir, $file, $doc_obj, $section) = @_;
1028
1029	# remove quotes from link at start and end if necessary
1030	if ($link=~/^[\"\']/) {
1031	$link=~s/^[\"\']//;
1032	$link=~s/[\"\']$//;
1033	$front.='"';
1034	$back="\"$back";
1035	}
1036
1037	# attempt to sort out targets - frames are not handled
1038	# well in this plugin and some cases will screw things
1039	# up - e.g. the _parent target (so we'll just remove
1040	# them all ;-)
1041	$front =~ s/(target=\"?)_top(\"?)/$1_gsdltop_$2/is;
1042	$back =~ s/(target=\"?)_top(\"?)/$1_gsdltop_$2/is;
1043	$front =~ s/target=\"?_parent\"?//is;
1044	$back =~ s/target=\"?_parent\"?//is;
1045
1046	return $front . $link . $back if $link =~ m/^\#/s;
1047	$link =~ s/\n/ /g;
1048
1049	# Find file referred to by $link on file system
1050	# This is more complicated than it sounds when char encodings
1051	# is taken in to account
1052	## &unicode::ensure_utf8(\$link);
1053	## $link = &unicode::url_encode($link);
1054	# print STDERR "#### filepath: ".&util::filename_cat($base_dir,$file)."\n";
1055	# print STDERR "#### link: $link\n";
1056
1057	my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
1058	# print STDERR "#### href: $href\n";
1059
1060	# href may use '\'s where '/'s should be on Windows
1061	$href =~ s/\\/\//g;
1062
1063	## $href = &unicode::url_decode($href);
1064	# print STDERR "#### href again: $href\n";
1065	my ($filename) = $href =~ m/^(?:.?):(?:\/\/)?(.)/;
1066
1067
1068	##### leave all these links alone (they won't be picked up by intermediate
1069	##### pages). I think that's safest when dealing with frames, targets etc.
1070	##### (at least until I think of a better way to do it). Problems occur with
1071	##### mailto links from within small frames, the intermediate page is displayed
1072	##### within that frame and can't be seen. There is still potential for this to
1073	##### happen even with html pages - the solution seems to be to somehow tell
1074	##### the browser from the server side to display the page being sent (i.e.
1075	##### the intermediate page) in the top level window - I'm not sure if that's
1076	##### possible - the following line should probably be deleted if that can be done
1077	return $front . $link . $back if $href =~ m/^(mailto\|news\|gopher\|nntp\|telnet\|javascript):/is;
1078
1079
1080	if (($rl == 0) \|\| ($filename =~ m/$self->{'process_exp'}/) \|\|
1081	($href =~ m/\/$/) \|\| ($href =~ m/^(mailto\|news\|gopher\|nntp\|telnet\|javascript):/i)) {
1082	# URL encode this in unison with web_url since database lookup via Java code does not handle utf8-encoded keys
1083	# No longer need to &ghtml::urlsafe($href) for special characters, since the entire thing is going to be URL-encoded
1084	$href = &unicode::url_encode($href);
1085	return $front . "_httpextlink_&rl=" . $rl . "&href=" . $href . $hash_part . $back;
1086	} else {
1087	# link is to some other type of file (eg image) so we'll
1088	# need to associate that file
1089	return $front . $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section) . $back;
1090	}
1091	}
1092
1093	sub add_file {
1094	my $self = shift (@_);
1095	my ($href, $rl, $hash_part, $base_dir, $doc_obj, $section) = @_;
1096	my ($newname);
1097
1098	my $filename = $href;
1099	if ($base_dir eq "") {
1100	# remove http:/ thereby leaving one slash at the start
1101	$filename =~ s/^[^:]*:\///;
1102	}
1103	else {
1104	# remove http://
1105	$filename =~ s/^[^:]*:\/\///;
1106	}
1107
1108	$filename = &util::filename_cat($base_dir, $filename);
1109	# print STDERR "**** filename: $filename\n";
1110	# Replace %XX's in URL with decoded value if required. Note that the filename may include the %XX in some
1111	# situations. If the original file's name was in URL encoding, the following method will not decode it.
1112	my $utf8_filename = $filename;
1113
1114	# print STDERR "*** filename before URL decoding: $filename\n";
1115	$filename = $self->opt_url_decode($utf8_filename);
1116	# print STDERR "*** filename after URL decoding: $filename\n\n";
1117
1118	# some special processing if the intended filename was converted to utf8, but
1119	# the actual file still needs to be renamed
1120	if (!-e $filename) {
1121	# try the original filename stored in map
1122	my $original_filename = $self->{'utf8_to_original_filename'}->{$filename};
1123	if (-e $original_filename) {
1124	$filename = $original_filename;
1125	}
1126	}
1127
1128	my ($ext) = $filename =~ m/(\.[^\.]*)$/;
1129
1130	if ($rl == 0) {
1131	if ((!defined $ext) \|\| ($ext !~ m/$self->{'assoc_files'}/)) {
1132	return "_httpextlink_&rl=0&el=prompt&href=" . $href . $hash_part;
1133	}
1134	else {
1135	return "_httpextlink_&rl=0&el=direct&href=" . $href . $hash_part;
1136	}
1137	}
1138
1139	if ((!defined $ext) \|\| ($ext !~ m/$self->{'assoc_files'}/)) {
1140	return "_httpextlink_&rl=" . $rl . "&href=" . $href . $hash_part;
1141	}
1142	if ($self->{'rename_assoc_files'}) {
1143	if (defined $self->{'aux_files'}->{$href}) {
1144	$newname = $self->{'aux_files'}->{$href}->{'dir_num'} . "/" .
1145	$self->{'aux_files'}->{$href}->{'file_num'} . $ext;
1146	} else {
1147	$newname = $self->{'dir_num'} . "/" . $self->{'file_num'} . $ext;
1148	$self->{'aux_files'}->{$href} = {'dir_num' => $self->{'dir_num'}, 'file_num' => $self->{'file_num'}};
1149	$self->inc_filecount ();
1150	}
1151	$doc_obj->associate_file($filename, $newname, undef, $section);
1152	return "_httpdocimg_/$newname";
1153	} else {
1154	($newname) = $utf8_filename =~ m/([^\/\\]*)$/;
1155
1156	# print STDERR "Before url encoding newname: $newname\n";
1157	# Make sure this name uses only ASCII characters
1158	# => use URL encoding, as it preserves original encoding
1159	$newname = &unicode::url_encode($newname);
1160	# print STDERR "After url encoding newname: $newname\n";
1161	# print STDERR "*** Real name and converted filename:\n\t$filename\n\t$newname\n";
1162
1163	$doc_obj->associate_file($filename, $newname, undef, $section);
1164
1165	# Since the generated image will be URL-encoded to avoid file-system/browser mess-ups
1166	# of filenames, URL-encode the additional percent signs of the URL-encoded filename
1167	my $newname_url = $newname;
1168	$newname_url =~ s/%/%25/g;
1169	return "_httpdocimg_/$newname_url";
1170	}
1171	}
1172
1173
1174	sub format_link {
1175	my $self = shift (@_);
1176	my ($link, $base_dir, $file) = @_;
1177
1178	my ($before_hash, $hash_part) = $link =~ m/^([^\#])(\#?.)$/;
1179
1180	$hash_part = "" if !defined $hash_part;
1181	if (!defined $before_hash \|\| $before_hash !~ m/[\w\.\/]/) {
1182	my $outhandle = $self->{'outhandle'};
1183	print $outhandle "HTMLPlugin: ERROR - badly formatted tag ignored ($link)\n"
1184	if $self->{'verbosity'};
1185	return ($link, "", 0);
1186	}
1187
1188	if ($before_hash =~ s@^((?:http\|ftp\|file)://)@@i) {
1189	my $type = $1;
1190
1191	if ($link =~ m/^(http\|ftp):/i) {
1192	# Turn url (using /) into file name (possibly using \ on windows)
1193	my @http_dir_split = split('/', $before_hash);
1194	$before_hash = &util::filename_cat(@http_dir_split);
1195	}
1196
1197	$before_hash = $self->eval_dir_dots($before_hash);
1198
1199	my $linkfilename = &util::filename_cat ($base_dir, $before_hash);
1200
1201	my $rl = 0;
1202	$rl = 1 if (-e $linkfilename);
1203
1204	# make sure there's a slash on the end if it's a directory
1205	if ($before_hash !~ m/\/$/) {
1206	$before_hash .= "/" if (-d $linkfilename);
1207	}
1208	return ($type . $before_hash, $hash_part, $rl);
1209
1210	} elsif ($link !~ m/^(mailto\|news\|gopher\|nntp\|telnet\|javascript):/i && $link !~ m/^\//) {
1211
1212	if ($before_hash =~ s@^/@@ \|\| $before_hash =~ m/\\/) {
1213
1214	# the first directory will be the domain name if file_is_url
1215	# to generate archives, otherwise we'll assume all files are
1216	# from the same site and base_dir is the root
1217
1218	if ($self->{'file_is_url'}) {
1219	my @dirs = split /[\/\\]/, $file;
1220	my $domname = shift (@dirs);
1221	$before_hash = &util::filename_cat($domname, $before_hash);
1222	$before_hash =~ s@\\@/@g; # for windows
1223	}
1224	else
1225	{
1226	# see if link shares directory with source document
1227	# => turn into relative link if this is so!
1228
1229	if ($ENV{'GSDLOS'} =~ m/^windows/i) {
1230	# too difficult doing a pattern match with embedded '\'s...
1231	my $win_before_hash=$before_hash;
1232	$win_before_hash =~ s@(\\)+@/@g;
1233	# $base_dir is already similarly "converted" on windows.
1234	if ($win_before_hash =~ s@^$base_dir/@@o) {
1235	# if this is true, we removed a prefix
1236	$before_hash=$win_before_hash;
1237	}
1238	}
1239	else {
1240	# before_hash has lost leading slash by this point,
1241	# -> add back in prior to substitution with $base_dir
1242	$before_hash = "/$before_hash";
1243
1244	$before_hash = &util::filename_cat("",$before_hash);
1245	$before_hash =~ s@^$base_dir/@@;
1246	}
1247	}
1248	} else {
1249	# Turn relative file path into full path
1250	my $dirname = &File::Basename::dirname($file);
1251	$before_hash = &util::filename_cat($dirname, $before_hash);
1252	$before_hash = $self->eval_dir_dots($before_hash);
1253
1254	# print STDERR "#### before_hash: $before_hash\n";
1255	}
1256
1257	my $linkfilename = &util::filename_cat ($base_dir, $before_hash);
1258	# make sure there's a slash on the end if it's a directory
1259	if ($before_hash !~ m/\/$/) {
1260	$before_hash .= "/" if (-d $linkfilename);
1261	}
1262	return ("http://" . $before_hash, $hash_part, 1);
1263	} else {
1264	# mailto, news, nntp, telnet, javascript or gopher link
1265	return ($before_hash, "", 0);
1266	}
1267	}
1268
1269	sub extract_first_NNNN_characters {
1270	my $self = shift (@_);
1271	my ($textref, $doc_obj, $thissection) = @_;
1272
1273	foreach my $size (split /,/, $self->{'first'}) {
1274	my $tmptext = $$textref;
1275	# skip to the body
1276	$tmptext =~ s/.<body[^>]>//i;
1277	# remove javascript
1278	$tmptext =~ s@<script.*?</script>@ @sig;
1279	$tmptext =~ s/<[^>]*>/ /g;
1280	$tmptext =~ s/ / /g;
1281	$tmptext =~ s/^\s+//;
1282	$tmptext =~ s/\s+$//;
1283	$tmptext =~ s/\s+/ /gs;
1284	$tmptext = &unicode::substr ($tmptext, 0, $size);
1285	$tmptext =~ s/\s\S*$/…/; # adds an ellipse (...)
1286	$doc_obj->add_utf8_metadata ($thissection, "First$size", $tmptext);
1287	}
1288	}
1289
1290
1291	sub extract_metadata {
1292	my $self = shift (@_);
1293	my ($textref, $metadata, $doc_obj, $section) = @_;
1294	my $outhandle = $self->{'outhandle'};
1295	# if we don't want metadata, we may as well not be here ...
1296	return if (!defined $self->{'metadata_fields'});
1297
1298	# metadata fields to extract/save. 'key' is the (lowercase) name of the
1299	# html meta, 'value' is the metadata name for greenstone to use
1300	my %find_fields = ();
1301
1302	my %creator_fields = (); # short-cut for lookups
1303
1304
1305	foreach my $field (split /,/, $self->{'metadata_fields'}) {
1306	$field =~ s/^\s+//; # remove leading whitespace
1307	$field =~ s/\s+$//; # remove trailing whitespace
1308
1309	# support tag<tagname>
1310	if ($field =~ m/^(.?)<(.?)>$/) {
1311	# "$2" is the user's preferred gs metadata name
1312	$find_fields{lc($1)}=$2; # lc = lowercase
1313	} else { # no <tagname> for mapping
1314	# "$field" is the user's preferred gs metadata name
1315	$find_fields{lc($field)}=$field; # lc = lowercase
1316	}
1317	}
1318
1319	if (defined $self->{'hunt_creator_metadata'} &&
1320	$self->{'hunt_creator_metadata'} == 1 ) {
1321	my @extra_fields =
1322	(
1323	'author',
1324	'author.email',
1325	'creator',
1326	'dc.creator',
1327	'dc.creator.corporatename',
1328	);
1329
1330	# add the creator_metadata fields to search for
1331	foreach my $field (@extra_fields) {
1332	$creator_fields{$field}=0; # add to lookup hash
1333	}
1334	}
1335
1336
1337	# find the header in the html file, which has the meta tags
1338	$$textref =~ m@<head>(.*?)</head>@si;
1339
1340	my $html_header=$1;
1341
1342	# go through every <meta... tag defined in the html and see if it is
1343	# one of the tags we want to match.
1344
1345	# special case for title - we want to remember if its been found
1346	my $found_title = 0;
1347	# this assumes that ">" won't appear. (I don't think it's allowed to...)
1348	$html_header =~ m/^/; # match the start of the string, for \G assertion
1349
1350	while ($html_header =~ m/\G.?<meta(.?)>/sig) {
1351	my $metatag=$1;
1352	my ($tag, $value);
1353
1354	# find the tag name
1355	$metatag =~ m/(?:name\|http-equiv)\s=\s([\"\'])?(.*?)\1/is;
1356	$tag=$2;
1357	# in case they're not using " or ', but they should...
1358	if (! $tag) {
1359	$metatag =~ m/(?:name\|http-equiv)\s=\s([^\s\>]+)/is;
1360	$tag=$1;
1361	}
1362
1363	if (!defined $tag) {
1364	print $outhandle "HTMLPlugin: can't find NAME in \"$metatag\"\n";
1365	next;
1366	}
1367
1368	# don't need to assign this field if it was passed in from a previous
1369	# (recursive) plugin
1370	if (defined $metadata->{$tag}) {next}
1371
1372	# find the tag content
1373	$metatag =~ m/content\s=\s([\"\'])?(.*?)\1/is;
1374	$value=$2;
1375
1376	if (! $value) {
1377	$metatag =~ m/(?:name\|http-equiv)\s=\s([^\s\>]+)/is;
1378	$value=$1;
1379	}
1380	if (!defined $value) {
1381	print $outhandle "HTMLPlugin: can't find VALUE in \"$metatag\"\n";
1382	next;
1383	}
1384
1385	# clean up and add
1386	$value =~ s/\s+/ /gs;
1387	chomp($value); # remove trailing \n, if any
1388	if (exists $creator_fields{lc($tag)}) {
1389	# map this value onto greenstone's "Creator" metadata
1390	$tag='Creator';
1391	} elsif (!exists $find_fields{lc($tag)}) {
1392	next; # don't want this tag
1393	} else {
1394	# get the user's preferred capitalisation
1395	$tag = $find_fields{lc($tag)};
1396	}
1397	if (lc($tag) eq "title") {
1398	$found_title = 1;
1399	}
1400	print $outhandle " extracted \"$tag\" metadata \"$value\"\n"
1401	if ($self->{'verbosity'} > 2);
1402	if ($tag =~ m/date.*/i){
1403	$tag = lc($tag);
1404	}
1405	$doc_obj->add_utf8_metadata($section, $tag, $value);
1406
1407	}
1408
1409	# TITLE: extract the document title
1410	if (exists $find_fields{'title'} && !$found_title) {
1411	# we want a title, and didn't find one in the meta tags
1412	# see if there's a <title> tag
1413	my $title;
1414	my $from = ""; # for debugging output only
1415	if ($html_header =~ m/<title[^>]>([^<]+)<\/title[^>]>/is) {
1416	$title = $1;
1417	$from = "<title> tags";
1418	}
1419
1420	if (!defined $title) {
1421	$from = "first 100 chars";
1422	# if no title use first 100 or so characters
1423	$title = $$textref;
1424	$title =~ s/^\xFE\xFF//; # Remove unicode byte order mark
1425	$title =~ s/^.*?<body>//si;
1426	# ignore javascript!
1427	$title =~ s@<script.*?</script>@ @sig;
1428	$title =~ s/<\/([^>]+)><\1>//g; # (eg) </b><b> - no space
1429	$title =~ s/<[^>]*>/ /g; # remove all HTML tags
1430	$title = substr ($title, 0, 100);
1431	$title =~ s/\s\S*$/.../;
1432	}
1433	$title =~ s/<[^>]*>/ /g; # remove html tags
1434	$title =~ s/ / /g;
1435	$title =~ s/(?: \|\xc2\xa0)/ /g; # utf-8 for nbsp...
1436	$title =~ s/\s+/ /gs; # collapse multiple spaces
1437	$title =~ s/^\s*//; # remove leading spaces
1438	$title =~ s/\s*$//; # remove trailing spaces
1439
1440	$title =~ s/^$self->{'title_sub'}// if ($self->{'title_sub'});
1441	$title =~ s/^\s+//s; # in case title_sub introduced any...
1442	$doc_obj->add_utf8_metadata ($section, 'Title', $title);
1443	print $outhandle " extracted Title metadata \"$title\" from $from\n"
1444	if ($self->{'verbosity'} > 2);
1445	}
1446
1447	# add FileFormat metadata
1448	$doc_obj->add_metadata($section,"FileFormat", "HTML");
1449
1450	# Special, for metadata names such as tagH1 - extracts
1451	# the text between the first <H1> and </H1> tags into "H1" metadata.
1452
1453	foreach my $field (keys %find_fields) {
1454	if ($field !~ m/^tag([a-z0-9]+)$/i) {next}
1455	my $tag = $1;
1456	if ($$textref =~ m@<$tag[^>]>(.?)</$tag[^>]*>@g) {
1457	my $content = $1;
1458	$content =~ s/ / /g;
1459	$content =~ s/<[^>]*>/ /g;
1460	$content =~ s/^\s+//;
1461	$content =~ s/\s+$//;
1462	$content =~ s/\s+/ /gs;
1463	if ($content) {
1464	$tag=$find_fields{"tag$tag"}; # get the user's capitalisation
1465	$tag =~ s/^tag//i;
1466	$doc_obj->add_utf8_metadata ($section, $tag, $content);
1467	print $outhandle " extracted \"$tag\" metadata \"$content\"\n"
1468	if ($self->{'verbosity'} > 2);
1469	}
1470	}
1471	}
1472	}
1473
1474
1475	# evaluate any "../" to next directory up
1476	# evaluate any "./" as here
1477	sub eval_dir_dots {
1478	my $self = shift (@_);
1479	my ($filename) = @_;
1480	my $dirsep_os = &util::get_os_dirsep();
1481	my @dirsep = split(/$dirsep_os/,$filename);
1482
1483	my @eval_dirs = ();
1484	foreach my $d (@dirsep) {
1485	if ($d eq "..") {
1486	pop(@eval_dirs);
1487
1488	} elsif ($d eq ".") {
1489	# do nothing!
1490
1491	} else {
1492	push(@eval_dirs,$d);
1493	}
1494	}
1495
1496	# Need to fiddle with number of elements in @eval_dirs if the
1497	# first one is the empty string. This is because of a
1498	# modification to util::filename_cat that supresses the addition
1499	# of a leading '/' character (or \ if windows) (intended to help
1500	# filename cat with relative paths) if the first entry in the
1501	# array is the empty string. Making the array start with two
1502	# empty strings is a way to defeat this "smart" option.
1503	#
1504	if (scalar(@eval_dirs) > 0) {
1505	if ($eval_dirs[0] eq ""){
1506	unshift(@eval_dirs,"");
1507	}
1508	}
1509	return &util::filename_cat(@eval_dirs);
1510	}
1511
1512	sub replace_usemap_links {
1513	my $self = shift (@_);
1514	my ($front, $link, $back) = @_;
1515
1516	# remove quotes from link at start and end if necessary
1517	if ($link=~/^[\"\']/) {
1518	$link=~s/^[\"\']//;
1519	$link=~s/[\"\']$//;
1520	$front.='"';
1521	$back="\"$back";
1522	}
1523
1524	$link =~ s/^\.\///;
1525	return $front . $link . $back;
1526	}
1527
1528	sub inc_filecount {
1529	my $self = shift (@_);
1530
1531	if ($self->{'file_num'} == 1000) {
1532	$self->{'dir_num'} ++;
1533	$self->{'file_num'} = 0;
1534	} else {
1535	$self->{'file_num'} ++;
1536	}
1537	}
1538
1539
1540	# Extend read_file so that strings like é are
1541	# converted to UTF8 internally.
1542	#
1543	# We don't convert < or > or & or " in case
1544	# they interfere with the GML files
1545
1546	sub read_file {
1547	my $self = shift(@_);
1548	my ($filename, $encoding, $language, $textref) = @_;
1549
1550	$self->SUPER::read_file($filename, $encoding, $language, $textref);
1551
1552	# Convert entities to their UTF8 equivalents
1553	$$textref =~ s/&(lt\|gt\|amp\|quot\|nbsp);/&z$1;/go;
1554	$$textref =~ s/&([^;]+);/&ghtml::getcharequiv($1,1)/gseo;
1555	$$textref =~ s/&z(lt\|gt\|amp\|quot\|nbsp);/&$1;/go;
1556	}
1557
1558	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: