Context Navigation

source: gsdl/trunk/perllib/plugins/HTMLPlug.pm@ 15838

Last change on this file since 15838 was 15838, checked in by ak19, 16 years ago
Updated the regular expression on img src link to make sure that single quotes embedded inside the link portion are not considered as marking the end of the link). Now it checks that the start and quotation marks match, i.e. starting single quote for a link should be matched by ending single quote and starting double quote with ending double quote
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 47.4 KB

Line
1	###########################################################################
2	#
3	# HTMLPlug.pm -- basic html plugin
4	#
5	# A component of the Greenstone digital library software
6	# from the New Zealand Digital Library Project at the
7	# University of Waikato, New Zealand.
8	#
9	# Copyright (C) 1999 New Zealand Digital Library Project
10	#
11	# This program is free software; you can redistribute it and/or modify
12	# it under the terms of the GNU General Public License as published by
13	# the Free Software Foundation; either version 2 of the License, or
14	# (at your option) any later version.
15	#
16	# This program is distributed in the hope that it will be useful,
17	# but WITHOUT ANY WARRANTY; without even the implied warranty of
18	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	# GNU General Public License for more details.
20	#
21	# You should have received a copy of the GNU General Public License
22	# along with this program; if not, write to the Free Software
23	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24	#
25	###########################################################################
26
27	#
28	# Note that this plugin handles frames only in a very simple way
29	# i.e. each frame is treated as a separate document. This means
30	# search results will contain links to individual frames rather
31	# than linking to the top level frameset.
32	# There may also be some problems caused by the _parent target
33	# (it's removed by this plugin)
34	#
35
36	package HTMLPlug;
37
38	use BasPlug;
39	use ghtml;
40	use unicode;
41	use util;
42	use XMLParser;
43
44	use Image::Size;
45	use File::Copy;
46
47	sub BEGIN {
48	@HTMLPlug::ISA = ('BasPlug');
49	}
50
51	use strict; # every perl program should have this!
52	no strict 'refs'; # make an exception so we can use variables as filehandles
53
54	my $arguments =
55	[ { 'name' => "process_exp",
56	'desc' => "{BasPlug.process_exp}",
57	'type' => "regexp",
58	'deft' => &get_default_process_exp() },
59	{ 'name' => "block_exp",
60	'desc' => "{BasPlug.block_exp}",
61	'type' => 'regexp',
62	'deft' => &get_default_block_exp() },
63	{ 'name' => "nolinks",
64	'desc' => "{HTMLPlug.nolinks}",
65	'type' => "flag" },
66	{ 'name' => "keep_head",
67	'desc' => "{HTMLPlug.keep_head}",
68	'type' => "flag" },
69	{ 'name' => "no_metadata",
70	'desc' => "{HTMLPlug.no_metadata}",
71	'type' => "flag" },
72	{ 'name' => "metadata_fields",
73	'desc' => "{HTMLPlug.metadata_fields}",
74	'type' => "string",
75	'deft' => "Title" },
76	{ 'name' => "hunt_creator_metadata",
77	'desc' => "{HTMLPlug.hunt_creator_metadata}",
78	'type' => "flag" },
79	{ 'name' => "file_is_url",
80	'desc' => "{HTMLPlug.file_is_url}",
81	'type' => "flag" },
82	{ 'name' => "assoc_files",
83	'desc' => "{HTMLPlug.assoc_files}",
84	'type' => "regexp",
85	'deft' => &get_default_block_exp() },
86	{ 'name' => "rename_assoc_files",
87	'desc' => "{HTMLPlug.rename_assoc_files}",
88	'type' => "flag" },
89	{ 'name' => "title_sub",
90	'desc' => "{HTMLPlug.title_sub}",
91	'type' => "string",
92	'deft' => "" },
93	{ 'name' => "description_tags",
94	'desc' => "{HTMLPlug.description_tags}",
95	'type' => "flag" },
96	# retain this for backward compatibility (w3mir option was replaced by
97	# file_is_url)
98	{ 'name' => "w3mir",
99	# 'desc' => "{HTMLPlug.w3mir}",
100	'type' => "flag",
101	'hiddengli' => "yes"},
102	{ 'name' => "no_strip_metadata_html",
103	'desc' => "{HTMLPlug.no_strip_metadata_html}",
104	'type' => "string",
105	'deft' => "",
106	'reqd' => "no"},
107	{ 'name' => "sectionalise_using_h_tags",
108	'desc' => "{HTMLPlug.sectionalise_using_h_tags}",
109	'type' => "flag" },
110	{ 'name' => "use_realistic_book",
111	'desc' => "{HTMLPlug.tidy_html}",
112	'type' => "flag"},
113	{ 'name' => "is_old_HDL_tags",
114	'desc' => "{HTMLPlug.old_style_HDL}",
115	'type' => "flag"},
116	{ 'name' => "no_image_links", # in future think about removing this option,
117	'desc' => "{HTMLPlug.no_image_links}", # since it has become the default behaviour
118	'type' => "flag"},
119	];
120
121	my $options = { 'name' => "HTMLPlug",
122	'desc' => "{HTMLPlug.desc}",
123	'abstract' => "no",
124	'inherits' => "yes",
125	'args' => $arguments };
126
127
128	sub HB_read_html_file {
129	my $self = shift (@_);
130	my ($htmlfile, $text) = @_;
131
132	# load in the file
133	if (!open (FILE, $htmlfile)) {
134	print STDERR "ERROR - could not open $htmlfile\n";
135	return;
136	}
137
138	my $foundbody = 0;
139	$self->HB_gettext (\$foundbody, $text, "FILE");
140	close FILE;
141
142	# just in case there was no <body> tag
143	if (!$foundbody) {
144	$foundbody = 1;
145	open (FILE, $htmlfile) \|\| return;
146	$self->HB_gettext (\$foundbody, $text, "FILE");
147	close FILE;
148	}
149	# text is in utf8
150	}
151
152	# converts the text to utf8, as ghtml does that for é etc.
153	sub HB_gettext {
154	my $self = shift (@_);
155	my ($foundbody, $text, $handle) = @_;
156
157	my $line = "";
158	while (defined ($line = <$handle>)) {
159	# look for body tag
160	if (!$$foundbody) {
161	if ($line =~ s/^.<body[^>]>//i) {
162	$$foundbody = 1;
163	} else {
164	next;
165	}
166	}
167
168	# check for symbol fonts
169	if ($line =~ /<font [^>]?face\s=\s*\"?(\w+)\"?/i) {
170	my $font = $1;
171	print STDERR "HBPlug::HB_gettext - warning removed font $font\n"
172	if ($font !~ /^arial$/i);
173	}
174
175	$$text .= $line;
176	}
177
178	if ($self->{'input_encoding'} eq "iso_8859_1") {
179	# convert to utf-8
180	$$text=&unicode::unicode2utf8(&unicode::convert2unicode("iso_8859_1", $text));
181	}
182	# convert any alphanumeric character entities to their utf-8
183	# equivalent for indexing purposes
184	#&ghtml::convertcharentities ($$text);
185
186	$$text =~ s/\s+/ /g; # remove \n's
187	}
188
189	sub HB_clean_section {
190	my $self = shift (@_);
191	my ($section) = @_;
192
193	# remove tags without a starting tag from the section
194	my ($tag, $tagstart);
195	while ($section =~ /<\/([^>]{1,10})>/) {
196	$tag = $1;
197	$tagstart = index($section, "<$tag");
198	last if (($tagstart >= 0) && ($tagstart < index($section, "<\/$tag")));
199	$section =~ s/<\/$tag>//;
200	}
201
202	# remove extra paragraph tags
203	while ($section =~ s/<p\b[^>]>\s<p\b/<p/ig) {}
204
205	# remove extra stuff at the end of the section
206	while ($section =~ s/(<u>\|<i>\|<b>\|<p\b[^>]*>\| \|\s)$//i) {}
207
208	# add a newline at the beginning of each paragraph
209	$section =~ s/(.)\s*<p\b/$1\n\n<p/gi;
210
211	# add a newline every 80 characters at a word boundary
212	# Note: this regular expression puts a line feed before
213	# the last word in each section, even when it is not
214	# needed.
215	$section =~ s/(.{1,80})\s/$1\n/g;
216
217	# fix up the image links
218	$section =~ s/<img[^>]?src=\"?([^\">]+)\"?[^>]>/
219	<center><img src=\"$1\" \/><\/center><br\/>/ig;
220	$section =~ s/<<I>>\s*([^\.]+\.(png\|jpg\|gif))/
221	<center><img src=\"$1\" \/><\/center><br\/>/ig;
222
223	return $section;
224	}
225
226	# Will convert the oldHDL format to the new HDL format (using the Section tag)
227	sub convert_to_newHDLformat
228	{
229	my $self = shift (@_);
230	my ($file,$cnfile) = @_;
231	my $input_filename = $file;
232	my $tmp_filename = $cnfile;
233
234	# write HTML tmp file with new HDL format
235	open (PROD, ">$tmp_filename") \|\| die("Error Writing to File: $tmp_filename $!");
236
237	# read in the file and do basic html cleaning (removing header etc)
238	my $html = "";
239	$self->HB_read_html_file ($input_filename, \$html);
240
241	# process the file one section at a time
242	my $curtoclevel = 1;
243	my $firstsection = 1;
244	my $toclevel = 0;
245	while (length ($html) > 0) {
246	if ($html =~ s/^.?(?:<p\b[^>]>)?((<b>\|<i>\|<u>\|\s))<<TOC(\d+)>>\s(.*?)<p\b/<p/i) {
247	$toclevel = $3;
248	my $title = $4;
249	my $sectiontext = "";
250	if ($html =~ s/^(.?)((?:<p\b[^>]>)?((<b>\|<i>\|<u>\|\s)*)<<TOC\d+>>)/$2/i) {
251	$sectiontext = $1;
252	} else {
253	$sectiontext = $html;
254	$html = "";
255	}
256
257	# remove tags and extra spaces from the title
258	$title =~ s/<\/?[^>]+>//g;
259	$title =~ s/^\s+\|\s+$//g;
260
261	# close any sections below the current level and
262	# create a new section (special case for the firstsection)
263	print PROD "<!--\n";
264	while (($curtoclevel > $toclevel) \|\|
265	(!$firstsection && $curtoclevel == $toclevel)) {
266	$curtoclevel--;
267	print PROD "</Section>\n";
268	}
269	if ($curtoclevel+1 < $toclevel) {
270	print STDERR "WARNING - jump in toc levels in $input_filename " .
271	"from $curtoclevel to $toclevel\n";
272	}
273	while ($curtoclevel < $toclevel) {
274	$curtoclevel++;
275	}
276
277	if ($curtoclevel == 1) {
278	# add the header tag
279	print PROD "-->\n";
280	print PROD "<HTML>\n<HEAD>\n<TITLE>$title</TITLE>\n</HEAD>\n<BODY>\n";
281	print PROD "<!--\n";
282	}
283
284	print PROD "<Section>\n\t<Description>\n\t\t<Metadata name=\"Title\">$title</Metadata>\n\t</Description>\n";
285
286	print PROD "-->\n";
287
288	# clean up the section html
289	$sectiontext = $self->HB_clean_section($sectiontext);
290
291	print PROD "$sectiontext\n";
292
293	} else {
294	print STDERR "WARNING - leftover text\n" , $self->shorten($html),
295	"\nin $input_filename\n";
296	last;
297	}
298	$firstsection = 0;
299	}
300
301	print PROD "<!--\n";
302	while ($curtoclevel > 0) {
303	$curtoclevel--;
304	print PROD "</Section>\n";
305	}
306	print PROD "-->\n";
307
308	close (PROD) \|\| die("Error Closing File: $tmp_filename $!");
309
310	return $tmp_filename;
311	}
312
313	sub shorten {
314	my $self = shift (@_);
315	my ($text) = @_;
316
317	return "\"$text\"" if (length($text) < 100);
318
319	return "\"" . substr ($text, 0, 50) . "\" ... \"" .
320	substr ($text, length($text)-50) . "\"";
321	}
322
323	sub convert_tidy_or_oldHDL_file
324	{
325	my $self = shift (@_);
326	my ($file) = @_;
327	my $input_filename = $file;
328
329	if (-d $input_filename)
330	{
331	return $input_filename;
332	}
333
334	# get the input filename
335	my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
336	my $base_dirname = $dirname;
337	$suffix = lc($suffix);
338
339	# derive tmp filename from input filename
340	# Remove any white space from filename -- no risk of name collision, and
341	# makes later conversion by utils simpler. Leave spaces in path...
342	# tidy up the filename with space, dot, hyphen between
343	$tailname =~ s/\s+//g;
344	$tailname =~ s/\.+//g;
345	$tailname =~ s/\-+//g;
346	# convert to utf-8 otherwise we have problems with the doc.xml file
347	# later on
348	&unicode::ensure_utf8(\$tailname);
349
350	# softlink to collection tmp dir
351	my $tmp_dirname = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "tidytmp");
352	&util::mk_dir($tmp_dirname) if (!-e $tmp_dirname);
353
354	my $test_dirname = "";
355	my $f_separator = &util::get_os_dirsep();
356
357	if ($dirname =~ /import$f_separator/)
358	{
359	$test_dirname = $';
360
361	#print STDERR "init $'\n";
362
363	while ($test_dirname =~ /[$f_separator]/)
364	{
365	my $folderdirname = $`;
366	$tmp_dirname = &util::filename_cat($tmp_dirname,$folderdirname);
367	&util::mk_dir($tmp_dirname) if (!-e $tmp_dirname);
368	$test_dirname = $';
369	}
370	}
371
372	my $tmp_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix");
373
374	# tidy or convert the input file if it is a HTML-like file or it is accepted by the process_exp
375	if (($suffix eq ".htm") \|\| ($suffix eq ".html") \|\| ($suffix eq ".shtml"))
376	{
377	#convert the input file to a new style HDL
378	my $hdl_output_filename = $input_filename;
379	if ($self->{'old_style_HDL'})
380	{
381	$hdl_output_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix");
382	$hdl_output_filename = $self->convert_to_newHDLformat($input_filename,$hdl_output_filename);
383	}
384
385	#just for checking copy all other file from the base dir to tmp dir if it is not exists
386	opendir(DIR,$base_dirname) or die "Can't open base directory : $base_dirname!";
387	my @files = grep {!/^\.+$/} readdir(DIR);
388	close(DIR);
389
390	foreach my $file (@files)
391	{
392	my $src_file = &util::filename_cat($base_dirname,$file);
393	my $dest_file = &util::filename_cat($tmp_dirname,$file);
394	if ((!-e $dest_file) && (!-d $src_file))
395	{
396	# just copy the original file back to the tmp directory
397	copy($src_file,$dest_file) or die "Can't copy file $src_file to $dest_file $!";
398	}
399	}
400
401	# tidy the input file
402	my $tidy_output_filename = $hdl_output_filename;
403	if ($self->{'tidy_html'})
404	{
405	$tidy_output_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix");
406	$tidy_output_filename = $self->tmp_tidy_file($hdl_output_filename,$tidy_output_filename);
407	}
408	$tmp_filename = $tidy_output_filename;
409	}
410	else
411	{
412	if (!-e $tmp_filename)
413	{
414	# just copy the original file back to the tmp directory
415	copy($input_filename,$tmp_filename) or die "Can't copy file $input_filename to $tmp_filename $!";
416	}
417	}
418
419	return $tmp_filename;
420	}
421
422
423	# Will make the html input file as a proper XML file with removed font tag and
424	# image size added to the img tag.
425	# The tidying process takes place in a collection specific 'tmp' directory so
426	# that we don't accidentally damage the input.
427	sub tmp_tidy_file
428	{
429	my $self = shift (@_);
430	my ($file,$cnfile) = @_;
431	my $input_filename = $file;
432	my $tmp_filename = $cnfile;
433
434	# get the input filename
435	my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
436
437	require HTML::TokeParser::Simple;
438
439	# create HTML parser to decode the input file
440	my $parser = HTML::TokeParser::Simple->new($input_filename);
441
442	# write HTML tmp file without the font tag and image size are added to the img tag
443	open (PROD, ">$tmp_filename") \|\| die("Error Writing to File: $tmp_filename $!");
444	while (my $token = $parser->get_token())
445	{
446	# is it an img tag
447	if ($token->is_start_tag('img'))
448	{
449	# get the attributes
450	my $attr = $token->return_attr;
451
452	# get the full path to the image
453	my $img_file = &util::filename_cat($dirname,$attr->{src});
454
455	# set the width and height attribute
456	($attr->{width}, $attr->{height}) = imgsize($img_file);
457
458	# recreate the tag
459	print PROD "<img";
460	print PROD map { qq { $_="$attr->{$_}"} } keys %$attr;
461	print PROD ">";
462	}
463	# is it a font tag
464	else
465	{
466	if (($token->is_start_tag('font')) \|\| ($token->is_end_tag('font')))
467	{
468	# remove font tag
469	print PROD "";
470	}
471	else
472	{
473	# print without changes
474	print PROD $token->as_is;
475	}
476	}
477	}
478	close (PROD) \|\| die("Error Closing File: $tmp_filename $!");
479
480	# run html-tidy on the tmp file to make it a proper XML file
481	my $tidyfile = `tidy -utf8 -wrap 0 -asxml $tmp_filename`;
482
483	# write result back to the tmp file
484	open (PROD, ">$tmp_filename") \|\| die("Error Writing to File: $tmp_filename $!");
485	print PROD $tidyfile;
486	close (PROD) \|\| die("Error Closing File: $tmp_filename $!");
487
488	# return the output filename
489	return $tmp_filename;
490	}
491
492	sub read_into_doc_obj
493	{
494	my $self = shift (@_);
495	my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
496
497	# check the process_exp and block_exp thing
498	my ($block_status,$filename) = $self->read_block(@_);
499	return $block_status if ((!defined $block_status) \|\| ($block_status==0));
500
501	# get the input file
502	my $input_filename = $file;
503	my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
504	$suffix = lc($suffix);
505
506	if (($self->{'tidy_html'}) \|\| ($self->{'old_style_HDL'}))
507	{
508	# because the document has to be sectionalized set the description tags
509	$self->{'description_tags'} = 1;
510
511	# set the file to be tidied
512	$input_filename = &util::filename_cat($base_dir,$file) if $base_dir =~ /\w/;
513
514	# get the tidied file
515	#my $tidy_filename = $self->tmp_tidy_file($input_filename);
516	my $tidy_filename = $self->convert_tidy_or_oldHDL_file($input_filename);
517
518	# derive tmp filename from input filename
519	my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($tidy_filename, "\\.[^\\.]+\$");
520
521	# set the new input file and base_dir to be from the tidied file
522	$file = "$tailname$suffix";
523	$base_dir = $dirname;
524	}
525
526	# call the parent read_into_doc_obj
527	my ($process_status,$doc_obj) = &BasPlug::read_into_doc_obj($self,$pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli);
528
529	return ($process_status,$doc_obj);
530	}
531
532	sub new {
533	my ($class) = shift (@_);
534	my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
535	push(@$pluginlist, $class);
536
537	if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}
538	if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};
539
540
541	my $self = (defined $hashArgOptLists)? new BasPlug($pluginlist,$inputargs,$hashArgOptLists): new BasPlug($pluginlist,$inputargs);
542
543	if ($self->{'w3mir'}) {
544	$self->{'file_is_url'} = 1;
545	}
546	$self->{'aux_files'} = {};
547	$self->{'dir_num'} = 0;
548	$self->{'file_num'} = 0;
549
550	return bless $self, $class;
551	}
552
553	# may want to use (?i)\.(gif\|jpe?g\|jpe\|png\|css\|js(?:@.*)?)$
554	# if have eg <script language="javascript" src="img/lib.js@123">
555	sub get_default_block_exp {
556	my $self = shift (@_);
557
558	return q^(?i)\.(gif\|jpe?g\|jpe\|jpg\|png\|css)$^;
559	}
560
561	sub get_default_process_exp {
562	my $self = shift (@_);
563
564	# the last option is an attempt to encode the concept of an html query ...
565	return q^(?i)(\.html?\|\.shtml\|\.shm\|\.asp\|\.php\d?\|\.cgi\|.+\?.+=.*)$^;
566	}
567
568	sub store_block_files
569	{
570	my $self =shift (@_);
571	my ($filename) = @_;
572	my $html_fname = $filename;
573	my @file_blocks;
574
575	my ($language, $encoding) = $self->textcat_get_language_encoding ($filename);
576
577	# read in file ($text will be in utf8)
578	my $text = "";
579	$self->read_file ($filename, $encoding, $language, \$text);
580	my $textref = \$text;
581	my $opencom = '(?:<!--\|<!(?:—\|\|--))';
582	my $closecom = '(?:-->\|(?:—\|\|--)>)';
583	$$textref =~ s/$opencom(.*?)$closecom//gs;
584
585	my $attval = "\\\"[^\\\"]+\\\"\|[^\\s>]+";
586	my @img_matches = ($$textref =~ m/<img[^>]?src\s=\s($attval)[^>]>/igs);
587	my @usemap_matches = ($$textref =~ m/<img[^>]?usemap\s=\s($attval)[^>]>/igs);
588	my @link_matches = ($$textref =~ m/<link[^>]?href\s=\s($attval)[^>]>/igs);
589	my @embed_matches = ($$textref =~ m/<embed[^>]?src\s=\s($attval)[^>]>/igs);
590	my @tabbg_matches = ($$textref =~ m/<(?:table\|tr\|td)[^>]?background\s=\s($attval)[^>]>/igs);
591
592	foreach my $link (@img_matches, @usemap_matches, @link_matches, @embed_matches, @tabbg_matches) {
593
594	# remove quotes from link at start and end if necessary
595	if ($link=~/^\"/) {
596	$link=~s/^\"//;
597	$link=~s/\"$//;
598	}
599
600	$link =~ s/\#.*$//s; # remove any anchor names, e.g. foo.html#name becomes foo.html
601
602	if ($link !~ m@^/@ && $link !~ m/^([A-Z]:?)\\/) {
603	# Turn relative file path into full path
604	my $dirname = &File::Basename::dirname($filename);
605	$link = &util::filename_cat($dirname, $link);
606	}
607	$link = $self->eval_dir_dots($link);
608
609	$self->{'file_blocks'}->{$link} = 1;
610	}
611	}
612
613
614	# do plugin specific processing of doc_obj
615	sub process {
616	my $self = shift (@_);
617	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
618	my $outhandle = $self->{'outhandle'};
619
620	print STDERR "<Processing n='$file' p='HTMLPlug'>\n" if ($gli);
621
622	print $outhandle "HTMLPlug: processing $file\n"
623	if $self->{'verbosity'} > 1;
624
625	if ($ENV{'GSDLOS'} =~ /^windows/i) {
626	# this makes life so much easier... perl can cope with unix-style '/'s.
627	$base_dir =~ s@(\\)+@/@g;
628	$file =~ s@(\\)+@/@g;
629	}
630
631	# reset per-doc stuff...
632	$self->{'aux_files'} = {};
633	$self->{'dir_num'} = 0;
634	$self->{'file_num'} = 0;
635
636	# process an HTML file where sections are divided by headings tags (H1, H2 ...)
637	# you can also include metadata in the format (X can be any number)
638	# <hX>Title<!--gsdl-metadata
639	# <Metadata name="name1">value1</Metadata>
640	# ...
641	# <Metadata name="nameN">valueN</Metadata>
642	#--></hX>
643	if ($self->{'sectionalise_using_h_tags'}) {
644	# description_tags should allways be activated because we convert headings to description tags
645	$self->{'description_tags'} = 1;
646
647	my $arrSections = [];
648	$$textref =~ s/<h([0-9]+)[^>]>(.?)<\/h[0-9]+>/$self->process_heading($1, $2, $arrSections, $file)/isge;
649
650	if (scalar(@$arrSections)) {
651	my $strMetadata = $self->update_section_data($arrSections, -1);
652	if (length($strMetadata)) {
653	$strMetadata = '<!--' . $strMetadata . "\n-->\n</body>";
654	$$textref =~ s/<\/body>/$strMetadata/ig;
655	}
656	}
657	}
658
659	my $cursection = $doc_obj->get_top_section();
660
661	$self->extract_metadata ($textref, $metadata, $doc_obj, $cursection)
662	unless $self->{'no_metadata'} \|\| $self->{'description_tags'};
663
664	# Store URL for page as metadata - this can be used for an
665	# altavista style search interface. The URL won't be valid
666	# unless the file structure contains the domain name (i.e.
667	# like when w3mir is used to download a website).
668
669	# URL metadata (even invalid ones) are used to support internal
670	# links, so even if 'file_is_url' is off, still need to store info
671
672	my $web_url = "http://$file";
673	$doc_obj->add_metadata($cursection, "URL", $web_url);
674
675	if ($self->{'file_is_url'}) {
676	$doc_obj->add_metadata($cursection, "weblink", "<a href=\"$web_url\">");
677	$doc_obj->add_metadata($cursection, "webicon", "_iconworld_");
678	$doc_obj->add_metadata($cursection, "/weblink", "</a>");
679	}
680
681	if ($self->{'description_tags'}) {
682	# remove the html header - note that doing this here means any
683	# sections defined within the header will be lost (so all <Section>
684	# tags must appear within the body of the HTML)
685	my ($head_keep) = ($$textref =~ m/^(.?)<body[^>]>/is);
686
687	$$textref =~ s/^.?<body[^>]>//is;
688	$$textref =~ s/(<\/body[^>]>\|<\/html[^>]>)//isg;
689
690	my $opencom = '(?:<!--\|<!(?:—\|\|--))';
691	my $closecom = '(?:-->\|(?:—\|\|--)>)';
692
693	my $lt = '(?:<\|<)';
694	my $gt = '(?:>\|>)';
695	my $quot = '(?:"\|"\|”\|“)';
696
697	my $dont_strip = '';
698	if ($self->{'no_strip_metadata_html'}) {
699	($dont_strip = $self->{'no_strip_metadata_html'}) =~ s{,}{\|}g;
700	}
701
702	my $found_something = 0; my $top = 1;
703	while ($$textref =~ s/^(.?)$opencom(.?)$closecom//s) {
704	my $text = $1;
705	my $comment = $2;
706	if (defined $text) {
707	# text before a comment - note that getting to here
708	# doesn't necessarily mean there are Section tags in
709	# the document
710	$self->process_section(\$text, $base_dir, $file, $doc_obj, $cursection);
711	}
712	while ($comment =~ s/$lt(.*?)$gt//s) {
713	my $tag = $1;
714	if ($tag eq "Section") {
715	$found_something = 1;
716	$cursection = $doc_obj->insert_section($doc_obj->get_end_child($cursection)) unless $top;
717	$top = 0;
718	} elsif ($tag eq "/Section") {
719	$found_something = 1;
720	$cursection = $doc_obj->get_parent_section ($cursection);
721	} elsif ($tag =~ /^Metadata name=$quot(.*?)$quot/s) {
722	my $metaname = $1;
723	my $accumulate = $tag =~ /mode=${quot}accumulate${quot}/ ? 1 : 0;
724	$comment =~ s/^(.*?)$lt\/Metadata$gt//s;
725	my $metavalue = $1;
726	$metavalue =~ s/^\s+//;
727	$metavalue =~ s/\s+$//;
728	# assume that no metadata value intentionally includes
729	# carriage returns or HTML tags (if they're there they
730	# were probably introduced when converting to HTML from
731	# some other format).
732	# actually some people want to have html tags in their
733	# metadata.
734	$metavalue =~ s/[\cJ\cM]/ /sg;
735	$metavalue =~ s/<[^>]+>//sg
736	unless $dont_strip && ($dont_strip eq 'all' \|\| $metaname =~ /^($dont_strip)$/);
737	$metavalue =~ s/\s+/ /sg;
738	if ($accumulate) {
739	$doc_obj->add_utf8_metadata($cursection, $metaname, $metavalue);
740	} else {
741	$doc_obj->set_utf8_metadata_element($cursection, $metaname, $metavalue);
742	}
743	} elsif ($tag eq "Description" \|\| $tag eq "/Description") {
744	# do nothing with containing Description tags
745	} else {
746	# simple HTML tag (probably created by the conversion
747	# to HTML from some other format) - we'll ignore it and
748	# hope for the best ;-)
749	}
750	}
751	}
752	if ($cursection ne "") {
753	print $outhandle "HTMLPlug: WARNING: $file contains unmatched <Section></Section> tags\n";
754	}
755
756	$$textref =~ s/^.?<body[^>]>//is;
757	$$textref =~ s/(<\/body[^>]>\|<\/html[^>]>)//isg;
758	if ($$textref =~ /\S/) {
759	if (!$found_something) {
760	if ($self->{'verbosity'} > 2) {
761	print $outhandle "HTMLPlug: WARNING: $file appears to contain no Section tags so\n";
762	print $outhandle " will be processed as a single section document\n";
763	}
764
765	# go ahead and process single-section document
766	$self->process_section($textref, $base_dir, $file, $doc_obj, $cursection);
767
768	# if document contains no Section tags we'll go ahead
769	# and extract metadata (this won't have been done
770	# above as the -description_tags option prevents it)
771	my $complete_text = $head_keep.$doc_obj->get_text($cursection);
772	$self->extract_metadata (\$complete_text, $metadata, $doc_obj, $cursection)
773	unless $self->{'no_metadata'};
774
775	} else {
776	print $outhandle "HTMLPlug: WARNING: $file contains the following text outside\n";
777	print $outhandle " of the final closing </Section> tag. This text will\n";
778	print $outhandle " be ignored.";
779
780	my ($text);
781	if (length($$textref) > 30) {
782	$text = substr($$textref, 0, 30) . "...";
783	} else {
784	$text = $$textref;
785	}
786	$text =~ s/\n/ /isg;
787	print $outhandle " ($text)\n";
788	}
789	} elsif (!$found_something) {
790
791	if ($self->{'verbosity'} > 2) {
792	# may get to here if document contained no valid Section
793	# tags but did contain some comments. The text will have
794	# been processed already but we should print the warning
795	# as above and extract metadata
796	print $outhandle "HTMLPlug: WARNING: $file appears to contain no Section tags and\n";
797	print $outhandle " is blank or empty. Metadata will be assigned if present.\n";
798	}
799
800	my $complete_text = $head_keep.$doc_obj->get_text($cursection);
801	$self->extract_metadata (\$complete_text, $metadata, $doc_obj, $cursection)
802	unless $self->{'no_metadata'};
803	}
804
805	} else {
806
807	# remove header and footer
808	if (!$self->{'keep_head'} \|\| $self->{'description_tags'}) {
809	$$textref =~ s/^.?<body[^>]>//is;
810	$$textref =~ s/(<\/body[^>]>\|<\/html[^>]>)//isg;
811	}
812
813	# single section document
814	$self->process_section($textref, $base_dir, $file, $doc_obj, $cursection);
815	}
816	return 1;
817	}
818
819
820	sub process_heading
821	{
822	my ($self, $nHeadNo, $strHeadingText, $arrSections, $file) = @_;
823	$strHeadingText = '' if (!defined($strHeadingText));
824
825	my $strMetadata = $self->update_section_data($arrSections, int($nHeadNo));
826
827	my $strSecMetadata = '';
828	while ($strHeadingText =~ s/<!--gsdl-metadata(.*?)-->//is)
829	{
830	$strSecMetadata .= $1;
831	}
832
833	$strHeadingText =~ s/^\s+//g;
834	$strHeadingText =~ s/\s+$//g;
835	$strSecMetadata =~ s/^\s+//g;
836	$strSecMetadata =~ s/\s+$//g;
837
838	$strMetadata .= "\n<Section>\n\t<Description>\n\t\t<Metadata name=\"Title\">" . $strHeadingText . "</Metadata>\n";
839
840	if (length($strSecMetadata)) {
841	$strMetadata .= "\t\t" . $strSecMetadata . "\n";
842	}
843
844	$strMetadata .= "\t</Description>\n";
845
846	return "<!--" . $strMetadata . "-->";
847	}
848
849
850	sub update_section_data
851	{
852	my ($self, $arrSections, $nCurTocNo) = @_;
853	my ($strBuffer, $nLast, $nSections) = ('', 0, scalar(@$arrSections));
854
855	if ($nSections == 0) {
856	push @$arrSections, $nCurTocNo;
857	return $strBuffer;
858	}
859	$nLast = $arrSections->[$nSections - 1];
860	if ($nCurTocNo > $nLast) {
861	push @$arrSections, $nCurTocNo;
862	return $strBuffer;
863	}
864	for(my $i = $nSections - 1; $i >= 0; $i--) {
865	if ($nCurTocNo <= $arrSections->[$i]) {
866	$strBuffer .= "\n</Section>";
867	pop @$arrSections;
868	}
869	}
870	push @$arrSections, $nCurTocNo;
871	return $strBuffer;
872	}
873
874
875	# note that process_section may be called multiple times for a single
876	# section (relying on the fact that add_utf8_text appends the text to any
877	# that may exist already).
878	sub process_section {
879	my $self = shift (@_);
880	my ($textref, $base_dir, $file, $doc_obj, $cursection) = @_;
881	# trap links
882	if (!$self->{'nolinks'}) {
883	# usemap="./#index" not handled correctly => change to "#index"
884	$$textref =~ s/(<img[^>]?usemap\s=\s[\"\']?)([^\"\'>\s]+)([\"\']?[^>]>)/
885	$self->replace_usemap_links($1, $2, $3)/isge;
886
887	$$textref =~ s/(<(?:a\|area\|frame\|link\|script)\s+[^>]?\s(?:href\|src)\s=\s[\"\']?)([^\"\'>\s]+)([\"\']?[^>]*>)/
888	$self->replace_href_links ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
889	}
890
891	# trap images
892
893	# Previously, by default, HTMLPlug would embed <img> tags inside anchor tags
894	# i.e. <a href="image><img src="image"></a> in order to overcome a problem that
895	# turned regular text succeeding images into links. That is, by embedding <imgs>
896	# inside <a href=""></a>, the text following images were no longer misbehaving.
897	# However, there would be many occasions whereby images were not meant to link
898	# to their source images but where the images would link to another web page.
899	# To allow this, the no_image_links option was introduced: it would prevent
900	# the behaviour of embedding images into links that referenced the source images.
901
902	# Somewhere along the line, the problem of normal text turning into links when
903	# such text followed images which were not embedded in <a href=""></a> ceased
904	# to occur. This is why the following lines have been commented out (as well as
905	# two lines in replace_images). They appear to no longer apply.
906
907	# If at any time, there is a need for having images embedded in <a> anchor tags,
908	# then it might be better to turn that into an HTMLPlug option rather than make
909	# it the default behaviour. Also, eventually, no_image_links needs to become
910	# a deprecated option for HTMLPlug as it has now become the default behaviour.
911
912	#if(!$self->{'no_image_links'}){
913	$$textref =~ s/(<(?:img\|embed\|table\|tr\|td)[^>]?(?:src\|background)\s=\s)(([\"][^\"]+[\"])\|([\'][^\"\']+[\'])\|([^\s\/>]+))([^>]>)/
914	$self->replace_images ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
915	#}
916
917	# add text to document object
918	# turn \ into \\ so that the rest of greenstone doesn't think there
919	# is an escape code following. (Macro parsing loses them...)
920	$$textref =~ s/\\/\\\\/go;
921
922	$doc_obj->add_utf8_text($cursection, $$textref);
923	}
924
925	sub replace_images {
926	my $self = shift (@_);
927	my ($front, $link, $back, $base_dir,
928	$file, $doc_obj, $section) = @_;
929
930	# remove quotes from link at start and end if necessary
931	if ($link=~/^[\"\']/) {
932	$link=~s/^[\"\']//;
933	$link=~s/[\"\']$//;
934	$front.='"';
935	$back="\"$back";
936	}
937	$link =~ s/\n/ /g;
938
939	# Hack to overcome Windows wv 0.7.1 bug that causes embedded images to be broken
940	# If the Word file path has spaces in it, wv messes up and you end up with
941	# absolute paths for the images, and without the "file://" prefix
942	# So check for this special case and massage the data to be correct
943	if ($ENV{'GSDLOS'} =~ /^windows/i && $self->{'plugin_type'} eq "WordPlug" && $link =~ /^[A-Za-z]\:\\/) {
944	$link =~ s/^.*\\([^\\]+)$/$1/;
945	}
946
947	my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
948
949	my $img_file = $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section);
950
951	my $anchor_name = $img_file;
952	#$anchor_name =~ s/^.*\///;
953	#$anchor_name = "<a name=\"$anchor_name\" ></a>";
954
955	my $image_link = $front . $img_file .$back;
956	return $image_link;
957
958	# The reasons for why the following two lines are no longer necessary can be
959	# found in subroutine process_section
960	#my $anchor_link = "<a href=\"$img_file\" >".$image_link."</a>";
961	#return $anchor_link;
962
963	#return $front . $img_file . $back . $anchor_name;
964	}
965
966	sub replace_href_links {
967	my $self = shift (@_);
968	my ($front, $link, $back, $base_dir, $file, $doc_obj, $section) = @_;
969
970	# attempt to sort out targets - frames are not handled
971	# well in this plugin and some cases will screw things
972	# up - e.g. the _parent target (so we'll just remove
973	# them all ;-)
974	$front =~ s/(target=\"?)_top(\"?)/$1_gsdltop_$2/is;
975	$back =~ s/(target=\"?)_top(\"?)/$1_gsdltop_$2/is;
976	$front =~ s/target=\"?_parent\"?//is;
977	$back =~ s/target=\"?_parent\"?//is;
978
979	return $front . $link . $back if $link =~ /^\#/s;
980	$link =~ s/\n/ /g;
981
982	my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
983	# href may use '\'s where '/'s should be on Windows
984	$href =~ s/\\/\//g;
985
986	my ($filename) = $href =~ /^(?:.?):(?:\/\/)?(.)/;
987
988
989	##### leave all these links alone (they won't be picked up by intermediate
990	##### pages). I think that's safest when dealing with frames, targets etc.
991	##### (at least until I think of a better way to do it). Problems occur with
992	##### mailto links from within small frames, the intermediate page is displayed
993	##### within that frame and can't be seen. There is still potential for this to
994	##### happen even with html pages - the solution seems to be to somehow tell
995	##### the browser from the server side to display the page being sent (i.e.
996	##### the intermediate page) in the top level window - I'm not sure if that's
997	##### possible - the following line should probably be deleted if that can be done
998	return $front . $link . $back if $href =~ /^(mailto\|news\|gopher\|nntp\|telnet\|javascript):/is;
999
1000
1001	if (($rl == 0) \|\| ($filename =~ /$self->{'process_exp'}/) \|\|
1002	($href =~ /\/$/) \|\| ($href =~ /^(mailto\|news\|gopher\|nntp\|telnet\|javascript):/i)) {
1003	&ghtml::urlsafe ($href);
1004	return $front . "_httpextlink_&rl=" . $rl . "&href=" . $href . $hash_part . $back;
1005	} else {
1006	# link is to some other type of file (eg image) so we'll
1007	# need to associate that file
1008	return $front . $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section) . $back;
1009	}
1010	}
1011
1012	sub add_file {
1013	my $self = shift (@_);
1014	my ($href, $rl, $hash_part, $base_dir, $doc_obj, $section) = @_;
1015	my ($newname);
1016
1017	my $filename = $href;
1018	if ($base_dir eq "") {
1019	# remove http:/ thereby leaving one slash at the start
1020	$filename =~ s/^[^:]*:\///;
1021	}
1022	else {
1023	# remove http://
1024	$filename =~ s/^[^:]*:\/\///;
1025	}
1026
1027	$filename = &util::filename_cat($base_dir, $filename);
1028
1029	# Replace %20's in URL with a space if required. Note that the filename
1030	# may include the %20 in some situations
1031	if ($filename =~ /\%20/) {
1032	if (!-e $filename) {
1033	$filename =~ s/\%20/ /g;
1034	}
1035	}
1036
1037	my ($ext) = $filename =~ /(\.[^\.]*)$/;
1038
1039	if ($rl == 0) {
1040	if ((!defined $ext) \|\| ($ext !~ /$self->{'assoc_files'}/)) {
1041	return "_httpextlink_&rl=0&el=prompt&href=" . $href . $hash_part;
1042	}
1043	else {
1044	return "_httpextlink_&rl=0&el=direct&href=" . $href . $hash_part;
1045	}
1046	}
1047
1048	if ((!defined $ext) \|\| ($ext !~ /$self->{'assoc_files'}/)) {
1049	return "_httpextlink_&rl=" . $rl . "&href=" . $href . $hash_part;
1050	}
1051	if ($self->{'rename_assoc_files'}) {
1052	if (defined $self->{'aux_files'}->{$href}) {
1053	$newname = $self->{'aux_files'}->{$href}->{'dir_num'} . "/" .
1054	$self->{'aux_files'}->{$href}->{'file_num'} . $ext;
1055	} else {
1056	$newname = $self->{'dir_num'} . "/" . $self->{'file_num'} . $ext;
1057	$self->{'aux_files'}->{$href} = {'dir_num' => $self->{'dir_num'}, 'file_num' => $self->{'file_num'}};
1058	$self->inc_filecount ();
1059	}
1060	$doc_obj->associate_file($filename, $newname, undef, $section);
1061	return "_httpdocimg_/$newname";
1062	} else {
1063	($newname) = $filename =~ /([^\/\\]*)$/;
1064	$doc_obj->associate_file($filename, $newname, undef, $section);
1065	return "_httpdocimg_/$newname";
1066	}
1067	}
1068
1069
1070	sub format_link {
1071	my $self = shift (@_);
1072	my ($link, $base_dir, $file) = @_;
1073
1074	my ($before_hash, $hash_part) = $link =~ /^([^\#])(\#?.)$/;
1075
1076	$hash_part = "" if !defined $hash_part;
1077	if (!defined $before_hash \|\| $before_hash !~ /[\w\.\/]/) {
1078	my $outhandle = $self->{'outhandle'};
1079	print $outhandle "HTMLPlug: ERROR - badly formatted tag ignored ($link)\n"
1080	if $self->{'verbosity'};
1081	return ($link, "", 0);
1082	}
1083
1084	if ($before_hash =~ s@^((?:http\|ftp\|file)://)@@i) {
1085	my $type = $1;
1086
1087	if ($link =~ /^(http\|ftp):/i) {
1088	# Turn url (using /) into file name (possibly using \ on windows)
1089	my @http_dir_split = split('/', $before_hash);
1090	$before_hash = &util::filename_cat(@http_dir_split);
1091	}
1092
1093	$before_hash = $self->eval_dir_dots($before_hash);
1094
1095	my $linkfilename = &util::filename_cat ($base_dir, $before_hash);
1096
1097	my $rl = 0;
1098	$rl = 1 if (-e $linkfilename);
1099
1100	# make sure there's a slash on the end if it's a directory
1101	if ($before_hash !~ /\/$/) {
1102	$before_hash .= "/" if (-d $linkfilename);
1103	}
1104
1105	return ($type . $before_hash, $hash_part, $rl);
1106
1107	} elsif ($link !~ /^(mailto\|news\|gopher\|nntp\|telnet\|javascript):/i && $link !~ /^\//) {
1108	if ($before_hash =~ s@^/@@ \|\| $before_hash =~ /\\/) {
1109
1110	# the first directory will be the domain name if file_is_url
1111	# to generate archives, otherwise we'll assume all files are
1112	# from the same site and base_dir is the root
1113
1114	if ($self->{'file_is_url'}) {
1115	my @dirs = split /[\/\\]/, $file;
1116	my $domname = shift (@dirs);
1117	$before_hash = &util::filename_cat($domname, $before_hash);
1118	$before_hash =~ s@\\@/@g; # for windows
1119	}
1120	else
1121	{
1122	# see if link shares directory with source document
1123	# => turn into relative link if this is so!
1124
1125	if ($ENV{'GSDLOS'} =~ /^windows/i) {
1126	# too difficult doing a pattern match with embedded '\'s...
1127	my $win_before_hash=$before_hash;
1128	$win_before_hash =~ s@(\\)+@/@g;
1129	# $base_dir is already similarly "converted" on windows.
1130	if ($win_before_hash =~ s@^$base_dir/@@o) {
1131	# if this is true, we removed a prefix
1132	$before_hash=$win_before_hash;
1133	}
1134	}
1135	else {
1136	# before_hash has lost leading slash by this point,
1137	# -> add back in prior to substitution with $base_dir
1138	$before_hash = "/$before_hash";
1139
1140	$before_hash = &util::filename_cat("",$before_hash);
1141	$before_hash =~ s@^$base_dir/@@;
1142	}
1143	}
1144	} else {
1145	# Turn relative file path into full path
1146	my $dirname = &File::Basename::dirname($file);
1147	$before_hash = &util::filename_cat($dirname, $before_hash);
1148	$before_hash = $self->eval_dir_dots($before_hash);
1149	}
1150
1151	my $linkfilename = &util::filename_cat ($base_dir, $before_hash);
1152	# make sure there's a slash on the end if it's a directory
1153	if ($before_hash !~ /\/$/) {
1154	$before_hash .= "/" if (-d $linkfilename);
1155	}
1156	return ("http://" . $before_hash, $hash_part, 1);
1157	} else {
1158	# mailto, news, nntp, telnet, javascript or gopher link
1159	return ($before_hash, "", 0);
1160	}
1161	}
1162
1163	sub extract_first_NNNN_characters {
1164	my $self = shift (@_);
1165	my ($textref, $doc_obj, $thissection) = @_;
1166
1167	foreach my $size (split /,/, $self->{'first'}) {
1168	my $tmptext = $$textref;
1169	# skip to the body
1170	$tmptext =~ s/.<body[^>]>//i;
1171	# remove javascript
1172	$tmptext =~ s@<script.*?</script>@ @sig;
1173	$tmptext =~ s/<[^>]*>/ /g;
1174	$tmptext =~ s/ / /g;
1175	$tmptext =~ s/^\s+//;
1176	$tmptext =~ s/\s+$//;
1177	$tmptext =~ s/\s+/ /gs;
1178	$tmptext = &unicode::substr ($tmptext, 0, $size);
1179	$tmptext =~ s/\s\S*$/…/; # adds an ellipse (...)
1180	$doc_obj->add_utf8_metadata ($thissection, "First$size", $tmptext);
1181	}
1182	}
1183
1184
1185	sub extract_metadata {
1186	my $self = shift (@_);
1187	my ($textref, $metadata, $doc_obj, $section) = @_;
1188	my $outhandle = $self->{'outhandle'};
1189	# if we don't want metadata, we may as well not be here ...
1190	return if (!defined $self->{'metadata_fields'});
1191
1192	# metadata fields to extract/save. 'key' is the (lowercase) name of the
1193	# html meta, 'value' is the metadata name for greenstone to use
1194	my %find_fields = ();
1195
1196	my %creator_fields = (); # short-cut for lookups
1197
1198
1199	foreach my $field (split /,/, $self->{'metadata_fields'}) {
1200	$field =~ s/^\s+//; # remove leading whitespace
1201	$field =~ s/\s+$//; # remove trailing whitespace
1202
1203	# support tag<tagname>
1204	if ($field =~ /^(.?)<(.?)>$/) {
1205	# "$2" is the user's preferred gs metadata name
1206	$find_fields{lc($1)}=$2; # lc = lowercase
1207	} else { # no <tagname> for mapping
1208	# "$field" is the user's preferred gs metadata name
1209	$find_fields{lc($field)}=$field; # lc = lowercase
1210	}
1211	}
1212
1213	if (defined $self->{'hunt_creator_metadata'} &&
1214	$self->{'hunt_creator_metadata'} == 1 ) {
1215	my @extra_fields =
1216	(
1217	'author',
1218	'author.email',
1219	'creator',
1220	'dc.creator',
1221	'dc.creator.corporatename',
1222	);
1223
1224	# add the creator_metadata fields to search for
1225	foreach my $field (@extra_fields) {
1226	$creator_fields{$field}=0; # add to lookup hash
1227	}
1228	}
1229
1230
1231	# find the header in the html file, which has the meta tags
1232	$$textref =~ m@<head>(.*?)</head>@si;
1233
1234	my $html_header=$1;
1235
1236	# go through every <meta... tag defined in the html and see if it is
1237	# one of the tags we want to match.
1238
1239	# special case for title - we want to remember if its been found
1240	my $found_title = 0;
1241	# this assumes that ">" won't appear. (I don't think it's allowed to...)
1242	$html_header =~ /^/; # match the start of the string, for \G assertion
1243
1244	while ($html_header =~ m/\G.?<meta(.?)>/sig) {
1245	my $metatag=$1;
1246	my ($tag, $value);
1247
1248	# find the tag name
1249	$metatag =~ /(?:name\|http-equiv)\s=\s([\"\'])?(.*?)\1/is;
1250	$tag=$2;
1251	# in case they're not using " or ', but they should...
1252	if (! $tag) {
1253	$metatag =~ /(?:name\|http-equiv)\s=\s([^\s\>]+)/is;
1254	$tag=$1;
1255	}
1256
1257	if (!defined $tag) {
1258	print $outhandle "HTMLPlug: can't find NAME in \"$metatag\"\n";
1259	next;
1260	}
1261
1262	# don't need to assign this field if it was passed in from a previous
1263	# (recursive) plugin
1264	if (defined $metadata->{$tag}) {next}
1265
1266	# find the tag content
1267	$metatag =~ /content\s=\s([\"\'])?(.*?)\1/is;
1268	$value=$2;
1269
1270	if (! $value) {
1271	$metatag =~ /(?:name\|http-equiv)\s=\s([^\s\>]+)/is;
1272	$value=$1;
1273	}
1274	if (!defined $value) {
1275	print $outhandle "HTMLPlug: can't find VALUE in \"$metatag\"\n";
1276	next;
1277	}
1278
1279	# clean up and add
1280	$value =~ s/\s+/ /gs;
1281	chomp($value); # remove trailing \n, if any
1282	if (exists $creator_fields{lc($tag)}) {
1283	# map this value onto greenstone's "Creator" metadata
1284	$tag='Creator';
1285	} elsif (!exists $find_fields{lc($tag)}) {
1286	next; # don't want this tag
1287	} else {
1288	# get the user's preferred capitalisation
1289	$tag = $find_fields{lc($tag)};
1290	}
1291	if (lc($tag) eq "title") {
1292	$found_title = 1;
1293	}
1294	print $outhandle " extracted \"$tag\" metadata \"$value\"\n"
1295	if ($self->{'verbosity'} > 2);
1296	if ($tag =~ /date.*/i){
1297	$tag = lc($tag);
1298	}
1299	$doc_obj->add_utf8_metadata($section, $tag, $value);
1300
1301	}
1302
1303	# TITLE: extract the document title
1304	if (exists $find_fields{'title'} && !$found_title) {
1305	# we want a title, and didn't find one in the meta tags
1306	# see if there's a <title> tag
1307	my $title;
1308	my $from = ""; # for debugging output only
1309	if ($html_header =~ /<title[^>]>([^<]+)<\/title[^>]>/is) {
1310	$title = $1;
1311	$from = "<title> tags";
1312	}
1313
1314	if (!defined $title) {
1315	$from = "first 100 chars";
1316	# if no title use first 100 or so characters
1317	$title = $$textref;
1318	$title =~ s/^\xFE\xFF//; # Remove unicode byte order mark
1319	$title =~ s/^.*?<body>//si;
1320	# ignore javascript!
1321	$title =~ s@<script.*?</script>@ @sig;
1322	$title =~ s/<\/([^>]+)><\1>//g; # (eg) </b><b> - no space
1323	$title =~ s/<[^>]*>/ /g; # remove all HTML tags
1324	$title = substr ($title, 0, 100);
1325	$title =~ s/\s\S*$/.../;
1326	}
1327	$title =~ s/<[^>]*>/ /g; # remove html tags
1328	$title =~ s/ / /g;
1329	$title =~ s/(?: \|\xc2\xa0)/ /g; # utf-8 for nbsp...
1330	$title =~ s/\s+/ /gs; # collapse multiple spaces
1331	$title =~ s/^\s*//; # remove leading spaces
1332	$title =~ s/\s*$//; # remove trailing spaces
1333
1334	$title =~ s/^$self->{'title_sub'}// if ($self->{'title_sub'});
1335	$title =~ s/^\s+//s; # in case title_sub introduced any...
1336	$doc_obj->add_utf8_metadata ($section, 'Title', $title);
1337	print $outhandle " extracted Title metadata \"$title\" from $from\n"
1338	if ($self->{'verbosity'} > 2);
1339	}
1340
1341	# add FileFormat metadata
1342	$doc_obj->add_metadata($section,"FileFormat", "HTML");
1343
1344	# Special, for metadata names such as tagH1 - extracts
1345	# the text between the first <H1> and </H1> tags into "H1" metadata.
1346
1347	foreach my $field (keys %find_fields) {
1348	if ($field !~ /^tag([a-z0-9]+)$/i) {next}
1349	my $tag = $1;
1350	if ($$textref =~ m@<$tag[^>]>(.?)</$tag[^>]*>@g) {
1351	my $content = $1;
1352	$content =~ s/ / /g;
1353	$content =~ s/<[^>]*>/ /g;
1354	$content =~ s/^\s+//;
1355	$content =~ s/\s+$//;
1356	$content =~ s/\s+/ /gs;
1357	if ($content) {
1358	$tag=$find_fields{"tag$tag"}; # get the user's capitalisation
1359	$tag =~ s/^tag//i;
1360	$doc_obj->add_utf8_metadata ($section, $tag, $content);
1361	print $outhandle " extracted \"$tag\" metadata \"$content\"\n"
1362	if ($self->{'verbosity'} > 2);
1363	}
1364	}
1365	}
1366	}
1367
1368
1369	# evaluate any "../" to next directory up
1370	# evaluate any "./" as here
1371	sub eval_dir_dots {
1372	my $self = shift (@_);
1373	my ($filename) = @_;
1374	my $dirsep_os = &util::get_os_dirsep();
1375	my @dirsep = split(/$dirsep_os/,$filename);
1376
1377	my @eval_dirs = ();
1378	foreach my $d (@dirsep) {
1379	if ($d eq "..") {
1380	pop(@eval_dirs);
1381
1382	} elsif ($d eq ".") {
1383	# do nothing!
1384
1385	} else {
1386	push(@eval_dirs,$d);
1387	}
1388	}
1389
1390	# Need to fiddle with number of elements in @eval_dirs if the
1391	# first one is the empty string. This is because of a
1392	# modification to util::filename_cat that supresses the addition
1393	# of a leading '/' character (or \ if windows) (intended to help
1394	# filename cat with relative paths) if the first entry in the
1395	# array is the empty string. Making the array start with two
1396	# empty strings is a way to defeat this "smart" option.
1397	#
1398	if (scalar(@eval_dirs) > 0) {
1399	if ($eval_dirs[0] eq ""){
1400	unshift(@eval_dirs,"");
1401	}
1402	}
1403	return &util::filename_cat(@eval_dirs);
1404	}
1405
1406	sub replace_usemap_links {
1407	my $self = shift (@_);
1408	my ($front, $link, $back) = @_;
1409
1410	$link =~ s/^\.\///;
1411	return $front . $link . $back;
1412	}
1413
1414	sub inc_filecount {
1415	my $self = shift (@_);
1416
1417	if ($self->{'file_num'} == 1000) {
1418	$self->{'dir_num'} ++;
1419	$self->{'file_num'} = 0;
1420	} else {
1421	$self->{'file_num'} ++;
1422	}
1423	}
1424
1425
1426	# Extend the BasPlug read_file so that strings like é are
1427	# converted to UTF8 internally.
1428	#
1429	# We don't convert < or > or & or " in case
1430	# they interfere with the GML files
1431
1432	sub read_file {
1433	my ($self, $filename, $encoding, $language, $textref) = @_;
1434
1435	&BasPlug::read_file($self, $filename, $encoding, $language, $textref);
1436
1437	# Convert entities to their UTF8 equivalents
1438	$$textref =~ s/&(lt\|gt\|amp\|quot\|nbsp);/&z$1;/go;
1439	$$textref =~ s/&([^;]+);/&ghtml::getcharequiv($1,1)/gseo;
1440	$$textref =~ s/&z(lt\|gt\|amp\|quot\|nbsp);/&$1;/go;
1441	}
1442
1443	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: