Context Navigation

source: gsdl/trunk/perllib/plugins/HTMLPlugin.pm@ 16104

Last change on this file since 16104 was 16104, checked in by kjdon, 16 years ago
tried to make the 'xxxplugin processing file' print statements more consistent. They are now done in read (or read_into_doc_obj) and not process
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 47.0 KB

Line
1	###########################################################################
2	#
3	# HTMLPlugin.pm -- basic html plugin
4	#
5	# A component of the Greenstone digital library software
6	# from the New Zealand Digital Library Project at the
7	# University of Waikato, New Zealand.
8	#
9	# Copyright (C) 1999 New Zealand Digital Library Project
10	#
11	# This program is free software; you can redistribute it and/or modify
12	# it under the terms of the GNU General Public License as published by
13	# the Free Software Foundation; either version 2 of the License, or
14	# (at your option) any later version.
15	#
16	# This program is distributed in the hope that it will be useful,
17	# but WITHOUT ANY WARRANTY; without even the implied warranty of
18	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	# GNU General Public License for more details.
20	#
21	# You should have received a copy of the GNU General Public License
22	# along with this program; if not, write to the Free Software
23	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24	#
25	###########################################################################
26
27	#
28	# Note that this plugin handles frames only in a very simple way
29	# i.e. each frame is treated as a separate document. This means
30	# search results will contain links to individual frames rather
31	# than linking to the top level frameset.
32	# There may also be some problems caused by the _parent target
33	# (it's removed by this plugin)
34	#
35
36	package HTMLPlugin;
37
38	use ReadTextFile;
39	use HBPlugin;
40	use ghtml;
41	use unicode;
42	use util;
43	use XMLParser;
44
45	use Image::Size;
46	use File::Copy;
47
48	sub BEGIN {
49	@HTMLPlugin::ISA = ('ReadTextFile', 'HBPlugin');
50	}
51
52	use strict; # every perl program should have this!
53	no strict 'refs'; # make an exception so we can use variables as filehandles
54
55	my $arguments =
56	[ { 'name' => "process_exp",
57	'desc' => "{BasePlugin.process_exp}",
58	'type' => "regexp",
59	'deft' => &get_default_process_exp() },
60	{ 'name' => "block_exp",
61	'desc' => "{BasePlugin.block_exp}",
62	'type' => 'regexp',
63	'deft' => &get_default_block_exp() },
64	{ 'name' => "nolinks",
65	'desc' => "{HTMLPlugin.nolinks}",
66	'type' => "flag" },
67	{ 'name' => "keep_head",
68	'desc' => "{HTMLPlugin.keep_head}",
69	'type' => "flag" },
70	{ 'name' => "no_metadata",
71	'desc' => "{HTMLPlugin.no_metadata}",
72	'type' => "flag" },
73	{ 'name' => "metadata_fields",
74	'desc' => "{HTMLPlugin.metadata_fields}",
75	'type' => "string",
76	'deft' => "Title" },
77	{ 'name' => "hunt_creator_metadata",
78	'desc' => "{HTMLPlugin.hunt_creator_metadata}",
79	'type' => "flag" },
80	{ 'name' => "file_is_url",
81	'desc' => "{HTMLPlugin.file_is_url}",
82	'type' => "flag" },
83	{ 'name' => "assoc_files",
84	'desc' => "{HTMLPlugin.assoc_files}",
85	'type' => "regexp",
86	'deft' => &get_default_block_exp() },
87	{ 'name' => "rename_assoc_files",
88	'desc' => "{HTMLPlugin.rename_assoc_files}",
89	'type' => "flag" },
90	{ 'name' => "title_sub",
91	'desc' => "{HTMLPlugin.title_sub}",
92	'type' => "string",
93	'deft' => "" },
94	{ 'name' => "description_tags",
95	'desc' => "{HTMLPlugin.description_tags}",
96	'type' => "flag" },
97	# retain this for backward compatibility (w3mir option was replaced by
98	# file_is_url)
99	{ 'name' => "w3mir",
100	# 'desc' => "{HTMLPlugin.w3mir}",
101	'type' => "flag",
102	'hiddengli' => "yes"},
103	{ 'name' => "no_strip_metadata_html",
104	'desc' => "{HTMLPlugin.no_strip_metadata_html}",
105	'type' => "string",
106	'deft' => "",
107	'reqd' => "no"},
108	{ 'name' => "sectionalise_using_h_tags",
109	'desc' => "{HTMLPlugin.sectionalise_using_h_tags}",
110	'type' => "flag" },
111	{ 'name' => "use_realistic_book",
112	'desc' => "{HTMLPlugin.tidy_html}",
113	'type' => "flag"},
114	{ 'name' => "old_style_HDL",
115	'desc' => "{HTMLPlugin.old_style_HDL}",
116	'type' => "flag"}
117	];
118
119	my $options = { 'name' => "HTMLPlugin",
120	'desc' => "{HTMLPlugin.desc}",
121	'abstract' => "no",
122	'inherits' => "yes",
123	'args' => $arguments };
124
125
126	sub HB_read_html_file {
127	my $self = shift (@_);
128	my ($htmlfile, $text) = @_;
129
130	# load in the file
131	if (!open (FILE, $htmlfile)) {
132	print STDERR "ERROR - could not open $htmlfile\n";
133	return;
134	}
135
136	my $foundbody = 0;
137	$self->HB_gettext (\$foundbody, $text, "FILE");
138	close FILE;
139
140	# just in case there was no <body> tag
141	if (!$foundbody) {
142	$foundbody = 1;
143	open (FILE, $htmlfile) \|\| return;
144	$self->HB_gettext (\$foundbody, $text, "FILE");
145	close FILE;
146	}
147	# text is in utf8
148	}
149
150	# converts the text to utf8, as ghtml does that for é etc.
151	sub HB_gettext {
152	my $self = shift (@_);
153	my ($foundbody, $text, $handle) = @_;
154
155	my $line = "";
156	while (defined ($line = <$handle>)) {
157	# look for body tag
158	if (!$$foundbody) {
159	if ($line =~ s/^.<body[^>]>//i) {
160	$$foundbody = 1;
161	} else {
162	next;
163	}
164	}
165
166	# check for symbol fonts
167	if ($line =~ /<font [^>]?face\s=\s*\"?(\w+)\"?/i) {
168	my $font = $1;
169	print STDERR "HBPlug::HB_gettext - warning removed font $font\n"
170	if ($font !~ /^arial$/i);
171	}
172
173	$$text .= $line;
174	}
175
176	if ($self->{'input_encoding'} eq "iso_8859_1") {
177	# convert to utf-8
178	$$text=&unicode::unicode2utf8(&unicode::convert2unicode("iso_8859_1", $text));
179	}
180	# convert any alphanumeric character entities to their utf-8
181	# equivalent for indexing purposes
182	#&ghtml::convertcharentities ($$text);
183
184	$$text =~ s/\s+/ /g; # remove \n's
185	}
186
187	sub HB_clean_section {
188	my $self = shift (@_);
189	my ($section) = @_;
190
191	# remove tags without a starting tag from the section
192	my ($tag, $tagstart);
193	while ($section =~ /<\/([^>]{1,10})>/) {
194	$tag = $1;
195	$tagstart = index($section, "<$tag");
196	last if (($tagstart >= 0) && ($tagstart < index($section, "<\/$tag")));
197	$section =~ s/<\/$tag>//;
198	}
199
200	# remove extra paragraph tags
201	while ($section =~ s/<p\b[^>]>\s<p\b/<p/ig) {}
202
203	# remove extra stuff at the end of the section
204	while ($section =~ s/(<u>\|<i>\|<b>\|<p\b[^>]*>\| \|\s)$//i) {}
205
206	# add a newline at the beginning of each paragraph
207	$section =~ s/(.)\s*<p\b/$1\n\n<p/gi;
208
209	# add a newline every 80 characters at a word boundary
210	# Note: this regular expression puts a line feed before
211	# the last word in each section, even when it is not
212	# needed.
213	$section =~ s/(.{1,80})\s/$1\n/g;
214
215	# fix up the image links
216	$section =~ s/<img[^>]?src=\"?([^\">]+)\"?[^>]>/
217	<center><img src=\"$1\" \/><\/center><br\/>/ig;
218	$section =~ s/<<I>>\s*([^\.]+\.(png\|jpg\|gif))/
219	<center><img src=\"$1\" \/><\/center><br\/>/ig;
220
221	return $section;
222	}
223
224	# Will convert the oldHDL format to the new HDL format (using the Section tag)
225	sub convert_to_newHDLformat
226	{
227	my $self = shift (@_);
228	my ($file,$cnfile) = @_;
229	my $input_filename = $file;
230	my $tmp_filename = $cnfile;
231
232	# write HTML tmp file with new HDL format
233	open (PROD, ">$tmp_filename") \|\| die("Error Writing to File: $tmp_filename $!");
234
235	# read in the file and do basic html cleaning (removing header etc)
236	my $html = "";
237	$self->HB_read_html_file ($input_filename, \$html);
238
239	# process the file one section at a time
240	my $curtoclevel = 1;
241	my $firstsection = 1;
242	my $toclevel = 0;
243	while (length ($html) > 0) {
244	if ($html =~ s/^.?(?:<p\b[^>]>)?((<b>\|<i>\|<u>\|\s))<<TOC(\d+)>>\s(.*?)<p\b/<p/i) {
245	$toclevel = $3;
246	my $title = $4;
247	my $sectiontext = "";
248	if ($html =~ s/^(.?)((?:<p\b[^>]>)?((<b>\|<i>\|<u>\|\s)*)<<TOC\d+>>)/$2/i) {
249	$sectiontext = $1;
250	} else {
251	$sectiontext = $html;
252	$html = "";
253	}
254
255	# remove tags and extra spaces from the title
256	$title =~ s/<\/?[^>]+>//g;
257	$title =~ s/^\s+\|\s+$//g;
258
259	# close any sections below the current level and
260	# create a new section (special case for the firstsection)
261	print PROD "<!--\n";
262	while (($curtoclevel > $toclevel) \|\|
263	(!$firstsection && $curtoclevel == $toclevel)) {
264	$curtoclevel--;
265	print PROD "</Section>\n";
266	}
267	if ($curtoclevel+1 < $toclevel) {
268	print STDERR "WARNING - jump in toc levels in $input_filename " .
269	"from $curtoclevel to $toclevel\n";
270	}
271	while ($curtoclevel < $toclevel) {
272	$curtoclevel++;
273	}
274
275	if ($curtoclevel == 1) {
276	# add the header tag
277	print PROD "-->\n";
278	print PROD "<HTML>\n<HEAD>\n<TITLE>$title</TITLE>\n</HEAD>\n<BODY>\n";
279	print PROD "<!--\n";
280	}
281
282	print PROD "<Section>\n\t<Description>\n\t\t<Metadata name=\"Title\">$title</Metadata>\n\t</Description>\n";
283
284	print PROD "-->\n";
285
286	# clean up the section html
287	$sectiontext = $self->HB_clean_section($sectiontext);
288
289	print PROD "$sectiontext\n";
290
291	} else {
292	print STDERR "WARNING - leftover text\n" , $self->shorten($html),
293	"\nin $input_filename\n";
294	last;
295	}
296	$firstsection = 0;
297	}
298
299	print PROD "<!--\n";
300	while ($curtoclevel > 0) {
301	$curtoclevel--;
302	print PROD "</Section>\n";
303	}
304	print PROD "-->\n";
305
306	close (PROD) \|\| die("Error Closing File: $tmp_filename $!");
307
308	return $tmp_filename;
309	}
310
311	sub shorten {
312	my $self = shift (@_);
313	my ($text) = @_;
314
315	return "\"$text\"" if (length($text) < 100);
316
317	return "\"" . substr ($text, 0, 50) . "\" ... \"" .
318	substr ($text, length($text)-50) . "\"";
319	}
320
321	sub convert_tidy_or_oldHDL_file
322	{
323	my $self = shift (@_);
324	my ($file) = @_;
325	my $input_filename = $file;
326
327	if (-d $input_filename)
328	{
329	return $input_filename;
330	}
331
332	# get the input filename
333	my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
334	my $base_dirname = $dirname;
335	$suffix = lc($suffix);
336
337	# derive tmp filename from input filename
338	# Remove any white space from filename -- no risk of name collision, and
339	# makes later conversion by utils simpler. Leave spaces in path...
340	# tidy up the filename with space, dot, hyphen between
341	$tailname =~ s/\s+//g;
342	$tailname =~ s/\.+//g;
343	$tailname =~ s/\-+//g;
344	# convert to utf-8 otherwise we have problems with the doc.xml file
345	# later on
346	&unicode::ensure_utf8(\$tailname);
347
348	# softlink to collection tmp dir
349	my $tmp_dirname = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "tidytmp");
350	&util::mk_dir($tmp_dirname) if (!-e $tmp_dirname);
351
352	my $test_dirname = "";
353	my $f_separator = &util::get_os_dirsep();
354
355	if ($dirname =~ /import$f_separator/)
356	{
357	$test_dirname = $'; #'
358
359	#print STDERR "init $'\n";
360
361	while ($test_dirname =~ /[$f_separator]/)
362	{
363	my $folderdirname = $`;
364	$tmp_dirname = &util::filename_cat($tmp_dirname,$folderdirname);
365	&util::mk_dir($tmp_dirname) if (!-e $tmp_dirname);
366	$test_dirname = $'; #'
367	}
368	}
369
370	my $tmp_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix");
371
372	# tidy or convert the input file if it is a HTML-like file or it is accepted by the process_exp
373	if (($suffix eq ".htm") \|\| ($suffix eq ".html") \|\| ($suffix eq ".shtml"))
374	{
375	#convert the input file to a new style HDL
376	my $hdl_output_filename = $input_filename;
377	if ($self->{'old_style_HDL'})
378	{
379	$hdl_output_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix");
380	$hdl_output_filename = $self->convert_to_newHDLformat($input_filename,$hdl_output_filename);
381	}
382
383	#just for checking copy all other file from the base dir to tmp dir if it is not exists
384	opendir(DIR,$base_dirname) or die "Can't open base directory : $base_dirname!";
385	my @files = grep {!/^\.+$/} readdir(DIR);
386	close(DIR);
387
388	foreach my $file (@files)
389	{
390	my $src_file = &util::filename_cat($base_dirname,$file);
391	my $dest_file = &util::filename_cat($tmp_dirname,$file);
392	if ((!-e $dest_file) && (!-d $src_file))
393	{
394	# just copy the original file back to the tmp directory
395	copy($src_file,$dest_file) or die "Can't copy file $src_file to $dest_file $!";
396	}
397	}
398
399	# tidy the input file
400	my $tidy_output_filename = $hdl_output_filename;
401	if ($self->{'tidy_html'})
402	{
403	$tidy_output_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix");
404	$tidy_output_filename = $self->tmp_tidy_file($hdl_output_filename,$tidy_output_filename);
405	}
406	$tmp_filename = $tidy_output_filename;
407	}
408	else
409	{
410	if (!-e $tmp_filename)
411	{
412	# just copy the original file back to the tmp directory
413	copy($input_filename,$tmp_filename) or die "Can't copy file $input_filename to $tmp_filename $!";
414	}
415	}
416
417	return $tmp_filename;
418	}
419
420
421	# Will make the html input file as a proper XML file with removed font tag and
422	# image size added to the img tag.
423	# The tidying process takes place in a collection specific 'tmp' directory so
424	# that we don't accidentally damage the input.
425	sub tmp_tidy_file
426	{
427	my $self = shift (@_);
428	my ($file,$cnfile) = @_;
429	my $input_filename = $file;
430	my $tmp_filename = $cnfile;
431
432	# get the input filename
433	my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
434
435	require HTML::TokeParser::Simple;
436
437	# create HTML parser to decode the input file
438	my $parser = HTML::TokeParser::Simple->new($input_filename);
439
440	# write HTML tmp file without the font tag and image size are added to the img tag
441	open (PROD, ">$tmp_filename") \|\| die("Error Writing to File: $tmp_filename $!");
442	while (my $token = $parser->get_token())
443	{
444	# is it an img tag
445	if ($token->is_start_tag('img'))
446	{
447	# get the attributes
448	my $attr = $token->return_attr;
449
450	# get the full path to the image
451	my $img_file = &util::filename_cat($dirname,$attr->{src});
452
453	# set the width and height attribute
454	($attr->{width}, $attr->{height}) = imgsize($img_file);
455
456	# recreate the tag
457	print PROD "<img";
458	print PROD map { qq { $_="$attr->{$_}"} } keys %$attr;
459	print PROD ">";
460	}
461	# is it a font tag
462	else
463	{
464	if (($token->is_start_tag('font')) \|\| ($token->is_end_tag('font')))
465	{
466	# remove font tag
467	print PROD "";
468	}
469	else
470	{
471	# print without changes
472	print PROD $token->as_is;
473	}
474	}
475	}
476	close (PROD) \|\| die("Error Closing File: $tmp_filename $!");
477
478	# run html-tidy on the tmp file to make it a proper XML file
479	my $tidyfile = `tidy -utf8 -wrap 0 -asxml $tmp_filename`;
480
481	# write result back to the tmp file
482	open (PROD, ">$tmp_filename") \|\| die("Error Writing to File: $tmp_filename $!");
483	print PROD $tidyfile;
484	close (PROD) \|\| die("Error Closing File: $tmp_filename $!");
485
486	# return the output filename
487	return $tmp_filename;
488	}
489
490	sub read_into_doc_obj
491	{
492	my $self = shift (@_);
493	my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
494
495	# check the process_exp and block_exp thing
496	my ($block_status,$filename) = $self->read_block(@_);
497	return $block_status if ((!defined $block_status) \|\| ($block_status==0));
498
499	# get the input file
500	my $input_filename = $file;
501	my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
502	$suffix = lc($suffix);
503
504	if (($self->{'tidy_html'}) \|\| ($self->{'old_style_HDL'}))
505	{
506	# because the document has to be sectionalized set the description tags
507	$self->{'description_tags'} = 1;
508
509	# set the file to be tidied
510	$input_filename = &util::filename_cat($base_dir,$file) if $base_dir =~ /\w/;
511
512	# get the tidied file
513	#my $tidy_filename = $self->tmp_tidy_file($input_filename);
514	my $tidy_filename = $self->convert_tidy_or_oldHDL_file($input_filename);
515
516	# derive tmp filename from input filename
517	my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($tidy_filename, "\\.[^\\.]+\$");
518
519	# set the new input file and base_dir to be from the tidied file
520	$file = "$tailname$suffix";
521	$base_dir = $dirname;
522	}
523
524	# call the parent read_into_doc_obj
525	my ($process_status,$doc_obj) = $self->SUPER::read_into_doc_obj($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli);
526
527	return ($process_status,$doc_obj);
528	}
529
530	sub new {
531	my ($class) = shift (@_);
532	my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
533	push(@$pluginlist, $class);
534
535	push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
536	push(@{$hashArgOptLists->{"OptList"}},$options);
537
538
539	my $self = new ReadTextFile($pluginlist,$inputargs,$hashArgOptLists);
540
541	if ($self->{'w3mir'}) {
542	$self->{'file_is_url'} = 1;
543	}
544	$self->{'aux_files'} = {};
545	$self->{'dir_num'} = 0;
546	$self->{'file_num'} = 0;
547
548	return bless $self, $class;
549	}
550
551	# may want to use (?i)\.(gif\|jpe?g\|jpe\|png\|css\|js(?:@.*)?)$
552	# if have eg <script language="javascript" src="img/lib.js@123">
553	sub get_default_block_exp {
554	my $self = shift (@_);
555
556	return q^(?i)\.(gif\|jpe?g\|jpe\|jpg\|png\|css)$^;
557	}
558
559	sub get_default_process_exp {
560	my $self = shift (@_);
561
562	# the last option is an attempt to encode the concept of an html query ...
563	return q^(?i)(\.html?\|\.shtml\|\.shm\|\.asp\|\.php\d?\|\.cgi\|.+\?.+=.*)$^;
564	}
565
566	sub store_block_files
567	{
568	my $self =shift (@_);
569	my ($filename) = @_;
570	my $html_fname = $filename;
571	my @file_blocks;
572
573	my ($language, $encoding) = $self->textcat_get_language_encoding ($filename);
574
575	# read in file ($text will be in utf8)
576	my $text = "";
577	$self->read_file ($filename, $encoding, $language, \$text);
578	my $textref = \$text;
579	my $opencom = '(?:<!--\|<!(?:—\|\|--))';
580	my $closecom = '(?:-->\|(?:—\|\|--)>)';
581	$$textref =~ s/$opencom(.*?)$closecom//gs;
582
583	my $attval = "\\\"[^\\\"]+\\\"\|[^\\s>]+";
584	my @img_matches = ($$textref =~ m/<img[^>]?src\s=\s($attval)[^>]>/igs);
585	my @usemap_matches = ($$textref =~ m/<img[^>]?usemap\s=\s($attval)[^>]>/igs);
586	my @link_matches = ($$textref =~ m/<link[^>]?href\s=\s($attval)[^>]>/igs);
587	my @embed_matches = ($$textref =~ m/<embed[^>]?src\s=\s($attval)[^>]>/igs);
588	my @tabbg_matches = ($$textref =~ m/<(?:table\|tr\|td)[^>]?background\s=\s($attval)[^>]>/igs);
589
590	foreach my $link (@img_matches, @usemap_matches, @link_matches, @embed_matches, @tabbg_matches) {
591
592	# remove quotes from link at start and end if necessary
593	if ($link=~/^\"/) {
594	$link=~s/^\"//;
595	$link=~s/\"$//;
596	}
597
598	$link =~ s/\#.*$//s; # remove any anchor names, e.g. foo.html#name becomes foo.html
599
600	if ($link !~ m@^/@ && $link !~ m/^([A-Z]:?)\\/) {
601	# Turn relative file path into full path
602	my $dirname = &File::Basename::dirname($filename);
603	$link = &util::filename_cat($dirname, $link);
604	}
605	$link = $self->eval_dir_dots($link);
606
607	$self->{'file_blocks'}->{$link} = 1;
608	}
609	}
610
611
612	# do plugin specific processing of doc_obj
613	sub process {
614	my $self = shift (@_);
615	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
616	my $outhandle = $self->{'outhandle'};
617
618	if ($ENV{'GSDLOS'} =~ /^windows/i) {
619	# this makes life so much easier... perl can cope with unix-style '/'s.
620	$base_dir =~ s@(\\)+@/@g;
621	$file =~ s@(\\)+@/@g;
622	}
623
624	# reset per-doc stuff...
625	$self->{'aux_files'} = {};
626	$self->{'dir_num'} = 0;
627	$self->{'file_num'} = 0;
628
629	# process an HTML file where sections are divided by headings tags (H1, H2 ...)
630	# you can also include metadata in the format (X can be any number)
631	# <hX>Title<!--gsdl-metadata
632	# <Metadata name="name1">value1</Metadata>
633	# ...
634	# <Metadata name="nameN">valueN</Metadata>
635	#--></hX>
636	if ($self->{'sectionalise_using_h_tags'}) {
637	# description_tags should allways be activated because we convert headings to description tags
638	$self->{'description_tags'} = 1;
639
640	my $arrSections = [];
641	$$textref =~ s/<h([0-9]+)[^>]>(.?)<\/h[0-9]+>/$self->process_heading($1, $2, $arrSections, $file)/isge;
642
643	if (scalar(@$arrSections)) {
644	my $strMetadata = $self->update_section_data($arrSections, -1);
645	if (length($strMetadata)) {
646	$strMetadata = '<!--' . $strMetadata . "\n-->\n</body>";
647	$$textref =~ s/<\/body>/$strMetadata/ig;
648	}
649	}
650	}
651
652	my $cursection = $doc_obj->get_top_section();
653
654	$self->extract_metadata ($textref, $metadata, $doc_obj, $cursection)
655	unless $self->{'no_metadata'} \|\| $self->{'description_tags'};
656
657	# Store URL for page as metadata - this can be used for an
658	# altavista style search interface. The URL won't be valid
659	# unless the file structure contains the domain name (i.e.
660	# like when w3mir is used to download a website).
661
662	# URL metadata (even invalid ones) are used to support internal
663	# links, so even if 'file_is_url' is off, still need to store info
664
665	my $utf8_file = $self->filename_to_utf8_metadata($file);
666	my $web_url = "http://$utf8_file";
667	$doc_obj->add_utf8_metadata($cursection, "URL", $web_url);
668
669	if ($self->{'file_is_url'}) {
670	$doc_obj->add_metadata($cursection, "weblink", "<a href=\"$web_url\">");
671	$doc_obj->add_metadata($cursection, "webicon", "_iconworld_");
672	$doc_obj->add_metadata($cursection, "/weblink", "</a>");
673	}
674
675	if ($self->{'description_tags'}) {
676	# remove the html header - note that doing this here means any
677	# sections defined within the header will be lost (so all <Section>
678	# tags must appear within the body of the HTML)
679	my ($head_keep) = ($$textref =~ m/^(.?)<body[^>]>/is);
680
681	$$textref =~ s/^.?<body[^>]>//is;
682	$$textref =~ s/(<\/body[^>]>\|<\/html[^>]>)//isg;
683
684	my $opencom = '(?:<!--\|<!(?:—\|\|--))';
685	my $closecom = '(?:-->\|(?:—\|\|--)>)';
686
687	my $lt = '(?:<\|<)';
688	my $gt = '(?:>\|>)';
689	my $quot = '(?:"\|"\|”\|“)';
690
691	my $dont_strip = '';
692	if ($self->{'no_strip_metadata_html'}) {
693	($dont_strip = $self->{'no_strip_metadata_html'}) =~ s{,}{\|}g;
694	}
695
696	my $found_something = 0; my $top = 1;
697	while ($$textref =~ s/^(.?)$opencom(.?)$closecom//s) {
698	my $text = $1;
699	my $comment = $2;
700	if (defined $text) {
701	# text before a comment - note that getting to here
702	# doesn't necessarily mean there are Section tags in
703	# the document
704	$self->process_section(\$text, $base_dir, $file, $doc_obj, $cursection);
705	}
706	while ($comment =~ s/$lt(.*?)$gt//s) {
707	my $tag = $1;
708	if ($tag eq "Section") {
709	$found_something = 1;
710	$cursection = $doc_obj->insert_section($doc_obj->get_end_child($cursection)) unless $top;
711	$top = 0;
712	} elsif ($tag eq "/Section") {
713	$found_something = 1;
714	$cursection = $doc_obj->get_parent_section ($cursection);
715	} elsif ($tag =~ /^Metadata name=$quot(.*?)$quot/s) {
716	my $metaname = $1;
717	my $accumulate = $tag =~ /mode=${quot}accumulate${quot}/ ? 1 : 0;
718	$comment =~ s/^(.*?)$lt\/Metadata$gt//s;
719	my $metavalue = $1;
720	$metavalue =~ s/^\s+//;
721	$metavalue =~ s/\s+$//;
722	# assume that no metadata value intentionally includes
723	# carriage returns or HTML tags (if they're there they
724	# were probably introduced when converting to HTML from
725	# some other format).
726	# actually some people want to have html tags in their
727	# metadata.
728	$metavalue =~ s/[\cJ\cM]/ /sg;
729	$metavalue =~ s/<[^>]+>//sg
730	unless $dont_strip && ($dont_strip eq 'all' \|\| $metaname =~ /^($dont_strip)$/);
731	$metavalue =~ s/\s+/ /sg;
732	if ($accumulate) {
733	$doc_obj->add_utf8_metadata($cursection, $metaname, $metavalue);
734	} else {
735	$doc_obj->set_utf8_metadata_element($cursection, $metaname, $metavalue);
736	}
737	} elsif ($tag eq "Description" \|\| $tag eq "/Description") {
738	# do nothing with containing Description tags
739	} else {
740	# simple HTML tag (probably created by the conversion
741	# to HTML from some other format) - we'll ignore it and
742	# hope for the best ;-)
743	}
744	}
745	}
746	if ($cursection ne "") {
747	print $outhandle "HTMLPlugin: WARNING: $file contains unmatched <Section></Section> tags\n";
748	}
749
750	$$textref =~ s/^.?<body[^>]>//is;
751	$$textref =~ s/(<\/body[^>]>\|<\/html[^>]>)//isg;
752	if ($$textref =~ /\S/) {
753	if (!$found_something) {
754	if ($self->{'verbosity'} > 2) {
755	print $outhandle "HTMLPlugin: WARNING: $file appears to contain no Section tags so\n";
756	print $outhandle " will be processed as a single section document\n";
757	}
758
759	# go ahead and process single-section document
760	$self->process_section($textref, $base_dir, $file, $doc_obj, $cursection);
761
762	# if document contains no Section tags we'll go ahead
763	# and extract metadata (this won't have been done
764	# above as the -description_tags option prevents it)
765	my $complete_text = $head_keep.$doc_obj->get_text($cursection);
766	$self->extract_metadata (\$complete_text, $metadata, $doc_obj, $cursection)
767	unless $self->{'no_metadata'};
768
769	} else {
770	print $outhandle "HTMLPlugin: WARNING: $file contains the following text outside\n";
771	print $outhandle " of the final closing </Section> tag. This text will\n";
772	print $outhandle " be ignored.";
773
774	my ($text);
775	if (length($$textref) > 30) {
776	$text = substr($$textref, 0, 30) . "...";
777	} else {
778	$text = $$textref;
779	}
780	$text =~ s/\n/ /isg;
781	print $outhandle " ($text)\n";
782	}
783	} elsif (!$found_something) {
784
785	if ($self->{'verbosity'} > 2) {
786	# may get to here if document contained no valid Section
787	# tags but did contain some comments. The text will have
788	# been processed already but we should print the warning
789	# as above and extract metadata
790	print $outhandle "HTMLPlugin: WARNING: $file appears to contain no Section tags and\n";
791	print $outhandle " is blank or empty. Metadata will be assigned if present.\n";
792	}
793
794	my $complete_text = $head_keep.$doc_obj->get_text($cursection);
795	$self->extract_metadata (\$complete_text, $metadata, $doc_obj, $cursection)
796	unless $self->{'no_metadata'};
797	}
798
799	} else {
800
801	# remove header and footer
802	if (!$self->{'keep_head'} \|\| $self->{'description_tags'}) {
803	$$textref =~ s/^.?<body[^>]>//is;
804	$$textref =~ s/(<\/body[^>]>\|<\/html[^>]>)//isg;
805	}
806
807	# single section document
808	$self->process_section($textref, $base_dir, $file, $doc_obj, $cursection);
809	}
810	return 1;
811	}
812
813
814	sub process_heading
815	{
816	my ($self, $nHeadNo, $strHeadingText, $arrSections, $file) = @_;
817	$strHeadingText = '' if (!defined($strHeadingText));
818
819	my $strMetadata = $self->update_section_data($arrSections, int($nHeadNo));
820
821	my $strSecMetadata = '';
822	while ($strHeadingText =~ s/<!--gsdl-metadata(.*?)-->//is)
823	{
824	$strSecMetadata .= $1;
825	}
826
827	$strHeadingText =~ s/^\s+//g;
828	$strHeadingText =~ s/\s+$//g;
829	$strSecMetadata =~ s/^\s+//g;
830	$strSecMetadata =~ s/\s+$//g;
831
832	$strMetadata .= "\n<Section>\n\t<Description>\n\t\t<Metadata name=\"Title\">" . $strHeadingText . "</Metadata>\n";
833
834	if (length($strSecMetadata)) {
835	$strMetadata .= "\t\t" . $strSecMetadata . "\n";
836	}
837
838	$strMetadata .= "\t</Description>\n";
839
840	return "<!--" . $strMetadata . "-->";
841	}
842
843
844	sub update_section_data
845	{
846	my ($self, $arrSections, $nCurTocNo) = @_;
847	my ($strBuffer, $nLast, $nSections) = ('', 0, scalar(@$arrSections));
848
849	if ($nSections == 0) {
850	push @$arrSections, $nCurTocNo;
851	return $strBuffer;
852	}
853	$nLast = $arrSections->[$nSections - 1];
854	if ($nCurTocNo > $nLast) {
855	push @$arrSections, $nCurTocNo;
856	return $strBuffer;
857	}
858	for(my $i = $nSections - 1; $i >= 0; $i--) {
859	if ($nCurTocNo <= $arrSections->[$i]) {
860	$strBuffer .= "\n</Section>";
861	pop @$arrSections;
862	}
863	}
864	push @$arrSections, $nCurTocNo;
865	return $strBuffer;
866	}
867
868
869	# note that process_section may be called multiple times for a single
870	# section (relying on the fact that add_utf8_text appends the text to any
871	# that may exist already).
872	sub process_section {
873	my $self = shift (@_);
874	my ($textref, $base_dir, $file, $doc_obj, $cursection) = @_;
875	# trap links
876	if (!$self->{'nolinks'}) {
877	# usemap="./#index" not handled correctly => change to "#index"
878	$$textref =~ s/(<img[^>]?usemap\s=\s[\"\']?)([^\"\'>\s]+)([\"\']?[^>]>)/
879	$self->replace_usemap_links($1, $2, $3)/isge;
880
881	$$textref =~ s/(<(?:a\|area\|frame\|link\|script)\s+[^>]?\s(?:href\|src)\s=\s[\"\']?)([^\"\'>\s]+)([\"\']?[^>]*>)/
882	$self->replace_href_links ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
883	}
884
885	# trap images
886
887	# Previously, by default, HTMLPlugin would embed <img> tags inside anchor tags
888	# i.e. <a href="image><img src="image"></a> in order to overcome a problem that
889	# turned regular text succeeding images into links. That is, by embedding <imgs>
890	# inside <a href=""></a>, the text following images were no longer misbehaving.
891	# However, there would be many occasions whereby images were not meant to link
892	# to their source images but where the images would link to another web page.
893	# To allow this, the no_image_links option was introduced: it would prevent
894	# the behaviour of embedding images into links that referenced the source images.
895
896	# Somewhere along the line, the problem of normal text turning into links when
897	# such text followed images which were not embedded in <a href=""></a> ceased
898	# to occur. This is why the following lines have been commented out (as well as
899	# two lines in replace_images). They appear to no longer apply.
900
901	# If at any time, there is a need for having images embedded in <a> anchor tags,
902	# then it might be better to turn that into an HTMLPlugin option rather than make
903	# it the default behaviour. Also, eventually, no_image_links needs to become
904	# a deprecated option for HTMLPlugin as it has now become the default behaviour.
905
906	#if(!$self->{'no_image_links'}){
907	$$textref =~ s/(<(?:img\|embed\|table\|tr\|td)[^>]?(?:src\|background)\s=\s)([\"][^\"]+[\"]\|[\'][^\']+[\']\|[^\s\/>]+)([^>]>)/
908	$self->replace_images ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
909	#}
910
911	# add text to document object
912	# turn \ into \\ so that the rest of greenstone doesn't think there
913	# is an escape code following. (Macro parsing loses them...)
914	$$textref =~ s/\\/\\\\/go;
915
916	$doc_obj->add_utf8_text($cursection, $$textref);
917	}
918
919	sub replace_images {
920	my $self = shift (@_);
921	my ($front, $link, $back, $base_dir,
922	$file, $doc_obj, $section) = @_;
923
924	# remove quotes from link at start and end if necessary
925	if ($link=~/^[\"\']/) {
926	$link=~s/^[\"\']//;
927	$link=~s/[\"\']$//;
928	$front.='"';
929	$back="\"$back";
930	}
931
932	$link =~ s/\n/ /g;
933
934	# Hack to overcome Windows wv 0.7.1 bug that causes embedded images to be broken
935	# If the Word file path has spaces in it, wv messes up and you end up with
936	# absolute paths for the images, and without the "file://" prefix
937	# So check for this special case and massage the data to be correct
938	if ($ENV{'GSDLOS'} =~ /^windows/i && $self->{'plugin_type'} eq "WordPlug" && $link =~ /^[A-Za-z]\:\\/) {
939	$link =~ s/^.*\\([^\\]+)$/$1/;
940	}
941
942	my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
943
944	my $img_file = $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section);
945
946	my $anchor_name = $img_file;
947	#$anchor_name =~ s/^.*\///;
948	#$anchor_name = "<a name=\"$anchor_name\" ></a>";
949
950	my $image_link = $front . $img_file .$back;
951	return $image_link;
952
953	# The reasons for why the following two lines are no longer necessary can be
954	# found in subroutine process_section
955	#my $anchor_link = "<a href=\"$img_file\" >".$image_link."</a>";
956	#return $anchor_link;
957
958	#return $front . $img_file . $back . $anchor_name;
959	}
960
961	sub replace_href_links {
962	my $self = shift (@_);
963	my ($front, $link, $back, $base_dir, $file, $doc_obj, $section) = @_;
964
965	# attempt to sort out targets - frames are not handled
966	# well in this plugin and some cases will screw things
967	# up - e.g. the _parent target (so we'll just remove
968	# them all ;-)
969	$front =~ s/(target=\"?)_top(\"?)/$1_gsdltop_$2/is;
970	$back =~ s/(target=\"?)_top(\"?)/$1_gsdltop_$2/is;
971	$front =~ s/target=\"?_parent\"?//is;
972	$back =~ s/target=\"?_parent\"?//is;
973
974	return $front . $link . $back if $link =~ /^\#/s;
975	$link =~ s/\n/ /g;
976
977	my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
978	# href may use '\'s where '/'s should be on Windows
979	$href =~ s/\\/\//g;
980
981	my ($filename) = $href =~ /^(?:.?):(?:\/\/)?(.)/;
982
983
984	##### leave all these links alone (they won't be picked up by intermediate
985	##### pages). I think that's safest when dealing with frames, targets etc.
986	##### (at least until I think of a better way to do it). Problems occur with
987	##### mailto links from within small frames, the intermediate page is displayed
988	##### within that frame and can't be seen. There is still potential for this to
989	##### happen even with html pages - the solution seems to be to somehow tell
990	##### the browser from the server side to display the page being sent (i.e.
991	##### the intermediate page) in the top level window - I'm not sure if that's
992	##### possible - the following line should probably be deleted if that can be done
993	return $front . $link . $back if $href =~ /^(mailto\|news\|gopher\|nntp\|telnet\|javascript):/is;
994
995
996	if (($rl == 0) \|\| ($filename =~ /$self->{'process_exp'}/) \|\|
997	($href =~ /\/$/) \|\| ($href =~ /^(mailto\|news\|gopher\|nntp\|telnet\|javascript):/i)) {
998	&ghtml::urlsafe ($href);
999	return $front . "_httpextlink_&rl=" . $rl . "&href=" . $href . $hash_part . $back;
1000	} else {
1001	# link is to some other type of file (eg image) so we'll
1002	# need to associate that file
1003	return $front . $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section) . $back;
1004	}
1005	}
1006
1007	sub add_file {
1008	my $self = shift (@_);
1009	my ($href, $rl, $hash_part, $base_dir, $doc_obj, $section) = @_;
1010	my ($newname);
1011
1012	my $filename = $href;
1013	if ($base_dir eq "") {
1014	# remove http:/ thereby leaving one slash at the start
1015	$filename =~ s/^[^:]*:\///;
1016	}
1017	else {
1018	# remove http://
1019	$filename =~ s/^[^:]*:\/\///;
1020	}
1021
1022	$filename = &util::filename_cat($base_dir, $filename);
1023
1024	# Replace %20's in URL with a space if required. Note that the filename
1025	# may include the %20 in some situations
1026	if ($filename =~ /\%20/) {
1027	if (!-e $filename) {
1028	$filename =~ s/\%20/ /g;
1029	}
1030	}
1031
1032	my ($ext) = $filename =~ /(\.[^\.]*)$/;
1033
1034	if ($rl == 0) {
1035	if ((!defined $ext) \|\| ($ext !~ /$self->{'assoc_files'}/)) {
1036	return "_httpextlink_&rl=0&el=prompt&href=" . $href . $hash_part;
1037	}
1038	else {
1039	return "_httpextlink_&rl=0&el=direct&href=" . $href . $hash_part;
1040	}
1041	}
1042
1043	if ((!defined $ext) \|\| ($ext !~ /$self->{'assoc_files'}/)) {
1044	return "_httpextlink_&rl=" . $rl . "&href=" . $href . $hash_part;
1045	}
1046	if ($self->{'rename_assoc_files'}) {
1047	if (defined $self->{'aux_files'}->{$href}) {
1048	$newname = $self->{'aux_files'}->{$href}->{'dir_num'} . "/" .
1049	$self->{'aux_files'}->{$href}->{'file_num'} . $ext;
1050	} else {
1051	$newname = $self->{'dir_num'} . "/" . $self->{'file_num'} . $ext;
1052	$self->{'aux_files'}->{$href} = {'dir_num' => $self->{'dir_num'}, 'file_num' => $self->{'file_num'}};
1053	$self->inc_filecount ();
1054	}
1055	$doc_obj->associate_file($filename, $newname, undef, $section);
1056	return "_httpdocimg_/$newname";
1057	} else {
1058	($newname) = $filename =~ /([^\/\\]*)$/;
1059	$doc_obj->associate_file($filename, $newname, undef, $section);
1060	return "_httpdocimg_/$newname";
1061	}
1062	}
1063
1064
1065	sub format_link {
1066	my $self = shift (@_);
1067	my ($link, $base_dir, $file) = @_;
1068
1069	my ($before_hash, $hash_part) = $link =~ /^([^\#])(\#?.)$/;
1070
1071	$hash_part = "" if !defined $hash_part;
1072	if (!defined $before_hash \|\| $before_hash !~ /[\w\.\/]/) {
1073	my $outhandle = $self->{'outhandle'};
1074	print $outhandle "HTMLPlugin: ERROR - badly formatted tag ignored ($link)\n"
1075	if $self->{'verbosity'};
1076	return ($link, "", 0);
1077	}
1078
1079	if ($before_hash =~ s@^((?:http\|ftp\|file)://)@@i) {
1080	my $type = $1;
1081
1082	if ($link =~ /^(http\|ftp):/i) {
1083	# Turn url (using /) into file name (possibly using \ on windows)
1084	my @http_dir_split = split('/', $before_hash);
1085	$before_hash = &util::filename_cat(@http_dir_split);
1086	}
1087
1088	$before_hash = $self->eval_dir_dots($before_hash);
1089
1090	my $linkfilename = &util::filename_cat ($base_dir, $before_hash);
1091
1092	my $rl = 0;
1093	$rl = 1 if (-e $linkfilename);
1094
1095	# make sure there's a slash on the end if it's a directory
1096	if ($before_hash !~ /\/$/) {
1097	$before_hash .= "/" if (-d $linkfilename);
1098	}
1099
1100	return ($type . $before_hash, $hash_part, $rl);
1101
1102	} elsif ($link !~ /^(mailto\|news\|gopher\|nntp\|telnet\|javascript):/i && $link !~ /^\//) {
1103	if ($before_hash =~ s@^/@@ \|\| $before_hash =~ /\\/) {
1104
1105	# the first directory will be the domain name if file_is_url
1106	# to generate archives, otherwise we'll assume all files are
1107	# from the same site and base_dir is the root
1108
1109	if ($self->{'file_is_url'}) {
1110	my @dirs = split /[\/\\]/, $file;
1111	my $domname = shift (@dirs);
1112	$before_hash = &util::filename_cat($domname, $before_hash);
1113	$before_hash =~ s@\\@/@g; # for windows
1114	}
1115	else
1116	{
1117	# see if link shares directory with source document
1118	# => turn into relative link if this is so!
1119
1120	if ($ENV{'GSDLOS'} =~ /^windows/i) {
1121	# too difficult doing a pattern match with embedded '\'s...
1122	my $win_before_hash=$before_hash;
1123	$win_before_hash =~ s@(\\)+@/@g;
1124	# $base_dir is already similarly "converted" on windows.
1125	if ($win_before_hash =~ s@^$base_dir/@@o) {
1126	# if this is true, we removed a prefix
1127	$before_hash=$win_before_hash;
1128	}
1129	}
1130	else {
1131	# before_hash has lost leading slash by this point,
1132	# -> add back in prior to substitution with $base_dir
1133	$before_hash = "/$before_hash";
1134
1135	$before_hash = &util::filename_cat("",$before_hash);
1136	$before_hash =~ s@^$base_dir/@@;
1137	}
1138	}
1139	} else {
1140	# Turn relative file path into full path
1141	my $dirname = &File::Basename::dirname($file);
1142	$before_hash = &util::filename_cat($dirname, $before_hash);
1143	$before_hash = $self->eval_dir_dots($before_hash);
1144	}
1145
1146	my $linkfilename = &util::filename_cat ($base_dir, $before_hash);
1147	# make sure there's a slash on the end if it's a directory
1148	if ($before_hash !~ /\/$/) {
1149	$before_hash .= "/" if (-d $linkfilename);
1150	}
1151	return ("http://" . $before_hash, $hash_part, 1);
1152	} else {
1153	# mailto, news, nntp, telnet, javascript or gopher link
1154	return ($before_hash, "", 0);
1155	}
1156	}
1157
1158	sub extract_first_NNNN_characters {
1159	my $self = shift (@_);
1160	my ($textref, $doc_obj, $thissection) = @_;
1161
1162	foreach my $size (split /,/, $self->{'first'}) {
1163	my $tmptext = $$textref;
1164	# skip to the body
1165	$tmptext =~ s/.<body[^>]>//i;
1166	# remove javascript
1167	$tmptext =~ s@<script.*?</script>@ @sig;
1168	$tmptext =~ s/<[^>]*>/ /g;
1169	$tmptext =~ s/ / /g;
1170	$tmptext =~ s/^\s+//;
1171	$tmptext =~ s/\s+$//;
1172	$tmptext =~ s/\s+/ /gs;
1173	$tmptext = &unicode::substr ($tmptext, 0, $size);
1174	$tmptext =~ s/\s\S*$/…/; # adds an ellipse (...)
1175	$doc_obj->add_utf8_metadata ($thissection, "First$size", $tmptext);
1176	}
1177	}
1178
1179
1180	sub extract_metadata {
1181	my $self = shift (@_);
1182	my ($textref, $metadata, $doc_obj, $section) = @_;
1183	my $outhandle = $self->{'outhandle'};
1184	# if we don't want metadata, we may as well not be here ...
1185	return if (!defined $self->{'metadata_fields'});
1186
1187	# metadata fields to extract/save. 'key' is the (lowercase) name of the
1188	# html meta, 'value' is the metadata name for greenstone to use
1189	my %find_fields = ();
1190
1191	my %creator_fields = (); # short-cut for lookups
1192
1193
1194	foreach my $field (split /,/, $self->{'metadata_fields'}) {
1195	$field =~ s/^\s+//; # remove leading whitespace
1196	$field =~ s/\s+$//; # remove trailing whitespace
1197
1198	# support tag<tagname>
1199	if ($field =~ /^(.?)<(.?)>$/) {
1200	# "$2" is the user's preferred gs metadata name
1201	$find_fields{lc($1)}=$2; # lc = lowercase
1202	} else { # no <tagname> for mapping
1203	# "$field" is the user's preferred gs metadata name
1204	$find_fields{lc($field)}=$field; # lc = lowercase
1205	}
1206	}
1207
1208	if (defined $self->{'hunt_creator_metadata'} &&
1209	$self->{'hunt_creator_metadata'} == 1 ) {
1210	my @extra_fields =
1211	(
1212	'author',
1213	'author.email',
1214	'creator',
1215	'dc.creator',
1216	'dc.creator.corporatename',
1217	);
1218
1219	# add the creator_metadata fields to search for
1220	foreach my $field (@extra_fields) {
1221	$creator_fields{$field}=0; # add to lookup hash
1222	}
1223	}
1224
1225
1226	# find the header in the html file, which has the meta tags
1227	$$textref =~ m@<head>(.*?)</head>@si;
1228
1229	my $html_header=$1;
1230
1231	# go through every <meta... tag defined in the html and see if it is
1232	# one of the tags we want to match.
1233
1234	# special case for title - we want to remember if its been found
1235	my $found_title = 0;
1236	# this assumes that ">" won't appear. (I don't think it's allowed to...)
1237	$html_header =~ /^/; # match the start of the string, for \G assertion
1238
1239	while ($html_header =~ m/\G.?<meta(.?)>/sig) {
1240	my $metatag=$1;
1241	my ($tag, $value);
1242
1243	# find the tag name
1244	$metatag =~ /(?:name\|http-equiv)\s=\s([\"\'])?(.*?)\1/is;
1245	$tag=$2;
1246	# in case they're not using " or ', but they should...
1247	if (! $tag) {
1248	$metatag =~ /(?:name\|http-equiv)\s=\s([^\s\>]+)/is;
1249	$tag=$1;
1250	}
1251
1252	if (!defined $tag) {
1253	print $outhandle "HTMLPlugin: can't find NAME in \"$metatag\"\n";
1254	next;
1255	}
1256
1257	# don't need to assign this field if it was passed in from a previous
1258	# (recursive) plugin
1259	if (defined $metadata->{$tag}) {next}
1260
1261	# find the tag content
1262	$metatag =~ /content\s=\s([\"\'])?(.*?)\1/is;
1263	$value=$2;
1264
1265	if (! $value) {
1266	$metatag =~ /(?:name\|http-equiv)\s=\s([^\s\>]+)/is;
1267	$value=$1;
1268	}
1269	if (!defined $value) {
1270	print $outhandle "HTMLPlugin: can't find VALUE in \"$metatag\"\n";
1271	next;
1272	}
1273
1274	# clean up and add
1275	$value =~ s/\s+/ /gs;
1276	chomp($value); # remove trailing \n, if any
1277	if (exists $creator_fields{lc($tag)}) {
1278	# map this value onto greenstone's "Creator" metadata
1279	$tag='Creator';
1280	} elsif (!exists $find_fields{lc($tag)}) {
1281	next; # don't want this tag
1282	} else {
1283	# get the user's preferred capitalisation
1284	$tag = $find_fields{lc($tag)};
1285	}
1286	if (lc($tag) eq "title") {
1287	$found_title = 1;
1288	}
1289	print $outhandle " extracted \"$tag\" metadata \"$value\"\n"
1290	if ($self->{'verbosity'} > 2);
1291	if ($tag =~ /date.*/i){
1292	$tag = lc($tag);
1293	}
1294	$doc_obj->add_utf8_metadata($section, $tag, $value);
1295
1296	}
1297
1298	# TITLE: extract the document title
1299	if (exists $find_fields{'title'} && !$found_title) {
1300	# we want a title, and didn't find one in the meta tags
1301	# see if there's a <title> tag
1302	my $title;
1303	my $from = ""; # for debugging output only
1304	if ($html_header =~ /<title[^>]>([^<]+)<\/title[^>]>/is) {
1305	$title = $1;
1306	$from = "<title> tags";
1307	}
1308
1309	if (!defined $title) {
1310	$from = "first 100 chars";
1311	# if no title use first 100 or so characters
1312	$title = $$textref;
1313	$title =~ s/^\xFE\xFF//; # Remove unicode byte order mark
1314	$title =~ s/^.*?<body>//si;
1315	# ignore javascript!
1316	$title =~ s@<script.*?</script>@ @sig;
1317	$title =~ s/<\/([^>]+)><\1>//g; # (eg) </b><b> - no space
1318	$title =~ s/<[^>]*>/ /g; # remove all HTML tags
1319	$title = substr ($title, 0, 100);
1320	$title =~ s/\s\S*$/.../;
1321	}
1322	$title =~ s/<[^>]*>/ /g; # remove html tags
1323	$title =~ s/ / /g;
1324	$title =~ s/(?: \|\xc2\xa0)/ /g; # utf-8 for nbsp...
1325	$title =~ s/\s+/ /gs; # collapse multiple spaces
1326	$title =~ s/^\s*//; # remove leading spaces
1327	$title =~ s/\s*$//; # remove trailing spaces
1328
1329	$title =~ s/^$self->{'title_sub'}// if ($self->{'title_sub'});
1330	$title =~ s/^\s+//s; # in case title_sub introduced any...
1331	$doc_obj->add_utf8_metadata ($section, 'Title', $title);
1332	print $outhandle " extracted Title metadata \"$title\" from $from\n"
1333	if ($self->{'verbosity'} > 2);
1334	}
1335
1336	# add FileFormat metadata
1337	$doc_obj->add_metadata($section,"FileFormat", "HTML");
1338
1339	# Special, for metadata names such as tagH1 - extracts
1340	# the text between the first <H1> and </H1> tags into "H1" metadata.
1341
1342	foreach my $field (keys %find_fields) {
1343	if ($field !~ /^tag([a-z0-9]+)$/i) {next}
1344	my $tag = $1;
1345	if ($$textref =~ m@<$tag[^>]>(.?)</$tag[^>]*>@g) {
1346	my $content = $1;
1347	$content =~ s/ / /g;
1348	$content =~ s/<[^>]*>/ /g;
1349	$content =~ s/^\s+//;
1350	$content =~ s/\s+$//;
1351	$content =~ s/\s+/ /gs;
1352	if ($content) {
1353	$tag=$find_fields{"tag$tag"}; # get the user's capitalisation
1354	$tag =~ s/^tag//i;
1355	$doc_obj->add_utf8_metadata ($section, $tag, $content);
1356	print $outhandle " extracted \"$tag\" metadata \"$content\"\n"
1357	if ($self->{'verbosity'} > 2);
1358	}
1359	}
1360	}
1361	}
1362
1363
1364	# evaluate any "../" to next directory up
1365	# evaluate any "./" as here
1366	sub eval_dir_dots {
1367	my $self = shift (@_);
1368	my ($filename) = @_;
1369	my $dirsep_os = &util::get_os_dirsep();
1370	my @dirsep = split(/$dirsep_os/,$filename);
1371
1372	my @eval_dirs = ();
1373	foreach my $d (@dirsep) {
1374	if ($d eq "..") {
1375	pop(@eval_dirs);
1376
1377	} elsif ($d eq ".") {
1378	# do nothing!
1379
1380	} else {
1381	push(@eval_dirs,$d);
1382	}
1383	}
1384
1385	# Need to fiddle with number of elements in @eval_dirs if the
1386	# first one is the empty string. This is because of a
1387	# modification to util::filename_cat that supresses the addition
1388	# of a leading '/' character (or \ if windows) (intended to help
1389	# filename cat with relative paths) if the first entry in the
1390	# array is the empty string. Making the array start with two
1391	# empty strings is a way to defeat this "smart" option.
1392	#
1393	if (scalar(@eval_dirs) > 0) {
1394	if ($eval_dirs[0] eq ""){
1395	unshift(@eval_dirs,"");
1396	}
1397	}
1398	return &util::filename_cat(@eval_dirs);
1399	}
1400
1401	sub replace_usemap_links {
1402	my $self = shift (@_);
1403	my ($front, $link, $back) = @_;
1404
1405	$link =~ s/^\.\///;
1406	return $front . $link . $back;
1407	}
1408
1409	sub inc_filecount {
1410	my $self = shift (@_);
1411
1412	if ($self->{'file_num'} == 1000) {
1413	$self->{'dir_num'} ++;
1414	$self->{'file_num'} = 0;
1415	} else {
1416	$self->{'file_num'} ++;
1417	}
1418	}
1419
1420
1421	# Extend read_file so that strings like é are
1422	# converted to UTF8 internally.
1423	#
1424	# We don't convert < or > or & or " in case
1425	# they interfere with the GML files
1426
1427	sub read_file {
1428	my $self = shift(@_);
1429	my ($filename, $encoding, $language, $textref) = @_;
1430
1431	$self->SUPER::read_file($filename, $encoding, $language, $textref);
1432
1433	# Convert entities to their UTF8 equivalents
1434	$$textref =~ s/&(lt\|gt\|amp\|quot\|nbsp);/&z$1;/go;
1435	$$textref =~ s/&([^;]+);/&ghtml::getcharequiv($1,1)/gseo;
1436	$$textref =~ s/&z(lt\|gt\|amp\|quot\|nbsp);/&$1;/go;
1437	}
1438
1439	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: