Context Navigation

source: gsdl/branches/gsdl-2.74/perllib/plugins/HTMLPlug.pm@ 14270

Last change on this file since 14270 was 14270, checked in by oranfry, 17 years ago
merged selected changes to the gsdl trunk since r14217 into the 2.74 branch
Property svn:executable set to `*`
Property svn:keywords set to `Author Date Id Revision`
File size: 46.2 KB

Line
1	###########################################################################
2	#
3	# HTMLPlug.pm -- basic html plugin
4	#
5	# A component of the Greenstone digital library software
6	# from the New Zealand Digital Library Project at the
7	# University of Waikato, New Zealand.
8	#
9	# Copyright (C) 1999 New Zealand Digital Library Project
10	#
11	# This program is free software; you can redistribute it and/or modify
12	# it under the terms of the GNU General Public License as published by
13	# the Free Software Foundation; either version 2 of the License, or
14	# (at your option) any later version.
15	#
16	# This program is distributed in the hope that it will be useful,
17	# but WITHOUT ANY WARRANTY; without even the implied warranty of
18	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	# GNU General Public License for more details.
20	#
21	# You should have received a copy of the GNU General Public License
22	# along with this program; if not, write to the Free Software
23	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24	#
25	###########################################################################
26
27	#
28	# Note that this plugin handles frames only in a very simple way
29	# i.e. each frame is treated as a separate document. This means
30	# search results will contain links to individual frames rather
31	# than linking to the top level frameset.
32	# There may also be some problems caused by the _parent target
33	# (it's removed by this plugin)
34	#
35
36	package HTMLPlug;
37
38	use BasPlug;
39	use ghtml;
40	use unicode;
41	use util;
42	use XMLParser;
43
44	use Image::Size;
45
# Set up inheritance at compile time: HTMLPlug derives from BasPlug.
BEGIN {
    @HTMLPlug::ISA = ('BasPlug');
}
49
50	use strict; # every perl program should have this!
51	no strict 'refs'; # make an exception so we can use variables as filehandles
52
# Argument descriptors for HTMLPlug. new() appends this list to the
# shared "ArgList" it is given. Each entry is a hash with a name, a
# description key, a type and, where applicable, a default ('deft').
my $arguments = [
    { name => "process_exp",
      desc => "{BasPlug.process_exp}",
      type => "regexp",
      deft => get_default_process_exp() },
    { name => "block_exp",
      desc => "{BasPlug.block_exp}",
      type => 'regexp',
      deft => get_default_block_exp() },
    { name => "nolinks",
      desc => "{HTMLPlug.nolinks}",
      type => "flag" },
    { name => "keep_head",
      desc => "{HTMLPlug.keep_head}",
      type => "flag" },
    { name => "no_metadata",
      desc => "{HTMLPlug.no_metadata}",
      type => "flag" },
    { name => "metadata_fields",
      desc => "{HTMLPlug.metadata_fields}",
      type => "string",
      deft => "Title" },
    { name => "hunt_creator_metadata",
      desc => "{HTMLPlug.hunt_creator_metadata}",
      type => "flag" },
    { name => "file_is_url",
      desc => "{HTMLPlug.file_is_url}",
      type => "flag" },
    # the default for assoc_files is the same expression as block_exp
    { name => "assoc_files",
      desc => "{HTMLPlug.assoc_files}",
      type => "regexp",
      deft => get_default_block_exp() },
    { name => "rename_assoc_files",
      desc => "{HTMLPlug.rename_assoc_files}",
      type => "flag" },
    { name => "title_sub",
      desc => "{HTMLPlug.title_sub}",
      type => "string",
      deft => "" },
    { name => "description_tags",
      desc => "{HTMLPlug.description_tags}",
      type => "flag" },
    # retain this for backward compatibility (w3mir option was replaced by
    # file_is_url)
    { name => "w3mir",
      # 'desc' => "{HTMLPlug.w3mir}",
      type => "flag",
      hiddengli => "yes" },
    { name => "no_strip_metadata_html",
      desc => "{HTMLPlug.no_strip_metadata_html}",
      type => "string",
      deft => "",
      reqd => "no" },
    { name => "sectionalise_using_h_tags",
      desc => "{HTMLPlug.sectionalise_using_h_tags}",
      type => "flag" },
    { name => "tidy_html",
      desc => "{HTMLPlug.tidy_html}",
      type => "flag" },
    { name => "old_style_HDL",
      desc => "{HTMLPlug.old_style_HDL}",
      type => "flag" },
];

# Plugin descriptor; new() appends it to the shared "OptList".
my $options = {
    name     => "HTMLPlug",
    desc     => "{HTMLPlug.desc}",
    abstract => "no",
    inherits => "yes",
    args     => $arguments,
};
122
123
# Read an HTML file into $$text via HB_gettext, keeping only the part
# that follows the <body> tag.
#
# Parameters:
#   $htmlfile - path of the file to read
#   $text     - scalar reference the file text is appended to
#
# Returns nothing. If the file cannot be opened, an error is printed to
# STDERR and $$text is left untouched.
sub HB_read_html_file {
    my $self = shift (@_);
    my ($htmlfile, $text) = @_;

    # load in the file (three-argument open with a lexical handle, so an
    # odd file name cannot be mistaken for an open mode)
    my $fh;
    if (!open ($fh, '<', $htmlfile)) {
        print STDERR "ERROR - could not open $htmlfile\n";
        return;
    }

    my $foundbody = 0;
    $self->HB_gettext (\$foundbody, $text, $fh);
    close ($fh);

    # just in case there was no <body> tag: read the file again with the
    # flag already set, so that every line is kept
    if (!$foundbody) {
        $foundbody = 1;
        open ($fh, '<', $htmlfile) or return;
        $self->HB_gettext (\$foundbody, $text, $fh);
        close ($fh);
    }
    # any encoding conversion has been done by HB_gettext
}
147
# Append the contents of an open file handle to $$text.
#
# Parameters:
#   $foundbody - scalar reference to a flag; while it is false, lines are
#                skipped until one containing a <body ...> tag is seen
#                (everything up to and including the tag is dropped and
#                the flag is set)
#   $text      - scalar reference the text is appended to
#   $handle    - file handle to read from
#
# After reading, the whole of $$text is converted from iso_8859_1 to
# utf8 when that is the plugin's input_encoding, character entities are
# converted by ghtml::convertcharentities (e.g. &eacute;), and every run
# of whitespace is collapsed to a single space.
sub HB_gettext {
    my $self = shift (@_);
    my ($foundbody, $text, $handle) = @_;

    while (defined (my $line = <$handle>)) {
        # look for body tag
        if (!$$foundbody) {
            if ($line =~ s/^.*<body[^>]*>//i) {
                $$foundbody = 1;
            } else {
                next;
            }
        }

        # check for symbol fonts
        # NOTE(review): the message says "removed", but the line is
        # appended unchanged below - only the warning is produced here.
        if ($line =~ /<font [^>]*?face\s*=\s*\"?(\w+)\"?/i) {
            my $font = $1;
            print STDERR "HBPlug::HB_gettext - warning removed font $font\n"
                if ($font !~ /^arial$/i);
        }

        $$text .= $line;
    }

    if (defined $self->{'input_encoding'} && $self->{'input_encoding'} eq "iso_8859_1") {
        # convert to utf-8
        $$text = &unicode::unicode2utf8(&unicode::convert2unicode("iso_8859_1", $text));
    }
    # convert any alphanumeric character entities to their utf-8
    # equivalent for indexing purposes
    &ghtml::convertcharentities ($$text);

    $$text =~ s/\s+/ /g; # remove \n's
}
184
# Tidy up the HTML of one section and return the cleaned text.
#
# Parameters:
#   $section - the section's HTML as a string
#
# Steps, in order: drop closing tags that have no opening tag, merge
# repeated <p> tags, strip trailing <u>/<i>/<b>/<p>/&nbsp;/whitespace,
# put a blank line before each paragraph, wrap at about 80 characters,
# and rewrite image references as centred <img> tags.
sub HB_clean_section {
    my $self = shift (@_);
    my ($section) = @_;

    # remove tags without a starting tag from the section
    my ($tag, $tagstart);
    while ($section =~ /<\/([^>]{1,10})>/) {
        $tag = $1;
        $tagstart = index($section, "<$tag");
        last if (($tagstart >= 0) && ($tagstart < index($section, "</$tag")));
        # \Q...\E: the tag name is arbitrary text from the document; left
        # unquoted, a regex metacharacter in it could stop the substitution
        # from matching and make this loop spin forever
        $section =~ s/<\/\Q$tag\E>//;
    }

    # remove extra paragraph tags
    while ($section =~ s/<p\b[^>]*>\s*<p\b/<p/ig) {}

    # remove extra stuff at the end of the section
    while ($section =~ s/(<u>|<i>|<b>|<p\b[^>]*>|&nbsp;|\s)$//i) {}

    # add a newline at the beginning of each paragraph
    $section =~ s/(.)\s*<p\b/$1\n\n<p/gi;

    # add a newline every 80 characters at a word boundary
    # Note: this regular expression puts a line feed before
    # the last word in each section, even when it is not
    # needed.
    $section =~ s/(.{1,80})\s/$1\n/g;

    # fix up the image links
    $section =~ s/<img[^>]*?src=\"?([^\">]+)\"?[^>]*>/\n<center><img src=\"$1\"><\/center><br>/ig;
    # the <<I>> image marker is accepted with literal or entity-encoded
    # angle brackets
    $section =~ s/(?:<|&lt;){2}I(?:>|&gt;){2}\s*([^\.]+\.(png|jpg|gif))/\n<center><img src=\"$1\"><\/center><br>/ig;

    return $section;
}
221
# Will convert the oldHDL format to the new HDL format (using the Section tag)
#
# Parameters:
#   $file   - path of the old-style HDL input file
#   $cnfile - path of the file to write the converted HTML to
#
# The input is read with HB_read_html_file and split at <<TOCn>> markers
# (n is the section level, the text up to the next <p is the title).
# Each section is written out preceded by <Section>/<Description>/
# <Metadata name="Title"> tags wrapped in HTML comments.
#
# Returns $cnfile. Dies if the output file cannot be opened or closed.
sub convert_to_newHDLformat
{
    my $self = shift (@_);
    my ($file,$cnfile) = @_;
    my $input_filename = $file;
    my $tmp_filename = $cnfile;

    # write HTML tmp file with new HDL format
    open (my $prod, '>', $tmp_filename) or die("Error Writing to File: $tmp_filename $!");

    # read in the file and do basic html cleaning (removing header etc)
    my $html = "";
    $self->HB_read_html_file ($input_filename, \$html);

    # the TOC marker brackets are accepted either literally or as
    # &lt; / &gt; entities (non-capturing, so group numbers are unchanged)
    my $open_mark  = '(?:<|&lt;){2}';
    my $close_mark = '(?:>|&gt;){2}';

    # process the file one section at a time
    my $curtoclevel = 1;
    my $firstsection = 1;
    my $toclevel = 0;
    while (length ($html) > 0) {
        if ($html =~ s/^.*?(?:<p\b[^>]*>)?((<b>|<i>|<u>|\s)*)${open_mark}TOC(\d+)$close_mark\s*(.*?)<p\b/<p/i) {
            $toclevel = $3;
            my $title = $4;
            my $sectiontext = "";
            # everything up to the next TOC marker belongs to this section;
            # the marker itself ($2) is put back for the next iteration
            if ($html =~ s/^(.*?)((?:<p\b[^>]*>)?((<b>|<i>|<u>|\s)*)${open_mark}TOC\d+$close_mark)/$2/i) {
                $sectiontext = $1;
            } else {
                $sectiontext = $html;
                $html = "";
            }

            # remove tags and extra spaces from the title
            $title =~ s/<\/?[^>]+>//g;
            $title =~ s/^\s+|\s+$//g;

            # close any sections below the current level and
            # create a new section (special case for the firstsection)
            print $prod "<!--\n";
            while (($curtoclevel > $toclevel) ||
                   (!$firstsection && $curtoclevel == $toclevel)) {
                $curtoclevel--;
                print $prod "</Section>\n";
            }
            if ($curtoclevel+1 < $toclevel) {
                print STDERR "WARNING - jump in toc levels in $input_filename " .
                    "from $curtoclevel to $toclevel\n";
            }
            while ($curtoclevel < $toclevel) {
                $curtoclevel++;
            }

            if ($curtoclevel == 1) {
                # add the header tag
                print $prod "-->\n";
                print $prod "<HTML>\n<HEAD>\n<TITLE>$title</TITLE>\n</HEAD>\n<BODY>\n";
                print $prod "<!--\n";
            }

            print $prod "<Section>\n\t<Description>\n\t\t<Metadata name=\"Title\">$title</Metadata>\n\t</Description>\n";

            print $prod "-->\n";

            # clean up the section html
            $sectiontext = $self->HB_clean_section($sectiontext);

            print $prod "$sectiontext\n";

        } else {
            print STDERR "WARNING - leftover text\n" , $self->shorten($html),
                "\nin $input_filename\n";
            last;
        }
        $firstsection = 0;
    }

    # close every section that is still open
    print $prod "<!--\n";
    while ($curtoclevel > 0) {
        $curtoclevel--;
        print $prod "</Section>\n";
    }
    print $prod "-->\n";

    close ($prod) or die("Error Closing File: $tmp_filename $!");

    return $tmp_filename;
}
308
# Return $text wrapped in double quotes for use in a message. Text of
# 100 characters or more is abbreviated to its first and last 50
# characters, each quoted, joined by " ... ".
sub shorten {
    my ($self, $text) = @_;

    my $len = length($text);
    if ($len >= 100) {
        my $head = substr ($text, 0, 50);
        my $tail = substr ($text, $len - 50);
        return "\"" . $head . "\" ... \"" . $tail . "\"";
    }

    return "\"$text\"";
}
318
# Copy one file byte-for-byte (private helper of
# convert_tidy_or_oldHDL_file). Dies if either file cannot be opened or
# the copy cannot be completed.
sub _copy_file_verbatim
{
    my ($src_file, $dest_file) = @_;

    open (my $in, '<', $src_file) or die "Can't open $src_file : $!";
    open (my $out, '>', $dest_file) or die "Can't open $dest_file : $!";
    # the files copied here include images, so avoid any newline translation
    binmode ($in);
    binmode ($out);
    {
        local $/ = \65536; # read fixed-size chunks instead of "lines"
        while (defined (my $chunk = <$in>)) {
            print $out $chunk;
        }
    }
    close ($in);
    close ($out) or die "Can't write $dest_file : $!";
}

# Prepare a working copy of $file under <GSDLCOLLECTDIR>/tidytmp and
# return the path of the file that should be processed instead.
#
# Parameters:
#   $file - full path of the input file (a directory is returned as is)
#
# For .htm/.html/.shtml input:
#   - with old_style_HDL set, the file is first converted by
#     convert_to_newHDLformat into "newHDL_<name>" in the tmp directory;
#   - all non-directory siblings of the input file that are not yet in
#     the tmp directory are copied there;
#   - with tidy_html set, the result is passed through tmp_tidy_file.
# Any other file is copied to the tmp directory if not already there.
#
# The part of the input path after "import/" is recreated under tidytmp.
# Dies if a directory or file cannot be opened.
sub convert_tidy_or_oldHDL_file
{
    my $self = shift (@_);
    my ($file) = @_;
    my $input_filename = $file;

    if (-d $input_filename)
    {
        return $input_filename;
    }

    # get the input filename
    my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
    my $base_dirname = $dirname;
    $suffix = lc($suffix);

    # derive tmp filename from input filename
    # Remove any white space from filename -- no risk of name collision, and
    # makes later conversion by utils simpler. Leave spaces in path...
    # tidy up the filename with space, dot, hyphen between
    $tailname =~ s/\s+//g;
    $tailname =~ s/\.+//g;
    $tailname =~ s/\-+//g;
    # convert to utf-8 otherwise we have problems with the doc.xml file
    # later on
    &unicode::ensure_utf8(\$tailname);

    # softlink to collection tmp dir
    my $tmp_dirname = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "tidytmp");
    &util::mk_dir($tmp_dirname) if (!-e $tmp_dirname);

    # mirror the directory structure found below "import/": take what
    # follows the first "import/" and create one tmp sub-directory per
    # "/"-terminated path component
    if ($dirname =~ m{import/(.*)}s)
    {
        my $test_dirname = $1;
        while ($test_dirname =~ s{^([^/]*)/}{})
        {
            my $folderdirname = $1;
            $tmp_dirname = &util::filename_cat($tmp_dirname,$folderdirname);
            &util::mk_dir($tmp_dirname) if (!-e $tmp_dirname);
        }
    }

    my $tmp_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix");

    # tidy or convert the input file if it is a HTML-like file or it is accepted by the process_exp
    if (($suffix eq ".htm") || ($suffix eq ".html") || ($suffix eq ".shtml"))
    {
        # convert the input file to a new style HDL
        my $hdl_output_filename = $input_filename;
        if ($self->{'old_style_HDL'})
        {
            $hdl_output_filename = &util::filename_cat($tmp_dirname, "newHDL_$tailname$suffix");
            $hdl_output_filename = $self->convert_to_newHDLformat($input_filename,$hdl_output_filename);
        }

        # just for checking copy all other file from the base dir to tmp dir if it is not exists
        opendir(my $dh, $base_dirname) or die "Can't open base directory : $base_dirname!";
        my @files = grep {!/^\.+$/} readdir($dh);
        # a directory handle must be released with closedir, not close
        closedir($dh);

        foreach my $entry (@files)
        {
            my $src_file = &util::filename_cat($base_dirname,$entry);
            my $dest_file = &util::filename_cat($tmp_dirname,$entry);
            if ((!-e $dest_file) && (!-d $src_file))
            {
                # just copy the original file back to the tmp directory
                _copy_file_verbatim($src_file, $dest_file);
            }
        }

        # tidy the input file
        my $tidy_output_filename = $hdl_output_filename;
        if ($self->{'tidy_html'})
        {
            $tidy_output_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix");
            $tidy_output_filename = $self->tmp_tidy_file($hdl_output_filename,$tidy_output_filename);
        }
        $tmp_filename = $tidy_output_filename;
    }
    else
    {
        if (!-e $tmp_filename)
        {
            # just copy the original file back to the tmp directory
            _copy_file_verbatim($input_filename, $tmp_filename);
        }
    }

    return $tmp_filename;
}
440
441
# Will make the html input file as a proper XML file with removed font tag and
# image size added to the img tag.
# The tidying process takes place in a collection specific 'tmp' directory so
# that we don't accidentally damage the input.
#
# Parameters:
#   $file   - path of the HTML file to read
#   $cnfile - path of the tmp file to write
#
# Returns $cnfile. Dies if the input cannot be parsed or the tmp file
# cannot be written. If the external "tidy" program produces no output,
# the tmp file is left as written by the first pass and a warning is
# printed to STDERR.
sub tmp_tidy_file
{
    my $self = shift (@_);
    my ($file,$cnfile) = @_;
    my $input_filename = $file;
    my $tmp_filename = $cnfile;

    # get the input filename
    my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");

    require HTML::TokeParser::Simple;

    # create HTML parser to decode the input file
    my $parser = HTML::TokeParser::Simple->new($input_filename)
        or die("Error Reading File: $input_filename $!");

    # write HTML tmp file without the font tag and image size are added to the img tag
    open (my $prod, '>', $tmp_filename) or die("Error Writing to File: $tmp_filename $!");
    while (my $token = $parser->get_token())
    {
        if ($token->is_start_tag('img'))
        {
            # get the attributes
            my $attr = $token->return_attr;

            if (defined $attr->{src})
            {
                # get the full path to the image
                my $img_file = &util::filename_cat($dirname,$attr->{src});

                # set the width and height attribute, but only when the
                # size could be determined; an existing width/height is
                # kept rather than overwritten with an undefined value
                my ($width, $height) = imgsize($img_file);
                if (defined $width && defined $height)
                {
                    ($attr->{width}, $attr->{height}) = ($width, $height);
                }
            }

            # recreate the tag
            print $prod "<img";
            print $prod map { qq{ $_="$attr->{$_}"} } keys %$attr;
            print $prod ">";
        }
        elsif (($token->is_start_tag('font')) || ($token->is_end_tag('font')))
        {
            # remove font tag: nothing is written for it
        }
        else
        {
            # print without changes
            print $prod $token->as_is;
        }
    }
    close ($prod) or die("Error Closing File: $tmp_filename $!");

    # run html-tidy on the tmp file to make it a proper XML file
    # (the file name is quoted because the path may contain spaces)
    # NOTE(review): a file name containing a double quote would still
    # break this command line.
    my $tidyfile = `tidy -wrap 0 -asxml "$tmp_filename"`;

    if (defined $tidyfile && length($tidyfile))
    {
        # write result back to the tmp file
        open ($prod, '>', $tmp_filename) or die("Error Writing to File: $tmp_filename $!");
        print $prod $tidyfile;
        close ($prod) or die("Error Closing File: $tmp_filename $!");
    }
    else
    {
        # do not replace the file with nothing when tidy is missing or failed
        print STDERR "HTMLPlug: WARNING: tidy produced no output for $tmp_filename; leaving the file untidied\n";
    }

    # return the output filename
    return $tmp_filename;
}
510
# Read a file into a document object, optionally tidying/converting it
# first.
#
# Parameters (after $self): $pluginfo, $base_dir, $file, $metadata,
# $processor, $maxdocs, $total_count, $gli - passed to read_block and,
# with $base_dir/$file possibly replaced, to BasPlug::read_into_doc_obj.
#
# When tidy_html or old_style_HDL is set, the file is first run through
# convert_tidy_or_oldHDL_file and the resulting file (and its directory)
# is what the parent implementation reads.
#
# Returns the status from read_block when that is undefined or 0;
# otherwise the ($process_status, $doc_obj) pair returned by the parent.
sub read_into_doc_obj
{
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;

    # check the process_exp and block_exp thing
    my ($block_status) = $self->read_block(@_);
    return $block_status if ((!defined $block_status) || ($block_status==0));

    if (($self->{'tidy_html'}) || ($self->{'old_style_HDL'}))
    {
        # set the file to be tidied
        my $input_filename = $file;
        $input_filename = &util::filename_cat($base_dir,$file) if $base_dir =~ /\w/;

        # get the tidied file
        my $tidy_filename = $self->convert_tidy_or_oldHDL_file($input_filename);

        # derive tmp filename from input filename
        my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($tidy_filename, "\\.[^\\.]+\$");

        # set the new input file and base_dir to be from the tidied file
        $file = "$tailname$suffix";
        $base_dir = $dirname;
    }

    # call the parent read_into_doc_obj
    my ($process_status,$doc_obj) = &BasPlug::read_into_doc_obj($self,$pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli);

    return ($process_status,$doc_obj);
}
547
# Constructor.
#
# Parameters:
#   $pluginlist      - array reference; this class name is appended to it
#   $inputargs       - passed through to the BasPlug constructor
#   $hashArgOptLists - hash reference; this plugin's argument list is
#                      appended to its "ArgList" entry and its option
#                      descriptor to its "OptList" entry
#
# Returns the object created by BasPlug, re-blessed into $class, with
# the per-document counters initialised. The deprecated w3mir flag is
# mapped onto file_is_url.
sub new {
    my $class = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    if (defined $arguments) {
        push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    }
    if (defined $options) {
        push(@{$hashArgOptLists->{"OptList"}}, $options);
    }

    my $self;
    if (defined $hashArgOptLists) {
        $self = BasPlug->new($pluginlist, $inputargs, $hashArgOptLists);
    } else {
        $self = BasPlug->new($pluginlist, $inputargs);
    }

    # w3mir is kept only for backward compatibility
    $self->{'file_is_url'} = 1 if ($self->{'w3mir'});

    $self->{'aux_files'} = {};
    $self->{'dir_num'} = 0;
    $self->{'file_num'} = 0;

    return bless $self, $class;
}
568
# Default expression for files this plugin blocks: names ending in an
# image or stylesheet extension (case-insensitive).
#
# may want to use (?i)\.(gif|jpe?g|jpe|png|css|js(?:@.*)?)$
# if have eg <script language="javascript" src="img/lib.js@123">
#
# Returns the expression as a string.
sub get_default_block_exp {
    my $self = shift (@_);

    return q^(?i)\.(gif|jpe?g|jpe|jpg|png|css)$^;
}
576
# Default expression for files this plugin processes: names ending in a
# common HTML or server-page extension (case-insensitive), or names that
# look like a query (something?key=value).
#
# Returns the expression as a string.
sub get_default_process_exp {
    my $self = shift (@_);

    # the last option is an attempt to encode the concept of an html query ...
    return q^(?i)(\.html?|\.shtml|\.shm|\.asp|\.php\d?|\.cgi|.+\?.+=.*)$^;
}
583
# Record every file referenced from an HTML file in
# $self->{'file_blocks'}.
#
# Parameters:
#   $filename - path of the HTML file to scan
#
# The file is read with read_file, comments are removed, and the targets
# of <img src>, <img usemap>, <link href>, <embed src> and
# <table|tr|td background> attributes are collected. Anchor parts
# (#name) are dropped, relative paths are resolved against the directory
# of $filename, and each path is normalised with eval_dir_dots before
# being stored as a key with value 1.
sub store_block_files
{
    my $self = shift (@_);
    my ($filename) = @_;

    my ($language, $encoding) = $self->textcat_get_language_encoding ($filename);

    # read in file ($text will be in utf8)
    my $text = "";
    $self->read_file ($filename, $encoding, $language, \$text);
    my $textref = \$text;

    # comment delimiters, either literal or written with entities
    my $opencom = '(?:<!--|&lt;!(?:&mdash;|&#151;|--))';
    my $closecom = '(?:-->|(?:&mdash;|&#151;|--)&gt;)';
    $$textref =~ s/$opencom(.*?)$closecom//gs;

    # an attribute value: a double-quoted string or a run of characters
    # up to whitespace or '>'
    my $attval = "\\\"[^\\\"]+\\\"|[^\\s>]+";
    my @img_matches = ($$textref =~ m/<img[^>]*?src\s*=\s*($attval)[^>]*>/igs);
    my @usemap_matches = ($$textref =~ m/<img[^>]*?usemap\s*=\s*($attval)[^>]*>/igs);
    my @link_matches = ($$textref =~ m/<link[^>]*?href\s*=\s*($attval)[^>]*>/igs);
    my @embed_matches = ($$textref =~ m/<embed[^>]*?src\s*=\s*($attval)[^>]*>/igs);
    my @tabbg_matches = ($$textref =~ m/<(?:table|tr|td)[^>]*?background\s*=\s*($attval)[^>]*>/igs);

    foreach my $link (@img_matches, @usemap_matches, @link_matches, @embed_matches, @tabbg_matches) {

        # remove quotes from link at start and end if necessary
        if ($link=~/^\"/) {
            $link=~s/^\"//;
            $link=~s/\"$//;
        }

        $link =~ s/\#.*$//s; # remove any anchor names, e.g. foo.html#name becomes foo.html

        if ($link !~ m@^/@ && $link !~ m/^([A-Z]:?)\\/) {
            # Turn relative file path into full path
            my $dirname = &File::Basename::dirname($filename);
            $link = &util::filename_cat($dirname, $link);
        }
        $link = $self->eval_dir_dots($link);

        $self->{'file_blocks'}->{$link} = 1;
    }
}
628
629
# do plugin specific processing of doc_obj
#
# Parameters (after $self):
#   $textref  - scalar reference to the document's HTML; modified in place
#   $pluginfo - not used in this method
#   $base_dir - base directory of the file
#   $file     - file name relative to $base_dir
#   $metadata - metadata passed on to extract_metadata
#   $doc_obj  - document object that text and metadata are added to
#   $gli      - when true, a <Processing> line is printed to STDERR
#
# Behaviour depends on the plugin options:
#   sectionalise_using_h_tags - headings are rewritten as commented
#       Section/Description tags and description_tags is switched on;
#   description_tags - the text is split on comments containing
#       <Section>, <Description> and <Metadata> tags;
#   otherwise - the document is processed as a single section.
#
# Always returns 1.
sub process {
    my $self = shift (@_);
    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
    my $outhandle = $self->{'outhandle'};

    print STDERR "<Processing n='$file' p='HTMLPlug'>\n" if ($gli);

    print $outhandle "HTMLPlug: processing $file\n"
        if $self->{'verbosity'} > 1;

    if (defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} =~ /^windows/i) {
        # this makes life so much easier... perl can cope with unix-style '/'s.
        $base_dir =~ s@(\\)+@/@g;
        $file =~ s@(\\)+@/@g;
    }

    # reset per-doc stuff...
    $self->{'aux_files'} = {};
    $self->{'dir_num'} = 0;
    $self->{'file_num'} = 0;

    # process an HTML file where sections are divided by headings tags (H1, H2 ...)
    # you can also include metadata in the format (X can be any number)
    # <hX>Title<!--gsdl-metadata
    #   <Metadata name="name1">value1</Metadata>
    #   ...
    #   <Metadata name="nameN">valueN</Metadata>
    # --></hX>
    if ($self->{'sectionalise_using_h_tags'}) {
        # description_tags should always be activated because we convert headings to description tags
        $self->{'description_tags'} = 1;

        my $arrSections = [];
        $$textref =~ s/<h([0-9]+)[^>]*>(.*?)<\/h[0-9]+>/$self->process_heading($1, $2, $arrSections, $file)/isge;

        if (scalar(@$arrSections)) {
            # level -1 closes every section that is still open
            my $strMetadata = $self->update_section_data($arrSections, -1);
            if (length($strMetadata)) {
                $strMetadata = '<!--' . $strMetadata . "\n-->\n</body>";
                $$textref =~ s/<\/body>/$strMetadata/ig;
            }
        }
    }

    my $cursection = $doc_obj->get_top_section();

    $self->extract_metadata ($textref, $metadata, $doc_obj, $cursection)
        unless $self->{'no_metadata'} || $self->{'description_tags'};

    # Store URL for page as metadata - this can be used for an
    # altavista style search interface. The URL won't be valid
    # unless the file structure contains the domain name (i.e.
    # like when w3mir is used to download a website).

    # URL metadata (even invalid ones) are used to support internal
    # links, so even if 'file_is_url' is off, still need to store info

    my $web_url = "http://$file";
    $doc_obj->add_metadata($cursection, "URL", $web_url);

    if ($self->{'file_is_url'}) {
        $doc_obj->add_metadata($cursection, "weblink", "<a href=\"$web_url\">");
        $doc_obj->add_metadata($cursection, "webicon", "_iconworld_");
        $doc_obj->add_metadata($cursection, "/weblink", "</a>");
    }

    if ($self->{'description_tags'}) {
        # remove the html header - note that doing this here means any
        # sections defined within the header will be lost (so all <Section>
        # tags must appear within the body of the HTML)
        my ($head_keep) = ($$textref =~ m/^(.*?)<body[^>]*>/is);
        # no <body> tag means there is no header text to keep
        $head_keep = '' unless defined $head_keep;

        $$textref =~ s/^.*?<body[^>]*>//is;
        $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;

        # comment delimiters and tag punctuation, either literal or
        # written with entities
        my $opencom = '(?:<!--|&lt;!(?:&mdash;|&#151;|--))';
        my $closecom = '(?:-->|(?:&mdash;|&#151;|--)&gt;)';

        my $lt = '(?:<|&lt;)';
        my $gt = '(?:>|&gt;)';
        my $quot = '(?:"|&quot;|&rdquo;|&ldquo;)';

        # no_strip_metadata_html is a comma separated list of metadata
        # names (or 'all'); turn it into a regex alternation
        my $dont_strip = '';
        if ($self->{'no_strip_metadata_html'}) {
            ($dont_strip = $self->{'no_strip_metadata_html'}) =~ s{,}{|}g;
        }

        my $found_something = 0; my $top = 1;
        while ($$textref =~ s/^(.*?)$opencom(.*?)$closecom//s) {
            my $text = $1;
            my $comment = $2;
            if (defined $text) {
                # text before a comment - note that getting to here
                # doesn't necessarily mean there are Section tags in
                # the document
                $self->process_section(\$text, $base_dir, $file, $doc_obj, $cursection);
            }
            while ($comment =~ s/$lt(.*?)$gt//s) {
                my $tag = $1;
                if ($tag eq "Section") {
                    $found_something = 1;
                    # the first Section maps onto the existing top section
                    $cursection = $doc_obj->insert_section($doc_obj->get_end_child($cursection)) unless $top;
                    $top = 0;
                } elsif ($tag eq "/Section") {
                    $found_something = 1;
                    $cursection = $doc_obj->get_parent_section ($cursection);
                } elsif ($tag =~ /^Metadata name=$quot(.*?)$quot/s) {
                    my $metaname = $1;
                    my $accumulate = $tag =~ /mode=${quot}accumulate${quot}/ ? 1 : 0;
                    # take the value only if the closing tag was found, so
                    # that a stale $1 from an earlier match is never used
                    my $metavalue = '';
                    if ($comment =~ s/^(.*?)$lt\/Metadata$gt//s) {
                        $metavalue = $1;
                    }
                    $metavalue =~ s/^\s+//;
                    $metavalue =~ s/\s+$//;
                    # assume that no metadata value intentionally includes
                    # carriage returns or HTML tags (if they're there they
                    # were probably introduced when converting to HTML from
                    # some other format).
                    # actually some people want to have html tags in their
                    # metadata.
                    $metavalue =~ s/[\cJ\cM]/ /sg;
                    $metavalue =~ s/<[^>]+>//sg
                        unless $dont_strip && ($dont_strip eq 'all' || $metaname =~ /^($dont_strip)$/);
                    $metavalue =~ s/\s+/ /sg;
                    if ($accumulate) {
                        $doc_obj->add_utf8_metadata($cursection, $metaname, $metavalue);
                    } else {
                        $doc_obj->set_utf8_metadata_element($cursection, $metaname, $metavalue);
                    }
                } elsif ($tag eq "Description" || $tag eq "/Description") {
                    # do nothing with containing Description tags
                } else {
                    # simple HTML tag (probably created by the conversion
                    # to HTML from some other format) - we'll ignore it and
                    # hope for the best ;-)
                }
            }
        }
        if ($cursection ne "") {
            print $outhandle "HTMLPlug: WARNING: $file contains unmatched <Section></Section> tags\n";
        }

        $$textref =~ s/^.*?<body[^>]*>//is;
        $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
        if ($$textref =~ /\S/) {
            if (!$found_something) {
                if ($self->{'verbosity'} > 2) {
                    print $outhandle "HTMLPlug: WARNING: $file appears to contain no Section tags so\n";
                    print $outhandle " will be processed as a single section document\n";
                }

                # go ahead and process single-section document
                $self->process_section($textref, $base_dir, $file, $doc_obj, $cursection);

                # if document contains no Section tags we'll go ahead
                # and extract metadata (this won't have been done
                # above as the -description_tags option prevents it)
                my $complete_text = $head_keep.$doc_obj->get_text($cursection);
                $self->extract_metadata (\$complete_text, $metadata, $doc_obj, $cursection)
                    unless $self->{'no_metadata'};

            } else {
                print $outhandle "HTMLPlug: WARNING: $file contains the following text outside\n";
                print $outhandle " of the final closing </Section> tag. This text will\n";
                print $outhandle " be ignored.";

                my ($text);
                if (length($$textref) > 30) {
                    $text = substr($$textref, 0, 30) . "...";
                } else {
                    $text = $$textref;
                }
                $text =~ s/\n/ /isg;
                print $outhandle " ($text)\n";
            }
        } elsif (!$found_something) {

            if ($self->{'verbosity'} > 2) {
                # may get to here if document contained no valid Section
                # tags but did contain some comments. The text will have
                # been processed already but we should print the warning
                # as above and extract metadata
                print $outhandle "HTMLPlug: WARNING: $file appears to contain no Section tags and\n";
                print $outhandle " is blank or empty. Metadata will be assigned if present.\n";
            }

            my $complete_text = $head_keep.$doc_obj->get_text($cursection);
            $self->extract_metadata (\$complete_text, $metadata, $doc_obj, $cursection)
                unless $self->{'no_metadata'};
        }

    } else {

        # remove header and footer
        if (!$self->{'keep_head'} || $self->{'description_tags'}) {
            $$textref =~ s/^.*?<body[^>]*>//is;
            $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
        }

        # single section document
        $self->process_section($textref, $base_dir, $file, $doc_obj, $cursection);
    }
    return 1;
}
834
835
# Build the replacement for one <hN>...</hN> heading: a comment holding
# the closing tags for finished sections followed by a new <Section>
# with the heading text as its Title.
#
# Parameters:
#   $nHeadNo        - heading level (the N of <hN>)
#   $strHeadingText - text found inside the heading; may contain
#                     <!--gsdl-metadata ... --> blocks, whose contents
#                     are moved into the section's <Description>
#   $arrSections    - array reference holding the open section levels
#                     (updated through update_section_data)
#   $file           - not used in this method
#
# Returns the comment string that replaces the heading.
sub process_heading
{
    my ($self, $nHeadNo, $strHeadingText, $arrSections, $file) = @_;
    $strHeadingText = '' unless defined($strHeadingText);

    # closing tags for the sections this heading ends
    my $strMetadata = $self->update_section_data($arrSections, int($nHeadNo));

    # pull every embedded metadata block out of the heading text
    my $strSecMetadata = '';
    while ($strHeadingText =~ s/<!--gsdl-metadata(.*?)-->//is)
    {
        $strSecMetadata .= $1;
    }

    # trim leading and trailing whitespace from both strings
    foreach my $str ($strHeadingText, $strSecMetadata)
    {
        $str =~ s/^\s+//g;
        $str =~ s/\s+$//g;
    }

    $strMetadata .= "\n<Section>\n\t<Description>\n\t\t<Metadata name=\"Title\">" . $strHeadingText . "</Metadata>\n";
    $strMetadata .= "\t\t" . $strSecMetadata . "\n" if (length($strSecMetadata));
    $strMetadata .= "\t</Description>\n";

    return "<!--" . $strMetadata . "-->";
}
864
865
# Maintain the list of open section levels for heading-based
# sectionalising.
#
# Parameters:
#   $arrSections - array reference of the levels of the sections that
#                  are currently open; modified in place
#   $nCurTocNo   - level of the section about to be opened
#
# Returns one "\n</Section>" for every open level that is greater than
# or equal to $nCurTocNo (those levels are removed from the list), or an
# empty string when nothing has to be closed. $nCurTocNo is always
# appended to the list.
sub update_section_data
{
    my ($self, $arrSections, $nCurTocNo) = @_;
    my $strBuffer = '';

    # sections only need closing when some are open and the new level
    # does not nest inside the most recently opened one
    if (scalar(@$arrSections) && !($nCurTocNo > $arrSections->[-1])) {
        foreach my $i (reverse (0 .. $#$arrSections)) {
            next if ($nCurTocNo > $arrSections->[$i]);
            $strBuffer .= "\n</Section>";
            pop @$arrSections;
        }
    }

    push @$arrSections, $nCurTocNo;
    return $strBuffer;
}
889
890
# Rewrite the links and images in a piece of section text and add the
# text to the document object.
#
# note that process_section may be called multiple times for a single
# section (relying on the fact that add_utf8_text appends the text to any
# that may exist already).
#
# Parameters:
#   $textref    - scalar reference to the text; modified in place
#   $base_dir   - base directory of the file
#   $file       - file name
#   $doc_obj    - document object the text is added to
#   $cursection - section of $doc_obj to add the text to
sub process_section {
    my $self = shift (@_);
    my ($textref, $base_dir, $file, $doc_obj, $cursection) = @_;
    # trap links
    if (!$self->{'nolinks'}) {

        # usemap="./#index" not handled correctly => change to "#index"
        $$textref =~ s/(<img[^>]*?usemap\s*=\s*[\"\']?)([^\"\'>\s]+)([\"\']?[^>]*>)/
            $self->replace_usemap_links($1, $2, $3)/isge;

        $$textref =~ s/(<(?:a|area|frame|link|script)\s+[^>]*?\s*(?:href|src)\s*=\s*[\"\']?)([^\"\'>\s]+)([\"\']?[^>]*>)/
            $self->replace_href_links ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
    }

    # trap images

    # allow spaces if inside quotes - jrm21
    $$textref =~ s/(<(?:img|embed|table|tr|td)[^>]*?(?:src|background)\s*=\s*)([\"\'][^\"\']+[\"\']|[^\s>]+)([^>]*>)/
        $self->replace_images ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;

    # add text to document object
    # turn \ into \\ so that the rest of greenstone doesn't think there
    # is an escape code following. (Macro parsing loses them...)
    $$textref =~ s/\\/\\\\/go;

    $doc_obj->add_utf8_text($cursection, $$textref);
}
921
# Build the replacement for one image-like attribute (called from
# process_section for src/background attributes).
#
# Parameters:
#   $front, $link, $back - the text before the attribute value, the
#                          value itself (possibly quoted) and the rest
#                          of the tag
#   $base_dir, $file     - location of the document being processed
#   $doc_obj, $section   - document object and section the referenced
#                          file is associated with
#
# Returns the rewritten tag, pointing at the value returned by add_file,
# followed by an <a name="..." /> anchor named after the last path
# component of that value.
sub replace_images {
    my $self = shift (@_);
    my ($front, $link, $back, $base_dir,
        $file, $doc_obj, $section) = @_;

    # remove quotes from link at start and end if necessary
    # (double quotes are put back around the rewritten value)
    if ($link =~ /^[\"\']/) {
        $link =~ s/^[\"\']//;
        $link =~ s/[\"\']$//;
        $front .= '"';
        $back = "\"$back";
    }

    $link =~ s/\n/ /g;

    # Hack to overcome Windows wv 0.7.1 bug that causes embedded images to be broken
    # If the Word file path has spaces in it, wv messes up and you end up with
    # absolute paths for the images, and without the "file://" prefix
    # So check for this special case and massage the data to be correct
    if ($ENV{'GSDLOS'} =~ /^windows/i && $self->{'plugin_type'} eq "WordPlug" && $link =~ /^[A-Za-z]\:\\/) {
        $link =~ s/^.*\\([^\\]+)$/$1/;
    }

    my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
    my $img_file = $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section);

    # anchor named after the file part of the image location
    (my $anchor_name = $img_file) =~ s/^.*\///;

    return $front . $img_file . $back . "<a name=\"$anchor_name\" />";
}
954
# Build the replacement for one href/src link (called from
# process_section).
#
# Parameters:
#   $front, $link, $back - the text before the link, the link itself and
#                          the rest of the tag
#   $base_dir, $file     - location of the document being processed
#   $doc_obj, $section   - document object and section an associated
#                          file is attached to
#
# Returns the rewritten tag:
#   - links starting with '#' and mailto/news/gopher/nntp/telnet/
#     javascript links are returned with only the target attributes
#     adjusted;
#   - links for which format_link reports $rl == 0, links whose target
#     matches process_exp, and links ending in '/' become
#     _httpextlink_ links;
#   - anything else is associated with the document through add_file.
sub replace_href_links {
    my $self = shift (@_);
    my ($front, $link, $back, $base_dir, $file, $doc_obj, $section) = @_;

    # attempt to sort out targets - frames are not handled
    # well in this plugin and some cases will screw things
    # up - e.g. the _parent target (so we'll just remove
    # them all ;-)
    $front =~ s/(target=\"?)_top(\"?)/${1}_gsdltop_$2/is;
    $back =~ s/(target=\"?)_top(\"?)/${1}_gsdltop_$2/is;
    $front =~ s/target=\"?_parent\"?//is;
    $back =~ s/target=\"?_parent\"?//is;

    return $front . $link . $back if $link =~ /^\#/s;
    $link =~ s/\n/ /g;

    my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
    # href may use '\'s where '/'s should be on Windows
    $href =~ s/\\/\//g;

    # the part of the href after "scheme:" or "scheme://"; empty when
    # the href has no scheme at all
    my ($filename) = $href =~ /^(?:.*?):(?:\/\/)?(.*)/;
    $filename = '' unless defined $filename;

    ##### leave all these links alone (they won't be picked up by intermediate
    ##### pages). I think that's safest when dealing with frames, targets etc.
    ##### (at least until I think of a better way to do it). Problems occur with
    ##### mailto links from within small frames, the intermediate page is displayed
    ##### within that frame and can't be seen. There is still potential for this to
    ##### happen even with html pages - the solution seems to be to somehow tell
    ##### the browser from the server side to display the page being sent (i.e.
    ##### the intermediate page) in the top level window - I'm not sure if that's
    ##### possible - the following line should probably be deleted if that can be done
    return $front . $link . $back if $href =~ /^(mailto|news|gopher|nntp|telnet|javascript):/is;

    # (a second test for the schemes above used to be part of this
    # condition; it could never be true after the return just before it)
    if (($rl == 0) || ($filename =~ /$self->{'process_exp'}/) ||
        ($href =~ /\/$/)) {
        &ghtml::urlsafe ($href);
        return $front . "_httpextlink_&rl=" . $rl . "&href=" . $href . $hash_part . $back;
    } else {
        # link is to some other type of file (eg image) so we'll
        # need to associate that file
        return $front . $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section) . $back;
    }
}
1000
# Work out what a link to a non-document file should be replaced with and,
# where appropriate, associate the file with the document.
#
#   $href      - link as returned by format_link ("scheme://..." form)
#   $rl        - 0 means an "_httpextlink_" link with rl=0 is always returned
#   $hash_part - "#..." part of the link, appended to _httpextlink_ results
#   $base_dir  - directory the file path is built from
#   $doc_obj   - document object; associate_file is called on it
#   $section   - section passed to associate_file
#
# Returns the replacement link text:
#   "_httpextlink_&rl=0&el=prompt|direct&href=..."  when $rl is 0
#   "_httpextlink_&rl=$rl&href=..."                 when the extension does
#                                                   not match 'assoc_files'
#   "_httpdocimg_/<name>"                           otherwise
sub add_file {
    my $self = shift (@_);
    my ($href, $rl, $hash_part, $base_dir, $doc_obj, $section) = @_;

    my $filename = $href;
    if ($base_dir eq "") {
        # remove http:/ thereby leaving one slash at the start
        $filename =~ s/^[^:]*:\///;
    }
    else {
        # remove http://
        $filename =~ s/^[^:]*:\/\///;
    }

    $filename = &util::filename_cat($base_dir, $filename);

    # Replace %20's in URL with a space if required. Note that the filename
    # may include the %20 in some situations, so only do it when no file
    # of that exact name exists.
    if ($filename =~ /\%20/ && !-e $filename) {
        $filename =~ s/\%20/ /g;
    }

    my ($ext) = $filename =~ /(\.[^\.]*)$/;
    my $is_assoc_file = (defined $ext) && ($ext =~ /$self->{'assoc_files'}/);

    if ($rl == 0) {
        my $el = $is_assoc_file ? "direct" : "prompt";
        return "_httpextlink_&rl=0&el=" . $el . "&href=" . $href . $hash_part;
    }

    if (!$is_assoc_file) {
        return "_httpextlink_&rl=" . $rl . "&href=" . $href . $hash_part;
    }

    my $newname;
    if ($self->{'rename_assoc_files'}) {
        # files are renamed to <dir_num>/<file_num><ext>; a file that has
        # been seen before (same href) keeps the numbers it was given then
        my $seen = $self->{'aux_files'}->{$href};
        if (defined $seen) {
            $newname = $seen->{'dir_num'} . "/" . $seen->{'file_num'} . $ext;
        }
        else {
            $newname = $self->{'dir_num'} . "/" . $self->{'file_num'} . $ext;
            $self->{'aux_files'}->{$href} = {'dir_num' => $self->{'dir_num'}, 'file_num' => $self->{'file_num'}};
            $self->inc_filecount ();
        }
    }
    else {
        # keep the original file name (last path component)
        ($newname) = $filename =~ /([^\/\\]*)$/;
    }

    $doc_obj->associate_file($filename, $newname, undef, $section);
    return "_httpdocimg_/$newname";
}
1057
1058
# Split a link into its parts and normalise it.
#
#   $link     - the link as written in the document
#   $base_dir - directory that link paths are resolved against
#   $file     - path of the document containing the link
#
# Returns a three element list ($href, $hash_part, $rl):
#   - http/ftp/file links: scheme plus cleaned-up path, the "#..." part, and
#     $rl = 1 if the path exists under $base_dir (0 otherwise);
#   - other links that are neither one of the mailto/news/gopher/nntp/
#     telnet/javascript schemes nor start with '/': "http://" plus the
#     resolved path, the "#..." part, and $rl = 1;
#   - everything else (including links with nothing usable before the '#'):
#     the link text with an empty hash part and $rl = 0.
sub format_link {
    my $self = shift (@_);
    my ($link, $base_dir, $file) = @_;

    my ($before_hash, $hash_part) = $link =~ /^([^\#]*)(\#?.*)$/;

    $hash_part = "" if !defined $hash_part;
    if (!defined $before_hash || $before_hash !~ /[\w\.\/]/) {
        my $outhandle = $self->{'outhandle'};
        print $outhandle "HTMLPlug: ERROR - badly formatted tag ignored ($link)\n"
            if $self->{'verbosity'};
        return ($link, "", 0);
    }

    if ($before_hash =~ s@^((?:http|ftp|file)://)@@i) {
        my $type = $1;

        if ($link =~ /^(http|ftp):/i) {
            # Turn url (using /) into file name (possibly using \ on windows)
            my @http_dir_split = split('/', $before_hash);
            $before_hash = &util::filename_cat(@http_dir_split);
        }

        $before_hash = $self->eval_dir_dots($before_hash);

        my $linkfilename = &util::filename_cat ($base_dir, $before_hash);

        my $rl = 0;
        $rl = 1 if (-e $linkfilename);

        # make sure there's a slash on the end if it's a directory
        if ($before_hash !~ /\/$/) {
            $before_hash .= "/" if (-d $linkfilename);
        }

        return ($type . $before_hash, $hash_part, $rl);
    }
    elsif ($link !~ /^(mailto|news|gopher|nntp|telnet|javascript):/i && $link !~ /^\//) {
        if ($before_hash =~ s@^/@@ || $before_hash =~ /\\/) {

            # the first directory will be the domain name if file_is_url
            # to generate archives, otherwise we'll assume all files are
            # from the same site and base_dir is the root

            if ($self->{'file_is_url'}) {
                my @dirs = split /[\/\\]/, $file;
                my $domname = shift (@dirs);
                $before_hash = &util::filename_cat($domname, $before_hash);
                $before_hash =~ s@\\@/@g; # for windows
            }
            else {
                # see if link shares directory with source document
                # => turn into relative link if this is so!
                #
                # $base_dir is matched literally (\Q..\E) and the pattern
                # is rebuilt on every call, so a path containing regex
                # metacharacters or a changed $base_dir is handled.

                if ($ENV{'GSDLOS'} =~ /^windows/i) {
                    # too difficult doing a pattern match with embedded '\'s...
                    my $win_before_hash = $before_hash;
                    $win_before_hash =~ s@(\\)+@/@g;
                    # $base_dir is already similarly "converted" on windows.
                    if ($win_before_hash =~ s@^\Q$base_dir\E/@@) {
                        # if this is true, we removed a prefix
                        $before_hash = $win_before_hash;
                    }
                }
                else {
                    # before_hash has lost leading slash by this point,
                    # -> add back in prior to substitution with $base_dir
                    $before_hash = "/$before_hash";

                    $before_hash = &util::filename_cat("", $before_hash);
                    $before_hash =~ s@^\Q$base_dir\E/@@;
                }
            }
        }
        else {
            # Turn relative file path into full path
            my $dirname = &File::Basename::dirname($file);
            $before_hash = &util::filename_cat($dirname, $before_hash);
            $before_hash = $self->eval_dir_dots($before_hash);
        }

        my $linkfilename = &util::filename_cat ($base_dir, $before_hash);
        # make sure there's a slash on the end if it's a directory
        if ($before_hash !~ /\/$/) {
            $before_hash .= "/" if (-d $linkfilename);
        }
        return ("http://" . $before_hash, $hash_part, 1);
    }
    else {
        # mailto, news, nntp, telnet, javascript or gopher link
        # (links starting with '/' also end up here)
        return ($before_hash, "", 0);
    }
}
1151
# Add "First<size>" metadata holding the first <size> characters of the
# document text, for every size in the comma separated 'first' setting.
#
#   $textref     - reference to the document text (not modified)
#   $doc_obj     - document object; add_utf8_metadata is called on it
#   $thissection - section the metadata is added to
#
# Markup is removed and whitespace collapsed before the text is cut; the
# cut text then has its last (possibly partial) word replaced by an
# ellipsis entity.
sub extract_first_NNNN_characters {
    my $self = shift (@_);
    my ($textref, $doc_obj, $thissection) = @_;

    my @sizes = split /,/, $self->{'first'};
    return if !@sizes;

    # The clean-up does not depend on the requested size, so do it once.
    my $plaintext = $$textref;
    # skip to the body
    $plaintext =~ s/.*<body[^>]*>//i;
    # remove javascript
    $plaintext =~ s@<script.*?</script>@ @sig;
    $plaintext =~ s/<[^>]*>/ /g;
    $plaintext =~ s/&nbsp;/ /g;
    $plaintext =~ s/^\s+//;
    $plaintext =~ s/\s+$//;
    $plaintext =~ s/\s+/ /gs;

    foreach my $size (@sizes) {
        my $tmptext = &unicode::substr ($plaintext, 0, $size);
        $tmptext =~ s/\s\S*$/&#8230;/; # adds an ellipse (...)
        $doc_obj->add_utf8_metadata ($thissection, "First$size", $tmptext);
    }
}
1172
1173
# Extract metadata from an HTML document and add it to the document object.
#
#   $textref  - reference to the HTML text
#   $metadata - hash ref of metadata already assigned to the document; a
#               <meta> tag whose name is defined in here is skipped
#   $doc_obj  - document object the metadata is added to
#   $section  - section the metadata is added to
#
# What is extracted is driven by the comma separated 'metadata_fields'
# setting (nothing is done if it is undefined):
#   - <meta name/http-equiv=... content=...> tags from the <head> whose
#     name is listed (case-insensitively); "name<GSName>" stores the value
#     under GSName instead;
#   - with 'hunt_creator_metadata' set to 1, author/creator style meta tags
#     are stored as "Creator";
#   - "Title": taken from <title> (or from the start of the text) when no
#     meta tag supplied one;
#   - "tagXX": the text between the first <XX> and </XX> tags, stored as
#     "XX".
# "FileFormat" = "HTML" is always added.
sub extract_metadata {
    my $self = shift (@_);
    my ($textref, $metadata, $doc_obj, $section) = @_;
    my $outhandle = $self->{'outhandle'};
    # if we don't want metadata, we may as well not be here ...
    return if (!defined $self->{'metadata_fields'});

    # metadata fields to extract/save. 'key' is the (lowercase) name of the
    # html meta, 'value' is the metadata name for greenstone to use
    my %find_fields = ();

    my %creator_fields = (); # short-cut for lookups

    foreach my $field (split /,/, $self->{'metadata_fields'}) {
        $field =~ s/^\s+//; # remove leading whitespace
        $field =~ s/\s+$//; # remove trailing whitespace

        # support tag<tagname>
        if ($field =~ /^(.*?)<(.*?)>$/) {
            # "$2" is the user's preferred gs metadata name
            $find_fields{lc($1)} = $2; # lc = lowercase
        }
        else { # no <tagname> for mapping
            # "$field" is the user's preferred gs metadata name
            $find_fields{lc($field)} = $field; # lc = lowercase
        }
    }

    if (defined $self->{'hunt_creator_metadata'} &&
        $self->{'hunt_creator_metadata'} == 1) {
        my @extra_fields =
            (
             'author',
             'author.email',
             'creator',
             'dc.creator',
             'dc.creator.corporatename',
            );

        # add the creator_metadata fields to search for
        foreach my $field (@extra_fields) {
            $creator_fields{$field} = 0; # add to lookup hash
        }
    }

    # find the header in the html file, which has the meta tags.
    # Captures are taken from the match's return list: $1/$2 keep their
    # old values when a match fails, so reading them afterwards is unsafe.
    my ($html_header) = $$textref =~ m@<head>(.*?)</head>@si;
    $html_header = "" if !defined $html_header;

    # go through every <meta... tag defined in the html and see if it is
    # one of the tags we want to match.

    # special case for title - we want to remember if its been found
    my $found_title = 0;

    # this assumes that ">" won't appear. (I don't think it's allowed to...)
    while ($html_header =~ m/\G.*?<meta(.*?)>/sig) {
        my $metatag = $1;

        # find the tag name
        my (undef, $tag) = $metatag =~ /(?:name|http-equiv)\s*=\s*([\"\'])?(.*?)\1/is;
        # in case they're not using " or ', but they should...
        if (!defined $tag || $tag eq '') {
            ($tag) = $metatag =~ /(?:name|http-equiv)\s*=\s*([^\s\>]+)/is;
        }

        if (!defined $tag) {
            print $outhandle "HTMLPlug: can't find NAME in \"$metatag\"\n";
            next;
        }

        # don't need to assign this field if it was passed in from a previous
        # (recursive) plugin
        if (defined $metadata->{$tag}) {next}

        # find the tag content
        my (undef, $value) = $metatag =~ /content\s*=\s*([\"\'])?(.*?)\1/is;
        # unquoted content: look at the CONTENT attribute again (not at the
        # name, which would store the tag name as its own value)
        if (!defined $value) {
            ($value) = $metatag =~ /content\s*=\s*([^\s\>]+)/is;
        }
        if (!defined $value || $value eq '') {
            print $outhandle "HTMLPlug: can't find VALUE in \"$metatag\"\n";
            next;
        }

        # clean up and add
        $value =~ s/\s+/ /gs;
        chomp($value); # remove trailing \n, if any
        if (exists $creator_fields{lc($tag)}) {
            # map this value onto greenstone's "Creator" metadata
            $tag = 'Creator';
        }
        elsif (!exists $find_fields{lc($tag)}) {
            next; # don't want this tag
        }
        else {
            # get the user's preferred capitalisation
            $tag = $find_fields{lc($tag)};
        }
        if (lc($tag) eq "title") {
            $found_title = 1;
        }
        print $outhandle " extracted \"$tag\" metadata \"$value\"\n"
            if ($self->{'verbosity'} > 2);
        if ($tag =~ /date.*/i) {
            $tag = lc($tag);
        }
        $doc_obj->add_utf8_metadata($section, $tag, $value);
    }

    # TITLE: extract the document title
    if (exists $find_fields{'title'} && !$found_title) {
        # we want a title, and didn't find one in the meta tags
        # see if there's a <title> tag
        my $from = ""; # for debugging output only
        my ($title) = $html_header =~ /<title[^>]*>([^<]+)<\/title[^>]*>/is;
        if (defined $title) {
            $from = "<title> tags";
        }
        else {
            $from = "first 100 chars";
            # if no title use first 100 or so characters
            $title = $$textref;
            $title =~ s/^\xFE\xFF//; # Remove unicode byte order mark
            $title =~ s/^.*?<body>//si;
            # ignore javascript!
            $title =~ s@<script.*?</script>@ @sig;
            $title =~ s/<\/([^>]+)><\1>//g; # (eg) </b><b> - no space
            $title =~ s/<[^>]*>/ /g; # remove all HTML tags
            $title = substr ($title, 0, 100);
            $title =~ s/\s\S*$/.../;
        }
        $title =~ s/<[^>]*>/ /g; # remove html tags
        $title =~ s/(?:&nbsp;|\xc2\xa0)/ /g; # entity and utf-8 for nbsp...
        $title =~ s/\s+/ /gs; # collapse multiple spaces
        $title =~ s/^\s*//; # remove leading spaces
        $title =~ s/\s*$//; # remove trailing spaces

        $title =~ s/^$self->{'title_sub'}// if ($self->{'title_sub'});
        $title =~ s/^\s+//s; # in case title_sub introduced any...
        $doc_obj->add_utf8_metadata ($section, 'Title', $title);
        print $outhandle " extracted Title metadata \"$title\" from $from\n"
            if ($self->{'verbosity'} > 2);
    }

    # add FileFormat metadata
    $doc_obj->add_metadata($section, "FileFormat", "HTML");

    # Special, for metadata names such as tagH1 - extracts
    # the text between the first <H1> and </H1> tags into "H1" metadata.
    # NOTE(review): the keys of %find_fields are lower-cased and the match
    # below is case-sensitive and has no /s, so upper-case tags or tags
    # spanning several lines are presumably not picked up - confirm whether
    # that is intended.
    foreach my $field (keys %find_fields) {
        if ($field !~ /^tag([a-z0-9]+)$/i) {next}
        my $tag = $1;
        # no /g here: in scalar context it would leave pos() set on the
        # document text and make the next field start searching from there
        if ($$textref =~ m@<${tag}[^>]*>(.*?)</${tag}[^>]*>@) {
            my $content = $1;
            $content =~ s/&nbsp;/ /g;
            $content =~ s/<[^>]*>/ /g;
            $content =~ s/^\s+//;
            $content =~ s/\s+$//;
            $content =~ s/\s+/ /gs;
            if ($content) {
                $tag = $find_fields{"tag$tag"}; # get the user's capitalisation
                $tag =~ s/^tag//i;
                $doc_obj->add_utf8_metadata ($section, $tag, $content);
                print $outhandle " extracted \"$tag\" metadata \"$content\"\n"
                    if ($self->{'verbosity'} > 2);
            }
        }
    }
}
1356
1357
# Resolve the "." and ".." components of a path:
#   "../" drops the directory collected before it,
#   "./"  is simply skipped.
# The path is split on the OS directory separator and put back together
# with util::filename_cat.
sub eval_dir_dots {
    my $self = shift (@_);
    my ($filename) = @_;
    my $dirsep_os = &util::get_os_dirsep();

    my @resolved = ();
    foreach my $component (split(/$dirsep_os/, $filename)) {
        # "." means "here": nothing to do
        next if ($component eq ".");

        if ($component eq "..") {
            pop(@resolved);
        }
        else {
            push(@resolved, $component);
        }
    }

    # util::filename_cat suppresses the leading '/' (or \ on windows) when
    # the first entry of its argument list is the empty string (a feature
    # intended to help with relative paths). Starting the list with two
    # empty strings is a way to defeat this "smart" option.
    unshift(@resolved, "") if (scalar(@resolved) > 0 && $resolved[0] eq "");

    return &util::filename_cat(@resolved);
}
1394
# Strip a leading "./" from a usemap link and glue the tag back together.
#
#   $front, $back - the tag text before and after the link
#   $link         - the link itself
#
# Returns $front, the cleaned link and $back as one string.
sub replace_usemap_links {
    my ($self, $front, $link, $back) = @_;

    (my $cleaned_link = $link) =~ s/^\.\///;

    return join('', $front, $cleaned_link, $back);
}
1402
# Advance the 'file_num' counter stored on the plugin. When the counter
# has reached 1000 it starts again at 0 and 'dir_num' is increased by one.
sub inc_filecount {
    my ($self) = @_;

    if ($self->{'file_num'} != 1000) {
        $self->{'file_num'}++;
    }
    else {
        # roll over into the next directory number
        $self->{'dir_num'}++;
        $self->{'file_num'} = 0;
    }
}
1413
1414
# Extend the BasPlug read_file so that strings like &eacute; are
# converted to UTF8 internally.
#
# We don't convert &lt; or &gt; or &amp; or &quot; in case
# they interfere with the GML files (&nbsp; is left alone as well).
#
#   $filename, $encoding, $language - passed straight to BasPlug::read_file
#   $textref - reference to the text; it is modified in place
sub read_file {
    my ($self, $filename, $encoding, $language, $textref) = @_;

    &BasPlug::read_file($self, $filename, $encoding, $language, $textref);

    # Convert entities to their UTF8 equivalents. The protected entities
    # are renamed (&lt; -> &zlt;) before the conversion and renamed back
    # afterwards.
    $$textref =~ s/&(lt|gt|amp|quot|nbsp);/&z$1;/g;
    $$textref =~ s/&([^;]+);/&ghtml::getcharequiv($1,1)/gse;
    $$textref =~ s/&z(lt|gt|amp|quot|nbsp);/&$1;/g;
}
1431
1432	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: