Context Navigation

source: gsdl/trunk/perllib/plugins/HTMLPlugin.pm@ 20689

Last change on this file since 20689 was 20689, checked in by ak19, 15 years ago
Bug fix to metadata_fields that are given rename-values.
Property svn:executable set to `*`; property svn:keywords set to `Author Date Id Revision`
File size: 51.7 KB

Rev	Line
[14665]	1	###########################################################################
	2	#
[15872]	3	# HTMLPlugin.pm -- basic html plugin
[14665]	4	#
	5	# A component of the Greenstone digital library software
	6	# from the New Zealand Digital Library Project at the
	7	# University of Waikato, New Zealand.
	8	#
	9	# Copyright (C) 1999 New Zealand Digital Library Project
	10	#
	11	# This program is free software; you can redistribute it and/or modify
	12	# it under the terms of the GNU General Public License as published by
	13	# the Free Software Foundation; either version 2 of the License, or
	14	# (at your option) any later version.
	15	#
	16	# This program is distributed in the hope that it will be useful,
	17	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	18	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	19	# GNU General Public License for more details.
	20	#
	21	# You should have received a copy of the GNU General Public License
	22	# along with this program; if not, write to the Free Software
	23	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	24	#
	25	###########################################################################
	26
	27	#
	28	# Note that this plugin handles frames only in a very simple way
	29	# i.e. each frame is treated as a separate document. This means
	30	# search results will contain links to individual frames rather
	31	# than linking to the top level frameset.
	32	# There may also be some problems caused by the _parent target
	33	# (it's removed by this plugin)
	34	#
	35
[15872]	36	package HTMLPlugin;
[14665]	37
[15872]	38	use ReadTextFile;
	39	use HBPlugin;
[14665]	40	use ghtml;
	41	use unicode;
	42	use util;
	43	use XMLParser;
	44
	45	use Image::Size;
[14913]	46	use File::Copy;
[14665]	47
	48	sub BEGIN {
[15872]	49	@HTMLPlugin::ISA = ('ReadTextFile', 'HBPlugin');
[14665]	50	}
	51
	52	use strict; # every perl program should have this!
	53	no strict 'refs'; # make an exception so we can use variables as filehandles
	54
	55	my $arguments =
	56	[ { 'name' => "process_exp",
[15872]	57	'desc' => "{BasePlugin.process_exp}",
[14665]	58	'type' => "regexp",
	59	'deft' => &get_default_process_exp() },
	60	{ 'name' => "block_exp",
[15872]	61	'desc' => "{BasePlugin.block_exp}",
[14665]	62	'type' => 'regexp',
	63	'deft' => &get_default_block_exp() },
	64	{ 'name' => "nolinks",
[15872]	65	'desc' => "{HTMLPlugin.nolinks}",
[14665]	66	'type' => "flag" },
	67	{ 'name' => "keep_head",
[15872]	68	'desc' => "{HTMLPlugin.keep_head}",
[14665]	69	'type' => "flag" },
	70	{ 'name' => "no_metadata",
[15872]	71	'desc' => "{HTMLPlugin.no_metadata}",
[14665]	72	'type' => "flag" },
	73	{ 'name' => "metadata_fields",
[15872]	74	'desc' => "{HTMLPlugin.metadata_fields}",
[14665]	75	'type' => "string",
	76	'deft' => "Title" },
	77	{ 'name' => "hunt_creator_metadata",
[15872]	78	'desc' => "{HTMLPlugin.hunt_creator_metadata}",
[14665]	79	'type' => "flag" },
	80	{ 'name' => "file_is_url",
[15872]	81	'desc' => "{HTMLPlugin.file_is_url}",
[14665]	82	'type' => "flag" },
	83	{ 'name' => "assoc_files",
[15872]	84	'desc' => "{HTMLPlugin.assoc_files}",
[14665]	85	'type' => "regexp",
	86	'deft' => &get_default_block_exp() },
	87	{ 'name' => "rename_assoc_files",
[15872]	88	'desc' => "{HTMLPlugin.rename_assoc_files}",
[14665]	89	'type' => "flag" },
	90	{ 'name' => "title_sub",
[15872]	91	'desc' => "{HTMLPlugin.title_sub}",
[14665]	92	'type' => "string",
	93	'deft' => "" },
	94	{ 'name' => "description_tags",
[15872]	95	'desc' => "{HTMLPlugin.description_tags}",
[14665]	96	'type' => "flag" },
	97	# retain this for backward compatibility (w3mir option was replaced by
	98	# file_is_url)
	99	{ 'name' => "w3mir",
[15872]	100	# 'desc' => "{HTMLPlugin.w3mir}",
[14665]	101	'type' => "flag",
	102	'hiddengli' => "yes"},
	103	{ 'name' => "no_strip_metadata_html",
[15872]	104	'desc' => "{HTMLPlugin.no_strip_metadata_html}",
[14665]	105	'type' => "string",
	106	'deft' => "",
	107	'reqd' => "no"},
	108	{ 'name' => "sectionalise_using_h_tags",
[15872]	109	'desc' => "{HTMLPlugin.sectionalise_using_h_tags}",
[14665]	110	'type' => "flag" },
[14913]	111	{ 'name' => "use_realistic_book",
[15872]	112	'desc' => "{HTMLPlugin.tidy_html}",
[14665]	113	'type' => "flag"},
[15872]	114	{ 'name' => "old_style_HDL",
	115	'desc' => "{HTMLPlugin.old_style_HDL}",
	116	'type' => "flag"}
[14665]	117	];
	118
[15872]	119	my $options = { 'name' => "HTMLPlugin",
	120	'desc' => "{HTMLPlugin.desc}",
[14665]	121	'abstract' => "no",
	122	'inherits' => "yes",
	123	'args' => $arguments };
	124
	125
	126	sub HB_read_html_file {
	127	my $self = shift (@_);
	128	my ($htmlfile, $text) = @_;
[16024]	129
[14665]	130	# load in the file
	131	if (!open (FILE, $htmlfile)) {
	132	print STDERR "ERROR - could not open $htmlfile\n";
	133	return;
	134	}
	135
	136	my $foundbody = 0;
	137	$self->HB_gettext (\$foundbody, $text, "FILE");
	138	close FILE;
	139
	140	# just in case there was no <body> tag
	141	if (!$foundbody) {
	142	$foundbody = 1;
	143	open (FILE, $htmlfile) \|\| return;
	144	$self->HB_gettext (\$foundbody, $text, "FILE");
	145	close FILE;
	146	}
	147	# text is in utf8
	148	}
	149
	150	# converts the text to utf8, as ghtml does that for é etc.
	151	sub HB_gettext {
	152	my $self = shift (@_);
	153	my ($foundbody, $text, $handle) = @_;
[16024]	154
[14665]	155	my $line = "";
	156	while (defined ($line = <$handle>)) {
	157	# look for body tag
	158	if (!$$foundbody) {
	159	if ($line =~ s/^.<body[^>]>//i) {
	160	$$foundbody = 1;
	161	} else {
	162	next;
	163	}
	164	}
	165
	166	# check for symbol fonts
[16769]	167	if ($line =~ m/<font [^>]?face\s=\s*\"?(\w+)\"?/i) {
[14665]	168	my $font = $1;
	169	print STDERR "HBPlug::HB_gettext - warning removed font $font\n"
[16769]	170	if ($font !~ m/^arial$/i);
[14665]	171	}
	172
	173	$$text .= $line;
	174	}
	175
[16024]	176	if ($self->{'input_encoding'} eq "iso_8859_1") {
[14665]	177	# convert to utf-8
	178	$$text=&unicode::unicode2utf8(&unicode::convert2unicode("iso_8859_1", $text));
	179	}
	180	# convert any alphanumeric character entities to their utf-8
	181	# equivalent for indexing purposes
	182	#&ghtml::convertcharentities ($$text);
	183
	184	$$text =~ s/\s+/ /g; # remove \n's
	185	}
	186
	187	sub HB_clean_section {
	188	my $self = shift (@_);
	189	my ($section) = @_;
	190
	191	# remove tags without a starting tag from the section
	192	my ($tag, $tagstart);
[16769]	193	while ($section =~ m/<\/([^>]{1,10})>/) {
[14665]	194	$tag = $1;
	195	$tagstart = index($section, "<$tag");
	196	last if (($tagstart >= 0) && ($tagstart < index($section, "<\/$tag")));
	197	$section =~ s/<\/$tag>//;
	198	}
	199
	200	# remove extra paragraph tags
	201	while ($section =~ s/<p\b[^>]>\s<p\b/<p/ig) {}
	202
	203	# remove extra stuff at the end of the section
	204	while ($section =~ s/(<u>\|<i>\|<b>\|<p\b[^>]*>\| \|\s)$//i) {}
	205
	206	# add a newline at the beginning of each paragraph
	207	$section =~ s/(.)\s*<p\b/$1\n\n<p/gi;
	208
	209	# add a newline every 80 characters at a word boundary
	210	# Note: this regular expression puts a line feed before
	211	# the last word in each section, even when it is not
	212	# needed.
	213	$section =~ s/(.{1,80})\s/$1\n/g;
	214
	215	# fix up the image links
	216	$section =~ s/<img[^>]?src=\"?([^\">]+)\"?[^>]>/
	217	<center><img src=\"$1\" \/><\/center><br\/>/ig;
	218	$section =~ s/<<I>>\s*([^\.]+\.(png\|jpg\|gif))/
	219	<center><img src=\"$1\" \/><\/center><br\/>/ig;
	220
	221	return $section;
	222	}
[16024]	223
# Converts a file in the old HDL format (sections introduced by
# &lt;&lt;TOCn&gt;&gt; markers) to the new HDL format, in which sections are
# described by <Section> tags placed inside HTML comments.
#
#   $file   - path of the old-style input file
#   $cnfile - path of the file to write
#
# Returns $cnfile.  Dies if the output file cannot be opened or closed.
sub convert_to_newHDLformat
{
    my $self = shift (@_);
    my ($file,$cnfile) = @_;
    my $input_filename = $file;
    my $tmp_filename = $cnfile;

    # write HTML tmp file with new HDL format
    open (my $prod, '>', $tmp_filename) || die("Error Writing to File: $tmp_filename $!");

    # read in the file and do basic html cleaning (removing header etc)
    my $html = "";
    $self->HB_read_html_file ($input_filename, \$html);

    # process the file one section at a time
    my $curtoclevel = 1;
    my $firstsection = 1;
    my $toclevel = 0;
    while (length ($html) > 0) {
        # $3 = TOC level, $4 = section title (up to the next <p>)
        if ($html =~ s/^.*?(?:<p\b[^>]*>)?((<b>|<i>|<u>|\s)*)&lt;&lt;TOC(\d+)&gt;&gt;\s*(.*?)<p\b/<p/i) {
            $toclevel = $3;
            my $title = $4;
            my $sectiontext = "";
            # everything up to the next TOC marker belongs to this section;
            # the marker itself ($2) is put back for the next iteration
            if ($html =~ s/^(.*?)((?:<p\b[^>]*>)?((<b>|<i>|<u>|\s)*)&lt;&lt;TOC\d+&gt;&gt;)/$2/i) {
                $sectiontext = $1;
            } else {
                $sectiontext = $html;
                $html = "";
            }

            # remove tags and extra spaces from the title
            $title =~ s/<\/?[^>]+>//g;
            $title =~ s/^\s+|\s+$//g;

            # close any sections below the current level and
            # create a new section (special case for the firstsection)
            print $prod "<!--\n";
            while (($curtoclevel > $toclevel) ||
                   (!$firstsection && $curtoclevel == $toclevel)) {
                $curtoclevel--;
                print $prod "</Section>\n";
            }
            if ($curtoclevel+1 < $toclevel) {
                print STDERR "WARNING - jump in toc levels in $input_filename " .
                    "from $curtoclevel to $toclevel\n";
            }
            while ($curtoclevel < $toclevel) {
                $curtoclevel++;
            }

            if ($curtoclevel == 1) {
                # add the header tag
                print $prod "-->\n";
                print $prod "<HTML>\n<HEAD>\n<TITLE>$title</TITLE>\n</HEAD>\n<BODY>\n";
                print $prod "<!--\n";
            }

            print $prod "<Section>\n\t<Description>\n\t\t<Metadata name=\"Title\">$title</Metadata>\n\t</Description>\n";

            print $prod "-->\n";

            # clean up the section html
            $sectiontext = $self->HB_clean_section($sectiontext);

            print $prod "$sectiontext\n";

        } else {
            print STDERR "WARNING - leftover text\n" , $self->shorten($html),
                "\nin $input_filename\n";
            last;
        }
        $firstsection = 0;
    }

    # close every section that is still open
    print $prod "<!--\n";
    while ($curtoclevel > 0) {
        $curtoclevel--;
        print $prod "</Section>\n";
    }
    print $prod "-->\n";

    close ($prod) || die("Error Closing File: $tmp_filename $!");

    return $tmp_filename;
}
	310
	311	sub shorten {
	312	my $self = shift (@_);
	313	my ($text) = @_;
	314
	315	return "\"$text\"" if (length($text) < 100);
	316
	317	return "\"" . substr ($text, 0, 50) . "\" ... \"" .
	318	substr ($text, length($text)-50) . "\"";
	319	}
	320
	321	sub convert_tidy_or_oldHDL_file
	322	{
	323	my $self = shift (@_);
	324	my ($file) = @_;
	325	my $input_filename = $file;
[16024]	326
[14665]	327	if (-d $input_filename)
	328	{
	329	return $input_filename;
	330	}
	331
	332	# get the input filename
	333	my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
	334	my $base_dirname = $dirname;
	335	$suffix = lc($suffix);
	336
	337	# derive tmp filename from input filename
	338	# Remove any white space from filename -- no risk of name collision, and
	339	# makes later conversion by utils simpler. Leave spaces in path...
	340	# tidy up the filename with space, dot, hyphen between
	341	$tailname =~ s/\s+//g;
	342	$tailname =~ s/\.+//g;
	343	$tailname =~ s/\-+//g;
	344	# convert to utf-8 otherwise we have problems with the doc.xml file
	345	# later on
	346	&unicode::ensure_utf8(\$tailname);
	347
	348	# softlink to collection tmp dir
	349	my $tmp_dirname = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "tidytmp");
	350	&util::mk_dir($tmp_dirname) if (!-e $tmp_dirname);
	351
	352	my $test_dirname = "";
[14913]	353	my $f_separator = &util::get_os_dirsep();
	354
[16769]	355	if ($dirname =~ m/import$f_separator/)
[14665]	356	{
[16024]	357	$test_dirname = $'; #'
[14665]	358
	359	#print STDERR "init $'\n";
	360
[16769]	361	while ($test_dirname =~ m/[$f_separator]/)
[14665]	362	{
[16024]	363	my $folderdirname = $`;
	364	$tmp_dirname = &util::filename_cat($tmp_dirname,$folderdirname);
	365	&util::mk_dir($tmp_dirname) if (!-e $tmp_dirname);
	366	$test_dirname = $'; #'
[14665]	367	}
	368	}
	369
	370	my $tmp_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix");
	371
	372	# tidy or convert the input file if it is a HTML-like file or it is accepted by the process_exp
	373	if (($suffix eq ".htm") \|\| ($suffix eq ".html") \|\| ($suffix eq ".shtml"))
	374	{
	375	#convert the input file to a new style HDL
	376	my $hdl_output_filename = $input_filename;
	377	if ($self->{'old_style_HDL'})
	378	{
[16024]	379	$hdl_output_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix");
	380	$hdl_output_filename = $self->convert_to_newHDLformat($input_filename,$hdl_output_filename);
[14665]	381	}
	382
	383	#just for checking copy all other file from the base dir to tmp dir if it is not exists
	384	opendir(DIR,$base_dirname) or die "Can't open base directory : $base_dirname!";
	385	my @files = grep {!/^\.+$/} readdir(DIR);
	386	close(DIR);
	387
	388	foreach my $file (@files)
	389	{
[16024]	390	my $src_file = &util::filename_cat($base_dirname,$file);
	391	my $dest_file = &util::filename_cat($tmp_dirname,$file);
	392	if ((!-e $dest_file) && (!-d $src_file))
	393	{
	394	# just copy the original file back to the tmp directory
	395	copy($src_file,$dest_file) or die "Can't copy file $src_file to $dest_file $!";
	396	}
[14665]	397	}
	398
	399	# tidy the input file
	400	my $tidy_output_filename = $hdl_output_filename;
[17863]	401	if ($self->{'use_realistic_book'})
[14665]	402	{
[16024]	403	$tidy_output_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix");
	404	$tidy_output_filename = $self->tmp_tidy_file($hdl_output_filename,$tidy_output_filename);
[14665]	405	}
	406	$tmp_filename = $tidy_output_filename;
	407	}
	408	else
	409	{
	410	if (!-e $tmp_filename)
	411	{
[16024]	412	# just copy the original file back to the tmp directory
	413	copy($input_filename,$tmp_filename) or die "Can't copy file $input_filename to $tmp_filename $!";
[14665]	414	}
	415	}
	416
	417	return $tmp_filename;
	418	}
	419
[16024]	420
# Turns the HTML input file into a proper XML file with <font> tags removed
# and image sizes added to <img> tags.
# The tidying process takes place in a collection specific 'tmp' directory so
# that we don't accidentally damage the input.
#
#   $file   - path of the HTML file to read
#   $cnfile - path of the file to write (may be the same path as $file)
#
# Returns $cnfile.  Dies if the input cannot be parsed or the output cannot
# be written.
sub tmp_tidy_file
{
    my $self = shift (@_);
    my ($file,$cnfile) = @_;
    my $input_filename = $file;
    my $tmp_filename = $cnfile;

    # directory of the input file, used to resolve relative image paths
    my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");

    require HTML::TokeParser::Simple;

    # create HTML parser to decode the input file
    my $parser = HTML::TokeParser::Simple->new($input_filename)
        or die("Error Reading File: $input_filename $!");

    # Build the rewritten document in memory first.  The caller may pass the
    # same path for input and output; opening the output before the parser
    # has consumed the input would truncate the file being read.
    my $rewritten = "";
    while (my $token = $parser->get_token())
    {
        if ($token->is_start_tag('img'))
        {
            # get the attributes
            my $attr = $token->return_attr;

            # set the width and height attribute when the size is known
            if (defined $attr->{src})
            {
                my $img_file = &util::filename_cat($dirname,$attr->{src});
                my ($width, $height) = imgsize($img_file);
                if (defined $width && defined $height)
                {
                    ($attr->{width}, $attr->{height}) = ($width, $height);
                }
            }

            # recreate the tag; the parser reports the "/" of a
            # self-closing tag as an attribute named "/", which is skipped
            $rewritten .= "<img";
            foreach my $name (sort keys %$attr)
            {
                next if $name eq '/';
                my $value = defined $attr->{$name} ? $attr->{$name} : "";
                $rewritten .= " $name=\"$value\"";
            }
            $rewritten .= ">";
        }
        elsif (($token->is_start_tag('font')) || ($token->is_end_tag('font')))
        {
            # remove font tag: emit nothing
        }
        else
        {
            # print without changes
            $rewritten .= $token->as_is;
        }
    }

    # write HTML tmp file without the font tag and with image sizes added
    open (my $prod, '>', $tmp_filename) || die("Error Writing to File: $tmp_filename $!");
    print $prod $rewritten;
    close ($prod) || die("Error Closing File: $tmp_filename $!");

    # run html-tidy on the tmp file to make it a proper XML file
    # NOTE(review): the filename is interpolated into a shell command line;
    # a name containing a double quote or shell metacharacters would break
    # out of the quoting.
    my $tidyfile = `tidy -utf8 -wrap 0 -asxml "$tmp_filename"`;

    # write result back to the tmp file, but never replace it with nothing
    # (tidy missing or failed outright)
    if (defined $tidyfile && length($tidyfile))
    {
        open ($prod, '>', $tmp_filename) || die("Error Writing to File: $tmp_filename $!");
        print $prod $tidyfile;
        close ($prod) || die("Error Closing File: $tmp_filename $!");
    }
    else
    {
        print STDERR "WARNING - tidy produced no output for $tmp_filename\n";
    }

    # return the output filename
    return $tmp_filename;
}
	489
	490	sub read_into_doc_obj
	491	{
	492	my $self = shift (@_);
[16392]	493	my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
	494
[14665]	495	# get the input file
	496	my $input_filename = $file;
	497	my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
	498	$suffix = lc($suffix);
	499
[17863]	500	if (($self->{'use_realistic_book'}) \|\| ($self->{'old_style_HDL'}))
[14665]	501	{
[15872]	502	# because the document has to be sectionalized set the description tags
	503	$self->{'description_tags'} = 1;
[14665]	504
[15872]	505	# set the file to be tidied
[16769]	506	$input_filename = &util::filename_cat($base_dir,$file) if $base_dir =~ m/\w/;
[15872]	507
	508	# get the tidied file
	509	#my $tidy_filename = $self->tmp_tidy_file($input_filename);
	510	my $tidy_filename = $self->convert_tidy_or_oldHDL_file($input_filename);
	511
	512	# derive tmp filename from input filename
	513	my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($tidy_filename, "\\.[^\\.]+\$");
	514
	515	# set the new input file and base_dir to be from the tidied file
	516	$file = "$tailname$suffix";
	517	$base_dir = $dirname;
[14665]	518	}
	519
	520	# call the parent read_into_doc_obj
[16392]	521	my ($process_status,$doc_obj) = $self->SUPER::read_into_doc_obj($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli);
[14665]	522
	523	return ($process_status,$doc_obj);
	524	}
	525
	526	sub new {
	527	my ($class) = shift (@_);
	528	my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
	529	push(@$pluginlist, $class);
	530
[15872]	531	push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
	532	push(@{$hashArgOptLists->{"OptList"}},$options);
[16024]	533
[14665]	534
[15872]	535	my $self = new ReadTextFile($pluginlist,$inputargs,$hashArgOptLists);
[14665]	536
	537	if ($self->{'w3mir'}) {
	538	$self->{'file_is_url'} = 1;
	539	}
	540	$self->{'aux_files'} = {};
	541	$self->{'dir_num'} = 0;
	542	$self->{'file_num'} = 0;
	543
	544	return bless $self, $class;
	545	}
	546
	547	# may want to use (?i)\.(gif\|jpe?g\|jpe\|png\|css\|js(?:@.*)?)$
	548	# if have eg <script language="javascript" src="img/lib.js@123">
	549	sub get_default_block_exp {
	550	my $self = shift (@_);
	551
[16392]	552	#return q^(?i)\.(gif\|jpe?g\|jpe\|jpg\|png\|css)$^;
	553	return "";
[14665]	554	}
	555
	556	sub get_default_process_exp {
	557	my $self = shift (@_);
	558
	559	# the last option is an attempt to encode the concept of an html query ...
	560	return q^(?i)(\.html?\|\.shtml\|\.shm\|\.asp\|\.php\d?\|\.cgi\|.+\?.+=.*)$^;
	561	}
	562
	563	sub store_block_files
	564	{
	565	my $self =shift (@_);
[16392]	566	my ($filename_full_path, $block_hash) = @_;
	567
	568	my $html_fname = $filename_full_path;
[14665]	569	my @file_blocks;
	570
[16392]	571	my ($language, $encoding) = $self->textcat_get_language_encoding ($filename_full_path);
[14665]	572
	573	# read in file ($text will be in utf8)
[16769]	574	my $raw_text = "";
	575	$self->read_file_no_decoding ($filename_full_path, \$raw_text);
	576
	577	my $textref = \$raw_text;
[14665]	578	my $opencom = '(?:<!--\|<!(?:—\|\|--))';
	579	my $closecom = '(?:-->\|(?:—\|\|--)>)';
	580	$$textref =~ s/$opencom(.*?)$closecom//gs;
	581
	582	my $attval = "\\\"[^\\\"]+\\\"\|[^\\s>]+";
	583	my @img_matches = ($$textref =~ m/<img[^>]?src\s=\s($attval)[^>]>/igs);
	584	my @usemap_matches = ($$textref =~ m/<img[^>]?usemap\s=\s($attval)[^>]>/igs);
	585	my @link_matches = ($$textref =~ m/<link[^>]?href\s=\s($attval)[^>]>/igs);
	586	my @embed_matches = ($$textref =~ m/<embed[^>]?src\s=\s($attval)[^>]>/igs);
[17127]	587	my @tabbg_matches = ($$textref =~ m/<(?:body\|table\|tr\|td)[^>]?background\s=\s($attval)[^>]>/igs);
[16638]	588	my @script_matches = ($$textref =~ m/<script[^>]?src\s=\s($attval)[^>]>/igs);
[14665]	589
[16769]	590	if(!defined $self->{'utf8_to_original_filename'}) {
	591	# maps from utf8 converted link name -> original filename referrred to by (possibly URL-encoded) src url
	592	$self->{'utf8_to_original_filename'} = {};
	593	}
	594
[16638]	595	foreach my $link (@img_matches, @usemap_matches, @link_matches, @embed_matches, @tabbg_matches, @script_matches) {
[14665]	596
	597	# remove quotes from link at start and end if necessary
	598	if ($link=~/^\"/) {
	599	$link=~s/^\"//;
	600	$link=~s/\"$//;
	601	}
	602
	603	$link =~ s/\#.*$//s; # remove any anchor names, e.g. foo.html#name becomes foo.html
[16638]	604	# some links may just be anchor names
	605	next unless ($link =~ /\S+/);
[14665]	606
	607	if ($link !~ m@^/@ && $link !~ m/^([A-Z]:?)\\/) {
	608	# Turn relative file path into full path
[16392]	609	my $dirname = &File::Basename::dirname($filename_full_path);
[14665]	610	$link = &util::filename_cat($dirname, $link);
	611	}
	612	$link = $self->eval_dir_dots($link);
[16638]	613
[16769]	614	# this is the actual filename on the filesystem (that the link refers to)
	615	my $url_original_filename = $self->opt_url_decode($link);
	616
	617	# Convert the url_original_filename into its utf8 version. Store the utf8 link along with the url_original_filename
	618	my $utf8_link = "";
	619	$self->decode_text($link,$encoding,$language,\$utf8_link);
	620
	621	$self->{'utf8_to_original_filename'}->{$utf8_link} = $url_original_filename;
[16935]	622	# print STDERR "**** utf8_encoded_link to original src filename:\n\t$utf8_link\n\t".$self->{'utf8_to_original_filename'}->{$utf8_link}."\n";
[16769]	623
[17088]	624	if ($url_original_filename ne $utf8_link) {
	625	my $outhandle = $self->{'outhandle'};
	626
	627	print $outhandle "URL Encoding $url_original_filename\n";
	628	print $outhandle " ->$utf8_link\n";
	629	}
	630
[16769]	631	$block_hash->{'file_blocks'}->{$url_original_filename} = 1;
[14665]	632	}
	633	}
	634
# Given a link that may be URL-encoded, works out the filename it refers to.
# The link is URL-decoded only when it contains at least one %XX escape AND
# no file with the still-encoded name exists (a filename may legitimately
# contain %XX).  Returns the resulting filename; no other value is returned.
sub opt_url_decode {
    my ($self, $link) = @_;

    # nothing to decode
    return $link unless $link =~ m/\%[A-F0-9]{2}/i;

    # a file with the literal, still-encoded name exists: keep it as is
    return $link if -e $link;

    return &unicode::url_decode($link);
}
	655
	656
# do plugin specific processing of doc_obj
#
#   $textref  - reference to the document text (modified in place)
#   $base_dir - base directory of the file
#   $file     - filename relative to $base_dir
#   $metadata - metadata passed on to extract_metadata
#   $doc_obj  - document object that receives sections, text and metadata
#   ($pluginfo and $gli are accepted but not used here)
#
# Depending on the options the document is processed either as a single
# section or, with description_tags, as a set of sections described by
# <Section>/<Metadata> tags embedded in HTML comments.  Returns 1.
sub process {
    my $self = shift (@_);
    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
    my $outhandle = $self->{'outhandle'};

    # GSDLOS may be unset; guard so the match cannot warn
    if (defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} =~ m/^windows/i) {
        # this makes life so much easier... perl can cope with unix-style '/'s.
        $base_dir =~ s@(\\)+@/@g;
        $file =~ s@(\\)+@/@g;
    }

    # reset per-doc stuff...
    $self->{'aux_files'} = {};
    $self->{'dir_num'} = 0;
    $self->{'file_num'} = 0;

    # process an HTML file where sections are divided by headings tags (H1, H2 ...)
    # you can also include metadata in the format (X can be any number)
    # <hX>Title<!--gsdl-metadata
    #   <Metadata name="name1">value1</Metadata>
    #   ...
    #   <Metadata name="nameN">valueN</Metadata>
    #--></hX>
    if ($self->{'sectionalise_using_h_tags'}) {
        # description_tags should always be activated because we convert headings to description tags
        $self->{'description_tags'} = 1;

        my $arrSections = [];
        $$textref =~ s/<h([0-9]+)[^>]*>(.*?)<\/h[0-9]+>/$self->process_heading($1, $2, $arrSections, $file)/isge;

        if (scalar(@$arrSections)) {
            # -1 is lower than any heading level, so this closes every
            # section that is still open
            my $strMetadata = $self->update_section_data($arrSections, -1);
            if (length($strMetadata)) {
                $strMetadata = '<!--' . $strMetadata . "\n-->\n</body>";
                $$textref =~ s/<\/body>/$strMetadata/ig;
            }
        }
    }

    my $cursection = $doc_obj->get_top_section();

    $self->extract_metadata ($textref, $metadata, $doc_obj, $cursection)
        unless $self->{'no_metadata'} || $self->{'description_tags'};

    # Store URL for page as metadata - this can be used for an
    # altavista style search interface. The URL won't be valid
    # unless the file structure contains the domain name (i.e.
    # like when w3mir is used to download a website).

    # URL metadata (even invalid ones) are used to support internal
    # links, so even if 'file_is_url' is off, still need to store info

    my ($tailname,$dirname,$suffix) = &File::Basename::fileparse($file, "\\.[^\\.]+\$");
    my $utf8_file = $self->filename_to_utf8_metadata($file);
    $utf8_file =~ s/&\#095;/_/g;
    my $web_url = "http://";
    if(defined $dirname) { # local directory
        $dirname = $self->eval_dir_dots($dirname);
        $dirname .= &util::get_dirsep() if $dirname ne ""; # if there's a directory, it should end on "/"
        $web_url = $web_url.$dirname.$utf8_file;
    } else {
        $web_url = $web_url.$utf8_file;
    }
    $web_url =~ s/\\/\//g;
    $doc_obj->add_utf8_metadata($cursection, "URL", $web_url);

    if ($self->{'file_is_url'}) {
        $doc_obj->add_metadata($cursection, "weblink", "<a href=\"$web_url\">");
        $doc_obj->add_metadata($cursection, "webicon", "_iconworld_");
        $doc_obj->add_metadata($cursection, "/weblink", "</a>");
    }

    if ($self->{'description_tags'}) {
        # remove the html header - note that doing this here means any
        # sections defined within the header will be lost (so all <Section>
        # tags must appear within the body of the HTML)
        my ($head_keep) = ($$textref =~ m/^(.*?)<body[^>]*>/is);
        # no <body> tag: there is no header text to keep
        $head_keep = '' unless defined $head_keep;

        $$textref =~ s/^.*?<body[^>]*>//is;
        $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;

        # comment delimiters, angle brackets and quotes may appear either
        # literally or written as entities
        my $opencom = '(?:<!--|&lt;!(?:&mdash;|&#151;|--))';
        my $closecom = '(?:-->|(?:&mdash;|&#151;|--)&gt;)';

        my $lt = '(?:<|&lt;)';
        my $gt = '(?:>|&gt;)';
        my $quot = '(?:"|&quot;|&rdquo;|&ldquo;)';

        # comma-separated list of metadata names -> regex alternation
        my $dont_strip = '';
        if ($self->{'no_strip_metadata_html'}) {
            ($dont_strip = $self->{'no_strip_metadata_html'}) =~ s{,}{|}g;
        }

        my $found_something = 0; my $top = 1;
        while ($$textref =~ s/^(.*?)$opencom(.*?)$closecom//s) {
            my $text = $1;
            my $comment = $2;
            if (defined $text) {
                # text before a comment - note that getting to here
                # doesn't necessarily mean there are Section tags in
                # the document
                $self->process_section(\$text, $base_dir, $file, $doc_obj, $cursection);
            }
            while ($comment =~ s/$lt(.*?)$gt//s) {
                my $tag = $1;
                if ($tag eq "Section") {
                    $found_something = 1;
                    $cursection = $doc_obj->insert_section($doc_obj->get_end_child($cursection)) unless $top;
                    $top = 0;
                } elsif ($tag eq "/Section") {
                    $found_something = 1;
                    $cursection = $doc_obj->get_parent_section ($cursection);
                } elsif ($tag =~ m/^Metadata name=$quot(.*?)$quot/s) {
                    my $metaname = $1;
                    my $accumulate = $tag =~ m/mode=${quot}accumulate${quot}/ ? 1 : 0;
                    # take the value only if the closing tag is really
                    # there; otherwise $1 would still hold a capture from
                    # an earlier match
                    my $metavalue = "";
                    if ($comment =~ s/^(.*?)$lt\/Metadata$gt//s) {
                        $metavalue = $1;
                    }
                    $metavalue =~ s/^\s+//;
                    $metavalue =~ s/\s+$//;
                    # assume that no metadata value intentionally includes
                    # carriage returns or HTML tags (if they're there they
                    # were probably introduced when converting to HTML from
                    # some other format).
                    # actually some people want to have html tags in their
                    # metadata.
                    $metavalue =~ s/[\cJ\cM]/ /sg;
                    $metavalue =~ s/<[^>]+>//sg
                        unless $dont_strip && ($dont_strip eq 'all' || $metaname =~ m/^($dont_strip)$/);
                    $metavalue =~ s/\s+/ /sg;
                    if ($accumulate) {
                        $doc_obj->add_utf8_metadata($cursection, $metaname, $metavalue);
                    } else {
                        $doc_obj->set_utf8_metadata_element($cursection, $metaname, $metavalue);
                    }
                } elsif ($tag eq "Description" || $tag eq "/Description") {
                    # do nothing with containing Description tags
                } else {
                    # simple HTML tag (probably created by the conversion
                    # to HTML from some other format) - we'll ignore it and
                    # hope for the best ;-)
                }
            }
        }
        if ($cursection ne "") {
            print $outhandle "HTMLPlugin: WARNING: $file contains unmatched <Section></Section> tags\n";
        }

        $$textref =~ s/^.*?<body[^>]*>//is;
        $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
        if ($$textref =~ m/\S/) {
            if (!$found_something) {
                if ($self->{'verbosity'} > 2) {
                    print $outhandle "HTMLPlugin: WARNING: $file appears to contain no Section tags so\n";
                    print $outhandle " will be processed as a single section document\n";
                }

                # go ahead and process single-section document
                $self->process_section($textref, $base_dir, $file, $doc_obj, $cursection);

                # if document contains no Section tags we'll go ahead
                # and extract metadata (this won't have been done
                # above as the -description_tags option prevents it)
                my $complete_text = $head_keep.$doc_obj->get_text($cursection);
                $self->extract_metadata (\$complete_text, $metadata, $doc_obj, $cursection)
                    unless $self->{'no_metadata'};

            } else {
                print $outhandle "HTMLPlugin: WARNING: $file contains the following text outside\n";
                print $outhandle " of the final closing </Section> tag. This text will\n";
                print $outhandle " be ignored.";

                my ($text);
                if (length($$textref) > 30) {
                    $text = substr($$textref, 0, 30) . "...";
                } else {
                    $text = $$textref;
                }
                $text =~ s/\n/ /isg;
                print $outhandle " ($text)\n";
            }
        } elsif (!$found_something) {

            if ($self->{'verbosity'} > 2) {
                # may get to here if document contained no valid Section
                # tags but did contain some comments. The text will have
                # been processed already but we should print the warning
                # as above and extract metadata
                print $outhandle "HTMLPlugin: WARNING: $file appears to contain no Section tags and\n";
                print $outhandle " is blank or empty. Metadata will be assigned if present.\n";
            }

            my $complete_text = $head_keep.$doc_obj->get_text($cursection);
            $self->extract_metadata (\$complete_text, $metadata, $doc_obj, $cursection)
                unless $self->{'no_metadata'};
        }

    } else {

        # remove header and footer
        if (!$self->{'keep_head'} || $self->{'description_tags'}) {
            $$textref =~ s/^.*?<body[^>]*>//is;
            $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
        }

        # single section document
        $self->process_section($textref, $base_dir, $file, $doc_obj, $cursection);
    }
    return 1;
}
	867
	868
	869	sub process_heading
	870	{
	871	my ($self, $nHeadNo, $strHeadingText, $arrSections, $file) = @_;
	872	$strHeadingText = '' if (!defined($strHeadingText));
	873
	874	my $strMetadata = $self->update_section_data($arrSections, int($nHeadNo));
	875
	876	my $strSecMetadata = '';
	877	while ($strHeadingText =~ s/<!--gsdl-metadata(.*?)-->//is)
	878	{
	879	$strSecMetadata .= $1;
	880	}
	881
	882	$strHeadingText =~ s/^\s+//g;
	883	$strHeadingText =~ s/\s+$//g;
	884	$strSecMetadata =~ s/^\s+//g;
	885	$strSecMetadata =~ s/\s+$//g;
	886
	887	$strMetadata .= "\n<Section>\n\t<Description>\n\t\t<Metadata name=\"Title\">" . $strHeadingText . "</Metadata>\n";
	888
	889	if (length($strSecMetadata)) {
	890	$strMetadata .= "\t\t" . $strSecMetadata . "\n";
	891	}
	892
	893	$strMetadata .= "\t</Description>\n";
	894
	895	return "<!--" . $strMetadata . "-->";
	896	}
	897
	898
	899	sub update_section_data
	900	{
	901	my ($self, $arrSections, $nCurTocNo) = @_;
	902	my ($strBuffer, $nLast, $nSections) = ('', 0, scalar(@$arrSections));
	903
	904	if ($nSections == 0) {
	905	push @$arrSections, $nCurTocNo;
	906	return $strBuffer;
	907	}
	908	$nLast = $arrSections->[$nSections - 1];
	909	if ($nCurTocNo > $nLast) {
	910	push @$arrSections, $nCurTocNo;
	911	return $strBuffer;
	912	}
	913	for(my $i = $nSections - 1; $i >= 0; $i--) {
	914	if ($nCurTocNo <= $arrSections->[$i]) {
	915	$strBuffer .= "\n</Section>";
	916	pop @$arrSections;
	917	}
	918	}
	919	push @$arrSections, $nCurTocNo;
	920	return $strBuffer;
	921	}
	922
	923
	924	# note that process_section may be called multiple times for a single
	925	# section (relying on the fact that add_utf8_text appends the text to any
	926	# that may exist already).
	927	sub process_section {
	928	my $self = shift (@_);
	929	my ($textref, $base_dir, $file, $doc_obj, $cursection) = @_;
	930	# trap links
	931	if (!$self->{'nolinks'}) {
	932	# usemap="./#index" not handled correctly => change to "#index"
[16769]	933	## $$textref =~ s/(<img[^>]?usemap\s=\s[\"\']?)([^\"\'>\s]+)([\"\']?[^>]>)/
	934
	935	$$textref =~ s/(<img[^>]?usemap\s=\s)((?:[\"][^\"]+[\"])\|(?:[\'][^\']+[\'])\|(?:[^\s\/>]+))([^>]>)/
[14665]	936	$self->replace_usemap_links($1, $2, $3)/isge;
	937
[16769]	938	## $$textref =~ s/(<(?:a\|area\|frame\|link\|script)\s+[^>]?\s(?:href\|src)\s=\s[\"\']?)([^\"\'>\s]+)([\"\']?[^>]*>)/
	939
	940	$$textref =~ s/(<(?:a\|area\|frame\|link\|script)\s+[^>]?\s(?:href\|src)\s=\s)((?:[\"][^\"]+[\"])\|(?:[\'][^\']+[\'])\|(?:[^\s\/>]+))([^>]*>)/
[14665]	941	$self->replace_href_links ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
	942	}
	943
	944	# trap images
	945
[15872]	946	# Previously, by default, HTMLPlugin would embed <img> tags inside anchor tags
[15176]	947	# i.e. <a href="image><img src="image"></a> in order to overcome a problem that
	948	# turned regular text succeeding images into links. That is, by embedding <imgs>
	949	# inside <a href=""></a>, the text following images were no longer misbehaving.
	950	# However, there would be many occasions whereby images were not meant to link
	951	# to their source images but where the images would link to another web page.
	952	# To allow this, the no_image_links option was introduced: it would prevent
	953	# the behaviour of embedding images into links that referenced the source images.
	954
	955	# Somewhere along the line, the problem of normal text turning into links when
	956	# such text followed images which were not embedded in <a href=""></a> ceased
	957	# to occur. This is why the following lines have been commented out (as well as
	958	# two lines in replace_images). They appear to no longer apply.
	959
	960	# If at any time, there is a need for having images embedded in <a> anchor tags,
[15872]	961	# then it might be better to turn that into an HTMLPlugin option rather than make
[15176]	962	# it the default behaviour. Also, eventually, no_image_links needs to become
[15872]	963	# a deprecated option for HTMLPlugin as it has now become the default behaviour.
[15176]	964
	965	#if(!$self->{'no_image_links'}){
[16247]	966	$$textref =~ s/(<(?:img\|embed\|table\|tr\|td)[^>]?(?:src\|background)\s=\s)((?:[\"][^\"]+[\"])\|(?:[\'][^\']+[\'])\|(?:[^\s\/>]+))([^>]>)/
[15872]	967	$self->replace_images ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
[15176]	968	#}
	969
[14665]	970	# add text to document object
	971	# turn \ into \\ so that the rest of greenstone doesn't think there
	972	# is an escape code following. (Macro parsing loses them...)
	973	$$textref =~ s/\\/\\\\/go;
	974
	975	$doc_obj->add_utf8_text($cursection, $$textref);
	976	}
	977
	978	sub replace_images {
	979	my $self = shift (@_);
	980	my ($front, $link, $back, $base_dir,
	981	$file, $doc_obj, $section) = @_;
	982
	983	# remove quotes from link at start and end if necessary
	984	if ($link=~/^[\"\']/) {
[15838]	985	$link=~s/^[\"\']//;
	986	$link=~s/[\"\']$//;
[14665]	987	$front.='"';
	988	$back="\"$back";
	989	}
[15872]	990
[14665]	991	$link =~ s/\n/ /g;
	992
	993	# Hack to overcome Windows wv 0.7.1 bug that causes embedded images to be broken
	994	# If the Word file path has spaces in it, wv messes up and you end up with
	995	# absolute paths for the images, and without the "file://" prefix
	996	# So check for this special case and massage the data to be correct
[16769]	997	if ($ENV{'GSDLOS'} =~ m/^windows/i && $self->{'plugin_type'} eq "WordPlug" && $link =~ m/^[A-Za-z]\:\\/) {
[14665]	998	$link =~ s/^.*\\([^\\]+)$/$1/;
	999	}
[16632]	1000
[14665]	1001	my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
	1002
	1003	my $img_file = $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section);
	1004
[17127]	1005	# print STDERR "** link = $link\n href = $href\n** img_file = $img_file, rl = $rl\n";
[16632]	1006
[14665]	1007	my $anchor_name = $img_file;
	1008	#$anchor_name =~ s/^.*\///;
	1009	#$anchor_name = "<a name=\"$anchor_name\" ></a>";
	1010
	1011	my $image_link = $front . $img_file .$back;
[15176]	1012	return $image_link;
[14665]	1013
[15176]	1014	# The reasons for why the following two lines are no longer necessary can be
	1015	# found in subroutine process_section
	1016	#my $anchor_link = "<a href=\"$img_file\" >".$image_link."</a>";
	1017	#return $anchor_link;
	1018
[14665]	1019	#return $front . $img_file . $back . $anchor_name;
	1020	}
	1021
	1022	sub replace_href_links {
	1023	my $self = shift (@_);
	1024	my ($front, $link, $back, $base_dir, $file, $doc_obj, $section) = @_;
	1025
[16769]	1026	# remove quotes from link at start and end if necessary
	1027	if ($link=~/^[\"\']/) {
	1028	$link=~s/^[\"\']//;
	1029	$link=~s/[\"\']$//;
	1030	$front.='"';
	1031	$back="\"$back";
	1032	}
	1033
[14665]	1034	# attempt to sort out targets - frames are not handled
	1035	# well in this plugin and some cases will screw things
	1036	# up - e.g. the _parent target (so we'll just remove
	1037	# them all ;-)
	1038	$front =~ s/(target=\"?)_top(\"?)/$1_gsdltop_$2/is;
	1039	$back =~ s/(target=\"?)_top(\"?)/$1_gsdltop_$2/is;
	1040	$front =~ s/target=\"?_parent\"?//is;
	1041	$back =~ s/target=\"?_parent\"?//is;
	1042
[16769]	1043	return $front . $link . $back if $link =~ m/^\#/s;
[14665]	1044	$link =~ s/\n/ /g;
	1045
[16769]	1046	# Find file referred to by $link on file system
	1047	# This is more complicated than it sounds when char encodings
	1048	# is taken in to account
[14665]	1049	my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
[16769]	1050
[14665]	1051	# href may use '\'s where '/'s should be on Windows
	1052	$href =~ s/\\/\//g;
[16769]	1053	my ($filename) = $href =~ m/^(?:.?):(?:\/\/)?(.)/;
[14665]	1054
	1055
	1056	##### leave all these links alone (they won't be picked up by intermediate
	1057	##### pages). I think that's safest when dealing with frames, targets etc.
	1058	##### (at least until I think of a better way to do it). Problems occur with
	1059	##### mailto links from within small frames, the intermediate page is displayed
	1060	##### within that frame and can't be seen. There is still potential for this to
	1061	##### happen even with html pages - the solution seems to be to somehow tell
	1062	##### the browser from the server side to display the page being sent (i.e.
	1063	##### the intermediate page) in the top level window - I'm not sure if that's
	1064	##### possible - the following line should probably be deleted if that can be done
[16769]	1065	return $front . $link . $back if $href =~ m/^(mailto\|news\|gopher\|nntp\|telnet\|javascript):/is;
[14665]	1066
	1067
[16769]	1068	if (($rl == 0) \|\| ($filename =~ m/$self->{'process_exp'}/) \|\|
	1069	($href =~ m/\/$/) \|\| ($href =~ m/^(mailto\|news\|gopher\|nntp\|telnet\|javascript):/i)) {
[16812]	1070	&ghtml::urlsafe ($href);
[18521]	1071	return $front . "_httpextlink_&rl=" . $rl . "&href=" . $href . $hash_part . $back;
[14665]	1072	} else {
	1073	# link is to some other type of file (eg image) so we'll
	1074	# need to associate that file
	1075	return $front . $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section) . $back;
	1076	}
	1077	}
	1078
	1079	sub add_file {
	1080	my $self = shift (@_);
	1081	my ($href, $rl, $hash_part, $base_dir, $doc_obj, $section) = @_;
	1082	my ($newname);
	1083
	1084	my $filename = $href;
	1085	if ($base_dir eq "") {
	1086	# remove http:/ thereby leaving one slash at the start
	1087	$filename =~ s/^[^:]*:\///;
	1088	}
	1089	else {
	1090	# remove http://
	1091	$filename =~ s/^[^:]*:\/\///;
	1092	}
	1093
	1094	$filename = &util::filename_cat($base_dir, $filename);
[16769]	1095	# Replace %XX's in URL with decoded value if required. Note that the filename may include the %XX in some
	1096	# situations. If the original file's name was in URL encoding, the following method will not decode it.
	1097	my $utf8_filename = $filename;
	1098	$filename = $self->opt_url_decode($utf8_filename);
	1099
	1100	# some special processing if the intended filename was converted to utf8, but
	1101	# the actual file still needs to be renamed
	1102	if (!-e $filename) {
	1103	# try the original filename stored in map
	1104	my $original_filename = $self->{'utf8_to_original_filename'}->{$filename};
[16920]	1105	if (defined $original_filename && -e $original_filename) {
[16769]	1106	$filename = $original_filename;
[14665]	1107	}
	1108	}
[16769]	1109
	1110	my ($ext) = $filename =~ m/(\.[^\.]*)$/;
[14665]	1111
	1112	if ($rl == 0) {
[16769]	1113	if ((!defined $ext) \|\| ($ext !~ m/$self->{'assoc_files'}/)) {
[18521]	1114	return "_httpextlink_&rl=0&el=prompt&href=" . $href . $hash_part;
[14665]	1115	}
	1116	else {
[18521]	1117	return "_httpextlink_&rl=0&el=direct&href=" . $href . $hash_part;
[14665]	1118	}
	1119	}
	1120
[16769]	1121	if ((!defined $ext) \|\| ($ext !~ m/$self->{'assoc_files'}/)) {
[18521]	1122	return "_httpextlink_&rl=" . $rl . "&href=" . $href . $hash_part;
[14665]	1123	}
	1124	if ($self->{'rename_assoc_files'}) {
	1125	if (defined $self->{'aux_files'}->{$href}) {
	1126	$newname = $self->{'aux_files'}->{$href}->{'dir_num'} . "/" .
	1127	$self->{'aux_files'}->{$href}->{'file_num'} . $ext;
	1128	} else {
	1129	$newname = $self->{'dir_num'} . "/" . $self->{'file_num'} . $ext;
	1130	$self->{'aux_files'}->{$href} = {'dir_num' => $self->{'dir_num'}, 'file_num' => $self->{'file_num'}};
	1131	$self->inc_filecount ();
	1132	}
	1133	$doc_obj->associate_file($filename, $newname, undef, $section);
	1134	return "_httpdocimg_/$newname";
	1135	} else {
[16904]	1136	if(&unicode::is_url_encoded($utf8_filename)) {
	1137	# use the possibly-decoded filename instead to avoid double URL encoding
	1138	($newname) = $filename =~ m/([^\/\\]*)$/;
	1139	} else {
	1140	($newname) = $utf8_filename =~ m/([^\/\\]*)$/;
	1141	}
[16935]	1142
[18320]	1143	# Make sure this name uses only ASCII characters.
	1144	# We use either base64 or URL encoding, as these preserve original encoding
	1145	$newname = &util::rename_file($newname, $self->{'file_rename_method'});
[16632]	1146
[14665]	1147	$doc_obj->associate_file($filename, $newname, undef, $section);
[16632]	1148
[16769]	1149	# Since the generated image will be URL-encoded to avoid file-system/browser mess-ups
	1150	# of filenames, URL-encode the additional percent signs of the URL-encoded filename
[16632]	1151	my $newname_url = $newname;
[18404]	1152	$newname_url = &unicode::filename_to_url($newname_url);
[16769]	1153	return "_httpdocimg_/$newname_url";
[14665]	1154	}
	1155	}
	1156
	1157
	1158	sub format_link {
	1159	my $self = shift (@_);
	1160	my ($link, $base_dir, $file) = @_;
	1161
[16769]	1162	my ($before_hash, $hash_part) = $link =~ m/^([^\#])(\#?.)$/;
[15872]	1163
[14665]	1164	$hash_part = "" if !defined $hash_part;
[16769]	1165	if (!defined $before_hash \|\| $before_hash !~ m/[\w\.\/]/) {
[14665]	1166	my $outhandle = $self->{'outhandle'};
[15872]	1167	print $outhandle "HTMLPlugin: ERROR - badly formatted tag ignored ($link)\n"
[14665]	1168	if $self->{'verbosity'};
	1169	return ($link, "", 0);
	1170	}
	1171
[20576]	1172	if ($before_hash =~ s@^((?:http\|https\|ftp\|file\|mms)://)@@i) {
[14665]	1173	my $type = $1;
	1174
[16769]	1175	if ($link =~ m/^(http\|ftp):/i) {
[14665]	1176	# Turn url (using /) into file name (possibly using \ on windows)
	1177	my @http_dir_split = split('/', $before_hash);
	1178	$before_hash = &util::filename_cat(@http_dir_split);
	1179	}
	1180
	1181	$before_hash = $self->eval_dir_dots($before_hash);
[16024]	1182
[14665]	1183	my $linkfilename = &util::filename_cat ($base_dir, $before_hash);
	1184
	1185	my $rl = 0;
	1186	$rl = 1 if (-e $linkfilename);
	1187
	1188	# make sure there's a slash on the end if it's a directory
[16769]	1189	if ($before_hash !~ m/\/$/) {
[14665]	1190	$before_hash .= "/" if (-d $linkfilename);
	1191	}
	1192	return ($type . $before_hash, $hash_part, $rl);
[16024]	1193
[16769]	1194	} elsif ($link !~ m/^(mailto\|news\|gopher\|nntp\|telnet\|javascript):/i && $link !~ m/^\//) {
[14665]	1195
[16769]	1196	if ($before_hash =~ s@^/@@ \|\| $before_hash =~ m/\\/) {
	1197
[14665]	1198	# the first directory will be the domain name if file_is_url
	1199	# to generate archives, otherwise we'll assume all files are
	1200	# from the same site and base_dir is the root
	1201
	1202	if ($self->{'file_is_url'}) {
	1203	my @dirs = split /[\/\\]/, $file;
	1204	my $domname = shift (@dirs);
	1205	$before_hash = &util::filename_cat($domname, $before_hash);
	1206	$before_hash =~ s@\\@/@g; # for windows
	1207	}
	1208	else
	1209	{
	1210	# see if link shares directory with source document
	1211	# => turn into relative link if this is so!
	1212
[16769]	1213	if ($ENV{'GSDLOS'} =~ m/^windows/i) {
[14665]	1214	# too difficult doing a pattern match with embedded '\'s...
	1215	my $win_before_hash=$before_hash;
	1216	$win_before_hash =~ s@(\\)+@/@g;
	1217	# $base_dir is already similarly "converted" on windows.
	1218	if ($win_before_hash =~ s@^$base_dir/@@o) {
[16024]	1219	# if this is true, we removed a prefix
	1220	$before_hash=$win_before_hash;
[14665]	1221	}
	1222	}
	1223	else {
	1224	# before_hash has lost leading slash by this point,
	1225	# -> add back in prior to substitution with $base_dir
	1226	$before_hash = "/$before_hash";
	1227
	1228	$before_hash = &util::filename_cat("",$before_hash);
	1229	$before_hash =~ s@^$base_dir/@@;
	1230	}
	1231	}
	1232	} else {
	1233	# Turn relative file path into full path
	1234	my $dirname = &File::Basename::dirname($file);
	1235	$before_hash = &util::filename_cat($dirname, $before_hash);
[16769]	1236	$before_hash = $self->eval_dir_dots($before_hash);
[14665]	1237	}
	1238
	1239	my $linkfilename = &util::filename_cat ($base_dir, $before_hash);
	1240	# make sure there's a slash on the end if it's a directory
[16769]	1241	if ($before_hash !~ m/\/$/) {
[14665]	1242	$before_hash .= "/" if (-d $linkfilename);
	1243	}
	1244	return ("http://" . $before_hash, $hash_part, 1);
	1245	} else {
	1246	# mailto, news, nntp, telnet, javascript or gopher link
	1247	return ($before_hash, "", 0);
	1248	}
	1249	}
	1250
	1251	sub extract_first_NNNN_characters {
	1252	my $self = shift (@_);
	1253	my ($textref, $doc_obj, $thissection) = @_;
	1254
	1255	foreach my $size (split /,/, $self->{'first'}) {
	1256	my $tmptext = $$textref;
	1257	# skip to the body
	1258	$tmptext =~ s/.<body[^>]>//i;
	1259	# remove javascript
	1260	$tmptext =~ s@<script.*?</script>@ @sig;
	1261	$tmptext =~ s/<[^>]*>/ /g;
	1262	$tmptext =~ s/ / /g;
	1263	$tmptext =~ s/^\s+//;
	1264	$tmptext =~ s/\s+$//;
	1265	$tmptext =~ s/\s+/ /gs;
	1266	$tmptext = &unicode::substr ($tmptext, 0, $size);
	1267	$tmptext =~ s/\s\S*$/…/; # adds an ellipse (...)
	1268	$doc_obj->add_utf8_metadata ($thissection, "First$size", $tmptext);
	1269	}
	1270	}
	1271
	1272
	1273	sub extract_metadata {
	1274	my $self = shift (@_);
	1275	my ($textref, $metadata, $doc_obj, $section) = @_;
	1276	my $outhandle = $self->{'outhandle'};
	1277	# if we don't want metadata, we may as well not be here ...
	1278	return if (!defined $self->{'metadata_fields'});
	1279
	1280	# metadata fields to extract/save. 'key' is the (lowercase) name of the
	1281	# html meta, 'value' is the metadata name for greenstone to use
	1282	my %find_fields = ();
	1283
	1284	my %creator_fields = (); # short-cut for lookups
	1285
	1286
	1287	foreach my $field (split /,/, $self->{'metadata_fields'}) {
	1288	$field =~ s/^\s+//; # remove leading whitespace
	1289	$field =~ s/\s+$//; # remove trailing whitespace
	1290
	1291	# support tag<tagname>
[20689]	1292	if ($field =~ m/^(.?)\s<(.*?)>$/) {
[14665]	1293	# "$2" is the user's preferred gs metadata name
	1294	$find_fields{lc($1)}=$2; # lc = lowercase
	1295	} else { # no <tagname> for mapping
	1296	# "$field" is the user's preferred gs metadata name
	1297	$find_fields{lc($field)}=$field; # lc = lowercase
	1298	}
	1299	}
	1300
	1301	if (defined $self->{'hunt_creator_metadata'} &&
	1302	$self->{'hunt_creator_metadata'} == 1 ) {
	1303	my @extra_fields =
	1304	(
	1305	'author',
	1306	'author.email',
	1307	'creator',
	1308	'dc.creator',
	1309	'dc.creator.corporatename',
	1310	);
	1311
	1312	# add the creator_metadata fields to search for
	1313	foreach my $field (@extra_fields) {
	1314	$creator_fields{$field}=0; # add to lookup hash
	1315	}
	1316	}
	1317
	1318
	1319	# find the header in the html file, which has the meta tags
	1320	$$textref =~ m@<head>(.*?)</head>@si;
	1321
	1322	my $html_header=$1;
	1323
	1324	# go through every <meta... tag defined in the html and see if it is
	1325	# one of the tags we want to match.
	1326
	1327	# special case for title - we want to remember if its been found
	1328	my $found_title = 0;
	1329	# this assumes that ">" won't appear. (I don't think it's allowed to...)
[16769]	1330	$html_header =~ m/^/; # match the start of the string, for \G assertion
[16024]	1331
[14665]	1332	while ($html_header =~ m/\G.?<meta(.?)>/sig) {
	1333	my $metatag=$1;
	1334	my ($tag, $value);
	1335
	1336	# find the tag name
[16769]	1337	$metatag =~ m/(?:name\|http-equiv)\s=\s([\"\'])?(.*?)\1/is;
[14665]	1338	$tag=$2;
	1339	# in case they're not using " or ', but they should...
	1340	if (! $tag) {
[16769]	1341	$metatag =~ m/(?:name\|http-equiv)\s=\s([^\s\>]+)/is;
[14665]	1342	$tag=$1;
	1343	}
	1344
	1345	if (!defined $tag) {
[15872]	1346	print $outhandle "HTMLPlugin: can't find NAME in \"$metatag\"\n";
[14665]	1347	next;
	1348	}
	1349
	1350	# don't need to assign this field if it was passed in from a previous
	1351	# (recursive) plugin
	1352	if (defined $metadata->{$tag}) {next}
	1353
	1354	# find the tag content
[16769]	1355	$metatag =~ m/content\s=\s([\"\'])?(.*?)\1/is;
[14665]	1356	$value=$2;
	1357
	1358	if (! $value) {
[16769]	1359	$metatag =~ m/(?:name\|http-equiv)\s=\s([^\s\>]+)/is;
[14665]	1360	$value=$1;
	1361	}
	1362	if (!defined $value) {
[15872]	1363	print $outhandle "HTMLPlugin: can't find VALUE in \"$metatag\"\n";
[14665]	1364	next;
	1365	}
	1366
	1367	# clean up and add
	1368	$value =~ s/\s+/ /gs;
	1369	chomp($value); # remove trailing \n, if any
	1370	if (exists $creator_fields{lc($tag)}) {
	1371	# map this value onto greenstone's "Creator" metadata
	1372	$tag='Creator';
	1373	} elsif (!exists $find_fields{lc($tag)}) {
[16024]	1374	next; # don't want this tag
[14665]	1375	} else {
	1376	# get the user's preferred capitalisation
	1377	$tag = $find_fields{lc($tag)};
	1378	}
	1379	if (lc($tag) eq "title") {
	1380	$found_title = 1;
	1381	}
[18521]	1382
	1383	if ($self->{'verbosity'} > 2) {
	1384	print $outhandle " extracted \"$tag\" metadata \"$value\"\n";
[14665]	1385	}
[18521]	1386
	1387	# Do we still reply on the following? Surely there must
	1388	# be a better way to go about this outside of the plugin?
	1389	#
	1390	#if ($tag =~ m/date.*/i){
	1391	# $tag = lc($tag);
	1392	#}
	1393
[14665]	1394	$doc_obj->add_utf8_metadata($section, $tag, $value);
	1395
	1396	}
	1397
	1398	# TITLE: extract the document title
	1399	if (exists $find_fields{'title'} && !$found_title) {
	1400	# we want a title, and didn't find one in the meta tags
	1401	# see if there's a <title> tag
	1402	my $title;
	1403	my $from = ""; # for debugging output only
[16769]	1404	if ($html_header =~ m/<title[^>]>([^<]+)<\/title[^>]>/is) {
[14665]	1405	$title = $1;
	1406	$from = "<title> tags";
	1407	}
	1408
	1409	if (!defined $title) {
	1410	$from = "first 100 chars";
	1411	# if no title use first 100 or so characters
	1412	$title = $$textref;
	1413	$title =~ s/^\xFE\xFF//; # Remove unicode byte order mark
	1414	$title =~ s/^.*?<body>//si;
	1415	# ignore javascript!
	1416	$title =~ s@<script.*?</script>@ @sig;
	1417	$title =~ s/<\/([^>]+)><\1>//g; # (eg) </b><b> - no space
	1418	$title =~ s/<[^>]*>/ /g; # remove all HTML tags
	1419	$title = substr ($title, 0, 100);
	1420	$title =~ s/\s\S*$/.../;
	1421	}
	1422	$title =~ s/<[^>]*>/ /g; # remove html tags
	1423	$title =~ s/ / /g;
	1424	$title =~ s/(?: \|\xc2\xa0)/ /g; # utf-8 for nbsp...
	1425	$title =~ s/\s+/ /gs; # collapse multiple spaces
	1426	$title =~ s/^\s*//; # remove leading spaces
	1427	$title =~ s/\s*$//; # remove trailing spaces
	1428
	1429	$title =~ s/^$self->{'title_sub'}// if ($self->{'title_sub'});
	1430	$title =~ s/^\s+//s; # in case title_sub introduced any...
	1431	$doc_obj->add_utf8_metadata ($section, 'Title', $title);
	1432	print $outhandle " extracted Title metadata \"$title\" from $from\n"
	1433	if ($self->{'verbosity'} > 2);
	1434	}
	1435
	1436	# add FileFormat metadata
	1437	$doc_obj->add_metadata($section,"FileFormat", "HTML");
	1438
	1439	# Special, for metadata names such as tagH1 - extracts
	1440	# the text between the first <H1> and </H1> tags into "H1" metadata.
	1441
	1442	foreach my $field (keys %find_fields) {
[16769]	1443	if ($field !~ m/^tag([a-z0-9]+)$/i) {next}
[14665]	1444	my $tag = $1;
	1445	if ($$textref =~ m@<$tag[^>]>(.?)</$tag[^>]*>@g) {
	1446	my $content = $1;
	1447	$content =~ s/ / /g;
	1448	$content =~ s/<[^>]*>/ /g;
	1449	$content =~ s/^\s+//;
	1450	$content =~ s/\s+$//;
	1451	$content =~ s/\s+/ /gs;
	1452	if ($content) {
	1453	$tag=$find_fields{"tag$tag"}; # get the user's capitalisation
	1454	$tag =~ s/^tag//i;
	1455	$doc_obj->add_utf8_metadata ($section, $tag, $content);
	1456	print $outhandle " extracted \"$tag\" metadata \"$content\"\n"
	1457	if ($self->{'verbosity'} > 2);
	1458	}
	1459	}
	1460	}
	1461	}
	1462
	1463
	1464	# evaluate any "../" to next directory up
	1465	# evaluate any "./" as here
	1466	sub eval_dir_dots {
	1467	my $self = shift (@_);
	1468	my ($filename) = @_;
	1469	my $dirsep_os = &util::get_os_dirsep();
	1470	my @dirsep = split(/$dirsep_os/,$filename);
	1471
	1472	my @eval_dirs = ();
	1473	foreach my $d (@dirsep) {
	1474	if ($d eq "..") {
	1475	pop(@eval_dirs);
	1476
	1477	} elsif ($d eq ".") {
	1478	# do nothing!
	1479
	1480	} else {
	1481	push(@eval_dirs,$d);
	1482	}
	1483	}
	1484
	1485	# Need to fiddle with number of elements in @eval_dirs if the
	1486	# first one is the empty string. This is because of a
	1487	# modification to util::filename_cat that supresses the addition
	1488	# of a leading '/' character (or \ if windows) (intended to help
	1489	# filename cat with relative paths) if the first entry in the
	1490	# array is the empty string. Making the array start with two
	1491	# empty strings is a way to defeat this "smart" option.
	1492	#
	1493	if (scalar(@eval_dirs) > 0) {
	1494	if ($eval_dirs[0] eq ""){
	1495	unshift(@eval_dirs,"");
	1496	}
	1497	}
[16836]	1498
	1499	my $evaluated_filename = (scalar @eval_dirs > 0) ? &util::filename_cat(@eval_dirs) : "";
	1500	return $evaluated_filename;
[14665]	1501	}
	1502
	1503	sub replace_usemap_links {
	1504	my $self = shift (@_);
	1505	my ($front, $link, $back) = @_;
	1506
[16769]	1507	# remove quotes from link at start and end if necessary
	1508	if ($link=~/^[\"\']/) {
	1509	$link=~s/^[\"\']//;
	1510	$link=~s/[\"\']$//;
	1511	$front.='"';
	1512	$back="\"$back";
	1513	}
	1514
[14665]	1515	$link =~ s/^\.\///;
	1516	return $front . $link . $back;
	1517	}
	1518
	1519	sub inc_filecount {
	1520	my $self = shift (@_);
	1521
	1522	if ($self->{'file_num'} == 1000) {
	1523	$self->{'dir_num'} ++;
	1524	$self->{'file_num'} = 0;
	1525	} else {
	1526	$self->{'file_num'} ++;
	1527	}
	1528	}
	1529
	1530
# Extend read_file so that strings like &eacute; are
# converted to UTF8 internally.
#
# We don't convert &lt; or &gt; or &amp; or &quot; in case
# they interfere with the GML files
# (&nbsp; is protected in the same way.)
#
#   $filename, $encoding, $language - passed unchanged to the parent class
#   $textref                        - reference to the text; entities are
#                                     converted in place after the parent
#                                     read_file has run
sub read_file {
    my $self = shift(@_);
    my ($filename, $encoding, $language, $textref) = @_;

    $self->SUPER::read_file($filename, $encoding, $language, $textref);

    # Convert entities to their UTF8 equivalents.
    # Step 1: temporarily rename the entities that must survive (&lt; -> &zlt;)
    $$textref =~ s/&(lt|gt|amp|quot|nbsp);/&z$1;/go;
    # Step 2: hand every remaining "&...;" to ghtml::getcharequiv
    $$textref =~ s/&([^;]+);/&ghtml::getcharequiv($1,1)/gseo;
    # Step 3: restore the protected entities
    $$textref =~ s/&z(lt|gt|amp|quot|nbsp);/&$1;/go;
}
	1548
	1549	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: