source: main/trunk/greenstone2/perllib/plugins/HTMLPlugin.pm@27306

Last change on this file since 27306 was 27306, checked in by jmt12, 11 years ago

Moving the critical file-related functions (copy, rm, etc.) out of util.pm into their own proper class, FileUtils. Use of the old functions in util.pm will now prompt deprecation warning messages. There may be further functions that could be moved across in the future, but these are the critical ones when considering support for other filesystems (HTTP, HDFS, WebDAV, etc.). Some key files have been updated to use the new functions, so no deprecation messages are thrown when importing/building the demo collection 'out of the box'.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 65.6 KB
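The commit message above describes migrating plugin code from the deprecated util.pm helpers to the new FileUtils class. A minimal before/after sketch (illustrative only, not part of the file listed below): the FileUtils calls are the two this revision of HTMLPlugin.pm actually uses (filenameConcatenate and fileExists); util::fd_exists is the old call that still appears, commented out, further down the file, and util::filename_cat is assumed here to be the old concatenation helper.

use strict;
use util;       # old helpers; calling them now emits deprecation warnings
use FileUtils;  # new class holding the critical file operations

my ($base_dir, $file) = ("/tmp/collect", "doc.html");   # hypothetical example paths

# Old (deprecated) style:
my $old_path = &util::filename_cat($base_dir, $file);
print "found (old API)\n" if &util::fd_exists($old_path);

# New style, as used throughout this revision of HTMLPlugin.pm:
my $new_path = &FileUtils::filenameConcatenate($base_dir, $file);
print "found (new API)\n" if &FileUtils::fileExists($new_path);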
Rev  Line
[14665]1###########################################################################
2#
[15872]3# HTMLPlugin.pm -- basic html plugin
[14665]4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 1999 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27#
28# Note that this plugin handles frames only in a very simple way
29# i.e. each frame is treated as a separate document. This means
30# search results will contain links to individual frames rather
31# than linking to the top level frameset.
32# There may also be some problems caused by the _parent target
33# (it's removed by this plugin)
34#
35
[15872]36package HTMLPlugin;
[14665]37
[22842]38use Encode;
[23387]39use Unicode::Normalize 'normalize';
[22842]40
[15872]41use ReadTextFile;
42use HBPlugin;
[14665]43use ghtml;
44use unicode;
45use util;
[27306]46use FileUtils;
[14665]47use XMLParser;
48
49use Image::Size;
[14913]50use File::Copy;
[14665]51
52sub BEGIN {
[15872]53 @HTMLPlugin::ISA = ('ReadTextFile', 'HBPlugin');
[14665]54}
55
56use strict; # every perl program should have this!
57no strict 'refs'; # make an exception so we can use variables as filehandles
58
59my $arguments =
60 [ { 'name' => "process_exp",
[15872]61 'desc' => "{BasePlugin.process_exp}",
[14665]62 'type' => "regexp",
63 'deft' => &get_default_process_exp() },
64 { 'name' => "block_exp",
[15872]65 'desc' => "{BasePlugin.block_exp}",
[14665]66 'type' => 'regexp',
67 'deft' => &get_default_block_exp() },
68 { 'name' => "nolinks",
[15872]69 'desc' => "{HTMLPlugin.nolinks}",
[14665]70 'type' => "flag" },
71 { 'name' => "keep_head",
[15872]72 'desc' => "{HTMLPlugin.keep_head}",
[14665]73 'type' => "flag" },
74 { 'name' => "no_metadata",
[15872]75 'desc' => "{HTMLPlugin.no_metadata}",
[14665]76 'type' => "flag" },
77 { 'name' => "metadata_fields",
[15872]78 'desc' => "{HTMLPlugin.metadata_fields}",
[14665]79 'type' => "string",
80 'deft' => "Title" },
[21800]81 { 'name' => "metadata_field_separator",
82 'desc' => "{HTMLPlugin.metadata_field_separator}",
83 'type' => "string",
84 'deft' => "" },
[14665]85 { 'name' => "hunt_creator_metadata",
[15872]86 'desc' => "{HTMLPlugin.hunt_creator_metadata}",
[14665]87 'type' => "flag" },
88 { 'name' => "file_is_url",
[15872]89 'desc' => "{HTMLPlugin.file_is_url}",
[14665]90 'type' => "flag" },
91 { 'name' => "assoc_files",
[15872]92 'desc' => "{HTMLPlugin.assoc_files}",
[14665]93 'type' => "regexp",
94 'deft' => &get_default_block_exp() },
95 { 'name' => "rename_assoc_files",
[15872]96 'desc' => "{HTMLPlugin.rename_assoc_files}",
[14665]97 'type' => "flag" },
98 { 'name' => "title_sub",
[15872]99 'desc' => "{HTMLPlugin.title_sub}",
[14665]100 'type' => "string",
101 'deft' => "" },
102 { 'name' => "description_tags",
[15872]103 'desc' => "{HTMLPlugin.description_tags}",
[14665]104 'type' => "flag" },
105 # retain this for backward compatibility (w3mir option was replaced by
106 # file_is_url)
107 { 'name' => "w3mir",
[15872]108# 'desc' => "{HTMLPlugin.w3mir}",
[14665]109 'type' => "flag",
110 'hiddengli' => "yes"},
111 { 'name' => "no_strip_metadata_html",
[15872]112 'desc' => "{HTMLPlugin.no_strip_metadata_html}",
[14665]113 'type' => "string",
114 'deft' => "",
115 'reqd' => "no"},
116 { 'name' => "sectionalise_using_h_tags",
[15872]117 'desc' => "{HTMLPlugin.sectionalise_using_h_tags}",
[14665]118 'type' => "flag" },
[14913]119 { 'name' => "use_realistic_book",
[15872]120 'desc' => "{HTMLPlugin.tidy_html}",
[14665]121 'type' => "flag"},
[15872]122 { 'name' => "old_style_HDL",
123 'desc' => "{HTMLPlugin.old_style_HDL}",
[20791]124 'type' => "flag"},
125 {'name' => "processing_tmp_files",
126 'desc' => "{BasePlugin.processing_tmp_files}",
127 'type' => "flag",
128 'hiddengli' => "yes"}
[14665]129 ];
130
[15872]131my $options = { 'name' => "HTMLPlugin",
132 'desc' => "{HTMLPlugin.desc}",
[14665]133 'abstract' => "no",
134 'inherits' => "yes",
135 'args' => $arguments };
136
137
138sub new {
139 my ($class) = shift (@_);
140 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
141 push(@$pluginlist, $class);
142
[15872]143 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
144 push(@{$hashArgOptLists->{"OptList"}},$options);
[16024]145
[14665]146
[15872]147 my $self = new ReadTextFile($pluginlist,$inputargs,$hashArgOptLists);
[14665]148
149 if ($self->{'w3mir'}) {
150 $self->{'file_is_url'} = 1;
151 }
152 $self->{'aux_files'} = {};
153 $self->{'dir_num'} = 0;
154 $self->{'file_num'} = 0;
155
156 return bless $self, $class;
157}
158
159# may want to use (?i)\.(gif|jpe?g|jpe|png|css|js(?:@.*)?)$
160# if have eg <script language="javascript" src="img/lib.js@123">
[20791]161# blocking is now done by reading through the file and recording all the
162# images and other files
[14665]163sub get_default_block_exp {
164 my $self = shift (@_);
165
[16392]166 #return q^(?i)\.(gif|jpe?g|jpe|jpg|png|css)$^;
167 return "";
[14665]168}
169
170sub get_default_process_exp {
171 my $self = shift (@_);
172
173 # the last option is an attempt to encode the concept of an html query ...
174 return q^(?i)(\.html?|\.shtml|\.shm|\.asp|\.php\d?|\.cgi|.+\?.+=.*)$^;
175}
176
177sub store_block_files
178{
179 my $self =shift (@_);
[16392]180 my ($filename_full_path, $block_hash) = @_;
181
182 my $html_fname = $filename_full_path;
[14665]183
[23335]184 my ($language, $content_encoding) = $self->textcat_get_language_encoding ($filename_full_path);
185 $self->{'store_content_encoding'}->{$filename_full_path} = $content_encoding;
[14665]186
187 # read in file ($text will be in utf8)
[16769]188 my $raw_text = "";
[23363]189 $self->read_file_no_decoding($filename_full_path, \$raw_text);
[16769]190
191 my $textref = \$raw_text;
[14665]192 my $opencom = '(?:<!--|&lt;!(?:&mdash;|&#151;|--))';
193 my $closecom = '(?:-->|(?:&mdash;|&#151;|--)&gt;)';
194 $$textref =~ s/$opencom(.*?)$closecom//gs;
195
[23363]196 # Convert entities to their UTF8 equivalents
197 $$textref =~ s/&(lt|gt|amp|quot|nbsp);/&z$1;/go;
198 $$textref =~ s/&([^;]+);/&ghtml::getcharequiv($1,1,0)/gseo; # on this occasion, want it left as utf8
199 $$textref =~ s/&z(lt|gt|amp|quot|nbsp);/&$1;/go;
200
[14665]201 my $attval = "\\\"[^\\\"]+\\\"|[^\\s>]+";
202 my @img_matches = ($$textref =~ m/<img[^>]*?src\s*=\s*($attval)[^>]*>/igs);
203 my @usemap_matches = ($$textref =~ m/<img[^>]*?usemap\s*=\s*($attval)[^>]*>/igs);
204 my @link_matches = ($$textref =~ m/<link[^>]*?href\s*=\s*($attval)[^>]*>/igs);
205 my @embed_matches = ($$textref =~ m/<embed[^>]*?src\s*=\s*($attval)[^>]*>/igs);
[17127]206 my @tabbg_matches = ($$textref =~ m/<(?:body|table|tr|td)[^>]*?background\s*=\s*($attval)[^>]*>/igs);
[16638]207 my @script_matches = ($$textref =~ m/<script[^>]*?src\s*=\s*($attval)[^>]*>/igs);
[14665]208
[23387]209 if(!defined $self->{'unicode_to_original_filename'}) {
[16769]210 # maps from utf8 converted link name -> original filename referred to by (possibly URL-encoded) src url
[23387]211 $self->{'unicode_to_original_filename'} = {};
[16769]212 }
213
[23387]214 foreach my $raw_link (@img_matches, @usemap_matches, @link_matches, @embed_matches, @tabbg_matches, @script_matches) {
[14665]215
216 # remove quotes from link at start and end if necessary
[23387]217 if ($raw_link =~ m/^\"/) {
218 $raw_link =~ s/^\"//;
219 $raw_link =~ s/\"$//;
[14665]220 }
221
[23371]222 # remove any anchor names, e.g. foo.html#name becomes foo.html
223 # but watch out for any #'s that are part of entities, such as &#x3B1;
[23387]224 $raw_link =~ s/([^&])\#.*$/$1/s;
[23371]225
[16638]226 # some links may just be anchor names
[23387]227 next unless ($raw_link =~ /\S+/);
[14665]228
[23415]229 if ($raw_link !~ m@^/@ && $raw_link !~ m/^([A-Z]:?)\\/i) {
[14665]230 # Turn relative file path into full path
[16392]231 my $dirname = &File::Basename::dirname($filename_full_path);
[27306]232 $raw_link = &FileUtils::filenameConcatenate($dirname, $raw_link);
[14665]233 }
[23387]234 $raw_link = $self->eval_dir_dots($raw_link);
[16638]235
[16769]236 # this is the actual filename on the filesystem (that the link refers to)
[23387]237 my $url_original_filename = $self->opt_url_decode($raw_link);
[16769]238
[23387]239 my ($uses_bytecodes,$exceeds_bytecodes) = &unicode::analyze_raw_string($url_original_filename);
[16769]240
[23387]241 if ($exceeds_bytecodes) {
242 # We have a link to a file name that is more complicated than a raw byte filename
243 # What we do next depends on the operating system we are on
[16769]244
[23387]245 if ($ENV{'GSDLOS'} =~ /^(linux|solaris)$/i) {
246 # Assume we're dealing with a UTF-8 encoded filename
247 $url_original_filename = encode("utf8", $url_original_filename);
248 }
249 elsif ($ENV{'GSDLOS'} =~ /^darwin$/i) {
250 # HFS+ is UTF8 with decomposition
251 $url_original_filename = encode("utf8", $url_original_filename);
252 $url_original_filename = normalize('D', $url_original_filename); # Normalization Form D (decomposition)
253 }
254 elsif ($ENV{'GSDLOS'} =~ /^windows$/i) {
255 # Don't need to do anything as later code maps Windows
256 # unicode filenames to DOS short filenames when needed
257 }
258 else {
259 my $outhandle = $self->{'outhandle'};
260 print $outhandle "Warning: Unrecognized operating system ", $ENV{'GSDLOS'}, "\n";
261 print $outhandle " in raw file system encoding of: $raw_link\n";
262 print $outhandle " Assuming filesystem is UTF-8 based.\n";
263 $url_original_filename = encode("utf8", $url_original_filename);
264 }
265 }
266
267 # Convert the (currently raw) link into its Unicode version.
268 # Store the Unicode link along with the url_original_filename
269 my $unicode_url_original_filename = "";
270 $self->decode_text($raw_link,$content_encoding,$language,\$unicode_url_original_filename);
271
272
273 $self->{'unicode_to_original_filename'}->{$unicode_url_original_filename} = $url_original_filename;
274
275
276 if ($url_original_filename ne $unicode_url_original_filename) {
[17088]277 my $outhandle = $self->{'outhandle'};
[23387]278
[17088]279 print $outhandle "URL Encoding $url_original_filename\n";
[23387]280 print $outhandle " ->$unicode_url_original_filename\n";
[17088]281
[23387]282 # Allow for possibility of raw byte version and Unicode versions of file
[23561]283 &util::block_filename($block_hash,$unicode_url_original_filename);
[23387]284 }
[23363]285
[23418]286 # $url_original_filename = &util::upgrade_if_dos_filename($url_original_filename);
[23561]287 &util::block_filename($block_hash,$url_original_filename);
[23418]288
[14665]289 }
290}
291
[16769]292# Given a filename in any encoding, will URL decode it to get back the original filename
293# in the original encoding. Because this method is intended to work out the *original*
[18320]294 # filename, it does not URL decode any filename if a file by the name of the *URL-encoded*
[16769]295# string already exists in the local folder.
[23363]296#
[16769]297sub opt_url_decode {
298 my $self = shift (@_);
[23387]299 my ($raw_link) = @_;
[16024]300
[23387]301
[16769]302 # Replace %XX's in URL with decoded value if required.
303 # Note that the filename may include the %XX in some situations
[23387]304
305## if ($raw_link =~ m/\%[A-F0-9]{2}/i) {
306
307 if (($raw_link =~ m/\%[A-F0-9]{2}/i) || ($raw_link =~ m/\&\#x[0-9A-F]+;/i) || ($raw_link =~ m/\&\#[0-9]+;/i)) {
308 if (!-e $raw_link) {
309 $raw_link = &unicode::url_decode($raw_link,1);
[16769]310 }
311 }
[23387]312
313 return $raw_link;
[16769]314}
315
[20774]316sub read_into_doc_obj
317{
318 my $self = shift (@_);
319 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
320
[22330]321 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
[23335]322
323 # Lookup content_encoding worked out in file_block pass for this file
324 # Store it under the local name 'content_encoding' so it's nice and
325 # easy to access
326 $self->{'content_encoding'} = $self->{'store_content_encoding'}->{$filename_full_path};
327
[20774]328 # get the input file
329 my $input_filename = $file;
330 my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
331 $suffix = lc($suffix);
[22330]332 my $tidy_filename;
[20774]333 if (($self->{'use_realistic_book'}) || ($self->{'old_style_HDL'}))
334 {
335 # because the document has to be sectionalized, set the description tags
336 $self->{'description_tags'} = 1;
337
338 # set the file to be tidied
[27306]339 $input_filename = &FileUtils::filenameConcatenate($base_dir,$file) if $base_dir =~ m/\w/;
[20774]340
341 # get the tidied file
342 #my $tidy_filename = $self->tmp_tidy_file($input_filename);
[22330]343 $tidy_filename = $self->convert_tidy_or_oldHDL_file($input_filename);
[20774]344
345 # derive tmp filename from input filename
346 my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($tidy_filename, "\\.[^\\.]+\$");
347
348 # set the new input file and base_dir to be from the tidied file
349 $file = "$tailname$suffix";
350 $base_dir = $dirname;
351 }
352
353 # call the parent read_into_doc_obj
354 my ($process_status,$doc_obj) = $self->SUPER::read_into_doc_obj($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli);
[22330]355 if (($self->{'use_realistic_book'}) || ($self->{'old_style_HDL'}))
356 {
357 # now we need to reset the filenames in the doc obj so that the converted filenames are not used
358 my $collect_file = &util::filename_within_collection($filename_full_path);
359 $doc_obj->set_source_filename ($collect_file, $self->{'file_rename_method'});
360 ## set_source_filename does not set the doc_obj source_path which is used in archives dbs for incremental
[23363]361 # build. So set it manually.
362 $doc_obj->set_source_path($filename_full_path);
[22330]363 my $collect_conv_file = &util::filename_within_collection($tidy_filename);
364 $doc_obj->set_converted_filename($collect_conv_file);
[23349]365
366 my $plugin_filename_encoding = $self->{'filename_encoding'};
[23352]367 my $filename_encoding = $self->deduce_filename_encoding($file,$metadata,$plugin_filename_encoding);
368 $self->set_Source_metadata($doc_obj, $filename_full_path, $filename_encoding);
[22330]369 }
[23335]370
371 delete $self->{'store_content_encoding'}->{$filename_full_path};
372 $self->{'content_encoding'} = undef;
373
[20774]374 return ($process_status,$doc_obj);
375}
[16769]376
[14665]377# do plugin specific processing of doc_obj
378sub process {
379 my $self = shift (@_);
380 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
381 my $outhandle = $self->{'outhandle'};
382
[16769]383 if ($ENV{'GSDLOS'} =~ m/^windows/i) {
[16024]384 # this makes life so much easier... perl can cope with unix-style '/'s.
[23371]385 $base_dir =~ s@(\\)+@/@g;
386 $file =~ s@(\\)+@/@g;
[14665]387 }
[23371]388
[27306]389 my $filename = &FileUtils::filenameConcatenate($base_dir,$file);
[23371]390 my $upgraded_base_dir = &util::upgrade_if_dos_filename($base_dir);
391 my $upgraded_filename = &util::upgrade_if_dos_filename($filename);
392
393 if ($ENV{'GSDLOS'} =~ m/^windows/i) {
394 # And again
395 $upgraded_base_dir =~ s@(\\)+@/@g;
396 $upgraded_filename =~ s@(\\)+@/@g;
397
398 # Need to make sure there is a '/' on the end of upgraded_base_dir
[23387]399 if (($upgraded_base_dir ne "") && ($upgraded_base_dir !~ m/\/$/)) {
[23371]400 $upgraded_base_dir .= "/";
401 }
402 }
403 my $upgraded_file = &util::filename_within_directory($upgraded_filename,$upgraded_base_dir);
[14665]404
405 # reset per-doc stuff...
406 $self->{'aux_files'} = {};
407 $self->{'dir_num'} = 0;
408 $self->{'file_num'} = 0;
409
410 # process an HTML file where sections are divided by heading tags (H1, H2 ...)
411 # you can also include metadata in the format (X can be any number)
412 # <hX>Title<!--gsdl-metadata
413 # <Metadata name="name1">value1</Metadata>
414 # ...
415 # <Metadata name="nameN">valueN</Metadata>
416 #--></hX>
417 if ($self->{'sectionalise_using_h_tags'}) {
418 # description_tags should always be activated because we convert headings to description tags
419 $self->{'description_tags'} = 1;
420
421 my $arrSections = [];
[23371]422 $$textref =~ s/<h([0-9]+)[^>]*>(.*?)<\/h[0-9]+>/$self->process_heading($1, $2, $arrSections, $upgraded_file)/isge;
[14665]423
424 if (scalar(@$arrSections)) {
425 my $strMetadata = $self->update_section_data($arrSections, -1);
426 if (length($strMetadata)) {
427 $strMetadata = '<!--' . $strMetadata . "\n-->\n</body>";
428 $$textref =~ s/<\/body>/$strMetadata/ig;
429 }
430 }
431 }
432
433 my $cursection = $doc_obj->get_top_section();
434
435 $self->extract_metadata ($textref, $metadata, $doc_obj, $cursection)
436 unless $self->{'no_metadata'} || $self->{'description_tags'};
437
438 # Store URL for page as metadata - this can be used for an
439 # altavista style search interface. The URL won't be valid
440 # unless the file structure contains the domain name (i.e.
441 # like when w3mir is used to download a website).
442
443 # URL metadata (even invalid ones) are used to support internal
444 # links, so even if 'file_is_url' is off, still need to store info
445
[23371]446 my ($tailname,$dirname) = &File::Basename::fileparse($upgraded_file);
[23347]447
[23335]448# my $utf8_file = $self->filename_to_utf8_metadata($file);
449# $utf8_file =~ s/&\#095;/_/g;
[23835]450# variable below used to be utf8_file
451
[23387]452 my $url_encoded_file = &unicode::raw_filename_to_url_encoded($tailname);
453 my $utf8_url_encoded_file = &unicode::raw_filename_to_utf8_url_encoded($tailname);
[23335]454
[16735]455 my $web_url = "http://";
[23387]456 my $utf8_web_url = "http://";
[16735]457 if(defined $dirname) { # local directory
[22689]458 # Check for "ftp" in the domain name of the directory
459 # structure to determine if this URL should be a ftp:// URL
460 # This check is not infallible, but better than omitting the
461 # check, which would cause all files downloaded from ftp sites
462 # via mirroring with wget to have potentially erroneous http:// URLs
463 # assigned in their metadata
464 if ($dirname =~ /^[^\/]*ftp/i)
465 {
466 $web_url = "ftp://";
[23387]467 $utf8_web_url = "ftp://";
[22689]468 }
[16836]469 $dirname = $self->eval_dir_dots($dirname);
[18626]470 $dirname .= &util::get_dirsep() if $dirname ne ""; # if there's a directory, it should end on "/"
[23387]471
472 $web_url = $web_url.$dirname.$url_encoded_file;
473 $utf8_web_url = $utf8_web_url.$dirname.$utf8_url_encoded_file;
[16735]474 } else {
[23387]475 $web_url = $web_url.$url_encoded_file;
476 $utf8_web_url = $utf8_web_url.$utf8_url_encoded_file;
[16735]477 }
[19983]478 $web_url =~ s/\\/\//g;
[23387]479 $utf8_web_url =~ s/\\/\//g;
[23371]480
481 if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
[23387]482 print STDERR "*******DEBUG: upgraded_file: $upgraded_file\n";
483 print STDERR "*******DEBUG: adding URL metadata: $utf8_url_encoded_file\n";
[23371]484 }
485
486
[15872]487 $doc_obj->add_utf8_metadata($cursection, "URL", $web_url);
[23387]488 $doc_obj->add_utf8_metadata($cursection, "UTF8URL", $utf8_web_url);
[15872]489
[14665]490 if ($self->{'file_is_url'}) {
491 $doc_obj->add_metadata($cursection, "weblink", "<a href=\"$web_url\">");
492 $doc_obj->add_metadata($cursection, "webicon", "_iconworld_");
493 $doc_obj->add_metadata($cursection, "/weblink", "</a>");
494 }
495
496 if ($self->{'description_tags'}) {
497 # remove the html header - note that doing this here means any
498 # sections defined within the header will be lost (so all <Section>
499 # tags must appear within the body of the HTML)
500 my ($head_keep) = ($$textref =~ m/^(.*?)<body[^>]*>/is);
501
502 $$textref =~ s/^.*?<body[^>]*>//is;
503 $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
504
505 my $opencom = '(?:<!--|&lt;!(?:&mdash;|&#151;|--))';
506 my $closecom = '(?:-->|(?:&mdash;|&#151;|--)&gt;)';
507
508 my $lt = '(?:<|&lt;)';
509 my $gt = '(?:>|&gt;)';
510 my $quot = '(?:"|&quot;|&rdquo;|&ldquo;)';
511
512 my $dont_strip = '';
513 if ($self->{'no_strip_metadata_html'}) {
514 ($dont_strip = $self->{'no_strip_metadata_html'}) =~ s{,}{|}g;
515 }
516
517 my $found_something = 0; my $top = 1;
518 while ($$textref =~ s/^(.*?)$opencom(.*?)$closecom//s) {
519 my $text = $1;
520 my $comment = $2;
521 if (defined $text) {
522 # text before a comment - note that getting to here
523 # doesn't necessarily mean there are Section tags in
524 # the document
[23371]525 $self->process_section(\$text, $upgraded_base_dir, $upgraded_file, $doc_obj, $cursection);
[14665]526 }
527 while ($comment =~ s/$lt(.*?)$gt//s) {
528 my $tag = $1;
529 if ($tag eq "Section") {
530 $found_something = 1;
531 $cursection = $doc_obj->insert_section($doc_obj->get_end_child($cursection)) unless $top;
532 $top = 0;
533 } elsif ($tag eq "/Section") {
534 $found_something = 1;
535 $cursection = $doc_obj->get_parent_section ($cursection);
[16769]536 } elsif ($tag =~ m/^Metadata name=$quot(.*?)$quot/s) {
[14665]537 my $metaname = $1;
[16769]538 my $accumulate = $tag =~ m/mode=${quot}accumulate${quot}/ ? 1 : 0;
[14665]539 $comment =~ s/^(.*?)$lt\/Metadata$gt//s;
540 my $metavalue = $1;
541 $metavalue =~ s/^\s+//;
542 $metavalue =~ s/\s+$//;
543 # assume that no metadata value intentionally includes
544 # carriage returns or HTML tags (if they're there they
545 # were probably introduced when converting to HTML from
546 # some other format).
547 # actually some people want to have html tags in their
548 # metadata.
549 $metavalue =~ s/[\cJ\cM]/ /sg;
550 $metavalue =~ s/<[^>]+>//sg
[16769]551 unless $dont_strip && ($dont_strip eq 'all' || $metaname =~ m/^($dont_strip)$/);
[14665]552 $metavalue =~ s/\s+/ /sg;
[22348]553 if ($metaname =~ /\./) { # has a namespace
554 $metaname = "ex.$metaname";
555 }
[14665]556 if ($accumulate) {
557 $doc_obj->add_utf8_metadata($cursection, $metaname, $metavalue);
558 } else {
559 $doc_obj->set_utf8_metadata_element($cursection, $metaname, $metavalue);
560 }
561 } elsif ($tag eq "Description" || $tag eq "/Description") {
562 # do nothing with containing Description tags
563 } else {
564 # simple HTML tag (probably created by the conversion
565 # to HTML from some other format) - we'll ignore it and
566 # hope for the best ;-)
567 }
568 }
569 }
570 if ($cursection ne "") {
[23371]571 print $outhandle "HTMLPlugin: WARNING: $upgraded_file contains unmatched <Section></Section> tags\n";
[14665]572 }
573
574 $$textref =~ s/^.*?<body[^>]*>//is;
575 $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
[16769]576 if ($$textref =~ m/\S/) {
[14665]577 if (!$found_something) {
578 if ($self->{'verbosity'} > 2) {
[23371]579 print $outhandle "HTMLPlugin: WARNING: $upgraded_file appears to contain no Section tags so\n";
[14665]580 print $outhandle " will be processed as a single section document\n";
581 }
582
583 # go ahead and process single-section document
[23371]584 $self->process_section($textref, $upgraded_base_dir, $upgraded_file, $doc_obj, $cursection);
[14665]585
586 # if document contains no Section tags we'll go ahead
587 # and extract metadata (this won't have been done
588 # above as the -description_tags option prevents it)
589 my $complete_text = $head_keep.$doc_obj->get_text($cursection);
590 $self->extract_metadata (\$complete_text, $metadata, $doc_obj, $cursection)
591 unless $self->{'no_metadata'};
592
593 } else {
[23371]594 print $outhandle "HTMLPlugin: WARNING: $upgraded_file contains the following text outside\n";
[14665]595 print $outhandle " of the final closing </Section> tag. This text will\n";
596 print $outhandle " be ignored.";
597
598 my ($text);
599 if (length($$textref) > 30) {
600 $text = substr($$textref, 0, 30) . "...";
601 } else {
602 $text = $$textref;
603 }
604 $text =~ s/\n/ /isg;
605 print $outhandle " ($text)\n";
606 }
607 } elsif (!$found_something) {
608
609 if ($self->{'verbosity'} > 2) {
610 # may get to here if document contained no valid Section
611 # tags but did contain some comments. The text will have
612 # been processed already but we should print the warning
613 # as above and extract metadata
[23371]614 print $outhandle "HTMLPlugin: WARNING: $upgraded_file appears to contain no Section tags and\n";
[14665]615 print $outhandle " is blank or empty. Metadata will be assigned if present.\n";
616 }
617
618 my $complete_text = $head_keep.$doc_obj->get_text($cursection);
619 $self->extract_metadata (\$complete_text, $metadata, $doc_obj, $cursection)
620 unless $self->{'no_metadata'};
621 }
622
623 } else {
624
625 # remove header and footer
626 if (!$self->{'keep_head'} || $self->{'description_tags'}) {
627 $$textref =~ s/^.*?<body[^>]*>//is;
628 $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
629 }
630
[25555]631 $self->{'css_assoc_files'} = {};
632
[14665]633 # single section document
[23371]634 $self->process_section($textref, $upgraded_base_dir, $upgraded_file, $doc_obj, $cursection);
[25555]635
636 #my $upgraded_filename_dirname = &File::Basename::dirname($upgraded_filename);
637
638 $self->acquire_css_associated_files($doc_obj, $cursection);
639
640 $self->{'css_assoc_files'} = {};
[14665]641 }
[23335]642
[14665]643 return 1;
644}
645
646
647sub process_heading
648{
649 my ($self, $nHeadNo, $strHeadingText, $arrSections, $file) = @_;
650 $strHeadingText = '' if (!defined($strHeadingText));
651
652 my $strMetadata = $self->update_section_data($arrSections, int($nHeadNo));
653
654 my $strSecMetadata = '';
655 while ($strHeadingText =~ s/<!--gsdl-metadata(.*?)-->//is)
656 {
657 $strSecMetadata .= $1;
658 }
659
660 $strHeadingText =~ s/^\s+//g;
661 $strHeadingText =~ s/\s+$//g;
662 $strSecMetadata =~ s/^\s+//g;
663 $strSecMetadata =~ s/\s+$//g;
664
665 $strMetadata .= "\n<Section>\n\t<Description>\n\t\t<Metadata name=\"Title\">" . $strHeadingText . "</Metadata>\n";
666
667 if (length($strSecMetadata)) {
668 $strMetadata .= "\t\t" . $strSecMetadata . "\n";
669 }
670
671 $strMetadata .= "\t</Description>\n";
672
673 return "<!--" . $strMetadata . "-->";
674}
675
676
677sub update_section_data
678{
679 my ($self, $arrSections, $nCurTocNo) = @_;
680 my ($strBuffer, $nLast, $nSections) = ('', 0, scalar(@$arrSections));
681
682 if ($nSections == 0) {
683 push @$arrSections, $nCurTocNo;
684 return $strBuffer;
685 }
686 $nLast = $arrSections->[$nSections - 1];
687 if ($nCurTocNo > $nLast) {
688 push @$arrSections, $nCurTocNo;
689 return $strBuffer;
690 }
691 for(my $i = $nSections - 1; $i >= 0; $i--) {
692 if ($nCurTocNo <= $arrSections->[$i]) {
693 $strBuffer .= "\n</Section>";
694 pop @$arrSections;
695 }
696 }
697 push @$arrSections, $nCurTocNo;
698 return $strBuffer;
699}
700
701
702# note that process_section may be called multiple times for a single
703# section (relying on the fact that add_utf8_text appends the text to any
704# that may exist already).
705sub process_section {
706 my $self = shift (@_);
707 my ($textref, $base_dir, $file, $doc_obj, $cursection) = @_;
[25555]708
709 my @styleTagsText = ($$textref =~ m/<style[^>]*>([^<]*)<\/style>/sg);
710 if(scalar(@styleTagsText) > 0)
711 {
[27306]712 my $css_filename_dirname = &File::Basename::dirname(&FileUtils::filenameConcatenate($base_dir, $file));
[25555]713 foreach my $styleText (@styleTagsText)
714 {
715 $self->acquire_css_associated_files_from_text_block($styleText, $css_filename_dirname);
716 }
717 }
718
[14665]719 # trap links
720 if (!$self->{'nolinks'}) {
721 # usemap="./#index" not handled correctly => change to "#index"
[16769]722## $$textref =~ s/(<img[^>]*?usemap\s*=\s*[\"\']?)([^\"\'>\s]+)([\"\']?[^>]*>)/
723
[23392]724## my $opencom = '(?:<!--|&lt;!(?:&mdash;|&#151;|--))';
725## my $closecom = '(?:-->|(?:&mdash;|&#151;|--)&gt;)';
[23387]726
[16769]727 $$textref =~ s/(<img[^>]*?usemap\s*=\s*)((?:[\"][^\"]+[\"])|(?:[\'][^\']+[\'])|(?:[^\s\/>]+))([^>]*>)/
[14665]728 $self->replace_usemap_links($1, $2, $3)/isge;
729
[23463]730 $$textref =~ s/(<(?:a|area|frame|link|script)\s+[^>]*?\s*(?:href|src)\s*=\s*)((?:[\"][^\"]+[\"])|(?:[\'][^\']+[\'])|(?:[^\s\/>]+))([^>]*>)/
731 $self->replace_href_links ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
[23387]732
[23392]733## $$textref =~ s/($opencom.*?)?+(<(?:a|area|frame|link|script)\s+[^>]*?\s*(?:href|src)\s*=\s*)((?:[\"][^\"]+[\"])|(?:[\'][^\']+[\'])|(?:[^\s\/>]+))([^>]*>)(.*?$closecom)?+/
734# $self->replace_href_links ($1, $2, $3, $4, $5, $base_dir, $file, $doc_obj, $cursection)/isge;
[14665]735 }
736
737 # trap images
738
[15872]739 # Previously, by default, HTMLPlugin would embed <img> tags inside anchor tags
[15176]740 # i.e. <a href="image><img src="image"></a> in order to overcome a problem that
741 # turned regular text succeeding images into links. That is, by embedding <imgs>
742 # inside <a href=""></a>, the text following images were no longer misbehaving.
743 # However, there would be many occasions whereby images were not meant to link
744 # to their source images but where the images would link to another web page.
745 # To allow this, the no_image_links option was introduced: it would prevent
746 # the behaviour of embedding images into links that referenced the source images.
747
748 # Somewhere along the line, the problem of normal text turning into links when
749 # such text followed images which were not embedded in <a href=""></a> ceased
750 # to occur. This is why the following lines have been commented out (as well as
751 # two lines in replace_images). They appear to no longer apply.
752
753 # If at any time, there is a need for having images embedded in <a> anchor tags,
[15872]754 # then it might be better to turn that into an HTMLPlugin option rather than make
[15176]755 # it the default behaviour. Also, eventually, no_image_links needs to become
[15872]756 # a deprecated option for HTMLPlugin as it has now become the default behaviour.
[15176]757
758 #if(!$self->{'no_image_links'}){
[16247]759 $$textref =~ s/(<(?:img|embed|table|tr|td)[^>]*?(?:src|background)\s*=\s*)((?:[\"][^\"]+[\"])|(?:[\'][^\']+[\'])|(?:[^\s\/>]+))([^>]*>)/
[15872]760 $self->replace_images ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
[15176]761 #}
762
[14665]763 # add text to document object
764 # turn \ into \\ so that the rest of greenstone doesn't think there
765 # is an escape code following. (Macro parsing loses them...)
766 $$textref =~ s/\\/\\\\/go;
767
768 $doc_obj->add_utf8_text($cursection, $$textref);
769}
770
771sub replace_images {
772 my $self = shift (@_);
773 my ($front, $link, $back, $base_dir,
774 $file, $doc_obj, $section) = @_;
775
776 # remove quotes from link at start and end if necessary
777 if ($link=~/^[\"\']/) {
[15838]778 $link=~s/^[\"\']//;
779 $link=~s/[\"\']$//;
[14665]780 $front.='"';
781 $back="\"$back";
782 }
[15872]783
[14665]784 $link =~ s/\n/ /g;
785
786 # Hack to overcome Windows wv 0.7.1 bug that causes embedded images to be broken
787 # If the Word file path has spaces in it, wv messes up and you end up with
788 # absolute paths for the images, and without the "file://" prefix
789 # So check for this special case and massage the data to be correct
[16769]790 if ($ENV{'GSDLOS'} =~ m/^windows/i && $self->{'plugin_type'} eq "WordPlug" && $link =~ m/^[A-Za-z]\:\\/) {
[14665]791 $link =~ s/^.*\\([^\\]+)$/$1/;
792 }
[16632]793
[14665]794 my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
795
796 my $img_file = $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section);
797
[17127]798# print STDERR "**** link = $link\n**** href = $href\n**** img_file = $img_file, rl = $rl\n";
[16632]799
[14665]800 my $anchor_name = $img_file;
801 #$anchor_name =~ s/^.*\///;
802 #$anchor_name = "<a name=\"$anchor_name\" ></a>";
803
804 my $image_link = $front . $img_file .$back;
[15176]805 return $image_link;
[14665]806
[15176]807 # The reasons for why the following two lines are no longer necessary can be
808 # found in subroutine process_section
809 #my $anchor_link = "<a href=\"$img_file\" >".$image_link."</a>";
810 #return $anchor_link;
811
[14665]812 #return $front . $img_file . $back . $anchor_name;
813}
814
815sub replace_href_links {
816 my $self = shift (@_);
[23392]817 my ($front, $link, $back, $base_dir, $file, $doc_obj, $section) = @_;
[25555]818
819 if($front =~ m/^<link / && $link =~ m/\.css"$/)
820 {
821 my $actual_link = $link;
822 $actual_link =~ s/^"(.*)"$/$1/;
823
824 my $directory = &File::Basename::dirname($file);
825
[27306]826 my $css_filename = &FileUtils::filenameConcatenate($base_dir, $directory, $actual_link);
[25555]827 $self->retrieve_css_associated_files($css_filename);
828 }
829
[16769]830 # remove quotes from link at start and end if necessary
831 if ($link=~/^[\"\']/) {
832 $link=~s/^[\"\']//;
833 $link=~s/[\"\']$//;
834 $front.='"';
835 $back="\"$back";
836 }
837
[14665]838 # attempt to sort out targets - frames are not handled
839 # well in this plugin and some cases will screw things
840 # up - e.g. the _parent target (so we'll just remove
841 # them all ;-)
842 $front =~ s/(target=\"?)_top(\"?)/$1_gsdltop_$2/is;
843 $back =~ s/(target=\"?)_top(\"?)/$1_gsdltop_$2/is;
844 $front =~ s/target=\"?_parent\"?//is;
845 $back =~ s/target=\"?_parent\"?//is;
846
[25673]847 if($link =~ m/^\#/s)
848 {
849 return $front . "_httpsamepagelink_" . $link . $back;
850 }
851
[14665]852 $link =~ s/\n/ /g;
853
[16769]854 # Find file referred to by $link on file system
855 # This is more complicated than it sounds when char encodings
856 # are taken into account
[14665]857 my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
[23835]858
[14665]859 # href may use '\'s where '/'s should be on Windows
860 $href =~ s/\\/\//g;
[16769]861 my ($filename) = $href =~ m/^(?:.*?):(?:\/\/)?(.*)/;
[14665]862
863 ##### leave all these links alone (they won't be picked up by intermediate
864 ##### pages). I think that's safest when dealing with frames, targets etc.
865 ##### (at least until I think of a better way to do it). Problems occur with
866 ##### mailto links from within small frames, the intermediate page is displayed
867 ##### within that frame and can't be seen. There is still potential for this to
868 ##### happen even with html pages - the solution seems to be to somehow tell
869 ##### the browser from the server side to display the page being sent (i.e.
870 ##### the intermediate page) in the top level window - I'm not sure if that's
871 ##### possible - the following line should probably be deleted if that can be done
[16769]872 return $front . $link . $back if $href =~ m/^(mailto|news|gopher|nntp|telnet|javascript):/is;
[14665]873
[16769]874 if (($rl == 0) || ($filename =~ m/$self->{'process_exp'}/) ||
875 ($href =~ m/\/$/) || ($href =~ m/^(mailto|news|gopher|nntp|telnet|javascript):/i)) {
[23335]876
[23371]877 if ($ENV{'GSDLOS'} =~ m/^windows$/) {
[23335]878
[23371]879 # Don't do any encoding for now, as not clear what
880 # the right thing to do is to support filename
881 # encoding on Windows when they are not UTF16
882 #
[23347]883 }
[23371]884 else {
885 # => Unix-based system
[23335]886
[23371]887 # If web page didn't give encoding, then default to utf8
888 my $content_encoding= $self->{'content_encoding'} || "utf8";
[23835]889
[23371]890 if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
891 print STDERR "**** Encoding with '$content_encoding', href: $href\n";
892 }
[23335]893
[23835]894 # on Darwin, the unicode filenames are stored on the file
895 # system in decomposed form, so any href link (including when
896 # URL-encoded) should refer to the decomposed name of the file
897 if ($ENV{'GSDLOS'} =~ /^darwin$/i) {
898 $href = normalize('D', $href); # Normalization Form D (decomposition)
899 }
900
[23371]901 $href = encode($content_encoding,$href);
902 }
903
[23835]904 $href = &unicode::raw_filename_to_utf8_url_encoded($href);
[23335]905 $href = &unicode::filename_to_url($href);
906
[16812]907 &ghtml::urlsafe ($href);
[23371]908
[23347]909 if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
[23387]910 print STDERR "******DEBUG: href=$href\n";
[23347]911 }
[23335]912
[23347]913
[18521]914 return $front . "_httpextlink_&amp;rl=" . $rl . "&amp;href=" . $href . $hash_part . $back;
[14665]915 } else {
[23335]916 # link is to some other type of file (e.g., an image) so we'll
[14665]917 # need to associate that file
918 return $front . $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section) . $back;
919 }
920}
921
[25555]922sub retrieve_css_associated_files {
923 my $self = shift (@_);
924 my ($css_filename) = @_;
925
926 my $css_filename_dirname = &File::Basename::dirname($css_filename);
927
928 open (CSSFILE, $css_filename) || return;
929 sysread (CSSFILE, my $file_string, -s CSSFILE);
930
931 $self->acquire_css_associated_files_from_text_block($file_string, $css_filename_dirname) unless !defined $file_string;
932
933 close CSSFILE;
934}
935
936sub acquire_css_associated_files_from_text_block {
937 my $self = shift (@_);
938 my ($text, $css_filename_dirname) = @_;
939
940 my @image_urls = ($text =~ m/background-image:\s*url[^;]*;/sg);
941 foreach my $img_url (@image_urls)
942 {
943 $img_url =~ s/^.*url.*\((.*)\).*$/$1/;
944 $img_url =~ s/^\s*"?([^"]*)"?\s*$/$1/;
945
[27306]946 $self->{'css_assoc_files'}->{&FileUtils::filenameConcatenate($css_filename_dirname, $img_url)} = $img_url;
[25555]947 }
948}
949
950sub acquire_css_associated_files {
951 my $self = shift(@_);
952
953 my ($doc_obj, $section) = @_;
954
955 foreach my $image_filename (keys %{$self->{'css_assoc_files'}})
956 {
957 $doc_obj->associate_file($image_filename, $self->{'css_assoc_files'}->{$image_filename}, undef, $section);
958 }
959}
960
[14665]961sub add_file {
962 my $self = shift (@_);
963 my ($href, $rl, $hash_part, $base_dir, $doc_obj, $section) = @_;
964 my ($newname);
965
966 my $filename = $href;
967 if ($base_dir eq "") {
[23387]968 if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
969 # remove http://
970 $filename =~ s/^[^:]*:\/\///;
971 }
972 else {
973 # remove http:/ thereby leaving one slash at the start as
974 # part of full pathname
975 $filename =~ s/^[^:]*:\///;
976 }
[14665]977 }
978 else {
979 # remove http://
980 $filename =~ s/^[^:]*:\/\///;
981 }
982
[27306]983 $filename = &FileUtils::filenameConcatenate($base_dir, $filename);
[23363]984
[22355]985 if (($self->{'use_realistic_book'}) || ($self->{'old_style_HDL'})) {
986 # we are processing a tidytmp file - want paths to be in import
987 $filename =~ s/([\\\/])tidytmp([\\\/])/$1import$2/;
988 }
[23335]989
990 # Replace %XX's in URL with decoded value if required. Note that the
991 # filename may include the %XX in some situations. If the *original*
992 # file's name was in URL encoding, the following method will not decode
993 # it.
[23387]994 my $unicode_filename = $filename;
995 my $opt_decode_unicode_filename = $self->opt_url_decode($unicode_filename);
[16769]996
[23387]997 # wvWare can generate <img src="StrangeNoGraphicData"> tags, but (it seems)
998 # with no accompanying file
999 if ($opt_decode_unicode_filename =~ m/StrangeNoGraphicData$/) { return ""; }
1000
[23335]1001 my $content_encoding= $self->{'content_encoding'} || "utf8";
1002
[23387]1003 if ($ENV{'GSDLOS'} =~ /^(linux|solaris)$/i) {
1004 # The filenames that come through the HTML file have been decoded
1005 # into Unicode aware Perl strings. Need to convert them back
1006 # to their initial raw-byte encoding to match the file that
1007 # exists on the file system
1008 $filename = encode($content_encoding, $opt_decode_unicode_filename);
1009 }
1010 elsif ($ENV{'GSDLOS'} =~ /^darwin$/i) {
1011 # HFS+ is UTF8 with decomposition
1012 $filename = encode($content_encoding, $opt_decode_unicode_filename);
1013 $filename = normalize('D', $filename); # Normalization Form D (decomposition)
[23335]1014
[23387]1015 }
1016 elsif ($ENV{'GSDLOS'} =~ /^windows$/i) {
1017 my $long_filename = Win32::GetLongPathName($opt_decode_unicode_filename);
1018
1019 if (defined $long_filename) {
1020 my $short_filename = Win32::GetShortPathName($long_filename);
1021 $filename = $short_filename;
1022 }
1023# else {
1024# print STDERR "***** failed to map href to real file:\n";
1025# print STDERR "****** $href -> $opt_decode_unicode_filename\n";
1026# }
1027 }
1028 else {
1029 my $outhandle = $self->{'outhandle'};
1030 print $outhandle "Warning: Unrecognized operating system ", $ENV{'GSDLOS'}, "\n";
1031 print $outhandle " in file system encoding of href: $href\n";
1032 print $outhandle " No character encoding done.\n";
1033 }
1034
1035
[16769]1036 # some special processing if the intended filename was converted to utf8, but
1037 # the actual file still needs to be renamed
[27306]1038 #if (!&util::fd_exists($filename)) {
1039 if (!&FileUtils::fileExists($filename)) {
[16769]1040 # try the original filename stored in map
[23347]1041 if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
[23363]1042 print STDERR "******!! orig filename did not exist: $filename\n";
[23347]1043 }
[23335]1044
[23387]1045## print STDERR "**** trying to look up unicode_filename: $unicode_filename\n";
[23363]1046
[23387]1047 my $original_filename = $self->{'unicode_to_original_filename'}->{$unicode_filename};
[23335]1048
[23347]1049 if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
[23387]1050 print STDERR "****** From lookup unicode_filename, now trying for: $original_filename\n";
[23347]1051 }
[23335]1052
[16920]1053 if (defined $original_filename && -e $original_filename) {
[23347]1054 if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
[23363]1055 print STDERR "****** Found match!\n";
[23347]1056 }
[16769]1057 $filename = $original_filename;
[14665]1058 }
1059 }
[16769]1060
1061 my ($ext) = $filename =~ m/(\.[^\.]*)$/;
[14665]1062
1063 if ($rl == 0) {
[16769]1064 if ((!defined $ext) || ($ext !~ m/$self->{'assoc_files'}/)) {
[18521]1065 return "_httpextlink_&amp;rl=0&amp;el=prompt&amp;href=" . $href . $hash_part;
[14665]1066 }
1067 else {
[18521]1068 return "_httpextlink_&amp;rl=0&amp;el=direct&amp;href=" . $href . $hash_part;
[14665]1069 }
1070 }
1071
[16769]1072 if ((!defined $ext) || ($ext !~ m/$self->{'assoc_files'}/)) {
[18521]1073 return "_httpextlink_&amp;rl=" . $rl . "&amp;href=" . $href . $hash_part;
[14665]1074 }
[20778]1075 # add the original image file as a source file
[20791]1076 if (!$self->{'processing_tmp_files'} ) {
1077 $doc_obj->associate_source_file($filename);
1078 }
[14665]1079 if ($self->{'rename_assoc_files'}) {
1080 if (defined $self->{'aux_files'}->{$href}) {
1081 $newname = $self->{'aux_files'}->{$href}->{'dir_num'} . "/" .
1082 $self->{'aux_files'}->{$href}->{'file_num'} . $ext;
1083 } else {
1084 $newname = $self->{'dir_num'} . "/" . $self->{'file_num'} . $ext;
1085 $self->{'aux_files'}->{$href} = {'dir_num' => $self->{'dir_num'}, 'file_num' => $self->{'file_num'}};
1086 $self->inc_filecount ();
1087 }
1088 $doc_obj->associate_file($filename, $newname, undef, $section);
1089 return "_httpdocimg_/$newname";
1090 } else {
[23387]1091 if(&unicode::is_url_encoded($unicode_filename)) {
[16904]1092 # use the possibly-decoded filename instead to avoid double URL encoding
1093 ($newname) = $filename =~ m/([^\/\\]*)$/;
1094 } else {
[23387]1095 ($newname) = $unicode_filename =~ m/([^\/\\]*)$/;
[16904]1096 }
[16935]1097
[18320]1098 # Make sure this name uses only ASCII characters.
1099 # We use either base64 or URL encoding, as these preserve original encoding
1100 $newname = &util::rename_file($newname, $self->{'file_rename_method'});
[16632]1101
[23363]1102### print STDERR "***** associating $filename (raw-byte/utf8)-> $newname\n";
[14665]1103 $doc_obj->associate_file($filename, $newname, undef, $section);
[16632]1104
[16769]1105 # Since the generated image will be URL-encoded to avoid file-system/browser mess-ups
1106 # of filenames, URL-encode the additional percent signs of the URL-encoded filename
[16632]1107 my $newname_url = $newname;
[18404]1108 $newname_url = &unicode::filename_to_url($newname_url);
[16769]1109 return "_httpdocimg_/$newname_url";
[14665]1110 }
1111}
1112
1113
1114sub format_link {
1115 my $self = shift (@_);
1116 my ($link, $base_dir, $file) = @_;
1117
[23371]1118 # strip off hash part, e.g. #foo, but watch out for any entities, e.g. &#x3B1;
1119 my ($before_hash, $hash_part) = $link =~ m/^(.*?[^&])(\#.*)?$/;
[23463]1120
[14665]1121 $hash_part = "" if !defined $hash_part;
[16769]1122 if (!defined $before_hash || $before_hash !~ m/[\w\.\/]/) {
[23463]1123 my $outhandle = $self->{'outhandle'};
1124 print $outhandle "HTMLPlugin: ERROR - badly formatted tag ignored ($link)\n"
1125 if $self->{'verbosity'};
1126 return ($link, "", 0);
[14665]1127 }
[23463]1128
[20576]1129 if ($before_hash =~ s@^((?:http|https|ftp|file|mms)://)@@i) {
[23463]1130 my $type = $1;
[14665]1131
[16769]1132 if ($link =~ m/^(http|ftp):/i) {
[14665]1133 # Turn url (using /) into file name (possibly using \ on windows)
1134 my @http_dir_split = split('/', $before_hash);
[27306]1135 $before_hash = &FileUtils::filenameConcatenate(@http_dir_split);
[14665]1136 }
1137
1138 $before_hash = $self->eval_dir_dots($before_hash);
[16024]1139
[27306]1140 my $linkfilename = &FileUtils::filenameConcatenate($base_dir, $before_hash);
[14665]1141
1142 my $rl = 0;
1143 $rl = 1 if (-e $linkfilename);
1144
1145 # make sure there's a slash on the end if it's a directory
[16769]1146 if ($before_hash !~ m/\/$/) {
[14665]1147 $before_hash .= "/" if (-d $linkfilename);
1148 }
1149 return ($type . $before_hash, $hash_part, $rl);
[16024]1150
[16769]1151 } elsif ($link !~ m/^(mailto|news|gopher|nntp|telnet|javascript):/i && $link !~ m/^\//) {
[14665]1152
[16769]1153 if ($before_hash =~ s@^/@@ || $before_hash =~ m/\\/) {
1154
[14665]1155 # the first directory will be the domain name if file_is_url
1156 # to generate archives, otherwise we'll assume all files are
1157 # from the same site and base_dir is the root
1158
1159 if ($self->{'file_is_url'}) {
1160 my @dirs = split /[\/\\]/, $file;
1161 my $domname = shift (@dirs);
[27306]1162 $before_hash = &FileUtils::filenameConcatenate($domname, $before_hash);
[14665]1163 $before_hash =~ s@\\@/@g; # for windows
1164 }
1165 else
1166 {
1167 # see if link shares directory with source document
1168 # => turn into relative link if this is so!
1169
[16769]1170 if ($ENV{'GSDLOS'} =~ m/^windows/i) {
[14665]1171 # too difficult doing a pattern match with embedded '\'s...
1172 my $win_before_hash=$before_hash;
1173 $win_before_hash =~ s@(\\)+@/@g;
1174 # $base_dir is already similarly "converted" on windows.
1175 if ($win_before_hash =~ s@^$base_dir/@@o) {
[16024]1176 # if this is true, we removed a prefix
1177 $before_hash=$win_before_hash;
[14665]1178 }
1179 }
1180 else {
1181 # before_hash has lost leading slash by this point,
1182 # -> add back in prior to substitution with $base_dir
1183 $before_hash = "/$before_hash";
1184
[27306]1185 $before_hash = &FileUtils::filenameConcatenate("",$before_hash);
[14665]1186 $before_hash =~ s@^$base_dir/@@;
1187 }
1188 }
1189 } else {
1190 # Turn relative file path into full path
1191 my $dirname = &File::Basename::dirname($file);
[27306]1192 $before_hash = &FileUtils::filenameConcatenate($dirname, $before_hash);
[16769]1193 $before_hash = $self->eval_dir_dots($before_hash);
[14665]1194 }
1195
[27306]1196 my $linkfilename = &FileUtils::filenameConcatenate($base_dir, $before_hash);
[23387]1197
1198
1199# print STDERR "**** linkfilename = $linkfilename\n";
1200# if (!&util::fd_exists($linkfilename)) {
1201# print STDERR "***** Warning: Could not find $linkfilename\n";
1202# }
1203
1204
[14665]1205 # make sure there's a slash on the end if it's a directory
[16769]1206 if ($before_hash !~ m/\/$/) {
[14665]1207 $before_hash .= "/" if (-d $linkfilename);
1208 }
[23387]1209
1210# print STDERR "*** returning: $before_hash\n";
1211
[14665]1212 return ("http://" . $before_hash, $hash_part, 1);
1213 } else {
1214 # mailto, news, nntp, telnet, javascript or gopher link
1215 return ($before_hash, "", 0);
1216 }
1217}
1218
1219sub extract_first_NNNN_characters {
1220 my $self = shift (@_);
1221 my ($textref, $doc_obj, $thissection) = @_;
1222
1223 foreach my $size (split /,/, $self->{'first'}) {
1224 my $tmptext = $$textref;
1225 # skip to the body
1226 $tmptext =~ s/.*<body[^>]*>//i;
1227 # remove javascript
1228 $tmptext =~ s@<script.*?</script>@ @sig;
1229 $tmptext =~ s/<[^>]*>/ /g;
1230 $tmptext =~ s/&nbsp;/ /g;
1231 $tmptext =~ s/^\s+//;
1232 $tmptext =~ s/\s+$//;
1233 $tmptext =~ s/\s+/ /gs;
1234 $tmptext = &unicode::substr ($tmptext, 0, $size);
1235 $tmptext =~ s/\s\S*$/&#8230;/; # adds an ellipsis (...)
1236 $doc_obj->add_utf8_metadata ($thissection, "First$size", $tmptext);
1237 }
1238}
1239
1240
1241sub extract_metadata {
1242 my $self = shift (@_);
1243 my ($textref, $metadata, $doc_obj, $section) = @_;
1244 my $outhandle = $self->{'outhandle'};
1245 # if we don't want metadata, we may as well not be here ...
1246 return if (!defined $self->{'metadata_fields'});
[22348]1247
[21800]1248 my $separator = $self->{'metadata_field_separator'};
1249 if ($separator eq "") {
1250 undef $separator;
1251 }
1252
[14665]1253 # metadata fields to extract/save. 'key' is the (lowercase) name of the
1254 # html meta, 'value' is the metadata name for greenstone to use
1255 my %find_fields = ();
1256
1257 my %creator_fields = (); # short-cut for lookups
1258
1259
1260 foreach my $field (split /,/, $self->{'metadata_fields'}) {
1261 $field =~ s/^\s+//; # remove leading whitespace
1262 $field =~ s/\s+$//; # remove trailing whitespace
[22348]1263
[14665]1264 # support tag<tagname>
[20689]1265 if ($field =~ m/^(.*?)\s*<(.*?)>$/) {
[14665]1266 # "$2" is the user's preferred gs metadata name
1267 $find_fields{lc($1)}=$2; # lc = lowercase
1268 } else { # no <tagname> for mapping
1269 # "$field" is the user's preferred gs metadata name
1270 $find_fields{lc($field)}=$field; # lc = lowercase
1271 }
1272 }
1273
1274 if (defined $self->{'hunt_creator_metadata'} &&
1275 $self->{'hunt_creator_metadata'} == 1 ) {
1276 my @extra_fields =
1277 (
1278 'author',
1279 'author.email',
1280 'creator',
1281 'dc.creator',
1282 'dc.creator.corporatename',
1283 );
1284
1285 # add the creator_metadata fields to search for
1286 foreach my $field (@extra_fields) {
1287 $creator_fields{$field}=0; # add to lookup hash
1288 }
1289 }
1290
1291
1292 # find the header in the html file, which has the meta tags
1293 $$textref =~ m@<head>(.*?)</head>@si;
1294
1295 my $html_header=$1;
1296
1297 # go through every <meta... tag defined in the html and see if it is
1298 # one of the tags we want to match.
1299
1300 # special case for title - we want to remember if its been found
1301 my $found_title = 0;
1302 # this assumes that ">" won't appear. (I don't think it's allowed to...)
[16769]1303 $html_header =~ m/^/; # match the start of the string, for \G assertion
[16024]1304
[14665]1305 while ($html_header =~ m/\G.*?<meta(.*?)>/sig) {
1306 my $metatag=$1;
1307 my ($tag, $value);
1308
1309 # find the tag name
[16769]1310 $metatag =~ m/(?:name|http-equiv)\s*=\s*([\"\'])?(.*?)\1/is;
[14665]1311 $tag=$2;
1312 # in case they're not using " or ', but they should...
1313 if (! $tag) {
[16769]1314 $metatag =~ m/(?:name|http-equiv)\s*=\s*([^\s\>]+)/is;
[14665]1315 $tag=$1;
1316 }
1317
1318 if (!defined $tag) {
[15872]1319 print $outhandle "HTMLPlugin: can't find NAME in \"$metatag\"\n";
[14665]1320 next;
1321 }
1322
1323 # don't need to assign this field if it was passed in from a previous
1324 # (recursive) plugin
1325 if (defined $metadata->{$tag}) {next}
1326
1327 # find the tag content
[16769]1328 $metatag =~ m/content\s*=\s*([\"\'])?(.*?)\1/is;
[14665]1329 $value=$2;
1330
[24431]1331 # The following code assigns the metaname to value if value is
1332 # empty. Why would we do this?
1333 #if (! $value) {
1334 # $metatag =~ m/(?:name|http-equiv)\s*=\s*([^\s\>]+)/is;
1335 # $value=$1;
1336 #}
1337 if (!defined $value || $value eq "") {
1338 print $outhandle "HTMLPlugin: can't find VALUE in <meta $metatag >\n" if ($self->{'verbosity'} > 2);
[14665]1339 next;
1340 }
[22348]1341
[14665]1342 # clean up and add
1343 $value =~ s/\s+/ /gs;
1344 chomp($value); # remove trailing \n, if any
1345 if (exists $creator_fields{lc($tag)}) {
1346 # map this value onto greenstone's "Creator" metadata
1347 $tag='Creator';
1348 } elsif (!exists $find_fields{lc($tag)}) {
[16024]1349 next; # don't want this tag
[14665]1350 } else {
1351 # get the user's preferred capitalisation
1352 $tag = $find_fields{lc($tag)};
1353 }
1354 if (lc($tag) eq "title") {
1355 $found_title = 1;
1356 }
[18521]1357
1358 if ($self->{'verbosity'} > 2) {
1359 print $outhandle " extracted \"$tag\" metadata \"$value\"\n";
[14665]1360 }
[18521]1361
[22348]1362 if ($tag =~ /\./) {
1363 # there is a . so has a namespace, add ex.
1364 $tag = "ex.$tag";
1365 }
[21800]1366 if (defined $separator) {
1367 my @values = split($separator, $value);
1368 foreach my $v (@values) {
1369 $doc_obj->add_utf8_metadata($section, $tag, $v) if $v =~ /\S/;
1370 }
1371 }
1372 else {
1373 $doc_obj->add_utf8_metadata($section, $tag, $value);
1374 }
[14665]1375 }
1376
1377 # TITLE: extract the document title
1378 if (exists $find_fields{'title'} && !$found_title) {
1379 # we want a title, and didn't find one in the meta tags
1380 # see if there's a <title> tag
1381 my $title;
1382 my $from = ""; # for debugging output only
[16769]1383 if ($html_header =~ m/<title[^>]*>([^<]+)<\/title[^>]*>/is) {
[14665]1384 $title = $1;
1385 $from = "<title> tags";
1386 }
1387
1388 if (!defined $title) {
1389 $from = "first 100 chars";
1390 # if no title use first 100 or so characters
1391 $title = $$textref;
1392 $title =~ s/^\xFE\xFF//; # Remove unicode byte order mark
1393 $title =~ s/^.*?<body>//si;
1394 # ignore javascript!
1395 $title =~ s@<script.*?</script>@ @sig;
1396 $title =~ s/<\/([^>]+)><\1>//g; # (eg) </b><b> - no space
1397 $title =~ s/<[^>]*>/ /g; # remove all HTML tags
1398 $title = substr ($title, 0, 100);
1399 $title =~ s/\s\S*$/.../;
1400 }
1401 $title =~ s/<[^>]*>/ /g; # remove html tags
1402 $title =~ s/&nbsp;/ /g;
1403 $title =~ s/(?:&nbsp;|\xc2\xa0)/ /g; # utf-8 for nbsp...
1404 $title =~ s/\s+/ /gs; # collapse multiple spaces
1405 $title =~ s/^\s*//; # remove leading spaces
1406 $title =~ s/\s*$//; # remove trailing spaces
1407
1408 $title =~ s/^$self->{'title_sub'}// if ($self->{'title_sub'});
1409 $title =~ s/^\s+//s; # in case title_sub introduced any...
[23335]1410 $doc_obj->add_utf8_metadata ($section, "Title", $title);
[14665]1411 print $outhandle " extracted Title metadata \"$title\" from $from\n"
1412 if ($self->{'verbosity'} > 2);
1413 }
1414
1415 # add FileFormat metadata
1416 $doc_obj->add_metadata($section,"FileFormat", "HTML");
1417
1418 # Special, for metadata names such as tagH1 - extracts
1419 # the text between the first <H1> and </H1> tags into "H1" metadata.
1420
1421 foreach my $field (keys %find_fields) {
[16769]1422 if ($field !~ m/^tag([a-z0-9]+)$/i) {next}
[14665]1423 my $tag = $1;
1424 if ($$textref =~ m@<$tag[^>]*>(.*?)</$tag[^>]*>@g) {
1425 my $content = $1;
1426 $content =~ s/&nbsp;/ /g;
1427 $content =~ s/<[^>]*>/ /g;
1428 $content =~ s/^\s+//;
1429 $content =~ s/\s+$//;
1430 $content =~ s/\s+/ /gs;
1431 if ($content) {
1432 $tag=$find_fields{"tag$tag"}; # get the user's capitalisation
1433 $tag =~ s/^tag//i;
1434 $doc_obj->add_utf8_metadata ($section, $tag, $content);
1435 print $outhandle " extracted \"$tag\" metadata \"$content\"\n"
1436 if ($self->{'verbosity'} > 2);
1437 }
1438 }
1439 }
1440}
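# Illustrative example (not part of the original code): with a collection
# configuration line such as
#   plugin HTMLPlugin -metadata_fields Title,tagH1
# a document containing
#   <h1>Chapter One</h1>
# gains an "H1" metadata value of "Chapter One" via the tagXX handling in
# the loop above.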
1441
1442
1443# evaluate any "../" to next directory up
1444# evaluate any "./" as here
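# e.g. (illustrative) "import/html/../images/./pic.png" evaluates to
# "import/images/pic.png"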
1445sub eval_dir_dots {
1446 my $self = shift (@_);
1447 my ($filename) = @_;
1448 my $dirsep_os = &util::get_os_dirsep();
1449 my @dirsep = split(/$dirsep_os/,$filename);
1450
1451 my @eval_dirs = ();
1452 foreach my $d (@dirsep) {
1453 if ($d eq "..") {
1454 pop(@eval_dirs);
1455
1456 } elsif ($d eq ".") {
1457 # do nothing!
1458
1459 } else {
1460 push(@eval_dirs,$d);
1461 }
1462 }
1463
1464 # Need to fiddle with number of elements in @eval_dirs if the
1465 # first one is the empty string. This is because of a
[27306]1466 # modification to FileUtils::filenameConcatenate that suppresses the addition
[14665]1467 # of a leading '/' character (or \ if windows) (intended to help
1468 # filename cat with relative paths) if the first entry in the
1469 # array is the empty string. Making the array start with *two*
1470 # empty strings is a way to defeat this "smart" option.
1471 #
1472 if (scalar(@eval_dirs) > 0) {
1473 if ($eval_dirs[0] eq ""){
1474 unshift(@eval_dirs,"");
1475 }
1476 }
[16836]1477
[27306]1478 my $evaluated_filename = (scalar @eval_dirs > 0) ? &FileUtils::filenameConcatenate(@eval_dirs) : "";
[16836]1479 return $evaluated_filename;
[14665]1480}
1481
1482sub replace_usemap_links {
1483 my $self = shift (@_);
1484 my ($front, $link, $back) = @_;
1485
[16769]1486 # remove quotes from link at start and end if necessary
1487 if ($link=~/^[\"\']/) {
1488 $link=~s/^[\"\']//;
1489 $link=~s/[\"\']$//;
1490 $front.='"';
1491 $back="\"$back";
1492 }
1493
[14665]1494 $link =~ s/^\.\///;
1495 return $front . $link . $back;
1496}
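# Illustrative example (not part of the original code): called with
# front='usemap=', link='"./map1"' and back=' ...', this returns
# 'usemap="map1" ...' -- the quotes are re-attached to front and back,
# and any leading "./" is dropped from the link itself.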
1497
1498sub inc_filecount {
1499 my $self = shift (@_);
1500
1501 if ($self->{'file_num'} == 1000) {
1502 $self->{'dir_num'} ++;
1503 $self->{'file_num'} = 0;
1504 } else {
1505 $self->{'file_num'} ++;
1506 }
1507}
1508
1509
[15872]1510# Extend read_file so that strings like &eacute; are
[14665]1511# converted to UTF8 internally.
1512#
1513# We don't convert &lt; or &gt; or &amp; or &quot; in case
1514# they interfere with the GML files
1515
1516sub read_file {
[15872]1517 my $self = shift(@_);
1518 my ($filename, $encoding, $language, $textref) = @_;
[14665]1519
[15872]1520 $self->SUPER::read_file($filename, $encoding, $language, $textref);
[14665]1521
[23363]1522 # Convert entities to their Unicode code-point equivalents
[14665]1523 $$textref =~ s/&(lt|gt|amp|quot|nbsp);/&z$1;/go;
[22951]1524 $$textref =~ s/&([^;]+);/&ghtml::getcharequiv($1,1,1)/gseo;
[14665]1525 $$textref =~ s/&z(lt|gt|amp|quot|nbsp);/&$1;/go;
[22842]1526
[14665]1527}
1528
[20774]1529sub HB_read_html_file {
1530 my $self = shift (@_);
1531 my ($htmlfile, $text) = @_;
1532
1533 # load in the file
1534 if (!open (FILE, $htmlfile)) {
1535 print STDERR "ERROR - could not open $htmlfile\n";
1536 return;
1537 }
1538
1539 my $foundbody = 0;
1540 $self->HB_gettext (\$foundbody, $text, "FILE");
1541 close FILE;
1542
1543 # just in case there was no <body> tag
1544 if (!$foundbody) {
1545 $foundbody = 1;
1546 open (FILE, $htmlfile) || return;
1547 $self->HB_gettext (\$foundbody, $text, "FILE");
1548 close FILE;
1549 }
1550 # text is in utf8
1551}
1552
1553 # converts the text to utf8 (ghtml already converts entities such as &eacute; to utf8)
1554sub HB_gettext {
1555 my $self = shift (@_);
1556 my ($foundbody, $text, $handle) = @_;
1557
1558 my $line = "";
1559 while (defined ($line = <$handle>)) {
1560 # look for body tag
1561 if (!$$foundbody) {
1562 if ($line =~ s/^.*<body[^>]*>//i) {
1563 $$foundbody = 1;
1564 } else {
1565 next;
1566 }
1567 }
1568
1569 # check for symbol fonts
1570 if ($line =~ m/<font [^>]*?face\s*=\s*\"?(\w+)\"?/i) {
1571 my $font = $1;
1572 print STDERR "HBPlug::HB_gettext - warning removed font $font\n"
1573 if ($font !~ m/^arial$/i);
1574 }
1575
1576 $$text .= $line;
1577 }
1578
1579 if ($self->{'input_encoding'} eq "iso_8859_1") {
1580 # convert to utf-8
1581 $$text=&unicode::unicode2utf8(&unicode::convert2unicode("iso_8859_1", $text));
1582 }
1583 # convert any alphanumeric character entities to their utf-8
1584 # equivalent for indexing purposes
1585 #&ghtml::convertcharentities ($$text);
1586
1587 $$text =~ s/\s+/ /g; # remove \n's
[22857]1588
1589 # At this point $$text is a binary byte string
1590 # => turn it into a Unicode aware string, so full
1591 # Unicode aware pattern matching can be used.
1592 # For instance: 's/\x{0101}//g' or '[[:upper:]]'
1593 #
1594
1595 $$text = decode("utf8",$$text);
[20774]1596}
1597
1598sub HB_clean_section {
1599 my $self = shift (@_);
1600 my ($section) = @_;
1601
1602 # remove tags without a starting tag from the section
1603 my ($tag, $tagstart);
1604 while ($section =~ m/<\/([^>]{1,10})>/) {
1605 $tag = $1;
1606 $tagstart = index($section, "<$tag");
1607 last if (($tagstart >= 0) && ($tagstart < index($section, "<\/$tag")));
1608 $section =~ s/<\/$tag>//;
1609 }
1610
1611 # remove extra paragraph tags
1612 while ($section =~ s/<p\b[^>]*>\s*<p\b/<p/ig) {}
1613
1614 # remove extra stuff at the end of the section
1615 while ($section =~ s/(<u>|<i>|<b>|<p\b[^>]*>|&nbsp;|\s)$//i) {}
1616
1617 # add a newline at the beginning of each paragraph
1618 $section =~ s/(.)\s*<p\b/$1\n\n<p/gi;
1619
1620 # add a newline every 80 characters at a word boundary
1621 # Note: this regular expression puts a line feed before
1622 # the last word in each section, even when it is not
1623 # needed.
1624 $section =~ s/(.{1,80})\s/$1\n/g;
1625
1626 # fix up the image links
1627 $section =~ s/<img[^>]*?src=\"?([^\">]+)\"?[^>]*>/
1628 <center><img src=\"$1\" \/><\/center><br\/>/ig;
1629 $section =~ s/&lt;&lt;I&gt;&gt;\s*([^\.]+\.(png|jpg|gif))/
1630 <center><img src=\"$1\" \/><\/center><br\/>/ig;
1631
1632 return $section;
1633}
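# Illustrative example (not part of the original code): the image fix-ups
# above turn either
#   <img src="pic.png">   or   &lt;&lt;I&gt;&gt; pic.png
# into
#   <center><img src="pic.png" /></center><br/>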
1634
1635# Will convert the oldHDL format to the new HDL format (using the Section tag)
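# Illustrative sketch of the old HDL markup handled below (not part of
# the original code): a heading line such as
#   <p><b>&lt;&lt;TOC2&gt;&gt; Some chapter title</b>
# becomes a <Section> element (wrapped in an HTML comment) carrying a
# Title Metadata entry, nested according to its TOC level (2 here).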
1636sub convert_to_newHDLformat
1637{
1638 my $self = shift (@_);
1639 my ($file,$cnfile) = @_;
1640 my $input_filename = $file;
1641 my $tmp_filename = $cnfile;
1642
1643 # write HTML tmp file with new HDL format
1644 open (PROD, ">$tmp_filename") || die("Error Writing to File: $tmp_filename $!");
1645
1646 # read in the file and do basic html cleaning (removing header etc)
1647 my $html = "";
1648 $self->HB_read_html_file ($input_filename, \$html);
1649
1650 # process the file one section at a time
1651 my $curtoclevel = 1;
1652 my $firstsection = 1;
1653 my $toclevel = 0;
1654 while (length ($html) > 0) {
1655 if ($html =~ s/^.*?(?:<p\b[^>]*>)?((<b>|<i>|<u>|\s)*)&lt;&lt;TOC(\d+)&gt;&gt;\s*(.*?)<p\b/<p/i) {
1656 $toclevel = $3;
1657 my $title = $4;
1658 my $sectiontext = "";
1659 if ($html =~ s/^(.*?)((?:<p\b[^>]*>)?((<b>|<i>|<u>|\s)*)&lt;&lt;TOC\d+&gt;&gt;)/$2/i) {
1660 $sectiontext = $1;
1661 } else {
1662 $sectiontext = $html;
1663 $html = "";
1664 }
1665
1666 # remove tags and extra spaces from the title
1667 $title =~ s/<\/?[^>]+>//g;
1668 $title =~ s/^\s+|\s+$//g;
1669
1670 # close any sections below the current level and
1671 # create a new section (special case for the firstsection)
1672 print PROD "<!--\n";
1673 while (($curtoclevel > $toclevel) ||
1674 (!$firstsection && $curtoclevel == $toclevel)) {
1675 $curtoclevel--;
1676 print PROD "</Section>\n";
1677 }
1678 if ($curtoclevel+1 < $toclevel) {
1679 print STDERR "WARNING - jump in toc levels in $input_filename " .
1680 "from $curtoclevel to $toclevel\n";
1681 }
1682 while ($curtoclevel < $toclevel) {
1683 $curtoclevel++;
1684 }
1685
1686 if ($curtoclevel == 1) {
1687 # add the header tag
1688 print PROD "-->\n";
1689 print PROD "<HTML>\n<HEAD>\n<TITLE>$title</TITLE>\n</HEAD>\n<BODY>\n";
1690 print PROD "<!--\n";
1691 }
1692
1693 print PROD "<Section>\n\t<Description>\n\t\t<Metadata name=\"Title\">$title</Metadata>\n\t</Description>\n";
1694
1695 print PROD "-->\n";
1696
1697 # clean up the section html
1698 $sectiontext = $self->HB_clean_section($sectiontext);
1699
1700 print PROD "$sectiontext\n";
1701
1702 } else {
1703 print STDERR "WARNING - leftover text\n" , $self->shorten($html),
1704 "\nin $input_filename\n";
1705 last;
1706 }
1707 $firstsection = 0;
1708 }
1709
1710 print PROD "<!--\n";
1711 while ($curtoclevel > 0) {
1712 $curtoclevel--;
1713 print PROD "</Section>\n";
1714 }
1715 print PROD "-->\n";
1716
1717 close (PROD) || die("Error Closing File: $tmp_filename $!");
1718
1719 return $tmp_filename;
1720}
1721
1722sub shorten {
1723 my $self = shift (@_);
1724 my ($text) = @_;
1725
1726 return "\"$text\"" if (length($text) < 100);
1727
1728 return "\"" . substr ($text, 0, 50) . "\" ... \"" .
1729 substr ($text, length($text)-50) . "\"";
1730}
1731
1732sub convert_tidy_or_oldHDL_file
1733{
1734 my $self = shift (@_);
1735 my ($file) = @_;
1736 my $input_filename = $file;
1737
1738 if (-d $input_filename)
1739 {
1740 return $input_filename;
1741 }
1742
1743 # get the input filename
1744 my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
1745 my $base_dirname = $dirname;
1746 $suffix = lc($suffix);
1747
1748 # derive tmp filename from input filename
1749 # Remove any white space from filename -- no risk of name collision, and
1750 # makes later conversion by utils simpler. Leave spaces in path...
1751 # strip any spaces, dots and hyphens from the filename itself
1752 $tailname =~ s/\s+//g;
1753 $tailname =~ s/\.+//g;
1754 $tailname =~ s/\-+//g;
1755 # convert to utf-8 otherwise we have problems with the doc.xml file
1756 # later on
1757 &unicode::ensure_utf8(\$tailname);
1758
1759 # work in a 'tidytmp' directory within the collection directory
[27306]1760 my $tmp_dirname = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, "tidytmp");
[20774]1761 &util::mk_dir($tmp_dirname) if (!-e $tmp_dirname);
1762
1763 my $test_dirname = "";
1764 my $f_separator = &util::get_os_dirsep();
1765
1766 if ($dirname =~ m/import$f_separator/)
1767 {
1768 $test_dirname = $'; #'
1769
1770 #print STDERR "init $'\n";
1771
1772 while ($test_dirname =~ m/[$f_separator]/)
1773 {
1774 my $folderdirname = $`;
[27306]1775 $tmp_dirname = &FileUtils::filenameConcatenate($tmp_dirname,$folderdirname);
[20774]1776 &util::mk_dir($tmp_dirname) if (!-e $tmp_dirname);
1777 $test_dirname = $'; #'
1778 }
1779 }
1780
[27306]1781 my $tmp_filename = &FileUtils::filenameConcatenate($tmp_dirname, "$tailname$suffix");
[20774]1782
1783 # tidy or convert the input file if it is an HTML-like file or is accepted by the process_exp
1784 if (($suffix eq ".htm") || ($suffix eq ".html") || ($suffix eq ".shtml"))
1785 {
1786 #convert the input file to a new style HDL
1787 my $hdl_output_filename = $input_filename;
1788 if ($self->{'old_style_HDL'})
1789 {
[27306]1790 $hdl_output_filename = &FileUtils::filenameConcatenate($tmp_dirname, "$tailname$suffix");
[20774]1791 $hdl_output_filename = $self->convert_to_newHDLformat($input_filename,$hdl_output_filename);
1792 }
1793
1794 # copy every other file from the base dir to the tmp dir if it does not already exist there
1795 opendir(DIR,$base_dirname) or die "Can't open base directory : $base_dirname!";
1796 my @files = grep {!/^\.+$/} readdir(DIR);
1797 closedir(DIR);
1798
1799 foreach my $file (@files)
1800 {
[27306]1801 my $src_file = &FileUtils::filenameConcatenate($base_dirname,$file);
1802 my $dest_file = &FileUtils::filenameConcatenate($tmp_dirname,$file);
[20774]1803 if ((!-e $dest_file) && (!-d $src_file))
1804 {
1805 # just copy the original file back to the tmp directory
1806 copy($src_file,$dest_file) or die "Can't copy file $src_file to $dest_file $!";
1807 }
1808 }
1809
1810 # tidy the input file
1811 my $tidy_output_filename = $hdl_output_filename;
1812 if ($self->{'use_realistic_book'})
1813 {
[27306]1814 $tidy_output_filename = &FileUtils::filenameConcatenate($tmp_dirname, "$tailname$suffix");
[20774]1815 $tidy_output_filename = $self->tmp_tidy_file($hdl_output_filename,$tidy_output_filename);
1816 }
1817 $tmp_filename = $tidy_output_filename;
1818 }
1819 else
1820 {
1821 if (!-e $tmp_filename)
1822 {
1823 # just copy the original file back to the tmp directory
1824 copy($input_filename,$tmp_filename) or die "Can't copy file $input_filename to $tmp_filename $!";
1825 }
1826 }
1827
1828 return $tmp_filename;
1829}
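# Illustrative example (not part of the original code): for an input file
# such as import/my docs/ch 1.html (hypothetical path), the routine works
# on a copy at <collection>/tidytmp/my docs/ch1.html -- spaces, dots and
# hyphens are stripped from the filename (but not from the path) before
# tidying.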
1830
1831
1832 # Converts the html input file into a proper XML file, with font tags removed
1833 # and image sizes added to the img tags.
1834 # The tidying process takes place in a collection-specific 'tmp' directory so
1835# that we don't accidentally damage the input.
1836sub tmp_tidy_file
1837{
1838 my $self = shift (@_);
1839 my ($file,$cnfile) = @_;
1840 my $input_filename = $file;
1841 my $tmp_filename = $cnfile;
1842
1843 # get the input filename
1844 my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
1845
1846 require HTML::TokeParser::Simple;
1847
1848 # create HTML parser to decode the input file
1849 my $parser = HTML::TokeParser::Simple->new($input_filename);
1850
1851 # write the HTML tmp file with font tags removed and width/height attributes added to each img tag
1852 open (PROD, ">$tmp_filename") || die("Error Writing to File: $tmp_filename $!");
1853 while (my $token = $parser->get_token())
1854 {
1855 # is it an img tag
1856 if ($token->is_start_tag('img'))
1857 {
1858 # get the attributes
1859 my $attr = $token->return_attr;
1860
1861 # get the full path to the image
[27306]1862 my $img_file = &FileUtils::filenameConcatenate($dirname,$attr->{src});
[20774]1863
1864 # set the width and height attribute
1865 ($attr->{width}, $attr->{height}) = imgsize($img_file);
1866
1867 # recreate the tag
1868 print PROD "<img";
1869 print PROD map { qq { $_="$attr->{$_}"} } keys %$attr;
1870 print PROD ">";
1871 }
1872 # is it a font tag
1873 else
1874 {
1875 if (($token->is_start_tag('font')) || ($token->is_end_tag('font')))
1876 {
1877 # remove font tag
1878 print PROD "";
1879 }
1880 else
1881 {
1882 # print without changes
1883 print PROD $token->as_is;
1884 }
1885 }
1886 }
1887 close (PROD) || die("Error Closing File: $tmp_filename $!");
1888
1889 # run html-tidy on the tmp file to make it a proper XML file
1890
[22594]1891 my $outhandle = $self->{'outhandle'};
1892 print $outhandle "Converting HTML to be XML compliant:\n";
1893
1894 my $tidy_cmd = "tidy";
1895 $tidy_cmd .= " -q" if ($self->{'verbosity'} <= 2);
[22636]1896 $tidy_cmd .= " -raw -wrap 0 -asxml \"$tmp_filename\"";
[22594]1897 if ($self->{'verbosity'} <= 2) {
1898 if ($ENV{'GSDLOS'} =~ m/^windows/i) {
1899 $tidy_cmd .= " 2>nul";
1900 }
1901 else {
1902 $tidy_cmd .= " 2>/dev/null";
1903 }
1904 print $outhandle " => $tidy_cmd\n";
1905 }
1906
1907 my $tidyfile = `$tidy_cmd`;
1908
[20774]1909 # write result back to the tmp file
1910 open (PROD, ">$tmp_filename") || die("Error Writing to File: $tmp_filename $!");
1911 print PROD $tidyfile;
1912 close (PROD) || die("Error Closing File: $tmp_filename $!");
1913
1914 # return the output filename
1915 return $tmp_filename;
1916}
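# Illustrative example (not part of the original code): a tag such as
#   <img src="pic.jpg">
# is rewritten with its measured dimensions, e.g.
#   <img src="pic.jpg" width="640" height="480">   (sizes hypothetical)
# and <font>...</font> tags are dropped, before tidy converts the result
# to XHTML.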
1917
[22355]1918sub associate_cover_image
1919{
1920 my $self = shift(@_);
1921 my ($doc_obj, $filename) = @_;
1922 if (($self->{'use_realistic_book'}) || ($self->{'old_style_HDL'}))
1923 {
1924 # the cover image will be in tidytmp, but we want the one from import
1925 $filename =~ s/([\\\/])tidytmp([\\\/])/$1import$2/;
1926 }
1927 $self->SUPER::associate_cover_image($doc_obj, $filename);
1928}
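# Illustrative note (not part of the original code): for a realistic book
# the document filename passed in points into tidytmp, so the substitution
# above makes the base class look for the cover image alongside the
# original file under import instead (hypothetical example:
# .../tidytmp/doc/ch1.html -> .../import/doc/ch1.html).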
1929
1930
[14665]19311;