source: main/trunk/greenstone2/perllib/plugins/HTMLPlugin.pm@28196

Last change on this file since 28196 was 28196, checked in by kjdon, 11 years ago

util::mk_dir should be FileUtils::makeDirectory

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 66.2 KB
[14665]1###########################################################################
2#
[15872]3# HTMLPlugin.pm -- basic html plugin
[14665]4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 1999 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27#
28# Note that this plugin handles frames only in a very simple way
29# i.e. each frame is treated as a separate document. This means
30# search results will contain links to individual frames rather
31# than linking to the top level frameset.
32# There may also be some problems caused by the _parent target
33# (it's removed by this plugin)
34#
35
[15872]36package HTMLPlugin;
[14665]37
[22842]38use Encode;
[23387]39use Unicode::Normalize 'normalize';
[22842]40
[15872]41use ReadTextFile;
42use HBPlugin;
[14665]43use ghtml;
44use unicode;
45use util;
[27306]46use FileUtils;
[14665]47use XMLParser;
48
49use Image::Size;
[14913]50use File::Copy;
[14665]51
52sub BEGIN {
[15872]53 @HTMLPlugin::ISA = ('ReadTextFile', 'HBPlugin');
[14665]54}
55
56use strict; # every perl program should have this!
57no strict 'refs'; # make an exception so we can use variables as filehandles
58
59my $arguments =
60 [ { 'name' => "process_exp",
[15872]61 'desc' => "{BasePlugin.process_exp}",
[14665]62 'type' => "regexp",
63 'deft' => &get_default_process_exp() },
64 { 'name' => "block_exp",
[15872]65 'desc' => "{BasePlugin.block_exp}",
[14665]66 'type' => 'regexp',
67 'deft' => &get_default_block_exp() },
68 { 'name' => "nolinks",
[15872]69 'desc' => "{HTMLPlugin.nolinks}",
[14665]70 'type' => "flag" },
71 { 'name' => "keep_head",
[15872]72 'desc' => "{HTMLPlugin.keep_head}",
[14665]73 'type' => "flag" },
74 { 'name' => "no_metadata",
[15872]75 'desc' => "{HTMLPlugin.no_metadata}",
[14665]76 'type' => "flag" },
77 { 'name' => "metadata_fields",
[15872]78 'desc' => "{HTMLPlugin.metadata_fields}",
[14665]79 'type' => "string",
80 'deft' => "Title" },
[21800]81 { 'name' => "metadata_field_separator",
82 'desc' => "{HTMLPlugin.metadata_field_separator}",
83 'type' => "string",
84 'deft' => "" },
[14665]85 { 'name' => "hunt_creator_metadata",
[15872]86 'desc' => "{HTMLPlugin.hunt_creator_metadata}",
[14665]87 'type' => "flag" },
88 { 'name' => "file_is_url",
[15872]89 'desc' => "{HTMLPlugin.file_is_url}",
[14665]90 'type' => "flag" },
91 { 'name' => "assoc_files",
[15872]92 'desc' => "{HTMLPlugin.assoc_files}",
[14665]93 'type' => "regexp",
94 'deft' => &get_default_block_exp() },
95 { 'name' => "rename_assoc_files",
[15872]96 'desc' => "{HTMLPlugin.rename_assoc_files}",
[14665]97 'type' => "flag" },
98 { 'name' => "title_sub",
[15872]99 'desc' => "{HTMLPlugin.title_sub}",
[14665]100 'type' => "string",
101 'deft' => "" },
102 { 'name' => "description_tags",
[15872]103 'desc' => "{HTMLPlugin.description_tags}",
[14665]104 'type' => "flag" },
105 # retain this for backward compatibility (w3mir option was replaced by
106 # file_is_url)
107 { 'name' => "w3mir",
[15872]108# 'desc' => "{HTMLPlugin.w3mir}",
[14665]109 'type' => "flag",
110 'hiddengli' => "yes"},
111 { 'name' => "no_strip_metadata_html",
[15872]112 'desc' => "{HTMLPlugin.no_strip_metadata_html}",
[14665]113 'type' => "string",
114 'deft' => "",
115 'reqd' => "no"},
116 { 'name' => "sectionalise_using_h_tags",
[15872]117 'desc' => "{HTMLPlugin.sectionalise_using_h_tags}",
[14665]118 'type' => "flag" },
[14913]119 { 'name' => "use_realistic_book",
[15872]120 'desc' => "{HTMLPlugin.tidy_html}",
[14665]121 'type' => "flag"},
[15872]122 { 'name' => "old_style_HDL",
123 'desc' => "{HTMLPlugin.old_style_HDL}",
[20791]124 'type' => "flag"},
125 {'name' => "processing_tmp_files",
126 'desc' => "{BasePlugin.processing_tmp_files}",
127 'type' => "flag",
128 'hiddengli' => "yes"}
[14665]129 ];
130
[15872]131my $options = { 'name' => "HTMLPlugin",
132 'desc' => "{HTMLPlugin.desc}",
[14665]133 'abstract' => "no",
134 'inherits' => "yes",
135 'args' => $arguments };
136
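# Illustrative only (not taken from this file): in a collection configuration the
# plugin is typically listed with any of the options declared above, e.g.
#   plugin HTMLPlugin -metadata_fields Title,Author -nolinks
# where each option name corresponds to a 'name' entry in $arguments.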
137
138sub new {
139 my ($class) = shift (@_);
140 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
141 push(@$pluginlist, $class);
142
[15872]143 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
144 push(@{$hashArgOptLists->{"OptList"}},$options);
[16024]145
[14665]146
[15872]147 my $self = new ReadTextFile($pluginlist,$inputargs,$hashArgOptLists);
[14665]148
149 if ($self->{'w3mir'}) {
150 $self->{'file_is_url'} = 1;
151 }
152 $self->{'aux_files'} = {};
153 $self->{'dir_num'} = 0;
154 $self->{'file_num'} = 0;
155
156 return bless $self, $class;
157}
158
159# may want to use (?i)\.(gif|jpe?g|jpe|png|css|js(?:@.*)?)$
160# if we have e.g. <script language="javascript" src="img/lib.js@123">
[20791]161# blocking is now done by reading through the file and recording all the
162# images and other files
[14665]163sub get_default_block_exp {
164 my $self = shift (@_);
165
[16392]166 #return q^(?i)\.(gif|jpe?g|jpe|jpg|png|css)$^;
167 return "";
[14665]168}
169
170sub get_default_process_exp {
171 my $self = shift (@_);
172
173 # the last option is an attempt to encode the concept of an html query ...
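 # e.g. index.html, page.shtml, form.asp, script.php3 and lookup.cgi all match,
 # as does a query-style name such as search.cgi?q=term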
174 return q^(?i)(\.html?|\.shtml|\.shm|\.asp|\.php\d?|\.cgi|.+\?.+=.*)$^;
175}
176
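# Pass over the raw (undecoded) HTML and record every file it references --
# img/script/embed src, link href, usemap, and body/table/tr/td background
# attributes -- in $block_hash, so those files are not processed as documents
# in their own right. The encoding detected here is also remembered in
# 'store_content_encoding' for use by read_into_doc_obj later.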
177sub store_block_files
178{
179 my $self =shift (@_);
[16392]180 my ($filename_full_path, $block_hash) = @_;
181
182 my $html_fname = $filename_full_path;
[14665]183
[23335]184 my ($language, $content_encoding) = $self->textcat_get_language_encoding ($filename_full_path);
185 $self->{'store_content_encoding'}->{$filename_full_path} = $content_encoding;
[14665]186
187 # read in file ($text will be in utf8)
[16769]188 my $raw_text = "";
[23363]189 $self->read_file_no_decoding($filename_full_path, \$raw_text);
[16769]190
191 my $textref = \$raw_text;
[14665]192 my $opencom = '(?:<!--|&lt;!(?:&mdash;|&#151;|--))';
193 my $closecom = '(?:-->|(?:&mdash;|&#151;|--)&gt;)';
194 $$textref =~ s/$opencom(.*?)$closecom//gs;
195
[23363]196 # Convert entities to their UTF8 equivalents
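 # (lt/gt/amp/quot/nbsp are temporarily renamed to &zlt; etc. so they survive
 # the general entity substitution below, then restored afterwards)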
197 $$textref =~ s/&(lt|gt|amp|quot|nbsp);/&z$1;/go;
198 $$textref =~ s/&([^;]+);/&ghtml::getcharequiv($1,1,0)/gseo; # on this occasion, want it left as utf8
199 $$textref =~ s/&z(lt|gt|amp|quot|nbsp);/&$1;/go;
200
[14665]201 my $attval = "\\\"[^\\\"]+\\\"|[^\\s>]+";
202 my @img_matches = ($$textref =~ m/<img[^>]*?src\s*=\s*($attval)[^>]*>/igs);
203 my @usemap_matches = ($$textref =~ m/<img[^>]*?usemap\s*=\s*($attval)[^>]*>/igs);
204 my @link_matches = ($$textref =~ m/<link[^>]*?href\s*=\s*($attval)[^>]*>/igs);
205 my @embed_matches = ($$textref =~ m/<embed[^>]*?src\s*=\s*($attval)[^>]*>/igs);
[17127]206 my @tabbg_matches = ($$textref =~ m/<(?:body|table|tr|td)[^>]*?background\s*=\s*($attval)[^>]*>/igs);
[16638]207 my @script_matches = ($$textref =~ m/<script[^>]*?src\s*=\s*($attval)[^>]*>/igs);
[14665]208
[23387]209 if(!defined $self->{'unicode_to_original_filename'}) {
[16769]210 # maps from utf8 converted link name -> original filename referred to by (possibly URL-encoded) src url
[23387]211 $self->{'unicode_to_original_filename'} = {};
[16769]212 }
213
[23387]214 foreach my $raw_link (@img_matches, @usemap_matches, @link_matches, @embed_matches, @tabbg_matches, @script_matches) {
[14665]215
216 # remove quotes from link at start and end if necessary
[23387]217 if ($raw_link =~ m/^\"/) {
218 $raw_link =~ s/^\"//;
219 $raw_link =~ s/\"$//;
[14665]220 }
221
[23371]222 # remove any anchor names, e.g. foo.html#name becomes foo.html
223 # but watch out for any #'s that are part of entities, such as &#x3B1;
[23387]224 $raw_link =~ s/([^&])\#.*$/$1/s;
[23371]225
[16638]226 # some links may just be anchor names
[23387]227 next unless ($raw_link =~ /\S+/);
[14665]228
[23415]229 if ($raw_link !~ m@^/@ && $raw_link !~ m/^([A-Z]:?)\\/i) {
[14665]230 # Turn relative file path into full path
[16392]231 my $dirname = &File::Basename::dirname($filename_full_path);
[27306]232 $raw_link = &FileUtils::filenameConcatenate($dirname, $raw_link);
[14665]233 }
[23387]234 $raw_link = $self->eval_dir_dots($raw_link);
[16638]235
[16769]236 # this is the actual filename on the filesystem (that the link refers to)
[23387]237 my $url_original_filename = $self->opt_url_decode($raw_link);
[16769]238
[23387]239 my ($uses_bytecodes,$exceeds_bytecodes) = &unicode::analyze_raw_string($url_original_filename);
[16769]240
[23387]241 if ($exceeds_bytecodes) {
242 # We have a link to a file name that is more complicated than a raw byte filename
243 # What we do next depends on the operating system we are on
[16769]244
[23387]245 if ($ENV{'GSDLOS'} =~ /^(linux|solaris)$/i) {
246 # Assume we're dealing with a UTF-8 encoded filename
247 $url_original_filename = encode("utf8", $url_original_filename);
248 }
249 elsif ($ENV{'GSDLOS'} =~ /^darwin$/i) {
250 # HFS+ is UTF8 with decomposition
251 $url_original_filename = encode("utf8", $url_original_filename);
252 $url_original_filename = normalize('D', $url_original_filename); # Normalization Form D (decomposition)
253 }
254 elsif ($ENV{'GSDLOS'} =~ /^windows$/i) {
255 # Don't need to do anything as later code maps Windows
256 # unicode filenames to DOS short filenames when needed
257 }
258 else {
259 my $outhandle = $self->{'outhandle'};
260 print $outhandle "Warning: Unrecognized operating system ", $ENV{'GSDLOS'}, "\n";
261 print $outhandle " in raw file system encoding of: $raw_link\n";
262 print $outhandle " Assuming filesystem is UTF-8 based.\n";
263 $url_original_filename = encode("utf8", $url_original_filename);
264 }
265 }
266
267 # Convert the (currently raw) link into its Unicode version.
268 # Store the Unicode link along with the url_original_filename
269 my $unicode_url_original_filename = "";
270 $self->decode_text($raw_link,$content_encoding,$language,\$unicode_url_original_filename);
271
272
273 $self->{'unicode_to_original_filename'}->{$unicode_url_original_filename} = $url_original_filename;
274
275
276 if ($url_original_filename ne $unicode_url_original_filename) {
[17088]277 my $outhandle = $self->{'outhandle'};
[23387]278
[17088]279 print $outhandle "URL Encoding $url_original_filename\n";
[23387]280 print $outhandle " ->$unicode_url_original_filename\n";
[17088]281
[23387]282 # Allow for possibility of raw byte version and Unicode versions of file
[23561]283 &util::block_filename($block_hash,$unicode_url_original_filename);
[23387]284 }
[23363]285
[23418]286 # $url_original_filename = &util::upgrade_if_dos_filename($url_original_filename);
[23561]287 &util::block_filename($block_hash,$url_original_filename);
[23418]288
[14665]289 }
290}
291
[16769]292# Given a filename in any encoding, will URL decode it to get back the original filename
293# in the original encoding. Because this method is intended to work out the *original*
[18320]294# filename, it does not URL decode any filename if a file by the name of the *URL-encoded*
[16769]295# string already exists in the local folder.
[23363]296#
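# For example, a link written as "my%20file.html" is normally decoded to
# "my file.html", but if a file literally named "my%20file.html" exists on
# disk the name is left as-is.
#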
[16769]297sub opt_url_decode {
298 my $self = shift (@_);
[23387]299 my ($raw_link) = @_;
[16024]300
[23387]301
[16769]302 # Replace %XX's in URL with decoded value if required.
303 # Note that the filename may include the %XX in some situations
[23387]304
305## if ($raw_link =~ m/\%[A-F0-9]{2}/i) {
306
307 if (($raw_link =~ m/\%[A-F0-9]{2}/i) || ($raw_link =~ m/\&\#x[0-9A-F]+;/i) || ($raw_link =~ m/\&\#[0-9]+;/i)) {
308 if (!-e $raw_link) {
309 $raw_link = &unicode::url_decode($raw_link,1);
[16769]310 }
311 }
[23387]312
313 return $raw_link;
[16769]314}
315
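# Overrides the inherited read_into_doc_obj: when use_realistic_book or
# old_style_HDL is set, description_tags is forced on, the input is first run
# through convert_tidy_or_oldHDL_file, and the parent method parses the tidied
# copy; the doc_obj's source filename/path are then reset to the original file
# so the temporary conversion is not recorded as the source.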
[20774]316sub read_into_doc_obj
317{
318 my $self = shift (@_);
319 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
320
[22330]321 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
[23335]322
323 # Lookup content_encoding worked out in file_block pass for this file
324 # Store it under the local name 'content_encoding' so it's nice and
325 # easy to access
326 $self->{'content_encoding'} = $self->{'store_content_encoding'}->{$filename_full_path};
327
[20774]328 # get the input file
329 my $input_filename = $file;
330 my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
331 $suffix = lc($suffix);
[22330]332 my $tidy_filename;
[20774]333 if (($self->{'use_realistic_book'}) || ($self->{'old_style_HDL'}))
334 {
335 # because the document has to be sectionalized, set the description tags
336 $self->{'description_tags'} = 1;
337
338 # set the file to be tidied
[27306]339 $input_filename = &FileUtils::filenameConcatenate($base_dir,$file) if $base_dir =~ m/\w/;
[20774]340
341 # get the tidied file
342 #my $tidy_filename = $self->tmp_tidy_file($input_filename);
[22330]343 $tidy_filename = $self->convert_tidy_or_oldHDL_file($input_filename);
[20774]344
345 # derive tmp filename from input filename
346 my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($tidy_filename, "\\.[^\\.]+\$");
347
348 # set the new input file and base_dir to be from the tidied file
349 $file = "$tailname$suffix";
350 $base_dir = $dirname;
351 }
352
353 # call the parent read_into_doc_obj
354 my ($process_status,$doc_obj) = $self->SUPER::read_into_doc_obj($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli);
[22330]355 if (($self->{'use_realistic_book'}) || ($self->{'old_style_HDL'}))
356 {
357 # now we need to reset the filenames in the doc obj so that the converted filenames are not used
358 my $collect_file = &util::filename_within_collection($filename_full_path);
359 $doc_obj->set_source_filename ($collect_file, $self->{'file_rename_method'});
360 ## set_source_filename does not set the doc_obj source_path which is used in archives dbs for incremental
[23363]361 # build. So set it manually.
362 $doc_obj->set_source_path($filename_full_path);
[22330]363 my $collect_conv_file = &util::filename_within_collection($tidy_filename);
364 $doc_obj->set_converted_filename($collect_conv_file);
[23349]365
366 my $plugin_filename_encoding = $self->{'filename_encoding'};
[23352]367 my $filename_encoding = $self->deduce_filename_encoding($file,$metadata,$plugin_filename_encoding);
368 $self->set_Source_metadata($doc_obj, $filename_full_path, $filename_encoding);
[22330]369 }
[23335]370
371 delete $self->{'store_content_encoding'}->{$filename_full_path};
372 $self->{'content_encoding'} = undef;
373
[20774]374 return ($process_status,$doc_obj);
375}
[16769]376
[14665]377# do plugin specific processing of doc_obj
378sub process {
379 my $self = shift (@_);
380 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
381 my $outhandle = $self->{'outhandle'};
382
[16769]383 if ($ENV{'GSDLOS'} =~ m/^windows/i) {
[16024]384 # this makes life so much easier... perl can cope with unix-style '/'s.
[23371]385 $base_dir =~ s@(\\)+@/@g;
386 $file =~ s@(\\)+@/@g;
[14665]387 }
[23371]388
[27306]389 my $filename = &FileUtils::filenameConcatenate($base_dir,$file);
[23371]390 my $upgraded_base_dir = &util::upgrade_if_dos_filename($base_dir);
391 my $upgraded_filename = &util::upgrade_if_dos_filename($filename);
392
393 if ($ENV{'GSDLOS'} =~ m/^windows/i) {
394 # And again
395 $upgraded_base_dir =~ s@(\\)+@/@g;
396 $upgraded_filename =~ s@(\\)+@/@g;
397
398 # Need to make sure there is a '/' on the end of upgraded_base_dir
[23387]399 if (($upgraded_base_dir ne "") && ($upgraded_base_dir !~ m/\/$/)) {
[23371]400 $upgraded_base_dir .= "/";
401 }
402 }
403 my $upgraded_file = &util::filename_within_directory($upgraded_filename,$upgraded_base_dir);
[14665]404
405 # reset per-doc stuff...
406 $self->{'aux_files'} = {};
407 $self->{'dir_num'} = 0;
408 $self->{'file_num'} = 0;
409
410 # process an HTML file where sections are divided by headings tags (H1, H2 ...)
411 # you can also include metadata in the format (X can be any number)
412 # <hX>Title<!--gsdl-metadata
413 # <Metadata name="name1">value1</Metadata>
414 # ...
415 # <Metadata name="nameN">valueN</Metadata>
416 #--></hX>
417 if ($self->{'sectionalise_using_h_tags'}) {
418 # description_tags should always be activated because we convert headings to description tags
419 $self->{'description_tags'} = 1;
420
421 my $arrSections = [];
[23371]422 $$textref =~ s/<h([0-9]+)[^>]*>(.*?)<\/h[0-9]+>/$self->process_heading($1, $2, $arrSections, $upgraded_file)/isge;
[14665]423
424 if (scalar(@$arrSections)) {
425 my $strMetadata = $self->update_section_data($arrSections, -1);
426 if (length($strMetadata)) {
427 $strMetadata = '<!--' . $strMetadata . "\n-->\n</body>";
428 $$textref =~ s/<\/body>/$strMetadata/ig;
429 }
430 }
431 }
432
433 my $cursection = $doc_obj->get_top_section();
434
435 $self->extract_metadata ($textref, $metadata, $doc_obj, $cursection)
436 unless $self->{'no_metadata'} || $self->{'description_tags'};
437
438 # Store URL for page as metadata - this can be used for an
439 # altavista style search interface. The URL won't be valid
440 # unless the file structure contains the domain name (i.e.
441 # like when w3mir is used to download a website).
442
443 # URL metadata (even invalid ones) are used to support internal
444 # links, so even if 'file_is_url' is off, still need to store info
445
[23371]446 my ($tailname,$dirname) = &File::Basename::fileparse($upgraded_file);
[23347]447
[23335]448# my $utf8_file = $self->filename_to_utf8_metadata($file);
449# $utf8_file =~ s/&\#095;/_/g;
[23835]450# variable below used to be utf8_file
451
[23387]452 my $url_encoded_file = &unicode::raw_filename_to_url_encoded($tailname);
453 my $utf8_url_encoded_file = &unicode::raw_filename_to_utf8_url_encoded($tailname);
[23335]454
[16735]455 my $web_url = "http://";
[23387]456 my $utf8_web_url = "http://";
[16735]457 if(defined $dirname) { # local directory
[22689]458 # Check for "ftp" in the domain name of the directory
459 # structure to determine if this URL should be a ftp:// URL
460 # This check is not infallible, but better than omitting the
461 # check, which would cause all files downloaded from ftp sites
462 # via mirroring with wget to have potentially erroneous http:// URLs
463 # assigned in their metadata
464 if ($dirname =~ /^[^\/]*ftp/i)
465 {
466 $web_url = "ftp://";
[23387]467 $utf8_web_url = "ftp://";
[22689]468 }
[16836]469 $dirname = $self->eval_dir_dots($dirname);
[18626]470 $dirname .= &util::get_dirsep() if $dirname ne ""; # if there's a directory, it should end on "/"
[23387]471
472 $web_url = $web_url.$dirname.$url_encoded_file;
473 $utf8_web_url = $utf8_web_url.$dirname.$utf8_url_encoded_file;
[16735]474 } else {
[23387]475 $web_url = $web_url.$url_encoded_file;
476 $utf8_web_url = $utf8_web_url.$utf8_url_encoded_file;
[16735]477 }
[19983]478 $web_url =~ s/\\/\//g;
[23387]479 $utf8_web_url =~ s/\\/\//g;
[23371]480
481 if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
[23387]482 print STDERR "*******DEBUG: upgraded_file: $upgraded_file\n";
483 print STDERR "*******DEBUG: adding URL metadata: $utf8_url_encoded_file\n";
[23371]484 }
485
486
[15872]487 $doc_obj->add_utf8_metadata($cursection, "URL", $web_url);
[23387]488 $doc_obj->add_utf8_metadata($cursection, "UTF8URL", $utf8_web_url);
[15872]489
[14665]490 if ($self->{'file_is_url'}) {
491 $doc_obj->add_metadata($cursection, "weblink", "<a href=\"$web_url\">");
492 $doc_obj->add_metadata($cursection, "webicon", "_iconworld_");
493 $doc_obj->add_metadata($cursection, "/weblink", "</a>");
494 }
495
496 if ($self->{'description_tags'}) {
497 # remove the html header - note that doing this here means any
498 # sections defined within the header will be lost (so all <Section>
499 # tags must appear within the body of the HTML)
500 my ($head_keep) = ($$textref =~ m/^(.*?)<body[^>]*>/is);
501
502 $$textref =~ s/^.*?<body[^>]*>//is;
503 $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
504
505 my $opencom = '(?:<!--|&lt;!(?:&mdash;|&#151;|--))';
506 my $closecom = '(?:-->|(?:&mdash;|&#151;|--)&gt;)';
507
508 my $lt = '(?:<|&lt;)';
509 my $gt = '(?:>|&gt;)';
510 my $quot = '(?:"|&quot;|&rdquo;|&ldquo;)';
511
512 my $dont_strip = '';
513 if ($self->{'no_strip_metadata_html'}) {
514 ($dont_strip = $self->{'no_strip_metadata_html'}) =~ s{,}{|}g;
515 }
516
517 my $found_something = 0; my $top = 1;
518 while ($$textref =~ s/^(.*?)$opencom(.*?)$closecom//s) {
519 my $text = $1;
520 my $comment = $2;
521 if (defined $text) {
522 # text before a comment - note that getting to here
523 # doesn't necessarily mean there are Section tags in
524 # the document
[23371]525 $self->process_section(\$text, $upgraded_base_dir, $upgraded_file, $doc_obj, $cursection);
[14665]526 }
527 while ($comment =~ s/$lt(.*?)$gt//s) {
528 my $tag = $1;
529 if ($tag eq "Section") {
530 $found_something = 1;
531 $cursection = $doc_obj->insert_section($doc_obj->get_end_child($cursection)) unless $top;
532 $top = 0;
533 } elsif ($tag eq "/Section") {
534 $found_something = 1;
535 $cursection = $doc_obj->get_parent_section ($cursection);
[16769]536 } elsif ($tag =~ m/^Metadata name=$quot(.*?)$quot/s) {
[14665]537 my $metaname = $1;
[16769]538 my $accumulate = $tag =~ m/mode=${quot}accumulate${quot}/ ? 1 : 0;
[14665]539 $comment =~ s/^(.*?)$lt\/Metadata$gt//s;
540 my $metavalue = $1;
541 $metavalue =~ s/^\s+//;
542 $metavalue =~ s/\s+$//;
543 # assume that no metadata value intentionally includes
544 # carriage returns or HTML tags (if they're there they
545 # were probably introduced when converting to HTML from
546 # some other format).
547 # actually some people want to have html tags in their
548 # metadata.
549 $metavalue =~ s/[\cJ\cM]/ /sg;
550 $metavalue =~ s/<[^>]+>//sg
[16769]551 unless $dont_strip && ($dont_strip eq 'all' || $metaname =~ m/^($dont_strip)$/);
[14665]552 $metavalue =~ s/\s+/ /sg;
[22348]553 if ($metaname =~ /\./) { # has a namespace
554 $metaname = "ex.$metaname";
555 }
[14665]556 if ($accumulate) {
557 $doc_obj->add_utf8_metadata($cursection, $metaname, $metavalue);
558 } else {
559 $doc_obj->set_utf8_metadata_element($cursection, $metaname, $metavalue);
560 }
561 } elsif ($tag eq "Description" || $tag eq "/Description") {
562 # do nothing with containing Description tags
563 } else {
564 # simple HTML tag (probably created by the conversion
565 # to HTML from some other format) - we'll ignore it and
566 # hope for the best ;-)
567 }
568 }
569 }
570 if ($cursection ne "") {
[23371]571 print $outhandle "HTMLPlugin: WARNING: $upgraded_file contains unmatched <Section></Section> tags\n";
[14665]572 }
573
574 $$textref =~ s/^.*?<body[^>]*>//is;
575 $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
[16769]576 if ($$textref =~ m/\S/) {
[14665]577 if (!$found_something) {
578 if ($self->{'verbosity'} > 2) {
[23371]579 print $outhandle "HTMLPlugin: WARNING: $upgraded_file appears to contain no Section tags so\n";
[14665]580 print $outhandle " will be processed as a single section document\n";
581 }
582
583 # go ahead and process single-section document
[23371]584 $self->process_section($textref, $upgraded_base_dir, $upgraded_file, $doc_obj, $cursection);
[14665]585
586 # if document contains no Section tags we'll go ahead
587 # and extract metadata (this won't have been done
588 # above as the -description_tags option prevents it)
589 my $complete_text = $head_keep.$doc_obj->get_text($cursection);
590 $self->extract_metadata (\$complete_text, $metadata, $doc_obj, $cursection)
591 unless $self->{'no_metadata'};
592
593 } else {
[23371]594 print $outhandle "HTMLPlugin: WARNING: $upgraded_file contains the following text outside\n";
[14665]595 print $outhandle " of the final closing </Section> tag. This text will\n";
596 print $outhandle " be ignored.";
597
598 my ($text);
599 if (length($$textref) > 30) {
600 $text = substr($$textref, 0, 30) . "...";
601 } else {
602 $text = $$textref;
603 }
604 $text =~ s/\n/ /isg;
605 print $outhandle " ($text)\n";
606 }
607 } elsif (!$found_something) {
608
609 if ($self->{'verbosity'} > 2) {
610 # may get to here if document contained no valid Section
611 # tags but did contain some comments. The text will have
612 # been processed already but we should print the warning
613 # as above and extract metadata
[23371]614 print $outhandle "HTMLPlugin: WARNING: $upgraded_file appears to contain no Section tags and\n";
[14665]615 print $outhandle " is blank or empty. Metadata will be assigned if present.\n";
616 }
617
618 my $complete_text = $head_keep.$doc_obj->get_text($cursection);
619 $self->extract_metadata (\$complete_text, $metadata, $doc_obj, $cursection)
620 unless $self->{'no_metadata'};
621 }
622
623 } else {
624
625 # remove header and footer
626 if (!$self->{'keep_head'} || $self->{'description_tags'}) {
627 $$textref =~ s/^.*?<body[^>]*>//is;
628 $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
629 }
630
[25555]631 $self->{'css_assoc_files'} = {};
632
[14665]633 # single section document
[23371]634 $self->process_section($textref, $upgraded_base_dir, $upgraded_file, $doc_obj, $cursection);
[25555]635
636 #my $upgraded_filename_dirname = &File::Basename::dirname($upgraded_filename);
637
638 $self->acquire_css_associated_files($doc_obj, $cursection);
639
640 $self->{'css_assoc_files'} = {};
[14665]641 }
[23335]642
[14665]643 return 1;
644}
645
646
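# Called (via the substitution in process) for each <hN>...</hN> heading when
# sectionalise_using_h_tags is set: closes any sections still open at this or a
# deeper level, then emits a <!-- ... --> block opening a new <Section> whose
# Title metadata is the heading text, together with any <!--gsdl-metadata ...-->
# embedded in the heading.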
647sub process_heading
648{
649 my ($self, $nHeadNo, $strHeadingText, $arrSections, $file) = @_;
650 $strHeadingText = '' if (!defined($strHeadingText));
651
652 my $strMetadata = $self->update_section_data($arrSections, int($nHeadNo));
653
654 my $strSecMetadata = '';
655 while ($strHeadingText =~ s/<!--gsdl-metadata(.*?)-->//is)
656 {
657 $strSecMetadata .= $1;
658 }
659
660 $strHeadingText =~ s/^\s+//g;
661 $strHeadingText =~ s/\s+$//g;
662 $strSecMetadata =~ s/^\s+//g;
663 $strSecMetadata =~ s/\s+$//g;
664
665 $strMetadata .= "\n<Section>\n\t<Description>\n\t\t<Metadata name=\"Title\">" . $strHeadingText . "</Metadata>\n";
666
667 if (length($strSecMetadata)) {
668 $strMetadata .= "\t\t" . $strSecMetadata . "\n";
669 }
670
671 $strMetadata .= "\t</Description>\n";
672
673 return "<!--" . $strMetadata . "-->";
674}
675
676
677sub update_section_data
678{
679 my ($self, $arrSections, $nCurTocNo) = @_;
680 my ($strBuffer, $nLast, $nSections) = ('', 0, scalar(@$arrSections));
681
682 if ($nSections == 0) {
683 push @$arrSections, $nCurTocNo;
684 return $strBuffer;
685 }
686 $nLast = $arrSections->[$nSections - 1];
687 if ($nCurTocNo > $nLast) {
688 push @$arrSections, $nCurTocNo;
689 return $strBuffer;
690 }
691 for(my $i = $nSections - 1; $i >= 0; $i--) {
692 if ($nCurTocNo <= $arrSections->[$i]) {
693 $strBuffer .= "\n</Section>";
694 pop @$arrSections;
695 }
696 }
697 push @$arrSections, $nCurTocNo;
698 return $strBuffer;
699}
700
701
702# note that process_section may be called multiple times for a single
703# section (relying on the fact that add_utf8_text appends the text to any
704# that may exist already).
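# Each call harvests background-image URLs from any inline <style> blocks,
# rewrites usemap, href/src and image/background links via replace_usemap_links,
# replace_href_links and replace_images, escapes backslashes, and appends the
# resulting text to the current section.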
705sub process_section {
706 my $self = shift (@_);
707 my ($textref, $base_dir, $file, $doc_obj, $cursection) = @_;
[25555]708
709 my @styleTagsText = ($$textref =~ m/<style[^>]*>([^<]*)<\/style>/sg);
710 if(scalar(@styleTagsText) > 0)
711 {
[27306]712 my $css_filename_dirname = &File::Basename::dirname(&FileUtils::filenameConcatenate($base_dir, $file));
[25555]713 foreach my $styleText (@styleTagsText)
714 {
715 $self->acquire_css_associated_files_from_text_block($styleText, $css_filename_dirname);
716 }
717 }
718
[14665]719 # trap links
720 if (!$self->{'nolinks'}) {
721 # usemap="./#index" not handled correctly => change to "#index"
[16769]722## $$textref =~ s/(<img[^>]*?usemap\s*=\s*[\"\']?)([^\"\'>\s]+)([\"\']?[^>]*>)/
723
[23392]724## my $opencom = '(?:<!--|&lt;!(?:&mdash;|&#151;|--))';
725## my $closecom = '(?:-->|(?:&mdash;|&#151;|--)&gt;)';
[23387]726
[16769]727 $$textref =~ s/(<img[^>]*?usemap\s*=\s*)((?:[\"][^\"]+[\"])|(?:[\'][^\']+[\'])|(?:[^\s\/>]+))([^>]*>)/
[14665]728 $self->replace_usemap_links($1, $2, $3)/isge;
729
[23463]730 $$textref =~ s/(<(?:a|area|frame|link|script)\s+[^>]*?\s*(?:href|src)\s*=\s*)((?:[\"][^\"]+[\"])|(?:[\'][^\']+[\'])|(?:[^\s\/>]+))([^>]*>)/
731 $self->replace_href_links ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
[23387]732
[23392]733## $$textref =~ s/($opencom.*?)?+(<(?:a|area|frame|link|script)\s+[^>]*?\s*(?:href|src)\s*=\s*)((?:[\"][^\"]+[\"])|(?:[\'][^\']+[\'])|(?:[^\s\/>]+))([^>]*>)(.*?$closecom)?+/
734# $self->replace_href_links ($1, $2, $3, $4, $5, $base_dir, $file, $doc_obj, $cursection)/isge;
[14665]735 }
736
737 # trap images
738
[15872]739 # Previously, by default, HTMLPlugin would embed <img> tags inside anchor tags
[15176]740 # i.e. <a href="image"><img src="image"></a> in order to overcome a problem that
741 # turned regular text succeeding images into links. That is, by embedding <imgs>
742 # inside <a href=""></a>, the text following images was no longer misbehaving.
743 # However, there were many occasions where images were not meant to link
744 # to their source images but instead to another web page.
745 # To allow this, the no_image_links option was introduced: it would prevent
746 # the behaviour of embedding images into links that referenced the source images.
747
748 # Somewhere along the line, the problem of normal text turning into links when
749 # such text followed images which were not embedded in <a href=""></a> ceased
750 # to occur. This is why the following lines have been commented out (as well as
751 # two lines in replace_images). They appear to no longer apply.
752
753 # If at any time, there is a need for having images embedded in <a> anchor tags,
[15872]754 # then it might be better to turn that into an HTMLPlugin option rather than make
[15176]755 # it the default behaviour. Also, eventually, no_image_links needs to become
[15872]756 # a deprecated option for HTMLPlugin as it has now become the default behaviour.
[15176]757
758 #if(!$self->{'no_image_links'}){
[16247]759 $$textref =~ s/(<(?:img|embed|table|tr|td)[^>]*?(?:src|background)\s*=\s*)((?:[\"][^\"]+[\"])|(?:[\'][^\']+[\'])|(?:[^\s\/>]+))([^>]*>)/
[15872]760 $self->replace_images ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
[15176]761 #}
762
[14665]763 # add text to document object
764 # turn \ into \\ so that the rest of greenstone doesn't think there
765 # is an escape code following. (Macro parsing loses them...)
766 $$textref =~ s/\\/\\\\/go;
767
768 $doc_obj->add_utf8_text($cursection, $$textref);
769}
770
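# Rewrites a single src/background attribute matched in process_section: the
# referenced image is resolved relative to the source document (format_link),
# associated with the document (add_file), and the attribute is rewritten to
# point at the associated copy.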
771sub replace_images {
772 my $self = shift (@_);
773 my ($front, $link, $back, $base_dir,
774 $file, $doc_obj, $section) = @_;
775
776 # remove quotes from link at start and end if necessary
777 if ($link=~/^[\"\']/) {
[15838]778 $link=~s/^[\"\']//;
779 $link=~s/[\"\']$//;
[14665]780 $front.='"';
781 $back="\"$back";
782 }
[15872]783
[14665]784 $link =~ s/\n/ /g;
785
786 # Hack to overcome Windows wv 0.7.1 bug that causes embedded images to be broken
787 # If the Word file path has spaces in it, wv messes up and you end up with
788 # absolute paths for the images, and without the "file://" prefix
789 # So check for this special case and massage the data to be correct
[16769]790 if ($ENV{'GSDLOS'} =~ m/^windows/i && $self->{'plugin_type'} eq "WordPlug" && $link =~ m/^[A-Za-z]\:\\/) {
[14665]791 $link =~ s/^.*\\([^\\]+)$/$1/;
792 }
[16632]793
[14665]794 my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
795
796 my $img_file = $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section);
797
[27703]798# print STDERR "**** link = $link\n**** href = $href\n**** img_file = $img_file, rl = $rl\n\n";
[16632]799
[14665]800 my $anchor_name = $img_file;
801 #$anchor_name =~ s/^.*\///;
802 #$anchor_name = "<a name=\"$anchor_name\" ></a>";
803
804 my $image_link = $front . $img_file .$back;
[15176]805 return $image_link;
[14665]806
[15176]807 # The reasons for why the following two lines are no longer necessary can be
808 # found in subroutine process_section
809 #my $anchor_link = "<a href=\"$img_file\" >".$image_link."</a>";
810 #return $anchor_link;
811
[14665]812 #return $front . $img_file . $back . $anchor_name;
813}
814
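# Rewrites a single href/src attribute matched in process_section. Stylesheet
# <link>s are additionally scanned for images they reference; _top targets are
# renamed and _parent targets removed; "#..." links become _httpsamepagelink_
# references; mailto/news/gopher/nntp/telnet/javascript links are left alone;
# links that are external, match process_exp or end in "/" become
# _httpextlink_ references; anything else is associated via add_file.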
815sub replace_href_links {
816 my $self = shift (@_);
[23392]817 my ($front, $link, $back, $base_dir, $file, $doc_obj, $section) = @_;
[25555]818
819 if($front =~ m/^<link / && $link =~ m/\.css"$/)
820 {
821 my $actual_link = $link;
822 $actual_link =~ s/^"(.*)"$/$1/;
823
824 my $directory = &File::Basename::dirname($file);
825
[27306]826 my $css_filename = &FileUtils::filenameConcatenate($base_dir, $directory, $actual_link);
[25555]827 $self->retrieve_css_associated_files($css_filename);
828 }
829
[16769]830 # remove quotes from link at start and end if necessary
831 if ($link=~/^[\"\']/) {
832 $link=~s/^[\"\']//;
833 $link=~s/[\"\']$//;
834 $front.='"';
835 $back="\"$back";
836 }
837
[14665]838 # attempt to sort out targets - frames are not handled
839 # well in this plugin and some cases will screw things
840 # up - e.g. the _parent target (so we'll just remove
841 # them all ;-)
842 $front =~ s/(target=\"?)_top(\"?)/$1_gsdltop_$2/is;
843 $back =~ s/(target=\"?)_top(\"?)/$1_gsdltop_$2/is;
844 $front =~ s/target=\"?_parent\"?//is;
845 $back =~ s/target=\"?_parent\"?//is;
846
[25673]847 if($link =~ m/^\#/s)
848 {
849 return $front . "_httpsamepagelink_" . $link . $back;
850 }
851
[14665]852 $link =~ s/\n/ /g;
853
[16769]854 # Find file referred to by $link on file system
855 # This is more complicated than it sounds when char encodings
856 # are taken into account
[14665]857 my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
[23835]858
[14665]859 # href may use '\'s where '/'s should be on Windows
860 $href =~ s/\\/\//g;
[16769]861 my ($filename) = $href =~ m/^(?:.*?):(?:\/\/)?(.*)/;
[14665]862
863 ##### leave all these links alone (they won't be picked up by intermediate
864 ##### pages). I think that's safest when dealing with frames, targets etc.
865 ##### (at least until I think of a better way to do it). Problems occur with
866 ##### mailto links from within small frames, the intermediate page is displayed
867 ##### within that frame and can't be seen. There is still potential for this to
868 ##### happen even with html pages - the solution seems to be to somehow tell
869 ##### the browser from the server side to display the page being sent (i.e.
870 ##### the intermediate page) in the top level window - I'm not sure if that's
871 ##### possible - the following line should probably be deleted if that can be done
[16769]872 return $front . $link . $back if $href =~ m/^(mailto|news|gopher|nntp|telnet|javascript):/is;
[14665]873
[16769]874 if (($rl == 0) || ($filename =~ m/$self->{'process_exp'}/) ||
875 ($href =~ m/\/$/) || ($href =~ m/^(mailto|news|gopher|nntp|telnet|javascript):/i)) {
[23335]876
[23371]877 if ($ENV{'GSDLOS'} =~ m/^windows$/) {
[23335]878
[23371]879 # Don't do any encoding for now, as not clear what
880 # the right thing to do is to support filename
881 # encoding on Windows when they are not UTF16
882 #
[23347]883 }
[23371]884 else {
885 # => Unix-based system
[23335]886
[23371]887 # If web page didn't give encoding, then default to utf8
888 my $content_encoding= $self->{'content_encoding'} || "utf8";
[23835]889
[23371]890 if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
891 print STDERR "**** Encoding with '$content_encoding', href: $href\n";
892 }
[23335]893
[23835]894 # on Darwin, the unicode filenames are stored on the file
895 # system in decomposed form, so any href link (including when
896 # URL-encoded) should refer to the decomposed name of the file
897 if ($ENV{'GSDLOS'} =~ /^darwin$/i) {
898 $href = normalize('D', $href); # Normalization Form D (decomposition)
899 }
900
[23371]901 $href = encode($content_encoding,$href);
902 }
903
[23835]904 $href = &unicode::raw_filename_to_utf8_url_encoded($href);
[23335]905 $href = &unicode::filename_to_url($href);
906
[16812]907 &ghtml::urlsafe ($href);
[23371]908
[23347]909 if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
[23387]910 print STDERR "******DEBUG: href=$href\n";
[23347]911 }
[23335]912
[23347]913
[18521]914 return $front . "_httpextlink_&amp;rl=" . $rl . "&amp;href=" . $href . $hash_part . $back;
[14665]915 } else {
[23335]916 # link is to some other type of file (e.g., an image) so we'll
[14665]917 # need to associate that file
918 return $front . $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section) . $back;
919 }
920}
921
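# Read a linked CSS file and hand its contents to
# acquire_css_associated_files_from_text_block, which records any
# background-image url(...) references (relative to the stylesheet's directory)
# in css_assoc_files for later association with the document.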
[25555]922sub retrieve_css_associated_files {
923 my $self = shift (@_);
924 my ($css_filename) = @_;
925
926 my $css_filename_dirname = &File::Basename::dirname($css_filename);
927
928 open (CSSFILE, $css_filename) || return;
929 sysread (CSSFILE, my $file_string, -s CSSFILE);
930
931 $self->acquire_css_associated_files_from_text_block($file_string, $css_filename_dirname) unless !defined $file_string;
932
933 close CSSFILE;
934}
935
936sub acquire_css_associated_files_from_text_block {
937 my $self = shift (@_);
938 my ($text, $css_filename_dirname) = @_;
939
940 my @image_urls = ($text =~ m/background-image:\s*url[^;]*;/sg);
941 foreach my $img_url (@image_urls)
942 {
943 $img_url =~ s/^.*url.*\((.*)\).*$/$1/;
944 $img_url =~ s/^\s*"?([^"]*)"?\s*$/$1/;
945
[27306]946 $self->{'css_assoc_files'}->{&FileUtils::filenameConcatenate($css_filename_dirname, $img_url)} = $img_url;
[25555]947 }
948}
949
950sub acquire_css_associated_files {
951 my $self = shift(@_);
952
953 my ($doc_obj, $section) = @_;
954
955 foreach my $image_filename (keys %{$self->{'css_assoc_files'}})
956 {
957 $doc_obj->associate_file($image_filename, $self->{'css_assoc_files'}->{$image_filename}, undef, $section);
958 }
959}
960
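# Resolve $href to a file on disk (undoing URL encoding where appropriate and
# re-applying the page's character encoding for the local file system), then
# either return an _httpextlink_ reference -- for external links (rl == 0) or
# extensions not listed in assoc_files -- or associate the file with the
# document (possibly renamed) and return an _httpdocimg_/<newname> link.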
[14665]961sub add_file {
962 my $self = shift (@_);
963 my ($href, $rl, $hash_part, $base_dir, $doc_obj, $section) = @_;
964 my ($newname);
965
966 my $filename = $href;
967 if ($base_dir eq "") {
[23387]968 if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
969 # remove http://
970 $filename =~ s/^[^:]*:\/\///;
971 }
972 else {
973 # remove http:/ thereby leaving one slash at the start as
974 # part of full pathname
975 $filename =~ s/^[^:]*:\///;
976 }
[14665]977 }
978 else {
979 # remove http://
980 $filename =~ s/^[^:]*:\/\///;
981 }
[27703]982
983 if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
984 $filename =~ s@\/@\\@g;
985 }
986
[27306]987 $filename = &FileUtils::filenameConcatenate($base_dir, $filename);
[23363]988
[22355]989 if (($self->{'use_realistic_book'}) || ($self->{'old_style_HDL'})) {
990 # we are processing a tidytmp file - want paths to be in import
991 $filename =~ s/([\\\/])tidytmp([\\\/])/$1import$2/;
992 }
[23335]993
994 # Replace %XX's in URL with decoded value if required. Note that the
995 # filename may include the %XX in some situations. If the *original*
996 # file's name was in URL encoding, the following method will not decode
997 # it.
[23387]998 my $unicode_filename = $filename;
999 my $opt_decode_unicode_filename = $self->opt_url_decode($unicode_filename);
[16769]1000
[23387]1001 # wvWare can generate <img src="StrangeNoGraphicData"> tags, but with no
1002 # (it seems) accompanying file
1003 if ($opt_decode_unicode_filename =~ m/StrangeNoGraphicData$/) { return ""; }
1004
[23335]1005 my $content_encoding= $self->{'content_encoding'} || "utf8";
1006
[23387]1007 if ($ENV{'GSDLOS'} =~ /^(linux|solaris)$/i) {
1008 # The filenames that come through the HTML file have been decoded
1009 # into Unicode aware Perl strings. Need to convert them back
1010 # to their initial raw-byte encoding to match the file that
1011 # exists on the file system
1012 $filename = encode($content_encoding, $opt_decode_unicode_filename);
1013 }
1014 elsif ($ENV{'GSDLOS'} =~ /^darwin$/i) {
1015 # HFS+ is UTF8 with decomposition
1016 $filename = encode($content_encoding, $opt_decode_unicode_filename);
1017 $filename = normalize('D', $filename); # Normalization Form D (decomposition)
[23335]1018
[23387]1019 }
1020 elsif ($ENV{'GSDLOS'} =~ /^windows$/i) {
1021 my $long_filename = Win32::GetLongPathName($opt_decode_unicode_filename);
1022
1023 if (defined $long_filename) {
1024 my $short_filename = Win32::GetShortPathName($long_filename);
1025 $filename = $short_filename;
1026 }
1027# else {
1028# print STDERR "***** failed to map href to real file:\n";
1029# print STDERR "****** $href -> $opt_decode_unicode_filename\n";
1030# }
1031 }
1032 else {
1033 my $outhandle = $self->{'outhandle'};
1034 print $outhandle "Warning: Unrecognized operating system ", $ENV{'GSDLOS'}, "\n";
1035 print $outhandle " in file system encoding of href: $href\n";
1036 print $outhandle " No character encoding done.\n";
1037 }
1038
1039
[16769]1040 # some special processing if the intended filename was converted to utf8, but
1041 # the actual file still needs to be renamed
[27306]1042 #if (!&util::fd_exists($filename)) {
1043 if (!&FileUtils::fileExists($filename)) {
[16769]1044 # try the original filename stored in map
[23347]1045 if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
[23363]1046 print STDERR "******!! orig filename did not exist: $filename\n";
[23347]1047 }
[23335]1048
[23387]1049## print STDERR "**** trying to look up unicode_filename: $unicode_filename\n";
[23363]1050
[23387]1051 my $original_filename = $self->{'unicode_to_original_filename'}->{$unicode_filename};
[23335]1052
[23347]1053 if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
[23387]1054 print STDERR "****** From lookup unicode_filename, now trying for: $original_filename\n";
[23347]1055 }
[23335]1056
[16920]1057 if (defined $original_filename && -e $original_filename) {
[23347]1058 if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
[23363]1059 print STDERR "****** Found match!\n";
[23347]1060 }
[16769]1061 $filename = $original_filename;
[14665]1062 }
1063 }
[16769]1064
1065 my ($ext) = $filename =~ m/(\.[^\.]*)$/;
[14665]1066
1067 if ($rl == 0) {
[16769]1068 if ((!defined $ext) || ($ext !~ m/$self->{'assoc_files'}/)) {
[18521]1069 return "_httpextlink_&amp;rl=0&amp;el=prompt&amp;href=" . $href . $hash_part;
[14665]1070 }
1071 else {
[18521]1072 return "_httpextlink_&amp;rl=0&amp;el=direct&amp;href=" . $href . $hash_part;
[14665]1073 }
1074 }
1075
[16769]1076 if ((!defined $ext) || ($ext !~ m/$self->{'assoc_files'}/)) {
[18521]1077 return "_httpextlink_&amp;rl=" . $rl . "&amp;href=" . $href . $hash_part;
[14665]1078 }
[20778]1079 # add the original image file as a source file
[20791]1080 if (!$self->{'processing_tmp_files'} ) {
1081 $doc_obj->associate_source_file($filename);
1082 }
[14665]1083 if ($self->{'rename_assoc_files'}) {
1084 if (defined $self->{'aux_files'}->{$href}) {
1085 $newname = $self->{'aux_files'}->{$href}->{'dir_num'} . "/" .
1086 $self->{'aux_files'}->{$href}->{'file_num'} . $ext;
1087 } else {
1088 $newname = $self->{'dir_num'} . "/" . $self->{'file_num'} . $ext;
1089 $self->{'aux_files'}->{$href} = {'dir_num' => $self->{'dir_num'}, 'file_num' => $self->{'file_num'}};
1090 $self->inc_filecount ();
1091 }
1092 $doc_obj->associate_file($filename, $newname, undef, $section);
1093 return "_httpdocimg_/$newname";
1094 } else {
[23387]1095 if(&unicode::is_url_encoded($unicode_filename)) {
[16904]1096 # use the possibly-decoded filename instead to avoid double URL encoding
1097 ($newname) = $filename =~ m/([^\/\\]*)$/;
1098 } else {
[23387]1099 ($newname) = $unicode_filename =~ m/([^\/\\]*)$/;
[16904]1100 }
[16935]1101
[18320]1102 # Make sure this name uses only ASCII characters.
1103 # We use either base64 or URL encoding, as these preserve original encoding
1104 $newname = &util::rename_file($newname, $self->{'file_rename_method'});
[16632]1105
[23363]1106### print STDERR "***** associating $filename (raw-byte/utf8)-> $newname\n";
[14665]1107 $doc_obj->associate_file($filename, $newname, undef, $section);
[16632]1108
[16769]1109 # Since the generated image will be URL-encoded to avoid file-system/browser mess-ups
1110 # of filenames, URL-encode the additional percent signs of the URL-encoded filename
[16632]1111 my $newname_url = $newname;
[18404]1112 $newname_url = &unicode::filename_to_url($newname_url);
[16769]1113 return "_httpdocimg_/$newname_url";
[14665]1114 }
1115}
1116
1117
1118sub format_link {
1119 my $self = shift (@_);
1120 my ($link, $base_dir, $file) = @_;
1121
[23371]1122 # strip off hash part, e.g. #foo, but watch out for any entities, e.g. &#x3B1;
1123 my ($before_hash, $hash_part) = $link =~ m/^(.*?[^&])(\#.*)?$/;
[23463]1124
[14665]1125 $hash_part = "" if !defined $hash_part;
[16769]1126 if (!defined $before_hash || $before_hash !~ m/[\w\.\/]/) {
[23463]1127 my $outhandle = $self->{'outhandle'};
1128 print $outhandle "HTMLPlugin: ERROR - badly formatted tag ignored ($link)\n"
1129 if $self->{'verbosity'};
1130 return ($link, "", 0);
[14665]1131 }
[23463]1132
[20576]1133 if ($before_hash =~ s@^((?:http|https|ftp|file|mms)://)@@i) {
[23463]1134 my $type = $1;
[27703]1135 my $before_hash_file = $before_hash;
1136
[16769]1137 if ($link =~ m/^(http|ftp):/i) {
[27703]1138
[14665]1139 # Turn url (using /) into file name (possibly using \ on windows)
[27703]1140 my @http_dir_split = split('/', $before_hash_file);
1141 $before_hash_file = &FileUtils::filenameConcatenate(@http_dir_split);
[14665]1142 }
[16024]1143
[27703]1144 # want to maintain two versions of "before_hash": one representing the URL, the other using the filesystem-specific directory separator
1145 $before_hash_file = $self->eval_dir_dots($before_hash_file);
1146 my $before_hash_url = $before_hash_file;
1147 if ($ENV{'GSDLOS'} =~ /^windows$/i) {
1148 $before_hash_url =~ s@\\@\/@g;
1149 }
[14665]1150
[27703]1151 my $linkfilename = &FileUtils::filenameConcatenate($base_dir, $before_hash_file);
1152
[14665]1153 my $rl = 0;
1154 $rl = 1 if (-e $linkfilename);
1155
1156 # make sure there's a slash on the end if it's a directory
[27703]1157 if ($before_hash_url !~ m/\/$/) {
1158 $before_hash_url .= "/" if (-d $linkfilename);
[14665]1159 }
[27703]1160 return ($type . $before_hash_url, $hash_part, $rl);
[16024]1161
[16769]1162 } elsif ($link !~ m/^(mailto|news|gopher|nntp|telnet|javascript):/i && $link !~ m/^\//) {
[14665]1163
[16769]1164 if ($before_hash =~ s@^/@@ || $before_hash =~ m/\\/) {
1165
[14665]1166 # the first directory will be the domain name if file_is_url was used
1167 # to generate the archives; otherwise we'll assume all files are
1168 # from the same site and base_dir is the root
1169
1170 if ($self->{'file_is_url'}) {
1171 my @dirs = split /[\/\\]/, $file;
1172 my $domname = shift (@dirs);
[27306]1173 $before_hash = &FileUtils::filenameConcatenate($domname, $before_hash);
[14665]1174 $before_hash =~ s@\\@/@g; # for windows
1175 }
1176 else
1177 {
1178 # see if link shares directory with source document
1179 # => turn into relative link if this is so!
1180
[16769]1181 if ($ENV{'GSDLOS'} =~ m/^windows/i) {
[14665]1182 # too difficult doing a pattern match with embedded '\'s...
1183 my $win_before_hash=$before_hash;
1184 $win_before_hash =~ s@(\\)+@/@g;
1185 # $base_dir is already similarly "converted" on windows.
1186 if ($win_before_hash =~ s@^$base_dir/@@o) {
[16024]1187 # if this is true, we removed a prefix
1188 $before_hash=$win_before_hash;
[14665]1189 }
1190 }
1191 else {
1192 # before_hash has lost leading slash by this point,
1193 # -> add back in prior to substitution with $base_dir
1194 $before_hash = "/$before_hash";
1195
[27306]1196 $before_hash = &FileUtils::filenameConcatenate("",$before_hash);
[14665]1197 $before_hash =~ s@^$base_dir/@@;
1198 }
1199 }
1200 } else {
1201 # Turn relative file path into full path
1202 my $dirname = &File::Basename::dirname($file);
[27306]1203 $before_hash = &FileUtils::filenameConcatenate($dirname, $before_hash);
[27703]1204 $before_hash = $self->eval_dir_dots($before_hash);
1205 $before_hash =~ s@\\@/@g; # for windows
[14665]1206 }
1207
[27306]1208 my $linkfilename = &FileUtils::filenameConcatenate($base_dir, $before_hash);
[23387]1209
1210
1211# print STDERR "**** linkfilename = $linkfilename\n";
1212# if (!&util::fd_exists($linkfilename)) {
1213# print STDERR "***** Warning: Could not find $linkfilename\n";
1214# }
1215
1216
[14665]1217 # make sure there's a slash on the end if it's a directory
[16769]1218 if ($before_hash !~ m/\/$/) {
[14665]1219 $before_hash .= "/" if (-d $linkfilename);
1220 }
[23387]1221
1222# print STDERR "*** returning: $before_hash\n";
1223
[14665]1224 return ("http://" . $before_hash, $hash_part, 1);
1225 } else {
1226 # mailto, news, nntp, telnet, javascript or gopher link
1227 return ($before_hash, "", 0);
1228 }
1229}
1230
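# For each size listed in the (comma-separated) 'first' option, store the first
# <size> characters of the stripped body text as "First<size>" metadata
# (e.g. First50), ending the excerpt with an ellipsis.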
1231sub extract_first_NNNN_characters {
1232 my $self = shift (@_);
1233 my ($textref, $doc_obj, $thissection) = @_;
1234
1235 foreach my $size (split /,/, $self->{'first'}) {
1236 my $tmptext = $$textref;
1237 # skip to the body
1238 $tmptext =~ s/.*<body[^>]*>//i;
1239 # remove javascript
1240 $tmptext =~ s@<script.*?</script>@ @sig;
1241 $tmptext =~ s/<[^>]*>/ /g;
1242 $tmptext =~ s/&nbsp;/ /g;
1243 $tmptext =~ s/^\s+//;
1244 $tmptext =~ s/\s+$//;
1245 $tmptext =~ s/\s+/ /gs;
1246 $tmptext = &unicode::substr ($tmptext, 0, $size);
1247 $tmptext =~ s/\s\S*$/&#8230;/; # adds an ellipse (...)
1248 $doc_obj->add_utf8_metadata ($thissection, "First$size", $tmptext);
1249 }
1250}
1251
1252
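# Pull metadata from the HTML <head>: every <meta name=... content=...> whose
# name appears in metadata_fields (or in the creator list when
# hunt_creator_metadata is set) is added to the section. If a Title was asked
# for but not found among the meta tags, it falls back to the <title> element
# or the first 100 characters of text, and FileFormat is always set to "HTML".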
1253sub extract_metadata {
1254 my $self = shift (@_);
1255 my ($textref, $metadata, $doc_obj, $section) = @_;
1256 my $outhandle = $self->{'outhandle'};
1257 # if we don't want metadata, we may as well not be here ...
1258 return if (!defined $self->{'metadata_fields'});
[22348]1259
[21800]1260 my $separator = $self->{'metadata_field_separator'};
1261 if ($separator eq "") {
1262 undef $separator;
1263 }
1264
[14665]1265 # metadata fields to extract/save. 'key' is the (lowercase) name of the
1266 # html meta, 'value' is the metadata name for greenstone to use
1267 my %find_fields = ();
1268
1269 my %creator_fields = (); # short-cut for lookups
1270
1271
1272 foreach my $field (split /,/, $self->{'metadata_fields'}) {
1273 $field =~ s/^\s+//; # remove leading whitespace
1274 $field =~ s/\s+$//; # remove trailing whitespace
[22348]1275
[14665]1276 # support tag<tagname>
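 # e.g. a metadata_fields entry of "dc.title<Title>" maps the HTML meta tag
 # named "dc.title" onto the Greenstone metadata name "Title", while a bare
 # entry such as "Author" keeps its own name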
[20689]1277 if ($field =~ m/^(.*?)\s*<(.*?)>$/) {
[14665]1278 # "$2" is the user's preferred gs metadata name
1279 $find_fields{lc($1)}=$2; # lc = lowercase
1280 } else { # no <tagname> for mapping
1281 # "$field" is the user's preferred gs metadata name
1282 $find_fields{lc($field)}=$field; # lc = lowercase
1283 }
1284 }
1285
1286 if (defined $self->{'hunt_creator_metadata'} &&
1287 $self->{'hunt_creator_metadata'} == 1 ) {
1288 my @extra_fields =
1289 (
1290 'author',
1291 'author.email',
1292 'creator',
1293 'dc.creator',
1294 'dc.creator.corporatename',
1295 );
1296
1297 # add the creator_metadata fields to search for
1298 foreach my $field (@extra_fields) {
1299 $creator_fields{$field}=0; # add to lookup hash
1300 }
1301 }
1302
1303
1304 # find the header in the html file, which has the meta tags
1305 $$textref =~ m@<head>(.*?)</head>@si;
1306
1307 my $html_header=$1;
1308
1309 # go through every <meta... tag defined in the html and see if it is
1310 # one of the tags we want to match.
1311
1312 # special case for title - we want to remember if it's been found
1313 my $found_title = 0;
1314 # this assumes that ">" won't appear. (I don't think it's allowed to...)
[16769]1315 $html_header =~ m/^/; # match the start of the string, for \G assertion
[16024]1316
[14665]1317 while ($html_header =~ m/\G.*?<meta(.*?)>/sig) {
1318 my $metatag=$1;
1319 my ($tag, $value);
1320
1321 # find the tag name
[16769]1322 $metatag =~ m/(?:name|http-equiv)\s*=\s*([\"\'])?(.*?)\1/is;
[14665]1323 $tag=$2;
1324 # in case they're not using " or ', but they should...
1325 if (! $tag) {
[16769]1326 $metatag =~ m/(?:name|http-equiv)\s*=\s*([^\s\>]+)/is;
[14665]1327 $tag=$1;
1328 }
1329
1330 if (!defined $tag) {
[15872]1331 print $outhandle "HTMLPlugin: can't find NAME in \"$metatag\"\n";
[14665]1332 next;
1333 }
1334
1335 # don't need to assign this field if it was passed in from a previous
1336 # (recursive) plugin
1337 if (defined $metadata->{$tag}) {next}
1338
1339 # find the tag content
[16769]1340 $metatag =~ m/content\s*=\s*([\"\'])?(.*?)\1/is;
[14665]1341 $value=$2;
1342
[24431]1343 # The following code assigns the metaname to value if value is
1344 # empty. Why would we do this?
1345 #if (! $value) {
1346 # $metatag =~ m/(?:name|http-equiv)\s*=\s*([^\s\>]+)/is;
1347 # $value=$1;
1348 #}
1349 if (!defined $value || $value eq "") {
1350 print $outhandle "HTMLPlugin: can't find VALUE in <meta $metatag >\n" if ($self->{'verbosity'} > 2);
[14665]1351 next;
1352 }
[22348]1353
[14665]1354 # clean up and add
1355 $value =~ s/\s+/ /gs;
1356 chomp($value); # remove trailing \n, if any
1357 if (exists $creator_fields{lc($tag)}) {
1358 # map this value onto greenstone's "Creator" metadata
1359 $tag='Creator';
1360 } elsif (!exists $find_fields{lc($tag)}) {
[16024]1361 next; # don't want this tag
[14665]1362 } else {
1363 # get the user's preferred capitalisation
1364 $tag = $find_fields{lc($tag)};
1365 }
1366 if (lc($tag) eq "title") {
1367 $found_title = 1;
1368 }
[18521]1369
1370 if ($self->{'verbosity'} > 2) {
1371 print $outhandle " extracted \"$tag\" metadata \"$value\"\n";
[14665]1372 }
[18521]1373
[22348]1374 if ($tag =~ /\./) {
1375 # there is a . so has a namespace, add ex.
1376 $tag = "ex.$tag";
1377 }
[21800]1378 if (defined $separator) {
1379 my @values = split($separator, $value);
1380 foreach my $v (@values) {
1381 $doc_obj->add_utf8_metadata($section, $tag, $v) if $v =~ /\S/;
1382 }
1383 }
1384 else {
1385 $doc_obj->add_utf8_metadata($section, $tag, $value);
1386 }
[14665]1387 }
1388
1389 # TITLE: extract the document title
1390 if (exists $find_fields{'title'} && !$found_title) {
1391 # we want a title, and didn't find one in the meta tags
1392 # see if there's a <title> tag
1393 my $title;
1394 my $from = ""; # for debugging output only
[16769]1395 if ($html_header =~ m/<title[^>]*>([^<]+)<\/title[^>]*>/is) {
[14665]1396 $title = $1;
1397 $from = "<title> tags";
1398 }
1399
1400 if (!defined $title) {
1401 $from = "first 100 chars";
1402 # if no title use first 100 or so characters
1403 $title = $$textref;
1404 $title =~ s/^\xFE\xFF//; # Remove unicode byte order mark
1405 $title =~ s/^.*?<body>//si;
1406 # ignore javascript!
1407 $title =~ s@<script.*?</script>@ @sig;
1408 $title =~ s/<\/([^>]+)><\1>//g; # (eg) </b><b> - no space
1409 $title =~ s/<[^>]*>/ /g; # remove all HTML tags
[27742]1410 $title =~ s@\r@@g; # remove Windows carriage returns to ensure that titles of pdftohtml docs are consistent (the same 100 chars) across windows and linux
[14665]1411 $title = substr ($title, 0, 100);
1412 $title =~ s/\s\S*$/.../;
1413 }
1414 $title =~ s/<[^>]*>/ /g; # remove html tags
1415 $title =~ s/&nbsp;/ /g;
1416 $title =~ s/(?:&nbsp;|\xc2\xa0)/ /g; # utf-8 for nbsp...
1417 $title =~ s/\s+/ /gs; # collapse multiple spaces
1418 $title =~ s/^\s*//; # remove leading spaces
1419 $title =~ s/\s*$//; # remove trailing spaces
1420
1421 $title =~ s/^$self->{'title_sub'}// if ($self->{'title_sub'});
1422 $title =~ s/^\s+//s; # in case title_sub introduced any...
[23335]1423 $doc_obj->add_utf8_metadata ($section, "Title", $title);
[14665]1424 print $outhandle " extracted Title metadata \"$title\" from $from\n"
1425 if ($self->{'verbosity'} > 2);
1426 }
1427
1428 # add FileFormat metadata
1429 $doc_obj->add_metadata($section,"FileFormat", "HTML");
1430
1431 # Special case: for metadata names such as tagH1 - extracts
1432 # the text between the first <H1> and </H1> tags into "H1" metadata.
1433
1434 foreach my $field (keys %find_fields) {
[16769]1435 if ($field !~ m/^tag([a-z0-9]+)$/i) {next}
[14665]1436 my $tag = $1;
1437 if ($$textref =~ m@<$tag[^>]*>(.*?)</$tag[^>]*>@g) {
1438 my $content = $1;
1439 $content =~ s/&nbsp;/ /g;
1440 $content =~ s/<[^>]*>/ /g;
1441 $content =~ s/^\s+//;
1442 $content =~ s/\s+$//;
1443 $content =~ s/\s+/ /gs;
1444 if ($content) {
1445 $tag=$find_fields{"tag$tag"}; # get the user's capitalisation
1446 $tag =~ s/^tag//i;
1447 $doc_obj->add_utf8_metadata ($section, $tag, $content);
1448 print $outhandle " extracted \"$tag\" metadata \"$content\"\n"
1449 if ($self->{'verbosity'} > 2);
1450 }
1451 }
1452 }
1453}
1454
1455
1456# evaluate any "../" to next directory up
1457# evaluate any "./" as here
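# Rough illustration (made-up paths):
#   a/b/../c/./d  reduces to  a/c/d
#   ./x/y         reduces to  x/y
# ".." pops the previous path component and "." is simply dropped.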
1458sub eval_dir_dots {
1459 my $self = shift (@_);
1460 my ($filename) = @_;
1461 my $dirsep_os = &util::get_os_dirsep();
1462 my @dirsep = split(/$dirsep_os/,$filename);
1463
1464 my @eval_dirs = ();
1465 foreach my $d (@dirsep) {
1466 if ($d eq "..") {
1467 pop(@eval_dirs);
1468
1469 } elsif ($d eq ".") {
1470 # do nothing!
1471
1472 } else {
1473 push(@eval_dirs,$d);
1474 }
1475 }
1476
1477 # Need to fiddle with number of elements in @eval_dirs if the
1478 # first one is the empty string. This is because of a
[27306]1479 # modification to FileUtils::filenameConcatenate that suppresses the addition
[14665]1480 # of a leading '/' character (or \ if windows) (intended to help
1481 # filename cat with relative paths) if the first entry in the
1482 # array is the empty string. Making the array start with *two*
1483 # empty strings is a way to defeat this "smart" option.
1484 #
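# e.g. (made-up path) "/a/b/../c" splits into ("", "a", "b", "..", "c") and
# reduces to ("", "a", "c"); the extra "" unshifted below stops
# FileUtils::filenameConcatenate from swallowing the leading separator, so
# the result stays "/a/c" rather than "a/c".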
1485 if (scalar(@eval_dirs) > 0) {
1486 if ($eval_dirs[0] eq ""){
1487 unshift(@eval_dirs,"");
1488 }
1489 }
[16836]1490
[27306]1491 my $evaluated_filename = (scalar @eval_dirs > 0) ? &FileUtils::filenameConcatenate(@eval_dirs) : "";
[16836]1492 return $evaluated_filename;
[14665]1493}
1494
1495sub replace_usemap_links {
1496 my $self = shift (@_);
1497 my ($front, $link, $back) = @_;
1498
[16769]1499 # remove quotes from link at start and end if necessary
1500 if ($link=~/^[\"\']/) {
1501 $link=~s/^[\"\']//;
1502 $link=~s/[\"\']$//;
1503 $front.='"';
1504 $back="\"$back";
1505 }
1506
[14665]1507 $link =~ s/^\.\///;
1508 return $front . $link . $back;
1509}
1510
1511sub inc_filecount {
1512 my $self = shift (@_);
1513
1514 if ($self->{'file_num'} == 1000) {
1515 $self->{'dir_num'} ++;
1516 $self->{'file_num'} = 0;
1517 } else {
1518 $self->{'file_num'} ++;
1519 }
1520}
1521
1522
[15872]1523# Extend read_file so that strings like &eacute; are
[14665]1524# converted to UTF8 internally.
1525#
1526# We don't convert &lt; or &gt; or &amp; or &quot; in case
1527# they interfere with the GML files
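#
# Rough illustration of the three substitutions below (made-up input), assuming
# ghtml::getcharequiv leaves the unknown "&z..." names untouched:
#   "caf&eacute; &amp; bar"
#     -> "caf&eacute; &zamp; bar"   (markup-significant entities protected)
#     -> "café &zamp; bar"          (remaining entities converted)
#     -> "café &amp; bar"           (protected entities restored)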
1528
1529sub read_file {
[15872]1530 my $self = shift(@_);
1531 my ($filename, $encoding, $language, $textref) = @_;
[14665]1532
[15872]1533 $self->SUPER::read_file($filename, $encoding, $language, $textref);
[14665]1534
[23363]1535 # Convert entities to their Unicode code-point equivalents
[14665]1536 $$textref =~ s/&(lt|gt|amp|quot|nbsp);/&z$1;/go;
[22951]1537 $$textref =~ s/&([^;]+);/&ghtml::getcharequiv($1,1,1)/gseo;
[14665]1538 $$textref =~ s/&z(lt|gt|amp|quot|nbsp);/&$1;/go;
[22842]1539
[14665]1540}
1541
[20774]1542sub HB_read_html_file {
1543 my $self = shift (@_);
1544 my ($htmlfile, $text) = @_;
1545
1546 # load in the file
1547 if (!open (FILE, $htmlfile)) {
1548 print STDERR "ERROR - could not open $htmlfile\n";
1549 return;
1550 }
1551
1552 my $foundbody = 0;
1553 $self->HB_gettext (\$foundbody, $text, "FILE");
1554 close FILE;
1555
1556 # just in case there was no <body> tag
1557 if (!$foundbody) {
1558 $foundbody = 1;
1559 open (FILE, $htmlfile) || return;
1560 $self->HB_gettext (\$foundbody, $text, "FILE");
1561 close FILE;
1562 }
1563 # text is in utf8
1564}
1565
1566# converts the text to utf8, as ghtml does that for &eacute; etc.
1567sub HB_gettext {
1568 my $self = shift (@_);
1569 my ($foundbody, $text, $handle) = @_;
1570
1571 my $line = "";
1572 while (defined ($line = <$handle>)) {
1573 # look for body tag
1574 if (!$$foundbody) {
1575 if ($line =~ s/^.*<body[^>]*>//i) {
1576 $$foundbody = 1;
1577 } else {
1578 next;
1579 }
1580 }
1581
1582 # check for symbol fonts
1583 if ($line =~ m/<font [^>]*?face\s*=\s*\"?(\w+)\"?/i) {
1584 my $font = $1;
1585 print STDERR "HBPlug::HB_gettext - warning removed font $font\n"
1586 if ($font !~ m/^arial$/i);
1587 }
1588
1589 $$text .= $line;
1590 }
1591
1592 if ($self->{'input_encoding'} eq "iso_8859_1") {
1593 # convert to utf-8
1594 $$text=&unicode::unicode2utf8(&unicode::convert2unicode("iso_8859_1", $text));
1595 }
1596 # convert any alphanumeric character entities to their utf-8
1597 # equivalent for indexing purposes
1598 #&ghtml::convertcharentities ($$text);
1599
1600 $$text =~ s/\s+/ /g; # remove \n's
[22857]1601
1602 # At this point $$text is a binary byte string
1603 # => turn it into a Unicode aware string, so full
1604 # Unicode aware pattern matching can be used.
1605 # For instance: 's/\x{0101}//g' or '[[:upper:]]'
1606 #
1607
1608 $$text = decode("utf8",$$text);
[20774]1609}
1610
1611sub HB_clean_section {
1612 my $self = shift (@_);
1613 my ($section) = @_;
1614
1615 # remove tags without a starting tag from the section
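# e.g. (made-up fragment): in "word</i> more <i>text</i>" the first </i>
# has no earlier <i>, so it is stripped, giving "word more <i>text</i>";
# the properly balanced pair is left alone.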
1616 my ($tag, $tagstart);
1617 while ($section =~ m/<\/([^>]{1,10})>/) {
1618 $tag = $1;
1619 $tagstart = index($section, "<$tag");
1620 last if (($tagstart >= 0) && ($tagstart < index($section, "<\/$tag")));
1621 $section =~ s/<\/$tag>//;
1622 }
1623
1624 # remove extra paragraph tags
1625 while ($section =~ s/<p\b[^>]*>\s*<p\b/<p/ig) {}
1626
1627 # remove extra stuff at the end of the section
1628 while ($section =~ s/(<u>|<i>|<b>|<p\b[^>]*>|&nbsp;|\s)$//i) {}
1629
1630 # add a newline at the beginning of each paragraph
1631 $section =~ s/(.)\s*<p\b/$1\n\n<p/gi;
1632
1633 # add a newline every 80 characters at a word boundary
1634 # Note: this regular expression puts a line feed before
1635 # the last word in each section, even when it is not
1636 # needed.
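# e.g. (made-up): "one two three" becomes "one two\nthree" -- the newline
# before the final word appears even though the text is nowhere near 80
# characters long.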
1637 $section =~ s/(.{1,80})\s/$1\n/g;
1638
1639 # fix up the image links
1640 $section =~ s/<img[^>]*?src=\"?([^\">]+)\"?[^>]*>/
1641 <center><img src=\"$1\" \/><\/center><br\/>/ig;
1642 $section =~ s/&lt;&lt;I&gt;&gt;\s*([^\.]+\.(png|jpg|gif))/
1643 <center><img src=\"$1\" \/><\/center><br\/>/ig;
1644
1645 return $section;
1646}
1647
1648 # Converts the old HDL format to the new HDL format (using the Section tag)
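# Rough sketch of the transformation (illustrative only): an input marker like
#   &lt;&lt;TOC1&gt;&gt; Introduction
# becomes a commented-out section header along the lines of
#   <!-- <Section><Description><Metadata name="Title">Introduction</Metadata></Description> -->
# followed by the cleaned-up section text, with matching </Section> tags
# emitted as the TOC levels close.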
1649sub convert_to_newHDLformat
1650{
1651 my $self = shift (@_);
1652 my ($file,$cnfile) = @_;
1653 my $input_filename = $file;
1654 my $tmp_filename = $cnfile;
1655
1656 # write HTML tmp file with new HDL format
1657 open (PROD, ">$tmp_filename") || die("Error Writing to File: $tmp_filename $!");
1658
1659 # read in the file and do basic html cleaning (removing header etc)
1660 my $html = "";
1661 $self->HB_read_html_file ($input_filename, \$html);
1662
1663 # process the file one section at a time
1664 my $curtoclevel = 1;
1665 my $firstsection = 1;
1666 my $toclevel = 0;
1667 while (length ($html) > 0) {
1668 if ($html =~ s/^.*?(?:<p\b[^>]*>)?((<b>|<i>|<u>|\s)*)&lt;&lt;TOC(\d+)&gt;&gt;\s*(.*?)<p\b/<p/i) {
1669 $toclevel = $3;
1670 my $title = $4;
1671 my $sectiontext = "";
1672 if ($html =~ s/^(.*?)((?:<p\b[^>]*>)?((<b>|<i>|<u>|\s)*)&lt;&lt;TOC\d+&gt;&gt;)/$2/i) {
1673 $sectiontext = $1;
1674 } else {
1675 $sectiontext = $html;
1676 $html = "";
1677 }
1678
1679 # remove tags and extra spaces from the title
1680 $title =~ s/<\/?[^>]+>//g;
1681 $title =~ s/^\s+|\s+$//g;
1682
1683 # close any sections below the current level and
1684 # create a new section (special case for the firstsection)
1685 print PROD "<!--\n";
1686 while (($curtoclevel > $toclevel) ||
1687 (!$firstsection && $curtoclevel == $toclevel)) {
1688 $curtoclevel--;
1689 print PROD "</Section>\n";
1690 }
1691 if ($curtoclevel+1 < $toclevel) {
1692 print STDERR "WARNING - jump in toc levels in $input_filename " .
1693 "from $curtoclevel to $toclevel\n";
1694 }
1695 while ($curtoclevel < $toclevel) {
1696 $curtoclevel++;
1697 }
1698
1699 if ($curtoclevel == 1) {
1700 # add the header tag
1701 print PROD "-->\n";
1702 print PROD "<HTML>\n<HEAD>\n<TITLE>$title</TITLE>\n</HEAD>\n<BODY>\n";
1703 print PROD "<!--\n";
1704 }
1705
1706 print PROD "<Section>\n\t<Description>\n\t\t<Metadata name=\"Title\">$title</Metadata>\n\t</Description>\n";
1707
1708 print PROD "-->\n";
1709
1710 # clean up the section html
1711 $sectiontext = $self->HB_clean_section($sectiontext);
1712
1713 print PROD "$sectiontext\n";
1714
1715 } else {
1716 print STDERR "WARNING - leftover text\n" , $self->shorten($html),
1717 "\nin $input_filename\n";
1718 last;
1719 }
1720 $firstsection = 0;
1721 }
1722
1723 print PROD "<!--\n";
1724 while ($curtoclevel > 0) {
1725 $curtoclevel--;
1726 print PROD "</Section>\n";
1727 }
1728 print PROD "-->\n";
1729
1730 close (PROD) || die("Error Closing File: $tmp_filename $!");
1731
1732 return $tmp_filename;
1733}
1734
1735sub shorten {
1736 my $self = shift (@_);
1737 my ($text) = @_;
1738
1739 return "\"$text\"" if (length($text) < 100);
1740
1741 return "\"" . substr ($text, 0, 50) . "\" ... \"" .
1742 substr ($text, length($text)-50) . "\"";
1743}
1744
1745sub convert_tidy_or_oldHDL_file
1746{
1747 my $self = shift (@_);
1748 my ($file) = @_;
1749 my $input_filename = $file;
1750
1751 if (-d $input_filename)
1752 {
1753 return $input_filename;
1754 }
1755
1756 # split the input filename into tail, directory and suffix
1757 my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
1758 my $base_dirname = $dirname;
1759 $suffix = lc($suffix);
1760
1761 # derive tmp filename from input filename
1762 # Remove any white space from filename -- no risk of name collision, and
1763 # makes later conversion by utils simpler. Leave spaces in path...
1764 # also strip dots and hyphens from the filename
1765 $tailname =~ s/\s+//g;
1766 $tailname =~ s/\.+//g;
1767 $tailname =~ s/\-+//g;
1768 # convert to utf-8 otherwise we have problems with the doc.xml file
1769 # later on
1770 &unicode::ensure_utf8(\$tailname);
1771
1772 # softlink to collection tmp dir
[27306]1773 my $tmp_dirname = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, "tidytmp");
[28196]1774 &FileUtils::makeDirectory($tmp_dirname) if (!-e $tmp_dirname);
[20774]1775
1776 my $test_dirname = "";
1777 my $f_separator = &util::get_os_dirsep();
1778
1779 if ($dirname =~ m/import$f_separator/)
1780 {
1781 $test_dirname = $'; #'
1782
1783 #print STDERR "init $'\n";
1784
1785 while ($test_dirname =~ m/[$f_separator]/)
1786 {
1787 my $folderdirname = $`;
[27306]1788 $tmp_dirname = &FileUtils::filenameConcatenate($tmp_dirname,$folderdirname);
[28196]1789 &FileUtils::makeDirectory($tmp_dirname) if (!-e $tmp_dirname);
[20774]1790 $test_dirname = $'; #'
1791 }
1792 }
1793
[27306]1794 my $tmp_filename = &FileUtils::filenameConcatenate($tmp_dirname, "$tailname$suffix");
[20774]1795
1796 # tidy or convert the input file if it is an HTML-like file or it is accepted by the process_exp
1797 if (($suffix eq ".htm") || ($suffix eq ".html") || ($suffix eq ".shtml"))
1798 {
1799 #convert the input file to a new style HDL
1800 my $hdl_output_filename = $input_filename;
1801 if ($self->{'old_style_HDL'})
1802 {
[27306]1803 $hdl_output_filename = &FileUtils::filenameConcatenate($tmp_dirname, "$tailname$suffix");
[20774]1804 $hdl_output_filename = $self->convert_to_newHDLformat($input_filename,$hdl_output_filename);
1805 }
1806
1807 # copy all other files from the base dir to the tmp dir if they aren't already there
1808 opendir(DIR,$base_dirname) or die "Can't open base directory : $base_dirname!";
1809 my @files = grep {!/^\.+$/} readdir(DIR);
1810 closedir(DIR);
1811
1812 foreach my $file (@files)
1813 {
[27306]1814 my $src_file = &FileUtils::filenameConcatenate($base_dirname,$file);
1815 my $dest_file = &FileUtils::filenameConcatenate($tmp_dirname,$file);
[20774]1816 if ((!-e $dest_file) && (!-d $src_file))
1817 {
1818 # just copy the original file back to the tmp directory
1819 copy($src_file,$dest_file) or die "Can't copy file $src_file to $dest_file $!";
1820 }
1821 }
1822
1823 # tidy the input file
1824 my $tidy_output_filename = $hdl_output_filename;
1825 if ($self->{'use_realistic_book'})
1826 {
[27306]1827 $tidy_output_filename = &FileUtils::filenameConcatenate($tmp_dirname, "$tailname$suffix");
[20774]1828 $tidy_output_filename = $self->tmp_tidy_file($hdl_output_filename,$tidy_output_filename);
1829 }
1830 $tmp_filename = $tidy_output_filename;
1831 }
1832 else
1833 {
1834 if (!-e $tmp_filename)
1835 {
1836 # just copy the original file back to the tmp directory
1837 copy($input_filename,$tmp_filename) or die "Can't copy file $input_filename to $tmp_filename $!";
1838 }
1839 }
1840
1841 return $tmp_filename;
1842}
1843
1844
1845 # Converts the html input file into a proper XML file, with font tags removed and
1846 # image sizes added to the img tags.
1847# The tidying process takes place in a collection specific 'tmp' directory so
1848# that we don't accidentally damage the input.
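# As a rough illustration (made-up fragment and sizes): an input line such as
#   <font face="arial"><img src="fig1.png"></font>
# comes out roughly as
#   <img src="fig1.png" width="320" height="240">
# with the width/height taken from Image::Size::imgsize on the actual image
# file, before html-tidy is run over the result to produce well-formed XML.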
1849sub tmp_tidy_file
1850{
1851 my $self = shift (@_);
1852 my ($file,$cnfile) = @_;
1853 my $input_filename = $file;
1854 my $tmp_filename = $cnfile;
1855
1856 # split the input filename into tail, directory and suffix
1857 my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
1858
1859 require HTML::TokeParser::Simple;
1860
1861 # create HTML parser to decode the input file
1862 my $parser = HTML::TokeParser::Simple->new($input_filename);
1863
1864 # write HTML tmp file with font tags removed and image sizes added to the img tags
1865 open (PROD, ">$tmp_filename") || die("Error Writing to File: $tmp_filename $!");
1866 while (my $token = $parser->get_token())
1867 {
1868 # is it an img tag
1869 if ($token->is_start_tag('img'))
1870 {
1871 # get the attributes
1872 my $attr = $token->return_attr;
1873
1874 # get the full path to the image
[27306]1875 my $img_file = &FileUtils::filenameConcatenate($dirname,$attr->{src});
[20774]1876
1877 # set the width and height attribute
1878 ($attr->{width}, $attr->{height}) = imgsize($img_file);
1879
1880 # recreate the tag
1881 print PROD "<img";
1882 print PROD map { qq { $_="$attr->{$_}"} } keys %$attr;
1883 print PROD ">";
1884 }
1885 # is it a font tag
1886 else
1887 {
1888 if (($token->is_start_tag('font')) || ($token->is_end_tag('font')))
1889 {
1890 # remove font tag
1891 print PROD "";
1892 }
1893 else
1894 {
1895 # print without changes
1896 print PROD $token->as_is;
1897 }
1898 }
1899 }
1900 close (PROD) || die("Error Closing File: $tmp_filename $!");
1901
1902 # run html-tidy on the tmp file to make it a proper XML file
1903
[22594]1904 my $outhandle = $self->{'outhandle'};
1905 print $outhandle "Converting HTML to be XML compliant:\n";
1906
1907 my $tidy_cmd = "tidy";
1908 $tidy_cmd .= " -q" if ($self->{'verbosity'} <= 2);
[22636]1909 $tidy_cmd .= " -raw -wrap 0 -asxml \"$tmp_filename\"";
[22594]1910 if ($self->{'verbosity'} <= 2) {
1911 if ($ENV{'GSDLOS'} =~ m/^windows/i) {
1912 $tidy_cmd .= " 2>nul";
1913 }
1914 else {
1915 $tidy_cmd .= " 2>/dev/null";
1916 }
1917 print $outhandle " => $tidy_cmd\n";
1918 }
1919
1920 my $tidyfile = `$tidy_cmd`;
1921
[20774]1922 # write result back to the tmp file
1923 open (PROD, ">$tmp_filename") || die("Error Writing to File: $tmp_filename $!");
1924 print PROD $tidyfile;
1925 close (PROD) || die("Error Closing File: $tmp_filename $!");
1926
1927 # return the output filename
1928 return $tmp_filename;
1929}
1930
[22355]1931sub associate_cover_image
1932{
1933 my $self = shift(@_);
1934 my ($doc_obj, $filename) = @_;
1935 if (($self->{'use_realistic_book'}) || ($self->{'old_style_HDL'}))
1936 {
1937 # we will have cover image in tidytmp, but want it from import
1938 $filename =~ s/([\\\/])tidytmp([\\\/])/$1import$2/;
1939 }
1940 $self->SUPER::associate_cover_image($doc_obj, $filename);
1941}
1942
1943
[14665]19441;