Context Navigation

source: main/trunk/greenstone2/perllib/plugins/HTMLPlugin.pm@ 23387

Last change on this file since 23387 was 23387, checked in by davidb, 13 years ago
Further changes to deal with documents that use different filename encodings on the file-system. Now sets UTF8URL metadata to perform the cross-document look up. Files stored in doc.pm as associated files are now always raw filenames (rather than potentially UTF8 encoded). Storing of filenames seen by HTMLPlug when scanning for files to block on is now done in Unicode aware strings rather than utf8 but unware strings.
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 62.8 KB

Line
1	###########################################################################
2	#
3	# HTMLPlugin.pm -- basic html plugin
4	#
5	# A component of the Greenstone digital library software
6	# from the New Zealand Digital Library Project at the
7	# University of Waikato, New Zealand.
8	#
9	# Copyright (C) 1999 New Zealand Digital Library Project
10	#
11	# This program is free software; you can redistribute it and/or modify
12	# it under the terms of the GNU General Public License as published by
13	# the Free Software Foundation; either version 2 of the License, or
14	# (at your option) any later version.
15	#
16	# This program is distributed in the hope that it will be useful,
17	# but WITHOUT ANY WARRANTY; without even the implied warranty of
18	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	# GNU General Public License for more details.
20	#
21	# You should have received a copy of the GNU General Public License
22	# along with this program; if not, write to the Free Software
23	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24	#
25	###########################################################################
26
27	#
28	# Note that this plugin handles frames only in a very simple way
29	# i.e. each frame is treated as a separate document. This means
30	# search results will contain links to individual frames rather
31	# than linking to the top level frameset.
32	# There may also be some problems caused by the _parent target
33	# (it's removed by this plugin)
34	#
35
36	package HTMLPlugin;
37
38	use Encode;
39	use Unicode::Normalize 'normalize';
40
41	use ReadTextFile;
42	use HBPlugin;
43	use ghtml;
44	use unicode;
45	use util;
46	use XMLParser;
47
48	use Image::Size;
49	use File::Copy;
50
51	sub BEGIN {
52	@HTMLPlugin::ISA = ('ReadTextFile', 'HBPlugin');
53	}
54
55	use strict; # every perl program should have this!
56	no strict 'refs'; # make an exception so we can use variables as filehandles
57
58	my $arguments =
59	[ { 'name' => "process_exp",
60	'desc' => "{BasePlugin.process_exp}",
61	'type' => "regexp",
62	'deft' => &get_default_process_exp() },
63	{ 'name' => "block_exp",
64	'desc' => "{BasePlugin.block_exp}",
65	'type' => 'regexp',
66	'deft' => &get_default_block_exp() },
67	{ 'name' => "nolinks",
68	'desc' => "{HTMLPlugin.nolinks}",
69	'type' => "flag" },
70	{ 'name' => "keep_head",
71	'desc' => "{HTMLPlugin.keep_head}",
72	'type' => "flag" },
73	{ 'name' => "no_metadata",
74	'desc' => "{HTMLPlugin.no_metadata}",
75	'type' => "flag" },
76	{ 'name' => "metadata_fields",
77	'desc' => "{HTMLPlugin.metadata_fields}",
78	'type' => "string",
79	'deft' => "Title" },
80	{ 'name' => "metadata_field_separator",
81	'desc' => "{HTMLPlugin.metadata_field_separator}",
82	'type' => "string",
83	'deft' => "" },
84	{ 'name' => "hunt_creator_metadata",
85	'desc' => "{HTMLPlugin.hunt_creator_metadata}",
86	'type' => "flag" },
87	{ 'name' => "file_is_url",
88	'desc' => "{HTMLPlugin.file_is_url}",
89	'type' => "flag" },
90	{ 'name' => "assoc_files",
91	'desc' => "{HTMLPlugin.assoc_files}",
92	'type' => "regexp",
93	'deft' => &get_default_block_exp() },
94	{ 'name' => "rename_assoc_files",
95	'desc' => "{HTMLPlugin.rename_assoc_files}",
96	'type' => "flag" },
97	{ 'name' => "title_sub",
98	'desc' => "{HTMLPlugin.title_sub}",
99	'type' => "string",
100	'deft' => "" },
101	{ 'name' => "description_tags",
102	'desc' => "{HTMLPlugin.description_tags}",
103	'type' => "flag" },
104	# retain this for backward compatibility (w3mir option was replaced by
105	# file_is_url)
106	{ 'name' => "w3mir",
107	# 'desc' => "{HTMLPlugin.w3mir}",
108	'type' => "flag",
109	'hiddengli' => "yes"},
110	{ 'name' => "no_strip_metadata_html",
111	'desc' => "{HTMLPlugin.no_strip_metadata_html}",
112	'type' => "string",
113	'deft' => "",
114	'reqd' => "no"},
115	{ 'name' => "sectionalise_using_h_tags",
116	'desc' => "{HTMLPlugin.sectionalise_using_h_tags}",
117	'type' => "flag" },
118	{ 'name' => "use_realistic_book",
119	'desc' => "{HTMLPlugin.tidy_html}",
120	'type' => "flag"},
121	{ 'name' => "old_style_HDL",
122	'desc' => "{HTMLPlugin.old_style_HDL}",
123	'type' => "flag"},
124	{'name' => "processing_tmp_files",
125	'desc' => "{BasePlugin.processing_tmp_files}",
126	'type' => "flag",
127	'hiddengli' => "yes"}
128	];
129
130	my $options = { 'name' => "HTMLPlugin",
131	'desc' => "{HTMLPlugin.desc}",
132	'abstract' => "no",
133	'inherits' => "yes",
134	'args' => $arguments };
135
136
137	sub new {
138	my ($class) = shift (@_);
139	my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
140	push(@$pluginlist, $class);
141
142	push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
143	push(@{$hashArgOptLists->{"OptList"}},$options);
144
145
146	my $self = new ReadTextFile($pluginlist,$inputargs,$hashArgOptLists);
147
148	if ($self->{'w3mir'}) {
149	$self->{'file_is_url'} = 1;
150	}
151	$self->{'aux_files'} = {};
152	$self->{'dir_num'} = 0;
153	$self->{'file_num'} = 0;
154
155	return bless $self, $class;
156	}
157
158	# may want to use (?i)\.(gif\|jpe?g\|jpe\|png\|css\|js(?:@.*)?)$
159	# if have eg <script language="javascript" src="img/lib.js@123">
160	# blocking is now done by reading through the file and recording all the
161	# images and other files
162	sub get_default_block_exp {
163	my $self = shift (@_);
164
165	#return q^(?i)\.(gif\|jpe?g\|jpe\|jpg\|png\|css)$^;
166	return "";
167	}
168
169	sub get_default_process_exp {
170	my $self = shift (@_);
171
172	# the last option is an attempt to encode the concept of an html query ...
173	return q^(?i)(\.html?\|\.shtml\|\.shm\|\.asp\|\.php\d?\|\.cgi\|.+\?.+=.*)$^;
174	}
175
176	sub store_block_files
177	{
178	my $self =shift (@_);
179	my ($filename_full_path, $block_hash) = @_;
180
181	my $html_fname = $filename_full_path;
182	my @file_blocks;
183
184	my ($language, $content_encoding) = $self->textcat_get_language_encoding ($filename_full_path);
185	$self->{'store_content_encoding'}->{$filename_full_path} = $content_encoding;
186
187	# read in file ($text will be in utf8)
188	my $raw_text = "";
189	$self->read_file_no_decoding($filename_full_path, \$raw_text);
190
191	my $textref = \$raw_text;
192	my $opencom = '(?:<!--\|<!(?:—\|\|--))';
193	my $closecom = '(?:-->\|(?:—\|\|--)>)';
194	$$textref =~ s/$opencom(.*?)$closecom//gs;
195
196	# Convert entities to their UTF8 equivalents
197	$$textref =~ s/&(lt\|gt\|amp\|quot\|nbsp);/&z$1;/go;
198	$$textref =~ s/&([^;]+);/&ghtml::getcharequiv($1,1,0)/gseo; # on this occassion, want it left as utf8
199	$$textref =~ s/&z(lt\|gt\|amp\|quot\|nbsp);/&$1;/go;
200
201	my $attval = "\\\"[^\\\"]+\\\"\|[^\\s>]+";
202	my @img_matches = ($$textref =~ m/<img[^>]?src\s=\s($attval)[^>]>/igs);
203	my @usemap_matches = ($$textref =~ m/<img[^>]?usemap\s=\s($attval)[^>]>/igs);
204	my @link_matches = ($$textref =~ m/<link[^>]?href\s=\s($attval)[^>]>/igs);
205	my @embed_matches = ($$textref =~ m/<embed[^>]?src\s=\s($attval)[^>]>/igs);
206	my @tabbg_matches = ($$textref =~ m/<(?:body\|table\|tr\|td)[^>]?background\s=\s($attval)[^>]>/igs);
207	my @script_matches = ($$textref =~ m/<script[^>]?src\s=\s($attval)[^>]>/igs);
208
209	if(!defined $self->{'unicode_to_original_filename'}) {
210	# maps from utf8 converted link name -> original filename referrred to by (possibly URL-encoded) src url
211	$self->{'unicode_to_original_filename'} = {};
212	}
213
214	foreach my $raw_link (@img_matches, @usemap_matches, @link_matches, @embed_matches, @tabbg_matches, @script_matches) {
215
216	# remove quotes from link at start and end if necessary
217	if ($raw_link =~ m/^\"/) {
218	$raw_link =~ s/^\"//;
219	$raw_link =~ s/\"$//;
220	}
221
222	# remove any anchor names, e.g. foo.html#name becomes foo.html
223	# but watch out for any #'s that are part of entities, such as α
224	$raw_link =~ s/([^&])\#.*$/$1/s;
225
226	# some links may just be anchor names
227	next unless ($raw_link =~ /\S+/);
228
229	if ($raw_link !~ m@^/@ && $raw_link !~ m/^([A-Z]:?)\\/) {
230	# Turn relative file path into full path
231	my $dirname = &File::Basename::dirname($filename_full_path);
232	$raw_link = &util::filename_cat($dirname, $raw_link);
233	}
234	$raw_link = $self->eval_dir_dots($raw_link);
235
236	# this is the actual filename on the filesystem (that the link refers to)
237	my $url_original_filename = $self->opt_url_decode($raw_link);
238
239	my ($uses_bytecodes,$exceeds_bytecodes) = &unicode::analyze_raw_string($url_original_filename);
240
241	if ($exceeds_bytecodes) {
242	# We have a link to a file name that is more complicated than a raw byte filename
243	# What we do next depends on the operating system we are on
244
245	if ($ENV{'GSDLOS'} =~ /^(linux\|solaris)$/i) {
246	# Assume we're dealing with a UTF-8 encoded filename
247	$url_original_filename = encode("utf8", $url_original_filename);
248	}
249	elsif ($ENV{'GSDLOS'} =~ /^darwin$/i) {
250	# HFS+ is UTF8 with decompostion
251	$url_original_filename = encode("utf8", $url_original_filename);
252	$url_original_filename = normalize('D', $url_original_filename); # Normalization Form D (decomposition)
253	}
254	elsif ($ENV{'GSDLOS'} =~ /^windows$/i) {
255	# Don't need to do anything as later code maps Windows
256	# unicode filenames to DOS short filenames when needed
257	}
258	else {
259	my $outhandle = $self->{'outhandle'};
260	print $outhandle "Warning: Unrecognized operating system ", $ENV{'GSDLOS'}, "\n";
261	print $outhandle " in raw file system encoding of: $raw_link\n";
262	print $outhandle " Assuming filesystem is UTF-8 based.\n";
263	$url_original_filename = encode("utf8", $url_original_filename);
264	}
265	}
266
267	# Convert the (currently raw) link into its Unicode version.
268	# Store the Unicode link along with the url_original_filename
269	my $unicode_url_original_filename = "";
270	$self->decode_text($raw_link,$content_encoding,$language,\$unicode_url_original_filename);
271
272
273	$self->{'unicode_to_original_filename'}->{$unicode_url_original_filename} = $url_original_filename;
274
275
276	if ($url_original_filename ne $unicode_url_original_filename) {
277	my $outhandle = $self->{'outhandle'};
278
279	print $outhandle "URL Encoding $url_original_filename\n";
280	print $outhandle " ->$unicode_url_original_filename\n";
281
282	# Allow for possibility of raw byte version and Unicode versions of file
283	$block_hash->{'file_blocks'}->{$unicode_url_original_filename} = 1;
284
285	}
286
287	$block_hash->{'file_blocks'}->{$url_original_filename} = 1;
288	}
289	}
290
291	# Given a filename in any encoding, will URL decode it to get back the original filename
292	# in the original encoding. Because this method is intended to work out the original
293	# filename, it does not URL decode any filename if a file by the name of the URL-encoded*
294	# string already exists in the local folder.
295	#
296	sub opt_url_decode {
297	my $self = shift (@_);
298	my ($raw_link) = @_;
299
300
301	# Replace %XX's in URL with decoded value if required.
302	# Note that the filename may include the %XX in some situations
303
304	## if ($raw_link =~ m/\%[A-F0-9]{2}/i) {
305
306	if (($raw_link =~ m/\%[A-F0-9]{2}/i) \|\| ($raw_link =~ m/\&\#x[0-9A-F]+;/i) \|\| ($raw_link =~ m/\&\#[0-9]+;/i)) {
307	if (!-e $raw_link) {
308	$raw_link = &unicode::url_decode($raw_link,1);
309	}
310	}
311
312	return $raw_link;
313	}
314
315	sub read_into_doc_obj
316	{
317	my $self = shift (@_);
318	my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
319
320	my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
321
322	# Lookup content_encoding worked out in file_block pass for this file
323	# Store it under the local name 'content_encoding' so its nice and
324	# easy to access
325	$self->{'content_encoding'} = $self->{'store_content_encoding'}->{$filename_full_path};
326
327	# get the input file
328	my $input_filename = $file;
329	my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
330	$suffix = lc($suffix);
331	my $tidy_filename;
332	if (($self->{'use_realistic_book'}) \|\| ($self->{'old_style_HDL'}))
333	{
334	# because the document has to be sectionalized set the description tags
335	$self->{'description_tags'} = 1;
336
337	# set the file to be tidied
338	$input_filename = &util::filename_cat($base_dir,$file) if $base_dir =~ m/\w/;
339
340	# get the tidied file
341	#my $tidy_filename = $self->tmp_tidy_file($input_filename);
342	$tidy_filename = $self->convert_tidy_or_oldHDL_file($input_filename);
343
344	# derive tmp filename from input filename
345	my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($tidy_filename, "\\.[^\\.]+\$");
346
347	# set the new input file and base_dir to be from the tidied file
348	$file = "$tailname$suffix";
349	$base_dir = $dirname;
350	}
351
352	# call the parent read_into_doc_obj
353	my ($process_status,$doc_obj) = $self->SUPER::read_into_doc_obj($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli);
354	if (($self->{'use_realistic_book'}) \|\| ($self->{'old_style_HDL'}))
355	{
356	# now we need to reset the filenames in the doc obj so that the converted filenames are not used
357	my $collect_file = &util::filename_within_collection($filename_full_path);
358	$doc_obj->set_source_filename ($collect_file, $self->{'file_rename_method'});
359	## set_source_filename does not set the doc_obj source_path which is used in archives dbs for incremental
360	# build. So set it manually.
361	$doc_obj->set_source_path($filename_full_path);
362	my $collect_conv_file = &util::filename_within_collection($tidy_filename);
363	$doc_obj->set_converted_filename($collect_conv_file);
364
365	my $plugin_filename_encoding = $self->{'filename_encoding'};
366	my $filename_encoding = $self->deduce_filename_encoding($file,$metadata,$plugin_filename_encoding);
367	$self->set_Source_metadata($doc_obj, $filename_full_path, $filename_encoding);
368	}
369
370	delete $self->{'store_content_encoding'}->{$filename_full_path};
371	$self->{'content_encoding'} = undef;
372
373	return ($process_status,$doc_obj);
374	}
375
376	# do plugin specific processing of doc_obj
377	sub process {
378	my $self = shift (@_);
379	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
380	my $outhandle = $self->{'outhandle'};
381
382	if ($ENV{'GSDLOS'} =~ m/^windows/i) {
383	# this makes life so much easier... perl can cope with unix-style '/'s.
384	$base_dir =~ s@(\\)+@/@g;
385	$file =~ s@(\\)+@/@g;
386	}
387
388	my $filename = &util::filename_cat($base_dir,$file);
389	my $upgraded_base_dir = &util::upgrade_if_dos_filename($base_dir);
390	my $upgraded_filename = &util::upgrade_if_dos_filename($filename);
391
392	if ($ENV{'GSDLOS'} =~ m/^windows/i) {
393	# And again
394	$upgraded_base_dir =~ s@(\\)+@/@g;
395	$upgraded_filename =~ s@(\\)+@/@g;
396
397	# Need to make sure there is a '/' on the end of upgraded_base_dir
398	if (($upgraded_base_dir ne "") && ($upgraded_base_dir !~ m/\/$/)) {
399	$upgraded_base_dir .= "/";
400	}
401	}
402	my $upgraded_file = &util::filename_within_directory($upgraded_filename,$upgraded_base_dir);
403
404	# reset per-doc stuff...
405	$self->{'aux_files'} = {};
406	$self->{'dir_num'} = 0;
407	$self->{'file_num'} = 0;
408
409	# process an HTML file where sections are divided by headings tags (H1, H2 ...)
410	# you can also include metadata in the format (X can be any number)
411	# <hX>Title<!--gsdl-metadata
412	# <Metadata name="name1">value1</Metadata>
413	# ...
414	# <Metadata name="nameN">valueN</Metadata>
415	#--></hX>
416	if ($self->{'sectionalise_using_h_tags'}) {
417	# description_tags should allways be activated because we convert headings to description tags
418	$self->{'description_tags'} = 1;
419
420	my $arrSections = [];
421	$$textref =~ s/<h([0-9]+)[^>]>(.?)<\/h[0-9]+>/$self->process_heading($1, $2, $arrSections, $upgraded_file)/isge;
422
423	if (scalar(@$arrSections)) {
424	my $strMetadata = $self->update_section_data($arrSections, -1);
425	if (length($strMetadata)) {
426	$strMetadata = '<!--' . $strMetadata . "\n-->\n</body>";
427	$$textref =~ s/<\/body>/$strMetadata/ig;
428	}
429	}
430	}
431
432	my $cursection = $doc_obj->get_top_section();
433
434	$self->extract_metadata ($textref, $metadata, $doc_obj, $cursection)
435	unless $self->{'no_metadata'} \|\| $self->{'description_tags'};
436
437	# Store URL for page as metadata - this can be used for an
438	# altavista style search interface. The URL won't be valid
439	# unless the file structure contains the domain name (i.e.
440	# like when w3mir is used to download a website).
441
442	# URL metadata (even invalid ones) are used to support internal
443	# links, so even if 'file_is_url' is off, still need to store info
444
445	my ($tailname,$dirname) = &File::Basename::fileparse($upgraded_file);
446
447	# my $utf8_file = $self->filename_to_utf8_metadata($file);
448	# $utf8_file =~ s/&\#095;/_/g;
449	# variable below used to be utf8_file
450	my $url_encoded_file = &unicode::raw_filename_to_url_encoded($tailname);
451	my $utf8_url_encoded_file = &unicode::raw_filename_to_utf8_url_encoded($tailname);
452
453	my $web_url = "http://";
454	my $utf8_web_url = "http://";
455	if(defined $dirname) { # local directory
456	# Check for "ftp" in the domain name of the directory
457	# structure to determine if this URL should be a ftp:// URL
458	# This check is not infallible, but better than omitting the
459	# check, which would cause all files downloaded from ftp sites
460	# via mirroring with wget to have potentially erroneous http:// URLs
461	# assigned in their metadata
462	if ($dirname =~ /^[^\/]*ftp/i)
463	{
464	$web_url = "ftp://";
465	$utf8_web_url = "ftp://";
466	}
467	$dirname = $self->eval_dir_dots($dirname);
468	$dirname .= &util::get_dirsep() if $dirname ne ""; # if there's a directory, it should end on "/"
469
470	$web_url = $web_url.$dirname.$url_encoded_file;
471	$utf8_web_url = $utf8_web_url.$dirname.$utf8_url_encoded_file;
472	} else {
473	$web_url = $web_url.$url_encoded_file;
474	$utf8_web_url = $utf8_web_url.$utf8_url_encoded_file;
475	}
476	$web_url =~ s/\\/\//g;
477	$utf8_web_url =~ s/\\/\//g;
478
479	if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
480	print STDERR "*******DEBUG: upgraded_file: $upgraded_file\n";
481	print STDERR "*******DEBUG: adding URL metadata: $utf8_url_encoded_file\n";
482	}
483
484
485	$doc_obj->add_utf8_metadata($cursection, "URL", $web_url);
486	$doc_obj->add_utf8_metadata($cursection, "UTF8URL", $utf8_web_url);
487
488	if ($self->{'file_is_url'}) {
489	$doc_obj->add_metadata($cursection, "weblink", "<a href=\"$web_url\">");
490	$doc_obj->add_metadata($cursection, "webicon", "_iconworld_");
491	$doc_obj->add_metadata($cursection, "/weblink", "</a>");
492	}
493
494	if ($self->{'description_tags'}) {
495	# remove the html header - note that doing this here means any
496	# sections defined within the header will be lost (so all <Section>
497	# tags must appear within the body of the HTML)
498	my ($head_keep) = ($$textref =~ m/^(.?)<body[^>]>/is);
499
500	$$textref =~ s/^.?<body[^>]>//is;
501	$$textref =~ s/(<\/body[^>]>\|<\/html[^>]>)//isg;
502
503	my $opencom = '(?:<!--\|<!(?:—\|\|--))';
504	my $closecom = '(?:-->\|(?:—\|\|--)>)';
505
506	my $lt = '(?:<\|<)';
507	my $gt = '(?:>\|>)';
508	my $quot = '(?:"\|"\|”\|“)';
509
510	my $dont_strip = '';
511	if ($self->{'no_strip_metadata_html'}) {
512	($dont_strip = $self->{'no_strip_metadata_html'}) =~ s{,}{\|}g;
513	}
514
515	my $found_something = 0; my $top = 1;
516	while ($$textref =~ s/^(.?)$opencom(.?)$closecom//s) {
517	my $text = $1;
518	my $comment = $2;
519	if (defined $text) {
520	# text before a comment - note that getting to here
521	# doesn't necessarily mean there are Section tags in
522	# the document
523	$self->process_section(\$text, $upgraded_base_dir, $upgraded_file, $doc_obj, $cursection);
524	}
525	while ($comment =~ s/$lt(.*?)$gt//s) {
526	my $tag = $1;
527	if ($tag eq "Section") {
528	$found_something = 1;
529	$cursection = $doc_obj->insert_section($doc_obj->get_end_child($cursection)) unless $top;
530	$top = 0;
531	} elsif ($tag eq "/Section") {
532	$found_something = 1;
533	$cursection = $doc_obj->get_parent_section ($cursection);
534	} elsif ($tag =~ m/^Metadata name=$quot(.*?)$quot/s) {
535	my $metaname = $1;
536	my $accumulate = $tag =~ m/mode=${quot}accumulate${quot}/ ? 1 : 0;
537	$comment =~ s/^(.*?)$lt\/Metadata$gt//s;
538	my $metavalue = $1;
539	$metavalue =~ s/^\s+//;
540	$metavalue =~ s/\s+$//;
541	# assume that no metadata value intentionally includes
542	# carriage returns or HTML tags (if they're there they
543	# were probably introduced when converting to HTML from
544	# some other format).
545	# actually some people want to have html tags in their
546	# metadata.
547	$metavalue =~ s/[\cJ\cM]/ /sg;
548	$metavalue =~ s/<[^>]+>//sg
549	unless $dont_strip && ($dont_strip eq 'all' \|\| $metaname =~ m/^($dont_strip)$/);
550	$metavalue =~ s/\s+/ /sg;
551	if ($metaname =~ /\./) { # has a namespace
552	$metaname = "ex.$metaname";
553	}
554	if ($accumulate) {
555	$doc_obj->add_utf8_metadata($cursection, $metaname, $metavalue);
556	} else {
557	$doc_obj->set_utf8_metadata_element($cursection, $metaname, $metavalue);
558	}
559	} elsif ($tag eq "Description" \|\| $tag eq "/Description") {
560	# do nothing with containing Description tags
561	} else {
562	# simple HTML tag (probably created by the conversion
563	# to HTML from some other format) - we'll ignore it and
564	# hope for the best ;-)
565	}
566	}
567	}
568	if ($cursection ne "") {
569	print $outhandle "HTMLPlugin: WARNING: $upgraded_file contains unmatched <Section></Section> tags\n";
570	}
571
572	$$textref =~ s/^.?<body[^>]>//is;
573	$$textref =~ s/(<\/body[^>]>\|<\/html[^>]>)//isg;
574	if ($$textref =~ m/\S/) {
575	if (!$found_something) {
576	if ($self->{'verbosity'} > 2) {
577	print $outhandle "HTMLPlugin: WARNING: $upgraded_file appears to contain no Section tags so\n";
578	print $outhandle " will be processed as a single section document\n";
579	}
580
581	# go ahead and process single-section document
582	$self->process_section($textref, $upgraded_base_dir, $upgraded_file, $doc_obj, $cursection);
583
584	# if document contains no Section tags we'll go ahead
585	# and extract metadata (this won't have been done
586	# above as the -description_tags option prevents it)
587	my $complete_text = $head_keep.$doc_obj->get_text($cursection);
588	$self->extract_metadata (\$complete_text, $metadata, $doc_obj, $cursection)
589	unless $self->{'no_metadata'};
590
591	} else {
592	print $outhandle "HTMLPlugin: WARNING: $upgraded_file contains the following text outside\n";
593	print $outhandle " of the final closing </Section> tag. This text will\n";
594	print $outhandle " be ignored.";
595
596	my ($text);
597	if (length($$textref) > 30) {
598	$text = substr($$textref, 0, 30) . "...";
599	} else {
600	$text = $$textref;
601	}
602	$text =~ s/\n/ /isg;
603	print $outhandle " ($text)\n";
604	}
605	} elsif (!$found_something) {
606
607	if ($self->{'verbosity'} > 2) {
608	# may get to here if document contained no valid Section
609	# tags but did contain some comments. The text will have
610	# been processed already but we should print the warning
611	# as above and extract metadata
612	print $outhandle "HTMLPlugin: WARNING: $upgraded_file appears to contain no Section tags and\n";
613	print $outhandle " is blank or empty. Metadata will be assigned if present.\n";
614	}
615
616	my $complete_text = $head_keep.$doc_obj->get_text($cursection);
617	$self->extract_metadata (\$complete_text, $metadata, $doc_obj, $cursection)
618	unless $self->{'no_metadata'};
619	}
620
621	} else {
622
623	# remove header and footer
624	if (!$self->{'keep_head'} \|\| $self->{'description_tags'}) {
625	$$textref =~ s/^.?<body[^>]>//is;
626	$$textref =~ s/(<\/body[^>]>\|<\/html[^>]>)//isg;
627	}
628
629
630	# single section document
631	$self->process_section($textref, $upgraded_base_dir, $upgraded_file, $doc_obj, $cursection);
632	}
633
634	return 1;
635	}
636
637
638	sub process_heading
639	{
640	my ($self, $nHeadNo, $strHeadingText, $arrSections, $file) = @_;
641	$strHeadingText = '' if (!defined($strHeadingText));
642
643	my $strMetadata = $self->update_section_data($arrSections, int($nHeadNo));
644
645	my $strSecMetadata = '';
646	while ($strHeadingText =~ s/<!--gsdl-metadata(.*?)-->//is)
647	{
648	$strSecMetadata .= $1;
649	}
650
651	$strHeadingText =~ s/^\s+//g;
652	$strHeadingText =~ s/\s+$//g;
653	$strSecMetadata =~ s/^\s+//g;
654	$strSecMetadata =~ s/\s+$//g;
655
656	$strMetadata .= "\n<Section>\n\t<Description>\n\t\t<Metadata name=\"Title\">" . $strHeadingText . "</Metadata>\n";
657
658	if (length($strSecMetadata)) {
659	$strMetadata .= "\t\t" . $strSecMetadata . "\n";
660	}
661
662	$strMetadata .= "\t</Description>\n";
663
664	return "<!--" . $strMetadata . "-->";
665	}
666
667
668	sub update_section_data
669	{
670	my ($self, $arrSections, $nCurTocNo) = @_;
671	my ($strBuffer, $nLast, $nSections) = ('', 0, scalar(@$arrSections));
672
673	if ($nSections == 0) {
674	push @$arrSections, $nCurTocNo;
675	return $strBuffer;
676	}
677	$nLast = $arrSections->[$nSections - 1];
678	if ($nCurTocNo > $nLast) {
679	push @$arrSections, $nCurTocNo;
680	return $strBuffer;
681	}
682	for(my $i = $nSections - 1; $i >= 0; $i--) {
683	if ($nCurTocNo <= $arrSections->[$i]) {
684	$strBuffer .= "\n</Section>";
685	pop @$arrSections;
686	}
687	}
688	push @$arrSections, $nCurTocNo;
689	return $strBuffer;
690	}
691
692
693	# note that process_section may be called multiple times for a single
694	# section (relying on the fact that add_utf8_text appends the text to any
695	# that may exist already).
696	sub process_section {
697	my $self = shift (@_);
698	my ($textref, $base_dir, $file, $doc_obj, $cursection) = @_;
699	# trap links
700	if (!$self->{'nolinks'}) {
701	# usemap="./#index" not handled correctly => change to "#index"
702	## $$textref =~ s/(<img[^>]?usemap\s=\s[\"\']?)([^\"\'>\s]+)([\"\']?[^>]>)/
703
704	my $opencom = '(?:<!--\|<!(?:—\|\|--))';
705	my $closecom = '(?:-->\|(?:—\|\|--)>)';
706
707	$$textref =~ s/(<img[^>]?usemap\s=\s)((?:[\"][^\"]+[\"])\|(?:[\'][^\']+[\'])\|(?:[^\s\/>]+))([^>]>)/
708	$self->replace_usemap_links($1, $2, $3)/isge;
709
710	## $$textref =~ s/(<(?:a\|area\|frame\|link\|script)\s+[^>]?\s(?:href\|src)\s=\s[\"\']?)([^\"\'>\s]+)([\"\']?[^>]*>)/
711
712	$$textref =~ s/($opencom.?)?+(<(?:a\|area\|frame\|link\|script)\s+[^>]?\s(?:href\|src)\s=\s)((?:[\"][^\"]+[\"])\|(?:[\'][^\']+[\'])\|(?:[^\s\/>]+))([^>]>)(.*?$closecom)?+/
713	$self->replace_href_links ($1, $2, $3, $4, $5, $base_dir, $file, $doc_obj, $cursection)/isge;
714	}
715
716	# trap images
717
718	# Previously, by default, HTMLPlugin would embed <img> tags inside anchor tags
719	# i.e. <a href="image><img src="image"></a> in order to overcome a problem that
720	# turned regular text succeeding images into links. That is, by embedding <imgs>
721	# inside <a href=""></a>, the text following images were no longer misbehaving.
722	# However, there would be many occasions whereby images were not meant to link
723	# to their source images but where the images would link to another web page.
724	# To allow this, the no_image_links option was introduced: it would prevent
725	# the behaviour of embedding images into links that referenced the source images.
726
727	# Somewhere along the line, the problem of normal text turning into links when
728	# such text followed images which were not embedded in <a href=""></a> ceased
729	# to occur. This is why the following lines have been commented out (as well as
730	# two lines in replace_images). They appear to no longer apply.
731
732	# If at any time, there is a need for having images embedded in <a> anchor tags,
733	# then it might be better to turn that into an HTMLPlugin option rather than make
734	# it the default behaviour. Also, eventually, no_image_links needs to become
735	# a deprecated option for HTMLPlugin as it has now become the default behaviour.
736
737	#if(!$self->{'no_image_links'}){
738	$$textref =~ s/(<(?:img\|embed\|table\|tr\|td)[^>]?(?:src\|background)\s=\s)((?:[\"][^\"]+[\"])\|(?:[\'][^\']+[\'])\|(?:[^\s\/>]+))([^>]>)/
739	$self->replace_images ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
740	#}
741
742	# add text to document object
743	# turn \ into \\ so that the rest of greenstone doesn't think there
744	# is an escape code following. (Macro parsing loses them...)
745	$$textref =~ s/\\/\\\\/go;
746
747	$doc_obj->add_utf8_text($cursection, $$textref);
748	}
749
750	sub replace_images {
751	my $self = shift (@_);
752	my ($front, $link, $back, $base_dir,
753	$file, $doc_obj, $section) = @_;
754
755	# remove quotes from link at start and end if necessary
756	if ($link=~/^[\"\']/) {
757	$link=~s/^[\"\']//;
758	$link=~s/[\"\']$//;
759	$front.='"';
760	$back="\"$back";
761	}
762
763	$link =~ s/\n/ /g;
764
765	# Hack to overcome Windows wv 0.7.1 bug that causes embedded images to be broken
766	# If the Word file path has spaces in it, wv messes up and you end up with
767	# absolute paths for the images, and without the "file://" prefix
768	# So check for this special case and massage the data to be correct
769	if ($ENV{'GSDLOS'} =~ m/^windows/i && $self->{'plugin_type'} eq "WordPlug" && $link =~ m/^[A-Za-z]\:\\/) {
770	$link =~ s/^.*\\([^\\]+)$/$1/;
771	}
772
773	my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
774
775	my $img_file = $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section);
776
777	# print STDERR "** link = $link\n href = $href\n** img_file = $img_file, rl = $rl\n";
778
779	my $anchor_name = $img_file;
780	#$anchor_name =~ s/^.*\///;
781	#$anchor_name = "<a name=\"$anchor_name\" ></a>";
782
783	my $image_link = $front . $img_file .$back;
784	return $image_link;
785
786	# The reasons for why the following two lines are no longer necessary can be
787	# found in subroutine process_section
788	#my $anchor_link = "<a href=\"$img_file\" >".$image_link."</a>";
789	#return $anchor_link;
790
791	#return $front . $img_file . $back . $anchor_name;
792	}
793
794	sub replace_href_links {
795	my $self = shift (@_);
796	my ($opt_open_comm, $front, $link, $back, $opt_close_comm, $base_dir, $file, $doc_obj, $section) = @_;
797
798	if ((defined $opt_open_comm) && (defined $opt_close_comm)) {
799	# href link was embedded in <!-- comments -->
800	### print STDERR "****** Link in comment, skipping $link\n";
801	return $opt_open_comm . $front . $link . $back . $opt_close_comm;
802	}
803
804	# remove quotes from link at start and end if necessary
805	if ($link=~/^[\"\']/) {
806	$link=~s/^[\"\']//;
807	$link=~s/[\"\']$//;
808	$front.='"';
809	$back="\"$back";
810	}
811
812	# attempt to sort out targets - frames are not handled
813	# well in this plugin and some cases will screw things
814	# up - e.g. the _parent target (so we'll just remove
815	# them all ;-)
816	$front =~ s/(target=\"?)_top(\"?)/$1_gsdltop_$2/is;
817	$back =~ s/(target=\"?)_top(\"?)/$1_gsdltop_$2/is;
818	$front =~ s/target=\"?_parent\"?//is;
819	$back =~ s/target=\"?_parent\"?//is;
820
821	return $front . $link . $back if $link =~ m/^\#/s;
822	$link =~ s/\n/ /g;
823
824	# Find file referred to by $link on file system
825	# This is more complicated than it sounds when char encodings
826	# is taken in to account
827	my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
828
829	# href may use '\'s where '/'s should be on Windows
830	$href =~ s/\\/\//g;
831	my ($filename) = $href =~ m/^(?:.?):(?:\/\/)?(.)/;
832
833
834	##### leave all these links alone (they won't be picked up by intermediate
835	##### pages). I think that's safest when dealing with frames, targets etc.
836	##### (at least until I think of a better way to do it). Problems occur with
837	##### mailto links from within small frames, the intermediate page is displayed
838	##### within that frame and can't be seen. There is still potential for this to
839	##### happen even with html pages - the solution seems to be to somehow tell
840	##### the browser from the server side to display the page being sent (i.e.
841	##### the intermediate page) in the top level window - I'm not sure if that's
842	##### possible - the following line should probably be deleted if that can be done
843	return $front . $link . $back if $href =~ m/^(mailto\|news\|gopher\|nntp\|telnet\|javascript):/is;
844
845	if (($rl == 0) \|\| ($filename =~ m/$self->{'process_exp'}/) \|\|
846	($href =~ m/\/$/) \|\| ($href =~ m/^(mailto\|news\|gopher\|nntp\|telnet\|javascript):/i)) {
847
848	if ($ENV{'GSDLOS'} =~ m/^windows$/) {
849
850	# Don't do any encoding for now, as not clear what
851	# the right thing to do is to support filename
852	# encoding on Windows when they are not UTF16
853	#
854	}
855	else {
856	# => Unix-based system
857
858	# If web page didn't give encoding, then default to utf8
859	my $content_encoding= $self->{'content_encoding'} \|\| "utf8";
860
861	if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
862	print STDERR "**** Encoding with '$content_encoding', href: $href\n";
863	}
864
865	$href = encode($content_encoding,$href);
866	}
867
868	$href = &unicode::raw_filename_to_utf8_url_encoded($href);
869	$href = &unicode::filename_to_url($href);
870
871	&ghtml::urlsafe ($href);
872
873	if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
874	print STDERR "******DEBUG: href=$href\n";
875	}
876
877
878	return $front . "_httpextlink_&rl=" . $rl . "&href=" . $href . $hash_part . $back;
879	} else {
880	# link is to some other type of file (e.g., an image) so we'll
881	# need to associate that file
882	return $front . $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section) . $back;
883	}
884	}
885
886	sub add_file {
887	my $self = shift (@_);
888	my ($href, $rl, $hash_part, $base_dir, $doc_obj, $section) = @_;
889	my ($newname);
890
891	my $filename = $href;
892	if ($base_dir eq "") {
893	if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
894	# remove http://
895	$filename =~ s/^[^:]*:\/\///;
896	}
897	else {
898	# remove http:/ thereby leaving one slash at the start as
899	# part of full pathname
900	$filename =~ s/^[^:]*:\///;
901	}
902	}
903	else {
904	# remove http://
905	$filename =~ s/^[^:]*:\/\///;
906	}
907
908	$filename = &util::filename_cat($base_dir, $filename);
909
910	if (($self->{'use_realistic_book'}) \|\| ($self->{'old_style_HDL'})) {
911	# we are processing a tidytmp file - want paths to be in import
912	$filename =~ s/([\\\/])tidytmp([\\\/])/$1import$2/;
913	}
914
915	# Replace %XX's in URL with decoded value if required. Note that the
916	# filename may include the %XX in some situations. If the original
917	# file's name was in URL encoding, the following method will not decode
918	# it.
919	my $unicode_filename = $filename;
920	my $opt_decode_unicode_filename = $self->opt_url_decode($unicode_filename);
921
922	# wvWare can generate <img src="StrangeNoGraphicData"> tags, but with no
923	# (it seems) accompanying file
924	if ($opt_decode_unicode_filename =~ m/StrangeNoGraphicData$/) { return ""; }
925
926	my $content_encoding= $self->{'content_encoding'} \|\| "utf8";
927
928	if ($ENV{'GSDLOS'} =~ /^(linux\|solaris)$/i) {
929	# The filenames that come through the HTML file have been decoded
930	# into Unicode aware Perl strings. Need to convert them back
931	# to their initial raw-byte encoding to match the file that
932	# exists on the file system
933	$filename = encode($content_encoding, $opt_decode_unicode_filename);
934	}
935	elsif ($ENV{'GSDLOS'} =~ /^darwin$/i) {
936	# HFS+ is UTF8 with decompostion
937	$filename = encode($content_encoding, $opt_decode_unicode_filename);
938	$filename = normalize('D', $filename); # Normalization Form D (decomposition)
939
940	}
941	elsif ($ENV{'GSDLOS'} =~ /^windows$/i) {
942	my $long_filename = Win32::GetLongPathName($opt_decode_unicode_filename);
943
944	if (defined $long_filename) {
945	my $short_filename = Win32::GetLongPathName($long_filename);
946	$filename = $short_filename;
947	}
948	# else {
949	# print STDERR "***** failed to map href to real file:\n";
950	# print STDERR "****** $href -> $opt_decode_unicode_filename\n";
951	# }
952	}
953	else {
954	my $outhandle = $self->{'outhandle'};
955	print $outhandle "Warning: Unrecognized operating system ", $ENV{'GSDLOS'}, "\n";
956	print $outhandle " in file system encoding of href: $href\n";
957	print $outhandle " No character encoding done.\n";
958	}
959
960
961	# some special processing if the intended filename was converted to utf8, but
962	# the actual file still needs to be renamed
963	if (!&util::fd_exists($filename)) {
964	# try the original filename stored in map
965	if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
966	print STDERR "******!! orig filename did not exist: $filename\n";
967	}
968
969	## print STDERR "**** trying to look up unicode_filename: $unicode_filename\n";
970
971	my $original_filename = $self->{'unicode_to_original_filename'}->{$unicode_filename};
972
973	if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
974	print STDERR "****** From lookup unicode_filename, now trying for: $original_filename\n";
975	}
976
977	if (defined $original_filename && -e $original_filename) {
978	if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
979	print STDERR "****** Found match!\n";
980	}
981	$filename = $original_filename;
982	}
983	}
984
985	my ($ext) = $filename =~ m/(\.[^\.]*)$/;
986
987	if ($rl == 0) {
988	if ((!defined $ext) \|\| ($ext !~ m/$self->{'assoc_files'}/)) {
989	return "_httpextlink_&rl=0&el=prompt&href=" . $href . $hash_part;
990	}
991	else {
992	return "_httpextlink_&rl=0&el=direct&href=" . $href . $hash_part;
993	}
994	}
995
996	if ((!defined $ext) \|\| ($ext !~ m/$self->{'assoc_files'}/)) {
997	return "_httpextlink_&rl=" . $rl . "&href=" . $href . $hash_part;
998	}
999	# add the original image file as a source file
1000	if (!$self->{'processing_tmp_files'} ) {
1001	$doc_obj->associate_source_file($filename);
1002	}
1003	if ($self->{'rename_assoc_files'}) {
1004	if (defined $self->{'aux_files'}->{$href}) {
1005	$newname = $self->{'aux_files'}->{$href}->{'dir_num'} . "/" .
1006	$self->{'aux_files'}->{$href}->{'file_num'} . $ext;
1007	} else {
1008	$newname = $self->{'dir_num'} . "/" . $self->{'file_num'} . $ext;
1009	$self->{'aux_files'}->{$href} = {'dir_num' => $self->{'dir_num'}, 'file_num' => $self->{'file_num'}};
1010	$self->inc_filecount ();
1011	}
1012	$doc_obj->associate_file($filename, $newname, undef, $section);
1013	return "_httpdocimg_/$newname";
1014	} else {
1015	if(&unicode::is_url_encoded($unicode_filename)) {
1016	# use the possibly-decoded filename instead to avoid double URL encoding
1017	($newname) = $filename =~ m/([^\/\\]*)$/;
1018	} else {
1019	($newname) = $unicode_filename =~ m/([^\/\\]*)$/;
1020	}
1021
1022	# Make sure this name uses only ASCII characters.
1023	# We use either base64 or URL encoding, as these preserve original encoding
1024	$newname = &util::rename_file($newname, $self->{'file_rename_method'});
1025
1026	### print STDERR "***** associating $filename (raw-byte/utf8)-> $newname\n";
1027	$doc_obj->associate_file($filename, $newname, undef, $section);
1028
1029	# Since the generated image will be URL-encoded to avoid file-system/browser mess-ups
1030	# of filenames, URL-encode the additional percent signs of the URL-encoded filename
1031	my $newname_url = $newname;
1032	$newname_url = &unicode::filename_to_url($newname_url);
1033	return "_httpdocimg_/$newname_url";
1034	}
1035	}
1036
1037
1038	sub format_link {
1039	my $self = shift (@_);
1040	my ($link, $base_dir, $file) = @_;
1041
1042	# strip off hash part, e.g. #foo, but watch out for any entities, e.g. α
1043	my ($before_hash, $hash_part) = $link =~ m/^(.?[^&])(\#.)?$/;
1044
1045	$hash_part = "" if !defined $hash_part;
1046	if (!defined $before_hash \|\| $before_hash !~ m/[\w\.\/]/) {
1047	my $outhandle = $self->{'outhandle'};
1048	print $outhandle "HTMLPlugin: ERROR - badly formatted tag ignored ($link)\n"
1049	if $self->{'verbosity'};
1050	return ($link, "", 0);
1051	}
1052
1053	if ($before_hash =~ s@^((?:http\|https\|ftp\|file\|mms)://)@@i) {
1054	my $type = $1;
1055
1056	if ($link =~ m/^(http\|ftp):/i) {
1057	# Turn url (using /) into file name (possibly using \ on windows)
1058	my @http_dir_split = split('/', $before_hash);
1059	$before_hash = &util::filename_cat(@http_dir_split);
1060	}
1061
1062	$before_hash = $self->eval_dir_dots($before_hash);
1063
1064	my $linkfilename = &util::filename_cat ($base_dir, $before_hash);
1065
1066	my $rl = 0;
1067	$rl = 1 if (-e $linkfilename);
1068
1069	# make sure there's a slash on the end if it's a directory
1070	if ($before_hash !~ m/\/$/) {
1071	$before_hash .= "/" if (-d $linkfilename);
1072	}
1073	return ($type . $before_hash, $hash_part, $rl);
1074
1075	} elsif ($link !~ m/^(mailto\|news\|gopher\|nntp\|telnet\|javascript):/i && $link !~ m/^\//) {
1076
1077	if ($before_hash =~ s@^/@@ \|\| $before_hash =~ m/\\/) {
1078
1079	# the first directory will be the domain name if file_is_url
1080	# to generate archives, otherwise we'll assume all files are
1081	# from the same site and base_dir is the root
1082
1083	if ($self->{'file_is_url'}) {
1084	my @dirs = split /[\/\\]/, $file;
1085	my $domname = shift (@dirs);
1086	$before_hash = &util::filename_cat($domname, $before_hash);
1087	$before_hash =~ s@\\@/@g; # for windows
1088	}
1089	else
1090	{
1091	# see if link shares directory with source document
1092	# => turn into relative link if this is so!
1093
1094	if ($ENV{'GSDLOS'} =~ m/^windows/i) {
1095	# too difficult doing a pattern match with embedded '\'s...
1096	my $win_before_hash=$before_hash;
1097	$win_before_hash =~ s@(\\)+@/@g;
1098	# $base_dir is already similarly "converted" on windows.
1099	if ($win_before_hash =~ s@^$base_dir/@@o) {
1100	# if this is true, we removed a prefix
1101	$before_hash=$win_before_hash;
1102	}
1103	}
1104	else {
1105	# before_hash has lost leading slash by this point,
1106	# -> add back in prior to substitution with $base_dir
1107	$before_hash = "/$before_hash";
1108
1109	$before_hash = &util::filename_cat("",$before_hash);
1110	$before_hash =~ s@^$base_dir/@@;
1111	}
1112	}
1113	} else {
1114	# Turn relative file path into full path
1115	my $dirname = &File::Basename::dirname($file);
1116	$before_hash = &util::filename_cat($dirname, $before_hash);
1117	$before_hash = $self->eval_dir_dots($before_hash);
1118	}
1119
1120	my $linkfilename = &util::filename_cat ($base_dir, $before_hash);
1121
1122
1123	# print STDERR "**** linkfilename = $linkfilename\n";
1124	# if (!&util::fd_exists($linkfilename)) {
1125	# print STDERR "***** Warning: Could not find $linkfilename\n";
1126	# }
1127
1128
1129	# make sure there's a slash on the end if it's a directory
1130	if ($before_hash !~ m/\/$/) {
1131	$before_hash .= "/" if (-d $linkfilename);
1132	}
1133
1134	# print STDERR "*** returning: $before_hash\n";
1135
1136	return ("http://" . $before_hash, $hash_part, 1);
1137	} else {
1138	# mailto, news, nntp, telnet, javascript or gopher link
1139	return ($before_hash, "", 0);
1140	}
1141	}
1142
1143	sub extract_first_NNNN_characters {
1144	my $self = shift (@_);
1145	my ($textref, $doc_obj, $thissection) = @_;
1146
1147	foreach my $size (split /,/, $self->{'first'}) {
1148	my $tmptext = $$textref;
1149	# skip to the body
1150	$tmptext =~ s/.<body[^>]>//i;
1151	# remove javascript
1152	$tmptext =~ s@<script.*?</script>@ @sig;
1153	$tmptext =~ s/<[^>]*>/ /g;
1154	$tmptext =~ s/ / /g;
1155	$tmptext =~ s/^\s+//;
1156	$tmptext =~ s/\s+$//;
1157	$tmptext =~ s/\s+/ /gs;
1158	$tmptext = &unicode::substr ($tmptext, 0, $size);
1159	$tmptext =~ s/\s\S*$/…/; # adds an ellipse (...)
1160	$doc_obj->add_utf8_metadata ($thissection, "First$size", $tmptext);
1161	}
1162	}
1163
1164
1165	sub extract_metadata {
1166	my $self = shift (@_);
1167	my ($textref, $metadata, $doc_obj, $section) = @_;
1168	my $outhandle = $self->{'outhandle'};
1169	# if we don't want metadata, we may as well not be here ...
1170	return if (!defined $self->{'metadata_fields'});
1171
1172	my $separator = $self->{'metadata_field_separator'};
1173	if ($separator eq "") {
1174	undef $separator;
1175	}
1176
1177	# metadata fields to extract/save. 'key' is the (lowercase) name of the
1178	# html meta, 'value' is the metadata name for greenstone to use
1179	my %find_fields = ();
1180
1181	my %creator_fields = (); # short-cut for lookups
1182
1183
1184	foreach my $field (split /,/, $self->{'metadata_fields'}) {
1185	$field =~ s/^\s+//; # remove leading whitespace
1186	$field =~ s/\s+$//; # remove trailing whitespace
1187
1188	# support tag<tagname>
1189	if ($field =~ m/^(.?)\s<(.*?)>$/) {
1190	# "$2" is the user's preferred gs metadata name
1191	$find_fields{lc($1)}=$2; # lc = lowercase
1192	} else { # no <tagname> for mapping
1193	# "$field" is the user's preferred gs metadata name
1194	$find_fields{lc($field)}=$field; # lc = lowercase
1195	}
1196	}
1197
1198	if (defined $self->{'hunt_creator_metadata'} &&
1199	$self->{'hunt_creator_metadata'} == 1 ) {
1200	my @extra_fields =
1201	(
1202	'author',
1203	'author.email',
1204	'creator',
1205	'dc.creator',
1206	'dc.creator.corporatename',
1207	);
1208
1209	# add the creator_metadata fields to search for
1210	foreach my $field (@extra_fields) {
1211	$creator_fields{$field}=0; # add to lookup hash
1212	}
1213	}
1214
1215
1216	# find the header in the html file, which has the meta tags
1217	$$textref =~ m@<head>(.*?)</head>@si;
1218
1219	my $html_header=$1;
1220
1221	# go through every <meta... tag defined in the html and see if it is
1222	# one of the tags we want to match.
1223
1224	# special case for title - we want to remember if its been found
1225	my $found_title = 0;
1226	# this assumes that ">" won't appear. (I don't think it's allowed to...)
1227	$html_header =~ m/^/; # match the start of the string, for \G assertion
1228
1229	while ($html_header =~ m/\G.?<meta(.?)>/sig) {
1230	my $metatag=$1;
1231	my ($tag, $value);
1232
1233	# find the tag name
1234	$metatag =~ m/(?:name\|http-equiv)\s=\s([\"\'])?(.*?)\1/is;
1235	$tag=$2;
1236	# in case they're not using " or ', but they should...
1237	if (! $tag) {
1238	$metatag =~ m/(?:name\|http-equiv)\s=\s([^\s\>]+)/is;
1239	$tag=$1;
1240	}
1241
1242	if (!defined $tag) {
1243	print $outhandle "HTMLPlugin: can't find NAME in \"$metatag\"\n";
1244	next;
1245	}
1246
1247	# don't need to assign this field if it was passed in from a previous
1248	# (recursive) plugin
1249	if (defined $metadata->{$tag}) {next}
1250
1251	# find the tag content
1252	$metatag =~ m/content\s=\s([\"\'])?(.*?)\1/is;
1253	$value=$2;
1254
1255	if (! $value) {
1256	$metatag =~ m/(?:name\|http-equiv)\s=\s([^\s\>]+)/is;
1257	$value=$1;
1258	}
1259	if (!defined $value) {
1260	print $outhandle "HTMLPlugin: can't find VALUE in \"$metatag\"\n";
1261	next;
1262	}
1263
1264	# clean up and add
1265	$value =~ s/\s+/ /gs;
1266	chomp($value); # remove trailing \n, if any
1267	if (exists $creator_fields{lc($tag)}) {
1268	# map this value onto greenstone's "Creator" metadata
1269	$tag='Creator';
1270	} elsif (!exists $find_fields{lc($tag)}) {
1271	next; # don't want this tag
1272	} else {
1273	# get the user's preferred capitalisation
1274	$tag = $find_fields{lc($tag)};
1275	}
1276	if (lc($tag) eq "title") {
1277	$found_title = 1;
1278	}
1279
1280	if ($self->{'verbosity'} > 2) {
1281	print $outhandle " extracted \"$tag\" metadata \"$value\"\n";
1282	}
1283
1284	if ($tag =~ /\./) {
1285	# there is a . so has a namespace, add ex.
1286	$tag = "ex.$tag";
1287	}
1288	if (defined $separator) {
1289	my @values = split($separator, $value);
1290	foreach my $v (@values) {
1291	$doc_obj->add_utf8_metadata($section, $tag, $v) if $v =~ /\S/;
1292	}
1293	}
1294	else {
1295	$doc_obj->add_utf8_metadata($section, $tag, $value);
1296	}
1297	}
1298
1299	# TITLE: extract the document title
1300	if (exists $find_fields{'title'} && !$found_title) {
1301	# we want a title, and didn't find one in the meta tags
1302	# see if there's a <title> tag
1303	my $title;
1304	my $from = ""; # for debugging output only
1305	if ($html_header =~ m/<title[^>]>([^<]+)<\/title[^>]>/is) {
1306	$title = $1;
1307	$from = "<title> tags";
1308	}
1309
1310	if (!defined $title) {
1311	$from = "first 100 chars";
1312	# if no title use first 100 or so characters
1313	$title = $$textref;
1314	$title =~ s/^\xFE\xFF//; # Remove unicode byte order mark
1315	$title =~ s/^.*?<body>//si;
1316	# ignore javascript!
1317	$title =~ s@<script.*?</script>@ @sig;
1318	$title =~ s/<\/([^>]+)><\1>//g; # (eg) </b><b> - no space
1319	$title =~ s/<[^>]*>/ /g; # remove all HTML tags
1320	$title = substr ($title, 0, 100);
1321	$title =~ s/\s\S*$/.../;
1322	}
1323	$title =~ s/<[^>]*>/ /g; # remove html tags
1324	$title =~ s/ / /g;
1325	$title =~ s/(?: \|\xc2\xa0)/ /g; # utf-8 for nbsp...
1326	$title =~ s/\s+/ /gs; # collapse multiple spaces
1327	$title =~ s/^\s*//; # remove leading spaces
1328	$title =~ s/\s*$//; # remove trailing spaces
1329
1330	$title =~ s/^$self->{'title_sub'}// if ($self->{'title_sub'});
1331	$title =~ s/^\s+//s; # in case title_sub introduced any...
1332	$doc_obj->add_utf8_metadata ($section, "Title", $title);
1333	print $outhandle " extracted Title metadata \"$title\" from $from\n"
1334	if ($self->{'verbosity'} > 2);
1335	}
1336
1337	# add FileFormat metadata
1338	$doc_obj->add_metadata($section,"FileFormat", "HTML");
1339
1340	# Special, for metadata names such as tagH1 - extracts
1341	# the text between the first <H1> and </H1> tags into "H1" metadata.
1342
1343	foreach my $field (keys %find_fields) {
1344	if ($field !~ m/^tag([a-z0-9]+)$/i) {next}
1345	my $tag = $1;
1346	if ($$textref =~ m@<$tag[^>]>(.?)</$tag[^>]*>@g) {
1347	my $content = $1;
1348	$content =~ s/ / /g;
1349	$content =~ s/<[^>]*>/ /g;
1350	$content =~ s/^\s+//;
1351	$content =~ s/\s+$//;
1352	$content =~ s/\s+/ /gs;
1353	if ($content) {
1354	$tag=$find_fields{"tag$tag"}; # get the user's capitalisation
1355	$tag =~ s/^tag//i;
1356	$doc_obj->add_utf8_metadata ($section, $tag, $content);
1357	print $outhandle " extracted \"$tag\" metadata \"$content\"\n"
1358	if ($self->{'verbosity'} > 2);
1359	}
1360	}
1361	}
1362	}
1363
1364
1365	# evaluate any "../" to next directory up
1366	# evaluate any "./" as here
1367	sub eval_dir_dots {
1368	my $self = shift (@_);
1369	my ($filename) = @_;
1370	my $dirsep_os = &util::get_os_dirsep();
1371	my @dirsep = split(/$dirsep_os/,$filename);
1372
1373	my @eval_dirs = ();
1374	foreach my $d (@dirsep) {
1375	if ($d eq "..") {
1376	pop(@eval_dirs);
1377
1378	} elsif ($d eq ".") {
1379	# do nothing!
1380
1381	} else {
1382	push(@eval_dirs,$d);
1383	}
1384	}
1385
1386	# Need to fiddle with number of elements in @eval_dirs if the
1387	# first one is the empty string. This is because of a
1388	# modification to util::filename_cat that supresses the addition
1389	# of a leading '/' character (or \ if windows) (intended to help
1390	# filename cat with relative paths) if the first entry in the
1391	# array is the empty string. Making the array start with two
1392	# empty strings is a way to defeat this "smart" option.
1393	#
1394	if (scalar(@eval_dirs) > 0) {
1395	if ($eval_dirs[0] eq ""){
1396	unshift(@eval_dirs,"");
1397	}
1398	}
1399
1400	my $evaluated_filename = (scalar @eval_dirs > 0) ? &util::filename_cat(@eval_dirs) : "";
1401	return $evaluated_filename;
1402	}
1403
1404	sub replace_usemap_links {
1405	my $self = shift (@_);
1406	my ($front, $link, $back) = @_;
1407
1408	# remove quotes from link at start and end if necessary
1409	if ($link=~/^[\"\']/) {
1410	$link=~s/^[\"\']//;
1411	$link=~s/[\"\']$//;
1412	$front.='"';
1413	$back="\"$back";
1414	}
1415
1416	$link =~ s/^\.\///;
1417	return $front . $link . $back;
1418	}
1419
1420	sub inc_filecount {
1421	my $self = shift (@_);
1422
1423	if ($self->{'file_num'} == 1000) {
1424	$self->{'dir_num'} ++;
1425	$self->{'file_num'} = 0;
1426	} else {
1427	$self->{'file_num'} ++;
1428	}
1429	}
1430
1431
1432	# Extend read_file so that strings like é are
1433	# converted to UTF8 internally.
1434	#
1435	# We don't convert < or > or & or " in case
1436	# they interfere with the GML files
1437
1438	sub read_file {
1439	my $self = shift(@_);
1440	my ($filename, $encoding, $language, $textref) = @_;
1441
1442	$self->SUPER::read_file($filename, $encoding, $language, $textref);
1443
1444	# Convert entities to their Unicode code-point equivalents
1445	$$textref =~ s/&(lt\|gt\|amp\|quot\|nbsp);/&z$1;/go;
1446	$$textref =~ s/&([^;]+);/&ghtml::getcharequiv($1,1,1)/gseo;
1447	$$textref =~ s/&z(lt\|gt\|amp\|quot\|nbsp);/&$1;/go;
1448
1449	}
1450
1451	sub HB_read_html_file {
1452	my $self = shift (@_);
1453	my ($htmlfile, $text) = @_;
1454
1455	# load in the file
1456	if (!open (FILE, $htmlfile)) {
1457	print STDERR "ERROR - could not open $htmlfile\n";
1458	return;
1459	}
1460
1461	my $foundbody = 0;
1462	$self->HB_gettext (\$foundbody, $text, "FILE");
1463	close FILE;
1464
1465	# just in case there was no <body> tag
1466	if (!$foundbody) {
1467	$foundbody = 1;
1468	open (FILE, $htmlfile) \|\| return;
1469	$self->HB_gettext (\$foundbody, $text, "FILE");
1470	close FILE;
1471	}
1472	# text is in utf8
1473	}
1474
1475	# converts the text to utf8, as ghtml does that for é etc.
1476	sub HB_gettext {
1477	my $self = shift (@_);
1478	my ($foundbody, $text, $handle) = @_;
1479
1480	my $line = "";
1481	while (defined ($line = <$handle>)) {
1482	# look for body tag
1483	if (!$$foundbody) {
1484	if ($line =~ s/^.<body[^>]>//i) {
1485	$$foundbody = 1;
1486	} else {
1487	next;
1488	}
1489	}
1490
1491	# check for symbol fonts
1492	if ($line =~ m/<font [^>]?face\s=\s*\"?(\w+)\"?/i) {
1493	my $font = $1;
1494	print STDERR "HBPlug::HB_gettext - warning removed font $font\n"
1495	if ($font !~ m/^arial$/i);
1496	}
1497
1498	$$text .= $line;
1499	}
1500
1501	if ($self->{'input_encoding'} eq "iso_8859_1") {
1502	# convert to utf-8
1503	$$text=&unicode::unicode2utf8(&unicode::convert2unicode("iso_8859_1", $text));
1504	}
1505	# convert any alphanumeric character entities to their utf-8
1506	# equivalent for indexing purposes
1507	#&ghtml::convertcharentities ($$text);
1508
1509	$$text =~ s/\s+/ /g; # remove \n's
1510
1511	# At this point $$text is a binary byte string
1512	# => turn it into a Unicode aware string, so full
1513	# Unicode aware pattern matching can be used.
1514	# For instance: 's/\x{0101}//g' or '[[:upper:]]'
1515	#
1516
1517	$$text = decode("utf8",$$text);
1518	}
1519
1520	sub HB_clean_section {
1521	my $self = shift (@_);
1522	my ($section) = @_;
1523
1524	# remove tags without a starting tag from the section
1525	my ($tag, $tagstart);
1526	while ($section =~ m/<\/([^>]{1,10})>/) {
1527	$tag = $1;
1528	$tagstart = index($section, "<$tag");
1529	last if (($tagstart >= 0) && ($tagstart < index($section, "<\/$tag")));
1530	$section =~ s/<\/$tag>//;
1531	}
1532
1533	# remove extra paragraph tags
1534	while ($section =~ s/<p\b[^>]>\s<p\b/<p/ig) {}
1535
1536	# remove extra stuff at the end of the section
1537	while ($section =~ s/(<u>\|<i>\|<b>\|<p\b[^>]*>\| \|\s)$//i) {}
1538
1539	# add a newline at the beginning of each paragraph
1540	$section =~ s/(.)\s*<p\b/$1\n\n<p/gi;
1541
1542	# add a newline every 80 characters at a word boundary
1543	# Note: this regular expression puts a line feed before
1544	# the last word in each section, even when it is not
1545	# needed.
1546	$section =~ s/(.{1,80})\s/$1\n/g;
1547
1548	# fix up the image links
1549	$section =~ s/<img[^>]?src=\"?([^\">]+)\"?[^>]>/
1550	<center><img src=\"$1\" \/><\/center><br\/>/ig;
1551	$section =~ s/<<I>>\s*([^\.]+\.(png\|jpg\|gif))/
1552	<center><img src=\"$1\" \/><\/center><br\/>/ig;
1553
1554	return $section;
1555	}
1556
1557	# Will convert the oldHDL format to the new HDL format (using the Section tag)
1558	sub convert_to_newHDLformat
1559	{
1560	my $self = shift (@_);
1561	my ($file,$cnfile) = @_;
1562	my $input_filename = $file;
1563	my $tmp_filename = $cnfile;
1564
1565	# write HTML tmp file with new HDL format
1566	open (PROD, ">$tmp_filename") \|\| die("Error Writing to File: $tmp_filename $!");
1567
1568	# read in the file and do basic html cleaning (removing header etc)
1569	my $html = "";
1570	$self->HB_read_html_file ($input_filename, \$html);
1571
1572	# process the file one section at a time
1573	my $curtoclevel = 1;
1574	my $firstsection = 1;
1575	my $toclevel = 0;
1576	while (length ($html) > 0) {
1577	if ($html =~ s/^.?(?:<p\b[^>]>)?((<b>\|<i>\|<u>\|\s))<<TOC(\d+)>>\s(.*?)<p\b/<p/i) {
1578	$toclevel = $3;
1579	my $title = $4;
1580	my $sectiontext = "";
1581	if ($html =~ s/^(.?)((?:<p\b[^>]>)?((<b>\|<i>\|<u>\|\s)*)<<TOC\d+>>)/$2/i) {
1582	$sectiontext = $1;
1583	} else {
1584	$sectiontext = $html;
1585	$html = "";
1586	}
1587
1588	# remove tags and extra spaces from the title
1589	$title =~ s/<\/?[^>]+>//g;
1590	$title =~ s/^\s+\|\s+$//g;
1591
1592	# close any sections below the current level and
1593	# create a new section (special case for the firstsection)
1594	print PROD "<!--\n";
1595	while (($curtoclevel > $toclevel) \|\|
1596	(!$firstsection && $curtoclevel == $toclevel)) {
1597	$curtoclevel--;
1598	print PROD "</Section>\n";
1599	}
1600	if ($curtoclevel+1 < $toclevel) {
1601	print STDERR "WARNING - jump in toc levels in $input_filename " .
1602	"from $curtoclevel to $toclevel\n";
1603	}
1604	while ($curtoclevel < $toclevel) {
1605	$curtoclevel++;
1606	}
1607
1608	if ($curtoclevel == 1) {
1609	# add the header tag
1610	print PROD "-->\n";
1611	print PROD "<HTML>\n<HEAD>\n<TITLE>$title</TITLE>\n</HEAD>\n<BODY>\n";
1612	print PROD "<!--\n";
1613	}
1614
1615	print PROD "<Section>\n\t<Description>\n\t\t<Metadata name=\"Title\">$title</Metadata>\n\t</Description>\n";
1616
1617	print PROD "-->\n";
1618
1619	# clean up the section html
1620	$sectiontext = $self->HB_clean_section($sectiontext);
1621
1622	print PROD "$sectiontext\n";
1623
1624	} else {
1625	print STDERR "WARNING - leftover text\n" , $self->shorten($html),
1626	"\nin $input_filename\n";
1627	last;
1628	}
1629	$firstsection = 0;
1630	}
1631
1632	print PROD "<!--\n";
1633	while ($curtoclevel > 0) {
1634	$curtoclevel--;
1635	print PROD "</Section>\n";
1636	}
1637	print PROD "-->\n";
1638
1639	close (PROD) \|\| die("Error Closing File: $tmp_filename $!");
1640
1641	return $tmp_filename;
1642	}
1643
1644	sub shorten {
1645	my $self = shift (@_);
1646	my ($text) = @_;
1647
1648	return "\"$text\"" if (length($text) < 100);
1649
1650	return "\"" . substr ($text, 0, 50) . "\" ... \"" .
1651	substr ($text, length($text)-50) . "\"";
1652	}
1653
1654	sub convert_tidy_or_oldHDL_file
1655	{
1656	my $self = shift (@_);
1657	my ($file) = @_;
1658	my $input_filename = $file;
1659
1660	if (-d $input_filename)
1661	{
1662	return $input_filename;
1663	}
1664
1665	# get the input filename
1666	my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
1667	my $base_dirname = $dirname;
1668	$suffix = lc($suffix);
1669
1670	# derive tmp filename from input filename
1671	# Remove any white space from filename -- no risk of name collision, and
1672	# makes later conversion by utils simpler. Leave spaces in path...
1673	# tidy up the filename with space, dot, hyphen between
1674	$tailname =~ s/\s+//g;
1675	$tailname =~ s/\.+//g;
1676	$tailname =~ s/\-+//g;
1677	# convert to utf-8 otherwise we have problems with the doc.xml file
1678	# later on
1679	&unicode::ensure_utf8(\$tailname);
1680
1681	# softlink to collection tmp dir
1682	my $tmp_dirname = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "tidytmp");
1683	&util::mk_dir($tmp_dirname) if (!-e $tmp_dirname);
1684
1685	my $test_dirname = "";
1686	my $f_separator = &util::get_os_dirsep();
1687
1688	if ($dirname =~ m/import$f_separator/)
1689	{
1690	$test_dirname = $'; #'
1691
1692	#print STDERR "init $'\n";
1693
1694	while ($test_dirname =~ m/[$f_separator]/)
1695	{
1696	my $folderdirname = $`;
1697	$tmp_dirname = &util::filename_cat($tmp_dirname,$folderdirname);
1698	&util::mk_dir($tmp_dirname) if (!-e $tmp_dirname);
1699	$test_dirname = $'; #'
1700	}
1701	}
1702
1703	my $tmp_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix");
1704
1705	# tidy or convert the input file if it is a HTML-like file or it is accepted by the process_exp
1706	if (($suffix eq ".htm") \|\| ($suffix eq ".html") \|\| ($suffix eq ".shtml"))
1707	{
1708	#convert the input file to a new style HDL
1709	my $hdl_output_filename = $input_filename;
1710	if ($self->{'old_style_HDL'})
1711	{
1712	$hdl_output_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix");
1713	$hdl_output_filename = $self->convert_to_newHDLformat($input_filename,$hdl_output_filename);
1714	}
1715
1716	#just for checking copy all other file from the base dir to tmp dir if it is not exists
1717	opendir(DIR,$base_dirname) or die "Can't open base directory : $base_dirname!";
1718	my @files = grep {!/^\.+$/} readdir(DIR);
1719	close(DIR);
1720
1721	foreach my $file (@files)
1722	{
1723	my $src_file = &util::filename_cat($base_dirname,$file);
1724	my $dest_file = &util::filename_cat($tmp_dirname,$file);
1725	if ((!-e $dest_file) && (!-d $src_file))
1726	{
1727	# just copy the original file back to the tmp directory
1728	copy($src_file,$dest_file) or die "Can't copy file $src_file to $dest_file $!";
1729	}
1730	}
1731
1732	# tidy the input file
1733	my $tidy_output_filename = $hdl_output_filename;
1734	if ($self->{'use_realistic_book'})
1735	{
1736	$tidy_output_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix");
1737	$tidy_output_filename = $self->tmp_tidy_file($hdl_output_filename,$tidy_output_filename);
1738	}
1739	$tmp_filename = $tidy_output_filename;
1740	}
1741	else
1742	{
1743	if (!-e $tmp_filename)
1744	{
1745	# just copy the original file back to the tmp directory
1746	copy($input_filename,$tmp_filename) or die "Can't copy file $input_filename to $tmp_filename $!";
1747	}
1748	}
1749
1750	return $tmp_filename;
1751	}
1752
1753
1754	# Will make the html input file as a proper XML file with removed font tag and
1755	# image size added to the img tag.
1756	# The tidying process takes place in a collection specific 'tmp' directory so
1757	# that we don't accidentally damage the input.
1758	sub tmp_tidy_file
1759	{
1760	my $self = shift (@_);
1761	my ($file,$cnfile) = @_;
1762	my $input_filename = $file;
1763	my $tmp_filename = $cnfile;
1764
1765	# get the input filename
1766	my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
1767
1768	require HTML::TokeParser::Simple;
1769
1770	# create HTML parser to decode the input file
1771	my $parser = HTML::TokeParser::Simple->new($input_filename);
1772
1773	# write HTML tmp file without the font tag and image size are added to the img tag
1774	open (PROD, ">$tmp_filename") \|\| die("Error Writing to File: $tmp_filename $!");
1775	while (my $token = $parser->get_token())
1776	{
1777	# is it an img tag
1778	if ($token->is_start_tag('img'))
1779	{
1780	# get the attributes
1781	my $attr = $token->return_attr;
1782
1783	# get the full path to the image
1784	my $img_file = &util::filename_cat($dirname,$attr->{src});
1785
1786	# set the width and height attribute
1787	($attr->{width}, $attr->{height}) = imgsize($img_file);
1788
1789	# recreate the tag
1790	print PROD "<img";
1791	print PROD map { qq { $_="$attr->{$_}"} } keys %$attr;
1792	print PROD ">";
1793	}
1794	# is it a font tag
1795	else
1796	{
1797	if (($token->is_start_tag('font')) \|\| ($token->is_end_tag('font')))
1798	{
1799	# remove font tag
1800	print PROD "";
1801	}
1802	else
1803	{
1804	# print without changes
1805	print PROD $token->as_is;
1806	}
1807	}
1808	}
1809	close (PROD) \|\| die("Error Closing File: $tmp_filename $!");
1810
1811	# run html-tidy on the tmp file to make it a proper XML file
1812
1813	my $outhandle = $self->{'outhandle'};
1814	print $outhandle "Converting HTML to be XML compliant:\n";
1815
1816	my $tidy_cmd = "tidy";
1817	$tidy_cmd .= " -q" if ($self->{'verbosity'} <= 2);
1818	$tidy_cmd .= " -raw -wrap 0 -asxml \"$tmp_filename\"";
1819	if ($self->{'verbosity'} <= 2) {
1820	if ($ENV{'GSDLOS'} =~ m/^windows/i) {
1821	$tidy_cmd .= " 2>nul";
1822	}
1823	else {
1824	$tidy_cmd .= " 2>/dev/null";
1825	}
1826	print $outhandle " => $tidy_cmd\n";
1827	}
1828
1829	my $tidyfile = `$tidy_cmd`;
1830
1831	# write result back to the tmp file
1832	open (PROD, ">$tmp_filename") \|\| die("Error Writing to File: $tmp_filename $!");
1833	print PROD $tidyfile;
1834	close (PROD) \|\| die("Error Closing File: $tmp_filename $!");
1835
1836	# return the output filename
1837	return $tmp_filename;
1838	}
1839
1840	sub associate_cover_image
1841	{
1842	my $self = shift(@_);
1843	my ($doc_obj, $filename) = @_;
1844	if (($self->{'use_realistic_book'}) \|\| ($self->{'old_style_HDL'}))
1845	{
1846	# we will have cover image in tidytmp, but want it from import
1847	$filename =~ s/([\\\/])tidytmp([\\\/])/$1import$2/;
1848	}
1849	$self->SUPER::associate_cover_image($doc_obj, $filename);
1850	}
1851
1852
1853	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: