source: main/trunk/greenstone2/perllib/plugins/HTMLPlugin.pm@ 23419

Last change on this file since 23419 was 23419, checked in by max, 13 years ago

Setting the values to store as block files is now done through an API call to BasePlugin. This way, any uniform requirement (such as putting in both C:\... and c:\... entries for Windows) can be handled in one place.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 62.8 KB
1###########################################################################
2#
3# HTMLPlugin.pm -- basic html plugin
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 1999 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27#
28# Note that this plugin handles frames only in a very simple way
29# i.e. each frame is treated as a separate document. This means
30# search results will contain links to individual frames rather
31# than linking to the top level frameset.
32# There may also be some problems caused by the _parent target
33# (it's removed by this plugin)
34#
35
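# For illustration only (not part of the original source): a typical
# collect.cfg line that enables this plugin, using options declared in the
# $arguments list further down. The option values are invented examples,
# not recommendations:
#
#   plugin HTMLPlugin -description_tags -metadata_fields Title,Author
#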
36package HTMLPlugin;
37
38use Encode;
39use Unicode::Normalize 'normalize';
40
41use ReadTextFile;
42use HBPlugin;
43use ghtml;
44use unicode;
45use util;
46use XMLParser;
47
48use Image::Size;
49use File::Copy;
50
51sub BEGIN {
52 @HTMLPlugin::ISA = ('ReadTextFile', 'HBPlugin');
53}
54
55use strict; # every perl program should have this!
56no strict 'refs'; # make an exception so we can use variables as filehandles
57
58my $arguments =
59 [ { 'name' => "process_exp",
60 'desc' => "{BasePlugin.process_exp}",
61 'type' => "regexp",
62 'deft' => &get_default_process_exp() },
63 { 'name' => "block_exp",
64 'desc' => "{BasePlugin.block_exp}",
65 'type' => 'regexp',
66 'deft' => &get_default_block_exp() },
67 { 'name' => "nolinks",
68 'desc' => "{HTMLPlugin.nolinks}",
69 'type' => "flag" },
70 { 'name' => "keep_head",
71 'desc' => "{HTMLPlugin.keep_head}",
72 'type' => "flag" },
73 { 'name' => "no_metadata",
74 'desc' => "{HTMLPlugin.no_metadata}",
75 'type' => "flag" },
76 { 'name' => "metadata_fields",
77 'desc' => "{HTMLPlugin.metadata_fields}",
78 'type' => "string",
79 'deft' => "Title" },
80 { 'name' => "metadata_field_separator",
81 'desc' => "{HTMLPlugin.metadata_field_separator}",
82 'type' => "string",
83 'deft' => "" },
84 { 'name' => "hunt_creator_metadata",
85 'desc' => "{HTMLPlugin.hunt_creator_metadata}",
86 'type' => "flag" },
87 { 'name' => "file_is_url",
88 'desc' => "{HTMLPlugin.file_is_url}",
89 'type' => "flag" },
90 { 'name' => "assoc_files",
91 'desc' => "{HTMLPlugin.assoc_files}",
92 'type' => "regexp",
93 'deft' => &get_default_block_exp() },
94 { 'name' => "rename_assoc_files",
95 'desc' => "{HTMLPlugin.rename_assoc_files}",
96 'type' => "flag" },
97 { 'name' => "title_sub",
98 'desc' => "{HTMLPlugin.title_sub}",
99 'type' => "string",
100 'deft' => "" },
101 { 'name' => "description_tags",
102 'desc' => "{HTMLPlugin.description_tags}",
103 'type' => "flag" },
104 # retain this for backward compatibility (w3mir option was replaced by
105 # file_is_url)
106 { 'name' => "w3mir",
107# 'desc' => "{HTMLPlugin.w3mir}",
108 'type' => "flag",
109 'hiddengli' => "yes"},
110 { 'name' => "no_strip_metadata_html",
111 'desc' => "{HTMLPlugin.no_strip_metadata_html}",
112 'type' => "string",
113 'deft' => "",
114 'reqd' => "no"},
115 { 'name' => "sectionalise_using_h_tags",
116 'desc' => "{HTMLPlugin.sectionalise_using_h_tags}",
117 'type' => "flag" },
118 { 'name' => "use_realistic_book",
119 'desc' => "{HTMLPlugin.tidy_html}",
120 'type' => "flag"},
121 { 'name' => "old_style_HDL",
122 'desc' => "{HTMLPlugin.old_style_HDL}",
123 'type' => "flag"},
124 {'name' => "processing_tmp_files",
125 'desc' => "{BasePlugin.processing_tmp_files}",
126 'type' => "flag",
127 'hiddengli' => "yes"}
128 ];
129
130my $options = { 'name' => "HTMLPlugin",
131 'desc' => "{HTMLPlugin.desc}",
132 'abstract' => "no",
133 'inherits' => "yes",
134 'args' => $arguments };
135
136
137sub new {
138 my ($class) = shift (@_);
139 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
140 push(@$pluginlist, $class);
141
142 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
143 push(@{$hashArgOptLists->{"OptList"}},$options);
144
145
146 my $self = new ReadTextFile($pluginlist,$inputargs,$hashArgOptLists);
147
148 if ($self->{'w3mir'}) {
149 $self->{'file_is_url'} = 1;
150 }
151 $self->{'aux_files'} = {};
152 $self->{'dir_num'} = 0;
153 $self->{'file_num'} = 0;
154
155 return bless $self, $class;
156}
157
158# may want to use (?i)\.(gif|jpe?g|jpe|png|css|js(?:@.*)?)$
159# if we have e.g. <script language="javascript" src="img/lib.js@123">
160# blocking is now done by reading through the file and recording all the
161# images and other files
162sub get_default_block_exp {
163 my $self = shift (@_);
164
165 #return q^(?i)\.(gif|jpe?g|jpe|jpg|png|css)$^;
166 return "";
167}
168
169sub get_default_process_exp {
170 my $self = shift (@_);
171
172 # the last option is an attempt to encode the concept of an html query ...
173 return q^(?i)(\.html?|\.shtml|\.shm|\.asp|\.php\d?|\.cgi|.+\?.+=.*)$^;
174}
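# For illustration, filenames the default process_exp above accepts or
# rejects (the sample names are invented):
#
#   "index.html", "page.shtml", "report.php3", "search.cgi",
#   "lookup.asp?id=3&view=full"   => matched, so handled by this plugin
#   "logo.gif", "styles.css"      => not matched, left to block/assoc handling
#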
175
176sub store_block_files
177{
178 my $self = shift (@_);
179 my ($filename_full_path, $block_hash) = @_;
180
181 my $html_fname = $filename_full_path;
182 my @file_blocks;
183
184 my ($language, $content_encoding) = $self->textcat_get_language_encoding ($filename_full_path);
185 $self->{'store_content_encoding'}->{$filename_full_path} = $content_encoding;
186
187 # read in the file without decoding ($raw_text holds the raw bytes)
188 my $raw_text = "";
189 $self->read_file_no_decoding($filename_full_path, \$raw_text);
190
191 my $textref = \$raw_text;
192 my $opencom = '(?:<!--|&lt;!(?:&mdash;|&#151;|--))';
193 my $closecom = '(?:-->|(?:&mdash;|&#151;|--)&gt;)';
194 $$textref =~ s/$opencom(.*?)$closecom//gs;
195
196 # Convert entities to their UTF8 equivalents
197 $$textref =~ s/&(lt|gt|amp|quot|nbsp);/&z$1;/go;
198 $$textref =~ s/&([^;]+);/&ghtml::getcharequiv($1,1,0)/gseo; # on this occasion, want it left as utf8
199 $$textref =~ s/&z(lt|gt|amp|quot|nbsp);/&$1;/go;
200
201 my $attval = "\\\"[^\\\"]+\\\"|[^\\s>]+";
202 my @img_matches = ($$textref =~ m/<img[^>]*?src\s*=\s*($attval)[^>]*>/igs);
203 my @usemap_matches = ($$textref =~ m/<img[^>]*?usemap\s*=\s*($attval)[^>]*>/igs);
204 my @link_matches = ($$textref =~ m/<link[^>]*?href\s*=\s*($attval)[^>]*>/igs);
205 my @embed_matches = ($$textref =~ m/<embed[^>]*?src\s*=\s*($attval)[^>]*>/igs);
206 my @tabbg_matches = ($$textref =~ m/<(?:body|table|tr|td)[^>]*?background\s*=\s*($attval)[^>]*>/igs);
207 my @script_matches = ($$textref =~ m/<script[^>]*?src\s*=\s*($attval)[^>]*>/igs);
208
209 if(!defined $self->{'unicode_to_original_filename'}) {
210 # maps from utf8 converted link name -> original filename referred to by (possibly URL-encoded) src url
211 $self->{'unicode_to_original_filename'} = {};
212 }
213
214 foreach my $raw_link (@img_matches, @usemap_matches, @link_matches, @embed_matches, @tabbg_matches, @script_matches) {
215
216 # remove quotes from link at start and end if necessary
217 if ($raw_link =~ m/^\"/) {
218 $raw_link =~ s/^\"//;
219 $raw_link =~ s/\"$//;
220 }
221
222 # remove any anchor names, e.g. foo.html#name becomes foo.html
223 # but watch out for any #'s that are part of entities, such as &#x3B1;
224 $raw_link =~ s/([^&])\#.*$/$1/s;
225
226 # some links may just be anchor names
227 next unless ($raw_link =~ /\S+/);
228
229 if ($raw_link !~ m@^/@ && $raw_link !~ m/^([A-Z]:?)\\/i) {
230 # Turn relative file path into full path
231 my $dirname = &File::Basename::dirname($filename_full_path);
232 $raw_link = &util::filename_cat($dirname, $raw_link);
233 }
234 $raw_link = $self->eval_dir_dots($raw_link);
235
236 # this is the actual filename on the filesystem (that the link refers to)
237 my $url_original_filename = $self->opt_url_decode($raw_link);
238
239 my ($uses_bytecodes,$exceeds_bytecodes) = &unicode::analyze_raw_string($url_original_filename);
240
241 if ($exceeds_bytecodes) {
242 # We have a link to a file name that is more complicated than a raw byte filename
243 # What we do next depends on the operating system we are on
244
245 if ($ENV{'GSDLOS'} =~ /^(linux|solaris)$/i) {
246 # Assume we're dealing with a UTF-8 encoded filename
247 $url_original_filename = encode("utf8", $url_original_filename);
248 }
249 elsif ($ENV{'GSDLOS'} =~ /^darwin$/i) {
250 # HFS+ is UTF8 with decomposition
251 $url_original_filename = encode("utf8", $url_original_filename);
252 $url_original_filename = normalize('D', $url_original_filename); # Normalization Form D (decomposition)
253 }
254 elsif ($ENV{'GSDLOS'} =~ /^windows$/i) {
255 # Don't need to do anything as later code maps Windows
256 # unicode filenames to DOS short filenames when needed
257 }
258 else {
259 my $outhandle = $self->{'outhandle'};
260 print $outhandle "Warning: Unrecognized operating system ", $ENV{'GSDLOS'}, "\n";
261 print $outhandle " in raw file system encoding of: $raw_link\n";
262 print $outhandle " Assuming filesystem is UTF-8 based.\n";
263 $url_original_filename = encode("utf8", $url_original_filename);
264 }
265 }
266
267 # Convert the (currently raw) link into its Unicode version.
268 # Store the Unicode link along with the url_original_filename
269 my $unicode_url_original_filename = "";
270 $self->decode_text($raw_link,$content_encoding,$language,\$unicode_url_original_filename);
271
272
273 $self->{'unicode_to_original_filename'}->{$unicode_url_original_filename} = $url_original_filename;
274
275
276 if ($url_original_filename ne $unicode_url_original_filename) {
277 my $outhandle = $self->{'outhandle'};
278
279 print $outhandle "URL Encoding $url_original_filename\n";
280 print $outhandle " ->$unicode_url_original_filename\n";
281
282 # Allow for possibility of raw byte version and Unicode versions of file
283 $self->block_filename($block_hash,$unicode_url_original_filename);
284 }
285
286 # $url_original_filename = &util::upgrade_if_dos_filename($url_original_filename);
287
288 $self->block_filename($block_hash,$url_original_filename);
289
290 }
291}
292
293# Given a filename in any encoding, URL-decode it to get back the original filename
294# in the original encoding. Because this method is intended to work out the *original*
295# filename, it does not URL decode any filename if a file by the name of the *URL-encoded*
296# string already exists in the local folder.
297#
298sub opt_url_decode {
299 my $self = shift (@_);
300 my ($raw_link) = @_;
301
302
303 # Replace %XX's in URL with decoded value if required.
304 # Note that the filename may include the %XX in some situations
305
306## if ($raw_link =~ m/\%[A-F0-9]{2}/i) {
307
308 if (($raw_link =~ m/\%[A-F0-9]{2}/i) || ($raw_link =~ m/\&\#x[0-9A-F]+;/i) || ($raw_link =~ m/\&\#[0-9]+;/i)) {
309 if (!-e $raw_link) {
310 $raw_link = &unicode::url_decode($raw_link,1);
311 }
312 }
313
314 return $raw_link;
315}
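# Sketch of the behaviour above, with invented filenames:
#
#   $self->opt_url_decode("caf%C3%A9.html");  # -> the decoded name "café.html",
#                                             #    unless a file literally called
#                                             #    "caf%C3%A9.html" exists locally
#   $self->opt_url_decode("plain.html");      # -> "plain.html", left untouched
#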
316
317sub read_into_doc_obj
318{
319 my $self = shift (@_);
320 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
321
322 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
323
324 # Lookup content_encoding worked out in file_block pass for this file
325 # Store it under the local name 'content_encoding' so it's nice and
326 # easy to access
327 $self->{'content_encoding'} = $self->{'store_content_encoding'}->{$filename_full_path};
328
329 # get the input file
330 my $input_filename = $file;
331 my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
332 $suffix = lc($suffix);
333 my $tidy_filename;
334 if (($self->{'use_realistic_book'}) || ($self->{'old_style_HDL'}))
335 {
336 # because the document has to be sectionalised, set the description tags
337 $self->{'description_tags'} = 1;
338
339 # set the file to be tidied
340 $input_filename = &util::filename_cat($base_dir,$file) if $base_dir =~ m/\w/;
341
342 # get the tidied file
343 #my $tidy_filename = $self->tmp_tidy_file($input_filename);
344 $tidy_filename = $self->convert_tidy_or_oldHDL_file($input_filename);
345
346 # derive tmp filename from input filename
347 my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($tidy_filename, "\\.[^\\.]+\$");
348
349 # set the new input file and base_dir to be from the tidied file
350 $file = "$tailname$suffix";
351 $base_dir = $dirname;
352 }
353
354 # call the parent read_into_doc_obj
355 my ($process_status,$doc_obj) = $self->SUPER::read_into_doc_obj($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli);
356 if (($self->{'use_realistic_book'}) || ($self->{'old_style_HDL'}))
357 {
358 # now we need to reset the filenames in the doc obj so that the converted filenames are not used
359 my $collect_file = &util::filename_within_collection($filename_full_path);
360 $doc_obj->set_source_filename ($collect_file, $self->{'file_rename_method'});
361 ## set_source_filename does not set the doc_obj source_path which is used in archives dbs for incremental
362 # build. So set it manually.
363 $doc_obj->set_source_path($filename_full_path);
364 my $collect_conv_file = &util::filename_within_collection($tidy_filename);
365 $doc_obj->set_converted_filename($collect_conv_file);
366
367 my $plugin_filename_encoding = $self->{'filename_encoding'};
368 my $filename_encoding = $self->deduce_filename_encoding($file,$metadata,$plugin_filename_encoding);
369 $self->set_Source_metadata($doc_obj, $filename_full_path, $filename_encoding);
370 }
371
372 delete $self->{'store_content_encoding'}->{$filename_full_path};
373 $self->{'content_encoding'} = undef;
374
375 return ($process_status,$doc_obj);
376}
377
378# do plugin specific processing of doc_obj
379sub process {
380 my $self = shift (@_);
381 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
382 my $outhandle = $self->{'outhandle'};
383
384 if ($ENV{'GSDLOS'} =~ m/^windows/i) {
385 # this makes life so much easier... perl can cope with unix-style '/'s.
386 $base_dir =~ s@(\\)+@/@g;
387 $file =~ s@(\\)+@/@g;
388 }
389
390 my $filename = &util::filename_cat($base_dir,$file);
391 my $upgraded_base_dir = &util::upgrade_if_dos_filename($base_dir);
392 my $upgraded_filename = &util::upgrade_if_dos_filename($filename);
393
394 if ($ENV{'GSDLOS'} =~ m/^windows/i) {
395 # And again
396 $upgraded_base_dir =~ s@(\\)+@/@g;
397 $upgraded_filename =~ s@(\\)+@/@g;
398
399 # Need to make sure there is a '/' on the end of upgraded_base_dir
400 if (($upgraded_base_dir ne "") && ($upgraded_base_dir !~ m/\/$/)) {
401 $upgraded_base_dir .= "/";
402 }
403 }
404 my $upgraded_file = &util::filename_within_directory($upgraded_filename,$upgraded_base_dir);
405
406 # reset per-doc stuff...
407 $self->{'aux_files'} = {};
408 $self->{'dir_num'} = 0;
409 $self->{'file_num'} = 0;
410
411 # process an HTML file where sections are divided by headings tags (H1, H2 ...)
412 # you can also include metadata in the format (X can be any number)
413 # <hX>Title<!--gsdl-metadata
414 # <Metadata name="name1">value1</Metadata>
415 # ...
416 # <Metadata name="nameN">valueN</Metadata>
417 #--></hX>
418 if ($self->{'sectionalise_using_h_tags'}) {
419 # description_tags should always be activated because we convert headings to description tags
420 $self->{'description_tags'} = 1;
421
422 my $arrSections = [];
423 $$textref =~ s/<h([0-9]+)[^>]*>(.*?)<\/h[0-9]+>/$self->process_heading($1, $2, $arrSections, $upgraded_file)/isge;
424
425 if (scalar(@$arrSections)) {
426 my $strMetadata = $self->update_section_data($arrSections, -1);
427 if (length($strMetadata)) {
428 $strMetadata = '<!--' . $strMetadata . "\n-->\n</body>";
429 $$textref =~ s/<\/body>/$strMetadata/ig;
430 }
431 }
432 }
433
434 my $cursection = $doc_obj->get_top_section();
435
436 $self->extract_metadata ($textref, $metadata, $doc_obj, $cursection)
437 unless $self->{'no_metadata'} || $self->{'description_tags'};
438
439 # Store URL for page as metadata - this can be used for an
440 # altavista style search interface. The URL won't be valid
441 # unless the file structure contains the domain name (i.e.
442 # like when w3mir is used to download a website).
443
444 # URL metadata (even for invalid URLs) is used to support internal
445 # links, so even if 'file_is_url' is off, we still need to store the info
446
447 my ($tailname,$dirname) = &File::Basename::fileparse($upgraded_file);
448
449# my $utf8_file = $self->filename_to_utf8_metadata($file);
450# $utf8_file =~ s/&\#095;/_/g;
451# variable below used to be utf8_file
452 my $url_encoded_file = &unicode::raw_filename_to_url_encoded($tailname);
453 my $utf8_url_encoded_file = &unicode::raw_filename_to_utf8_url_encoded($tailname);
454
455 my $web_url = "http://";
456 my $utf8_web_url = "http://";
457 if(defined $dirname) { # local directory
458 # Check for "ftp" in the domain name of the directory
459 # structure to determine if this URL should be a ftp:// URL
460 # This check is not infallible, but better than omitting the
461 # check, which would cause all files downloaded from ftp sites
462 # via mirroring with wget to have potentially erroneous http:// URLs
463 # assigned in their metadata
464 if ($dirname =~ /^[^\/]*ftp/i)
465 {
466 $web_url = "ftp://";
467 $utf8_web_url = "ftp://";
468 }
469 $dirname = $self->eval_dir_dots($dirname);
470 $dirname .= &util::get_dirsep() if $dirname ne ""; # if there's a directory, it should end on "/"
471
472 $web_url = $web_url.$dirname.$url_encoded_file;
473 $utf8_web_url = $utf8_web_url.$dirname.$utf8_url_encoded_file;
474 } else {
475 $web_url = $web_url.$url_encoded_file;
476 $utf8_web_url = $utf8_web_url.$utf8_url_encoded_file;
477 }
478 $web_url =~ s/\\/\//g;
479 $utf8_web_url =~ s/\\/\//g;
480
481 if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
482 print STDERR "*******DEBUG: upgraded_file: $upgraded_file\n";
483 print STDERR "*******DEBUG: adding URL metadata: $utf8_url_encoded_file\n";
484 }
485
486
487 $doc_obj->add_utf8_metadata($cursection, "URL", $web_url);
488 $doc_obj->add_utf8_metadata($cursection, "UTF8URL", $utf8_web_url);
489
490 if ($self->{'file_is_url'}) {
491 $doc_obj->add_metadata($cursection, "weblink", "<a href=\"$web_url\">");
492 $doc_obj->add_metadata($cursection, "webicon", "_iconworld_");
493 $doc_obj->add_metadata($cursection, "/weblink", "</a>");
494 }
495
496 if ($self->{'description_tags'}) {
497 # remove the html header - note that doing this here means any
498 # sections defined within the header will be lost (so all <Section>
499 # tags must appear within the body of the HTML)
500 my ($head_keep) = ($$textref =~ m/^(.*?)<body[^>]*>/is);
501
502 $$textref =~ s/^.*?<body[^>]*>//is;
503 $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
504
505 my $opencom = '(?:<!--|&lt;!(?:&mdash;|&#151;|--))';
506 my $closecom = '(?:-->|(?:&mdash;|&#151;|--)&gt;)';
507
508 my $lt = '(?:<|&lt;)';
509 my $gt = '(?:>|&gt;)';
510 my $quot = '(?:"|&quot;|&rdquo;|&ldquo;)';
511
512 my $dont_strip = '';
513 if ($self->{'no_strip_metadata_html'}) {
514 ($dont_strip = $self->{'no_strip_metadata_html'}) =~ s{,}{|}g;
515 }
516
517 my $found_something = 0; my $top = 1;
518 while ($$textref =~ s/^(.*?)$opencom(.*?)$closecom//s) {
519 my $text = $1;
520 my $comment = $2;
521 if (defined $text) {
522 # text before a comment - note that getting to here
523 # doesn't necessarily mean there are Section tags in
524 # the document
525 $self->process_section(\$text, $upgraded_base_dir, $upgraded_file, $doc_obj, $cursection);
526 }
527 while ($comment =~ s/$lt(.*?)$gt//s) {
528 my $tag = $1;
529 if ($tag eq "Section") {
530 $found_something = 1;
531 $cursection = $doc_obj->insert_section($doc_obj->get_end_child($cursection)) unless $top;
532 $top = 0;
533 } elsif ($tag eq "/Section") {
534 $found_something = 1;
535 $cursection = $doc_obj->get_parent_section ($cursection);
536 } elsif ($tag =~ m/^Metadata name=$quot(.*?)$quot/s) {
537 my $metaname = $1;
538 my $accumulate = $tag =~ m/mode=${quot}accumulate${quot}/ ? 1 : 0;
539 $comment =~ s/^(.*?)$lt\/Metadata$gt//s;
540 my $metavalue = $1;
541 $metavalue =~ s/^\s+//;
542 $metavalue =~ s/\s+$//;
543 # assume that no metadata value intentionally includes
544 # carriage returns or HTML tags (if they're there they
545 # were probably introduced when converting to HTML from
546 # some other format).
547 # actually some people want to have html tags in their
548 # metadata.
549 $metavalue =~ s/[\cJ\cM]/ /sg;
550 $metavalue =~ s/<[^>]+>//sg
551 unless $dont_strip && ($dont_strip eq 'all' || $metaname =~ m/^($dont_strip)$/);
552 $metavalue =~ s/\s+/ /sg;
553 if ($metaname =~ /\./) { # has a namespace
554 $metaname = "ex.$metaname";
555 }
556 if ($accumulate) {
557 $doc_obj->add_utf8_metadata($cursection, $metaname, $metavalue);
558 } else {
559 $doc_obj->set_utf8_metadata_element($cursection, $metaname, $metavalue);
560 }
561 } elsif ($tag eq "Description" || $tag eq "/Description") {
562 # do nothing with containing Description tags
563 } else {
564 # simple HTML tag (probably created by the conversion
565 # to HTML from some other format) - we'll ignore it and
566 # hope for the best ;-)
567 }
568 }
569 }
570 if ($cursection ne "") {
571 print $outhandle "HTMLPlugin: WARNING: $upgraded_file contains unmatched <Section></Section> tags\n";
572 }
573
574 $$textref =~ s/^.*?<body[^>]*>//is;
575 $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
576 if ($$textref =~ m/\S/) {
577 if (!$found_something) {
578 if ($self->{'verbosity'} > 2) {
579 print $outhandle "HTMLPlugin: WARNING: $upgraded_file appears to contain no Section tags so\n";
580 print $outhandle " will be processed as a single section document\n";
581 }
582
583 # go ahead and process single-section document
584 $self->process_section($textref, $upgraded_base_dir, $upgraded_file, $doc_obj, $cursection);
585
586 # if document contains no Section tags we'll go ahead
587 # and extract metadata (this won't have been done
588 # above as the -description_tags option prevents it)
589 my $complete_text = $head_keep.$doc_obj->get_text($cursection);
590 $self->extract_metadata (\$complete_text, $metadata, $doc_obj, $cursection)
591 unless $self->{'no_metadata'};
592
593 } else {
594 print $outhandle "HTMLPlugin: WARNING: $upgraded_file contains the following text outside\n";
595 print $outhandle " of the final closing </Section> tag. This text will\n";
596 print $outhandle " be ignored.";
597
598 my ($text);
599 if (length($$textref) > 30) {
600 $text = substr($$textref, 0, 30) . "...";
601 } else {
602 $text = $$textref;
603 }
604 $text =~ s/\n/ /isg;
605 print $outhandle " ($text)\n";
606 }
607 } elsif (!$found_something) {
608
609 if ($self->{'verbosity'} > 2) {
610 # may get to here if document contained no valid Section
611 # tags but did contain some comments. The text will have
612 # been processed already but we should print the warning
613 # as above and extract metadata
614 print $outhandle "HTMLPlugin: WARNING: $upgraded_file appears to contain no Section tags and\n";
615 print $outhandle " is blank or empty. Metadata will be assigned if present.\n";
616 }
617
618 my $complete_text = $head_keep.$doc_obj->get_text($cursection);
619 $self->extract_metadata (\$complete_text, $metadata, $doc_obj, $cursection)
620 unless $self->{'no_metadata'};
621 }
622
623 } else {
624
625 # remove header and footer
626 if (!$self->{'keep_head'} || $self->{'description_tags'}) {
627 $$textref =~ s/^.*?<body[^>]*>//is;
628 $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
629 }
630
631
632 # single section document
633 $self->process_section($textref, $upgraded_base_dir, $upgraded_file, $doc_obj, $cursection);
634 }
635
636 return 1;
637}
638
639
640sub process_heading
641{
642 my ($self, $nHeadNo, $strHeadingText, $arrSections, $file) = @_;
643 $strHeadingText = '' if (!defined($strHeadingText));
644
645 my $strMetadata = $self->update_section_data($arrSections, int($nHeadNo));
646
647 my $strSecMetadata = '';
648 while ($strHeadingText =~ s/<!--gsdl-metadata(.*?)-->//is)
649 {
650 $strSecMetadata .= $1;
651 }
652
653 $strHeadingText =~ s/^\s+//g;
654 $strHeadingText =~ s/\s+$//g;
655 $strSecMetadata =~ s/^\s+//g;
656 $strSecMetadata =~ s/\s+$//g;
657
658 $strMetadata .= "\n<Section>\n\t<Description>\n\t\t<Metadata name=\"Title\">" . $strHeadingText . "</Metadata>\n";
659
660 if (length($strSecMetadata)) {
661 $strMetadata .= "\t\t" . $strSecMetadata . "\n";
662 }
663
664 $strMetadata .= "\t</Description>\n";
665
666 return "<!--" . $strMetadata . "-->";
667}
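# Worked example of what process_heading emits, with invented values.
# Given the heading
#
#   <h2>Chapter One<!--gsdl-metadata
#   <Metadata name="Author">A. Nonymous</Metadata>
#   --></h2>
#
# the replacement comment looks roughly like
#
#   <!-- ...</Section> tags closing any deeper levels...
#   <Section>
#       <Description>
#           <Metadata name="Title">Chapter One</Metadata>
#           <Metadata name="Author">A. Nonymous</Metadata>
#       </Description>
#   -->
#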
668
669
670sub update_section_data
671{
672 my ($self, $arrSections, $nCurTocNo) = @_;
673 my ($strBuffer, $nLast, $nSections) = ('', 0, scalar(@$arrSections));
674
675 if ($nSections == 0) {
676 push @$arrSections, $nCurTocNo;
677 return $strBuffer;
678 }
679 $nLast = $arrSections->[$nSections - 1];
680 if ($nCurTocNo > $nLast) {
681 push @$arrSections, $nCurTocNo;
682 return $strBuffer;
683 }
684 for(my $i = $nSections - 1; $i >= 0; $i--) {
685 if ($nCurTocNo <= $arrSections->[$i]) {
686 $strBuffer .= "\n</Section>";
687 pop @$arrSections;
688 }
689 }
690 push @$arrSections, $nCurTocNo;
691 return $strBuffer;
692}
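# Illustrative trace of the section stack above for headings appearing in
# the order h1, h2, h2, h1:
#
#   h1 -> stack: 1        (a new <Section> is opened via process_heading)
#   h2 -> stack: 1,2      (nested <Section> opened)
#   h2 -> stack: 1,2      (previous h2 section closed, new one opened)
#   h1 -> stack: 1        (both the h2 and h1 sections closed,
#                          then a new level-1 section is opened)
#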
693
694
695# note that process_section may be called multiple times for a single
696# section (relying on the fact that add_utf8_text appends the text to any
697# that may exist already).
698sub process_section {
699 my $self = shift (@_);
700 my ($textref, $base_dir, $file, $doc_obj, $cursection) = @_;
701 # trap links
702 if (!$self->{'nolinks'}) {
703 # usemap="./#index" not handled correctly => change to "#index"
704## $$textref =~ s/(<img[^>]*?usemap\s*=\s*[\"\']?)([^\"\'>\s]+)([\"\']?[^>]*>)/
705
706## my $opencom = '(?:<!--|&lt;!(?:&mdash;|&#151;|--))';
707## my $closecom = '(?:-->|(?:&mdash;|&#151;|--)&gt;)';
708
709 $$textref =~ s/(<img[^>]*?usemap\s*=\s*)((?:[\"][^\"]+[\"])|(?:[\'][^\']+[\'])|(?:[^\s\/>]+))([^>]*>)/
710 $self->replace_usemap_links($1, $2, $3)/isge;
711
712 $$textref =~ s/(<(?:a|area|frame|link|script)\s+[^>]*?\s*(?:href|src)\s*=\s*[\"\']?)([^\"\'>\s]+)([\"\']?[^>]*>)/
713 $self->replace_href_links ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
714
715## $$textref =~ s/($opencom.*?)?+(<(?:a|area|frame|link|script)\s+[^>]*?\s*(?:href|src)\s*=\s*)((?:[\"][^\"]+[\"])|(?:[\'][^\']+[\'])|(?:[^\s\/>]+))([^>]*>)(.*?$closecom)?+/
716# $self->replace_href_links ($1, $2, $3, $4, $5, $base_dir, $file, $doc_obj, $cursection)/isge;
717 }
718
719 # trap images
720
721 # Previously, by default, HTMLPlugin would embed <img> tags inside anchor tags
722 # i.e. <a href="image"><img src="image"></a> in order to overcome a problem that
723 # turned regular text succeeding images into links. That is, by embedding <img>s
724 # inside <a href=""></a>, the text following images was no longer misbehaving.
725 # However, there would be many occasions where images were not meant to link
726 # to their source images but where the images would link to another web page.
727 # To allow this, the no_image_links option was introduced: it would prevent
728 # the behaviour of embedding images into links that referenced the source images.
729
730 # Somewhere along the line, the problem of normal text turning into links when
731 # such text followed images which were not embedded in <a href=""></a> ceased
732 # to occur. This is why the following lines have been commented out (as well as
733 # two lines in replace_images). They appear to no longer apply.
734
735 # If at any time, there is a need for having images embedded in <a> anchor tags,
736 # then it might be better to turn that into an HTMLPlugin option rather than make
737 # it the default behaviour. Also, eventually, no_image_links needs to become
738 # a deprecated option for HTMLPlugin as it has now become the default behaviour.
739
740 #if(!$self->{'no_image_links'}){
741 $$textref =~ s/(<(?:img|embed|table|tr|td)[^>]*?(?:src|background)\s*=\s*)((?:[\"][^\"]+[\"])|(?:[\'][^\']+[\'])|(?:[^\s\/>]+))([^>]*>)/
742 $self->replace_images ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
743 #}
744
745 # add text to document object
746 # turn \ into \\ so that the rest of greenstone doesn't think there
747 # is an escape code following. (Macro parsing loses them...)
748 $$textref =~ s/\\/\\\\/go;
749
750 $doc_obj->add_utf8_text($cursection, $$textref);
751}
752
753sub replace_images {
754 my $self = shift (@_);
755 my ($front, $link, $back, $base_dir,
756 $file, $doc_obj, $section) = @_;
757
758 # remove quotes from link at start and end if necessary
759 if ($link=~/^[\"\']/) {
760 $link=~s/^[\"\']//;
761 $link=~s/[\"\']$//;
762 $front.='"';
763 $back="\"$back";
764 }
765
766 $link =~ s/\n/ /g;
767
768 # Hack to overcome Windows wv 0.7.1 bug that causes embedded images to be broken
769 # If the Word file path has spaces in it, wv messes up and you end up with
770 # absolute paths for the images, and without the "file://" prefix
771 # So check for this special case and massage the data to be correct
772 if ($ENV{'GSDLOS'} =~ m/^windows/i && $self->{'plugin_type'} eq "WordPlug" && $link =~ m/^[A-Za-z]\:\\/) {
773 $link =~ s/^.*\\([^\\]+)$/$1/;
774 }
775
776 my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
777
778 my $img_file = $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section);
779
780# print STDERR "**** link = $link\n**** href = $href\n**** img_file = $img_file, rl = $rl\n";
781
782 my $anchor_name = $img_file;
783 #$anchor_name =~ s/^.*\///;
784 #$anchor_name = "<a name=\"$anchor_name\" ></a>";
785
786 my $image_link = $front . $img_file .$back;
787 return $image_link;
788
789 # The reasons for why the following two lines are no longer necessary can be
790 # found in subroutine process_section
791 #my $anchor_link = "<a href=\"$img_file\" >".$image_link."</a>";
792 #return $anchor_link;
793
794 #return $front . $img_file . $back . $anchor_name;
795}
796
797sub replace_href_links {
798 my $self = shift (@_);
799 my ($front, $link, $back, $base_dir, $file, $doc_obj, $section) = @_;
800
801 # remove quotes from link at start and end if necessary
802 if ($link=~/^[\"\']/) {
803 $link=~s/^[\"\']//;
804 $link=~s/[\"\']$//;
805 $front.='"';
806 $back="\"$back";
807 }
808
809 # attempt to sort out targets - frames are not handled
810 # well in this plugin and some cases will screw things
811 # up - e.g. the _parent target (so we'll just remove
812 # them all ;-)
813 $front =~ s/(target=\"?)_top(\"?)/$1_gsdltop_$2/is;
814 $back =~ s/(target=\"?)_top(\"?)/$1_gsdltop_$2/is;
815 $front =~ s/target=\"?_parent\"?//is;
816 $back =~ s/target=\"?_parent\"?//is;
817
818 return $front . $link . $back if $link =~ m/^\#/s;
819 $link =~ s/\n/ /g;
820
821 # Find file referred to by $link on file system
822 # This is more complicated than it sounds when character encodings
823 # are taken into account
824 my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
825
826 # href may use '\'s where '/'s should be on Windows
827 $href =~ s/\\/\//g;
828 my ($filename) = $href =~ m/^(?:.*?):(?:\/\/)?(.*)/;
829
830
831 ##### leave all these links alone (they won't be picked up by intermediate
832 ##### pages). I think that's safest when dealing with frames, targets etc.
833 ##### (at least until I think of a better way to do it). Problems occur with
834 ##### mailto links from within small frames, the intermediate page is displayed
835 ##### within that frame and can't be seen. There is still potential for this to
836 ##### happen even with html pages - the solution seems to be to somehow tell
837 ##### the browser from the server side to display the page being sent (i.e.
838 ##### the intermediate page) in the top level window - I'm not sure if that's
839 ##### possible - the following line should probably be deleted if that can be done
840 return $front . $link . $back if $href =~ m/^(mailto|news|gopher|nntp|telnet|javascript):/is;
841
842 if (($rl == 0) || ($filename =~ m/$self->{'process_exp'}/) ||
843 ($href =~ m/\/$/) || ($href =~ m/^(mailto|news|gopher|nntp|telnet|javascript):/i)) {
844
845 if ($ENV{'GSDLOS'} =~ m/^windows$/) {
846
847 # Don't do any encoding for now, as it is not clear what
848 # the right thing to do is to support filename
849 # encodings on Windows when they are not UTF-16
850 #
851 }
852 else {
853 # => Unix-based system
854
855 # If web page didn't give encoding, then default to utf8
856 my $content_encoding= $self->{'content_encoding'} || "utf8";
857
858 if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
859 print STDERR "**** Encoding with '$content_encoding', href: $href\n";
860 }
861
862 $href = encode($content_encoding,$href);
863 }
864
865 $href = &unicode::raw_filename_to_utf8_url_encoded($href);
866 $href = &unicode::filename_to_url($href);
867
868 &ghtml::urlsafe ($href);
869
870 if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
871 print STDERR "******DEBUG: href=$href\n";
872 }
873
874
875 return $front . "_httpextlink_&amp;rl=" . $rl . "&amp;href=" . $href . $hash_part . $back;
876 } else {
877 # link is to some other type of file (e.g., an image) so we'll
878 # need to associate that file
879 return $front . $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section) . $back;
880 }
881}
882
883sub add_file {
884 my $self = shift (@_);
885 my ($href, $rl, $hash_part, $base_dir, $doc_obj, $section) = @_;
886 my ($newname);
887
888 my $filename = $href;
889 if ($base_dir eq "") {
890 if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
891 # remove http://
892 $filename =~ s/^[^:]*:\/\///;
893 }
894 else {
895 # remove http:/ thereby leaving one slash at the start as
896 # part of full pathname
897 $filename =~ s/^[^:]*:\///;
898 }
899 }
900 else {
901 # remove http://
902 $filename =~ s/^[^:]*:\/\///;
903 }
904
905 $filename = &util::filename_cat($base_dir, $filename);
906
907 if (($self->{'use_realistic_book'}) || ($self->{'old_style_HDL'})) {
908 # we are processing a tidytmp file - want paths to be in import
909 $filename =~ s/([\\\/])tidytmp([\\\/])/$1import$2/;
910 }
911
912 # Replace %XX's in URL with decoded value if required. Note that the
913 # filename may include the %XX in some situations. If the *original*
914 # file's name was in URL encoding, the following method will not decode
915 # it.
916 my $unicode_filename = $filename;
917 my $opt_decode_unicode_filename = $self->opt_url_decode($unicode_filename);
918
919 # wvWare can generate <img src="StrangeNoGraphicData"> tags, but (it seems)
920 # with no accompanying file
921 if ($opt_decode_unicode_filename =~ m/StrangeNoGraphicData$/) { return ""; }
922
923 my $content_encoding= $self->{'content_encoding'} || "utf8";
924
925 if ($ENV{'GSDLOS'} =~ /^(linux|solaris)$/i) {
926 # The filenames that come through the HTML file have been decoded
927 # into Unicode aware Perl strings. Need to convert them back
928 # to their initial raw-byte encoding to match the file that
929 # exists on the file system
930 $filename = encode($content_encoding, $opt_decode_unicode_filename);
931 }
932 elsif ($ENV{'GSDLOS'} =~ /^darwin$/i) {
933 # HFS+ is UTF8 with decomposition
934 $filename = encode($content_encoding, $opt_decode_unicode_filename);
935 $filename = normalize('D', $filename); # Normalization Form D (decomposition)
936
937 }
938 elsif ($ENV{'GSDLOS'} =~ /^windows$/i) {
939 my $long_filename = Win32::GetLongPathName($opt_decode_unicode_filename);
940
941 if (defined $long_filename) {
942 my $short_filename = Win32::GetLongPathName($long_filename);
943 $filename = $short_filename;
944 }
945# else {
946# print STDERR "***** failed to map href to real file:\n";
947# print STDERR "****** $href -> $opt_decode_unicode_filename\n";
948# }
949 }
950 else {
951 my $outhandle = $self->{'outhandle'};
952 print $outhandle "Warning: Unrecognized operating system ", $ENV{'GSDLOS'}, "\n";
953 print $outhandle " in file system encoding of href: $href\n";
954 print $outhandle " No character encoding done.\n";
955 }
956
957
958 # some special processing if the intended filename was converted to utf8, but
959 # the actual file still needs to be renamed
960 if (!&util::fd_exists($filename)) {
961 # try the original filename stored in map
962 if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
963 print STDERR "******!! orig filename did not exist: $filename\n";
964 }
965
966## print STDERR "**** trying to look up unicode_filename: $unicode_filename\n";
967
968 my $original_filename = $self->{'unicode_to_original_filename'}->{$unicode_filename};
969
970 if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
971 print STDERR "****** From lookup unicode_filename, now trying for: $original_filename\n";
972 }
973
974 if (defined $original_filename && -e $original_filename) {
975 if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
976 print STDERR "****** Found match!\n";
977 }
978 $filename = $original_filename;
979 }
980 }
981
982 my ($ext) = $filename =~ m/(\.[^\.]*)$/;
983
984 if ($rl == 0) {
985 if ((!defined $ext) || ($ext !~ m/$self->{'assoc_files'}/)) {
986 return "_httpextlink_&amp;rl=0&amp;el=prompt&amp;href=" . $href . $hash_part;
987 }
988 else {
989 return "_httpextlink_&amp;rl=0&amp;el=direct&amp;href=" . $href . $hash_part;
990 }
991 }
992
993 if ((!defined $ext) || ($ext !~ m/$self->{'assoc_files'}/)) {
994 return "_httpextlink_&amp;rl=" . $rl . "&amp;href=" . $href . $hash_part;
995 }
996 # add the original image file as a source file
997 if (!$self->{'processing_tmp_files'} ) {
998 $doc_obj->associate_source_file($filename);
999 }
1000 if ($self->{'rename_assoc_files'}) {
1001 if (defined $self->{'aux_files'}->{$href}) {
1002 $newname = $self->{'aux_files'}->{$href}->{'dir_num'} . "/" .
1003 $self->{'aux_files'}->{$href}->{'file_num'} . $ext;
1004 } else {
1005 $newname = $self->{'dir_num'} . "/" . $self->{'file_num'} . $ext;
1006 $self->{'aux_files'}->{$href} = {'dir_num' => $self->{'dir_num'}, 'file_num' => $self->{'file_num'}};
1007 $self->inc_filecount ();
1008 }
1009 $doc_obj->associate_file($filename, $newname, undef, $section);
1010 return "_httpdocimg_/$newname";
1011 } else {
1012 if(&unicode::is_url_encoded($unicode_filename)) {
1013 # use the possibly-decoded filename instead to avoid double URL encoding
1014 ($newname) = $filename =~ m/([^\/\\]*)$/;
1015 } else {
1016 ($newname) = $unicode_filename =~ m/([^\/\\]*)$/;
1017 }
1018
1019 # Make sure this name uses only ASCII characters.
1020 # We use either base64 or URL encoding, as these preserve original encoding
1021 $newname = &util::rename_file($newname, $self->{'file_rename_method'});
1022
1023### print STDERR "***** associating $filename (raw-byte/utf8)-> $newname\n";
1024 $doc_obj->associate_file($filename, $newname, undef, $section);
1025
1026 # Since the generated image will be URL-encoded to avoid file-system/browser mess-ups
1027 # of filenames, URL-encode the additional percent signs of the URL-encoded filename
1028 my $newname_url = $newname;
1029 $newname_url = &unicode::filename_to_url($newname_url);
1030 return "_httpdocimg_/$newname_url";
1031 }
1032}
1033
1034
1035sub format_link {
1036 my $self = shift (@_);
1037 my ($link, $base_dir, $file) = @_;
1038
1039 # strip off hash part, e.g. #foo, but watch out for any entities, e.g. &#x3B1;
1040 my ($before_hash, $hash_part) = $link =~ m/^(.*?[^&])(\#.*)?$/;
1041
1042 $hash_part = "" if !defined $hash_part;
1043 if (!defined $before_hash || $before_hash !~ m/[\w\.\/]/) {
1044 my $outhandle = $self->{'outhandle'};
1045 print $outhandle "HTMLPlugin: ERROR - badly formatted tag ignored ($link)\n"
1046 if $self->{'verbosity'};
1047 return ($link, "", 0);
1048 }
1049
1050 if ($before_hash =~ s@^((?:http|https|ftp|file|mms)://)@@i) {
1051 my $type = $1;
1052
1053 if ($link =~ m/^(http|ftp):/i) {
1054 # Turn url (using /) into file name (possibly using \ on windows)
1055 my @http_dir_split = split('/', $before_hash);
1056 $before_hash = &util::filename_cat(@http_dir_split);
1057 }
1058
1059 $before_hash = $self->eval_dir_dots($before_hash);
1060
1061 my $linkfilename = &util::filename_cat ($base_dir, $before_hash);
1062
1063 my $rl = 0;
1064 $rl = 1 if (-e $linkfilename);
1065
1066 # make sure there's a slash on the end if it's a directory
1067 if ($before_hash !~ m/\/$/) {
1068 $before_hash .= "/" if (-d $linkfilename);
1069 }
1070 return ($type . $before_hash, $hash_part, $rl);
1071
1072 } elsif ($link !~ m/^(mailto|news|gopher|nntp|telnet|javascript):/i && $link !~ m/^\//) {
1073
1074 if ($before_hash =~ s@^/@@ || $before_hash =~ m/\\/) {
1075
1076 # the first directory will be the domain name if file_is_url
1077 # to generate archives, otherwise we'll assume all files are
1078 # from the same site and base_dir is the root
1079
1080 if ($self->{'file_is_url'}) {
1081 my @dirs = split /[\/\\]/, $file;
1082 my $domname = shift (@dirs);
1083 $before_hash = &util::filename_cat($domname, $before_hash);
1084 $before_hash =~ s@\\@/@g; # for windows
1085 }
1086 else
1087 {
1088 # see if link shares directory with source document
1089 # => turn into relative link if this is so!
1090
1091 if ($ENV{'GSDLOS'} =~ m/^windows/i) {
1092 # too difficult doing a pattern match with embedded '\'s...
1093 my $win_before_hash=$before_hash;
1094 $win_before_hash =~ s@(\\)+@/@g;
1095 # $base_dir is already similarly "converted" on windows.
1096 if ($win_before_hash =~ s@^$base_dir/@@o) {
1097 # if this is true, we removed a prefix
1098 $before_hash=$win_before_hash;
1099 }
1100 }
1101 else {
1102 # before_hash has lost leading slash by this point,
1103 # -> add back in prior to substitution with $base_dir
1104 $before_hash = "/$before_hash";
1105
1106 $before_hash = &util::filename_cat("",$before_hash);
1107 $before_hash =~ s@^$base_dir/@@;
1108 }
1109 }
1110 } else {
1111 # Turn relative file path into full path
1112 my $dirname = &File::Basename::dirname($file);
1113 $before_hash = &util::filename_cat($dirname, $before_hash);
1114 $before_hash = $self->eval_dir_dots($before_hash);
1115 }
1116
1117 my $linkfilename = &util::filename_cat ($base_dir, $before_hash);
1118
1119
1120# print STDERR "**** linkfilename = $linkfilename\n";
1121# if (!&util::fd_exists($linkfilename)) {
1122# print STDERR "***** Warning: Could not find $linkfilename\n";
1123# }
1124
1125
1126 # make sure there's a slash on the end if it's a directory
1127 if ($before_hash !~ m/\/$/) {
1128 $before_hash .= "/" if (-d $linkfilename);
1129 }
1130
1131# print STDERR "*** returning: $before_hash\n";
1132
1133 return ("http://" . $before_hash, $hash_part, 1);
1134 } else {
1135 # mailto, news, nntp, telnet, javascript or gopher link
1136 return ($before_hash, "", 0);
1137 }
1138}
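# Rough summary of what format_link returns for different kinds of link
# (the example links are invented):
#
#   "http://example.org/a/b.html"  -> ("http://example.org/a/b.html", "", rl)
#                                     where rl is 1 only if that file exists
#                                     under $base_dir (i.e. it was mirrored)
#   "images/pic.gif"               -> ("http://<dir of $file>/images/pic.gif", "", 1)
#   "page.html#sec2"               -> the "#sec2" part is split off and returned
#                                     as the second element
#   "mailto:someone@example.org"   -> returned essentially unchanged, rl = 0
#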
1139
1140sub extract_first_NNNN_characters {
1141 my $self = shift (@_);
1142 my ($textref, $doc_obj, $thissection) = @_;
1143
1144 foreach my $size (split /,/, $self->{'first'}) {
1145 my $tmptext = $$textref;
1146 # skip to the body
1147 $tmptext =~ s/.*<body[^>]*>//i;
1148 # remove javascript
1149 $tmptext =~ s@<script.*?</script>@ @sig;
1150 $tmptext =~ s/<[^>]*>/ /g;
1151 $tmptext =~ s/&nbsp;/ /g;
1152 $tmptext =~ s/^\s+//;
1153 $tmptext =~ s/\s+$//;
1154 $tmptext =~ s/\s+/ /gs;
1155 $tmptext = &unicode::substr ($tmptext, 0, $size);
1156 $tmptext =~ s/\s\S*$/&#8230;/; # adds an ellipsis (...)
1157 $doc_obj->add_utf8_metadata ($thissection, "First$size", $tmptext);
1158 }
1159}
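# For example, if the 'first' option (set elsewhere in the plugin hierarchy;
# it is not declared in this file) is "50,100", the loop above adds "First50"
# and "First100" metadata to the section, each holding the leading characters
# of the tag-stripped body text with "&#8230;" (an ellipsis) appended.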
1160
1161
1162sub extract_metadata {
1163 my $self = shift (@_);
1164 my ($textref, $metadata, $doc_obj, $section) = @_;
1165 my $outhandle = $self->{'outhandle'};
1166 # if we don't want metadata, we may as well not be here ...
1167 return if (!defined $self->{'metadata_fields'});
1168
1169 my $separator = $self->{'metadata_field_separator'};
1170 if ($separator eq "") {
1171 undef $separator;
1172 }
1173
1174 # metadata fields to extract/save. 'key' is the (lowercase) name of the
1175 # html meta, 'value' is the metadata name for greenstone to use
1176 my %find_fields = ();
1177
1178 my %creator_fields = (); # short-cut for lookups
1179
1180
1181 foreach my $field (split /,/, $self->{'metadata_fields'}) {
1182 $field =~ s/^\s+//; # remove leading whitespace
1183 $field =~ s/\s+$//; # remove trailing whitespace
1184
1185 # support tag<tagname>
1186 if ($field =~ m/^(.*?)\s*<(.*?)>$/) {
1187 # "$2" is the user's preferred gs metadata name
1188 $find_fields{lc($1)}=$2; # lc = lowercase
1189 } else { # no <tagname> for mapping
1190 # "$field" is the user's preferred gs metadata name
1191 $find_fields{lc($field)}=$field; # lc = lowercase
1192 }
1193 }
1194
1195 if (defined $self->{'hunt_creator_metadata'} &&
1196 $self->{'hunt_creator_metadata'} == 1 ) {
1197 my @extra_fields =
1198 (
1199 'author',
1200 'author.email',
1201 'creator',
1202 'dc.creator',
1203 'dc.creator.corporatename',
1204 );
1205
1206 # add the creator_metadata fields to search for
1207 foreach my $field (@extra_fields) {
1208 $creator_fields{$field}=0; # add to lookup hash
1209 }
1210 }
1211
1212
1213 # find the header in the html file, which has the meta tags
1214 $$textref =~ m@<head>(.*?)</head>@si;
1215
1216 my $html_header=$1;
1217
1218 # go through every <meta... tag defined in the html and see if it is
1219 # one of the tags we want to match.
1220
1221 # special case for title - we want to remember if it's been found
1222 my $found_title = 0;
1223 # this assumes that ">" won't appear. (I don't think it's allowed to...)
1224 $html_header =~ m/^/; # match the start of the string, for \G assertion
1225
1226 while ($html_header =~ m/\G.*?<meta(.*?)>/sig) {
1227 my $metatag=$1;
1228 my ($tag, $value);
1229
1230 # find the tag name
1231 $metatag =~ m/(?:name|http-equiv)\s*=\s*([\"\'])?(.*?)\1/is;
1232 $tag=$2;
1233 # in case they're not using " or ', but they should...
1234 if (! $tag) {
1235 $metatag =~ m/(?:name|http-equiv)\s*=\s*([^\s\>]+)/is;
1236 $tag=$1;
1237 }
1238
1239 if (!defined $tag) {
1240 print $outhandle "HTMLPlugin: can't find NAME in \"$metatag\"\n";
1241 next;
1242 }
1243
1244 # don't need to assign this field if it was passed in from a previous
1245 # (recursive) plugin
1246 if (defined $metadata->{$tag}) {next}
1247
1248 # find the tag content
1249 $metatag =~ m/content\s*=\s*([\"\'])?(.*?)\1/is;
1250 $value=$2;
1251
1252 if (! $value) {
1253 $metatag =~ m/(?:name|http-equiv)\s*=\s*([^\s\>]+)/is;
1254 $value=$1;
1255 }
1256 if (!defined $value) {
1257 print $outhandle "HTMLPlugin: can't find VALUE in \"$metatag\"\n";
1258 next;
1259 }
1260
1261 # clean up and add
1262 $value =~ s/\s+/ /gs;
1263 chomp($value); # remove trailing \n, if any
1264 if (exists $creator_fields{lc($tag)}) {
1265 # map this value onto greenstone's "Creator" metadata
1266 $tag='Creator';
1267 } elsif (!exists $find_fields{lc($tag)}) {
1268 next; # don't want this tag
1269 } else {
1270 # get the user's preferred capitalisation
1271 $tag = $find_fields{lc($tag)};
1272 }
1273 if (lc($tag) eq "title") {
1274 $found_title = 1;
1275 }
1276
1277 if ($self->{'verbosity'} > 2) {
1278 print $outhandle " extracted \"$tag\" metadata \"$value\"\n";
1279 }
1280
1281 if ($tag =~ /\./) {
1282 # there is a . so has a namespace, add ex.
1283 $tag = "ex.$tag";
1284 }
1285 if (defined $separator) {
1286 my @values = split($separator, $value);
1287 foreach my $v (@values) {
1288 $doc_obj->add_utf8_metadata($section, $tag, $v) if $v =~ /\S/;
1289 }
1290 }
1291 else {
1292 $doc_obj->add_utf8_metadata($section, $tag, $value);
1293 }
1294 }
1295
1296 # TITLE: extract the document title
1297 if (exists $find_fields{'title'} && !$found_title) {
1298 # we want a title, and didn't find one in the meta tags
1299 # see if there's a <title> tag
1300 my $title;
1301 my $from = ""; # for debugging output only
1302 if ($html_header =~ m/<title[^>]*>([^<]+)<\/title[^>]*>/is) {
1303 $title = $1;
1304 $from = "<title> tags";
1305 }
1306
1307 if (!defined $title) {
1308 $from = "first 100 chars";
1309 # if no title use first 100 or so characters
1310 $title = $$textref;
1311 $title =~ s/^\xFE\xFF//; # Remove unicode byte order mark
1312 $title =~ s/^.*?<body>//si;
1313 # ignore javascript!
1314 $title =~ s@<script.*?</script>@ @sig;
1315 $title =~ s/<\/([^>]+)><\1>//g; # (eg) </b><b> - no space
1316 $title =~ s/<[^>]*>/ /g; # remove all HTML tags
1317 $title = substr ($title, 0, 100);
1318 $title =~ s/\s\S*$/.../;
1319 }
1320 $title =~ s/<[^>]*>/ /g; # remove html tags
1321 $title =~ s/&nbsp;/ /g;
1322 $title =~ s/(?:&nbsp;|\xc2\xa0)/ /g; # utf-8 for nbsp...
1323 $title =~ s/\s+/ /gs; # collapse multiple spaces
1324 $title =~ s/^\s*//; # remove leading spaces
1325 $title =~ s/\s*$//; # remove trailing spaces
1326
1327 $title =~ s/^$self->{'title_sub'}// if ($self->{'title_sub'});
1328 $title =~ s/^\s+//s; # in case title_sub introduced any...
1329 $doc_obj->add_utf8_metadata ($section, "Title", $title);
1330 print $outhandle " extracted Title metadata \"$title\" from $from\n"
1331 if ($self->{'verbosity'} > 2);
1332 }
1333
1334 # add FileFormat metadata
1335 $doc_obj->add_metadata($section,"FileFormat", "HTML");
1336
1337 # Special, for metadata names such as tagH1 - extracts
1338 # the text between the first <H1> and </H1> tags into "H1" metadata.
1339
1340 foreach my $field (keys %find_fields) {
1341 if ($field !~ m/^tag([a-z0-9]+)$/i) {next}
1342 my $tag = $1;
1343 if ($$textref =~ m@<$tag[^>]*>(.*?)</$tag[^>]*>@g) {
1344 my $content = $1;
1345 $content =~ s/&nbsp;/ /g;
1346 $content =~ s/<[^>]*>/ /g;
1347 $content =~ s/^\s+//;
1348 $content =~ s/\s+$//;
1349 $content =~ s/\s+/ /gs;
1350 if ($content) {
1351 $tag=$find_fields{"tag$tag"}; # get the user's capitalisation
1352 $tag =~ s/^tag//i;
1353 $doc_obj->add_utf8_metadata ($section, $tag, $content);
1354 print $outhandle " extracted \"$tag\" metadata \"$content\"\n"
1355 if ($self->{'verbosity'} > 2);
1356 }
1357 }
1358 }
1359}
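# Illustrative example of the mapping above, with invented values. With
# -metadata_fields "Title,Author<Creator>,tagH1", a page containing
#
#   <meta name="Author" content="A. Nonymous">
#   <title>Annual Report</title>
#   <h1>Overview</h1>
#
# would end up with   Creator = "A. Nonymous"   (renamed via the <...> form),
#                     Title   = "Annual Report" (from <title>, since no meta
#                                                Title tag was found),
#                     H1      = "Overview"      (via the special tagH1 form).
#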
1360
1361
1362# evaluate any "../" as a move up to the parent directory
1363# evaluate any "./" as the current directory
1364sub eval_dir_dots {
1365 my $self = shift (@_);
1366 my ($filename) = @_;
1367 my $dirsep_os = &util::get_os_dirsep();
1368 my @dirsep = split(/$dirsep_os/,$filename);
1369
1370 my @eval_dirs = ();
1371 foreach my $d (@dirsep) {
1372 if ($d eq "..") {
1373 pop(@eval_dirs);
1374
1375 } elsif ($d eq ".") {
1376 # do nothing!
1377
1378 } else {
1379 push(@eval_dirs,$d);
1380 }
1381 }
1382
1383 # Need to fiddle with number of elements in @eval_dirs if the
1384 # first one is the empty string. This is because of a
1385# modification to util::filename_cat that suppresses the addition
1386 # of a leading '/' character (or \ if windows) (intended to help
1387 # filename cat with relative paths) if the first entry in the
1388 # array is the empty string. Making the array start with *two*
1389 # empty strings is a way to defeat this "smart" option.
1390 #
1391 if (scalar(@eval_dirs) > 0) {
1392 if ($eval_dirs[0] eq ""){
1393 unshift(@eval_dirs,"");
1394 }
1395 }
1396
1397 my $evaluated_filename = (scalar @eval_dirs > 0) ? &util::filename_cat(@eval_dirs) : "";
1398 return $evaluated_filename;
1399}
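# Example of the evaluation above (invented, Unix-style path):
#
#   $self->eval_dir_dots("archive/../images/./logo.gif");  # -> "images/logo.gif"
#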
1400
1401sub replace_usemap_links {
1402 my $self = shift (@_);
1403 my ($front, $link, $back) = @_;
1404
1405 # remove quotes from link at start and end if necessary
1406 if ($link=~/^[\"\']/) {
1407 $link=~s/^[\"\']//;
1408 $link=~s/[\"\']$//;
1409 $front.='"';
1410 $back="\"$back";
1411 }
1412
1413 $link =~ s/^\.\///;
1414 return $front . $link . $back;
1415}
1416
1417sub inc_filecount {
1418 my $self = shift (@_);
1419
1420 if ($self->{'file_num'} == 1000) {
1421 $self->{'dir_num'} ++;
1422 $self->{'file_num'} = 0;
1423 } else {
1424 $self->{'file_num'} ++;
1425 }
1426}
1427
1428
1429# Extend read_file so that strings like &eacute; are
1430# converted to UTF8 internally.
1431#
1432# We don't convert &lt; or &gt; or &amp; or &quot; in case
1433# they interfere with the GML files
1434
1435sub read_file {
1436 my $self = shift(@_);
1437 my ($filename, $encoding, $language, $textref) = @_;
1438
1439 $self->SUPER::read_file($filename, $encoding, $language, $textref);
1440
1441 # Convert entities to their Unicode code-point equivalents
1442 $$textref =~ s/&(lt|gt|amp|quot|nbsp);/&z$1;/go;
1443 $$textref =~ s/&([^;]+);/&ghtml::getcharequiv($1,1,1)/gseo;
1444 $$textref =~ s/&z(lt|gt|amp|quot|nbsp);/&$1;/go;
1445
1446}
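# For example, after this read_file runs, "&eacute;" in the source has become
# the character "é", while "&lt;", "&gt;", "&amp;", "&quot;" and "&nbsp;" are
# deliberately left as entities so they cannot be mistaken for real markup
# later on.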
1447
1448sub HB_read_html_file {
1449 my $self = shift (@_);
1450 my ($htmlfile, $text) = @_;
1451
1452 # load in the file
1453 if (!open (FILE, $htmlfile)) {
1454 print STDERR "ERROR - could not open $htmlfile\n";
1455 return;
1456 }
1457
1458 my $foundbody = 0;
1459 $self->HB_gettext (\$foundbody, $text, "FILE");
1460 close FILE;
1461
1462 # just in case there was no <body> tag
1463 if (!$foundbody) {
1464 $foundbody = 1;
1465 open (FILE, $htmlfile) || return;
1466 $self->HB_gettext (\$foundbody, $text, "FILE");
1467 close FILE;
1468 }
1469 # text is in utf8
1470}
1471
1472# converts the text to utf8, as ghtml does that for &eacute; etc.
1473sub HB_gettext {
1474 my $self = shift (@_);
1475 my ($foundbody, $text, $handle) = @_;
1476
1477 my $line = "";
1478 while (defined ($line = <$handle>)) {
1479 # look for body tag
1480 if (!$$foundbody) {
1481 if ($line =~ s/^.*<body[^>]*>//i) {
1482 $$foundbody = 1;
1483 } else {
1484 next;
1485 }
1486 }
1487
1488 # check for symbol fonts
1489 if ($line =~ m/<font [^>]*?face\s*=\s*\"?(\w+)\"?/i) {
1490 my $font = $1;
1491 print STDERR "HBPlug::HB_gettext - warning removed font $font\n"
1492 if ($font !~ m/^arial$/i);
1493 }
1494
1495 $$text .= $line;
1496 }
1497
1498 if ($self->{'input_encoding'} eq "iso_8859_1") {
1499 # convert to utf-8
1500 $$text=&unicode::unicode2utf8(&unicode::convert2unicode("iso_8859_1", $text));
1501 }
1502 # convert any alphanumeric character entities to their utf-8
1503 # equivalent for indexing purposes
1504 #&ghtml::convertcharentities ($$text);
1505
1506 $$text =~ s/\s+/ /g; # remove \n's
1507
1508 # At this point $$text is a binary byte string
1509 # => turn it into a Unicode aware string, so full
1510 # Unicode aware pattern matching can be used.
1511 # For instance: 's/\x{0101}//g' or '[[:upper:]]'
1512 #
1513
1514 $$text = decode("utf8",$$text);
1515}
1516
1517sub HB_clean_section {
1518 my $self = shift (@_);
1519 my ($section) = @_;
1520
1521 # remove tags without a starting tag from the section
1522 my ($tag, $tagstart);
1523 while ($section =~ m/<\/([^>]{1,10})>/) {
1524 $tag = $1;
1525 $tagstart = index($section, "<$tag");
1526 last if (($tagstart >= 0) && ($tagstart < index($section, "<\/$tag")));
1527 $section =~ s/<\/$tag>//;
1528 }
1529
1530 # remove extra paragraph tags
1531 while ($section =~ s/<p\b[^>]*>\s*<p\b/<p/ig) {}
1532
1533 # remove extra stuff at the end of the section
1534 while ($section =~ s/(<u>|<i>|<b>|<p\b[^>]*>|&nbsp;|\s)$//i) {}
1535
1536 # add a newline at the beginning of each paragraph
1537 $section =~ s/(.)\s*<p\b/$1\n\n<p/gi;
1538
1539 # add a newline every 80 characters at a word boundary
1540 # Note: this regular expression puts a line feed before
1541 # the last word in each section, even when it is not
1542 # needed.
1543 $section =~ s/(.{1,80})\s/$1\n/g;
1544
1545 # fix up the image links
1546 $section =~ s/<img[^>]*?src=\"?([^\">]+)\"?[^>]*>/
1547 <center><img src=\"$1\" \/><\/center><br\/>/ig;
1548 $section =~ s/&lt;&lt;I&gt;&gt;\s*([^\.]+\.(png|jpg|gif))/
1549 <center><img src=\"$1\" \/><\/center><br\/>/ig;
1550
1551 return $section;
1552}
1553
1554# Will convert the oldHDL format to the new HDL format (using the Section tag)
1555sub convert_to_newHDLformat
1556{
1557 my $self = shift (@_);
1558 my ($file,$cnfile) = @_;
1559 my $input_filename = $file;
1560 my $tmp_filename = $cnfile;
1561
1562 # write HTML tmp file with new HDL format
1563 open (PROD, ">$tmp_filename") || die("Error Writing to File: $tmp_filename $!");
1564
1565 # read in the file and do basic html cleaning (removing header etc)
1566 my $html = "";
1567 $self->HB_read_html_file ($input_filename, \$html);
1568
1569 # process the file one section at a time
1570 my $curtoclevel = 1;
1571 my $firstsection = 1;
1572 my $toclevel = 0;
1573 while (length ($html) > 0) {
1574 if ($html =~ s/^.*?(?:<p\b[^>]*>)?((<b>|<i>|<u>|\s)*)&lt;&lt;TOC(\d+)&gt;&gt;\s*(.*?)<p\b/<p/i) {
1575 $toclevel = $3;
1576 my $title = $4;
1577 my $sectiontext = "";
1578 if ($html =~ s/^(.*?)((?:<p\b[^>]*>)?((<b>|<i>|<u>|\s)*)&lt;&lt;TOC\d+&gt;&gt;)/$2/i) {
1579 $sectiontext = $1;
1580 } else {
1581 $sectiontext = $html;
1582 $html = "";
1583 }
1584
1585 # remove tags and extra spaces from the title
1586 $title =~ s/<\/?[^>]+>//g;
1587 $title =~ s/^\s+|\s+$//g;
1588
1589 # close any sections below the current level and
1590 # create a new section (special case for the firstsection)
1591 print PROD "<!--\n";
1592 while (($curtoclevel > $toclevel) ||
1593 (!$firstsection && $curtoclevel == $toclevel)) {
1594 $curtoclevel--;
1595 print PROD "</Section>\n";
1596 }
1597 if ($curtoclevel+1 < $toclevel) {
1598 print STDERR "WARNING - jump in toc levels in $input_filename " .
1599 "from $curtoclevel to $toclevel\n";
1600 }
1601 while ($curtoclevel < $toclevel) {
1602 $curtoclevel++;
1603 }
1604
1605 if ($curtoclevel == 1) {
1606 # add the header tag
1607 print PROD "-->\n";
1608 print PROD "<HTML>\n<HEAD>\n<TITLE>$title</TITLE>\n</HEAD>\n<BODY>\n";
1609 print PROD "<!--\n";
1610 }
1611
1612 print PROD "<Section>\n\t<Description>\n\t\t<Metadata name=\"Title\">$title</Metadata>\n\t</Description>\n";
1613
1614 print PROD "-->\n";
1615
1616 # clean up the section html
1617 $sectiontext = $self->HB_clean_section($sectiontext);
1618
1619 print PROD "$sectiontext\n";
1620
1621 } else {
1622 print STDERR "WARNING - leftover text\n" , $self->shorten($html),
1623 "\nin $input_filename\n";
1624 last;
1625 }
1626 $firstsection = 0;
1627 }
1628
1629 print PROD "<!--\n";
1630 while ($curtoclevel > 0) {
1631 $curtoclevel--;
1632 print PROD "</Section>\n";
1633 }
1634 print PROD "-->\n";
1635
1636 close (PROD) || die("Error Closing File: $tmp_filename $!");
1637
1638 return $tmp_filename;
1639}
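# Rough sketch of the old HDL -> new HDL rewrite performed above (the input
# line is made up, and in the real output the Section/Description tags are
# additionally wrapped in <!-- ... --> comment markers):
#
#   old HDL :  <p><b>&lt;&lt;TOC1&gt;&gt;</b> Chapter One<p> body text ...
#
#   new HDL :  <HTML><HEAD><TITLE>Chapter One</TITLE></HEAD><BODY>
#              <Section>
#                <Description>
#                  <Metadata name="Title">Chapter One</Metadata>
#                </Description>
#                ... cleaned body text ...
#              </Section>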
1640
1641sub shorten {
1642 my $self = shift (@_);
1643 my ($text) = @_;
1644
1645 return "\"$text\"" if (length($text) < 100);
1646
1647 return "\"" . substr ($text, 0, 50) . "\" ... \"" .
1648 substr ($text, length($text)-50) . "\"";
1649}
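# e.g. $self->shorten("a" x 300) returns the first 50 and last 50 characters
# joined by " ... ", which keeps the "leftover text" warning above readable.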
1650
1651sub convert_tidy_or_oldHDL_file
1652{
1653 my $self = shift (@_);
1654 my ($file) = @_;
1655 my $input_filename = $file;
1656
1657 if (-d $input_filename)
1658 {
1659 return $input_filename;
1660 }
1661
1662 # split the input filename into name, path and suffix
1663 my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
1664 my $base_dirname = $dirname;
1665 $suffix = lc($suffix);
1666
1667 # derive tmp filename from input filename
1668 # Strip whitespace, dots and hyphens from the filename itself -- little
1669 # risk of a name collision, and it makes later conversion by external
1670 # utilities simpler. Spaces in the path are left alone.
1671 $tailname =~ s/\s+//g;
1672 $tailname =~ s/\.+//g;
1673 $tailname =~ s/\-+//g;
1674 # convert to utf-8 otherwise we have problems with the doc.xml file
1675 # later on
1676 &unicode::ensure_utf8(\$tailname);
1677
1678 # use the collection's "tidytmp" directory (creating it if necessary)
1679 my $tmp_dirname = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "tidytmp");
1680 &util::mk_dir($tmp_dirname) if (!-e $tmp_dirname);
1681
1682 my $test_dirname = "";
1683 my $f_separator = &util::get_os_dirsep();
1684
1685 if ($dirname =~ m/import$f_separator/)
1686 {
1687 $test_dirname = $'; #'
1688
1689 #print STDERR "init $'\n";
1690
1691 while ($test_dirname =~ m/[$f_separator]/)
1692 {
1693 my $folderdirname = $`;
1694 $tmp_dirname = &util::filename_cat($tmp_dirname,$folderdirname);
1695 &util::mk_dir($tmp_dirname) if (!-e $tmp_dirname);
1696 $test_dirname = $'; #'
1697 }
1698 }
1699
1700 my $tmp_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix");
1701
1702 # tidy or convert the input file if it is an HTML-like file or is accepted by the process_exp
1703 if (($suffix eq ".htm") || ($suffix eq ".html") || ($suffix eq ".shtml"))
1704 {
1705 #convert the input file to a new style HDL
1706 my $hdl_output_filename = $input_filename;
1707 if ($self->{'old_style_HDL'})
1708 {
1709 $hdl_output_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix");
1710 $hdl_output_filename = $self->convert_to_newHDLformat($input_filename,$hdl_output_filename);
1711 }
1712
1713 # copy every other file from the base dir to the tmp dir if it is not already there
1714 opendir(DIR,$base_dirname) or die "Can't open base directory : $base_dirname!";
1715 my @files = grep {!/^\.+$/} readdir(DIR);
1716 closedir(DIR);
1717
1718 foreach my $file (@files)
1719 {
1720 my $src_file = &util::filename_cat($base_dirname,$file);
1721 my $dest_file = &util::filename_cat($tmp_dirname,$file);
1722 if ((!-e $dest_file) && (!-d $src_file))
1723 {
1724 # just copy the original file into the tmp directory
1725 copy($src_file,$dest_file) or die "Can't copy file $src_file to $dest_file $!";
1726 }
1727 }
1728
1729 # tidy the input file
1730 my $tidy_output_filename = $hdl_output_filename;
1731 if ($self->{'use_realistic_book'})
1732 {
1733 $tidy_output_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix");
1734 $tidy_output_filename = $self->tmp_tidy_file($hdl_output_filename,$tidy_output_filename);
1735 }
1736 $tmp_filename = $tidy_output_filename;
1737 }
1738 else
1739 {
1740 if (!-e $tmp_filename)
1741 {
1742 # just copy the original file into the tmp directory
1743 copy($input_filename,$tmp_filename) or die "Can't copy file $input_filename to $tmp_filename $!";
1744 }
1745 }
1746
1747 return $tmp_filename;
1748}
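# Path mapping sketch for the copy above (paths are illustrative):
#
#   <collection>/import/books/my doc 1.html
#       -> <collection>/tidytmp/books/mydoc1.html
#
# i.e. the directory structure below "import" is mirrored under the
# collection's "tidytmp" directory, with whitespace, dots and hyphens
# stripped from the filename itself.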
1749
1750
1751 # Turns the HTML input file into a proper XML file, with the font tags removed
1752 # and image sizes added to the img tags.
1753# The tidying process takes place in a collection specific 'tmp' directory so
1754# that we don't accidentally damage the input.
1755sub tmp_tidy_file
1756{
1757 my $self = shift (@_);
1758 my ($file,$cnfile) = @_;
1759 my $input_filename = $file;
1760 my $tmp_filename = $cnfile;
1761
1762 # split the input filename into name, path and suffix
1763 my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
1764
1765 require HTML::TokeParser::Simple;
1766
1767 # create HTML parser to decode the input file
1768 my $parser = HTML::TokeParser::Simple->new($input_filename);
1769
1770 # write the HTML tmp file with the font tags removed and image sizes added to the img tags
1771 open (PROD, ">$tmp_filename") || die("Error Writing to File: $tmp_filename $!");
1772 while (my $token = $parser->get_token())
1773 {
1774 # is it an img tag
1775 if ($token->is_start_tag('img'))
1776 {
1777 # get the attributes
1778 my $attr = $token->return_attr;
1779
1780 # get the full path to the image
1781 my $img_file = &util::filename_cat($dirname,$attr->{src});
1782
1783 # set the width and height attribute
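# (Image::Size::imgsize returns the width and height in pixels;
# if the image file cannot be read, the width comes back undefined)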
1784 ($attr->{width}, $attr->{height}) = imgsize($img_file);
1785
1786 # recreate the tag
1787 print PROD "<img";
1788 print PROD map { qq { $_="$attr->{$_}"} } keys %$attr;
1789 print PROD ">";
1790 }
1791 # is it a font tag
1792 else
1793 {
1794 if (($token->is_start_tag('font')) || ($token->is_end_tag('font')))
1795 {
1796 # remove font tag
1797 print PROD "";
1798 }
1799 else
1800 {
1801 # print without changes
1802 print PROD $token->as_is;
1803 }
1804 }
1805 }
1806 close (PROD) || die("Error Closing File: $tmp_filename $!");
1807
1808 # run html-tidy on the tmp file to make it a proper XML file
1809
1810 my $outhandle = $self->{'outhandle'};
1811 print $outhandle "Converting HTML to be XML compliant:\n";
1812
1813 my $tidy_cmd = "tidy";
1814 $tidy_cmd .= " -q" if ($self->{'verbosity'} <= 2);
1815 $tidy_cmd .= " -raw -wrap 0 -asxml \"$tmp_filename\"";
1816 if ($self->{'verbosity'} <= 2) {
1817 if ($ENV{'GSDLOS'} =~ m/^windows/i) {
1818 $tidy_cmd .= " 2>nul";
1819 }
1820 else {
1821 $tidy_cmd .= " 2>/dev/null";
1822 }
1823 print $outhandle " => $tidy_cmd\n";
1824 }
1825
1826 my $tidyfile = `$tidy_cmd`;
1827
1828 # write result back to the tmp file
1829 open (PROD, ">$tmp_filename") || die("Error Writing to File: $tmp_filename $!");
1830 print PROD $tidyfile;
1831 close (PROD) || die("Error Closing File: $tmp_filename $!");
1832
1833 # return the output filename
1834 return $tmp_filename;
1835}
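# A typical command line generated above looks like this (path illustrative):
#
#   tidy -q -raw -wrap 0 -asxml "/my/collect/tidytmp/mydoc.html" 2>/dev/null
#
# i.e. run quietly, don't re-wrap long lines, and emit XML/XHTML, with tidy's
# own messages discarded at low verbosity.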
1836
1837sub associate_cover_image
1838{
1839 my $self = shift(@_);
1840 my ($doc_obj, $filename) = @_;
1841 if (($self->{'use_realistic_book'}) || ($self->{'old_style_HDL'}))
1842 {
1843 # the cover image will be in tidytmp, but we want the one from import
1844 $filename =~ s/([\\\/])tidytmp([\\\/])/$1import$2/;
1845 }
1846 $self->SUPER::associate_cover_image($doc_obj, $filename);
1847}
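# e.g. ".../tidytmp/books/mydoc.html" is mapped back to
# ".../import/books/mydoc.html" (illustrative paths) before the inherited
# associate_cover_image goes looking for a matching cover image.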
1848
1849
1850 1;