source: main/trunk/greenstone2/perllib/plugins/HTMLPlugin.pm@24349

Last change on this file since 24349 was 23835, checked in by ak19, 13 years ago

Dr Bainbridge fixed an interlinking failure on Mac OS that occurred when filenames (and therefore links to files on the system) contain characters that don't occur in English. The problem was that the URL obtained from the href in the HTML page did not match the URL-encoded, normalised (decomposed) URL stored in the doc.xml and the gdb database. The latter is the right form to store the URL in, since it refers accurately to the file as it exists on the system. So the former was changed by applying normalization with decomposition to the href link at the appropriate point in HTMLPlugin's replace_href_links.
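
The essence of that decomposition step, as a minimal standalone sketch (the filename is hypothetical; Encode and Unicode::Normalize are the modules the plugin itself uses):

    use Encode;
    use Unicode::Normalize 'normalize';

    my $href  = "caf\x{e9}.html";        # href as it appears in the page: precomposed U+00E9
    my $nfd   = normalize('D', $href);   # Normalization Form D: "e" followed by a combining acute
    my $bytes = encode("utf8", $nfd);    # raw UTF-8 bytes, matching the decomposed filename
                                         # that HFS+ stores on disk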

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 63.1 KB
1###########################################################################
2#
3# HTMLPlugin.pm -- basic html plugin
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 1999 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27#
28# Note that this plugin handles frames only in a very simple way
29# i.e. each frame is treated as a separate document. This means
30# search results will contain links to individual frames rather
31# than linking to the top level frameset.
32# There may also be some problems caused by the _parent target
33# (it's removed by this plugin)
34#
35
36package HTMLPlugin;
37
38use Encode;
39use Unicode::Normalize 'normalize';
40
41use ReadTextFile;
42use HBPlugin;
43use ghtml;
44use unicode;
45use util;
46use XMLParser;
47
48use Image::Size;
49use File::Copy;
50
51sub BEGIN {
52 @HTMLPlugin::ISA = ('ReadTextFile', 'HBPlugin');
53}
54
55use strict; # every perl program should have this!
56no strict 'refs'; # make an exception so we can use variables as filehandles
57
58my $arguments =
59 [ { 'name' => "process_exp",
60 'desc' => "{BasePlugin.process_exp}",
61 'type' => "regexp",
62 'deft' => &get_default_process_exp() },
63 { 'name' => "block_exp",
64 'desc' => "{BasePlugin.block_exp}",
65 'type' => 'regexp',
66 'deft' => &get_default_block_exp() },
67 { 'name' => "nolinks",
68 'desc' => "{HTMLPlugin.nolinks}",
69 'type' => "flag" },
70 { 'name' => "keep_head",
71 'desc' => "{HTMLPlugin.keep_head}",
72 'type' => "flag" },
73 { 'name' => "no_metadata",
74 'desc' => "{HTMLPlugin.no_metadata}",
75 'type' => "flag" },
76 { 'name' => "metadata_fields",
77 'desc' => "{HTMLPlugin.metadata_fields}",
78 'type' => "string",
79 'deft' => "Title" },
80 { 'name' => "metadata_field_separator",
81 'desc' => "{HTMLPlugin.metadata_field_separator}",
82 'type' => "string",
83 'deft' => "" },
84 { 'name' => "hunt_creator_metadata",
85 'desc' => "{HTMLPlugin.hunt_creator_metadata}",
86 'type' => "flag" },
87 { 'name' => "file_is_url",
88 'desc' => "{HTMLPlugin.file_is_url}",
89 'type' => "flag" },
90 { 'name' => "assoc_files",
91 'desc' => "{HTMLPlugin.assoc_files}",
92 'type' => "regexp",
93 'deft' => &get_default_block_exp() },
94 { 'name' => "rename_assoc_files",
95 'desc' => "{HTMLPlugin.rename_assoc_files}",
96 'type' => "flag" },
97 { 'name' => "title_sub",
98 'desc' => "{HTMLPlugin.title_sub}",
99 'type' => "string",
100 'deft' => "" },
101 { 'name' => "description_tags",
102 'desc' => "{HTMLPlugin.description_tags}",
103 'type' => "flag" },
104 # retain this for backward compatibility (w3mir option was replaced by
105 # file_is_url)
106 { 'name' => "w3mir",
107# 'desc' => "{HTMLPlugin.w3mir}",
108 'type' => "flag",
109 'hiddengli' => "yes"},
110 { 'name' => "no_strip_metadata_html",
111 'desc' => "{HTMLPlugin.no_strip_metadata_html}",
112 'type' => "string",
113 'deft' => "",
114 'reqd' => "no"},
115 { 'name' => "sectionalise_using_h_tags",
116 'desc' => "{HTMLPlugin.sectionalise_using_h_tags}",
117 'type' => "flag" },
118 { 'name' => "use_realistic_book",
119 'desc' => "{HTMLPlugin.tidy_html}",
120 'type' => "flag"},
121 { 'name' => "old_style_HDL",
122 'desc' => "{HTMLPlugin.old_style_HDL}",
123 'type' => "flag"},
124 {'name' => "processing_tmp_files",
125 'desc' => "{BasePlugin.processing_tmp_files}",
126 'type' => "flag",
127 'hiddengli' => "yes"}
128 ];
129
130my $options = { 'name' => "HTMLPlugin",
131 'desc' => "{HTMLPlugin.desc}",
132 'abstract' => "no",
133 'inherits' => "yes",
134 'args' => $arguments };
135
136
137sub new {
138 my ($class) = shift (@_);
139 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
140 push(@$pluginlist, $class);
141
142 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
143 push(@{$hashArgOptLists->{"OptList"}},$options);
144
145
146 my $self = new ReadTextFile($pluginlist,$inputargs,$hashArgOptLists);
147
148 if ($self->{'w3mir'}) {
149 $self->{'file_is_url'} = 1;
150 }
151 $self->{'aux_files'} = {};
152 $self->{'dir_num'} = 0;
153 $self->{'file_num'} = 0;
154
155 return bless $self, $class;
156}
157
158# may want to use (?i)\.(gif|jpe?g|jpe|png|css|js(?:@.*)?)$
159 # if we have e.g. <script language="javascript" src="img/lib.js@123">
160# blocking is now done by reading through the file and recording all the
161# images and other files
162sub get_default_block_exp {
163 my $self = shift (@_);
164
165 #return q^(?i)\.(gif|jpe?g|jpe|jpg|png|css)$^;
166 return "";
167}
168
169sub get_default_process_exp {
170 my $self = shift (@_);
171
172 # the last option is an attempt to encode the concept of an html query ...
173 return q^(?i)(\.html?|\.shtml|\.shm|\.asp|\.php\d?|\.cgi|.+\?.+=.*)$^;
174}
175
176sub store_block_files
177{
178 my $self =shift (@_);
179 my ($filename_full_path, $block_hash) = @_;
180
181 my $html_fname = $filename_full_path;
182
183 my ($language, $content_encoding) = $self->textcat_get_language_encoding ($filename_full_path);
184 $self->{'store_content_encoding'}->{$filename_full_path} = $content_encoding;
185
186 # read in file ($text will be in utf8)
187 my $raw_text = "";
188 $self->read_file_no_decoding($filename_full_path, \$raw_text);
189
190 my $textref = \$raw_text;
191 my $opencom = '(?:<!--|&lt;!(?:&mdash;|&#151;|--))';
192 my $closecom = '(?:-->|(?:&mdash;|&#151;|--)&gt;)';
193 $$textref =~ s/$opencom(.*?)$closecom//gs;
194
195 # Convert entities to their UTF8 equivalents
196 $$textref =~ s/&(lt|gt|amp|quot|nbsp);/&z$1;/go;
197 $$textref =~ s/&([^;]+);/&ghtml::getcharequiv($1,1,0)/gseo; # on this occasion, want it left as utf8
198 $$textref =~ s/&z(lt|gt|amp|quot|nbsp);/&$1;/go;
199
200 my $attval = "\\\"[^\\\"]+\\\"|[^\\s>]+";
201 my @img_matches = ($$textref =~ m/<img[^>]*?src\s*=\s*($attval)[^>]*>/igs);
202 my @usemap_matches = ($$textref =~ m/<img[^>]*?usemap\s*=\s*($attval)[^>]*>/igs);
203 my @link_matches = ($$textref =~ m/<link[^>]*?href\s*=\s*($attval)[^>]*>/igs);
204 my @embed_matches = ($$textref =~ m/<embed[^>]*?src\s*=\s*($attval)[^>]*>/igs);
205 my @tabbg_matches = ($$textref =~ m/<(?:body|table|tr|td)[^>]*?background\s*=\s*($attval)[^>]*>/igs);
206 my @script_matches = ($$textref =~ m/<script[^>]*?src\s*=\s*($attval)[^>]*>/igs);
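 # The arrays above hold the raw (possibly still quoted) attribute values of
 # src/href/background/usemap attributes; each value is treated as a link to a
 # support file so that the file can be blocked from separate processing here
 # and associated with this document later on.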
207
208 if(!defined $self->{'unicode_to_original_filename'}) {
209 # maps from utf8 converted link name -> original filename referred to by (possibly URL-encoded) src url
210 $self->{'unicode_to_original_filename'} = {};
211 }
212
213 foreach my $raw_link (@img_matches, @usemap_matches, @link_matches, @embed_matches, @tabbg_matches, @script_matches) {
214
215 # remove quotes from link at start and end if necessary
216 if ($raw_link =~ m/^\"/) {
217 $raw_link =~ s/^\"//;
218 $raw_link =~ s/\"$//;
219 }
220
221 # remove any anchor names, e.g. foo.html#name becomes foo.html
222 # but watch out for any #'s that are part of entities, such as &#x3B1;
223 $raw_link =~ s/([^&])\#.*$/$1/s;
224
225 # some links may just be anchor names
226 next unless ($raw_link =~ /\S+/);
227
228 if ($raw_link !~ m@^/@ && $raw_link !~ m/^([A-Z]:?)\\/i) {
229 # Turn relative file path into full path
230 my $dirname = &File::Basename::dirname($filename_full_path);
231 $raw_link = &util::filename_cat($dirname, $raw_link);
232 }
233 $raw_link = $self->eval_dir_dots($raw_link);
234
235 # this is the actual filename on the filesystem (that the link refers to)
236 my $url_original_filename = $self->opt_url_decode($raw_link);
237
238 my ($uses_bytecodes,$exceeds_bytecodes) = &unicode::analyze_raw_string($url_original_filename);
239
240 if ($exceeds_bytecodes) {
241 # We have a link to a file name that is more complicated than a raw byte filename
242 # What we do next depends on the operating system we are on
243
244 if ($ENV{'GSDLOS'} =~ /^(linux|solaris)$/i) {
245 # Assume we're dealing with a UTF-8 encoded filename
246 $url_original_filename = encode("utf8", $url_original_filename);
247 }
248 elsif ($ENV{'GSDLOS'} =~ /^darwin$/i) {
249 # HFS+ is UTF8 with decomposition
250 $url_original_filename = encode("utf8", $url_original_filename);
251 $url_original_filename = normalize('D', $url_original_filename); # Normalization Form D (decomposition)
252 }
253 elsif ($ENV{'GSDLOS'} =~ /^windows$/i) {
254 # Don't need to do anything as later code maps Windows
255 # unicode filenames to DOS short filenames when needed
256 }
257 else {
258 my $outhandle = $self->{'outhandle'};
259 print $outhandle "Warning: Unrecognized operating system ", $ENV{'GSDLOS'}, "\n";
260 print $outhandle " in raw file system encoding of: $raw_link\n";
261 print $outhandle " Assuming filesystem is UTF-8 based.\n";
262 $url_original_filename = encode("utf8", $url_original_filename);
263 }
264 }
265
266 # Convert the (currently raw) link into its Unicode version.
267 # Store the Unicode link along with the url_original_filename
268 my $unicode_url_original_filename = "";
269 $self->decode_text($raw_link,$content_encoding,$language,\$unicode_url_original_filename);
270
271
272 $self->{'unicode_to_original_filename'}->{$unicode_url_original_filename} = $url_original_filename;
273
274
275 if ($url_original_filename ne $unicode_url_original_filename) {
276 my $outhandle = $self->{'outhandle'};
277
278 print $outhandle "URL Encoding $url_original_filename\n";
279 print $outhandle " ->$unicode_url_original_filename\n";
280
281 # Allow for possibility of raw byte version and Unicode versions of file
282 &util::block_filename($block_hash,$unicode_url_original_filename);
283 }
284
285 # $url_original_filename = &util::upgrade_if_dos_filename($url_original_filename);
286 &util::block_filename($block_hash,$url_original_filename);
287
288 }
289}
290
291# Given a filename in any encoding, will URL decode it to get back the original filename
292# in the original encoding. Because this method is intended to work out the *original*
293# filename, it does not URL decode any filename if a file by the name of the *URL-encoded*
294# string already exists in the local folder.
295#
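# For example (hypothetical filename): a link to "caf%C3%A9.html" is URL-decoded
# to the raw bytes of "café.html", unless a file literally named "caf%C3%A9.html"
# exists at that path, in which case the link is returned untouched.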
296sub opt_url_decode {
297 my $self = shift (@_);
298 my ($raw_link) = @_;
299
300
301 # Replace %XX's in URL with decoded value if required.
302 # Note that the filename may include the %XX in some situations
303
304## if ($raw_link =~ m/\%[A-F0-9]{2}/i) {
305
306 if (($raw_link =~ m/\%[A-F0-9]{2}/i) || ($raw_link =~ m/\&\#x[0-9A-F]+;/i) || ($raw_link =~ m/\&\#[0-9]+;/i)) {
307 if (!-e $raw_link) {
308 $raw_link = &unicode::url_decode($raw_link,1);
309 }
310 }
311
312 return $raw_link;
313}
314
315sub read_into_doc_obj
316{
317 my $self = shift (@_);
318 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
319
320 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
321
322 # Lookup content_encoding worked out in file_block pass for this file
323 # Store it under the local name 'content_encoding' so it's nice and
324 # easy to access
325 $self->{'content_encoding'} = $self->{'store_content_encoding'}->{$filename_full_path};
326
327 # get the input file
328 my $input_filename = $file;
329 my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
330 $suffix = lc($suffix);
331 my $tidy_filename;
332 if (($self->{'use_realistic_book'}) || ($self->{'old_style_HDL'}))
333 {
334 # because the document has to be sectionalised, set the description tags
335 $self->{'description_tags'} = 1;
336
337 # set the file to be tidied
338 $input_filename = &util::filename_cat($base_dir,$file) if $base_dir =~ m/\w/;
339
340 # get the tidied file
341 #my $tidy_filename = $self->tmp_tidy_file($input_filename);
342 $tidy_filename = $self->convert_tidy_or_oldHDL_file($input_filename);
343
344 # derive tmp filename from input filename
345 my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($tidy_filename, "\\.[^\\.]+\$");
346
347 # set the new input file and base_dir to be from the tidied file
348 $file = "$tailname$suffix";
349 $base_dir = $dirname;
350 }
351
352 # call the parent read_into_doc_obj
353 my ($process_status,$doc_obj) = $self->SUPER::read_into_doc_obj($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli);
354 if (($self->{'use_realistic_book'}) || ($self->{'old_style_HDL'}))
355 {
356 # now we need to reset the filenames in the doc obj so that the converted filenames are not used
357 my $collect_file = &util::filename_within_collection($filename_full_path);
358 $doc_obj->set_source_filename ($collect_file, $self->{'file_rename_method'});
359 ## set_source_filename does not set the doc_obj source_path which is used in archives dbs for incremental
360 # build. So set it manually.
361 $doc_obj->set_source_path($filename_full_path);
362 my $collect_conv_file = &util::filename_within_collection($tidy_filename);
363 $doc_obj->set_converted_filename($collect_conv_file);
364
365 my $plugin_filename_encoding = $self->{'filename_encoding'};
366 my $filename_encoding = $self->deduce_filename_encoding($file,$metadata,$plugin_filename_encoding);
367 $self->set_Source_metadata($doc_obj, $filename_full_path, $filename_encoding);
368 }
369
370 delete $self->{'store_content_encoding'}->{$filename_full_path};
371 $self->{'content_encoding'} = undef;
372
373 return ($process_status,$doc_obj);
374}
375
376# do plugin specific processing of doc_obj
377sub process {
378 my $self = shift (@_);
379 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
380 my $outhandle = $self->{'outhandle'};
381
382 if ($ENV{'GSDLOS'} =~ m/^windows/i) {
383 # this makes life so much easier... perl can cope with unix-style '/'s.
384 $base_dir =~ s@(\\)+@/@g;
385 $file =~ s@(\\)+@/@g;
386 }
387
388 my $filename = &util::filename_cat($base_dir,$file);
389 my $upgraded_base_dir = &util::upgrade_if_dos_filename($base_dir);
390 my $upgraded_filename = &util::upgrade_if_dos_filename($filename);
391
392 if ($ENV{'GSDLOS'} =~ m/^windows/i) {
393 # And again
394 $upgraded_base_dir =~ s@(\\)+@/@g;
395 $upgraded_filename =~ s@(\\)+@/@g;
396
397 # Need to make sure there is a '/' on the end of upgraded_base_dir
398 if (($upgraded_base_dir ne "") && ($upgraded_base_dir !~ m/\/$/)) {
399 $upgraded_base_dir .= "/";
400 }
401 }
402 my $upgraded_file = &util::filename_within_directory($upgraded_filename,$upgraded_base_dir);
403
404 # reset per-doc stuff...
405 $self->{'aux_files'} = {};
406 $self->{'dir_num'} = 0;
407 $self->{'file_num'} = 0;
408
409 # process an HTML file where sections are divided by heading tags (H1, H2 ...)
410 # you can also include metadata in the format (X can be any number)
411 # <hX>Title<!--gsdl-metadata
412 # <Metadata name="name1">value1</Metadata>
413 # ...
414 # <Metadata name="nameN">valueN</Metadata>
415 #--></hX>
416 if ($self->{'sectionalise_using_h_tags'}) {
417 # description_tags should always be activated because we convert headings to description tags
418 $self->{'description_tags'} = 1;
419
420 my $arrSections = [];
421 $$textref =~ s/<h([0-9]+)[^>]*>(.*?)<\/h[0-9]+>/$self->process_heading($1, $2, $arrSections, $upgraded_file)/isge;
422
423 if (scalar(@$arrSections)) {
424 my $strMetadata = $self->update_section_data($arrSections, -1);
425 if (length($strMetadata)) {
426 $strMetadata = '<!--' . $strMetadata . "\n-->\n</body>";
427 $$textref =~ s/<\/body>/$strMetadata/ig;
428 }
429 }
430 }
431
432 my $cursection = $doc_obj->get_top_section();
433
434 $self->extract_metadata ($textref, $metadata, $doc_obj, $cursection)
435 unless $self->{'no_metadata'} || $self->{'description_tags'};
436
437 # Store URL for page as metadata - this can be used for an
438 # altavista style search interface. The URL won't be valid
439 # unless the file structure contains the domain name (i.e.
440 # like when w3mir is used to download a website).
441
442 # URL metadata (even invalid URLs) is used to support internal
443 # links, so even if 'file_is_url' is off, still need to store info
444
445 my ($tailname,$dirname) = &File::Basename::fileparse($upgraded_file);
446
447# my $utf8_file = $self->filename_to_utf8_metadata($file);
448# $utf8_file =~ s/&\#095;/_/g;
449# variable below used to be utf8_file
450
451 my $url_encoded_file = &unicode::raw_filename_to_url_encoded($tailname);
452 my $utf8_url_encoded_file = &unicode::raw_filename_to_utf8_url_encoded($tailname);
453
454 my $web_url = "http://";
455 my $utf8_web_url = "http://";
456 if(defined $dirname) { # local directory
457 # Check for "ftp" in the domain name of the directory
458 # structure to determine if this URL should be a ftp:// URL
459 # This check is not infallible, but better than omitting the
460 # check, which would cause all files downloaded from ftp sites
461 # via mirroring with wget to have potentially erroneous http:// URLs
462 # assigned in their metadata
463 if ($dirname =~ /^[^\/]*ftp/i)
464 {
465 $web_url = "ftp://";
466 $utf8_web_url = "ftp://";
467 }
468 $dirname = $self->eval_dir_dots($dirname);
469 $dirname .= &util::get_dirsep() if $dirname ne ""; # if there's a directory, it should end with "/"
470
471 $web_url = $web_url.$dirname.$url_encoded_file;
472 $utf8_web_url = $utf8_web_url.$dirname.$utf8_url_encoded_file;
473 } else {
474 $web_url = $web_url.$url_encoded_file;
475 $utf8_web_url = $utf8_web_url.$utf8_url_encoded_file;
476 }
477 $web_url =~ s/\\/\//g;
478 $utf8_web_url =~ s/\\/\//g;
479
480 if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
481 print STDERR "*******DEBUG: upgraded_file: $upgraded_file\n";
482 print STDERR "*******DEBUG: adding URL metadata: $utf8_url_encoded_file\n";
483 }
484
485
486 $doc_obj->add_utf8_metadata($cursection, "URL", $web_url);
487 $doc_obj->add_utf8_metadata($cursection, "UTF8URL", $utf8_web_url);
488
489 if ($self->{'file_is_url'}) {
490 $doc_obj->add_metadata($cursection, "weblink", "<a href=\"$web_url\">");
491 $doc_obj->add_metadata($cursection, "webicon", "_iconworld_");
492 $doc_obj->add_metadata($cursection, "/weblink", "</a>");
493 }
494
495 if ($self->{'description_tags'}) {
496 # remove the html header - note that doing this here means any
497 # sections defined within the header will be lost (so all <Section>
498 # tags must appear within the body of the HTML)
499 my ($head_keep) = ($$textref =~ m/^(.*?)<body[^>]*>/is);
500
501 $$textref =~ s/^.*?<body[^>]*>//is;
502 $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
503
504 my $opencom = '(?:<!--|&lt;!(?:&mdash;|&#151;|--))';
505 my $closecom = '(?:-->|(?:&mdash;|&#151;|--)&gt;)';
506
507 my $lt = '(?:<|&lt;)';
508 my $gt = '(?:>|&gt;)';
509 my $quot = '(?:"|&quot;|&rdquo;|&ldquo;)';
510
511 my $dont_strip = '';
512 if ($self->{'no_strip_metadata_html'}) {
513 ($dont_strip = $self->{'no_strip_metadata_html'}) =~ s{,}{|}g;
514 }
515
516 my $found_something = 0; my $top = 1;
517 while ($$textref =~ s/^(.*?)$opencom(.*?)$closecom//s) {
518 my $text = $1;
519 my $comment = $2;
520 if (defined $text) {
521 # text before a comment - note that getting to here
522 # doesn't necessarily mean there are Section tags in
523 # the document
524 $self->process_section(\$text, $upgraded_base_dir, $upgraded_file, $doc_obj, $cursection);
525 }
526 while ($comment =~ s/$lt(.*?)$gt//s) {
527 my $tag = $1;
528 if ($tag eq "Section") {
529 $found_something = 1;
530 $cursection = $doc_obj->insert_section($doc_obj->get_end_child($cursection)) unless $top;
531 $top = 0;
532 } elsif ($tag eq "/Section") {
533 $found_something = 1;
534 $cursection = $doc_obj->get_parent_section ($cursection);
535 } elsif ($tag =~ m/^Metadata name=$quot(.*?)$quot/s) {
536 my $metaname = $1;
537 my $accumulate = $tag =~ m/mode=${quot}accumulate${quot}/ ? 1 : 0;
538 $comment =~ s/^(.*?)$lt\/Metadata$gt//s;
539 my $metavalue = $1;
540 $metavalue =~ s/^\s+//;
541 $metavalue =~ s/\s+$//;
542 # assume that no metadata value intentionally includes
543 # carriage returns or HTML tags (if they're there they
544 # were probably introduced when converting to HTML from
545 # some other format).
546 # actually some people want to have html tags in their
547 # metadata.
548 $metavalue =~ s/[\cJ\cM]/ /sg;
549 $metavalue =~ s/<[^>]+>//sg
550 unless $dont_strip && ($dont_strip eq 'all' || $metaname =~ m/^($dont_strip)$/);
551 $metavalue =~ s/\s+/ /sg;
552 if ($metaname =~ /\./) { # has a namespace
553 $metaname = "ex.$metaname";
554 }
555 if ($accumulate) {
556 $doc_obj->add_utf8_metadata($cursection, $metaname, $metavalue);
557 } else {
558 $doc_obj->set_utf8_metadata_element($cursection, $metaname, $metavalue);
559 }
560 } elsif ($tag eq "Description" || $tag eq "/Description") {
561 # do nothing with containing Description tags
562 } else {
563 # simple HTML tag (probably created by the conversion
564 # to HTML from some other format) - we'll ignore it and
565 # hope for the best ;-)
566 }
567 }
568 }
569 if ($cursection ne "") {
570 print $outhandle "HTMLPlugin: WARNING: $upgraded_file contains unmatched <Section></Section> tags\n";
571 }
572
573 $$textref =~ s/^.*?<body[^>]*>//is;
574 $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
575 if ($$textref =~ m/\S/) {
576 if (!$found_something) {
577 if ($self->{'verbosity'} > 2) {
578 print $outhandle "HTMLPlugin: WARNING: $upgraded_file appears to contain no Section tags so\n";
579 print $outhandle " will be processed as a single section document\n";
580 }
581
582 # go ahead and process single-section document
583 $self->process_section($textref, $upgraded_base_dir, $upgraded_file, $doc_obj, $cursection);
584
585 # if document contains no Section tags we'll go ahead
586 # and extract metadata (this won't have been done
587 # above as the -description_tags option prevents it)
588 my $complete_text = $head_keep.$doc_obj->get_text($cursection);
589 $self->extract_metadata (\$complete_text, $metadata, $doc_obj, $cursection)
590 unless $self->{'no_metadata'};
591
592 } else {
593 print $outhandle "HTMLPlugin: WARNING: $upgraded_file contains the following text outside\n";
594 print $outhandle " of the final closing </Section> tag. This text will\n";
595 print $outhandle " be ignored.";
596
597 my ($text);
598 if (length($$textref) > 30) {
599 $text = substr($$textref, 0, 30) . "...";
600 } else {
601 $text = $$textref;
602 }
603 $text =~ s/\n/ /isg;
604 print $outhandle " ($text)\n";
605 }
606 } elsif (!$found_something) {
607
608 if ($self->{'verbosity'} > 2) {
609 # may get to here if document contained no valid Section
610 # tags but did contain some comments. The text will have
611 # been processed already but we should print the warning
612 # as above and extract metadata
613 print $outhandle "HTMLPlugin: WARNING: $upgraded_file appears to contain no Section tags and\n";
614 print $outhandle " is blank or empty. Metadata will be assigned if present.\n";
615 }
616
617 my $complete_text = $head_keep.$doc_obj->get_text($cursection);
618 $self->extract_metadata (\$complete_text, $metadata, $doc_obj, $cursection)
619 unless $self->{'no_metadata'};
620 }
621
622 } else {
623
624 # remove header and footer
625 if (!$self->{'keep_head'} || $self->{'description_tags'}) {
626 $$textref =~ s/^.*?<body[^>]*>//is;
627 $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
628 }
629
630
631 # single section document
632 $self->process_section($textref, $upgraded_base_dir, $upgraded_file, $doc_obj, $cursection);
633 }
634
635 return 1;
636}
637
638
639sub process_heading
640{
641 my ($self, $nHeadNo, $strHeadingText, $arrSections, $file) = @_;
642 $strHeadingText = '' if (!defined($strHeadingText));
643
644 my $strMetadata = $self->update_section_data($arrSections, int($nHeadNo));
645
646 my $strSecMetadata = '';
647 while ($strHeadingText =~ s/<!--gsdl-metadata(.*?)-->//is)
648 {
649 $strSecMetadata .= $1;
650 }
651
652 $strHeadingText =~ s/^\s+//g;
653 $strHeadingText =~ s/\s+$//g;
654 $strSecMetadata =~ s/^\s+//g;
655 $strSecMetadata =~ s/\s+$//g;
656
657 $strMetadata .= "\n<Section>\n\t<Description>\n\t\t<Metadata name=\"Title\">" . $strHeadingText . "</Metadata>\n";
658
659 if (length($strSecMetadata)) {
660 $strMetadata .= "\t\t" . $strSecMetadata . "\n";
661 }
662
663 $strMetadata .= "\t</Description>\n";
664
665 return "<!--" . $strMetadata . "-->";
666}
667
668
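# update_section_data maintains $arrSections as a stack of currently open
# heading levels. Given the level of the heading just encountered (or -1 to
# close everything), it pops every open level that is >= the new level,
# appending a matching "</Section>" for each, pushes the new level, and
# returns the accumulated closing markup.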
669sub update_section_data
670{
671 my ($self, $arrSections, $nCurTocNo) = @_;
672 my ($strBuffer, $nLast, $nSections) = ('', 0, scalar(@$arrSections));
673
674 if ($nSections == 0) {
675 push @$arrSections, $nCurTocNo;
676 return $strBuffer;
677 }
678 $nLast = $arrSections->[$nSections - 1];
679 if ($nCurTocNo > $nLast) {
680 push @$arrSections, $nCurTocNo;
681 return $strBuffer;
682 }
683 for(my $i = $nSections - 1; $i >= 0; $i--) {
684 if ($nCurTocNo <= $arrSections->[$i]) {
685 $strBuffer .= "\n</Section>";
686 pop @$arrSections;
687 }
688 }
689 push @$arrSections, $nCurTocNo;
690 return $strBuffer;
691}
692
693
694# note that process_section may be called multiple times for a single
695# section (relying on the fact that add_utf8_text appends the text to any
696# that may exist already).
697sub process_section {
698 my $self = shift (@_);
699 my ($textref, $base_dir, $file, $doc_obj, $cursection) = @_;
700 # trap links
701 if (!$self->{'nolinks'}) {
702 # usemap="./#index" not handled correctly => change to "#index"
703## $$textref =~ s/(<img[^>]*?usemap\s*=\s*[\"\']?)([^\"\'>\s]+)([\"\']?[^>]*>)/
704
705## my $opencom = '(?:<!--|&lt;!(?:&mdash;|&#151;|--))';
706## my $closecom = '(?:-->|(?:&mdash;|&#151;|--)&gt;)';
707
708 $$textref =~ s/(<img[^>]*?usemap\s*=\s*)((?:[\"][^\"]+[\"])|(?:[\'][^\']+[\'])|(?:[^\s\/>]+))([^>]*>)/
709 $self->replace_usemap_links($1, $2, $3)/isge;
710
711 $$textref =~ s/(<(?:a|area|frame|link|script)\s+[^>]*?\s*(?:href|src)\s*=\s*)((?:[\"][^\"]+[\"])|(?:[\'][^\']+[\'])|(?:[^\s\/>]+))([^>]*>)/
712 $self->replace_href_links ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
713
714## $$textref =~ s/($opencom.*?)?+(<(?:a|area|frame|link|script)\s+[^>]*?\s*(?:href|src)\s*=\s*)((?:[\"][^\"]+[\"])|(?:[\'][^\']+[\'])|(?:[^\s\/>]+))([^>]*>)(.*?$closecom)?+/
715# $self->replace_href_links ($1, $2, $3, $4, $5, $base_dir, $file, $doc_obj, $cursection)/isge;
716 }
717
718 # trap images
719
720 # Previously, by default, HTMLPlugin would embed <img> tags inside anchor tags
721 # i.e. <a href="image"><img src="image"></a> in order to overcome a problem that
722 # turned regular text succeeding images into links. That is, by embedding <img>s
723 # inside <a href=""></a>, the text following images was no longer misbehaving.
724 # However, there would be many occasions whereby images were not meant to link
725 # to their source images but where the images would link to another web page.
726 # To allow this, the no_image_links option was introduced: it would prevent
727 # the behaviour of embedding images into links that referenced the source images.
728
729 # Somewhere along the line, the problem of normal text turning into links when
730 # such text followed images which were not embedded in <a href=""></a> ceased
731 # to occur. This is why the following lines have been commented out (as well as
732 # two lines in replace_images). They appear to no longer apply.
733
734 # If at any time, there is a need for having images embedded in <a> anchor tags,
735 # then it might be better to turn that into an HTMLPlugin option rather than make
736 # it the default behaviour. Also, eventually, no_image_links needs to become
737 # a deprecated option for HTMLPlugin as it has now become the default behaviour.
738
739 #if(!$self->{'no_image_links'}){
740 $$textref =~ s/(<(?:img|embed|table|tr|td)[^>]*?(?:src|background)\s*=\s*)((?:[\"][^\"]+[\"])|(?:[\'][^\']+[\'])|(?:[^\s\/>]+))([^>]*>)/
741 $self->replace_images ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
742 #}
743
744 # add text to document object
745 # turn \ into \\ so that the rest of greenstone doesn't think there
746 # is an escape code following. (Macro parsing loses them...)
747 $$textref =~ s/\\/\\\\/go;
748
749 $doc_obj->add_utf8_text($cursection, $$textref);
750}
751
752sub replace_images {
753 my $self = shift (@_);
754 my ($front, $link, $back, $base_dir,
755 $file, $doc_obj, $section) = @_;
756
757 # remove quotes from link at start and end if necessary
758 if ($link=~/^[\"\']/) {
759 $link=~s/^[\"\']//;
760 $link=~s/[\"\']$//;
761 $front.='"';
762 $back="\"$back";
763 }
764
765 $link =~ s/\n/ /g;
766
767 # Hack to overcome Windows wv 0.7.1 bug that causes embedded images to be broken
768 # If the Word file path has spaces in it, wv messes up and you end up with
769 # absolute paths for the images, and without the "file://" prefix
770 # So check for this special case and massage the data to be correct
771 if ($ENV{'GSDLOS'} =~ m/^windows/i && $self->{'plugin_type'} eq "WordPlug" && $link =~ m/^[A-Za-z]\:\\/) {
772 $link =~ s/^.*\\([^\\]+)$/$1/;
773 }
774
775 my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
776
777 my $img_file = $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section);
778
779# print STDERR "**** link = $link\n**** href = $href\n**** img_file = $img_file, rl = $rl\n";
780
781 my $anchor_name = $img_file;
782 #$anchor_name =~ s/^.*\///;
783 #$anchor_name = "<a name=\"$anchor_name\" ></a>";
784
785 my $image_link = $front . $img_file .$back;
786 return $image_link;
787
788 # The reasons for why the following two lines are no longer necessary can be
789 # found in subroutine process_section
790 #my $anchor_link = "<a href=\"$img_file\" >".$image_link."</a>";
791 #return $anchor_link;
792
793 #return $front . $img_file . $back . $anchor_name;
794}
795
796sub replace_href_links {
797 my $self = shift (@_);
798 my ($front, $link, $back, $base_dir, $file, $doc_obj, $section) = @_;
799
800 # remove quotes from link at start and end if necessary
801 if ($link=~/^[\"\']/) {
802 $link=~s/^[\"\']//;
803 $link=~s/[\"\']$//;
804 $front.='"';
805 $back="\"$back";
806 }
807
808 # attempt to sort out targets - frames are not handled
809 # well in this plugin and some cases will screw things
810 # up - e.g. the _parent target (so we'll just remove
811 # them all ;-)
812 $front =~ s/(target=\"?)_top(\"?)/$1_gsdltop_$2/is;
813 $back =~ s/(target=\"?)_top(\"?)/$1_gsdltop_$2/is;
814 $front =~ s/target=\"?_parent\"?//is;
815 $back =~ s/target=\"?_parent\"?//is;
816
817 return $front . $link . $back if $link =~ m/^\#/s;
818 $link =~ s/\n/ /g;
819
820 # Find file referred to by $link on file system
821 # This is more complicated than it sounds when character encodings
822 # are taken into account
823 my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
824
825 # href may use '\'s where '/'s should be on Windows
826 $href =~ s/\\/\//g;
827 my ($filename) = $href =~ m/^(?:.*?):(?:\/\/)?(.*)/;
828
829
830 ##### leave all these links alone (they won't be picked up by intermediate
831 ##### pages). I think that's safest when dealing with frames, targets etc.
832 ##### (at least until I think of a better way to do it). Problems occur with
833 ##### mailto links from within small frames, the intermediate page is displayed
834 ##### within that frame and can't be seen. There is still potential for this to
835 ##### happen even with html pages - the solution seems to be to somehow tell
836 ##### the browser from the server side to display the page being sent (i.e.
837 ##### the intermediate page) in the top level window - I'm not sure if that's
838 ##### possible - the following line should probably be deleted if that can be done
839 return $front . $link . $back if $href =~ m/^(mailto|news|gopher|nntp|telnet|javascript):/is;
840
841 if (($rl == 0) || ($filename =~ m/$self->{'process_exp'}/) ||
842 ($href =~ m/\/$/) || ($href =~ m/^(mailto|news|gopher|nntp|telnet|javascript):/i)) {
843
844 if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
845
846 # Don't do any encoding for now, as it is not clear what
847 # the right thing to do is to support filename
848 # encoding on Windows when filenames are not UTF-16
849 #
850 }
851 else {
852 # => Unix-based system
853
854 # If web page didn't give encoding, then default to utf8
855 my $content_encoding= $self->{'content_encoding'} || "utf8";
856
857 if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
858 print STDERR "**** Encoding with '$content_encoding', href: $href\n";
859 }
860
861 # on Darwin, the unicode filenames are stored on the file
862 # system in decomposed form, so any href link (including when
863 # URL-encoded) should refer to the decomposed name of the file
864 if ($ENV{'GSDLOS'} =~ /^darwin$/i) {
865 $href = normalize('D', $href); # Normalization Form D (decomposition)
866 }
867
868 $href = encode($content_encoding,$href);
869 }
870
871 $href = &unicode::raw_filename_to_utf8_url_encoded($href);
872 $href = &unicode::filename_to_url($href);
873
874 &ghtml::urlsafe ($href);
875
876 if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
877 print STDERR "******DEBUG: href=$href\n";
878 }
879
880
881 return $front . "_httpextlink_&amp;rl=" . $rl . "&amp;href=" . $href . $hash_part . $back;
882 } else {
883 # link is to some other type of file (e.g., an image) so we'll
884 # need to associate that file
885 return $front . $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section) . $back;
886 }
887}
888
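# add_file: works out the real on-disk filename behind $href (undoing URL
# encoding and re-encoding for the local file system where necessary),
# associates that file with the current document so it is copied into the
# collection, and returns the replacement link text to embed in the HTML
# (an _httpdocimg_/... path, or an _httpextlink_... link for files that are
# not associated).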
889sub add_file {
890 my $self = shift (@_);
891 my ($href, $rl, $hash_part, $base_dir, $doc_obj, $section) = @_;
892 my ($newname);
893
894 my $filename = $href;
895 if ($base_dir eq "") {
896 if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
897 # remove http://
898 $filename =~ s/^[^:]*:\/\///;
899 }
900 else {
901 # remove http:/ thereby leaving one slash at the start as
902 # part of full pathname
903 $filename =~ s/^[^:]*:\///;
904 }
905 }
906 else {
907 # remove http://
908 $filename =~ s/^[^:]*:\/\///;
909 }
910
911 $filename = &util::filename_cat($base_dir, $filename);
912
913 if (($self->{'use_realistic_book'}) || ($self->{'old_style_HDL'})) {
914 # we are processing a tidytmp file - want paths to be in import
915 $filename =~ s/([\\\/])tidytmp([\\\/])/$1import$2/;
916 }
917
918 # Replace %XX's in URL with decoded value if required. Note that the
919 # filename may include the %XX in some situations. If the *original*
920 # file's name was in URL encoding, the following method will not decode
921 # it.
922 my $unicode_filename = $filename;
923 my $opt_decode_unicode_filename = $self->opt_url_decode($unicode_filename);
924
925 # wvWare can generate <img src="StrangeNoGraphicData"> tags, but (it seems)
926 # with no accompanying file
927 if ($opt_decode_unicode_filename =~ m/StrangeNoGraphicData$/) { return ""; }
928
929 my $content_encoding= $self->{'content_encoding'} || "utf8";
930
931 if ($ENV{'GSDLOS'} =~ /^(linux|solaris)$/i) {
932 # The filenames that come through the HTML file have been decoded
933 # into Unicode aware Perl strings. Need to convert them back
934 # to their initial raw-byte encoding to match the file that
935 # exists on the file system
936 $filename = encode($content_encoding, $opt_decode_unicode_filename);
937 }
938 elsif ($ENV{'GSDLOS'} =~ /^darwin$/i) {
939 # HFS+ is UTF8 with decomposition
940 $filename = encode($content_encoding, $opt_decode_unicode_filename);
941 $filename = normalize('D', $filename); # Normalization Form D (decomposition)
942
943 }
944 elsif ($ENV{'GSDLOS'} =~ /^windows$/i) {
945 my $long_filename = Win32::GetLongPathName($opt_decode_unicode_filename);
946
947 if (defined $long_filename) {
948 my $short_filename = Win32::GetShortPathName($long_filename);
949 $filename = $short_filename;
950 }
951# else {
952# print STDERR "***** failed to map href to real file:\n";
953# print STDERR "****** $href -> $opt_decode_unicode_filename\n";
954# }
955 }
956 else {
957 my $outhandle = $self->{'outhandle'};
958 print $outhandle "Warning: Unrecognized operating system ", $ENV{'GSDLOS'}, "\n";
959 print $outhandle " in file system encoding of href: $href\n";
960 print $outhandle " No character encoding done.\n";
961 }
962
963
964 # some special processing if the intended filename was converted to utf8, but
965 # the actual file still needs to be renamed
966 if (!&util::fd_exists($filename)) {
967 # try the original filename stored in map
968 if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
969 print STDERR "******!! orig filename did not exist: $filename\n";
970 }
971
972## print STDERR "**** trying to look up unicode_filename: $unicode_filename\n";
973
974 my $original_filename = $self->{'unicode_to_original_filename'}->{$unicode_filename};
975
976 if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
977 print STDERR "****** From lookup unicode_filename, now trying for: $original_filename\n";
978 }
979
980 if (defined $original_filename && -e $original_filename) {
981 if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
982 print STDERR "****** Found match!\n";
983 }
984 $filename = $original_filename;
985 }
986 }
987
988 my ($ext) = $filename =~ m/(\.[^\.]*)$/;
989
990 if ($rl == 0) {
991 if ((!defined $ext) || ($ext !~ m/$self->{'assoc_files'}/)) {
992 return "_httpextlink_&amp;rl=0&amp;el=prompt&amp;href=" . $href . $hash_part;
993 }
994 else {
995 return "_httpextlink_&amp;rl=0&amp;el=direct&amp;href=" . $href . $hash_part;
996 }
997 }
998
999 if ((!defined $ext) || ($ext !~ m/$self->{'assoc_files'}/)) {
1000 return "_httpextlink_&amp;rl=" . $rl . "&amp;href=" . $href . $hash_part;
1001 }
1002 # add the original image file as a source file
1003 if (!$self->{'processing_tmp_files'} ) {
1004 $doc_obj->associate_source_file($filename);
1005 }
1006 if ($self->{'rename_assoc_files'}) {
1007 if (defined $self->{'aux_files'}->{$href}) {
1008 $newname = $self->{'aux_files'}->{$href}->{'dir_num'} . "/" .
1009 $self->{'aux_files'}->{$href}->{'file_num'} . $ext;
1010 } else {
1011 $newname = $self->{'dir_num'} . "/" . $self->{'file_num'} . $ext;
1012 $self->{'aux_files'}->{$href} = {'dir_num' => $self->{'dir_num'}, 'file_num' => $self->{'file_num'}};
1013 $self->inc_filecount ();
1014 }
1015 $doc_obj->associate_file($filename, $newname, undef, $section);
1016 return "_httpdocimg_/$newname";
1017 } else {
1018 if(&unicode::is_url_encoded($unicode_filename)) {
1019 # use the possibly-decoded filename instead to avoid double URL encoding
1020 ($newname) = $filename =~ m/([^\/\\]*)$/;
1021 } else {
1022 ($newname) = $unicode_filename =~ m/([^\/\\]*)$/;
1023 }
1024
1025 # Make sure this name uses only ASCII characters.
1026 # We use either base64 or URL encoding, as these preserve original encoding
1027 $newname = &util::rename_file($newname, $self->{'file_rename_method'});
1028
1029### print STDERR "***** associating $filename (raw-byte/utf8)-> $newname\n";
1030 $doc_obj->associate_file($filename, $newname, undef, $section);
1031
1032 # Since the generated image will be URL-encoded to avoid file-system/browser mess-ups
1033 # of filenames, URL-encode the additional percent signs of the URL-encoded filename
1034 my $newname_url = $newname;
1035 $newname_url = &unicode::filename_to_url($newname_url);
1036 return "_httpdocimg_/$newname_url";
1037 }
1038}
1039
1040
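# format_link: given a link as it appeared in the HTML, the collection's
# base_dir and the current file, returns ($href, $hash_part, $rl), where
# $hash_part is any trailing "#anchor" and $rl (roughly, "relative link")
# is 1 when the link is treated as internal to the collection and 0 when it
# is left as an external or unsupported link (mailto:, news:, etc.).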
1041sub format_link {
1042 my $self = shift (@_);
1043 my ($link, $base_dir, $file) = @_;
1044
1045 # strip off hash part, e.g. #foo, but watch out for any entities, e.g. &#x3B1;
1046 my ($before_hash, $hash_part) = $link =~ m/^(.*?[^&])(\#.*)?$/;
1047
1048 $hash_part = "" if !defined $hash_part;
1049 if (!defined $before_hash || $before_hash !~ m/[\w\.\/]/) {
1050 my $outhandle = $self->{'outhandle'};
1051 print $outhandle "HTMLPlugin: ERROR - badly formatted tag ignored ($link)\n"
1052 if $self->{'verbosity'};
1053 return ($link, "", 0);
1054 }
1055
1056 if ($before_hash =~ s@^((?:http|https|ftp|file|mms)://)@@i) {
1057 my $type = $1;
1058
1059 if ($link =~ m/^(http|ftp):/i) {
1060 # Turn url (using /) into file name (possibly using \ on windows)
1061 my @http_dir_split = split('/', $before_hash);
1062 $before_hash = &util::filename_cat(@http_dir_split);
1063 }
1064
1065 $before_hash = $self->eval_dir_dots($before_hash);
1066
1067 my $linkfilename = &util::filename_cat ($base_dir, $before_hash);
1068
1069 my $rl = 0;
1070 $rl = 1 if (-e $linkfilename);
1071
1072 # make sure there's a slash on the end if it's a directory
1073 if ($before_hash !~ m/\/$/) {
1074 $before_hash .= "/" if (-d $linkfilename);
1075 }
1076 return ($type . $before_hash, $hash_part, $rl);
1077
1078 } elsif ($link !~ m/^(mailto|news|gopher|nntp|telnet|javascript):/i && $link !~ m/^\//) {
1079
1080 if ($before_hash =~ s@^/@@ || $before_hash =~ m/\\/) {
1081
1082 # the first directory will be the domain name if file_is_url
1083 # is set to generate archives; otherwise we'll assume all files are
1084 # from the same site and base_dir is the root
1085
1086 if ($self->{'file_is_url'}) {
1087 my @dirs = split /[\/\\]/, $file;
1088 my $domname = shift (@dirs);
1089 $before_hash = &util::filename_cat($domname, $before_hash);
1090 $before_hash =~ s@\\@/@g; # for windows
1091 }
1092 else
1093 {
1094 # see if link shares directory with source document
1095 # => turn into relative link if this is so!
1096
1097 if ($ENV{'GSDLOS'} =~ m/^windows/i) {
1098 # too difficult doing a pattern match with embedded '\'s...
1099 my $win_before_hash=$before_hash;
1100 $win_before_hash =~ s@(\\)+@/@g;
1101 # $base_dir is already similarly "converted" on windows.
1102 if ($win_before_hash =~ s@^$base_dir/@@o) {
1103 # if this is true, we removed a prefix
1104 $before_hash=$win_before_hash;
1105 }
1106 }
1107 else {
1108 # before_hash has lost leading slash by this point,
1109 # -> add back in prior to substitution with $base_dir
1110 $before_hash = "/$before_hash";
1111
1112 $before_hash = &util::filename_cat("",$before_hash);
1113 $before_hash =~ s@^$base_dir/@@;
1114 }
1115 }
1116 } else {
1117 # Turn relative file path into full path
1118 my $dirname = &File::Basename::dirname($file);
1119 $before_hash = &util::filename_cat($dirname, $before_hash);
1120 $before_hash = $self->eval_dir_dots($before_hash);
1121 }
1122
1123 my $linkfilename = &util::filename_cat ($base_dir, $before_hash);
1124
1125
1126# print STDERR "**** linkfilename = $linkfilename\n";
1127# if (!&util::fd_exists($linkfilename)) {
1128# print STDERR "***** Warning: Could not find $linkfilename\n";
1129# }
1130
1131
1132 # make sure there's a slash on the end if it's a directory
1133 if ($before_hash !~ m/\/$/) {
1134 $before_hash .= "/" if (-d $linkfilename);
1135 }
1136
1137# print STDERR "*** returning: $before_hash\n";
1138
1139 return ("http://" . $before_hash, $hash_part, 1);
1140 } else {
1141 # mailto, news, nntp, telnet, javascript or gopher link
1142 return ($before_hash, "", 0);
1143 }
1144}
1145
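# extract_first_NNNN_characters: for each size N listed in the plugin's 'first'
# option (comma-separated), strips markup and scripts from the page, truncates
# the remaining text to N characters (trimming the trailing partial word to an
# ellipsis), and stores the result as "FirstN" metadata on the section.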
1146sub extract_first_NNNN_characters {
1147 my $self = shift (@_);
1148 my ($textref, $doc_obj, $thissection) = @_;
1149
1150 foreach my $size (split /,/, $self->{'first'}) {
1151 my $tmptext = $$textref;
1152 # skip to the body
1153 $tmptext =~ s/.*<body[^>]*>//i;
1154 # remove javascript
1155 $tmptext =~ s@<script.*?</script>@ @sig;
1156 $tmptext =~ s/<[^>]*>/ /g;
1157 $tmptext =~ s/&nbsp;/ /g;
1158 $tmptext =~ s/^\s+//;
1159 $tmptext =~ s/\s+$//;
1160 $tmptext =~ s/\s+/ /gs;
1161 $tmptext = &unicode::substr ($tmptext, 0, $size);
1162 $tmptext =~ s/\s\S*$/&#8230;/; # adds an ellipsis (...)
1163 $doc_obj->add_utf8_metadata ($thissection, "First$size", $tmptext);
1164 }
1165}
1166
1167
1168sub extract_metadata {
1169 my $self = shift (@_);
1170 my ($textref, $metadata, $doc_obj, $section) = @_;
1171 my $outhandle = $self->{'outhandle'};
1172 # if we don't want metadata, we may as well not be here ...
1173 return if (!defined $self->{'metadata_fields'});
1174
1175 my $separator = $self->{'metadata_field_separator'};
1176 if ($separator eq "") {
1177 undef $separator;
1178 }
1179
1180 # metadata fields to extract/save. 'key' is the (lowercase) name of the
1181 # html meta, 'value' is the metadata name for greenstone to use
1182 my %find_fields = ();
1183
1184 my %creator_fields = (); # short-cut for lookups
1185
1186
1187 foreach my $field (split /,/, $self->{'metadata_fields'}) {
1188 $field =~ s/^\s+//; # remove leading whitespace
1189 $field =~ s/\s+$//; # remove trailing whitespace
1190
1191 # support tag<tagname>
1192 if ($field =~ m/^(.*?)\s*<(.*?)>$/) {
1193 # "$2" is the user's preferred gs metadata name
1194 $find_fields{lc($1)}=$2; # lc = lowercase
1195 } else { # no <tagname> for mapping
1196 # "$field" is the user's preferred gs metadata name
1197 $find_fields{lc($field)}=$field; # lc = lowercase
1198 }
1199 }
1200
1201 if (defined $self->{'hunt_creator_metadata'} &&
1202 $self->{'hunt_creator_metadata'} == 1 ) {
1203 my @extra_fields =
1204 (
1205 'author',
1206 'author.email',
1207 'creator',
1208 'dc.creator',
1209 'dc.creator.corporatename',
1210 );
1211
1212 # add the creator_metadata fields to search for
1213 foreach my $field (@extra_fields) {
1214 $creator_fields{$field}=0; # add to lookup hash
1215 }
1216 }
1217
1218
1219 # find the header in the html file, which has the meta tags
1220 $$textref =~ m@<head>(.*?)</head>@si;
1221
1222 my $html_header=$1;
1223
1224 # go through every <meta... tag defined in the html and see if it is
1225 # one of the tags we want to match.
1226
1227 # special case for title - we want to remember if its been found
1228 my $found_title = 0;
1229 # this assumes that ">" won't appear. (I don't think it's allowed to...)
1230 $html_header =~ m/^/; # match the start of the string, for \G assertion
1231
1232 while ($html_header =~ m/\G.*?<meta(.*?)>/sig) {
1233 my $metatag=$1;
1234 my ($tag, $value);
1235
1236 # find the tag name
1237 $metatag =~ m/(?:name|http-equiv)\s*=\s*([\"\'])?(.*?)\1/is;
1238 $tag=$2;
1239 # in case they're not using " or ', but they should...
1240 if (! $tag) {
1241 $metatag =~ m/(?:name|http-equiv)\s*=\s*([^\s\>]+)/is;
1242 $tag=$1;
1243 }
1244
1245 if (!defined $tag) {
1246 print $outhandle "HTMLPlugin: can't find NAME in \"$metatag\"\n";
1247 next;
1248 }
1249
1250 # don't need to assign this field if it was passed in from a previous
1251 # (recursive) plugin
1252 if (defined $metadata->{$tag}) {next}
1253
1254 # find the tag content
1255 $metatag =~ m/content\s*=\s*([\"\'])?(.*?)\1/is;
1256 $value=$2;
1257
1258 if (! $value) {
1259 $metatag =~ m/content\s*=\s*([^\s\>]+)/is;
1260 $value=$1;
1261 }
1262 if (!defined $value) {
1263 print $outhandle "HTMLPlugin: can't find VALUE in \"$metatag\"\n";
1264 next;
1265 }
1266
1267 # clean up and add
1268 $value =~ s/\s+/ /gs;
1269 chomp($value); # remove trailing \n, if any
1270 if (exists $creator_fields{lc($tag)}) {
1271 # map this value onto greenstone's "Creator" metadata
1272 $tag='Creator';
1273 } elsif (!exists $find_fields{lc($tag)}) {
1274 next; # don't want this tag
1275 } else {
1276 # get the user's preferred capitalisation
1277 $tag = $find_fields{lc($tag)};
1278 }
1279 if (lc($tag) eq "title") {
1280 $found_title = 1;
1281 }
1282
1283 if ($self->{'verbosity'} > 2) {
1284 print $outhandle " extracted \"$tag\" metadata \"$value\"\n";
1285 }
1286
1287 if ($tag =~ /\./) {
1288 # there is a . so has a namespace, add ex.
1289 $tag = "ex.$tag";
1290 }
1291 if (defined $separator) {
1292 my @values = split($separator, $value);
1293 foreach my $v (@values) {
1294 $doc_obj->add_utf8_metadata($section, $tag, $v) if $v =~ /\S/;
1295 }
1296 }
1297 else {
1298 $doc_obj->add_utf8_metadata($section, $tag, $value);
1299 }
1300 }
1301
1302 # TITLE: extract the document title
1303 if (exists $find_fields{'title'} && !$found_title) {
1304 # we want a title, and didn't find one in the meta tags
1305 # see if there's a <title> tag
1306 my $title;
1307 my $from = ""; # for debugging output only
1308 if ($html_header =~ m/<title[^>]*>([^<]+)<\/title[^>]*>/is) {
1309 $title = $1;
1310 $from = "<title> tags";
1311 }
1312
1313 if (!defined $title) {
1314 $from = "first 100 chars";
1315 # if no title use first 100 or so characters
1316 $title = $$textref;
1317 $title =~ s/^\xFE\xFF//; # Remove unicode byte order mark
1318 $title =~ s/^.*?<body>//si;
1319 # ignore javascript!
1320 $title =~ s@<script.*?</script>@ @sig;
1321 $title =~ s/<\/([^>]+)><\1>//g; # (eg) </b><b> - no space
1322 $title =~ s/<[^>]*>/ /g; # remove all HTML tags
1323 $title = substr ($title, 0, 100);
1324 $title =~ s/\s\S*$/.../;
1325 }
1326 $title =~ s/<[^>]*>/ /g; # remove html tags
1327 $title =~ s/&nbsp;/ /g;
1328 $title =~ s/(?:&nbsp;|\xc2\xa0)/ /g; # utf-8 for nbsp...
1329 $title =~ s/\s+/ /gs; # collapse multiple spaces
1330 $title =~ s/^\s*//; # remove leading spaces
1331 $title =~ s/\s*$//; # remove trailing spaces
1332
1333 $title =~ s/^$self->{'title_sub'}// if ($self->{'title_sub'});
1334 $title =~ s/^\s+//s; # in case title_sub introduced any...
1335 $doc_obj->add_utf8_metadata ($section, "Title", $title);
1336 print $outhandle " extracted Title metadata \"$title\" from $from\n"
1337 if ($self->{'verbosity'} > 2);
1338 }
1339
1340 # add FileFormat metadata
1341 $doc_obj->add_metadata($section,"FileFormat", "HTML");
1342
1343 # Special, for metadata names such as tagH1 - extracts
1344 # the text between the first <H1> and </H1> tags into "H1" metadata.
1345
1346 foreach my $field (keys %find_fields) {
1347 if ($field !~ m/^tag([a-z0-9]+)$/i) {next}
1348 my $tag = $1;
1349 if ($$textref =~ m@<$tag[^>]*>(.*?)</$tag[^>]*>@g) {
1350 my $content = $1;
1351 $content =~ s/&nbsp;/ /g;
1352 $content =~ s/<[^>]*>/ /g;
1353 $content =~ s/^\s+//;
1354 $content =~ s/\s+$//;
1355 $content =~ s/\s+/ /gs;
1356 if ($content) {
1357 $tag=$find_fields{"tag$tag"}; # get the user's capitalisation
1358 $tag =~ s/^tag//i;
1359 $doc_obj->add_utf8_metadata ($section, $tag, $content);
1360 print $outhandle " extracted \"$tag\" metadata \"$content\"\n"
1361 if ($self->{'verbosity'} > 2);
1362 }
1363 }
1364 }
1365}
1366
1367
1368# evaluate any "../" to next directory up
1369# evaluate any "./" as here
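# e.g. a path like "docs/images/../css/./style.css" (hypothetical) evaluates
# to "docs/css/style.css"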
1370sub eval_dir_dots {
1371 my $self = shift (@_);
1372 my ($filename) = @_;
1373 my $dirsep_os = &util::get_os_dirsep();
1374 my @dirsep = split(/$dirsep_os/,$filename);
1375
1376 my @eval_dirs = ();
1377 foreach my $d (@dirsep) {
1378 if ($d eq "..") {
1379 pop(@eval_dirs);
1380
1381 } elsif ($d eq ".") {
1382 # do nothing!
1383
1384 } else {
1385 push(@eval_dirs,$d);
1386 }
1387 }
1388
1389 # Need to fiddle with number of elements in @eval_dirs if the
1390 # first one is the empty string. This is because of a
1391# modification to util::filename_cat that suppresses the addition
1392 # of a leading '/' character (or \ if windows) (intended to help
1393 # filename cat with relative paths) if the first entry in the
1394 # array is the empty string. Making the array start with *two*
1395 # empty strings is a way to defeat this "smart" option.
1396 #
1397 if (scalar(@eval_dirs) > 0) {
1398 if ($eval_dirs[0] eq ""){
1399 unshift(@eval_dirs,"");
1400 }
1401 }
1402
1403 my $evaluated_filename = (scalar @eval_dirs > 0) ? &util::filename_cat(@eval_dirs) : "";
1404 return $evaluated_filename;
1405}
1406
1407sub replace_usemap_links {
1408 my $self = shift (@_);
1409 my ($front, $link, $back) = @_;
1410
1411 # remove quotes from link at start and end if necessary
1412 if ($link=~/^[\"\']/) {
1413 $link=~s/^[\"\']//;
1414 $link=~s/[\"\']$//;
1415 $front.='"';
1416 $back="\"$back";
1417 }
1418
1419 $link =~ s/^\.\///;
1420 return $front . $link . $back;
1421}
1422
1423sub inc_filecount {
1424 my $self = shift (@_);
1425
1426 if ($self->{'file_num'} == 1000) {
1427 $self->{'dir_num'} ++;
1428 $self->{'file_num'} = 0;
1429 } else {
1430 $self->{'file_num'} ++;
1431 }
1432}
1433
1434
1435# Extend read_file so that strings like &eacute; are
1436# converted to UTF8 internally.
1437#
1438# We don't convert &lt; or &gt; or &amp; or &quot; in case
1439# they interfere with the GML files
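# e.g. "caf&eacute;" in the source file ends up as "café" in $$textref, while
# "&lt;b&gt;" is left exactly as written.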
1440
1441sub read_file {
1442 my $self = shift(@_);
1443 my ($filename, $encoding, $language, $textref) = @_;
1444
1445 $self->SUPER::read_file($filename, $encoding, $language, $textref);
1446
1447 # Convert entities to their Unicode code-point equivalents
1448 $$textref =~ s/&(lt|gt|amp|quot|nbsp);/&z$1;/go;
1449 $$textref =~ s/&([^;]+);/&ghtml::getcharequiv($1,1,1)/gseo;
1450 $$textref =~ s/&z(lt|gt|amp|quot|nbsp);/&$1;/go;
1451
1452}
1453
1454sub HB_read_html_file {
1455 my $self = shift (@_);
1456 my ($htmlfile, $text) = @_;
1457
1458 # load in the file
1459 if (!open (FILE, $htmlfile)) {
1460 print STDERR "ERROR - could not open $htmlfile\n";
1461 return;
1462 }
1463
1464 my $foundbody = 0;
1465 $self->HB_gettext (\$foundbody, $text, "FILE");
1466 close FILE;
1467
1468 # just in case there was no <body> tag
1469 if (!$foundbody) {
1470 $foundbody = 1;
1471 open (FILE, $htmlfile) || return;
1472 $self->HB_gettext (\$foundbody, $text, "FILE");
1473 close FILE;
1474 }
1475 # text is in utf8
1476}
1477
1478# converts the text to utf8, as ghtml does that for &eacute; etc.
1479sub HB_gettext {
1480 my $self = shift (@_);
1481 my ($foundbody, $text, $handle) = @_;
1482
1483 my $line = "";
1484 while (defined ($line = <$handle>)) {
1485 # look for body tag
1486 if (!$$foundbody) {
1487 if ($line =~ s/^.*<body[^>]*>//i) {
1488 $$foundbody = 1;
1489 } else {
1490 next;
1491 }
1492 }
1493
1494 # check for symbol fonts
1495 if ($line =~ m/<font [^>]*?face\s*=\s*\"?(\w+)\"?/i) {
1496 my $font = $1;
1497 print STDERR "HBPlug::HB_gettext - warning removed font $font\n"
1498 if ($font !~ m/^arial$/i);
1499 }
1500
1501 $$text .= $line;
1502 }
1503
1504 if ($self->{'input_encoding'} eq "iso_8859_1") {
1505 # convert to utf-8
1506 $$text=&unicode::unicode2utf8(&unicode::convert2unicode("iso_8859_1", $text));
1507 }
1508 # convert any alphanumeric character entities to their utf-8
1509 # equivalent for indexing purposes
1510 #&ghtml::convertcharentities ($$text);
1511
1512 $$text =~ s/\s+/ /g; # remove \n's
1513
1514 # At this point $$text is a binary byte string
1515 # => turn it into a Unicode aware string, so full
1516 # Unicode aware pattern matching can be used.
1517 # For instance: 's/\x{0101}//g' or '[[:upper:]]'
1518 #
1519
1520 $$text = decode("utf8",$$text);
1521}
1522
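# HB_clean_section: tidies one section of old-HDL HTML by dropping closing tags
# that have no opening tag, collapsing runs of <p> tags, trimming trailing
# formatting tags and whitespace, re-wrapping long lines, and centring images.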
1523sub HB_clean_section {
1524 my $self = shift (@_);
1525 my ($section) = @_;
1526
1527 # remove tags without a starting tag from the section
1528 my ($tag, $tagstart);
1529 while ($section =~ m/<\/([^>]{1,10})>/) {
1530 $tag = $1;
1531 $tagstart = index($section, "<$tag");
1532 last if (($tagstart >= 0) && ($tagstart < index($section, "<\/$tag")));
1533 $section =~ s/<\/$tag>//;
1534 }
1535
1536 # remove extra paragraph tags
1537 while ($section =~ s/<p\b[^>]*>\s*<p\b/<p/ig) {}
1538
1539 # remove extra stuff at the end of the section
1540 while ($section =~ s/(<u>|<i>|<b>|<p\b[^>]*>|&nbsp;|\s)$//i) {}
1541
1542 # add a newline at the beginning of each paragraph
1543 $section =~ s/(.)\s*<p\b/$1\n\n<p/gi;
1544
1545 # add a newline every 80 characters at a word boundary
1546 # Note: this regular expression puts a line feed before
1547 # the last word in each section, even when it is not
1548 # needed.
1549 $section =~ s/(.{1,80})\s/$1\n/g;
1550
1551 # fix up the image links
1552 $section =~ s/<img[^>]*?src=\"?([^\">]+)\"?[^>]*>/
1553 <center><img src=\"$1\" \/><\/center><br\/>/ig;
1554 $section =~ s/&lt;&lt;I&gt;&gt;\s*([^\.]+\.(png|jpg|gif))/
1555 <center><img src=\"$1\" \/><\/center><br\/>/ig;
1556
1557 return $section;
1558}
1559
1560# Converts the old HDL format to the new HDL format (using the Section tag)
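# In the old format each section heading is marked with an escaped <<TOCn>>
# token (i.e. &lt;&lt;TOCn&gt;&gt; in the HTML); every such marker starts a new
# section at toc level n, with the text that follows it used as the Title.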
1561sub convert_to_newHDLformat
1562{
1563 my $self = shift (@_);
1564 my ($file,$cnfile) = @_;
1565 my $input_filename = $file;
1566 my $tmp_filename = $cnfile;
1567
1568 # write HTML tmp file with new HDL format
1569 open (PROD, ">$tmp_filename") || die("Error Writing to File: $tmp_filename $!");
1570
1571 # read in the file and do basic html cleaning (removing header etc)
1572 my $html = "";
1573 $self->HB_read_html_file ($input_filename, \$html);
1574
1575 # process the file one section at a time
1576 my $curtoclevel = 1;
1577 my $firstsection = 1;
1578 my $toclevel = 0;
1579 while (length ($html) > 0) {
1580 if ($html =~ s/^.*?(?:<p\b[^>]*>)?((<b>|<i>|<u>|\s)*)&lt;&lt;TOC(\d+)&gt;&gt;\s*(.*?)<p\b/<p/i) {
1581 $toclevel = $3;
1582 my $title = $4;
1583 my $sectiontext = "";
1584 if ($html =~ s/^(.*?)((?:<p\b[^>]*>)?((<b>|<i>|<u>|\s)*)&lt;&lt;TOC\d+&gt;&gt;)/$2/i) {
1585 $sectiontext = $1;
1586 } else {
1587 $sectiontext = $html;
1588 $html = "";
1589 }
1590
1591 # remove tags and extra spaces from the title
1592 $title =~ s/<\/?[^>]+>//g;
1593 $title =~ s/^\s+|\s+$//g;
1594
1595 # close any sections below the current level and
1596 # create a new section (special case for the firstsection)
1597 print PROD "<!--\n";
1598 while (($curtoclevel > $toclevel) ||
1599 (!$firstsection && $curtoclevel == $toclevel)) {
1600 $curtoclevel--;
1601 print PROD "</Section>\n";
1602 }
1603 if ($curtoclevel+1 < $toclevel) {
1604 print STDERR "WARNING - jump in toc levels in $input_filename " .
1605 "from $curtoclevel to $toclevel\n";
1606 }
1607 while ($curtoclevel < $toclevel) {
1608 $curtoclevel++;
1609 }
1610
1611 if ($curtoclevel == 1) {
1612 # add the header tag
1613 print PROD "-->\n";
1614 print PROD "<HTML>\n<HEAD>\n<TITLE>$title</TITLE>\n</HEAD>\n<BODY>\n";
1615 print PROD "<!--\n";
1616 }
1617
1618 print PROD "<Section>\n\t<Description>\n\t\t<Metadata name=\"Title\">$title</Metadata>\n\t</Description>\n";
1619
1620 print PROD "-->\n";
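	    # e.g. for a section titled "Introduction" the two prints above emit
	    # (inside an HTML comment, so browsers ignore the structural markup):
	    #   <Section>
	    #       <Description>
	    #           <Metadata name="Title">Introduction</Metadata>
	    #       </Description>
	    #   -->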
1621
1622 # clean up the section html
1623 $sectiontext = $self->HB_clean_section($sectiontext);
1624
1625 print PROD "$sectiontext\n";
1626
1627 } else {
1628 print STDERR "WARNING - leftover text\n" , $self->shorten($html),
1629 "\nin $input_filename\n";
1630 last;
1631 }
1632 $firstsection = 0;
1633 }
1634
1635 print PROD "<!--\n";
1636 while ($curtoclevel > 0) {
1637 $curtoclevel--;
1638 print PROD "</Section>\n";
1639 }
1640 print PROD "-->\n";
1641
1642 close (PROD) || die("Error Closing File: $tmp_filename $!");
1643
1644 return $tmp_filename;
1645}
1646
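# shorten: returns a quoted excerpt of $text for use in warning messages -- the
# whole string if it is under 100 characters, otherwise its first and last 50
# characters.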
1647sub shorten {
1648 my $self = shift (@_);
1649 my ($text) = @_;
1650
1651 return "\"$text\"" if (length($text) < 100);
1652
1653 return "\"" . substr ($text, 0, 50) . "\" ... \"" .
1654 substr ($text, length($text)-50) . "\"";
1655}
1656
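# convert_tidy_or_oldHDL_file: mirrors the input file into the collection's
# tidytmp directory; HTML-like files are optionally converted from old-style
# HDL to the new Section-based format and/or tidied into XML, other files are
# simply copied. Returns the filename of the copy to be processed.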
1657sub convert_tidy_or_oldHDL_file
1658{
1659 my $self = shift (@_);
1660 my ($file) = @_;
1661 my $input_filename = $file;
1662
1663 if (-d $input_filename)
1664 {
1665 return $input_filename;
1666 }
1667
1668 # get the input filename
1669 my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
1670 my $base_dirname = $dirname;
1671 $suffix = lc($suffix);
1672
1673 # derive tmp filename from input filename
1674 # Remove any whitespace from the filename -- no risk of name collision, and
1675 # it makes later conversion by utils simpler. Leave spaces in the path...
1676 # also strip dots and hyphens from the filename
1677 $tailname =~ s/\s+//g;
1678 $tailname =~ s/\.+//g;
1679 $tailname =~ s/\-+//g;
1680 # convert to utf-8 otherwise we have problems with the doc.xml file
1681 # later on
1682 &unicode::ensure_utf8(\$tailname);
1683
1684 # softlink to collection tmp dir
1685 my $tmp_dirname = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "tidytmp");
1686 &util::mk_dir($tmp_dirname) if (!-e $tmp_dirname);
1687
1688 my $test_dirname = "";
1689 my $f_separator = &util::get_os_dirsep();
1690
1691 if ($dirname =~ m/import$f_separator/)
1692 {
1693 $test_dirname = $'; #'
1694
1695 #print STDERR "init $'\n";
1696
1697 while ($test_dirname =~ m/[$f_separator]/)
1698 {
1699 my $folderdirname = $`;
1700 $tmp_dirname = &util::filename_cat($tmp_dirname,$folderdirname);
1701 &util::mk_dir($tmp_dirname) if (!-e $tmp_dirname);
1702 $test_dirname = $'; #'
1703 }
1704 }
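    # e.g. (illustrative path) a file under import/books/ch1/ will have its
    # working copy created in <GSDLCOLLECTDIR>/tidytmp/books/ch1/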
1705
1706 my $tmp_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix");
1707
1708 # tidy or convert the input file if it has an HTML-like suffix (.htm, .html or .shtml)
1709 if (($suffix eq ".htm") || ($suffix eq ".html") || ($suffix eq ".shtml"))
1710 {
1711 #convert the input file to a new style HDL
1712 my $hdl_output_filename = $input_filename;
1713 if ($self->{'old_style_HDL'})
1714 {
1715 $hdl_output_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix");
1716 $hdl_output_filename = $self->convert_to_newHDLformat($input_filename,$hdl_output_filename);
1717 }
1718
1719	# copy every other file from the base dir into the tmp dir if it does not already exist there
1720 opendir(DIR,$base_dirname) or die "Can't open base directory : $base_dirname!";
1721 my @files = grep {!/^\.+$/} readdir(DIR);
1722	closedir(DIR);
1723
1724 foreach my $file (@files)
1725 {
1726 my $src_file = &util::filename_cat($base_dirname,$file);
1727 my $dest_file = &util::filename_cat($tmp_dirname,$file);
1728 if ((!-e $dest_file) && (!-d $src_file))
1729 {
1730 # just copy the original file back to the tmp directory
1731 copy($src_file,$dest_file) or die "Can't copy file $src_file to $dest_file $!";
1732 }
1733 }
1734
1735 # tidy the input file
1736 my $tidy_output_filename = $hdl_output_filename;
1737 if ($self->{'use_realistic_book'})
1738 {
1739 $tidy_output_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix");
1740 $tidy_output_filename = $self->tmp_tidy_file($hdl_output_filename,$tidy_output_filename);
1741 }
1742 $tmp_filename = $tidy_output_filename;
1743 }
1744 else
1745 {
1746 if (!-e $tmp_filename)
1747 {
1748 # just copy the original file back to the tmp directory
1749 copy($input_filename,$tmp_filename) or die "Can't copy file $input_filename to $tmp_filename $!";
1750 }
1751 }
1752
1753 return $tmp_filename;
1754}
1755
1756
1757# Turns the HTML input file into a proper XML file, with font tags removed
1758# and image dimensions added to each img tag.
1759# The tidying process takes place in a collection specific 'tmp' directory so
1760# that we don't accidentally damage the input.
1761sub tmp_tidy_file
1762{
1763 my $self = shift (@_);
1764 my ($file,$cnfile) = @_;
1765 my $input_filename = $file;
1766 my $tmp_filename = $cnfile;
1767
1768 # get the input filename
1769 my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
1770
1771 require HTML::TokeParser::Simple;
1772
1773 # create HTML parser to decode the input file
1774 my $parser = HTML::TokeParser::Simple->new($input_filename);
1775
1776 # write the HTML tmp file with font tags removed and width/height attributes added to each img tag
1777 open (PROD, ">$tmp_filename") || die("Error Writing to File: $tmp_filename $!");
1778 while (my $token = $parser->get_token())
1779 {
1780 # is it an img tag
1781 if ($token->is_start_tag('img'))
1782 {
1783 # get the attributes
1784 my $attr = $token->return_attr;
1785
1786 # get the full path to the image
1787 my $img_file = &util::filename_cat($dirname,$attr->{src});
1788
1789 # set the width and height attribute
1790 ($attr->{width}, $attr->{height}) = imgsize($img_file);
1791
1792 # recreate the tag
1793 print PROD "<img";
1794 print PROD map { qq { $_="$attr->{$_}"} } keys %$attr;
1795 print PROD ">";
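	    # e.g. (illustrative name and sizes) <img src="fig1.png"> is rewritten
	    # as something like <img src="fig1.png" width="320" height="240">
	    # (attribute order depends on the hash)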
1796 }
1797 # is it a font tag
1798 else
1799 {
1800 if (($token->is_start_tag('font')) || ($token->is_end_tag('font')))
1801 {
1802 # remove font tag
1803 print PROD "";
1804 }
1805 else
1806 {
1807 # print without changes
1808 print PROD $token->as_is;
1809 }
1810 }
1811 }
1812 close (PROD) || die("Error Closing File: $tmp_filename $!");
1813
1814 # run html-tidy on the tmp file to make it a proper XML file
1815
1816 my $outhandle = $self->{'outhandle'};
1817 print $outhandle "Converting HTML to be XML compliant:\n";
1818
1819 my $tidy_cmd = "tidy";
1820 $tidy_cmd .= " -q" if ($self->{'verbosity'} <= 2);
1821 $tidy_cmd .= " -raw -wrap 0 -asxml \"$tmp_filename\"";
1822 if ($self->{'verbosity'} <= 2) {
1823 if ($ENV{'GSDLOS'} =~ m/^windows/i) {
1824 $tidy_cmd .= " 2>nul";
1825 }
1826 else {
1827 $tidy_cmd .= " 2>/dev/null";
1828 }
1829 print $outhandle " => $tidy_cmd\n";
1830 }
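    # e.g. (illustrative path) on a non-Windows system at low verbosity the
    # command run below looks like:
    #   tidy -q -raw -wrap 0 -asxml "/path/to/collect/tidytmp/doc.html" 2>/dev/null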
1831
1832 my $tidyfile = `$tidy_cmd`;
1833
1834 # write result back to the tmp file
1835 open (PROD, ">$tmp_filename") || die("Error Writing to File: $tmp_filename $!");
1836 print PROD $tidyfile;
1837 close (PROD) || die("Error Closing File: $tmp_filename $!");
1838
1839 # return the output filename
1840 return $tmp_filename;
1841}
1842
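# associate_cover_image: when realistic-book or old-style HDL processing is in
# use the document was read from tidytmp, so map the path back to the import
# directory before the base class looks for a cover image.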
1843sub associate_cover_image
1844{
1845 my $self = shift(@_);
1846 my ($doc_obj, $filename) = @_;
1847 if (($self->{'use_realistic_book'}) || ($self->{'old_style_HDL'}))
1848 {
1849	# the cover image will be under tidytmp, but we want the one from import
1850 $filename =~ s/([\\\/])tidytmp([\\\/])/$1import$2/;
1851 }
1852 $self->SUPER::associate_cover_image($doc_obj, $filename);
1853}
1854
1855
18561;