Context Navigation

source: main/trunk/greenstone2/perllib/plugins/HTMLPlugin.pm@ 27703

Last change on this file since 27703 was 27703, checked in by ak19, 11 years ago
Dr Bainbridge fixed the final diffcol issue with Small-HTML on windows where links in the body of the text had backslashes on Windows. They needed to be linux style slashes. The code change and logic was slightly more complicated as the links were used to test whether referenced files existed on the file system and were therefore relative (to the GS collection) or external links.
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 66.0 KB

Line
1	###########################################################################
2	#
3	# HTMLPlugin.pm -- basic html plugin
4	#
5	# A component of the Greenstone digital library software
6	# from the New Zealand Digital Library Project at the
7	# University of Waikato, New Zealand.
8	#
9	# Copyright (C) 1999 New Zealand Digital Library Project
10	#
11	# This program is free software; you can redistribute it and/or modify
12	# it under the terms of the GNU General Public License as published by
13	# the Free Software Foundation; either version 2 of the License, or
14	# (at your option) any later version.
15	#
16	# This program is distributed in the hope that it will be useful,
17	# but WITHOUT ANY WARRANTY; without even the implied warranty of
18	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	# GNU General Public License for more details.
20	#
21	# You should have received a copy of the GNU General Public License
22	# along with this program; if not, write to the Free Software
23	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24	#
25	###########################################################################
26
27	#
28	# Note that this plugin handles frames only in a very simple way
29	# i.e. each frame is treated as a separate document. This means
30	# search results will contain links to individual frames rather
31	# than linking to the top level frameset.
32	# There may also be some problems caused by the _parent target
33	# (it's removed by this plugin)
34	#
35
36	package HTMLPlugin;
37
38	use Encode;
39	use Unicode::Normalize 'normalize';
40
41	use ReadTextFile;
42	use HBPlugin;
43	use ghtml;
44	use unicode;
45	use util;
46	use FileUtils;
47	use XMLParser;
48
49	use Image::Size;
50	use File::Copy;
51
52	sub BEGIN {
53	@HTMLPlugin::ISA = ('ReadTextFile', 'HBPlugin');
54	}
55
56	use strict; # every perl program should have this!
57	no strict 'refs'; # make an exception so we can use variables as filehandles
58
59	my $arguments =
60	[ { 'name' => "process_exp",
61	'desc' => "{BasePlugin.process_exp}",
62	'type' => "regexp",
63	'deft' => &get_default_process_exp() },
64	{ 'name' => "block_exp",
65	'desc' => "{BasePlugin.block_exp}",
66	'type' => 'regexp',
67	'deft' => &get_default_block_exp() },
68	{ 'name' => "nolinks",
69	'desc' => "{HTMLPlugin.nolinks}",
70	'type' => "flag" },
71	{ 'name' => "keep_head",
72	'desc' => "{HTMLPlugin.keep_head}",
73	'type' => "flag" },
74	{ 'name' => "no_metadata",
75	'desc' => "{HTMLPlugin.no_metadata}",
76	'type' => "flag" },
77	{ 'name' => "metadata_fields",
78	'desc' => "{HTMLPlugin.metadata_fields}",
79	'type' => "string",
80	'deft' => "Title" },
81	{ 'name' => "metadata_field_separator",
82	'desc' => "{HTMLPlugin.metadata_field_separator}",
83	'type' => "string",
84	'deft' => "" },
85	{ 'name' => "hunt_creator_metadata",
86	'desc' => "{HTMLPlugin.hunt_creator_metadata}",
87	'type' => "flag" },
88	{ 'name' => "file_is_url",
89	'desc' => "{HTMLPlugin.file_is_url}",
90	'type' => "flag" },
91	{ 'name' => "assoc_files",
92	'desc' => "{HTMLPlugin.assoc_files}",
93	'type' => "regexp",
94	'deft' => &get_default_block_exp() },
95	{ 'name' => "rename_assoc_files",
96	'desc' => "{HTMLPlugin.rename_assoc_files}",
97	'type' => "flag" },
98	{ 'name' => "title_sub",
99	'desc' => "{HTMLPlugin.title_sub}",
100	'type' => "string",
101	'deft' => "" },
102	{ 'name' => "description_tags",
103	'desc' => "{HTMLPlugin.description_tags}",
104	'type' => "flag" },
105	# retain this for backward compatibility (w3mir option was replaced by
106	# file_is_url)
107	{ 'name' => "w3mir",
108	# 'desc' => "{HTMLPlugin.w3mir}",
109	'type' => "flag",
110	'hiddengli' => "yes"},
111	{ 'name' => "no_strip_metadata_html",
112	'desc' => "{HTMLPlugin.no_strip_metadata_html}",
113	'type' => "string",
114	'deft' => "",
115	'reqd' => "no"},
116	{ 'name' => "sectionalise_using_h_tags",
117	'desc' => "{HTMLPlugin.sectionalise_using_h_tags}",
118	'type' => "flag" },
119	{ 'name' => "use_realistic_book",
120	'desc' => "{HTMLPlugin.tidy_html}",
121	'type' => "flag"},
122	{ 'name' => "old_style_HDL",
123	'desc' => "{HTMLPlugin.old_style_HDL}",
124	'type' => "flag"},
125	{'name' => "processing_tmp_files",
126	'desc' => "{BasePlugin.processing_tmp_files}",
127	'type' => "flag",
128	'hiddengli' => "yes"}
129	];
130
131	my $options = { 'name' => "HTMLPlugin",
132	'desc' => "{HTMLPlugin.desc}",
133	'abstract' => "no",
134	'inherits' => "yes",
135	'args' => $arguments };
136
137
138	sub new {
139	my ($class) = shift (@_);
140	my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
141	push(@$pluginlist, $class);
142
143	push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
144	push(@{$hashArgOptLists->{"OptList"}},$options);
145
146
147	my $self = new ReadTextFile($pluginlist,$inputargs,$hashArgOptLists);
148
149	if ($self->{'w3mir'}) {
150	$self->{'file_is_url'} = 1;
151	}
152	$self->{'aux_files'} = {};
153	$self->{'dir_num'} = 0;
154	$self->{'file_num'} = 0;
155
156	return bless $self, $class;
157	}
158
159	# may want to use (?i)\.(gif\|jpe?g\|jpe\|png\|css\|js(?:@.*)?)$
160	# if have eg <script language="javascript" src="img/lib.js@123">
161	# blocking is now done by reading through the file and recording all the
162	# images and other files
163	sub get_default_block_exp {
164	my $self = shift (@_);
165
166	#return q^(?i)\.(gif\|jpe?g\|jpe\|jpg\|png\|css)$^;
167	return "";
168	}
169
170	sub get_default_process_exp {
171	my $self = shift (@_);
172
173	# the last option is an attempt to encode the concept of an html query ...
174	return q^(?i)(\.html?\|\.shtml\|\.shm\|\.asp\|\.php\d?\|\.cgi\|.+\?.+=.*)$^;
175	}
176
177	sub store_block_files
178	{
179	my $self =shift (@_);
180	my ($filename_full_path, $block_hash) = @_;
181
182	my $html_fname = $filename_full_path;
183
184	my ($language, $content_encoding) = $self->textcat_get_language_encoding ($filename_full_path);
185	$self->{'store_content_encoding'}->{$filename_full_path} = $content_encoding;
186
187	# read in file ($text will be in utf8)
188	my $raw_text = "";
189	$self->read_file_no_decoding($filename_full_path, \$raw_text);
190
191	my $textref = \$raw_text;
192	my $opencom = '(?:<!--\|<!(?:—\|\|--))';
193	my $closecom = '(?:-->\|(?:—\|\|--)>)';
194	$$textref =~ s/$opencom(.*?)$closecom//gs;
195
196	# Convert entities to their UTF8 equivalents
197	$$textref =~ s/&(lt\|gt\|amp\|quot\|nbsp);/&z$1;/go;
198	$$textref =~ s/&([^;]+);/&ghtml::getcharequiv($1,1,0)/gseo; # on this occassion, want it left as utf8
199	$$textref =~ s/&z(lt\|gt\|amp\|quot\|nbsp);/&$1;/go;
200
201	my $attval = "\\\"[^\\\"]+\\\"\|[^\\s>]+";
202	my @img_matches = ($$textref =~ m/<img[^>]?src\s=\s($attval)[^>]>/igs);
203	my @usemap_matches = ($$textref =~ m/<img[^>]?usemap\s=\s($attval)[^>]>/igs);
204	my @link_matches = ($$textref =~ m/<link[^>]?href\s=\s($attval)[^>]>/igs);
205	my @embed_matches = ($$textref =~ m/<embed[^>]?src\s=\s($attval)[^>]>/igs);
206	my @tabbg_matches = ($$textref =~ m/<(?:body\|table\|tr\|td)[^>]?background\s=\s($attval)[^>]>/igs);
207	my @script_matches = ($$textref =~ m/<script[^>]?src\s=\s($attval)[^>]>/igs);
208
209	if(!defined $self->{'unicode_to_original_filename'}) {
210	# maps from utf8 converted link name -> original filename referrred to by (possibly URL-encoded) src url
211	$self->{'unicode_to_original_filename'} = {};
212	}
213
214	foreach my $raw_link (@img_matches, @usemap_matches, @link_matches, @embed_matches, @tabbg_matches, @script_matches) {
215
216	# remove quotes from link at start and end if necessary
217	if ($raw_link =~ m/^\"/) {
218	$raw_link =~ s/^\"//;
219	$raw_link =~ s/\"$//;
220	}
221
222	# remove any anchor names, e.g. foo.html#name becomes foo.html
223	# but watch out for any #'s that are part of entities, such as α
224	$raw_link =~ s/([^&])\#.*$/$1/s;
225
226	# some links may just be anchor names
227	next unless ($raw_link =~ /\S+/);
228
229	if ($raw_link !~ m@^/@ && $raw_link !~ m/^([A-Z]:?)\\/i) {
230	# Turn relative file path into full path
231	my $dirname = &File::Basename::dirname($filename_full_path);
232	$raw_link = &FileUtils::filenameConcatenate($dirname, $raw_link);
233	}
234	$raw_link = $self->eval_dir_dots($raw_link);
235
236	# this is the actual filename on the filesystem (that the link refers to)
237	my $url_original_filename = $self->opt_url_decode($raw_link);
238
239	my ($uses_bytecodes,$exceeds_bytecodes) = &unicode::analyze_raw_string($url_original_filename);
240
241	if ($exceeds_bytecodes) {
242	# We have a link to a file name that is more complicated than a raw byte filename
243	# What we do next depends on the operating system we are on
244
245	if ($ENV{'GSDLOS'} =~ /^(linux\|solaris)$/i) {
246	# Assume we're dealing with a UTF-8 encoded filename
247	$url_original_filename = encode("utf8", $url_original_filename);
248	}
249	elsif ($ENV{'GSDLOS'} =~ /^darwin$/i) {
250	# HFS+ is UTF8 with decompostion
251	$url_original_filename = encode("utf8", $url_original_filename);
252	$url_original_filename = normalize('D', $url_original_filename); # Normalization Form D (decomposition)
253	}
254	elsif ($ENV{'GSDLOS'} =~ /^windows$/i) {
255	# Don't need to do anything as later code maps Windows
256	# unicode filenames to DOS short filenames when needed
257	}
258	else {
259	my $outhandle = $self->{'outhandle'};
260	print $outhandle "Warning: Unrecognized operating system ", $ENV{'GSDLOS'}, "\n";
261	print $outhandle " in raw file system encoding of: $raw_link\n";
262	print $outhandle " Assuming filesystem is UTF-8 based.\n";
263	$url_original_filename = encode("utf8", $url_original_filename);
264	}
265	}
266
267	# Convert the (currently raw) link into its Unicode version.
268	# Store the Unicode link along with the url_original_filename
269	my $unicode_url_original_filename = "";
270	$self->decode_text($raw_link,$content_encoding,$language,\$unicode_url_original_filename);
271
272
273	$self->{'unicode_to_original_filename'}->{$unicode_url_original_filename} = $url_original_filename;
274
275
276	if ($url_original_filename ne $unicode_url_original_filename) {
277	my $outhandle = $self->{'outhandle'};
278
279	print $outhandle "URL Encoding $url_original_filename\n";
280	print $outhandle " ->$unicode_url_original_filename\n";
281
282	# Allow for possibility of raw byte version and Unicode versions of file
283	&util::block_filename($block_hash,$unicode_url_original_filename);
284	}
285
286	# $url_original_filename = &util::upgrade_if_dos_filename($url_original_filename);
287	&util::block_filename($block_hash,$url_original_filename);
288
289	}
290	}
291
292	# Given a filename in any encoding, will URL decode it to get back the original filename
293	# in the original encoding. Because this method is intended to work out the original
294	# filename, it does not URL decode any filename if a file by the name of the URL-encoded*
295	# string already exists in the local folder.
296	#
297	sub opt_url_decode {
298	my $self = shift (@_);
299	my ($raw_link) = @_;
300
301
302	# Replace %XX's in URL with decoded value if required.
303	# Note that the filename may include the %XX in some situations
304
305	## if ($raw_link =~ m/\%[A-F0-9]{2}/i) {
306
307	if (($raw_link =~ m/\%[A-F0-9]{2}/i) \|\| ($raw_link =~ m/\&\#x[0-9A-F]+;/i) \|\| ($raw_link =~ m/\&\#[0-9]+;/i)) {
308	if (!-e $raw_link) {
309	$raw_link = &unicode::url_decode($raw_link,1);
310	}
311	}
312
313	return $raw_link;
314	}
315
316	sub read_into_doc_obj
317	{
318	my $self = shift (@_);
319	my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
320
321	my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
322
323	# Lookup content_encoding worked out in file_block pass for this file
324	# Store it under the local name 'content_encoding' so its nice and
325	# easy to access
326	$self->{'content_encoding'} = $self->{'store_content_encoding'}->{$filename_full_path};
327
328	# get the input file
329	my $input_filename = $file;
330	my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
331	$suffix = lc($suffix);
332	my $tidy_filename;
333	if (($self->{'use_realistic_book'}) \|\| ($self->{'old_style_HDL'}))
334	{
335	# because the document has to be sectionalized set the description tags
336	$self->{'description_tags'} = 1;
337
338	# set the file to be tidied
339	$input_filename = &FileUtils::filenameConcatenate($base_dir,$file) if $base_dir =~ m/\w/;
340
341	# get the tidied file
342	#my $tidy_filename = $self->tmp_tidy_file($input_filename);
343	$tidy_filename = $self->convert_tidy_or_oldHDL_file($input_filename);
344
345	# derive tmp filename from input filename
346	my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($tidy_filename, "\\.[^\\.]+\$");
347
348	# set the new input file and base_dir to be from the tidied file
349	$file = "$tailname$suffix";
350	$base_dir = $dirname;
351	}
352
353	# call the parent read_into_doc_obj
354	my ($process_status,$doc_obj) = $self->SUPER::read_into_doc_obj($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli);
355	if (($self->{'use_realistic_book'}) \|\| ($self->{'old_style_HDL'}))
356	{
357	# now we need to reset the filenames in the doc obj so that the converted filenames are not used
358	my $collect_file = &util::filename_within_collection($filename_full_path);
359	$doc_obj->set_source_filename ($collect_file, $self->{'file_rename_method'});
360	## set_source_filename does not set the doc_obj source_path which is used in archives dbs for incremental
361	# build. So set it manually.
362	$doc_obj->set_source_path($filename_full_path);
363	my $collect_conv_file = &util::filename_within_collection($tidy_filename);
364	$doc_obj->set_converted_filename($collect_conv_file);
365
366	my $plugin_filename_encoding = $self->{'filename_encoding'};
367	my $filename_encoding = $self->deduce_filename_encoding($file,$metadata,$plugin_filename_encoding);
368	$self->set_Source_metadata($doc_obj, $filename_full_path, $filename_encoding);
369	}
370
371	delete $self->{'store_content_encoding'}->{$filename_full_path};
372	$self->{'content_encoding'} = undef;
373
374	return ($process_status,$doc_obj);
375	}
376
377	# do plugin specific processing of doc_obj
378	sub process {
379	my $self = shift (@_);
380	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
381	my $outhandle = $self->{'outhandle'};
382
383	if ($ENV{'GSDLOS'} =~ m/^windows/i) {
384	# this makes life so much easier... perl can cope with unix-style '/'s.
385	$base_dir =~ s@(\\)+@/@g;
386	$file =~ s@(\\)+@/@g;
387	}
388
389	my $filename = &FileUtils::filenameConcatenate($base_dir,$file);
390	my $upgraded_base_dir = &util::upgrade_if_dos_filename($base_dir);
391	my $upgraded_filename = &util::upgrade_if_dos_filename($filename);
392
393	if ($ENV{'GSDLOS'} =~ m/^windows/i) {
394	# And again
395	$upgraded_base_dir =~ s@(\\)+@/@g;
396	$upgraded_filename =~ s@(\\)+@/@g;
397
398	# Need to make sure there is a '/' on the end of upgraded_base_dir
399	if (($upgraded_base_dir ne "") && ($upgraded_base_dir !~ m/\/$/)) {
400	$upgraded_base_dir .= "/";
401	}
402	}
403	my $upgraded_file = &util::filename_within_directory($upgraded_filename,$upgraded_base_dir);
404
405	# reset per-doc stuff...
406	$self->{'aux_files'} = {};
407	$self->{'dir_num'} = 0;
408	$self->{'file_num'} = 0;
409
410	# process an HTML file where sections are divided by headings tags (H1, H2 ...)
411	# you can also include metadata in the format (X can be any number)
412	# <hX>Title<!--gsdl-metadata
413	# <Metadata name="name1">value1</Metadata>
414	# ...
415	# <Metadata name="nameN">valueN</Metadata>
416	#--></hX>
417	if ($self->{'sectionalise_using_h_tags'}) {
418	# description_tags should allways be activated because we convert headings to description tags
419	$self->{'description_tags'} = 1;
420
421	my $arrSections = [];
422	$$textref =~ s/<h([0-9]+)[^>]>(.?)<\/h[0-9]+>/$self->process_heading($1, $2, $arrSections, $upgraded_file)/isge;
423
424	if (scalar(@$arrSections)) {
425	my $strMetadata = $self->update_section_data($arrSections, -1);
426	if (length($strMetadata)) {
427	$strMetadata = '<!--' . $strMetadata . "\n-->\n</body>";
428	$$textref =~ s/<\/body>/$strMetadata/ig;
429	}
430	}
431	}
432
433	my $cursection = $doc_obj->get_top_section();
434
435	$self->extract_metadata ($textref, $metadata, $doc_obj, $cursection)
436	unless $self->{'no_metadata'} \|\| $self->{'description_tags'};
437
438	# Store URL for page as metadata - this can be used for an
439	# altavista style search interface. The URL won't be valid
440	# unless the file structure contains the domain name (i.e.
441	# like when w3mir is used to download a website).
442
443	# URL metadata (even invalid ones) are used to support internal
444	# links, so even if 'file_is_url' is off, still need to store info
445
446	my ($tailname,$dirname) = &File::Basename::fileparse($upgraded_file);
447
448	# my $utf8_file = $self->filename_to_utf8_metadata($file);
449	# $utf8_file =~ s/&\#095;/_/g;
450	# variable below used to be utf8_file
451
452	my $url_encoded_file = &unicode::raw_filename_to_url_encoded($tailname);
453	my $utf8_url_encoded_file = &unicode::raw_filename_to_utf8_url_encoded($tailname);
454
455	my $web_url = "http://";
456	my $utf8_web_url = "http://";
457	if(defined $dirname) { # local directory
458	# Check for "ftp" in the domain name of the directory
459	# structure to determine if this URL should be a ftp:// URL
460	# This check is not infallible, but better than omitting the
461	# check, which would cause all files downloaded from ftp sites
462	# via mirroring with wget to have potentially erroneous http:// URLs
463	# assigned in their metadata
464	if ($dirname =~ /^[^\/]*ftp/i)
465	{
466	$web_url = "ftp://";
467	$utf8_web_url = "ftp://";
468	}
469	$dirname = $self->eval_dir_dots($dirname);
470	$dirname .= &util::get_dirsep() if $dirname ne ""; # if there's a directory, it should end on "/"
471
472	$web_url = $web_url.$dirname.$url_encoded_file;
473	$utf8_web_url = $utf8_web_url.$dirname.$utf8_url_encoded_file;
474	} else {
475	$web_url = $web_url.$url_encoded_file;
476	$utf8_web_url = $utf8_web_url.$utf8_url_encoded_file;
477	}
478	$web_url =~ s/\\/\//g;
479	$utf8_web_url =~ s/\\/\//g;
480
481	if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
482	print STDERR "*******DEBUG: upgraded_file: $upgraded_file\n";
483	print STDERR "*******DEBUG: adding URL metadata: $utf8_url_encoded_file\n";
484	}
485
486
487	$doc_obj->add_utf8_metadata($cursection, "URL", $web_url);
488	$doc_obj->add_utf8_metadata($cursection, "UTF8URL", $utf8_web_url);
489
490	if ($self->{'file_is_url'}) {
491	$doc_obj->add_metadata($cursection, "weblink", "<a href=\"$web_url\">");
492	$doc_obj->add_metadata($cursection, "webicon", "_iconworld_");
493	$doc_obj->add_metadata($cursection, "/weblink", "</a>");
494	}
495
496	if ($self->{'description_tags'}) {
497	# remove the html header - note that doing this here means any
498	# sections defined within the header will be lost (so all <Section>
499	# tags must appear within the body of the HTML)
500	my ($head_keep) = ($$textref =~ m/^(.?)<body[^>]>/is);
501
502	$$textref =~ s/^.?<body[^>]>//is;
503	$$textref =~ s/(<\/body[^>]>\|<\/html[^>]>)//isg;
504
505	my $opencom = '(?:<!--\|<!(?:—\|\|--))';
506	my $closecom = '(?:-->\|(?:—\|\|--)>)';
507
508	my $lt = '(?:<\|<)';
509	my $gt = '(?:>\|>)';
510	my $quot = '(?:"\|"\|”\|“)';
511
512	my $dont_strip = '';
513	if ($self->{'no_strip_metadata_html'}) {
514	($dont_strip = $self->{'no_strip_metadata_html'}) =~ s{,}{\|}g;
515	}
516
517	my $found_something = 0; my $top = 1;
518	while ($$textref =~ s/^(.?)$opencom(.?)$closecom//s) {
519	my $text = $1;
520	my $comment = $2;
521	if (defined $text) {
522	# text before a comment - note that getting to here
523	# doesn't necessarily mean there are Section tags in
524	# the document
525	$self->process_section(\$text, $upgraded_base_dir, $upgraded_file, $doc_obj, $cursection);
526	}
527	while ($comment =~ s/$lt(.*?)$gt//s) {
528	my $tag = $1;
529	if ($tag eq "Section") {
530	$found_something = 1;
531	$cursection = $doc_obj->insert_section($doc_obj->get_end_child($cursection)) unless $top;
532	$top = 0;
533	} elsif ($tag eq "/Section") {
534	$found_something = 1;
535	$cursection = $doc_obj->get_parent_section ($cursection);
536	} elsif ($tag =~ m/^Metadata name=$quot(.*?)$quot/s) {
537	my $metaname = $1;
538	my $accumulate = $tag =~ m/mode=${quot}accumulate${quot}/ ? 1 : 0;
539	$comment =~ s/^(.*?)$lt\/Metadata$gt//s;
540	my $metavalue = $1;
541	$metavalue =~ s/^\s+//;
542	$metavalue =~ s/\s+$//;
543	# assume that no metadata value intentionally includes
544	# carriage returns or HTML tags (if they're there they
545	# were probably introduced when converting to HTML from
546	# some other format).
547	# actually some people want to have html tags in their
548	# metadata.
549	$metavalue =~ s/[\cJ\cM]/ /sg;
550	$metavalue =~ s/<[^>]+>//sg
551	unless $dont_strip && ($dont_strip eq 'all' \|\| $metaname =~ m/^($dont_strip)$/);
552	$metavalue =~ s/\s+/ /sg;
553	if ($metaname =~ /\./) { # has a namespace
554	$metaname = "ex.$metaname";
555	}
556	if ($accumulate) {
557	$doc_obj->add_utf8_metadata($cursection, $metaname, $metavalue);
558	} else {
559	$doc_obj->set_utf8_metadata_element($cursection, $metaname, $metavalue);
560	}
561	} elsif ($tag eq "Description" \|\| $tag eq "/Description") {
562	# do nothing with containing Description tags
563	} else {
564	# simple HTML tag (probably created by the conversion
565	# to HTML from some other format) - we'll ignore it and
566	# hope for the best ;-)
567	}
568	}
569	}
570	if ($cursection ne "") {
571	print $outhandle "HTMLPlugin: WARNING: $upgraded_file contains unmatched <Section></Section> tags\n";
572	}
573
574	$$textref =~ s/^.?<body[^>]>//is;
575	$$textref =~ s/(<\/body[^>]>\|<\/html[^>]>)//isg;
576	if ($$textref =~ m/\S/) {
577	if (!$found_something) {
578	if ($self->{'verbosity'} > 2) {
579	print $outhandle "HTMLPlugin: WARNING: $upgraded_file appears to contain no Section tags so\n";
580	print $outhandle " will be processed as a single section document\n";
581	}
582
583	# go ahead and process single-section document
584	$self->process_section($textref, $upgraded_base_dir, $upgraded_file, $doc_obj, $cursection);
585
586	# if document contains no Section tags we'll go ahead
587	# and extract metadata (this won't have been done
588	# above as the -description_tags option prevents it)
589	my $complete_text = $head_keep.$doc_obj->get_text($cursection);
590	$self->extract_metadata (\$complete_text, $metadata, $doc_obj, $cursection)
591	unless $self->{'no_metadata'};
592
593	} else {
594	print $outhandle "HTMLPlugin: WARNING: $upgraded_file contains the following text outside\n";
595	print $outhandle " of the final closing </Section> tag. This text will\n";
596	print $outhandle " be ignored.";
597
598	my ($text);
599	if (length($$textref) > 30) {
600	$text = substr($$textref, 0, 30) . "...";
601	} else {
602	$text = $$textref;
603	}
604	$text =~ s/\n/ /isg;
605	print $outhandle " ($text)\n";
606	}
607	} elsif (!$found_something) {
608
609	if ($self->{'verbosity'} > 2) {
610	# may get to here if document contained no valid Section
611	# tags but did contain some comments. The text will have
612	# been processed already but we should print the warning
613	# as above and extract metadata
614	print $outhandle "HTMLPlugin: WARNING: $upgraded_file appears to contain no Section tags and\n";
615	print $outhandle " is blank or empty. Metadata will be assigned if present.\n";
616	}
617
618	my $complete_text = $head_keep.$doc_obj->get_text($cursection);
619	$self->extract_metadata (\$complete_text, $metadata, $doc_obj, $cursection)
620	unless $self->{'no_metadata'};
621	}
622
623	} else {
624
625	# remove header and footer
626	if (!$self->{'keep_head'} \|\| $self->{'description_tags'}) {
627	$$textref =~ s/^.?<body[^>]>//is;
628	$$textref =~ s/(<\/body[^>]>\|<\/html[^>]>)//isg;
629	}
630
631	$self->{'css_assoc_files'} = {};
632
633	# single section document
634	$self->process_section($textref, $upgraded_base_dir, $upgraded_file, $doc_obj, $cursection);
635
636	#my $upgraded_filename_dirname = &File::Basename::dirname($upgraded_filename);
637
638	$self->acquire_css_associated_files($doc_obj, $cursection);
639
640	$self->{'css_assoc_files'} = {};
641	}
642
643	return 1;
644	}
645
646
647	sub process_heading
648	{
649	my ($self, $nHeadNo, $strHeadingText, $arrSections, $file) = @_;
650	$strHeadingText = '' if (!defined($strHeadingText));
651
652	my $strMetadata = $self->update_section_data($arrSections, int($nHeadNo));
653
654	my $strSecMetadata = '';
655	while ($strHeadingText =~ s/<!--gsdl-metadata(.*?)-->//is)
656	{
657	$strSecMetadata .= $1;
658	}
659
660	$strHeadingText =~ s/^\s+//g;
661	$strHeadingText =~ s/\s+$//g;
662	$strSecMetadata =~ s/^\s+//g;
663	$strSecMetadata =~ s/\s+$//g;
664
665	$strMetadata .= "\n<Section>\n\t<Description>\n\t\t<Metadata name=\"Title\">" . $strHeadingText . "</Metadata>\n";
666
667	if (length($strSecMetadata)) {
668	$strMetadata .= "\t\t" . $strSecMetadata . "\n";
669	}
670
671	$strMetadata .= "\t</Description>\n";
672
673	return "<!--" . $strMetadata . "-->";
674	}
675
676
677	sub update_section_data
678	{
679	my ($self, $arrSections, $nCurTocNo) = @_;
680	my ($strBuffer, $nLast, $nSections) = ('', 0, scalar(@$arrSections));
681
682	if ($nSections == 0) {
683	push @$arrSections, $nCurTocNo;
684	return $strBuffer;
685	}
686	$nLast = $arrSections->[$nSections - 1];
687	if ($nCurTocNo > $nLast) {
688	push @$arrSections, $nCurTocNo;
689	return $strBuffer;
690	}
691	for(my $i = $nSections - 1; $i >= 0; $i--) {
692	if ($nCurTocNo <= $arrSections->[$i]) {
693	$strBuffer .= "\n</Section>";
694	pop @$arrSections;
695	}
696	}
697	push @$arrSections, $nCurTocNo;
698	return $strBuffer;
699	}
700
701
702	# note that process_section may be called multiple times for a single
703	# section (relying on the fact that add_utf8_text appends the text to any
704	# that may exist already).
705	sub process_section {
706	my $self = shift (@_);
707	my ($textref, $base_dir, $file, $doc_obj, $cursection) = @_;
708
709	my @styleTagsText = ($$textref =~ m/<style[^>]>([^<])<\/style>/sg);
710	if(scalar(@styleTagsText) > 0)
711	{
712	my $css_filename_dirname = &File::Basename::dirname(&FileUtils::filenameConcatenate($base_dir, $file));
713	foreach my $styleText (@styleTagsText)
714	{
715	$self->acquire_css_associated_files_from_text_block($styleText, $css_filename_dirname);
716	}
717	}
718
719	# trap links
720	if (!$self->{'nolinks'}) {
721	# usemap="./#index" not handled correctly => change to "#index"
722	## $$textref =~ s/(<img[^>]?usemap\s=\s[\"\']?)([^\"\'>\s]+)([\"\']?[^>]>)/
723
724	## my $opencom = '(?:<!--\|<!(?:—\|\|--))';
725	## my $closecom = '(?:-->\|(?:—\|\|--)>)';
726
727	$$textref =~ s/(<img[^>]?usemap\s=\s)((?:[\"][^\"]+[\"])\|(?:[\'][^\']+[\'])\|(?:[^\s\/>]+))([^>]>)/
728	$self->replace_usemap_links($1, $2, $3)/isge;
729
730	$$textref =~ s/(<(?:a\|area\|frame\|link\|script)\s+[^>]?\s(?:href\|src)\s=\s)((?:[\"][^\"]+[\"])\|(?:[\'][^\']+[\'])\|(?:[^\s\/>]+))([^>]*>)/
731	$self->replace_href_links ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
732
733	## $$textref =~ s/($opencom.?)?+(<(?:a\|area\|frame\|link\|script)\s+[^>]?\s(?:href\|src)\s=\s)((?:[\"][^\"]+[\"])\|(?:[\'][^\']+[\'])\|(?:[^\s\/>]+))([^>]>)(.*?$closecom)?+/
734	# $self->replace_href_links ($1, $2, $3, $4, $5, $base_dir, $file, $doc_obj, $cursection)/isge;
735	}
736
737	# trap images
738
739	# Previously, by default, HTMLPlugin would embed <img> tags inside anchor tags
740	# i.e. <a href="image><img src="image"></a> in order to overcome a problem that
741	# turned regular text succeeding images into links. That is, by embedding <imgs>
742	# inside <a href=""></a>, the text following images were no longer misbehaving.
743	# However, there would be many occasions whereby images were not meant to link
744	# to their source images but where the images would link to another web page.
745	# To allow this, the no_image_links option was introduced: it would prevent
746	# the behaviour of embedding images into links that referenced the source images.
747
748	# Somewhere along the line, the problem of normal text turning into links when
749	# such text followed images which were not embedded in <a href=""></a> ceased
750	# to occur. This is why the following lines have been commented out (as well as
751	# two lines in replace_images). They appear to no longer apply.
752
753	# If at any time, there is a need for having images embedded in <a> anchor tags,
754	# then it might be better to turn that into an HTMLPlugin option rather than make
755	# it the default behaviour. Also, eventually, no_image_links needs to become
756	# a deprecated option for HTMLPlugin as it has now become the default behaviour.
757
758	#if(!$self->{'no_image_links'}){
759	$$textref =~ s/(<(?:img\|embed\|table\|tr\|td)[^>]?(?:src\|background)\s=\s)((?:[\"][^\"]+[\"])\|(?:[\'][^\']+[\'])\|(?:[^\s\/>]+))([^>]>)/
760	$self->replace_images ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
761	#}
762
763	# add text to document object
764	# turn \ into \\ so that the rest of greenstone doesn't think there
765	# is an escape code following. (Macro parsing loses them...)
766	$$textref =~ s/\\/\\\\/go;
767
768	$doc_obj->add_utf8_text($cursection, $$textref);
769	}
770
771	sub replace_images {
772	my $self = shift (@_);
773	my ($front, $link, $back, $base_dir,
774	$file, $doc_obj, $section) = @_;
775
776	# remove quotes from link at start and end if necessary
777	if ($link=~/^[\"\']/) {
778	$link=~s/^[\"\']//;
779	$link=~s/[\"\']$//;
780	$front.='"';
781	$back="\"$back";
782	}
783
784	$link =~ s/\n/ /g;
785
786	# Hack to overcome Windows wv 0.7.1 bug that causes embedded images to be broken
787	# If the Word file path has spaces in it, wv messes up and you end up with
788	# absolute paths for the images, and without the "file://" prefix
789	# So check for this special case and massage the data to be correct
790	if ($ENV{'GSDLOS'} =~ m/^windows/i && $self->{'plugin_type'} eq "WordPlug" && $link =~ m/^[A-Za-z]\:\\/) {
791	$link =~ s/^.*\\([^\\]+)$/$1/;
792	}
793
794	my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
795
796	my $img_file = $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section);
797
798	# print STDERR "** link = $link\n href = $href\n** img_file = $img_file, rl = $rl\n\n";
799
800	my $anchor_name = $img_file;
801	#$anchor_name =~ s/^.*\///;
802	#$anchor_name = "<a name=\"$anchor_name\" ></a>";
803
804	my $image_link = $front . $img_file .$back;
805	return $image_link;
806
807	# The reasons for why the following two lines are no longer necessary can be
808	# found in subroutine process_section
809	#my $anchor_link = "<a href=\"$img_file\" >".$image_link."</a>";
810	#return $anchor_link;
811
812	#return $front . $img_file . $back . $anchor_name;
813	}
814
815	sub replace_href_links {
816	my $self = shift (@_);
817	my ($front, $link, $back, $base_dir, $file, $doc_obj, $section) = @_;
818
819	if($front =~ m/^<link / && $link =~ m/\.css"$/)
820	{
821	my $actual_link = $link;
822	$actual_link =~ s/^"(.*)"$/$1/;
823
824	my $directory = &File::Basename::dirname($file);
825
826	my $css_filename = &FileUtils::filenameConcatenate($base_dir, $directory, $actual_link);
827	$self->retrieve_css_associated_files($css_filename);
828	}
829
830	# remove quotes from link at start and end if necessary
831	if ($link=~/^[\"\']/) {
832	$link=~s/^[\"\']//;
833	$link=~s/[\"\']$//;
834	$front.='"';
835	$back="\"$back";
836	}
837
838	# attempt to sort out targets - frames are not handled
839	# well in this plugin and some cases will screw things
840	# up - e.g. the _parent target (so we'll just remove
841	# them all ;-)
842	$front =~ s/(target=\"?)_top(\"?)/$1_gsdltop_$2/is;
843	$back =~ s/(target=\"?)_top(\"?)/$1_gsdltop_$2/is;
844	$front =~ s/target=\"?_parent\"?//is;
845	$back =~ s/target=\"?_parent\"?//is;
846
847	if($link =~ m/^\#/s)
848	{
849	return $front . "_httpsamepagelink_" . $link . $back;
850	}
851
852	$link =~ s/\n/ /g;
853
854	# Find file referred to by $link on file system
855	# This is more complicated than it sounds when char encodings
856	# is taken in to account
857	my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
858
859	# href may use '\'s where '/'s should be on Windows
860	$href =~ s/\\/\//g;
861	my ($filename) = $href =~ m/^(?:.?):(?:\/\/)?(.)/;
862
863	##### leave all these links alone (they won't be picked up by intermediate
864	##### pages). I think that's safest when dealing with frames, targets etc.
865	##### (at least until I think of a better way to do it). Problems occur with
866	##### mailto links from within small frames, the intermediate page is displayed
867	##### within that frame and can't be seen. There is still potential for this to
868	##### happen even with html pages - the solution seems to be to somehow tell
869	##### the browser from the server side to display the page being sent (i.e.
870	##### the intermediate page) in the top level window - I'm not sure if that's
871	##### possible - the following line should probably be deleted if that can be done
872	return $front . $link . $back if $href =~ m/^(mailto\|news\|gopher\|nntp\|telnet\|javascript):/is;
873
874	if (($rl == 0) \|\| ($filename =~ m/$self->{'process_exp'}/) \|\|
875	($href =~ m/\/$/) \|\| ($href =~ m/^(mailto\|news\|gopher\|nntp\|telnet\|javascript):/i)) {
876
877	if ($ENV{'GSDLOS'} =~ m/^windows$/) {
878
879	# Don't do any encoding for now, as not clear what
880	# the right thing to do is to support filename
881	# encoding on Windows when they are not UTF16
882	#
883	}
884	else {
885	# => Unix-based system
886
887	# If web page didn't give encoding, then default to utf8
888	my $content_encoding= $self->{'content_encoding'} \|\| "utf8";
889
890	if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
891	print STDERR "**** Encoding with '$content_encoding', href: $href\n";
892	}
893
894	# on Darwin, the unicode filenames are stored on the file
895	# system in decomposed form, so any href link (including when
896	# URL-encoded) should refer to the decomposed name of the file
897	if ($ENV{'GSDLOS'} =~ /^darwin$/i) {
898	$href = normalize('D', $href); # Normalization Form D (decomposition)
899	}
900
901	$href = encode($content_encoding,$href);
902	}
903
904	$href = &unicode::raw_filename_to_utf8_url_encoded($href);
905	$href = &unicode::filename_to_url($href);
906
907	&ghtml::urlsafe ($href);
908
909	if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
910	print STDERR "******DEBUG: href=$href\n";
911	}
912
913
914	return $front . "_httpextlink_&rl=" . $rl . "&href=" . $href . $hash_part . $back;
915	} else {
916	# link is to some other type of file (e.g., an image) so we'll
917	# need to associate that file
918	return $front . $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section) . $back;
919	}
920	}
921
922	sub retrieve_css_associated_files {
923	my $self = shift (@_);
924	my ($css_filename) = @_;
925
926	my $css_filename_dirname = &File::Basename::dirname($css_filename);
927
928	open (CSSFILE, $css_filename) \|\| return;
929	sysread (CSSFILE, my $file_string, -s CSSFILE);
930
931	$self->acquire_css_associated_files_from_text_block($file_string, $css_filename_dirname) unless !defined $file_string;
932
933	close CSSFILE;
934	}
935
936	sub acquire_css_associated_files_from_text_block {
937	my $self = shift (@_);
938	my ($text, $css_filename_dirname) = @_;
939
940	my @image_urls = ($text =~ m/background-image:\surl[^;];/sg);
941	foreach my $img_url (@image_urls)
942	{
943	$img_url =~ s/^.url.\((.)\).$/$1/;
944	$img_url =~ s/^\s"?([^"])"?\s*$/$1/;
945
946	$self->{'css_assoc_files'}->{&FileUtils::filenameConcatenate($css_filename_dirname, $img_url)} = $img_url;
947	}
948	}
949
950	sub acquire_css_associated_files {
951	my $self = shift(@_);
952
953	my ($doc_obj, $section) = @_;
954
955	foreach my $image_filename (keys %{$self->{'css_assoc_files'}})
956	{
957	$doc_obj->associate_file($image_filename, $self->{'css_assoc_files'}->{$image_filename}, undef, $section);
958	}
959	}
960
961	sub add_file {
962	my $self = shift (@_);
963	my ($href, $rl, $hash_part, $base_dir, $doc_obj, $section) = @_;
964	my ($newname);
965
966	my $filename = $href;
967	if ($base_dir eq "") {
968	if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
969	# remove http://
970	$filename =~ s/^[^:]*:\/\///;
971	}
972	else {
973	# remove http:/ thereby leaving one slash at the start as
974	# part of full pathname
975	$filename =~ s/^[^:]*:\///;
976	}
977	}
978	else {
979	# remove http://
980	$filename =~ s/^[^:]*:\/\///;
981	}
982
983	if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
984	$filename =~ s@\/@\\@g;
985	}
986
987	$filename = &FileUtils::filenameConcatenate($base_dir, $filename);
988
989	if (($self->{'use_realistic_book'}) \|\| ($self->{'old_style_HDL'})) {
990	# we are processing a tidytmp file - want paths to be in import
991	$filename =~ s/([\\\/])tidytmp([\\\/])/$1import$2/;
992	}
993
994	# Replace %XX's in URL with decoded value if required. Note that the
995	# filename may include the %XX in some situations. If the original
996	# file's name was in URL encoding, the following method will not decode
997	# it.
998	my $unicode_filename = $filename;
999	my $opt_decode_unicode_filename = $self->opt_url_decode($unicode_filename);
1000
1001	# wvWare can generate <img src="StrangeNoGraphicData"> tags, but with no
1002	# (it seems) accompanying file
1003	if ($opt_decode_unicode_filename =~ m/StrangeNoGraphicData$/) { return ""; }
1004
1005	my $content_encoding= $self->{'content_encoding'} \|\| "utf8";
1006
1007	if ($ENV{'GSDLOS'} =~ /^(linux\|solaris)$/i) {
1008	# The filenames that come through the HTML file have been decoded
1009	# into Unicode aware Perl strings. Need to convert them back
1010	# to their initial raw-byte encoding to match the file that
1011	# exists on the file system
1012	$filename = encode($content_encoding, $opt_decode_unicode_filename);
1013	}
1014	elsif ($ENV{'GSDLOS'} =~ /^darwin$/i) {
1015	# HFS+ is UTF8 with decompostion
1016	$filename = encode($content_encoding, $opt_decode_unicode_filename);
1017	$filename = normalize('D', $filename); # Normalization Form D (decomposition)
1018
1019	}
1020	elsif ($ENV{'GSDLOS'} =~ /^windows$/i) {
1021	my $long_filename = Win32::GetLongPathName($opt_decode_unicode_filename);
1022
1023	if (defined $long_filename) {
1024	my $short_filename = Win32::GetLongPathName($long_filename);
1025	$filename = $short_filename;
1026	}
1027	# else {
1028	# print STDERR "***** failed to map href to real file:\n";
1029	# print STDERR "****** $href -> $opt_decode_unicode_filename\n";
1030	# }
1031	}
1032	else {
1033	my $outhandle = $self->{'outhandle'};
1034	print $outhandle "Warning: Unrecognized operating system ", $ENV{'GSDLOS'}, "\n";
1035	print $outhandle " in file system encoding of href: $href\n";
1036	print $outhandle " No character encoding done.\n";
1037	}
1038
1039
1040	# some special processing if the intended filename was converted to utf8, but
1041	# the actual file still needs to be renamed
1042	#if (!&util::fd_exists($filename)) {
1043	if (!&FileUtils::fileExists($filename)) {
1044	# try the original filename stored in map
1045	if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
1046	print STDERR "******!! orig filename did not exist: $filename\n";
1047	}
1048
1049	## print STDERR "**** trying to look up unicode_filename: $unicode_filename\n";
1050
1051	my $original_filename = $self->{'unicode_to_original_filename'}->{$unicode_filename};
1052
1053	if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
1054	print STDERR "****** From lookup unicode_filename, now trying for: $original_filename\n";
1055	}
1056
1057	if (defined $original_filename && -e $original_filename) {
1058	if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
1059	print STDERR "****** Found match!\n";
1060	}
1061	$filename = $original_filename;
1062	}
1063	}
1064
1065	my ($ext) = $filename =~ m/(\.[^\.]*)$/;
1066
1067	if ($rl == 0) {
1068	if ((!defined $ext) \|\| ($ext !~ m/$self->{'assoc_files'}/)) {
1069	return "_httpextlink_&rl=0&el=prompt&href=" . $href . $hash_part;
1070	}
1071	else {
1072	return "_httpextlink_&rl=0&el=direct&href=" . $href . $hash_part;
1073	}
1074	}
1075
1076	if ((!defined $ext) \|\| ($ext !~ m/$self->{'assoc_files'}/)) {
1077	return "_httpextlink_&rl=" . $rl . "&href=" . $href . $hash_part;
1078	}
1079	# add the original image file as a source file
1080	if (!$self->{'processing_tmp_files'} ) {
1081	$doc_obj->associate_source_file($filename);
1082	}
1083	if ($self->{'rename_assoc_files'}) {
1084	if (defined $self->{'aux_files'}->{$href}) {
1085	$newname = $self->{'aux_files'}->{$href}->{'dir_num'} . "/" .
1086	$self->{'aux_files'}->{$href}->{'file_num'} . $ext;
1087	} else {
1088	$newname = $self->{'dir_num'} . "/" . $self->{'file_num'} . $ext;
1089	$self->{'aux_files'}->{$href} = {'dir_num' => $self->{'dir_num'}, 'file_num' => $self->{'file_num'}};
1090	$self->inc_filecount ();
1091	}
1092	$doc_obj->associate_file($filename, $newname, undef, $section);
1093	return "_httpdocimg_/$newname";
1094	} else {
1095	if(&unicode::is_url_encoded($unicode_filename)) {
1096	# use the possibly-decoded filename instead to avoid double URL encoding
1097	($newname) = $filename =~ m/([^\/\\]*)$/;
1098	} else {
1099	($newname) = $unicode_filename =~ m/([^\/\\]*)$/;
1100	}
1101
1102	# Make sure this name uses only ASCII characters.
1103	# We use either base64 or URL encoding, as these preserve original encoding
1104	$newname = &util::rename_file($newname, $self->{'file_rename_method'});
1105
1106	### print STDERR "***** associating $filename (raw-byte/utf8)-> $newname\n";
1107	$doc_obj->associate_file($filename, $newname, undef, $section);
1108
1109	# Since the generated image will be URL-encoded to avoid file-system/browser mess-ups
1110	# of filenames, URL-encode the additional percent signs of the URL-encoded filename
1111	my $newname_url = $newname;
1112	$newname_url = &unicode::filename_to_url($newname_url);
1113	return "_httpdocimg_/$newname_url";
1114	}
1115	}
1116
1117
1118	sub format_link {
1119	my $self = shift (@_);
1120	my ($link, $base_dir, $file) = @_;
1121
1122	# strip off hash part, e.g. #foo, but watch out for any entities, e.g. α
1123	my ($before_hash, $hash_part) = $link =~ m/^(.?[^&])(\#.)?$/;
1124
1125	$hash_part = "" if !defined $hash_part;
1126	if (!defined $before_hash \|\| $before_hash !~ m/[\w\.\/]/) {
1127	my $outhandle = $self->{'outhandle'};
1128	print $outhandle "HTMLPlugin: ERROR - badly formatted tag ignored ($link)\n"
1129	if $self->{'verbosity'};
1130	return ($link, "", 0);
1131	}
1132
1133	if ($before_hash =~ s@^((?:http\|https\|ftp\|file\|mms)://)@@i) {
1134	my $type = $1;
1135	my $before_hash_file = $before_hash;
1136
1137	if ($link =~ m/^(http\|ftp):/i) {
1138
1139	# Turn url (using /) into file name (possibly using \ on windows)
1140	my @http_dir_split = split('/', $before_hash_file);
1141	$before_hash_file = &FileUtils::filenameConcatenate(@http_dir_split);
1142	}
1143
1144	# want to maintain two version of "before_hash": one representing the URL, the other using filesystem specific directory separator
1145	$before_hash_file = $self->eval_dir_dots($before_hash_file);
1146	my $before_hash_url = $before_hash_file;
1147	if ($ENV{'GSDLOS'} =~ /^windows$/i) {
1148	$before_hash_url =~ s@\\@\/@g;
1149	}
1150
1151	my $linkfilename = &FileUtils::filenameConcatenate($base_dir, $before_hash_file);
1152
1153	my $rl = 0;
1154	$rl = 1 if (-e $linkfilename);
1155
1156	# make sure there's a slash on the end if it's a directory
1157	if ($before_hash_url !~ m/\/$/) {
1158	$before_hash_url .= "/" if (-d $linkfilename);
1159	}
1160	return ($type . $before_hash_url, $hash_part, $rl);
1161
1162	} elsif ($link !~ m/^(mailto\|news\|gopher\|nntp\|telnet\|javascript):/i && $link !~ m/^\//) {
1163
1164	if ($before_hash =~ s@^/@@ \|\| $before_hash =~ m/\\/) {
1165
1166	# the first directory will be the domain name if file_is_url
1167	# to generate archives, otherwise we'll assume all files are
1168	# from the same site and base_dir is the root
1169
1170	if ($self->{'file_is_url'}) {
1171	my @dirs = split /[\/\\]/, $file;
1172	my $domname = shift (@dirs);
1173	$before_hash = &FileUtils::filenameConcatenate($domname, $before_hash);
1174	$before_hash =~ s@\\@/@g; # for windows
1175	}
1176	else
1177	{
1178	# see if link shares directory with source document
1179	# => turn into relative link if this is so!
1180
1181	if ($ENV{'GSDLOS'} =~ m/^windows/i) {
1182	# too difficult doing a pattern match with embedded '\'s...
1183	my $win_before_hash=$before_hash;
1184	$win_before_hash =~ s@(\\)+@/@g;
1185	# $base_dir is already similarly "converted" on windows.
1186	if ($win_before_hash =~ s@^$base_dir/@@o) {
1187	# if this is true, we removed a prefix
1188	$before_hash=$win_before_hash;
1189	}
1190	}
1191	else {
1192	# before_hash has lost leading slash by this point,
1193	# -> add back in prior to substitution with $base_dir
1194	$before_hash = "/$before_hash";
1195
1196	$before_hash = &FileUtils::filenameConcatenate("",$before_hash);
1197	$before_hash =~ s@^$base_dir/@@;
1198	}
1199	}
1200	} else {
1201	# Turn relative file path into full path
1202	my $dirname = &File::Basename::dirname($file);
1203	$before_hash = &FileUtils::filenameConcatenate($dirname, $before_hash);
1204	$before_hash = $self->eval_dir_dots($before_hash);
1205	$before_hash =~ s@\\@/@g; # for windows
1206	}
1207
1208	my $linkfilename = &FileUtils::filenameConcatenate($base_dir, $before_hash);
1209
1210
1211	# print STDERR "**** linkfilename = $linkfilename\n";
1212	# if (!&util::fd_exists($linkfilename)) {
1213	# print STDERR "***** Warning: Could not find $linkfilename\n";
1214	# }
1215
1216
1217	# make sure there's a slash on the end if it's a directory
1218	if ($before_hash !~ m/\/$/) {
1219	$before_hash .= "/" if (-d $linkfilename);
1220	}
1221
1222	# print STDERR "*** returning: $before_hash\n";
1223
1224	return ("http://" . $before_hash, $hash_part, 1);
1225	} else {
1226	# mailto, news, nntp, telnet, javascript or gopher link
1227	return ($before_hash, "", 0);
1228	}
1229	}
1230
1231	sub extract_first_NNNN_characters {
1232	my $self = shift (@_);
1233	my ($textref, $doc_obj, $thissection) = @_;
1234
1235	foreach my $size (split /,/, $self->{'first'}) {
1236	my $tmptext = $$textref;
1237	# skip to the body
1238	$tmptext =~ s/.<body[^>]>//i;
1239	# remove javascript
1240	$tmptext =~ s@<script.*?</script>@ @sig;
1241	$tmptext =~ s/<[^>]*>/ /g;
1242	$tmptext =~ s/ / /g;
1243	$tmptext =~ s/^\s+//;
1244	$tmptext =~ s/\s+$//;
1245	$tmptext =~ s/\s+/ /gs;
1246	$tmptext = &unicode::substr ($tmptext, 0, $size);
1247	$tmptext =~ s/\s\S*$/…/; # adds an ellipse (...)
1248	$doc_obj->add_utf8_metadata ($thissection, "First$size", $tmptext);
1249	}
1250	}
1251
1252
1253	sub extract_metadata {
1254	my $self = shift (@_);
1255	my ($textref, $metadata, $doc_obj, $section) = @_;
1256	my $outhandle = $self->{'outhandle'};
1257	# if we don't want metadata, we may as well not be here ...
1258	return if (!defined $self->{'metadata_fields'});
1259
1260	my $separator = $self->{'metadata_field_separator'};
1261	if ($separator eq "") {
1262	undef $separator;
1263	}
1264
1265	# metadata fields to extract/save. 'key' is the (lowercase) name of the
1266	# html meta, 'value' is the metadata name for greenstone to use
1267	my %find_fields = ();
1268
1269	my %creator_fields = (); # short-cut for lookups
1270
1271
1272	foreach my $field (split /,/, $self->{'metadata_fields'}) {
1273	$field =~ s/^\s+//; # remove leading whitespace
1274	$field =~ s/\s+$//; # remove trailing whitespace
1275
1276	# support tag<tagname>
1277	if ($field =~ m/^(.?)\s<(.*?)>$/) {
1278	# "$2" is the user's preferred gs metadata name
1279	$find_fields{lc($1)}=$2; # lc = lowercase
1280	} else { # no <tagname> for mapping
1281	# "$field" is the user's preferred gs metadata name
1282	$find_fields{lc($field)}=$field; # lc = lowercase
1283	}
1284	}
1285
1286	if (defined $self->{'hunt_creator_metadata'} &&
1287	$self->{'hunt_creator_metadata'} == 1 ) {
1288	my @extra_fields =
1289	(
1290	'author',
1291	'author.email',
1292	'creator',
1293	'dc.creator',
1294	'dc.creator.corporatename',
1295	);
1296
1297	# add the creator_metadata fields to search for
1298	foreach my $field (@extra_fields) {
1299	$creator_fields{$field}=0; # add to lookup hash
1300	}
1301	}
1302
1303
1304	# find the header in the html file, which has the meta tags
1305	$$textref =~ m@<head>(.*?)</head>@si;
1306
1307	my $html_header=$1;
1308
1309	# go through every <meta... tag defined in the html and see if it is
1310	# one of the tags we want to match.
1311
1312	# special case for title - we want to remember if its been found
1313	my $found_title = 0;
1314	# this assumes that ">" won't appear. (I don't think it's allowed to...)
1315	$html_header =~ m/^/; # match the start of the string, for \G assertion
1316
1317	while ($html_header =~ m/\G.?<meta(.?)>/sig) {
1318	my $metatag=$1;
1319	my ($tag, $value);
1320
1321	# find the tag name
1322	$metatag =~ m/(?:name\|http-equiv)\s=\s([\"\'])?(.*?)\1/is;
1323	$tag=$2;
1324	# in case they're not using " or ', but they should...
1325	if (! $tag) {
1326	$metatag =~ m/(?:name\|http-equiv)\s=\s([^\s\>]+)/is;
1327	$tag=$1;
1328	}
1329
1330	if (!defined $tag) {
1331	print $outhandle "HTMLPlugin: can't find NAME in \"$metatag\"\n";
1332	next;
1333	}
1334
1335	# don't need to assign this field if it was passed in from a previous
1336	# (recursive) plugin
1337	if (defined $metadata->{$tag}) {next}
1338
1339	# find the tag content
1340	$metatag =~ m/content\s=\s([\"\'])?(.*?)\1/is;
1341	$value=$2;
1342
1343	# The following code assigns the metaname to value if value is
1344	# empty. Why would we do this?
1345	#if (! $value) {
1346	# $metatag =~ m/(?:name\|http-equiv)\s=\s([^\s\>]+)/is;
1347	# $value=$1;
1348	#}
1349	if (!defined $value \|\| $value eq "") {
1350	print $outhandle "HTMLPlugin: can't find VALUE in <meta $metatag >\n" if ($self->{'verbosity'} > 2);
1351	next;
1352	}
1353
1354	# clean up and add
1355	$value =~ s/\s+/ /gs;
1356	chomp($value); # remove trailing \n, if any
1357	if (exists $creator_fields{lc($tag)}) {
1358	# map this value onto greenstone's "Creator" metadata
1359	$tag='Creator';
1360	} elsif (!exists $find_fields{lc($tag)}) {
1361	next; # don't want this tag
1362	} else {
1363	# get the user's preferred capitalisation
1364	$tag = $find_fields{lc($tag)};
1365	}
1366	if (lc($tag) eq "title") {
1367	$found_title = 1;
1368	}
1369
1370	if ($self->{'verbosity'} > 2) {
1371	print $outhandle " extracted \"$tag\" metadata \"$value\"\n";
1372	}
1373
1374	if ($tag =~ /\./) {
1375	# there is a . so has a namespace, add ex.
1376	$tag = "ex.$tag";
1377	}
1378	if (defined $separator) {
1379	my @values = split($separator, $value);
1380	foreach my $v (@values) {
1381	$doc_obj->add_utf8_metadata($section, $tag, $v) if $v =~ /\S/;
1382	}
1383	}
1384	else {
1385	$doc_obj->add_utf8_metadata($section, $tag, $value);
1386	}
1387	}
1388
1389	# TITLE: extract the document title
1390	if (exists $find_fields{'title'} && !$found_title) {
1391	# we want a title, and didn't find one in the meta tags
1392	# see if there's a <title> tag
1393	my $title;
1394	my $from = ""; # for debugging output only
1395	if ($html_header =~ m/<title[^>]>([^<]+)<\/title[^>]>/is) {
1396	$title = $1;
1397	$from = "<title> tags";
1398	}
1399
1400	if (!defined $title) {
1401	$from = "first 100 chars";
1402	# if no title use first 100 or so characters
1403	$title = $$textref;
1404	$title =~ s/^\xFE\xFF//; # Remove unicode byte order mark
1405	$title =~ s/^.*?<body>//si;
1406	# ignore javascript!
1407	$title =~ s@<script.*?</script>@ @sig;
1408	$title =~ s/<\/([^>]+)><\1>//g; # (eg) </b><b> - no space
1409	$title =~ s/<[^>]*>/ /g; # remove all HTML tags
1410	$title = substr ($title, 0, 100);
1411	$title =~ s/\s\S*$/.../;
1412	}
1413	$title =~ s/<[^>]*>/ /g; # remove html tags
1414	$title =~ s/ / /g;
1415	$title =~ s/(?: \|\xc2\xa0)/ /g; # utf-8 for nbsp...
1416	$title =~ s/\s+/ /gs; # collapse multiple spaces
1417	$title =~ s/^\s*//; # remove leading spaces
1418	$title =~ s/\s*$//; # remove trailing spaces
1419
1420	$title =~ s/^$self->{'title_sub'}// if ($self->{'title_sub'});
1421	$title =~ s/^\s+//s; # in case title_sub introduced any...
1422	$doc_obj->add_utf8_metadata ($section, "Title", $title);
1423	print $outhandle " extracted Title metadata \"$title\" from $from\n"
1424	if ($self->{'verbosity'} > 2);
1425	}
1426
1427	# add FileFormat metadata
1428	$doc_obj->add_metadata($section,"FileFormat", "HTML");
1429
1430	# Special, for metadata names such as tagH1 - extracts
1431	# the text between the first <H1> and </H1> tags into "H1" metadata.
1432
1433	foreach my $field (keys %find_fields) {
1434	if ($field !~ m/^tag([a-z0-9]+)$/i) {next}
1435	my $tag = $1;
1436	if ($$textref =~ m@<$tag[^>]>(.?)</$tag[^>]*>@g) {
1437	my $content = $1;
1438	$content =~ s/ / /g;
1439	$content =~ s/<[^>]*>/ /g;
1440	$content =~ s/^\s+//;
1441	$content =~ s/\s+$//;
1442	$content =~ s/\s+/ /gs;
1443	if ($content) {
1444	$tag=$find_fields{"tag$tag"}; # get the user's capitalisation
1445	$tag =~ s/^tag//i;
1446	$doc_obj->add_utf8_metadata ($section, $tag, $content);
1447	print $outhandle " extracted \"$tag\" metadata \"$content\"\n"
1448	if ($self->{'verbosity'} > 2);
1449	}
1450	}
1451	}
1452	}
1453
1454
1455	# evaluate any "../" to next directory up
1456	# evaluate any "./" as here
1457	sub eval_dir_dots {
1458	my $self = shift (@_);
1459	my ($filename) = @_;
1460	my $dirsep_os = &util::get_os_dirsep();
1461	my @dirsep = split(/$dirsep_os/,$filename);
1462
1463	my @eval_dirs = ();
1464	foreach my $d (@dirsep) {
1465	if ($d eq "..") {
1466	pop(@eval_dirs);
1467
1468	} elsif ($d eq ".") {
1469	# do nothing!
1470
1471	} else {
1472	push(@eval_dirs,$d);
1473	}
1474	}
1475
1476	# Need to fiddle with number of elements in @eval_dirs if the
1477	# first one is the empty string. This is because of a
1478	# modification to FileUtils::filenameConcatenate that supresses the addition
1479	# of a leading '/' character (or \ if windows) (intended to help
1480	# filename cat with relative paths) if the first entry in the
1481	# array is the empty string. Making the array start with two
1482	# empty strings is a way to defeat this "smart" option.
1483	#
1484	if (scalar(@eval_dirs) > 0) {
1485	if ($eval_dirs[0] eq ""){
1486	unshift(@eval_dirs,"");
1487	}
1488	}
1489
1490	my $evaluated_filename = (scalar @eval_dirs > 0) ? &FileUtils::filenameConcatenate(@eval_dirs) : "";
1491	return $evaluated_filename;
1492	}
1493
1494	sub replace_usemap_links {
1495	my $self = shift (@_);
1496	my ($front, $link, $back) = @_;
1497
1498	# remove quotes from link at start and end if necessary
1499	if ($link=~/^[\"\']/) {
1500	$link=~s/^[\"\']//;
1501	$link=~s/[\"\']$//;
1502	$front.='"';
1503	$back="\"$back";
1504	}
1505
1506	$link =~ s/^\.\///;
1507	return $front . $link . $back;
1508	}
1509
1510	sub inc_filecount {
1511	my $self = shift (@_);
1512
1513	if ($self->{'file_num'} == 1000) {
1514	$self->{'dir_num'} ++;
1515	$self->{'file_num'} = 0;
1516	} else {
1517	$self->{'file_num'} ++;
1518	}
1519	}
1520
1521
1522	# Extend read_file so that strings like é are
1523	# converted to UTF8 internally.
1524	#
1525	# We don't convert < or > or & or " in case
1526	# they interfere with the GML files
1527
1528	sub read_file {
1529	my $self = shift(@_);
1530	my ($filename, $encoding, $language, $textref) = @_;
1531
1532	$self->SUPER::read_file($filename, $encoding, $language, $textref);
1533
1534	# Convert entities to their Unicode code-point equivalents
1535	$$textref =~ s/&(lt\|gt\|amp\|quot\|nbsp);/&z$1;/go;
1536	$$textref =~ s/&([^;]+);/&ghtml::getcharequiv($1,1,1)/gseo;
1537	$$textref =~ s/&z(lt\|gt\|amp\|quot\|nbsp);/&$1;/go;
1538
1539	}
1540
1541	sub HB_read_html_file {
1542	my $self = shift (@_);
1543	my ($htmlfile, $text) = @_;
1544
1545	# load in the file
1546	if (!open (FILE, $htmlfile)) {
1547	print STDERR "ERROR - could not open $htmlfile\n";
1548	return;
1549	}
1550
1551	my $foundbody = 0;
1552	$self->HB_gettext (\$foundbody, $text, "FILE");
1553	close FILE;
1554
1555	# just in case there was no <body> tag
1556	if (!$foundbody) {
1557	$foundbody = 1;
1558	open (FILE, $htmlfile) \|\| return;
1559	$self->HB_gettext (\$foundbody, $text, "FILE");
1560	close FILE;
1561	}
1562	# text is in utf8
1563	}
1564
1565	# converts the text to utf8, as ghtml does that for é etc.
1566	sub HB_gettext {
1567	my $self = shift (@_);
1568	my ($foundbody, $text, $handle) = @_;
1569
1570	my $line = "";
1571	while (defined ($line = <$handle>)) {
1572	# look for body tag
1573	if (!$$foundbody) {
1574	if ($line =~ s/^.<body[^>]>//i) {
1575	$$foundbody = 1;
1576	} else {
1577	next;
1578	}
1579	}
1580
1581	# check for symbol fonts
1582	if ($line =~ m/<font [^>]?face\s=\s*\"?(\w+)\"?/i) {
1583	my $font = $1;
1584	print STDERR "HBPlug::HB_gettext - warning removed font $font\n"
1585	if ($font !~ m/^arial$/i);
1586	}
1587
1588	$$text .= $line;
1589	}
1590
1591	if ($self->{'input_encoding'} eq "iso_8859_1") {
1592	# convert to utf-8
1593	$$text=&unicode::unicode2utf8(&unicode::convert2unicode("iso_8859_1", $text));
1594	}
1595	# convert any alphanumeric character entities to their utf-8
1596	# equivalent for indexing purposes
1597	#&ghtml::convertcharentities ($$text);
1598
1599	$$text =~ s/\s+/ /g; # remove \n's
1600
1601	# At this point $$text is a binary byte string
1602	# => turn it into a Unicode aware string, so full
1603	# Unicode aware pattern matching can be used.
1604	# For instance: 's/\x{0101}//g' or '[[:upper:]]'
1605	#
1606
1607	$$text = decode("utf8",$$text);
1608	}
1609
1610	sub HB_clean_section {
1611	my $self = shift (@_);
1612	my ($section) = @_;
1613
1614	# remove tags without a starting tag from the section
1615	my ($tag, $tagstart);
1616	while ($section =~ m/<\/([^>]{1,10})>/) {
1617	$tag = $1;
1618	$tagstart = index($section, "<$tag");
1619	last if (($tagstart >= 0) && ($tagstart < index($section, "<\/$tag")));
1620	$section =~ s/<\/$tag>//;
1621	}
1622
1623	# remove extra paragraph tags
1624	while ($section =~ s/<p\b[^>]>\s<p\b/<p/ig) {}
1625
1626	# remove extra stuff at the end of the section
1627	while ($section =~ s/(<u>\|<i>\|<b>\|<p\b[^>]*>\| \|\s)$//i) {}
1628
1629	# add a newline at the beginning of each paragraph
1630	$section =~ s/(.)\s*<p\b/$1\n\n<p/gi;
1631
1632	# add a newline every 80 characters at a word boundary
1633	# Note: this regular expression puts a line feed before
1634	# the last word in each section, even when it is not
1635	# needed.
1636	$section =~ s/(.{1,80})\s/$1\n/g;
1637
1638	# fix up the image links
1639	$section =~ s/<img[^>]?src=\"?([^\">]+)\"?[^>]>/
1640	<center><img src=\"$1\" \/><\/center><br\/>/ig;
1641	$section =~ s/<<I>>\s*([^\.]+\.(png\|jpg\|gif))/
1642	<center><img src=\"$1\" \/><\/center><br\/>/ig;
1643
1644	return $section;
1645	}
1646
1647	# Will convert the oldHDL format to the new HDL format (using the Section tag)
1648	sub convert_to_newHDLformat
1649	{
1650	my $self = shift (@_);
1651	my ($file,$cnfile) = @_;
1652	my $input_filename = $file;
1653	my $tmp_filename = $cnfile;
1654
1655	# write HTML tmp file with new HDL format
1656	open (PROD, ">$tmp_filename") \|\| die("Error Writing to File: $tmp_filename $!");
1657
1658	# read in the file and do basic html cleaning (removing header etc)
1659	my $html = "";
1660	$self->HB_read_html_file ($input_filename, \$html);
1661
1662	# process the file one section at a time
1663	my $curtoclevel = 1;
1664	my $firstsection = 1;
1665	my $toclevel = 0;
1666	while (length ($html) > 0) {
1667	if ($html =~ s/^.?(?:<p\b[^>]>)?((<b>\|<i>\|<u>\|\s))<<TOC(\d+)>>\s(.*?)<p\b/<p/i) {
1668	$toclevel = $3;
1669	my $title = $4;
1670	my $sectiontext = "";
1671	if ($html =~ s/^(.?)((?:<p\b[^>]>)?((<b>\|<i>\|<u>\|\s)*)<<TOC\d+>>)/$2/i) {
1672	$sectiontext = $1;
1673	} else {
1674	$sectiontext = $html;
1675	$html = "";
1676	}
1677
1678	# remove tags and extra spaces from the title
1679	$title =~ s/<\/?[^>]+>//g;
1680	$title =~ s/^\s+\|\s+$//g;
1681
1682	# close any sections below the current level and
1683	# create a new section (special case for the firstsection)
1684	print PROD "<!--\n";
1685	while (($curtoclevel > $toclevel) \|\|
1686	(!$firstsection && $curtoclevel == $toclevel)) {
1687	$curtoclevel--;
1688	print PROD "</Section>\n";
1689	}
1690	if ($curtoclevel+1 < $toclevel) {
1691	print STDERR "WARNING - jump in toc levels in $input_filename " .
1692	"from $curtoclevel to $toclevel\n";
1693	}
1694	while ($curtoclevel < $toclevel) {
1695	$curtoclevel++;
1696	}
1697
1698	if ($curtoclevel == 1) {
1699	# add the header tag
1700	print PROD "-->\n";
1701	print PROD "<HTML>\n<HEAD>\n<TITLE>$title</TITLE>\n</HEAD>\n<BODY>\n";
1702	print PROD "<!--\n";
1703	}
1704
1705	print PROD "<Section>\n\t<Description>\n\t\t<Metadata name=\"Title\">$title</Metadata>\n\t</Description>\n";
1706
1707	print PROD "-->\n";
1708
1709	# clean up the section html
1710	$sectiontext = $self->HB_clean_section($sectiontext);
1711
1712	print PROD "$sectiontext\n";
1713
1714	} else {
1715	print STDERR "WARNING - leftover text\n" , $self->shorten($html),
1716	"\nin $input_filename\n";
1717	last;
1718	}
1719	$firstsection = 0;
1720	}
1721
1722	print PROD "<!--\n";
1723	while ($curtoclevel > 0) {
1724	$curtoclevel--;
1725	print PROD "</Section>\n";
1726	}
1727	print PROD "-->\n";
1728
1729	close (PROD) \|\| die("Error Closing File: $tmp_filename $!");
1730
1731	return $tmp_filename;
1732	}
1733
1734	sub shorten {
1735	my $self = shift (@_);
1736	my ($text) = @_;
1737
1738	return "\"$text\"" if (length($text) < 100);
1739
1740	return "\"" . substr ($text, 0, 50) . "\" ... \"" .
1741	substr ($text, length($text)-50) . "\"";
1742	}
1743
1744	sub convert_tidy_or_oldHDL_file
1745	{
1746	my $self = shift (@_);
1747	my ($file) = @_;
1748	my $input_filename = $file;
1749
1750	if (-d $input_filename)
1751	{
1752	return $input_filename;
1753	}
1754
1755	# get the input filename
1756	my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
1757	my $base_dirname = $dirname;
1758	$suffix = lc($suffix);
1759
1760	# derive tmp filename from input filename
1761	# Remove any white space from filename -- no risk of name collision, and
1762	# makes later conversion by utils simpler. Leave spaces in path...
1763	# tidy up the filename with space, dot, hyphen between
1764	$tailname =~ s/\s+//g;
1765	$tailname =~ s/\.+//g;
1766	$tailname =~ s/\-+//g;
1767	# convert to utf-8 otherwise we have problems with the doc.xml file
1768	# later on
1769	&unicode::ensure_utf8(\$tailname);
1770
1771	# softlink to collection tmp dir
1772	my $tmp_dirname = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, "tidytmp");
1773	&util::mk_dir($tmp_dirname) if (!-e $tmp_dirname);
1774
1775	my $test_dirname = "";
1776	my $f_separator = &util::get_os_dirsep();
1777
1778	if ($dirname =~ m/import$f_separator/)
1779	{
1780	$test_dirname = $'; #'
1781
1782	#print STDERR "init $'\n";
1783
1784	while ($test_dirname =~ m/[$f_separator]/)
1785	{
1786	my $folderdirname = $`;
1787	$tmp_dirname = &FileUtils::filenameConcatenate($tmp_dirname,$folderdirname);
1788	&util::mk_dir($tmp_dirname) if (!-e $tmp_dirname);
1789	$test_dirname = $'; #'
1790	}
1791	}
1792
1793	my $tmp_filename = &FileUtils::filenameConcatenate($tmp_dirname, "$tailname$suffix");
1794
1795	# tidy or convert the input file if it is a HTML-like file or it is accepted by the process_exp
1796	if (($suffix eq ".htm") \|\| ($suffix eq ".html") \|\| ($suffix eq ".shtml"))
1797	{
1798	#convert the input file to a new style HDL
1799	my $hdl_output_filename = $input_filename;
1800	if ($self->{'old_style_HDL'})
1801	{
1802	$hdl_output_filename = &FileUtils::filenameConcatenate($tmp_dirname, "$tailname$suffix");
1803	$hdl_output_filename = $self->convert_to_newHDLformat($input_filename,$hdl_output_filename);
1804	}
1805
1806	#just for checking copy all other file from the base dir to tmp dir if it is not exists
1807	opendir(DIR,$base_dirname) or die "Can't open base directory : $base_dirname!";
1808	my @files = grep {!/^\.+$/} readdir(DIR);
1809	close(DIR);
1810
1811	foreach my $file (@files)
1812	{
1813	my $src_file = &FileUtils::filenameConcatenate($base_dirname,$file);
1814	my $dest_file = &FileUtils::filenameConcatenate($tmp_dirname,$file);
1815	if ((!-e $dest_file) && (!-d $src_file))
1816	{
1817	# just copy the original file back to the tmp directory
1818	copy($src_file,$dest_file) or die "Can't copy file $src_file to $dest_file $!";
1819	}
1820	}
1821
1822	# tidy the input file
1823	my $tidy_output_filename = $hdl_output_filename;
1824	if ($self->{'use_realistic_book'})
1825	{
1826	$tidy_output_filename = &FileUtils::filenameConcatenate($tmp_dirname, "$tailname$suffix");
1827	$tidy_output_filename = $self->tmp_tidy_file($hdl_output_filename,$tidy_output_filename);
1828	}
1829	$tmp_filename = $tidy_output_filename;
1830	}
1831	else
1832	{
1833	if (!-e $tmp_filename)
1834	{
1835	# just copy the original file back to the tmp directory
1836	copy($input_filename,$tmp_filename) or die "Can't copy file $input_filename to $tmp_filename $!";
1837	}
1838	}
1839
1840	return $tmp_filename;
1841	}
1842
1843
1844	# Will make the html input file as a proper XML file with removed font tag and
1845	# image size added to the img tag.
1846	# The tidying process takes place in a collection specific 'tmp' directory so
1847	# that we don't accidentally damage the input.
1848	sub tmp_tidy_file
1849	{
1850	my $self = shift (@_);
1851	my ($file,$cnfile) = @_;
1852	my $input_filename = $file;
1853	my $tmp_filename = $cnfile;
1854
1855	# get the input filename
1856	my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
1857
1858	require HTML::TokeParser::Simple;
1859
1860	# create HTML parser to decode the input file
1861	my $parser = HTML::TokeParser::Simple->new($input_filename);
1862
1863	# write HTML tmp file without the font tag and image size are added to the img tag
1864	open (PROD, ">$tmp_filename") \|\| die("Error Writing to File: $tmp_filename $!");
1865	while (my $token = $parser->get_token())
1866	{
1867	# is it an img tag
1868	if ($token->is_start_tag('img'))
1869	{
1870	# get the attributes
1871	my $attr = $token->return_attr;
1872
1873	# get the full path to the image
1874	my $img_file = &FileUtils::filenameConcatenate($dirname,$attr->{src});
1875
1876	# set the width and height attribute
1877	($attr->{width}, $attr->{height}) = imgsize($img_file);
1878
1879	# recreate the tag
1880	print PROD "<img";
1881	print PROD map { qq { $_="$attr->{$_}"} } keys %$attr;
1882	print PROD ">";
1883	}
1884	# is it a font tag
1885	else
1886	{
1887	if (($token->is_start_tag('font')) \|\| ($token->is_end_tag('font')))
1888	{
1889	# remove font tag
1890	print PROD "";
1891	}
1892	else
1893	{
1894	# print without changes
1895	print PROD $token->as_is;
1896	}
1897	}
1898	}
1899	close (PROD) \|\| die("Error Closing File: $tmp_filename $!");
1900
1901	# run html-tidy on the tmp file to make it a proper XML file
1902
1903	my $outhandle = $self->{'outhandle'};
1904	print $outhandle "Converting HTML to be XML compliant:\n";
1905
1906	my $tidy_cmd = "tidy";
1907	$tidy_cmd .= " -q" if ($self->{'verbosity'} <= 2);
1908	$tidy_cmd .= " -raw -wrap 0 -asxml \"$tmp_filename\"";
1909	if ($self->{'verbosity'} <= 2) {
1910	if ($ENV{'GSDLOS'} =~ m/^windows/i) {
1911	$tidy_cmd .= " 2>nul";
1912	}
1913	else {
1914	$tidy_cmd .= " 2>/dev/null";
1915	}
1916	print $outhandle " => $tidy_cmd\n";
1917	}
1918
1919	my $tidyfile = `$tidy_cmd`;
1920
1921	# write result back to the tmp file
1922	open (PROD, ">$tmp_filename") \|\| die("Error Writing to File: $tmp_filename $!");
1923	print PROD $tidyfile;
1924	close (PROD) \|\| die("Error Closing File: $tmp_filename $!");
1925
1926	# return the output filename
1927	return $tmp_filename;
1928	}
1929
1930	sub associate_cover_image
1931	{
1932	my $self = shift(@_);
1933	my ($doc_obj, $filename) = @_;
1934	if (($self->{'use_realistic_book'}) \|\| ($self->{'old_style_HDL'}))
1935	{
1936	# we will have cover image in tidytmp, but want it from import
1937	$filename =~ s/([\\\/])tidytmp([\\\/])/$1import$2/;
1938	}
1939	$self->SUPER::associate_cover_image($doc_obj, $filename);
1940	}
1941
1942
1943	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: