source: main/trunk/greenstone2/perllib/plugins/HTMLPlugin.pm@ 27306

Last change on this file since 27306 was 27306, checked in by jmt12, 8 years ago

Moving the critical file-related functions (copy, rm, etc) out of util.pm into their own proper class FileUtils. Use of the old functions in util.pm will prompt deprecated warning messages. There may be further functions that could be moved across in the future, but these are the critical ones when considering supporting other filesystems (HTTP, HDFS, WebDav, etc). Updated some key files to use the new functions so now deprecated messages thrown when importing/building demo collection 'out of the box'

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 65.6 KB
Line 
1###########################################################################
2#
3# HTMLPlugin.pm -- basic html plugin
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 1999 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27#
28# Note that this plugin handles frames only in a very simple way
29# i.e. each frame is treated as a separate document. This means
30# search results will contain links to individual frames rather
31# than linking to the top level frameset.
32# There may also be some problems caused by the _parent target
33# (it's removed by this plugin)
34#
35
36package HTMLPlugin;
37
38use Encode;
39use Unicode::Normalize 'normalize';
40
41use ReadTextFile;
42use HBPlugin;
43use ghtml;
44use unicode;
45use util;
46use FileUtils;
47use XMLParser;
48
49use Image::Size;
50use File::Copy;
51
# Establish inheritance at compile time: HTMLPlugin extends both
# ReadTextFile and HBPlugin (multiple inheritance via @ISA).
sub BEGIN {
    @HTMLPlugin::ISA = ('ReadTextFile', 'HBPlugin');
}
55
56use strict; # every perl program should have this!
57no strict 'refs'; # make an exception so we can use variables as filehandles
58
# Option descriptors exposed to pluginfo.pl and the GLI.  Each entry gives
# the option name, a resource-bundle key for its description ({...}), its
# type ("flag", "string" or "regexp") and, where applicable, a default.
my $arguments =
    [ { 'name' => "process_exp",
	'desc' => "{BasePlugin.process_exp}",
	'type' => "regexp",
	'deft' => &get_default_process_exp() },
      { 'name' => "block_exp",
	'desc' => "{BasePlugin.block_exp}",
	'type' => 'regexp',
	'deft' => &get_default_block_exp() },
      { 'name' => "nolinks",
	'desc' => "{HTMLPlugin.nolinks}",
	'type' => "flag" },
      { 'name' => "keep_head",
	'desc' => "{HTMLPlugin.keep_head}",
	'type' => "flag" },
      { 'name' => "no_metadata",
	'desc' => "{HTMLPlugin.no_metadata}",
	'type' => "flag" },
      { 'name' => "metadata_fields",
	'desc' => "{HTMLPlugin.metadata_fields}",
	'type' => "string",
	'deft' => "Title" },
      { 'name' => "metadata_field_separator",
	'desc' => "{HTMLPlugin.metadata_field_separator}",
	'type' => "string",
	'deft' => "" },
      { 'name' => "hunt_creator_metadata",
	'desc' => "{HTMLPlugin.hunt_creator_metadata}",
	'type' => "flag" },
      { 'name' => "file_is_url",
	'desc' => "{HTMLPlugin.file_is_url}",
	'type' => "flag" },
      { 'name' => "assoc_files",
	'desc' => "{HTMLPlugin.assoc_files}",
	'type' => "regexp",
	'deft' => &get_default_block_exp() },
      { 'name' => "rename_assoc_files",
	'desc' => "{HTMLPlugin.rename_assoc_files}",
	'type' => "flag" },
      { 'name' => "title_sub",
	'desc' => "{HTMLPlugin.title_sub}",
	'type' => "string",
	'deft' => "" },
      { 'name' => "description_tags",
	'desc' => "{HTMLPlugin.description_tags}",
	'type' => "flag" },
      # retain this for backward compatibility (w3mir option was replaced by
      # file_is_url)
      { 'name' => "w3mir",
#	'desc' => "{HTMLPlugin.w3mir}",
	'type' => "flag",
	'hiddengli' => "yes"},
      { 'name' => "no_strip_metadata_html",
	'desc' => "{HTMLPlugin.no_strip_metadata_html}",
	'type' => "string",
	'deft' => "",
	'reqd' => "no"},
      { 'name' => "sectionalise_using_h_tags",
	'desc' => "{HTMLPlugin.sectionalise_using_h_tags}",
	'type' => "flag" },
      { 'name' => "use_realistic_book",
	# NOTE(review): reuses the tidy_html description key — confirm intentional
	'desc' => "{HTMLPlugin.tidy_html}",
	'type' => "flag"},
      { 'name' => "old_style_HDL",
	'desc' => "{HTMLPlugin.old_style_HDL}",
	'type' => "flag"},
      {'name' => "processing_tmp_files",
       'desc' => "{BasePlugin.processing_tmp_files}",
       'type' => "flag",
       'hiddengli' => "yes"}
    ];

# Plugin self-description handed to the option-list machinery in new().
my $options = { 'name' => "HTMLPlugin",
		'desc' => "{HTMLPlugin.desc}",
		'abstract' => "no",
		'inherits' => "yes",
		'args' => $arguments };
136
137
# Constructor.  Registers this plugin and its argument/option descriptors,
# builds the underlying ReadTextFile state, then initialises per-document
# bookkeeping (aux_files, dir_num, file_num).
# Returns the blessed HTMLPlugin instance.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
    push(@{$hashArgOptLists->{"OptList"}},$options);

    # Direct method-call syntax: the indirect form "new ReadTextFile(...)"
    # is ambiguous to the Perl parser and is a documented pitfall.
    my $self = ReadTextFile->new($pluginlist,$inputargs,$hashArgOptLists);

    # -w3mir is the deprecated spelling of -file_is_url; honour it.
    if ($self->{'w3mir'}) {
	$self->{'file_is_url'} = 1;
    }
    $self->{'aux_files'} = {};
    $self->{'dir_num'} = 0;
    $self->{'file_num'} = 0;

    return bless $self, $class;
}
158
159# may want to use (?i)\.(gif|jpe?g|jpe|png|css|js(?:@.*)?)$
160# if have eg <script language="javascript" src="img/lib.js@123">
161# blocking is now done by reading through the file and recording all the
162# images and other files
# Default blocking expression: deliberately empty.  Associated files are
# recorded dynamically while each HTML file is scanned (store_block_files)
# rather than blocked by a static extension pattern such as
# (?i)\.(gif|jpe?g|jpe|jpg|png|css)$
sub get_default_block_exp {
    my ($self) = @_;

    return '';
}
169
# Default filename pattern for files this plugin claims: common (X)HTML
# extensions, server-side page extensions, and — as a rough heuristic —
# anything that looks like an HTML query (something?key=value).
sub get_default_process_exp {
    my ($self) = @_;

    return q^(?i)(\.html?|\.shtml|\.shm|\.asp|\.php\d?|\.cgi|.+\?.+=.*)$^;
}
176
# Scan an HTML file for the auxiliary files it references (img/src, usemap,
# link/href, embed/src, table backgrounds, script/src) and register each one
# in $block_hash so those files are not processed as documents in their own
# right.  Side effects: caches the file's detected content encoding in
# $self->{'store_content_encoding'} (read later by read_into_doc_obj) and
# populates $self->{'unicode_to_original_filename'} mapping Unicode link
# names back to the raw on-disk names they refer to.
sub store_block_files
{
    my $self =shift (@_);
    my ($filename_full_path, $block_hash) = @_;

    my $html_fname = $filename_full_path;

    my ($language, $content_encoding) = $self->textcat_get_language_encoding ($filename_full_path);
    $self->{'store_content_encoding'}->{$filename_full_path} = $content_encoding;

    # read in file ($text will be in utf8)
    my $raw_text = "";
    $self->read_file_no_decoding($filename_full_path, \$raw_text);

    my $textref = \$raw_text;
    # Strip HTML comments first so links inside commented-out markup are not
    # blocked.  The patterns also match entity-escaped comment delimiters.
    my $opencom = '(?:<!--|&lt;!(?:&mdash;|&#151;|--))';
    my $closecom = '(?:-->|(?:&mdash;|&#151;|--)&gt;)';
    $$textref =~ s/$opencom(.*?)$closecom//gs;

    # Convert entities to their UTF8 equivalents
    $$textref =~ s/&(lt|gt|amp|quot|nbsp);/&z$1;/go;
    $$textref =~ s/&([^;]+);/&ghtml::getcharequiv($1,1,0)/gseo; # on this occasion, want it left as utf8
    $$textref =~ s/&z(lt|gt|amp|quot|nbsp);/&$1;/go;

    # Attribute value: either a double-quoted string or a bare token.
    my $attval = "\\\"[^\\\"]+\\\"|[^\\s>]+";
    my @img_matches = ($$textref =~ m/<img[^>]*?src\s*=\s*($attval)[^>]*>/igs);
    my @usemap_matches = ($$textref =~ m/<img[^>]*?usemap\s*=\s*($attval)[^>]*>/igs);
    my @link_matches = ($$textref =~ m/<link[^>]*?href\s*=\s*($attval)[^>]*>/igs);
    my @embed_matches = ($$textref =~ m/<embed[^>]*?src\s*=\s*($attval)[^>]*>/igs);
    my @tabbg_matches = ($$textref =~ m/<(?:body|table|tr|td)[^>]*?background\s*=\s*($attval)[^>]*>/igs);
    my @script_matches = ($$textref =~ m/<script[^>]*?src\s*=\s*($attval)[^>]*>/igs);

    if(!defined $self->{'unicode_to_original_filename'}) {
	# maps from utf8 converted link name -> original filename referred to by (possibly URL-encoded) src url
	$self->{'unicode_to_original_filename'} = {};
    }

    foreach my $raw_link (@img_matches, @usemap_matches, @link_matches, @embed_matches, @tabbg_matches, @script_matches) {

	# remove quotes from link at start and end if necessary
	if ($raw_link =~ m/^\"/) {
	    $raw_link =~ s/^\"//;
	    $raw_link =~ s/\"$//;
	}

	# remove any anchor names, e.g. foo.html#name becomes foo.html
	# but watch out for any #'s that are part of entities, such as &#x3B1;
	$raw_link =~ s/([^&])\#.*$/$1/s;

	# some links may just be anchor names
	next unless ($raw_link =~ /\S+/);

	if ($raw_link !~ m@^/@ && $raw_link !~ m/^([A-Z]:?)\\/i) {
	    # Turn relative file path into full path
	    my $dirname = &File::Basename::dirname($filename_full_path);
	    $raw_link = &FileUtils::filenameConcatenate($dirname, $raw_link);
	}
	$raw_link = $self->eval_dir_dots($raw_link);

	# this is the actual filename on the filesystem (that the link refers to)
	my $url_original_filename = $self->opt_url_decode($raw_link);

	my ($uses_bytecodes,$exceeds_bytecodes) = &unicode::analyze_raw_string($url_original_filename);

	if ($exceeds_bytecodes) {
	    # We have a link to a file name that is more complicated than a raw byte filename
	    # What we do next depends on the operating system we are on

	    if ($ENV{'GSDLOS'} =~ /^(linux|solaris)$/i) {
		# Assume we're dealing with a UTF-8 encoded filename
		$url_original_filename = encode("utf8", $url_original_filename);
	    }
	    elsif ($ENV{'GSDLOS'} =~ /^darwin$/i) {
		# HFS+ is UTF8 with decomposition
		$url_original_filename = encode("utf8", $url_original_filename);
		$url_original_filename = normalize('D', $url_original_filename); # Normalization Form D (decomposition)
	    }
	    elsif ($ENV{'GSDLOS'} =~ /^windows$/i) {
		# Don't need to do anything as later code maps Windows
		# unicode filenames to DOS short filenames when needed
	    }
	    else {
		my $outhandle = $self->{'outhandle'};
		print $outhandle "Warning: Unrecognized operating system ", $ENV{'GSDLOS'}, "\n";
		print $outhandle " in raw file system encoding of: $raw_link\n";
		print $outhandle " Assuming filesystem is UTF-8 based.\n";
		$url_original_filename = encode("utf8", $url_original_filename);
	    }
	}

	# Convert the (currently raw) link into its Unicode version.
	# Store the Unicode link along with the url_original_filename
	my $unicode_url_original_filename = "";
	$self->decode_text($raw_link,$content_encoding,$language,\$unicode_url_original_filename);


	$self->{'unicode_to_original_filename'}->{$unicode_url_original_filename} = $url_original_filename;


	if ($url_original_filename ne $unicode_url_original_filename) {
	    my $outhandle = $self->{'outhandle'};

	    print $outhandle "URL Encoding $url_original_filename\n";
	    print $outhandle " ->$unicode_url_original_filename\n";

	    # Allow for possibility of raw byte version and Unicode versions of file
	    &util::block_filename($block_hash,$unicode_url_original_filename);
	}

	# $url_original_filename = &util::upgrade_if_dos_filename($url_original_filename);
	&util::block_filename($block_hash,$url_original_filename);

    }
}
291
# Given a filename in any encoding, URL-decode it to get back the original
# filename in the original encoding.  Because this method is intended to work
# out the *original* filename, it does not URL-decode a filename if a file by
# the name of the *URL-encoded* string already exists in the local folder.
#
sub opt_url_decode {
    my ($self, $raw_link) = @_;

    # The link only needs decoding when it contains %XX escapes or numeric
    # character entities (&#xHH; / &#DD;).  Note that some filenames
    # legitimately contain literal %XX sequences.
    my $looks_encoded =
	   $raw_link =~ m/\%[A-F0-9]{2}/i
	|| $raw_link =~ m/\&\#x[0-9A-F]+;/i
	|| $raw_link =~ m/\&\#[0-9]+;/i;

    # ...but leave it untouched if a file by the encoded name really exists.
    if ($looks_encoded && !-e $raw_link) {
	$raw_link = &unicode::url_decode($raw_link,1);
    }

    return $raw_link;
}
315
# Read one source file into a doc_obj.  Retrieves the content encoding that
# store_block_files cached for this file, optionally runs the file through
# the tidy/old-HDL conversion (use_realistic_book / old_style_HDL modes,
# which force description_tags sectionalising), delegates the real work to
# the parent class, then — in the converted case — points the doc_obj's
# source metadata back at the ORIGINAL file rather than the tidied copy.
# Returns ($process_status, $doc_obj) as produced by the parent.
sub read_into_doc_obj
{
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;

    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);

    # Lookup content_encoding worked out in file_block pass for this file
    # Store it under the local name 'content_encoding' so its nice and
    # easy to access
    $self->{'content_encoding'} = $self->{'store_content_encoding'}->{$filename_full_path};

    # get the input file
    my $input_filename = $file;
    my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
    $suffix = lc($suffix);
    my $tidy_filename;
    if (($self->{'use_realistic_book'}) || ($self->{'old_style_HDL'}))
    {
	# because the document has to be sectionalized set the description tags
	$self->{'description_tags'} = 1;

	# set the file to be tidied
	$input_filename = &FileUtils::filenameConcatenate($base_dir,$file) if $base_dir =~ m/\w/;

	# get the tidied file
	#my $tidy_filename = $self->tmp_tidy_file($input_filename);
	$tidy_filename = $self->convert_tidy_or_oldHDL_file($input_filename);

	# derive tmp filename from input filename
	my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($tidy_filename, "\\.[^\\.]+\$");

	# set the new input file and base_dir to be from the tidied file
	$file = "$tailname$suffix";
	$base_dir = $dirname;
    }

    # call the parent read_into_doc_obj
    my ($process_status,$doc_obj) = $self->SUPER::read_into_doc_obj($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli);
    if (($self->{'use_realistic_book'}) || ($self->{'old_style_HDL'}))
    {
	# now we need to reset the filenames in the doc obj so that the converted filenames are not used
	my $collect_file = &util::filename_within_collection($filename_full_path);
	$doc_obj->set_source_filename ($collect_file, $self->{'file_rename_method'});
	## set_source_filename does not set the doc_obj source_path which is used in archives dbs for incremental
	# build. So set it manually.
	$doc_obj->set_source_path($filename_full_path);
	my $collect_conv_file = &util::filename_within_collection($tidy_filename);
	$doc_obj->set_converted_filename($collect_conv_file);

	my $plugin_filename_encoding = $self->{'filename_encoding'};
	my $filename_encoding = $self->deduce_filename_encoding($file,$metadata,$plugin_filename_encoding);
	$self->set_Source_metadata($doc_obj, $filename_full_path, $filename_encoding);
    }

    # Tidy up the per-file encoding cache now that processing is complete.
    delete $self->{'store_content_encoding'}->{$filename_full_path};
    $self->{'content_encoding'} = undef;

    return ($process_status,$doc_obj);
}
376
377# do plugin specific processing of doc_obj
# do plugin specific processing of doc_obj
#
# Normalises path separators, resets per-document state, optionally
# sectionalises on <hN> headings, attaches URL/UTF8URL (and weblink)
# metadata, then either walks embedded <Section>/<Metadata> description
# tags building the section hierarchy, or processes the whole file as a
# single section.  Returns 1.
sub process {
    my $self = shift (@_);
    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
    my $outhandle = $self->{'outhandle'};

    if ($ENV{'GSDLOS'} =~ m/^windows/i) {
	# this makes life so much easier... perl can cope with unix-style '/'s.
	$base_dir =~ s@(\\)+@/@g;
	$file =~ s@(\\)+@/@g;
    }

    my $filename = &FileUtils::filenameConcatenate($base_dir,$file);
    my $upgraded_base_dir = &util::upgrade_if_dos_filename($base_dir);
    my $upgraded_filename = &util::upgrade_if_dos_filename($filename);

    if ($ENV{'GSDLOS'} =~ m/^windows/i) {
	# And again
	$upgraded_base_dir =~ s@(\\)+@/@g;
	$upgraded_filename =~ s@(\\)+@/@g;

	# Need to make sure there is a '/' on the end of upgraded_base_dir
	if (($upgraded_base_dir ne "") && ($upgraded_base_dir !~ m/\/$/)) {
	    $upgraded_base_dir .= "/";
	}
    }
    my $upgraded_file = &util::filename_within_directory($upgraded_filename,$upgraded_base_dir);

    # reset per-doc stuff...
    $self->{'aux_files'} = {};
    $self->{'dir_num'} = 0;
    $self->{'file_num'} = 0;

    # process an HTML file where sections are divided by headings tags (H1, H2 ...)
    # you can also include metadata in the format (X can be any number)
    # <hX>Title<!--gsdl-metadata
    #	<Metadata name="name1">value1</Metadata>
    #	...
    #	<Metadata name="nameN">valueN</Metadata>
    #--></hX>
    if ($self->{'sectionalise_using_h_tags'}) {
	# description_tags should always be activated because we convert headings to description tags
	$self->{'description_tags'} = 1;

	my $arrSections = [];
	$$textref =~ s/<h([0-9]+)[^>]*>(.*?)<\/h[0-9]+>/$self->process_heading($1, $2, $arrSections, $upgraded_file)/isge;

	if (scalar(@$arrSections)) {
	    # close any sections still open at end of document
	    my $strMetadata = $self->update_section_data($arrSections, -1);
	    if (length($strMetadata)) {
		$strMetadata = '<!--' . $strMetadata . "\n-->\n</body>";
		$$textref =~ s/<\/body>/$strMetadata/ig;
	    }
	}
    }

    my $cursection = $doc_obj->get_top_section();

    $self->extract_metadata ($textref, $metadata, $doc_obj, $cursection)
	unless $self->{'no_metadata'} || $self->{'description_tags'};

    # Store URL for page as metadata - this can be used for an
    # altavista style search interface. The URL won't be valid
    # unless the file structure contains the domain name (i.e.
    # like when w3mir is used to download a website).

    # URL metadata (even invalid ones) are used to support internal
    # links, so even if 'file_is_url' is off, still need to store info

    my ($tailname,$dirname) = &File::Basename::fileparse($upgraded_file);

#    my $utf8_file = $self->filename_to_utf8_metadata($file);
#    $utf8_file =~ s/&\#095;/_/g;
#    variable below used to be utf8_file

    my $url_encoded_file = &unicode::raw_filename_to_url_encoded($tailname);
    my $utf8_url_encoded_file = &unicode::raw_filename_to_utf8_url_encoded($tailname);

    my $web_url = "http://";
    my $utf8_web_url = "http://";
    if(defined $dirname) { # local directory
	# Check for "ftp" in the domain name of the directory
	# structure to determine if this URL should be a ftp:// URL
	# This check is not infallible, but better than omitting the
	# check, which would cause all files downloaded from ftp sites
	# via mirroring with wget to have potentially erroneous http:// URLs
	# assigned in their metadata
	if ($dirname =~ /^[^\/]*ftp/i)
	{
	    $web_url = "ftp://";
	    $utf8_web_url = "ftp://";
	}
	$dirname = $self->eval_dir_dots($dirname);
	$dirname .= &util::get_dirsep() if $dirname ne ""; # if there's a directory, it should end on "/"

	$web_url = $web_url.$dirname.$url_encoded_file;
	$utf8_web_url = $utf8_web_url.$dirname.$utf8_url_encoded_file;
    } else {
	$web_url = $web_url.$url_encoded_file;
	$utf8_web_url = $utf8_web_url.$utf8_url_encoded_file;
    }
    $web_url =~ s/\\/\//g;
    $utf8_web_url =~ s/\\/\//g;

    if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
	print STDERR "*******DEBUG: upgraded_file: $upgraded_file\n";
	print STDERR "*******DEBUG: adding URL metadata: $utf8_url_encoded_file\n";
    }


    $doc_obj->add_utf8_metadata($cursection, "URL", $web_url);
    $doc_obj->add_utf8_metadata($cursection, "UTF8URL", $utf8_web_url);

    if ($self->{'file_is_url'}) {
	$doc_obj->add_metadata($cursection, "weblink", "<a href=\"$web_url\">");
	$doc_obj->add_metadata($cursection, "webicon", "_iconworld_");
	$doc_obj->add_metadata($cursection, "/weblink", "</a>");
    }

    if ($self->{'description_tags'}) {
	# remove the html header - note that doing this here means any
	# sections defined within the header will be lost (so all <Section>
	# tags must appear within the body of the HTML)
	my ($head_keep) = ($$textref =~ m/^(.*?)<body[^>]*>/is);

	$$textref =~ s/^.*?<body[^>]*>//is;
	$$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;

	# Comment and tag delimiters may appear literally or entity-escaped
	# (typical of HTML produced by format converters).
	my $opencom = '(?:<!--|&lt;!(?:&mdash;|&#151;|--))';
	my $closecom = '(?:-->|(?:&mdash;|&#151;|--)&gt;)';

	my $lt = '(?:<|&lt;)';
	my $gt = '(?:>|&gt;)';
	my $quot = '(?:"|&quot;|&rdquo;|&ldquo;)';

	my $dont_strip = '';
	if ($self->{'no_strip_metadata_html'}) {
	    # comma-separated field list -> regex alternation
	    ($dont_strip = $self->{'no_strip_metadata_html'}) =~ s{,}{|}g;
	}

	my $found_something = 0; my $top = 1;
	# consume the document comment by comment, building the section tree
	while ($$textref =~ s/^(.*?)$opencom(.*?)$closecom//s) {
	    my $text = $1;
	    my $comment = $2;
	    if (defined $text) {
		# text before a comment - note that getting to here
		# doesn't necessarily mean there are Section tags in
		# the document
		$self->process_section(\$text, $upgraded_base_dir, $upgraded_file, $doc_obj, $cursection);
	    }
	    while ($comment =~ s/$lt(.*?)$gt//s) {
		my $tag = $1;
		if ($tag eq "Section") {
		    $found_something = 1;
		    $cursection = $doc_obj->insert_section($doc_obj->get_end_child($cursection)) unless $top;
		    $top = 0;
		} elsif ($tag eq "/Section") {
		    $found_something = 1;
		    $cursection = $doc_obj->get_parent_section ($cursection);
		} elsif ($tag =~ m/^Metadata name=$quot(.*?)$quot/s) {
		    my $metaname = $1;
		    my $accumulate = $tag =~ m/mode=${quot}accumulate${quot}/ ? 1 : 0;
		    $comment =~ s/^(.*?)$lt\/Metadata$gt//s;
		    my $metavalue = $1;
		    $metavalue =~ s/^\s+//;
		    $metavalue =~ s/\s+$//;
		    # assume that no metadata value intentionally includes
		    # carriage returns or HTML tags (if they're there they
		    # were probably introduced when converting to HTML from
		    # some other format).
		    # actually some people want to have html tags in their
		    # metadata.
		    $metavalue =~ s/[\cJ\cM]/ /sg;
		    $metavalue =~ s/<[^>]+>//sg
			unless $dont_strip && ($dont_strip eq 'all' || $metaname =~ m/^($dont_strip)$/);
		    $metavalue =~ s/\s+/ /sg;
		    if ($metaname =~ /\./) { # has a namespace
			$metaname = "ex.$metaname";
		    }
		    if ($accumulate) {
			$doc_obj->add_utf8_metadata($cursection, $metaname, $metavalue);
		    } else {
			$doc_obj->set_utf8_metadata_element($cursection, $metaname, $metavalue);
		    }
		} elsif ($tag eq "Description" || $tag eq "/Description") {
		    # do nothing with containing Description tags
		} else {
		    # simple HTML tag (probably created by the conversion
		    # to HTML from some other format) - we'll ignore it and
		    # hope for the best ;-)
		}
	    }
	}
	if ($cursection ne "") {
	    print $outhandle "HTMLPlugin: WARNING: $upgraded_file contains unmatched <Section></Section> tags\n";
	}

	$$textref =~ s/^.*?<body[^>]*>//is;
	$$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
	if ($$textref =~ m/\S/) {
	    if (!$found_something) {
		if ($self->{'verbosity'} > 2) {
		    print $outhandle "HTMLPlugin: WARNING: $upgraded_file appears to contain no Section tags so\n";
		    print $outhandle " will be processed as a single section document\n";
		}

		# go ahead and process single-section document
		$self->process_section($textref, $upgraded_base_dir, $upgraded_file, $doc_obj, $cursection);

		# if document contains no Section tags we'll go ahead
		# and extract metadata (this won't have been done
		# above as the -description_tags option prevents it)
		my $complete_text = $head_keep.$doc_obj->get_text($cursection);
		$self->extract_metadata (\$complete_text, $metadata, $doc_obj, $cursection)
		    unless $self->{'no_metadata'};

	    } else {
		print $outhandle "HTMLPlugin: WARNING: $upgraded_file contains the following text outside\n";
		print $outhandle " of the final closing </Section> tag. This text will\n";
		print $outhandle " be ignored.";

		my ($text);
		if (length($$textref) > 30) {
		    $text = substr($$textref, 0, 30) . "...";
		} else {
		    $text = $$textref;
		}
		$text =~ s/\n/ /isg;
		print $outhandle " ($text)\n";
	    }
	} elsif (!$found_something) {

	    if ($self->{'verbosity'} > 2) {
		# may get to here if document contained no valid Section
		# tags but did contain some comments. The text will have
		# been processed already but we should print the warning
		# as above and extract metadata
		print $outhandle "HTMLPlugin: WARNING: $upgraded_file appears to contain no Section tags and\n";
		print $outhandle " is blank or empty. Metadata will be assigned if present.\n";
	    }

	    my $complete_text = $head_keep.$doc_obj->get_text($cursection);
	    $self->extract_metadata (\$complete_text, $metadata, $doc_obj, $cursection)
		unless $self->{'no_metadata'};
	}

    } else {

	# remove header and footer
	if (!$self->{'keep_head'} || $self->{'description_tags'}) {
	    $$textref =~ s/^.*?<body[^>]*>//is;
	    $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
	}

	$self->{'css_assoc_files'} = {};

	# single section document
	$self->process_section($textref, $upgraded_base_dir, $upgraded_file, $doc_obj, $cursection);

	#my $upgraded_filename_dirname = &File::Basename::dirname($upgraded_filename);

	$self->acquire_css_associated_files($doc_obj, $cursection);

	$self->{'css_assoc_files'} = {};
    }

    return 1;
}
645
646
# Turn one <hN> heading into Greenstone description-tag markup: close/open
# sections as dictated by the heading level, then emit a <Section> block
# whose Title is the heading text plus any metadata embedded in a
# <!--gsdl-metadata ...--> comment inside the heading.
# Returns the replacement HTML comment string.
sub process_heading
{
    my ($self, $nHeadNo, $strHeadingText, $arrSections, $file) = @_;

    $strHeadingText = '' unless defined $strHeadingText;

    # Close any sections at or below this heading's level.
    my $strMetadata = $self->update_section_data($arrSections, int($nHeadNo));

    # Extract (and remove) any gsdl-metadata comments buried in the heading.
    my $strSecMetadata = '';
    while ($strHeadingText =~ s/<!--gsdl-metadata(.*?)-->//is)
    {
	$strSecMetadata .= $1;
    }

    # Trim surrounding whitespace from both pieces.
    for ($strHeadingText, $strSecMetadata) {
	s/^\s+//g;
	s/\s+$//g;
    }

    $strMetadata .= "\n<Section>\n\t<Description>\n\t\t<Metadata name=\"Title\">"
	. $strHeadingText . "</Metadata>\n";

    $strMetadata .= "\t\t" . $strSecMetadata . "\n" if length($strSecMetadata);

    $strMetadata .= "\t</Description>\n";

    return "<!--" . $strMetadata . "-->";
}
675
676
# Maintain the stack of currently-open heading levels ($arrSections) as a
# new heading of level $nCurTocNo arrives.  Emits a closing "</Section>"
# for every open section whose level is >= the new one, then records the
# new level.  Returns the accumulated closing markup ('' if none).
sub update_section_data
{
    my ($self, $arrSections, $nCurTocNo) = @_;

    my $strBuffer = '';

    # The stack is kept strictly increasing, so popping from the top while
    # the new level is <= the deepest open level closes exactly the right
    # sections.
    while (scalar(@$arrSections) && $nCurTocNo <= $arrSections->[-1]) {
	$strBuffer .= "\n</Section>";
	pop @$arrSections;
    }

    push @$arrSections, $nCurTocNo;
    return $strBuffer;
}
700
701
# note that process_section may be called multiple times for a single
# section (relying on the fact that add_utf8_text appends the text to any
# that may exist already).
#
# Harvests CSS referenced from inline <style> blocks, rewrites usemap /
# href / src attributes (unless -nolinks), rewrites image references so
# they point at associated files, escapes backslashes for the macro
# language, and appends the resulting text to $cursection of $doc_obj.
sub process_section {
    my $self = shift (@_);
    my ($textref, $base_dir, $file, $doc_obj, $cursection) = @_;

    # collect associated files mentioned inside inline <style> blocks
    my @styleTagsText = ($$textref =~ m/<style[^>]*>([^<]*)<\/style>/sg);
    if(scalar(@styleTagsText) > 0)
    {
	my $css_filename_dirname = &File::Basename::dirname(&FileUtils::filenameConcatenate($base_dir, $file));
	foreach my $styleText (@styleTagsText)
	{
	    $self->acquire_css_associated_files_from_text_block($styleText, $css_filename_dirname);
	}
    }

    # trap links
    if (!$self->{'nolinks'}) {
	# usemap="./#index" not handled correctly => change to "#index"
##	$$textref =~ s/(<img[^>]*?usemap\s*=\s*[\"\']?)([^\"\'>\s]+)([\"\']?[^>]*>)/

##	my $opencom = '(?:<!--|&lt;!(?:&mdash;|&#151;|--))';
##	my $closecom = '(?:-->|(?:&mdash;|&#151;|--)&gt;)';

	$$textref =~ s/(<img[^>]*?usemap\s*=\s*)((?:[\"][^\"]+[\"])|(?:[\'][^\']+[\'])|(?:[^\s\/>]+))([^>]*>)/
	    $self->replace_usemap_links($1, $2, $3)/isge;

	$$textref =~ s/(<(?:a|area|frame|link|script)\s+[^>]*?\s*(?:href|src)\s*=\s*)((?:[\"][^\"]+[\"])|(?:[\'][^\']+[\'])|(?:[^\s\/>]+))([^>]*>)/
	    $self->replace_href_links ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;

##       $$textref =~ s/($opencom.*?)?+(<(?:a|area|frame|link|script)\s+[^>]*?\s*(?:href|src)\s*=\s*)((?:[\"][^\"]+[\"])|(?:[\'][^\']+[\'])|(?:[^\s\/>]+))([^>]*>)(.*?$closecom)?+/
#	      $self->replace_href_links ($1, $2, $3, $4, $5, $base_dir, $file, $doc_obj, $cursection)/isge;
    }

    # trap images

    # Previously, by default, HTMLPlugin would embed <img> tags inside anchor tags
    # i.e. <a href="image><img src="image"></a> in order to overcome a problem that
    # turned regular text succeeding images into links. That is, by embedding <imgs>
    # inside <a href=""></a>, the text following images were no longer misbehaving.
    # However, there would be many occasions whereby images were not meant to link
    # to their source images but where the images would link to another web page.
    # To allow this, the no_image_links option was introduced: it would prevent
    # the behaviour of embedding images into links that referenced the source images.

    # Somewhere along the line, the problem of normal text turning into links when
    # such text followed images which were not embedded in <a href=""></a> ceased
    # to occur. This is why the following lines have been commented out (as well as
    # two lines in replace_images). They appear to no longer apply.

    # If at any time, there is a need for having images embedded in <a> anchor tags,
    # then it might be better to turn that into an HTMLPlugin option rather than make
    # it the default behaviour. Also, eventually, no_image_links needs to become
    # a deprecated option for HTMLPlugin as it has now become the default behaviour.

    #if(!$self->{'no_image_links'}){
    $$textref =~ s/(<(?:img|embed|table|tr|td)[^>]*?(?:src|background)\s*=\s*)((?:[\"][^\"]+[\"])|(?:[\'][^\']+[\'])|(?:[^\s\/>]+))([^>]*>)/
	$self->replace_images ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
    #}

    # add text to document object
    # turn \ into \\ so that the rest of greenstone doesn't think there
    # is an escape code following. (Macro parsing loses them...)
    $$textref =~ s/\\/\\\\/go;

    $doc_obj->add_utf8_text($cursection, $$textref);
}
770
# Rewrite one image-ish reference (img/embed/table/tr/td src or background)
# so it points at the file associated with the document.  $front and $back
# are the attribute text either side of the link value.  Returns the
# reassembled tag fragment.
sub replace_images {
    my ($self, $front, $link, $back, $base_dir,
	$file, $doc_obj, $section) = @_;

    # If the value was quoted, strip the quotes and re-quote with '"'.
    if ($link =~ /^[\"\']/) {
	$link =~ s/^[\"\']//;
	$link =~ s/[\"\']$//;
	$front .= '"';
	$back = "\"$back";
    }

    # links are not allowed to span lines
    $link =~ s/\n/ /g;

    # Hack to overcome Windows wv 0.7.1 bug that causes embedded images to be broken
    # If the Word file path has spaces in it, wv messes up and you end up with
    # absolute paths for the images, and without the "file://" prefix
    # So check for this special case and massage the data to be correct
    if ($ENV{'GSDLOS'} =~ m/^windows/i
	&& $self->{'plugin_type'} eq "WordPlug"
	&& $link =~ m/^[A-Za-z]\:\\/) {
	# keep just the last path component
	$link =~ s/^.*\\([^\\]+)$/$1/;
    }

    # Resolve the link relative to the document and associate the file.
    my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
    my $img_file = $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section);

    # Images are no longer wrapped in <a href="image"> anchors; the history
    # of that behaviour is explained in process_section.
    return $front . $img_file . $back;
}
814
# Rewrite an href attribute so it works inside the built collection:
#  - <link href="*.css">: additionally harvest images the stylesheet uses
#  - same-page fragments (#...) are marked with _httpsamepagelink_
#  - mailto/news/gopher/nntp/telnet/javascript links are left untouched
#  - links to other source documents (matched by process_exp) or non-local
#    links become run-time _httpextlink_ references
#  - everything else (e.g. images) is handed to add_file for association
# Returns the replacement text for $front.$link.$back.
sub replace_href_links {
    my $self = shift (@_);
    my ($front, $link, $back, $base_dir, $file, $doc_obj, $section) = @_;

    # stylesheet reference: pull in any images the CSS file refers to
    if($front =~ m/^<link / && $link =~ m/\.css"$/)
    {
	my $actual_link = $link;
	$actual_link =~ s/^"(.*)"$/$1/;

	my $directory = &File::Basename::dirname($file);

	my $css_filename = &FileUtils::filenameConcatenate($base_dir, $directory, $actual_link);
	$self->retrieve_css_associated_files($css_filename);
    }

    # remove quotes from link at start and end if necessary
    if ($link=~/^[\"\']/) {
	$link=~s/^[\"\']//;
	$link=~s/[\"\']$//;
	$front.='"';
	$back="\"$back";
    }

    # attempt to sort out targets - frames are not handled
    # well in this plugin and some cases will screw things
    # up - e.g. the _parent target (so we'll just remove
    # them all ;-)
    $front =~ s/(target=\"?)_top(\"?)/$1_gsdltop_$2/is;
    $back =~ s/(target=\"?)_top(\"?)/$1_gsdltop_$2/is;
    $front =~ s/target=\"?_parent\"?//is;
    $back =~ s/target=\"?_parent\"?//is;

    # an anchor within the same page: no file resolution needed
    if($link =~ m/^\#/s)
    {
	return $front . "_httpsamepagelink_" . $link . $back;
    }

    $link =~ s/\n/ /g;

    # Find file referred to by $link on file system
    # This is more complicated than it sounds when char encodings
    # is taken in to account
    my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);

    # href may use '\'s where '/'s should be on Windows
    $href =~ s/\\/\//g;
    my ($filename) = $href =~ m/^(?:.*?):(?:\/\/)?(.*)/;

    ##### leave all these links alone (they won't be picked up by intermediate
    ##### pages). I think that's safest when dealing with frames, targets etc.
    ##### (at least until I think of a better way to do it). Problems occur with
    ##### mailto links from within small frames, the intermediate page is displayed
    ##### within that frame and can't be seen. There is still potential for this to
    ##### happen even with html pages - the solution seems to be to somehow tell
    ##### the browser from the server side to display the page being sent (i.e.
    ##### the intermediate page) in the top level window - I'm not sure if that's
    ##### possible - the following line should probably be deleted if that can be done
    return $front . $link . $back if $href =~ m/^(mailto|news|gopher|nntp|telnet|javascript):/is;

    if (($rl == 0) || ($filename =~ m/$self->{'process_exp'}/) ||
	($href =~ m/\/$/) || ($href =~ m/^(mailto|news|gopher|nntp|telnet|javascript):/i)) {

	# Fix: match the OS name case-insensitively, consistent with every
	# other GSDLOS test in this plugin (the value may not be lowercase).
	if ($ENV{'GSDLOS'} =~ m/^windows$/i) {

	    # Don't do any encoding for now, as not clear what
	    # the right thing to do is to support filename
	    # encoding on Windows when they are not UTF16
	    #
	}
	else {
	    # => Unix-based system

	    # If web page didn't give encoding, then default to utf8
	    my $content_encoding= $self->{'content_encoding'} || "utf8";

	    if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
		print STDERR "**** Encoding with '$content_encoding', href: $href\n";
	    }

	    # on Darwin, the unicode filenames are stored on the file
	    # system in decomposed form, so any href link (including when
	    # URL-encoded) should refer to the decomposed name of the file
	    if ($ENV{'GSDLOS'} =~ /^darwin$/i) {
		$href = normalize('D', $href); # Normalization Form D (decomposition)
	    }

	    $href = encode($content_encoding,$href);
	}

	$href = &unicode::raw_filename_to_utf8_url_encoded($href);
	$href = &unicode::filename_to_url($href);

	&ghtml::urlsafe ($href);

	if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
	    print STDERR "******DEBUG: href=$href\n";
	}


	return $front . "_httpextlink_&amp;rl=" . $rl . "&amp;href=" . $href . $hash_part . $back;
    } else {
	# link is to some other type of file (e.g., an image) so we'll
	# need to associate that file
	return $front . $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section) . $back;
    }
}
921
# Read a CSS file referenced from the HTML and harvest any images its
# "background-image: url(...)" rules point at, so they can later be
# associated with the document.  Returns silently if the stylesheet
# cannot be opened (a missing CSS file is not fatal).
sub retrieve_css_associated_files {
    my $self = shift (@_);
    my ($css_filename) = @_;

    my $css_filename_dirname = &File::Basename::dirname($css_filename);

    # Three-argument open with a lexical handle; the original bareword,
    # two-argument open was unsafe for filenames beginning with mode
    # characters such as '<' or '|'.
    open (my $css_fh, '<', $css_filename) || return;

    # slurp the whole file in one sysread (-s gives the file size)
    sysread ($css_fh, my $file_string, -s $css_fh);

    # sysread leaves $file_string undef on failure
    $self->acquire_css_associated_files_from_text_block($file_string, $css_filename_dirname) if defined $file_string;

    close $css_fh;
}
935
# Scan a block of CSS text for "background-image: url(...)" declarations
# and record each referenced image in $self->{'css_assoc_files'}, keyed by
# its full path (relative to the CSS file's directory) with the original
# relative URL as the value.
# NOTE(review): a declaration without a trailing ';' will not match.
sub acquire_css_associated_files_from_text_block {
    my $self = shift (@_);
    my ($text, $css_filename_dirname) = @_;

    my @declarations = ($text =~ m/background-image:\s*url[^;]*;/sg);
    for my $declaration (@declarations) {
        # reduce the declaration to the bare URL inside url(...),
        # stripping optional surrounding quotes and whitespace
        my $url = $declaration;
        $url =~ s/^.*url.*\((.*)\).*$/$1/;
        $url =~ s/^\s*"?([^"]*)"?\s*$/$1/;

        my $full_path = &FileUtils::filenameConcatenate($css_filename_dirname, $url);
        $self->{'css_assoc_files'}->{$full_path} = $url;
    }
}
949
# Attach every image previously harvested from CSS files (by
# acquire_css_associated_files_from_text_block) to the given section of
# the document object.
sub acquire_css_associated_files {
    my $self = shift(@_);

    my ($doc_obj, $section) = @_;

    my $css_images = $self->{'css_assoc_files'};
    while (my ($disk_path, $assoc_name) = each %{$css_images}) {
        $doc_obj->associate_file($disk_path, $assoc_name, undef, $section);
    }
}
960
# Associate the file referred to by $href with the document and return the
# text that should replace the link in the HTML:
#   - "_httpextlink_..." when the target should remain an external link
#     (non-local links, or extensions not matching 'assoc_files')
#   - "_httpdocimg_/<name>" when the file has been associated
# Much of the body maps the (Unicode, possibly URL-encoded) href back onto
# the raw-byte filename that actually exists on the file system, which is
# OS-specific.
sub add_file {
    my $self = shift (@_);
    my ($href, $rl, $hash_part, $base_dir, $doc_obj, $section) = @_;
    my ($newname);

    my $filename = $href;
    if ($base_dir eq "") {
	if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
	    # remove http://
	    $filename =~ s/^[^:]*:\/\///;
	}
	else {
	    # remove http:/ thereby leaving one slash at the start as
	    # part of full pathname
	    $filename =~ s/^[^:]*:\///;
	}
    }
    else {
	# remove http://
	$filename =~ s/^[^:]*:\/\///;
    }

    $filename = &FileUtils::filenameConcatenate($base_dir, $filename);

    if (($self->{'use_realistic_book'}) || ($self->{'old_style_HDL'})) {
	# we are processing a tidytmp file - want paths to be in import
	$filename =~ s/([\\\/])tidytmp([\\\/])/$1import$2/;
    }

    # Replace %XX's in URL with decoded value if required. Note that the
    # filename may include the %XX in some situations. If the *original*
    # file's name was in URL encoding, the following method will not decode
    # it.
    my $unicode_filename = $filename;
    my $opt_decode_unicode_filename = $self->opt_url_decode($unicode_filename);

    # wvWare can generate <img src="StrangeNoGraphicData"> tags, but with no
    # (it seems) accompanying file
    if ($opt_decode_unicode_filename =~ m/StrangeNoGraphicData$/) { return ""; }

    my $content_encoding= $self->{'content_encoding'} || "utf8";

    if ($ENV{'GSDLOS'} =~ /^(linux|solaris)$/i) {
	# The filenames that come through the HTML file have been decoded
	# into Unicode aware Perl strings. Need to convert them back
	# to their initial raw-byte encoding to match the file that
	# exists on the file system
	$filename = encode($content_encoding, $opt_decode_unicode_filename);
    }
    elsif ($ENV{'GSDLOS'} =~ /^darwin$/i) {
	# HFS+ is UTF8 with decompostion
	$filename = encode($content_encoding, $opt_decode_unicode_filename);
	$filename = normalize('D', $filename); # Normalization Form D (decomposition)

    }
    elsif ($ENV{'GSDLOS'} =~ /^windows$/i) {
	my $long_filename = Win32::GetLongPathName($opt_decode_unicode_filename);

	if (defined $long_filename) {
	    # Fix: the original called Win32::GetLongPathName a second time
	    # here, which is a no-op on an already-long path.  The variable
	    # name (and its use as the on-disk filename) show the intent was
	    # the ASCII-safe 8.3 form, i.e. GetShortPathName.
	    my $short_filename = Win32::GetShortPathName($long_filename);
	    $filename = $short_filename;
	}
#	else {
#	    print STDERR "***** failed to map href to real file:\n";
#	    print STDERR "****** $href -> $opt_decode_unicode_filename\n";
#	}
    }
    else {
	my $outhandle = $self->{'outhandle'};
	print $outhandle "Warning: Unrecognized operating system ", $ENV{'GSDLOS'}, "\n";
	print $outhandle "         in file system encoding of href: $href\n";
	print $outhandle "         No character encoding done.\n";
    }


    # some special processing if the intended filename was converted to utf8, but
    # the actual file still needs to be renamed
    #if (!&util::fd_exists($filename)) {
    if (!&FileUtils::fileExists($filename)) {
	# try the original filename stored in map
	if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
	    print STDERR "******!! orig filename did not exist: $filename\n";
	}

##	print STDERR "**** trying to look up unicode_filename: $unicode_filename\n";

	my $original_filename = $self->{'unicode_to_original_filename'}->{$unicode_filename};

	if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
	    print STDERR "****** From lookup unicode_filename, now trying for: $original_filename\n";
	}

	if (defined $original_filename && -e $original_filename) {
	    if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
		print STDERR "****** Found match!\n";
	    }
	    $filename = $original_filename;
	}
    }

    my ($ext) = $filename =~ m/(\.[^\.]*)$/;

    # rl == 0: the link was not resolved to a local file; leave it as an
    # external link (prompting for extensions we don't associate)
    if ($rl == 0) {
	if ((!defined $ext) || ($ext !~ m/$self->{'assoc_files'}/)) {
	    return "_httpextlink_&amp;rl=0&amp;el=prompt&amp;href=" . $href . $hash_part;
	}
	else {
	    return "_httpextlink_&amp;rl=0&amp;el=direct&amp;href=" . $href . $hash_part;
	}
    }

    # local file, but not an extension we associate: keep as external link
    if ((!defined $ext) || ($ext !~ m/$self->{'assoc_files'}/)) {
	return "_httpextlink_&amp;rl=" . $rl . "&amp;href=" . $href . $hash_part;
    }
    # add the original image file as a source file
    if (!$self->{'processing_tmp_files'} ) {
	$doc_obj->associate_source_file($filename);
    }
    if ($self->{'rename_assoc_files'}) {
	# number-based renaming scheme: <dir_num>/<file_num><ext>, reusing
	# the number previously assigned to this href if there is one
	if (defined $self->{'aux_files'}->{$href}) {
	    $newname = $self->{'aux_files'}->{$href}->{'dir_num'} . "/" .
		$self->{'aux_files'}->{$href}->{'file_num'} . $ext;
	} else {
	    $newname = $self->{'dir_num'} . "/" . $self->{'file_num'} . $ext;
	    $self->{'aux_files'}->{$href} = {'dir_num' => $self->{'dir_num'}, 'file_num' => $self->{'file_num'}};
	    $self->inc_filecount ();
	}
	$doc_obj->associate_file($filename, $newname, undef, $section);
	return "_httpdocimg_/$newname";
    } else {
	if(&unicode::is_url_encoded($unicode_filename)) {
	    # use the possibly-decoded filename instead to avoid double URL encoding
	    ($newname) = $filename =~ m/([^\/\\]*)$/;
	} else {
	    ($newname) = $unicode_filename =~ m/([^\/\\]*)$/;
	}

	# Make sure this name uses only ASCII characters.
	# We use either base64 or URL encoding, as these preserve original encoding
	$newname = &util::rename_file($newname, $self->{'file_rename_method'});

###	print STDERR "***** associating $filename (raw-byte/utf8)-> $newname\n";
	$doc_obj->associate_file($filename, $newname, undef, $section);

	# Since the generated image will be URL-encoded to avoid file-system/browser mess-ups
	# of filenames, URL-encode the additional percent signs of the URL-encoded filename
	my $newname_url = $newname;
	$newname_url = &unicode::filename_to_url($newname_url);
	return "_httpdocimg_/$newname_url";
    }
}
1112
1113
# Resolve a link found in the HTML into an href usable by the collection.
#
# Parameters:
#   $link     - the raw attribute value (quotes already stripped)
#   $base_dir - the collection's base/import directory (may be "")
#   $file     - path of the HTML file being processed, relative to $base_dir
#
# Returns a triple ($href, $hash_part, $rl) where:
#   $href      - the resolved link, always carrying a scheme prefix
#   $hash_part - the "#fragment" suffix of the link, or ""
#   $rl        - 1 if the target was found on the local file system
#                ("relative link"), 0 otherwise
sub format_link {
    my $self = shift (@_);
    my ($link, $base_dir, $file) = @_;

    # strip off hash part, e.g. #foo, but watch out for any entities, e.g. &#x3B1;
    my ($before_hash, $hash_part) = $link =~ m/^(.*?[^&])(\#.*)?$/;

    $hash_part = "" if !defined $hash_part;
    # reject links with no word/dot/slash characters (malformed tags)
    if (!defined $before_hash || $before_hash !~ m/[\w\.\/]/) {
	my $outhandle = $self->{'outhandle'};
	print $outhandle "HTMLPlugin: ERROR - badly formatted tag ignored ($link)\n"
	    if $self->{'verbosity'};
	return ($link, "", 0);
    }

    # Case 1: the link has an explicit scheme; strip it (kept in $type)
    if ($before_hash =~ s@^((?:http|https|ftp|file|mms)://)@@i) {
	my $type = $1;

	# NOTE(review): this test matches http and ftp only, so an https://
	# (or file://, mms://) link accepted by the branch above is NOT
	# split into path components here -- confirm whether https should
	# also be handled this way.
	if ($link =~ m/^(http|ftp):/i) {
	    # Turn url (using /) into file name (possibly using \ on windows)
	    my @http_dir_split = split('/', $before_hash);
	    $before_hash = &FileUtils::filenameConcatenate(@http_dir_split);
	}

	$before_hash = $self->eval_dir_dots($before_hash);

	my $linkfilename = &FileUtils::filenameConcatenate($base_dir, $before_hash);

	# rl=1 only when the target actually exists under $base_dir
	my $rl = 0;
	$rl = 1 if (-e $linkfilename);

	# make sure there's a slash on the end if it's a directory
	if ($before_hash !~ m/\/$/) {
	    $before_hash .= "/" if (-d $linkfilename);
	}
	return ($type . $before_hash, $hash_part, $rl);

    # Case 2: scheme-less link that is not a special protocol and not
    # an absolute "/..." path => treat as a file-system path
    } elsif ($link !~ m/^(mailto|news|gopher|nntp|telnet|javascript):/i && $link !~ m/^\//) {

	if ($before_hash =~ s@^/@@ || $before_hash =~ m/\\/) {

	    # the first directory will be the domain name if file_is_url
	    # to generate archives, otherwise we'll assume all files are
	    # from the same site and base_dir is the root

	    if ($self->{'file_is_url'}) {
		my @dirs = split /[\/\\]/, $file;
		my $domname = shift (@dirs);
		$before_hash = &FileUtils::filenameConcatenate($domname, $before_hash);
		$before_hash =~ s@\\@/@g; # for windows
	    }
	    else
	    {
		# see if link shares directory with source document
		# => turn into relative link if this is so!

		if ($ENV{'GSDLOS'} =~ m/^windows/i) {
		    # too difficult doing a pattern match with embedded '\'s...
		    my $win_before_hash=$before_hash;
		    $win_before_hash =~ s@(\\)+@/@g;
		    # $base_dir is already similarly "converted" on windows.
		    if ($win_before_hash =~ s@^$base_dir/@@o) {
			# if this is true, we removed a prefix
			$before_hash=$win_before_hash;
		    }
		}
		else {
		    # before_hash has lost leading slash by this point,
		    # -> add back in prior to substitution with $base_dir
		    $before_hash = "/$before_hash";

		    $before_hash = &FileUtils::filenameConcatenate("",$before_hash);
		    $before_hash =~ s@^$base_dir/@@;
		}
	    }
	} else {
	    # Turn relative file path into full path
	    my $dirname = &File::Basename::dirname($file);
	    $before_hash = &FileUtils::filenameConcatenate($dirname, $before_hash);
	    $before_hash = $self->eval_dir_dots($before_hash);
	}

	my $linkfilename = &FileUtils::filenameConcatenate($base_dir, $before_hash);


#	print STDERR "**** linkfilename = $linkfilename\n";
#	if (!&util::fd_exists($linkfilename)) {
#	    print STDERR "***** Warning: Could not find $linkfilename\n";
#	}
	

	# make sure there's a slash on the end if it's a directory
	if ($before_hash !~ m/\/$/) {
	    $before_hash .= "/" if (-d $linkfilename);
	}

#	print STDERR "*** returning: $before_hash\n";

	# NOTE(review): rl is hard-wired to 1 here even though existence of
	# $linkfilename is not checked (the check above is commented out).
	return ("http://" . $before_hash, $hash_part, 1);
    } else {
	# mailto, news, nntp, telnet, javascript or gopher link
	return ($before_hash, "", 0);
    }
}
1218
# For each size N listed (comma-separated) in the 'first' option, store
# the first N characters of the document body (tags, scripts and entities
# stripped, whitespace collapsed) as "FirstN" metadata, with an ellipsis
# replacing the final, possibly-truncated word.
sub extract_first_NNNN_characters {
    my $self = shift (@_);
    my ($textref, $doc_obj, $thissection) = @_;

    my @sizes = split /,/, $self->{'first'};
    for my $size (@sizes) {
        my $excerpt = $$textref;
        # skip to the body
        $excerpt =~ s/.*<body[^>]*>//i;
        # remove javascript
        $excerpt =~ s@<script.*?</script>@ @sig;
        $excerpt =~ s/<[^>]*>/ /g;     # drop all remaining tags
        $excerpt =~ s/&nbsp;/ /g;
        $excerpt =~ s/^\s+//;          # trim leading whitespace
        $excerpt =~ s/\s+$//;          # trim trailing whitespace
        $excerpt =~ s/\s+/ /gs;        # collapse runs of whitespace
        $excerpt = &unicode::substr ($excerpt, 0, $size);
        $excerpt =~ s/\s\S*$/&#8230;/; # adds an ellipsis (...)
        $doc_obj->add_utf8_metadata ($thissection, "First$size", $excerpt);
    }
}
1239
1240
# Extract metadata from the <meta> tags in the HTML header, mapping the
# (lowercased) tag names listed in the 'metadata_fields' option onto
# Greenstone metadata names, optionally splitting multi-valued fields on
# 'metadata_field_separator'.  Also handles:
#   - hunt_creator_metadata: maps common author-ish tags onto "Creator"
#   - Title: falls back to the <title> tag, then the first ~100 characters
#   - "tagXYZ" fields: extracts the text between the first <XYZ>...</XYZ>
# Adds FileFormat=HTML metadata unconditionally.
#
# Parameters: $textref (ref to the full HTML text), $metadata (metadata
# passed in from an enclosing plugin -- tags already present there are
# skipped), $doc_obj and $section (where extracted values are stored).
sub extract_metadata {
    my $self = shift (@_);
    my ($textref, $metadata, $doc_obj, $section) = @_;
    my $outhandle = $self->{'outhandle'};
    # if we don't want metadata, we may as well not be here ...
    return if (!defined $self->{'metadata_fields'});

    # NOTE(review): if 'metadata_field_separator' is unset this eq-compare
    # is against undef (a warning under 'use warnings'); the net effect is
    # the same -- $separator stays/becomes undef.
    my $separator = $self->{'metadata_field_separator'};
    if ($separator eq "") {
	undef $separator;
    }

    # metadata fields to extract/save. 'key' is the (lowercase) name of the
    # html meta, 'value' is the metadata name for greenstone to use
    my %find_fields = ();

    my %creator_fields = (); # short-cut for lookups


    foreach my $field (split /,/, $self->{'metadata_fields'}) {
	$field =~ s/^\s+//; # remove leading whitespace
	$field =~ s/\s+$//; # remove trailing whitespace

	# support tag<tagname>
	if ($field =~ m/^(.*?)\s*<(.*?)>$/) {
	    # "$2" is the user's preferred gs metadata name
	    $find_fields{lc($1)}=$2; # lc = lowercase
	} else { # no <tagname> for mapping
	    # "$field" is the user's preferred gs metadata name
	    $find_fields{lc($field)}=$field; # lc = lowercase
	}
    }

    if (defined $self->{'hunt_creator_metadata'} &&
	$self->{'hunt_creator_metadata'} == 1 ) {
	my @extra_fields =
	    (
	     'author',
	     'author.email',
	     'creator',
	     'dc.creator',
	     'dc.creator.corporatename',
	     );

	# add the creator_metadata fields to search for
	foreach my $field (@extra_fields) {
	    $creator_fields{$field}=0; # add to lookup hash
	}
    }


    # find the header in the html file, which has the meta tags
    # NOTE(review): if the document has no <head>...</head>, the match
    # fails and $1 below is stale/undef -- presumably all inputs have a
    # head section; confirm.
    $$textref =~ m@<head>(.*?)</head>@si;

    my $html_header=$1;

    # go through every <meta... tag defined in the html and see if it is
    # one of the tags we want to match.

    # special case for title - we want to remember if its been found
    my $found_title = 0;
    # this assumes that ">" won't appear. (I don't think it's allowed to...)
    $html_header =~ m/^/; # match the start of the string, for \G assertion

    while ($html_header =~ m/\G.*?<meta(.*?)>/sig) {
	my $metatag=$1;
	my ($tag, $value);

	# find the tag name
	$metatag =~ m/(?:name|http-equiv)\s*=\s*([\"\'])?(.*?)\1/is;
	$tag=$2;
	# in case they're not using " or ', but they should...
	if (! $tag) {
	    $metatag =~ m/(?:name|http-equiv)\s*=\s*([^\s\>]+)/is;
	    $tag=$1;
	}

	if (!defined $tag) {
	    print $outhandle "HTMLPlugin: can't find NAME in \"$metatag\"\n";
	    next;
	}

	# don't need to assign this field if it was passed in from a previous
	# (recursive) plugin
	if (defined $metadata->{$tag}) {next}

	# find the tag content
	$metatag =~ m/content\s*=\s*([\"\'])?(.*?)\1/is;
	$value=$2;

	# The following code assigns the metaname to value if value is
	# empty. Why would we do this?
	#if (! $value) {
	#    $metatag =~ m/(?:name|http-equiv)\s*=\s*([^\s\>]+)/is;
	#    $value=$1;
	#}
	if (!defined $value || $value eq "") {
	    print $outhandle "HTMLPlugin: can't find VALUE in <meta $metatag >\n" if ($self->{'verbosity'} > 2);
	    next;
	}

	# clean up and add
	$value =~ s/\s+/ /gs;
	chomp($value); # remove trailing \n, if any
	if (exists $creator_fields{lc($tag)}) {
	    # map this value onto greenstone's "Creator" metadata
	    $tag='Creator';
	} elsif (!exists $find_fields{lc($tag)}) {
	    next; # don't want this tag
	} else {
	    # get the user's preferred capitalisation
	    $tag = $find_fields{lc($tag)};
	}
	if (lc($tag) eq "title") {
	    $found_title = 1;
	}

	if ($self->{'verbosity'} > 2) {
	    print $outhandle " extracted \"$tag\" metadata \"$value\"\n";
	}

	if ($tag =~ /\./) {
	    # there is a . so has a namespace, add ex.
	    $tag = "ex.$tag";
	}
	# split multi-valued fields on the separator, if one was configured
	if (defined $separator) {
	    my @values = split($separator, $value);
	    foreach my $v (@values) {
		$doc_obj->add_utf8_metadata($section, $tag, $v) if $v =~ /\S/;
	    }
	}
	else {
	    $doc_obj->add_utf8_metadata($section, $tag, $value);
	}
    }

    # TITLE: extract the document title
    if (exists $find_fields{'title'} && !$found_title) {
	# we want a title, and didn't find one in the meta tags
	# see if there's a <title> tag
	my $title;
	my $from = ""; # for debugging output only
	if ($html_header =~ m/<title[^>]*>([^<]+)<\/title[^>]*>/is) {
	    $title = $1;
	    $from = "<title> tags";
	}

	if (!defined $title) {
	    $from = "first 100 chars";
	    # if no title use first 100 or so characters
	    $title = $$textref;
	    $title =~ s/^\xFE\xFF//; # Remove unicode byte order mark
	    $title =~ s/^.*?<body>//si;
	    # ignore javascript!
	    $title =~ s@<script.*?</script>@ @sig;
	    $title =~ s/<\/([^>]+)><\1>//g; # (eg) </b><b> - no space
	    $title =~ s/<[^>]*>/ /g; # remove all HTML tags
	    $title = substr ($title, 0, 100);
	    $title =~ s/\s\S*$/.../;
	}
	$title =~ s/<[^>]*>/ /g; # remove html tags
	$title =~ s/&nbsp;/ /g;
	$title =~ s/(?:&nbsp;|\xc2\xa0)/ /g; # utf-8 for nbsp...
	$title =~ s/\s+/ /gs; # collapse multiple spaces
	$title =~ s/^\s*//;   # remove leading spaces
	$title =~ s/\s*$//;   # remove trailing spaces

	# optionally strip a user-configured prefix from the title
	$title =~ s/^$self->{'title_sub'}// if ($self->{'title_sub'});
	$title =~ s/^\s+//s; # in case title_sub introduced any...
	$doc_obj->add_utf8_metadata ($section, "Title", $title);
	print $outhandle " extracted Title metadata \"$title\" from $from\n"
	    if ($self->{'verbosity'} > 2);
    }

    # add FileFormat metadata
    $doc_obj->add_metadata($section,"FileFormat", "HTML");

    # Special, for metadata names such as tagH1 - extracts
    # the text between the first <H1> and </H1> tags into "H1" metadata.

    foreach my $field (keys %find_fields) {
	if ($field !~ m/^tag([a-z0-9]+)$/i) {next}
	my $tag = $1;
	if ($$textref =~ m@<$tag[^>]*>(.*?)</$tag[^>]*>@g) {
	    my $content = $1;
	    # strip entities/tags and normalise whitespace in the captured text
	    $content =~ s/&nbsp;/ /g;
	    $content =~ s/<[^>]*>/ /g;
	    $content =~ s/^\s+//;
	    $content =~ s/\s+$//;
	    $content =~ s/\s+/ /gs;
	    if ($content) {
		$tag=$find_fields{"tag$tag"}; # get the user's capitalisation
		$tag =~ s/^tag//i;
		$doc_obj->add_utf8_metadata ($section, $tag, $content);
		print $outhandle " extracted \"$tag\" metadata \"$content\"\n"
		    if ($self->{'verbosity'} > 2);
	    }
	}
    }
}
1441
1442
# Resolve "." and ".." path components in $filename without consulting
# the file system: "./" is dropped, "../" removes the preceding component.
# Returns the rebuilt path ("" for an empty result).
sub eval_dir_dots {
    my $self = shift (@_);
    my ($filename) = @_;

    my $dirsep_os = &util::get_os_dirsep();
    my @resolved = ();
    for my $component (split(/$dirsep_os/, $filename)) {
        if ($component eq "..") {
            # step up one directory
            pop(@resolved);
        }
        elsif ($component ne ".") {
            push(@resolved, $component);
        }
        # a lone "." is simply discarded
    }

    # FileUtils::filenameConcatenate suppresses the leading separator when
    # the first array entry is the empty string (a feature intended to help
    # with relative paths).  An absolute path split above starts with one
    # empty string, so prepend a second empty string to defeat that "smart"
    # behaviour and keep the path absolute.
    if (scalar(@resolved) > 0) {
        if ($resolved[0] eq "") {
            unshift(@resolved, "");
        }
    }

    return (scalar @resolved > 0) ? &FileUtils::filenameConcatenate(@resolved) : "";
}
1481
# Rewrite a usemap attribute value: surrounding quotes are stripped from
# the link (with a double quote re-attached to the neighbouring
# fragments) and a leading "./" is removed.  Returns front+link+back.
sub replace_usemap_links {
    my $self = shift (@_);
    my ($front, $link, $back) = @_;

    my $was_quoted = ($link=~/^[\"\']/);
    if ($was_quoted) {
        $link=~s/^[\"\']//;
        $link=~s/[\"\']$//;
        $front = $front . '"';
        $back  = '"' . $back;
    }

    $link =~ s/^\.\///;
    return join('', $front, $link, $back);
}
1497
# Advance the counter pair used to name renamed associated files.
# Each directory holds files numbered 0..1000; when file_num reaches 1000
# a new directory is started and the file counter wraps back to 0.
sub inc_filecount {
    my $self = shift (@_);

    if ($self->{'file_num'} != 1000) {
        $self->{'file_num'}++;
    }
    else {
        $self->{'dir_num'}++;
        $self->{'file_num'} = 0;
    }
}
1508
1509
1510# Extend read_file so that strings like &eacute; are
1511# converted to UTF8 internally.
1512#
1513# We don't convert &lt; or &gt; or &amp; or &quot; in case
1514# they interfere with the GML files
1515
# Read and decode the source file (via the superclass), then convert HTML
# character entities in $$textref to their Unicode code-point equivalents.
# The three substitutions below are strictly ordered:
#   1. temporarily rename &lt; &gt; &amp; &quot; &nbsp; to &zlt; etc. so
#      they survive the conversion (they could interfere with GML files),
#   2. convert every other &...; entity via ghtml::getcharequiv,
#   3. restore the five protected entities unchanged.
sub read_file {
    my $self = shift(@_);
    my ($filename, $encoding, $language, $textref) = @_;

    $self->SUPER::read_file($filename, $encoding, $language, $textref);

    # Convert entities to their Unicode code-point equivalents
    $$textref =~ s/&(lt|gt|amp|quot|nbsp);/&z$1;/go;
    $$textref =~ s/&([^;]+);/&ghtml::getcharequiv($1,1,1)/gseo;
    $$textref =~ s/&z(lt|gt|amp|quot|nbsp);/&$1;/go;

}
1528
# Read an HTML file for the HB (realistic book) processing path, appending
# everything after the <body> tag to $$text via HB_gettext.  If no <body>
# tag was found at all, the file is re-read from the start so its whole
# content is used instead.
sub HB_read_html_file {
    my $self = shift (@_);
    my ($htmlfile, $text) = @_;

    # load in the file
    # (three-argument open with a lexical handle; the original bareword,
    # two-argument open was unsafe for filenames beginning with mode
    # characters, and the handle is now passed to HB_gettext directly
    # instead of by name)
    my $fh;
    if (!open ($fh, '<', $htmlfile)) {
	print STDERR "ERROR - could not open $htmlfile\n";
	return;
    }

    my $foundbody = 0;
    $self->HB_gettext (\$foundbody, $text, $fh);
    close $fh;

    # just in case there was no <body> tag
    if (!$foundbody) {
	$foundbody = 1;
	open ($fh, '<', $htmlfile) || return;
	$self->HB_gettext (\$foundbody, $text, $fh);
	close $fh;
    }
    # text is in utf8
}
1552
# Append the body of the HTML read from $handle to $$text, converting it
# to a utf8-decoded Perl string (ghtml handles &eacute; etc. elsewhere).
# $foundbody is a scalar ref flag: while false, lines are skipped until a
# <body> tag is seen; pass it in pre-set to 1 to take the whole stream.
sub HB_gettext {
    my $self = shift (@_);
    my ($foundbody, $text, $handle) = @_;

    my $line = "";
    while (defined ($line = <$handle>)) {
	# look for body tag
	if (!$$foundbody) {
	    if ($line =~ s/^.*<body[^>]*>//i) {
		$$foundbody = 1;
	    } else {
		next;
	    }
	}

	# check for symbol fonts
	# NOTE(review): despite the message text, the <font> tag is only
	# warned about here, not actually removed.
	if ($line =~ m/<font [^>]*?face\s*=\s*\"?(\w+)\"?/i) {
	    my $font = $1;
	    print STDERR "HBPlug::HB_gettext - warning removed font $font\n"
		if ($font !~ m/^arial$/i);
	}

	$$text .= $line;
    }

    if ($self->{'input_encoding'} eq "iso_8859_1") {
	# convert to utf-8
	# (note: $text, a scalar ref, is passed straight through to
	# convert2unicode -- presumably it accepts a ref; confirm)
	$$text=&unicode::unicode2utf8(&unicode::convert2unicode("iso_8859_1", $text));
    }
    # convert any alphanumeric character entities to their utf-8
    # equivalent for indexing purposes
    #&ghtml::convertcharentities ($$text);

    $$text =~ s/\s+/ /g; # remove \n's

    # At this point $$text is a binary byte string
    # => turn it into a Unicode aware string, so full
    # Unicode aware pattern matching can be used.
    # For instance: 's/\x{0101}//g' or '[[:upper:]]'
    #

    $$text = decode("utf8",$$text);
}
1597
# Tidy one section of HB (realistic book) HTML: drop unmatched closing
# tags, collapse consecutive <p> tags, strip trailing formatting noise,
# re-wrap the text, and normalise embedded image markup.  Takes the
# section's HTML as a string and returns the cleaned string.
sub HB_clean_section {
    my $self = shift (@_);
    my ($section) = @_;

    # remove tags without a starting tag from the section
    # (loop stops at the first closing tag whose opener appears before it)
    my ($tag, $tagstart);
    while ($section =~ m/<\/([^>]{1,10})>/) {
	$tag = $1;
	$tagstart = index($section, "<$tag");
	last if (($tagstart >= 0) && ($tagstart < index($section, "<\/$tag")));
	$section =~ s/<\/$tag>//;
    }

    # remove extra paragraph tags
    while ($section =~ s/<p\b[^>]*>\s*<p\b/<p/ig) {}

    # remove extra stuff at the end of the section
    while ($section =~ s/(<u>|<i>|<b>|<p\b[^>]*>|&nbsp;|\s)$//i) {}

    # add a newline at the beginning of each paragraph
    $section =~ s/(.)\s*<p\b/$1\n\n<p/gi;

    # add a newline every 80 characters at a word boundary
    # Note: this regular expression puts a line feed before
    # the last word in each section, even when it is not
    # needed.
    $section =~ s/(.{1,80})\s/$1\n/g;

    # fix up the image links: both <img ...> tags and the literal
    # <<I>> image markers are rewritten as centred images
    $section =~ s/<img[^>]*?src=\"?([^\">]+)\"?[^>]*>/
	<center><img src=\"$1\" \/><\/center><br\/>/ig;
    $section =~ s/&lt;&lt;I&gt;&gt;\s*([^\.]+\.(png|jpg|gif))/
	<center><img src=\"$1\" \/><\/center><br\/>/ig;

    return $section;
}
1634
# Will convert the oldHDL format to the new HDL format (using the Section
# tag).  Reads $file, splits it on <<TOCn>> level markers, and writes a
# temporary HTML file ($cnfile) in which the section hierarchy is
# expressed with <Section> tags wrapped in HTML comments.  Returns the
# temporary filename.
sub convert_to_newHDLformat
{
    my $self = shift (@_);
    my ($file,$cnfile) = @_;
    my $input_filename = $file;
    my $tmp_filename = $cnfile;

    # write HTML tmp file with new HDL format
    # (three-argument open with a lexical handle; the original bareword
    # two-argument open embedded the '>' mode in the filename string)
    open (my $prod, '>', $tmp_filename) || die("Error Writing to File: $tmp_filename $!");

    # read in the file and do basic html cleaning (removing header etc)
    my $html = "";
    $self->HB_read_html_file ($input_filename, \$html);

    # process the file one section at a time
    my $curtoclevel = 1;
    my $firstsection = 1;
    my $toclevel = 0;
    while (length ($html) > 0) {
	if ($html =~ s/^.*?(?:<p\b[^>]*>)?((<b>|<i>|<u>|\s)*)&lt;&lt;TOC(\d+)&gt;&gt;\s*(.*?)<p\b/<p/i) {
	    $toclevel = $3;
	    my $title = $4;
	    my $sectiontext = "";
	    # everything up to the next <<TOC marker belongs to this section
	    if ($html =~ s/^(.*?)((?:<p\b[^>]*>)?((<b>|<i>|<u>|\s)*)&lt;&lt;TOC\d+&gt;&gt;)/$2/i) {
		$sectiontext = $1;
	    } else {
		$sectiontext = $html;
		$html = "";
	    }

	    # remove tags and extra spaces from the title
	    $title =~ s/<\/?[^>]+>//g;
	    $title =~ s/^\s+|\s+$//g;

	    # close any sections below the current level and
	    # create a new section (special case for the firstsection)
	    print $prod "<!--\n";
	    while (($curtoclevel > $toclevel) ||
		   (!$firstsection && $curtoclevel == $toclevel)) {
		$curtoclevel--;
		print $prod "</Section>\n";
	    }
	    if ($curtoclevel+1 < $toclevel) {
		print STDERR "WARNING - jump in toc levels in $input_filename " .
		    "from $curtoclevel to $toclevel\n";
	    }
	    while ($curtoclevel < $toclevel) {
		$curtoclevel++;
	    }

	    if ($curtoclevel == 1) {
		# add the header tag
		print $prod "-->\n";
		print $prod "<HTML>\n<HEAD>\n<TITLE>$title</TITLE>\n</HEAD>\n<BODY>\n";
		print $prod "<!--\n";
	    }

	    print $prod "<Section>\n\t<Description>\n\t\t<Metadata name=\"Title\">$title</Metadata>\n\t</Description>\n";

	    print $prod "-->\n";

	    # clean up the section html
	    $sectiontext = $self->HB_clean_section($sectiontext);

	    print $prod "$sectiontext\n";

	} else {
	    print STDERR "WARNING - leftover text\n" , $self->shorten($html),
	    "\nin $input_filename\n";
	    last;
	}
	$firstsection = 0;
    }

    # close all still-open sections
    print $prod "<!--\n";
    while ($curtoclevel > 0) {
	$curtoclevel--;
	print $prod "</Section>\n";
    }
    print $prod "-->\n";

    close ($prod) || die("Error Closing File: $tmp_filename $!");

    return $tmp_filename;
}
1721
# Produce a quoted, display-friendly version of $text for warning
# messages: strings under 100 characters are returned whole; longer ones
# as the first and last 50 characters separated by " ... ".
sub shorten {
    my $self = shift (@_);
    my ($text) = @_;

    if (length($text) < 100) {
        return "\"$text\"";
    }

    my $head = substr ($text, 0, 50);
    my $tail = substr ($text, length($text)-50);
    return "\"$head\" ... \"$tail\"";
}
1731
# Prepare a source file for tidy / realistic-book processing.  Directories
# are returned untouched.  Otherwise the file is copied (and, for
# HTML-like files, optionally converted to the new HDL format and/or
# tidied) into the collection-specific "tidytmp" directory, mirroring its
# position under "import".  Returns the path of the temporary file that
# should be processed in place of the original.
sub convert_tidy_or_oldHDL_file
{
    my $self = shift (@_);
    my ($file) = @_;
    my $input_filename = $file;

    if (-d $input_filename)
    {
	return $input_filename;
    }

    # get the input filename
    my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
    my $base_dirname = $dirname;
    $suffix = lc($suffix);

    # derive tmp filename from input filename
    # Remove any white space from filename -- no risk of name collision, and
    # makes later conversion by utils simpler. Leave spaces in path...
    # also strip dots and hyphens from the tail of the name
    $tailname =~ s/\s+//g;
    $tailname =~ s/\.+//g;
    $tailname =~ s/\-+//g;
    # convert to utf-8 otherwise we have problems with the doc.xml file
    # later on
    &unicode::ensure_utf8(\$tailname);

    # softlink to collection tmp dir
    my $tmp_dirname = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, "tidytmp");
    &util::mk_dir($tmp_dirname) if (!-e $tmp_dirname);

    my $test_dirname = "";
    my $f_separator = &util::get_os_dirsep();

    # Recreate, under tidytmp, the directory structure the file had below
    # the "import" directory.
    # NOTE(review): this relies on the $` and $' prematch/postmatch globals
    # set by the m// immediately above each use -- fragile, preserved as-is.
    if ($dirname =~ m/import$f_separator/)
    {
	$test_dirname = $'; #'

	#print STDERR "init $'\n";

	while ($test_dirname =~ m/[$f_separator]/)
	{
	    my $folderdirname = $`;
	    $tmp_dirname = &FileUtils::filenameConcatenate($tmp_dirname,$folderdirname);
	    &util::mk_dir($tmp_dirname) if (!-e $tmp_dirname);
	    $test_dirname = $'; #'
	}
    }

    my $tmp_filename = &FileUtils::filenameConcatenate($tmp_dirname, "$tailname$suffix");

    # tidy or convert the input file if it is a HTML-like file or it is accepted by the process_exp
    if (($suffix eq ".htm") || ($suffix eq ".html") || ($suffix eq ".shtml"))
    {
	# convert the input file to a new style HDL
	my $hdl_output_filename = $input_filename;
	if ($self->{'old_style_HDL'})
	{
	    $hdl_output_filename = &FileUtils::filenameConcatenate($tmp_dirname, "$tailname$suffix");
	    $hdl_output_filename = $self->convert_to_newHDLformat($input_filename,$hdl_output_filename);
	}

	# copy every other file from the base directory into the tmp
	# directory if it does not already exist there
	opendir(my $base_dh, $base_dirname) or die "Can't open base directory : $base_dirname!";
	my @files = grep {!/^\.+$/} readdir($base_dh);
	# Fix: the original called close() on the directory handle; a
	# directory handle must be released with closedir().
	closedir($base_dh);

	foreach my $file (@files)
	{
	    my $src_file = &FileUtils::filenameConcatenate($base_dirname,$file);
	    my $dest_file = &FileUtils::filenameConcatenate($tmp_dirname,$file);
	    if ((!-e $dest_file) && (!-d $src_file))
	    {
		# just copy the original file back to the tmp directory
		copy($src_file,$dest_file) or die "Can't copy file $src_file to $dest_file $!";
	    }
	}

	# tidy the input file
	my $tidy_output_filename = $hdl_output_filename;
	if ($self->{'use_realistic_book'})
	{
	    $tidy_output_filename = &FileUtils::filenameConcatenate($tmp_dirname, "$tailname$suffix");
	    $tidy_output_filename = $self->tmp_tidy_file($hdl_output_filename,$tidy_output_filename);
	}
	$tmp_filename = $tidy_output_filename;
    }
    else
    {
	if (!-e $tmp_filename)
	{
	    # just copy the original file back to the tmp directory
	    copy($input_filename,$tmp_filename) or die "Can't copy file $input_filename to $tmp_filename $!";
	}
    }

    return $tmp_filename;
}
1830
1831
1832# Will make the html input file as a proper XML file with removed font tag and
1833# image size added to the img tag.
1834# The tidying process takes place in a collection specific 'tmp' directory so
1835# that we don't accidentally damage the input.
# Will make the html input file as a proper XML file with removed font tag and
# image size added to the img tag.
# The tidying process takes place in a collection specific 'tmp' directory so
# that we don't accidentally damage the input.
#
# Arguments: $file   - path of the HTML file to process
#            $cnfile - path of the tmp file to write the result to
# Returns the output path ($cnfile).  The external "tidy" program must be
# on the PATH for the final XML-conversion step.
sub tmp_tidy_file
{
    my $self = shift (@_);
    my ($file,$cnfile) = @_;
    my $input_filename = $file;
    my $tmp_filename = $cnfile;

    # get the input filename (the directory part is needed to resolve
    # relative image paths below)
    my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");

    require HTML::TokeParser::Simple;

    # create HTML parser to decode the input file
    my $parser = HTML::TokeParser::Simple->new($input_filename);

    # write HTML tmp file without the font tag and image size are added to the img tag
    # (3-arg open with a lexical handle replaces the unsafe 2-arg bareword form)
    open (my $prod_fh, '>', $tmp_filename) || die("Error Writing to File: $tmp_filename $!");
    while (my $token = $parser->get_token())
    {
	# is it an img tag
	if ($token->is_start_tag('img'))
	{
	    # get the attributes
	    my $attr = $token->return_attr;

	    # get the full path to the image
	    my $img_file = &FileUtils::filenameConcatenate($dirname,$attr->{src});

	    # set the width and height attribute
	    ($attr->{width}, $attr->{height}) = imgsize($img_file);

	    # recreate the tag
	    print $prod_fh "<img";
	    print $prod_fh map { qq { $_="$attr->{$_}"} } keys %$attr;
	    print $prod_fh ">";
	}
	# is it a font tag
	else
	{
	    if (($token->is_start_tag('font')) || ($token->is_end_tag('font')))
	    {
		# remove font tag
		print $prod_fh "";
	    }
	    else
	    {
		# print without changes
		print $prod_fh $token->as_is;
	    }
	}
    }
    close ($prod_fh) || die("Error Closing File: $tmp_filename $!");

    # run html-tidy on the tmp file to make it a proper XML file

    my $outhandle = $self->{'outhandle'};
    print $outhandle "Converting HTML to be XML compliant:\n";

    my $tidy_cmd = "tidy";
    $tidy_cmd .= " -q" if ($self->{'verbosity'} <= 2);
    $tidy_cmd .= " -raw -wrap 0 -asxml \"$tmp_filename\"";
    if ($self->{'verbosity'} <= 2) {
	# suppress tidy's stderr chatter at low verbosity
	if ($ENV{'GSDLOS'} =~ m/^windows/i) {
	    $tidy_cmd .= " 2>nul";
	}
	else {
	    $tidy_cmd .= " 2>/dev/null";
	}
	print $outhandle " => $tidy_cmd\n";
    }

    my $tidyfile = `$tidy_cmd`;

    # write result back to the tmp file
    open (my $result_fh, '>', $tmp_filename) || die("Error Writing to File: $tmp_filename $!");
    print $result_fh $tidyfile;
    close ($result_fh) || die("Error Closing File: $tmp_filename $!");

    # return the output filename
    return $tmp_filename;
}
1917
# Associate a cover image with the document, delegating to the superclass.
# When realistic-book or old-style-HDL processing is on, the document path
# points into the tidytmp working area, but the cover image still lives
# under import/ -- so remap the path before delegating.
sub associate_cover_image
{
    my $self = shift(@_);
    my ($doc_obj, $filename) = @_;

    my $needs_remap = $self->{'use_realistic_book'} || $self->{'old_style_HDL'};
    if ($needs_remap)
    {
	# swap the "tidytmp" path component back to "import"
	$filename =~ s/([\\\/])tidytmp([\\\/])/$1import$2/;
    }

    return $self->SUPER::associate_cover_image($doc_obj, $filename);
}
1929
1930
19311;
# Note: See TracBrowser for help on using the repository browser.