source: main/trunk/greenstone2/perllib/plugins/HTMLPlugin.pm@ 31492

Last change on this file since 31492 was 31492, checked in by kjdon, 7 years ago

renamed EncodingUtil to CommonUtil and BasePlugin to BaseImporter. The idea is that only top-level plugins that you can specify in your collection get to have "Plugin" in their name. Modified all other plugins to reflect these name changes.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 70.3 KB
1###########################################################################
2#
3# HTMLPlugin.pm -- basic html plugin
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 1999 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27#
28 # Note that this plugin handles frames only in a very simple way,
29 # i.e. each frame is treated as a separate document. This means
30# search results will contain links to individual frames rather
31# than linking to the top level frameset.
32# There may also be some problems caused by the _parent target
33# (it's removed by this plugin)
34#
35
36package HTMLPlugin;
37
38use Encode;
39use Unicode::Normalize 'normalize';
40
41use ReadTextFile;
42use HBPlugin;
43use ghtml;
44use unicode;
45use util;
46use FileUtils;
47use XMLParser;
48
49use File::Copy;
50
51sub BEGIN {
52 @HTMLPlugin::ISA = ('ReadTextFile', 'HBPlugin');
53 die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
54 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/cpan"); # for Image/Size.pm
55}
56
57use Image::Size;
58
59use strict; # every perl program should have this!
60no strict 'refs'; # make an exception so we can use variables as filehandles
61
62my $arguments =
63 [ { 'name' => "process_exp",
64 'desc' => "{BaseImporter.process_exp}",
65 'type' => "regexp",
66 'deft' => &get_default_process_exp() },
67 { 'name' => "block_exp",
68 'desc' => "{BaseImporter.block_exp}",
69 'type' => 'regexp',
70 'deft' => &get_default_block_exp() },
71 { 'name' => "nolinks",
72 'desc' => "{HTMLPlugin.nolinks}",
73 'type' => "flag" },
74 { 'name' => "keep_head",
75 'desc' => "{HTMLPlugin.keep_head}",
76 'type' => "flag" },
77 { 'name' => "no_metadata",
78 'desc' => "{HTMLPlugin.no_metadata}",
79 'type' => "flag" },
80 { 'name' => "metadata_fields",
81 'desc' => "{HTMLPlugin.metadata_fields}",
82 'type' => "string",
83 'deft' => "Title" },
84 { 'name' => "metadata_field_separator",
85 'desc' => "{HTMLPlugin.metadata_field_separator}",
86 'type' => "string",
87 'deft' => "" },
88 { 'name' => "hunt_creator_metadata",
89 'desc' => "{HTMLPlugin.hunt_creator_metadata}",
90 'type' => "flag" },
91 { 'name' => "file_is_url",
92 'desc' => "{HTMLPlugin.file_is_url}",
93 'type' => "flag" },
94 { 'name' => "assoc_files",
95 'desc' => "{HTMLPlugin.assoc_files}",
96 'type' => "regexp",
97 'deft' => &get_default_block_exp() },
98 { 'name' => "rename_assoc_files",
99 'desc' => "{HTMLPlugin.rename_assoc_files}",
100 'type' => "flag" },
101 { 'name' => "title_sub",
102 'desc' => "{HTMLPlugin.title_sub}",
103 'type' => "string",
104 'deft' => "" },
105 { 'name' => "description_tags",
106 'desc' => "{HTMLPlugin.description_tags}",
107 'type' => "flag" },
108 # retain this for backward compatibility (w3mir option was replaced by
109 # file_is_url)
110 { 'name' => "w3mir",
111# 'desc' => "{HTMLPlugin.w3mir}",
112 'type' => "flag",
113 'hiddengli' => "yes"},
114 { 'name' => "no_strip_metadata_html",
115 'desc' => "{HTMLPlugin.no_strip_metadata_html}",
116 'type' => "string",
117 'deft' => "",
118 'reqd' => "no"},
119 { 'name' => "sectionalise_using_h_tags",
120 'desc' => "{HTMLPlugin.sectionalise_using_h_tags}",
121 'type' => "flag" },
122 { 'name' => "use_realistic_book",
123 'desc' => "{HTMLPlugin.tidy_html}",
124 'type' => "flag"},
125 { 'name' => "old_style_HDL",
126 'desc' => "{HTMLPlugin.old_style_HDL}",
127 'type' => "flag"},
128 {'name' => "processing_tmp_files",
129 'desc' => "{BaseImporter.processing_tmp_files}",
130 'type' => "flag",
131 'hiddengli' => "yes"}
132 ];
133
134my $options = { 'name' => "HTMLPlugin",
135 'desc' => "{HTMLPlugin.desc}",
136 'abstract' => "no",
137 'inherits' => "yes",
138 'args' => $arguments };
139
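# Illustrative usage (not part of the original file): in a collection's collect.cfg
# this plugin would typically be listed with any of the options declared above, e.g.
#
#    plugin HTMLPlugin -description_tags -metadata_fields Title,dc.Title<Title>
#
# The option names and defaults are those given in $arguments; the example values
# here are hypothetical.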
140
141sub new {
142 my ($class) = shift (@_);
143 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
144 push(@$pluginlist, $class);
145
146 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
147 push(@{$hashArgOptLists->{"OptList"}},$options);
148
149
150 my $self = new ReadTextFile($pluginlist,$inputargs,$hashArgOptLists);
151
152 if ($self->{'w3mir'}) {
153 $self->{'file_is_url'} = 1;
154 }
155 $self->{'aux_files'} = {};
156 $self->{'dir_num'} = 0;
157 $self->{'file_num'} = 0;
158
159 return bless $self, $class;
160}
161
162 # may want to use (?i)\.(gif|jpe?g|jpe|png|css|js(?:@.*)?)$
163 # if we have e.g. <script language="javascript" src="img/lib.js@123">
164 # blocking is now done by reading through the file and recording all the
165 # images and other files it references
166sub get_default_block_exp {
167 my $self = shift (@_);
168
169 #return q^(?i)\.(gif|jpe?g|jpe|jpg|png|css)$^;
170 return "";
171}
172
173sub get_default_process_exp {
174 my $self = shift (@_);
175
176 # the last option is an attempt to encode the concept of an html query ...
177 return q^(?i)(\.html?|\.shtml|\.shm|\.asp|\.php\d?|\.cgi|.+\?.+=.*)$^;
178}
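# Illustrative examples (not from the original source) of names accepted by the
# default process_exp above: "index.html", "page.shtml", "report.php3", and
# query-style names such as "search.cgi?q=fish" (caught by the final .+\?.+=.* option).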
179
180sub store_block_files
181{
182 my $self =shift (@_);
183 my ($filename_full_path, $block_hash) = @_;
184
185 my $html_fname = $filename_full_path;
186
187 my ($language, $content_encoding) = $self->textcat_get_language_encoding ($filename_full_path);
188 $self->{'store_content_encoding'}->{$filename_full_path} = [$content_encoding, $language];
189
190
191 # read in file ($text will be in the filesystem encoding)
192 my $raw_text = "";
193 $self->read_file_no_decoding($filename_full_path, \$raw_text);
194
195 my $textref = \$raw_text;
196 my $opencom = '(?:<!--|&lt;!(?:&mdash;|&#151;|--))';
197 my $closecom = '(?:-->|(?:&mdash;|&#151;|--)&gt;)';
198 $$textref =~ s/$opencom(.*?)$closecom//gs;
199
200 # Convert entities to their UTF8 equivalents
201 $$textref =~ s/&(lt|gt|amp|quot|nbsp);/&z$1;/go;
202 $$textref =~ s/&([^;]+);/&ghtml::getcharequiv($1,1,0)/gseo; # on this occasion, want it left as utf8
203 $$textref =~ s/&z(lt|gt|amp|quot|nbsp);/&$1;/go;
204
205 my $attval = "\\\"[^\\\"]+\\\"|[^\\s>]+";
206 my @img_matches = ($$textref =~ m/<img[^>]*?src\s*=\s*($attval)[^>]*>/igs);
207 my @usemap_matches = ($$textref =~ m/<img[^>]*?usemap\s*=\s*($attval)[^>]*>/igs);
208 my @link_matches = ($$textref =~ m/<link[^>]*?href\s*=\s*($attval)[^>]*>/igs);
209 my @embed_matches = ($$textref =~ m/<embed[^>]*?src\s*=\s*($attval)[^>]*>/igs);
210 my @tabbg_matches = ($$textref =~ m/<(?:body|table|tr|td)[^>]*?background\s*=\s*($attval)[^>]*>/igs);
211 my @script_matches = ($$textref =~ m/<script[^>]*?src\s*=\s*($attval)[^>]*>/igs);
212
213 if(!defined $self->{'unicode_to_original_filename'}) {
214 # maps from utf8 converted link name -> original filename referred to by (possibly URL-encoded) src url
215 $self->{'unicode_to_original_filename'} = {};
216 }
217
218 foreach my $raw_link (@img_matches, @usemap_matches, @link_matches, @embed_matches, @tabbg_matches, @script_matches) {
219
220 # remove quotes from link at start and end if necessary
221 if ($raw_link =~ m/^\"/) {
222 $raw_link =~ s/^\"//;
223 $raw_link =~ s/\"$//;
224 }
225
226 # remove any anchor names, e.g. foo.html#name becomes foo.html
227 # but watch out for any #'s that are part of entities, such as &#x3B1;
228 $raw_link =~ s/([^&])\#.*$/$1/s;
229
230 # some links may just be anchor names
231 next unless ($raw_link =~ /\S+/);
232
233 if ($raw_link !~ m@^/@ && $raw_link !~ m/^([A-Z]:?)\\/i) {
234 # Turn relative file path into full path
235 my $dirname = &File::Basename::dirname($filename_full_path);
236 $raw_link = &FileUtils::filenameConcatenate($dirname, $raw_link);
237 }
238 $raw_link = $self->eval_dir_dots($raw_link);
239
240 # this is the actual filename on the filesystem (that the link refers to)
241 my $url_original_filename = $self->opt_url_decode($raw_link);
242
243 my ($uses_bytecodes,$exceeds_bytecodes) = &unicode::analyze_raw_string($url_original_filename);
244
245 if ($exceeds_bytecodes) {
246 # We have a link to a file name that is more complicated than a raw byte filename
247 # What we do next depends on the operating system we are on
248
249 if ($ENV{'GSDLOS'} =~ /^(linux|solaris)$/i) {
250 # Assume we're dealing with a UTF-8 encoded filename
251 $url_original_filename = encode("utf8", $url_original_filename);
252 }
253 elsif ($ENV{'GSDLOS'} =~ /^darwin$/i) {
254 # HFS+ is UTF8 with decomposition
255 $url_original_filename = encode("utf8", $url_original_filename);
256 $url_original_filename = normalize('D', $url_original_filename); # Normalization Form D (decomposition)
257 }
258 elsif ($ENV{'GSDLOS'} =~ /^windows$/i) {
259 # Don't need to do anything as later code maps Windows
260 # unicode filenames to DOS short filenames when needed
261 }
262 else {
263 my $outhandle = $self->{'outhandle'};
264 print $outhandle "Warning: Unrecognized operating system ", $ENV{'GSDLOS'}, "\n";
265 print $outhandle " in raw file system encoding of: $raw_link\n";
266 print $outhandle " Assuming filesystem is UTF-8 based.\n";
267 $url_original_filename = encode("utf8", $url_original_filename);
268 }
269 }
270
271 # Convert the (currently raw) link into its Unicode version.
272 # Store the Unicode link along with the url_original_filename
273 my $unicode_url_original_filename = "";
274 $self->decode_text($raw_link,$content_encoding,$language,\$unicode_url_original_filename);
275
276
277 $self->{'unicode_to_original_filename'}->{$unicode_url_original_filename} = $url_original_filename;
278
279
280 if ($url_original_filename ne $unicode_url_original_filename) {
281 my $outhandle = $self->{'outhandle'};
282
283 print $outhandle "URL Encoding $url_original_filename\n";
284 print $outhandle " ->$unicode_url_original_filename\n";
285
286 # make sure not to block the file itself, as happens when an html file links to itself
287 # e.g. if the current file is mary-boleyn/index.html and contains <link rel="canonical" href="index.html" />
288 my $unicode_html_fname = "";
289 $self->decode_text($html_fname,$content_encoding,$language,\$unicode_html_fname);
290 if($unicode_url_original_filename ne $unicode_html_fname) {
291 # Allow for possibility of raw byte version and Unicode versions of file
292 $self->block_filename($block_hash,$unicode_url_original_filename);
293 }
294 }
295
296 # $url_original_filename = &util::upgrade_if_dos_filename($url_original_filename);
297 # TODO now use unicode files in block, do we need this??
298 #$self->block_raw_filename($block_hash,$url_original_filename) if $url_original_filename ne $html_fname;
299
300 # but only add the linked file to the blocklist if the current html file does not link to itself
301
302 }
303}
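# Illustrative example (hypothetical filenames): if import/manual/index.html contains
# <img src="images/fig%20one.png">, the link above is resolved against the HTML file's
# directory and URL-decoded by opt_url_decode() to import/manual/images/fig one.png.
# That name is recorded in 'unicode_to_original_filename' and, where the Unicode and
# raw-byte forms differ, blocked so the image is treated as an associated file rather
# than as a document in its own right.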
304
305# Given a filename in any encoding, will URL decode it to get back the original filename
306# in the original encoding. Because this method is intended to work out the *original*
307 # filename, it does not URL decode any filename if a file by the name of the *URL-encoded*
308# string already exists in the local folder.
309#
310sub opt_url_decode {
311 my $self = shift (@_);
312 my ($raw_link) = @_;
313
314
315 # Replace %XX's in URL with decoded value if required.
316 # Note that the filename may include the %XX in some situations
317
318## if ($raw_link =~ m/\%[A-F0-9]{2}/i) {
319
320 if (($raw_link =~ m/\%[A-F0-9]{2}/i) || ($raw_link =~ m/\&\#x[0-9A-F]+;/i) || ($raw_link =~ m/\&\#[0-9]+;/i)) {
321 if (!-e $raw_link) {
322 $raw_link = &unicode::url_decode($raw_link,1);
323 }
324 }
325
326 return $raw_link;
327}
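# Illustrative example (hypothetical filename): for a link such as "caf%C3%A9.png",
# opt_url_decode() returns the decoded name "café.png" unless a file literally named
# "caf%C3%A9.png" already exists on disk, in which case the link is returned unchanged.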
328
329sub read_into_doc_obj
330{
331 my $self = shift (@_);
332 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
333
334 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
335
336 # Look up the content_encoding and language worked out in the file_block pass for this file.
337 # Store them under local names so they are nice and easy to access.
338 $self->{'content_encoding'} = $self->{'store_content_encoding'}->{$filename_full_path}[0];
339 $self->{'language'} = $self->{'store_content_encoding'}->{$filename_full_path}[1];
340
341 # get the input file
342 my $input_filename = $file;
343 my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
344 $suffix = lc($suffix);
345 my $tidy_filename;
346 if (($self->{'use_realistic_book'}) || ($self->{'old_style_HDL'}))
347 {
348 # because the document has to be sectionalised, set the description tags
349 $self->{'description_tags'} = 1;
350
351 # set the file to be tidied
352 $input_filename = &FileUtils::filenameConcatenate($base_dir,$file) if $base_dir =~ m/\w/;
353
354 # get the tidied file
355 #my $tidy_filename = $self->tmp_tidy_file($input_filename);
356 $tidy_filename = $self->convert_tidy_or_oldHDL_file($input_filename);
357
358 # derive tmp filename from input filename
359 my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($tidy_filename, "\\.[^\\.]+\$");
360
361 # set the new input file and base_dir to be from the tidied file
362 $file = "$tailname$suffix";
363 $base_dir = $dirname;
364 }
365
366 # call the parent read_into_doc_obj
367 my ($process_status,$doc_obj) = $self->SUPER::read_into_doc_obj($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli);
368 if (($self->{'use_realistic_book'}) || ($self->{'old_style_HDL'}))
369 {
370 # now we need to reset the filenames in the doc obj so that the converted filenames are not used
371 my $collect_file = &util::filename_within_collection($filename_full_path);
372 $doc_obj->set_source_filename ($collect_file, $self->{'file_rename_method'});
373 ## set_source_filename does not set the doc_obj source_path which is used in archives dbs for incremental
374 # build. So set it manually.
375 $doc_obj->set_source_path($filename_full_path);
376 my $collect_conv_file = &util::filename_within_collection($tidy_filename);
377 $doc_obj->set_converted_filename($collect_conv_file);
378
379 my $plugin_filename_encoding = $self->{'filename_encoding'};
380 my $filename_encoding = $self->deduce_filename_encoding($file,$metadata,$plugin_filename_encoding);
381 $self->set_Source_metadata($doc_obj, $filename_full_path, $filename_encoding);
382 }
383
384 delete $self->{'store_content_encoding'}->{$filename_full_path};
385 $self->{'content_encoding'} = undef;
386
387 return ($process_status,$doc_obj);
388}
389
390# do plugin specific processing of doc_obj
391sub process {
392 my $self = shift (@_);
393 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
394 my $outhandle = $self->{'outhandle'};
395
396 if ($ENV{'GSDLOS'} =~ m/^windows/i) {
397 # this makes life so much easier... perl can cope with unix-style '/'s.
398 $base_dir =~ s@(\\)+@/@g;
399 $file =~ s@(\\)+@/@g;
400 }
401
402 my $filename = &FileUtils::filenameConcatenate($base_dir,$file);
403 my $upgraded_base_dir = &util::upgrade_if_dos_filename($base_dir);
404 my $upgraded_filename = &util::upgrade_if_dos_filename($filename);
405
406 if ($ENV{'GSDLOS'} =~ m/^windows/i) {
407 # And again
408 $upgraded_base_dir =~ s@(\\)+@/@g;
409 $upgraded_filename =~ s@(\\)+@/@g;
410
411 # Need to make sure there is a '/' on the end of upgraded_base_dir
412 if (($upgraded_base_dir ne "") && ($upgraded_base_dir !~ m/\/$/)) {
413 $upgraded_base_dir .= "/";
414 }
415 }
416 my $upgraded_file = &util::filename_within_directory($upgraded_filename,$upgraded_base_dir);
417
418 # reset per-doc stuff...
419 $self->{'aux_files'} = {};
420 $self->{'dir_num'} = 0;
421 $self->{'file_num'} = 0;
422
423 # process an HTML file where sections are divided by heading tags (H1, H2 ...)
424 # you can also include metadata in the format (X can be any number)
425 # <hX>Title<!--gsdl-metadata
426 # <Metadata name="name1">value1</Metadata>
427 # ...
428 # <Metadata name="nameN">valueN</Metadata>
429 #--></hX>
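# An illustrative (hypothetical) heading using the convention described above:
# <h1>Chapter One<!--gsdl-metadata
# <Metadata name="Creator">J. Smith</Metadata>
# --></h1>
# opens a new Section titled "Chapter One" carrying the Creator metadata.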
430 if ($self->{'sectionalise_using_h_tags'}) {
431 # description_tags should always be activated because we convert headings to description tags
432 $self->{'description_tags'} = 1;
433
434 my $arrSections = [];
435 $$textref =~ s/<h([0-9]+)[^>]*>(.*?)<\/h[0-9]+>/$self->process_heading($1, $2, $arrSections, $upgraded_file)/isge;
436
437 if (scalar(@$arrSections)) {
438 my $strMetadata = $self->update_section_data($arrSections, -1);
439 if (length($strMetadata)) {
440 $strMetadata = '<!--' . $strMetadata . "\n-->\n</body>";
441 $$textref =~ s/<\/body>/$strMetadata/ig;
442 }
443 }
444 }
445
446 my $cursection = $doc_obj->get_top_section();
447
448 $self->extract_metadata ($textref, $metadata, $doc_obj, $cursection)
449 unless $self->{'no_metadata'} || $self->{'description_tags'};
450
451 # Store URL for page as metadata - this can be used for an
452 # altavista style search interface. The URL won't be valid
453 # unless the file structure contains the domain name (i.e.
454 # like when w3mir is used to download a website).
455
456 # URL metadata (even invalid ones) are used to support internal
457 # links, so even if 'file_is_url' is off, still need to store info
458
459 my ($tailname,$dirname) = &File::Basename::fileparse($upgraded_file);
460
461# my $utf8_file = $self->filename_to_utf8_metadata($file);
462# $utf8_file =~ s/&\#095;/_/g;
463# variable below used to be utf8_file
464
465 my $url_encoded_file = &unicode::raw_filename_to_url_encoded($tailname);
466 my $utf8_url_encoded_file = &unicode::raw_filename_to_utf8_url_encoded($tailname);
467
468 my $web_url = "http://";
469 my $utf8_web_url = "http://";
470
471 if(defined $dirname) { # local directory
472
473 # Check for "ftp" in the domain name of the directory
474 # structure to determine if this URL should be a ftp:// URL
475 # This check is not infallible, but better than omitting the
476 # check, which would cause all files downloaded from ftp sites
477 # via mirroring with wget to have potentially erroneous http:// URLs
478 # assigned in their metadata
479 if ($dirname =~ /^[^\/]*ftp/i)
480 {
481 $web_url = "ftp://";
482 $utf8_web_url = "ftp://";
483 }
484 $dirname = $self->eval_dir_dots($dirname);
485 $dirname .= &util::get_dirsep() if $dirname ne ""; # if there's a directory, it should end on "/"
486
487 # this local directory in import may need to be URL encoded like the file
488 my $url_encoded_dir = &unicode::raw_filename_to_url_encoded($dirname);
489 my $utf8_url_encoded_dir = &unicode::raw_filename_to_utf8_url_encoded($dirname);
490
491 # changed here
492 $web_url = $web_url.$url_encoded_dir.$url_encoded_file;
493 $utf8_web_url = $utf8_web_url.$utf8_url_encoded_dir.$utf8_url_encoded_file;
494 } else {
495 $web_url = $web_url.$url_encoded_file;
496 $utf8_web_url = $utf8_web_url.$utf8_url_encoded_file;
497 }
498 $web_url =~ s/\\/\//g;
499 $utf8_web_url =~ s/\\/\//g;
500
501 if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
502 print STDERR "*******DEBUG: upgraded_file: $upgraded_file\n";
503 print STDERR "*******DEBUG: adding URL metadata: $utf8_url_encoded_file\n";
504 print STDERR "*******DEBUG: web url: $web_url\n";
505 print STDERR "*******DEBUG: utf8 web url: $utf8_web_url\n";
506 }
507
508
509 $doc_obj->add_utf8_metadata($cursection, "URL", $web_url);
510 $doc_obj->add_utf8_metadata($cursection, "UTF8URL", $utf8_web_url);
511
512 if ($self->{'file_is_url'}) {
513 $doc_obj->add_metadata($cursection, "weblink", "<a href=\"$web_url\">");
514 $doc_obj->add_metadata($cursection, "webicon", "_iconworld_");
515 $doc_obj->add_metadata($cursection, "/weblink", "</a>");
516 }
517
518 if ($self->{'description_tags'}) {
519 # remove the html header - note that doing this here means any
520 # sections defined within the header will be lost (so all <Section>
521 # tags must appear within the body of the HTML)
522 my ($head_keep) = ($$textref =~ m/^(.*?)<body[^>]*>/is);
523
524 $$textref =~ s/^.*?<body[^>]*>//is;
525 $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
526
527 my $opencom = '(?:<!--|&lt;!(?:&mdash;|&#151;|--))';
528 my $closecom = '(?:-->|(?:&mdash;|&#151;|--)&gt;)';
529
530 my $lt = '(?:<|&lt;)';
531 my $gt = '(?:>|&gt;)';
532 my $quot = '(?:"|&quot;|&rdquo;|&ldquo;)';
533
534 my $dont_strip = '';
535 if ($self->{'no_strip_metadata_html'}) {
536 ($dont_strip = $self->{'no_strip_metadata_html'}) =~ s{,}{|}g;
537 }
538
539 my $found_something = 0; my $top = 1;
540 while ($$textref =~ s/^(.*?)$opencom(.*?)$closecom//s) {
541 my $text = $1;
542 my $comment = $2;
543 if (defined $text) {
544 # text before a comment - note that getting to here
545 # doesn't necessarily mean there are Section tags in
546 # the document
547 $self->process_section(\$text, $upgraded_base_dir, $upgraded_file, $doc_obj, $cursection);
548 }
549 while ($comment =~ s/$lt(.*?)$gt//s) {
550 my $tag = $1;
551 if ($tag eq "Section") {
552 $found_something = 1;
553 $cursection = $doc_obj->insert_section($doc_obj->get_end_child($cursection)) unless $top;
554 $top = 0;
555 } elsif ($tag eq "/Section") {
556 $found_something = 1;
557 $cursection = $doc_obj->get_parent_section ($cursection);
558 } elsif ($tag =~ m/^Metadata name=$quot(.*?)$quot/s) {
559 my $metaname = $1;
560 my $accumulate = $tag =~ m/mode=${quot}accumulate${quot}/ ? 1 : 0;
561 $comment =~ s/^(.*?)$lt\/Metadata$gt//s;
562 my $metavalue = $1;
563 $metavalue =~ s/^\s+//;
564 $metavalue =~ s/\s+$//;
565 # assume that no metadata value intentionally includes
566 # carriage returns or HTML tags (if they're there they
567 # were probably introduced when converting to HTML from
568 # some other format).
569 # actually some people want to have html tags in their
570 # metadata.
571 $metavalue =~ s/[\cJ\cM]/ /sg;
572 $metavalue =~ s/<[^>]+>//sg
573 unless $dont_strip && ($dont_strip eq 'all' || $metaname =~ m/^($dont_strip)$/);
574 $metavalue =~ s/\s+/ /sg;
575 if ($metaname =~ /\./) { # has a namespace
576 $metaname = "ex.$metaname";
577 }
578 if ($accumulate) {
579 $doc_obj->add_utf8_metadata($cursection, $metaname, $metavalue);
580 } else {
581 $doc_obj->set_utf8_metadata_element($cursection, $metaname, $metavalue);
582 }
583 } elsif ($tag eq "Description" || $tag eq "/Description") {
584 # do nothing with containing Description tags
585 } else {
586 # simple HTML tag (probably created by the conversion
587 # to HTML from some other format) - we'll ignore it and
588 # hope for the best ;-)
589 }
590 }
591 }
592 if ($cursection ne "") {
593 print $outhandle "HTMLPlugin: WARNING: $upgraded_file contains unmatched <Section></Section> tags\n";
594 }
595
596 $$textref =~ s/^.*?<body[^>]*>//is;
597 $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
598 if ($$textref =~ m/\S/) {
599 if (!$found_something) {
600 if ($self->{'verbosity'} > 2) {
601 print $outhandle "HTMLPlugin: WARNING: $upgraded_file appears to contain no Section tags so\n";
602 print $outhandle " will be processed as a single section document\n";
603 }
604
605 # go ahead and process single-section document
606 $self->process_section($textref, $upgraded_base_dir, $upgraded_file, $doc_obj, $cursection);
607
608 # if document contains no Section tags we'll go ahead
609 # and extract metadata (this won't have been done
610 # above as the -description_tags option prevents it)
611 my $complete_text = $head_keep.$doc_obj->get_text($cursection);
612 $self->extract_metadata (\$complete_text, $metadata, $doc_obj, $cursection)
613 unless $self->{'no_metadata'};
614
615 } else {
616 print $outhandle "HTMLPlugin: WARNING: $upgraded_file contains the following text outside\n";
617 print $outhandle " of the final closing </Section> tag. This text will\n";
618 print $outhandle " be ignored.";
619
620 my ($text);
621 if (length($$textref) > 30) {
622 $text = substr($$textref, 0, 30) . "...";
623 } else {
624 $text = $$textref;
625 }
626 $text =~ s/\n/ /isg;
627 print $outhandle " ($text)\n";
628 }
629 } elsif (!$found_something) {
630
631 if ($self->{'verbosity'} > 2) {
632 # may get to here if document contained no valid Section
633 # tags but did contain some comments. The text will have
634 # been processed already but we should print the warning
635 # as above and extract metadata
636 print $outhandle "HTMLPlugin: WARNING: $upgraded_file appears to contain no Section tags and\n";
637 print $outhandle " is blank or empty. Metadata will be assigned if present.\n";
638 }
639
640 my $complete_text = $head_keep.$doc_obj->get_text($cursection);
641 $self->extract_metadata (\$complete_text, $metadata, $doc_obj, $cursection)
642 unless $self->{'no_metadata'};
643 }
644 $self->replace_section_links($doc_obj);
645 } else {
646
647 # remove header and footer
648 if (!$self->{'keep_head'} || $self->{'description_tags'}) {
649 $$textref =~ s/^.*?<body[^>]*>//is;
650 $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
651 }
652
653 $self->{'css_assoc_files'} = {};
654
655 # single section document
656 $self->process_section($textref, $upgraded_base_dir, $upgraded_file, $doc_obj, $cursection);
657
658 #my $upgraded_filename_dirname = &File::Basename::dirname($upgraded_filename);
659
660 $self->acquire_css_associated_files($doc_obj, $cursection);
661
662 $self->{'css_assoc_files'} = {};
663 }
664
665 return 1;
666}
667
668
669sub process_heading
670{
671 my ($self, $nHeadNo, $strHeadingText, $arrSections, $file) = @_;
672 $strHeadingText = '' if (!defined($strHeadingText));
673
674 my $strMetadata = $self->update_section_data($arrSections, int($nHeadNo));
675
676 my $strSecMetadata = '';
677 while ($strHeadingText =~ s/<!--gsdl-metadata(.*?)-->//is)
678 {
679 $strSecMetadata .= $1;
680 }
681
682 $strHeadingText =~ s/^\s+//g;
683 $strHeadingText =~ s/\s+$//g;
684 $strSecMetadata =~ s/^\s+//g;
685 $strSecMetadata =~ s/\s+$//g;
686
687 $strMetadata .= "\n<Section>\n\t<Description>\n\t\t<Metadata name=\"Title\">" . $strHeadingText . "</Metadata>\n";
688
689 if (length($strSecMetadata)) {
690 $strMetadata .= "\t\t" . $strSecMetadata . "\n";
691 }
692
693 $strMetadata .= "\t</Description>\n";
694
695 return "<!--" . $strMetadata . "-->";
696}
697
698
699sub update_section_data
700{
701 my ($self, $arrSections, $nCurTocNo) = @_;
702 my ($strBuffer, $nLast, $nSections) = ('', 0, scalar(@$arrSections));
703
704 if ($nSections == 0) {
705 push @$arrSections, $nCurTocNo;
706 return $strBuffer;
707 }
708 $nLast = $arrSections->[$nSections - 1];
709 if ($nCurTocNo > $nLast) {
710 push @$arrSections, $nCurTocNo;
711 return $strBuffer;
712 }
713 for(my $i = $nSections - 1; $i >= 0; $i--) {
714 if ($nCurTocNo <= $arrSections->[$i]) {
715 $strBuffer .= "\n</Section>";
716 pop @$arrSections;
717 }
718 }
719 push @$arrSections, $nCurTocNo;
720 return $strBuffer;
721}
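# Worked example (illustrative): if $arrSections holds [1, 2] (an open <h1> section with
# an open <h2> inside it) and the next heading is another <h2>, the loop above emits one
# closing </Section>, pops the old level and pushes the new one, leaving [1, 2]. A
# following <h1> would instead close both open sections before being pushed.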
722
723
724# note that process_section may be called multiple times for a single
725# section (relying on the fact that add_utf8_text appends the text to any
726# that may exist already).
727sub process_section {
728 my $self = shift (@_);
729 my ($textref, $base_dir, $file, $doc_obj, $cursection) = @_;
730
731 my @styleTagsText = ($$textref =~ m/<style[^>]*>([^<]*)<\/style>/sg);
732 if(scalar(@styleTagsText) > 0)
733 {
734 my $css_filename_dirname = &File::Basename::dirname(&FileUtils::filenameConcatenate($base_dir, $file));
735 foreach my $styleText (@styleTagsText)
736 {
737 $self->acquire_css_associated_files_from_text_block($styleText, $css_filename_dirname);
738 }
739 }
740
741 # trap links
742 if (!$self->{'nolinks'}) {
743 # usemap="./#index" not handled correctly => change to "#index"
744## $$textref =~ s/(<img[^>]*?usemap\s*=\s*[\"\']?)([^\"\'>\s]+)([\"\']?[^>]*>)/
745
746## my $opencom = '(?:<!--|&lt;!(?:&mdash;|&#151;|--))';
747## my $closecom = '(?:-->|(?:&mdash;|&#151;|--)&gt;)';
748
749 $$textref =~ s/(<img[^>]*?usemap\s*=\s*)((?:[\"][^\"]+[\"])|(?:[\'][^\']+[\'])|(?:[^\s\/>]+))([^>]*>)/
750 $self->replace_usemap_links($1, $2, $3)/isge;
751
752 $$textref =~ s/(<(?:a|area|frame|link|script)\s+[^>]*?\s*(?:href|src)\s*=\s*)((?:[\"][^\"]+[\"])|(?:[\'][^\']+[\'])|(?:[^\s\/>]+))([^>]*>)/
753 $self->replace_href_links ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
754
755## $$textref =~ s/($opencom.*?)?+(<(?:a|area|frame|link|script)\s+[^>]*?\s*(?:href|src)\s*=\s*)((?:[\"][^\"]+[\"])|(?:[\'][^\']+[\'])|(?:[^\s\/>]+))([^>]*>)(.*?$closecom)?+/
756# $self->replace_href_links ($1, $2, $3, $4, $5, $base_dir, $file, $doc_obj, $cursection)/isge;
757 }
758
759 # trap images
760
761 # Previously, by default, HTMLPlugin would embed <img> tags inside anchor tags,
762 # i.e. <a href="image"><img src="image"></a>, in order to overcome a problem that
763 # turned regular text following images into links. That is, by embedding <img>s
764 # inside <a href=""></a>, the text following images no longer misbehaved.
765 # However, there were many occasions where an image was not meant to link
766 # to its source image but to another web page instead.
767 # To allow this, the no_image_links option was introduced: it prevented
768 # the behaviour of embedding images into links that referenced the source images.
769 
770 # Somewhere along the line, the problem of normal text turning into links when
771 # that text followed images not embedded in <a href=""></a> ceased
772 # to occur. This is why the following lines have been commented out (as well as
773 # two lines in replace_images). They appear to no longer apply.
774 
775 # If, at any time, there is a need for having images embedded in <a> anchor tags,
776 # then it might be better to turn that into an HTMLPlugin option rather than make
777 # it the default behaviour. Also, eventually, no_image_links needs to become
778 # a deprecated option for HTMLPlugin, as it has now become the default behaviour.
779
780 #if(!$self->{'no_image_links'}){
781 $$textref =~ s/(<(?:img|embed|table|tr|td)[^>]*?(?:src|background)\s*=\s*)((?:[\"][^\"]+[\"])|(?:[\'][^\']+[\'])|(?:[^\s\/>]+))([^>]*>)/
782 $self->replace_images ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
783 #}
784
785 # add text to document object
786 # turn \ into \\ so that the rest of greenstone doesn't think there
787 # is an escape code following. (Macro parsing loses them...)
788 $$textref =~ s/\\/\\\\/go;
789
790 $doc_obj->add_utf8_text($cursection, $$textref);
791}
792
793sub replace_images {
794 my $self = shift (@_);
795 my ($front, $link, $back, $base_dir,
796 $file, $doc_obj, $section) = @_;
797
798 # remove quotes from link at start and end if necessary
799 if ($link=~/^[\"\']/) {
800 $link=~s/^[\"\']//;
801 $link=~s/[\"\']$//;
802 $front.='"';
803 $back="\"$back";
804 }
805
806 $link =~ s/\n/ /g;
807
808 # Hack to overcome Windows wv 0.7.1 bug that causes embedded images to be broken
809 # If the Word file path has spaces in it, wv messes up and you end up with
810 # absolute paths for the images, and without the "file://" prefix
811 # So check for this special case and massage the data to be correct
812 if ($ENV{'GSDLOS'} =~ m/^windows/i && $self->{'plugin_type'} eq "WordPlug" && $link =~ m/^[A-Za-z]\:\\/) {
813 $link =~ s/^.*\\([^\\]+)$/$1/;
814 }
815
816 my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
817
818 my $img_file = $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section);
819
820# print STDERR "**** link = $link\n**** href = $href\n**** img_file = $img_file, rl = $rl\n\n";
821
822 my $anchor_name = $img_file;
823 #$anchor_name =~ s/^.*\///;
824 #$anchor_name = "<a name=\"$anchor_name\" ></a>";
825
826 my $image_link = $front . $img_file .$back;
827 return $image_link;
828
829 # The reasons for why the following two lines are no longer necessary can be
830 # found in subroutine process_section
831 #my $anchor_link = "<a href=\"$img_file\" >".$image_link."</a>";
832 #return $anchor_link;
833
834 #return $front . $img_file . $back . $anchor_name;
835}
836
837sub replace_href_links {
838 my $self = shift (@_);
839 my ($front, $link, $back, $base_dir, $file, $doc_obj, $section) = @_;
840
841 if($front =~ m/^<link / && $link =~ m/\.css"$/)
842 {
843 my $actual_link = $link;
844 $actual_link =~ s/^"(.*)"$/$1/;
845
846 my $directory = &File::Basename::dirname($file);
847
848 my $css_filename = &FileUtils::filenameConcatenate($base_dir, $directory, $actual_link);
849 $self->retrieve_css_associated_files($css_filename);
850 }
851
852 # remove quotes from link at start and end if necessary
853 if ($link=~/^[\"\']/) {
854 $link=~s/^[\"\']//;
855 $link=~s/[\"\']$//;
856 $front.='"';
857 $back="\"$back";
858 }
859
860 # can't remember adding this :-( must have had a reason though...
861 if ($link =~ /^\_http/ || $link =~ /^\_libraryname\_/) {
862 # assume it is a greenstone one and leave alone
863 return $front . $link . $back;
864 }
865
866 # attempt to sort out targets - frames are not handled
867 # well in this plugin and some cases will screw things
868 # up - e.g. the _parent target (so we'll just remove
869 # them all ;-)
870 $front =~ s/(target=\"?)_top(\"?)/$1_gsdltop_$2/is;
871 $back =~ s/(target=\"?)_top(\"?)/$1_gsdltop_$2/is;
872 $front =~ s/target=\"?_parent\"?//is;
873 $back =~ s/target=\"?_parent\"?//is;
874
875 if($link =~ m/^\#/s)
876 {
877 return $front . "_httpsamepagelink_" . $link . $back;
878 }
879
880 $link =~ s/\n/ /g;
881
882 # Find file referred to by $link on file system
883 # This is more complicated than it sounds when char encodings
884 # is taken in to account
885 my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
886
887 # href may use '\'s where '/'s should be on Windows
888 $href =~ s/\\/\//g;
889 my ($filename) = $href =~ m/^(?:.*?):(?:\/\/)?(.*)/;
890
891 ##### leave all these links alone (they won't be picked up by intermediate
892 ##### pages). I think that's safest when dealing with frames, targets etc.
893 ##### (at least until I think of a better way to do it). Problems occur with
894 ##### mailto links from within small frames, the intermediate page is displayed
895 ##### within that frame and can't be seen. There is still potential for this to
896 ##### happen even with html pages - the solution seems to be to somehow tell
897 ##### the browser from the server side to display the page being sent (i.e.
898 ##### the intermediate page) in the top level window - I'm not sure if that's
899 ##### possible - the following line should probably be deleted if that can be done
900 return $front . $link . $back if $href =~ m/^(mailto|news|gopher|nntp|telnet|javascript):/is;
901
902 if (($rl == 0) || ($filename =~ m/$self->{'process_exp'}/) ||
903 ($href =~ m/\/$/) || ($href =~ m/^(mailto|news|gopher|nntp|telnet|javascript):/i)) {
904
905 if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
906
907 # Don't do any encoding for now, as not clear what
908 # the right thing to do is to support filename
909 # encoding on Windows when they are not UTF16
910 #
911 }
912 else {
913 # => Unix-based system
914
915 # If web page didn't give encoding, then default to utf8
916 my $content_encoding= $self->{'content_encoding'} || "utf8";
917
918 if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
919 print STDERR "**** Encoding with '$content_encoding', href: $href\n";
920 }
921
922 # on Darwin, the unicode filenames are stored on the file
923 # system in decomposed form, so any href link (including when
924 # URL-encoded) should refer to the decomposed name of the file
925 if ($ENV{'GSDLOS'} =~ /^darwin$/i) {
926 $href = normalize('D', $href); # Normalization Form D (decomposition)
927 }
928
929 $href = encode($content_encoding,$href);
930 }
931
932 $href = &unicode::raw_filename_to_utf8_url_encoded($href);
933 $href = &unicode::filename_to_url($href);
934
935 &ghtml::urlsafe ($href);
936
937 if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
938 print STDERR "******DEBUG: href=$href\n";
939 }
940
941 #TODO here
942# if ($rl ==1) {
943 # have a relative link, we need to do URL encoding etc so it matches what has happened for that file
944 #$href = &util::rename_file($href, $self->{'file_rename_method'});
945# $href = &unicode::raw_filename_to_url_encoded($href);
946 # then, this might be url encoded, so we replace % with %25
947# $href = &unicode::filename_to_url($href);
948# print STDERR "DEBUG: url encoded href = $href\n";
949# }
950
951 return $front . "_httpextlink_&amp;rl=" . $rl . "&amp;href=" . $href . $hash_part . $back;
952 } else {
953 # link is to some other type of file (e.g., an image) so we'll
954 # need to associate that file
955 return $front . $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section) . $back;
956 }
957}
958
959sub retrieve_css_associated_files {
960 my $self = shift (@_);
961 my ($css_filename) = @_;
962
963 my $css_filename_dirname = &File::Basename::dirname($css_filename);
964
965 open (CSSFILE, $css_filename) || return;
966 sysread (CSSFILE, my $file_string, -s CSSFILE);
967
968 $self->acquire_css_associated_files_from_text_block($file_string, $css_filename_dirname) unless !defined $file_string;
969
970 close CSSFILE;
971}
972
973sub acquire_css_associated_files_from_text_block {
974 my $self = shift (@_);
975 my ($text, $css_filename_dirname) = @_;
976
977 my @image_urls = ($text =~ m/background-image:\s*url[^;]*;/sg);
978 foreach my $img_url (@image_urls)
979 {
980 $img_url =~ s/^.*url.*\((.*)\).*$/$1/;
981 $img_url =~ s/^\s*"?([^"]*)"?\s*$/$1/;
982
983 $self->{'css_assoc_files'}->{&FileUtils::filenameConcatenate($css_filename_dirname, $img_url)} = $img_url;
984 }
985}
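# Illustrative example (hypothetical stylesheet content): a rule such as
#    body { background-image: url("images/bg.png"); }
# found in a <style> block or a linked .css file causes images/bg.png (resolved against
# the stylesheet's directory) to be remembered in 'css_assoc_files' and later attached
# to the document by acquire_css_associated_files() below.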
986
987sub acquire_css_associated_files {
988 my $self = shift(@_);
989
990 my ($doc_obj, $section) = @_;
991
992 foreach my $image_filename (keys %{$self->{'css_assoc_files'}})
993 {
994 $doc_obj->associate_file($image_filename, $self->{'css_assoc_files'}->{$image_filename}, undef, $section);
995 }
996}
997
998sub add_file {
999 my $self = shift (@_);
1000 my ($href, $rl, $hash_part, $base_dir, $doc_obj, $section) = @_;
1001 my ($newname);
1002
1003 my $filename = $href;
1004 if ($base_dir eq "") {
1005 if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
1006 # remove http://
1007 $filename =~ s/^[^:]*:\/\///;
1008 }
1009 else {
1010 # remove http:/ thereby leaving one slash at the start as
1011 # part of full pathname
1012 $filename =~ s/^[^:]*:\///;
1013 }
1014 }
1015 else {
1016 # remove http://
1017 $filename =~ s/^[^:]*:\/\///;
1018 }
1019
1020 if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
1021 $filename =~ s@\/@\\@g;
1022 }
1023
1024 $filename = &FileUtils::filenameConcatenate($base_dir, $filename);
1025
1026 if (($self->{'use_realistic_book'}) || ($self->{'old_style_HDL'})) {
1027 # we are processing a tidytmp file - want paths to be in import
1028 $filename =~ s/([\\\/])tidytmp([\\\/])/$1import$2/;
1029 }
1030
1031 # Replace %XX's in URL with decoded value if required. Note that the
1032 # filename may include the %XX in some situations. If the *original*
1033 # file's name was in URL encoding, the following method will not decode
1034 # it.
1035 my $unicode_filename = $filename;
1036 my $opt_decode_unicode_filename = $self->opt_url_decode($unicode_filename);
1037
1038 # wvWare can generate <img src="StrangeNoGraphicData"> tags, but (it seems)
1039 # with no accompanying file
1040 if ($opt_decode_unicode_filename =~ m/StrangeNoGraphicData$/) { return ""; }
1041
1042 my $content_encoding= $self->{'content_encoding'} || "utf8";
1043
1044 if ($ENV{'GSDLOS'} =~ /^(linux|solaris)$/i) {
1045 # The filenames that come through the HTML file have been decoded
1046 # into Unicode aware Perl strings. Need to convert them back
1047 # to their initial raw-byte encoding to match the file that
1048 # exists on the file system
1049 $filename = encode($content_encoding, $opt_decode_unicode_filename);
1050
1051 }
1052 elsif ($ENV{'GSDLOS'} =~ /^darwin$/i) {
1053 # HFS+ is UTF8 with decomposition
1054 $filename = encode($content_encoding, $opt_decode_unicode_filename);
1055 $filename = normalize('D', $filename); # Normalization Form D (decomposition)
1056
1057 }
1058 elsif ($ENV{'GSDLOS'} =~ /^windows$/i) {
1059 my $long_filename = Win32::GetLongPathName($opt_decode_unicode_filename);
1060 
1061 if (defined $long_filename) {
1062 my $short_filename = Win32::GetShortPathName($long_filename); # DOS 8.3 short form, as noted in store_block_files
1063 $filename = $short_filename;
1064 }
1065# else {
1066# print STDERR "***** failed to map href to real file:\n";
1067# print STDERR "****** $href -> $opt_decode_unicode_filename\n";
1068# }
1069 }
1070 else {
1071 my $outhandle = $self->{'outhandle'};
1072 print $outhandle "Warning: Unrecognized operating system ", $ENV{'GSDLOS'}, "\n";
1073 print $outhandle " in file system encoding of href: $href\n";
1074 print $outhandle " No character encoding done.\n";
1075 }
1076
1077
1078 # some special processing if the intended filename was converted to utf8, but
1079 # the actual file still needs to be renamed
1080 if (!&FileUtils::fileExists($filename)) {
1081 # try the original filename stored in map
1082 if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
1083 print STDERR "******!! orig filename did not exist: $filename\n";
1084 }
1085
1086## print STDERR "**** trying to look up unicode_filename: $unicode_filename\n";
1087
1088 my $original_filename = $self->{'unicode_to_original_filename'}->{$unicode_filename};
1089
1090 if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
1091 print STDERR "****** From lookup unicode_filename, now trying for: $original_filename\n";
1092 }
1093
1094 if (defined $original_filename && -e $original_filename) {
1095 if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
1096 print STDERR "****** Found match!\n";
1097 }
1098 $filename = $original_filename;
1099 }
1100 }
1101
1102 my ($ext) = $filename =~ m/(\.[^\.]*)$/;
1103
1104 if ($rl == 0) {
1105 if ((!defined $ext) || ($ext !~ m/$self->{'assoc_files'}/)) {
1106 return "_httpextlink_&amp;rl=0&amp;el=prompt&amp;href=" . $href . $hash_part;
1107 }
1108 else {
1109 return "_httpextlink_&amp;rl=0&amp;el=direct&amp;href=" . $href . $hash_part;
1110 }
1111 }
1112
1113 if ((!defined $ext) || ($ext !~ m/$self->{'assoc_files'}/)) {
1114 return "_httpextlink_&amp;rl=" . $rl . "&amp;href=" . $href . $hash_part;
1115 }
1116 # add the original image file as a source file
1117 if (!$self->{'processing_tmp_files'} ) {
1118 $doc_obj->associate_source_file($filename);
1119 }
1120 if ($self->{'rename_assoc_files'}) {
1121 if (defined $self->{'aux_files'}->{$href}) {
1122 $newname = $self->{'aux_files'}->{$href}->{'dir_num'} . "/" .
1123 $self->{'aux_files'}->{$href}->{'file_num'} . $ext;
1124 } else {
1125 $newname = $self->{'dir_num'} . "/" . $self->{'file_num'} . $ext;
1126 $self->{'aux_files'}->{$href} = {'dir_num' => $self->{'dir_num'}, 'file_num' => $self->{'file_num'}};
1127 $self->inc_filecount ();
1128 }
1129 $doc_obj->associate_file($filename, $newname, undef, $section);
1130 return "_httpdocimg_/$newname";
1131 } else {
1132 if(&unicode::is_url_encoded($unicode_filename)) {
1133 # use the possibly-decoded filename instead to avoid double URL encoding
1134 ($newname) = $filename =~ m/([^\/\\]*)$/;
1135 } else {
1136 ($newname) = $unicode_filename =~ m/([^\/\\]*)$/;
1137 }
1138
1139 # Make sure this name uses only ASCII characters.
1140 # We use either base64 or URL encoding, as these preserve original encoding
1141 $newname = &util::rename_file($newname, $self->{'file_rename_method'});
1142
1143### print STDERR "***** associating $filename (raw-byte/utf8)-> $newname\n";
1144 $doc_obj->associate_file($filename, $newname, undef, $section);
1145
1146 # Since the generated image will be URL-encoded to avoid file-system/browser mess-ups
1147 # of filenames, URL-encode the additional percent signs of the URL-encoded filename
1148 my $newname_url = $newname;
1149 $newname_url = &unicode::filename_to_url($newname_url);
1150 return "_httpdocimg_/$newname_url";
1151 }
1152}
1153
1154sub replace_section_links {
1155 my $self = shift(@_);
1156 my ($doc_obj) = @_;
1157 my %anchors;
1158 my $top_section = $doc_obj->get_top_section();
1159 my $thissection = $doc_obj->get_next_section($top_section);
1160 while ( defined $thissection ) {
1161 my $text = $doc_obj->get_text($thissection);
1162 while ( $text =~ /(?:(?:id|name)\s*=\s*[\'\"])([^\'\"]+)/gi ) {
1163 $anchors{$1} = $thissection;
1164 }
1165 $thissection = $doc_obj->get_next_section($thissection);
1166 }
1167 $thissection = $top_section;
1168 while (defined $thissection) {
1169 my $text = $doc_obj->get_text($thissection);
1170 $text =~ s/(href\s*=\s*[\"\'])(_httpsamepagelink_#)([^\'\"]+)/$self->replace_link_to_anchor($1,$2,$3,$thissection,$anchors{$3})/ige;
1171 $doc_obj->delete_text( $thissection);
1172 $doc_obj->add_utf8_text( $thissection, $text );
1173 $thissection = $doc_obj->get_next_section ($thissection);
1174 }
1175}
1176sub replace_link_to_anchor {
1177 my $self = shift(@_);
1178 my ($href_part,$old_link,$identifier,$current_section,$target_section) = @_;
1179 if (length $target_section && $current_section ne $target_section){
1180 return $href_part . "javascript:goToAnchor(\'" . $target_section . "\',\'" . $identifier . "\');" ;
1181 }
1182 return $href_part . $old_link . $identifier ;
1183}
1184
1185sub format_link {
1186 my $self = shift (@_);
1187 my ($link, $base_dir, $file) = @_;
1188
1189 # strip off hash part, e.g. #foo, but watch out for any entities, e.g. &#x3B1;
1190 my ($before_hash, $hash_part) = $link =~ m/^(.*?[^&])(\#.*)?$/;
1191
1192 $hash_part = "" if !defined $hash_part;
1193 if (!defined $before_hash || $before_hash !~ m/[\w\.\/]/) {
1194 my $outhandle = $self->{'outhandle'};
1195 print $outhandle "HTMLPlugin: ERROR - badly formatted tag ignored ($link)\n"
1196 if $self->{'verbosity'};
1197 return ($link, "", 0);
1198 }
1199
1200# my $dirname;
1201 if ($before_hash =~ s@^((?:http|https|ftp|file|mms)://)@@i) {
1202 my $type = $1;
1203 my $before_hash_file = $before_hash;
1204
1205 if ($link =~ m/^(http|ftp):/i) {
1206
1207 # Turn url (using /) into file name (possibly using \ on windows)
1208 my @http_dir_split = split('/', $before_hash_file);
1209 $before_hash_file = &FileUtils::filenameConcatenate(@http_dir_split);
1210 }
1211
1212 # want to maintain two versions of "before_hash": one representing the URL, the other using the filesystem-specific directory separator
1213 $before_hash_file = $self->eval_dir_dots($before_hash_file);
1214 my $before_hash_url = $before_hash_file;
1215 if ($ENV{'GSDLOS'} =~ /^windows$/i) {
1216 $before_hash_url =~ s@\\@\/@g;
1217 }
1218
1219 ######## TODO need to check this for encoding stuff
1220 my $linkfilename = &FileUtils::filenameConcatenate($base_dir, $before_hash_file);
1221 print STDERR "checking for existence whether relative link or not $linkfilename\n";
1222 my $rl = 0;
1223 $rl = 1 if (-e $linkfilename);
1224 if (-e $linkfilename) {
1225
1226 print STDERR "DOES exist $linkfilename\n";
1227 } else {
1228 print STDERR "DOESN'T exist $linkfilename\n";
1229 }
1230 # make sure there's a slash on the end if it's a directory
1231 if ($before_hash_url !~ m/\/$/) {
1232 $before_hash_url .= "/" if (-d $linkfilename);
1233 }
1234 return ($type . $before_hash_url, $hash_part, $rl);
1235
1236 } elsif ($link !~ m/^(mailto|news|gopher|nntp|telnet|javascript):/i && $link !~ m/^\//) {
1237
1238 #### TODO what is this test doing???
1239 if ($before_hash =~ s@^/@@ || $before_hash =~ m/\\/) {
1240
1241 # the first directory will be the domain name if file_is_url
1242 # to generate archives, otherwise we'll assume all files are
1243 # from the same site and base_dir is the root
1244
1245 if ($self->{'file_is_url'}) {
1246 my @dirs = split /[\/\\]/, $file;
1247 my $domname = shift (@dirs);
1248 $before_hash = &FileUtils::filenameConcatenate($domname, $before_hash);
1249 $before_hash =~ s@\\@/@g; # for windows
1250 }
1251 else
1252 {
1253 # see if link shares directory with source document
1254 # => turn into relative link if this is so!
1255
1256 if ($ENV{'GSDLOS'} =~ m/^windows/i) {
1257 # too difficult doing a pattern match with embedded '\'s...
1258 my $win_before_hash=$before_hash;
1259 $win_before_hash =~ s@(\\)+@/@g;
1260 # $base_dir is already similarly "converted" on windows.
1261 if ($win_before_hash =~ s@^$base_dir/@@o) {
1262 # if this is true, we removed a prefix
1263 $before_hash=$win_before_hash;
1264 }
1265 }
1266 else {
1267 # before_hash has lost leading slash by this point,
1268 # -> add back in prior to substitution with $base_dir
1269 $before_hash = "/$before_hash";
1270
1271 $before_hash = &FileUtils::filenameConcatenate("",$before_hash);
1272 $before_hash =~ s@^$base_dir/@@;
1273 }
1274 }
1275 } else {
1276
1277 # Turn relative file path into full path (inside import dir)
1278 my $dirname = &File::Basename::dirname($file);
1279
1280 # we want to add dirname (which is a raw filesystem path) onto $before_hash (which is a Perl unicode-aware string), so convert dirname to a Perl string first
1281 
1282 my $unicode_dirname ="";
1283 # we need to turn the raw filesystem dirname into a Perl unicode-aware string
1284 my $filename_encoding = $self->guess_filesystem_encoding();
1285
1286 # copied this from set_Source_metadata in BaseImporter
1287 if ((defined $filename_encoding) && ($filename_encoding ne "ascii")) {
1288 # Use filename_encoding to map raw filename to a Perl unicode-aware string
1289 $unicode_dirname = decode($filename_encoding,$dirname);
1290 }
1291 else {
1292 # otherwise generate %xx encoded version of filename for char > 127
1293 $unicode_dirname = &unicode::raw_filename_to_url_encoded($dirname);
1294 }
1295 print STDERR "unicode dirname = ".&unicode::debug_unicode_string($unicode_dirname)."\n";
1296 $before_hash = &FileUtils::filenameConcatenate($unicode_dirname, $before_hash);
1297 $before_hash = $self->eval_dir_dots($before_hash);
1298 $before_hash =~ s@\\@/@g; # for windows
1299 }
1300
1301 my $linkfilename = &FileUtils::filenameConcatenate($base_dir, $before_hash);
1302
1303 # make sure there's a slash on the end if it's a directory
1304 if ($before_hash !~ m/\/$/) {
1305 $before_hash .= "/" if (-d $linkfilename);
1306 }
1307 return ("http://" . $before_hash, $hash_part, 1);
1308 } else {
1309 # mailto, news, nntp, telnet, javascript or gopher link
1310 return ($before_hash, "", 0);
1311 }
1312}
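# Illustrative examples (hypothetical inputs) of what format_link() returns:
#    "http://www.example.com/a/b.html#sec" -> ("http://www.example.com/a/b.html", "#sec", $rl),
#        where $rl is 1 only if the corresponding file exists under $base_dir (a mirrored site);
#    a relative link "images/pic.png" inside mysite/index.html
#        -> ("http://mysite/images/pic.png", "", 1), i.e. treated as an internal link.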
1313
1314sub extract_first_NNNN_characters {
1315 my $self = shift (@_);
1316 my ($textref, $doc_obj, $thissection) = @_;
1317
1318 foreach my $size (split /,/, $self->{'first'}) {
1319 my $tmptext = $$textref;
1320 # skip to the body
1321 $tmptext =~ s/.*<body[^>]*>//i;
1322 # remove javascript
1323 $tmptext =~ s@<script.*?</script>@ @sig;
1324 $tmptext =~ s/<[^>]*>/ /g;
1325 $tmptext =~ s/&nbsp;/ /g;
1326 $tmptext =~ s/^\s+//;
1327 $tmptext =~ s/\s+$//;
1328 $tmptext =~ s/\s+/ /gs;
1329 $tmptext = &unicode::substr ($tmptext, 0, $size);
1330 $tmptext =~ s/\s\S*$/&#8230;/; # adds an ellipsis (...)
1331 $doc_obj->add_utf8_metadata ($thissection, "First$size", $tmptext);
1332 }
1333}
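# Illustrative example (assuming the inherited 'first' option, declared elsewhere, is set
# to e.g. "50,100"): the loop above adds "First50" and "First100" metadata, each holding
# roughly the first N characters of the body text with tags, scripts and extra whitespace
# stripped out.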
1334
1335
1336sub extract_metadata {
1337 my $self = shift (@_);
1338 my ($textref, $metadata, $doc_obj, $section) = @_;
1339 my $outhandle = $self->{'outhandle'};
1340 # if we don't want metadata, we may as well not be here ...
1341 return if (!defined $self->{'metadata_fields'});
1342
1343 my $separator = $self->{'metadata_field_separator'};
1344 if ($separator eq "") {
1345 undef $separator;
1346 }
1347
1348 # metadata fields to extract/save. 'key' is the (lowercase) name of the
1349 # html meta, 'value' is the metadata name for greenstone to use
1350 my %find_fields = ();
1351
1352 my %creator_fields = (); # short-cut for lookups
1353
1354
1355 foreach my $field (split /,/, $self->{'metadata_fields'}) {
1356 $field =~ s/^\s+//; # remove leading whitespace
1357 $field =~ s/\s+$//; # remove trailing whitespace
1358
1359 # support tag<tagname>
1360 if ($field =~ m/^(.*?)\s*<(.*?)>$/) {
1361 # "$2" is the user's preferred gs metadata name
1362 $find_fields{lc($1)}=$2; # lc = lowercase
1363 } else { # no <tagname> for mapping
1364 # "$field" is the user's preferred gs metadata name
1365 $find_fields{lc($field)}=$field; # lc = lowercase
1366 }
1367 }
1368
1369 if (defined $self->{'hunt_creator_metadata'} &&
1370 $self->{'hunt_creator_metadata'} == 1 ) {
1371 my @extra_fields =
1372 (
1373 'author',
1374 'author.email',
1375 'creator',
1376 'dc.creator',
1377 'dc.creator.corporatename',
1378 );
1379
1380 # add the creator_metadata fields to search for
1381 foreach my $field (@extra_fields) {
1382 $creator_fields{$field}=0; # add to lookup hash
1383 }
1384 }
1385
1386
1387 # find the header in the html file, which has the meta tags
1388 $$textref =~ m@<head>(.*?)</head>@si;
1389
1390 my $html_header=$1;
1391
1392 # go through every <meta... tag defined in the html and see if it is
1393 # one of the tags we want to match.
1394
1395 # special case for title - we want to remember if its been found
1396 my $found_title = 0;
1397 # this assumes that ">" won't appear. (I don't think it's allowed to...)
1398 $html_header =~ m/^/; # match the start of the string, for \G assertion
1399
1400 while ($html_header =~ m/\G.*?<meta(.*?)>/sig) {
1401 my $metatag=$1;
1402 my ($tag, $value);
1403
1404 # find the tag name
1405 $metatag =~ m/(?:name|http-equiv)\s*=\s*([\"\'])?(.*?)\1/is;
1406 $tag=$2;
1407 # in case they're not using " or ', but they should...
1408 if (! $tag) {
1409 $metatag =~ m/(?:name|http-equiv)\s*=\s*([^\s\>]+)/is;
1410 $tag=$1;
1411 }
1412
1413 if (!defined $tag) {
1414 print $outhandle "HTMLPlugin: can't find NAME in \"$metatag\"\n";
1415 next;
1416 }
1417
1418 # don't need to assign this field if it was passed in from a previous
1419 # (recursive) plugin
1420 if (defined $metadata->{$tag}) {next}
1421
1422 # find the tag content
1423 $metatag =~ m/content\s*=\s*([\"\'])?(.*?)\1/is;
1424 $value=$2;
1425
1426 # The following code assigns the metaname to value if value is
1427 # empty. Why would we do this?
1428 #if (! $value) {
1429 # $metatag =~ m/(?:name|http-equiv)\s*=\s*([^\s\>]+)/is;
1430 # $value=$1;
1431 #}
1432 if (!defined $value || $value eq "") {
1433 print $outhandle "HTMLPlugin: can't find VALUE in <meta $metatag >\n" if ($self->{'verbosity'} > 2);
1434 next;
1435 }
1436
1437 # clean up and add
1438 $value =~ s/\s+/ /gs;
1439 chomp($value); # remove trailing \n, if any
1440 if (exists $creator_fields{lc($tag)}) {
1441 # map this value onto greenstone's "Creator" metadata
1442 $tag='Creator';
1443 } elsif (!exists $find_fields{lc($tag)}) {
1444 next; # don't want this tag
1445 } else {
1446 # get the user's preferred capitalisation
1447 $tag = $find_fields{lc($tag)};
1448 }
1449 if (lc($tag) eq "title") {
1450 $found_title = 1;
1451 }
1452
1453 if ($self->{'verbosity'} > 2) {
1454 print $outhandle " extracted \"$tag\" metadata \"$value\"\n";
1455 }
1456
1457 if ($tag =~ /\./) {
1458 # there is a . so has a namespace, add ex.
1459 $tag = "ex.$tag";
1460 }
1461 if (defined $separator) {
1462 my @values = split($separator, $value);
1463 foreach my $v (@values) {
1464 $doc_obj->add_utf8_metadata($section, $tag, $v) if $v =~ /\S/;
1465 }
1466 }
1467 else {
1468 $doc_obj->add_utf8_metadata($section, $tag, $value);
1469 }
1470 }
1471
1472 # TITLE: extract the document title
1473 if (exists $find_fields{'title'} && !$found_title) {
1474 # we want a title, and didn't find one in the meta tags
1475 # see if there's a <title> tag
1476 my $title;
1477 my $from = ""; # for debugging output only
1478 if ($html_header =~ m/<title[^>]*>([^<]+)<\/title[^>]*>/is) {
1479 $title = $1;
1480 $from = "<title> tags";
1481 }
1482
1483 if (!defined $title) {
1484 $from = "first 100 chars";
1485 # if no title use first 100 or so characters
1486 $title = $$textref;
1487 $title =~ s/^\xFE\xFF//; # Remove unicode byte order mark
1488 $title =~ s/^.*?<body>//si;
1489 # ignore javascript!
1490 $title =~ s@<script.*?</script>@ @sig;
1491 $title =~ s/<\/([^>]+)><\1>//g; # (eg) </b><b> - no space
1492 $title =~ s/<[^>]*>/ /g; # remove all HTML tags
1493 $title =~ s@\r@@g; # remove Windows carriage returns to ensure that titles of pdftohtml docs are consistent (the same 100 chars) across windows and linux
1494 $title = substr ($title, 0, 100);
1495 $title =~ s/\s\S*$/.../;
1496 }
1497 $title =~ s/<[^>]*>/ /g; # remove html tags
1498 $title =~ s/&nbsp;/ /g;
1499 $title =~ s/(?:&nbsp;|\xc2\xa0)/ /g; # utf-8 for nbsp...
1500 $title =~ s/\s+/ /gs; # collapse multiple spaces
1501 $title =~ s/^\s*//; # remove leading spaces
1502 $title =~ s/\s*$//; # remove trailing spaces
1503
1504 $title =~ s/^$self->{'title_sub'}// if ($self->{'title_sub'});
1505 $title =~ s/^\s+//s; # in case title_sub introduced any...
1506 $doc_obj->add_utf8_metadata ($section, "Title", $title);
1507 print $outhandle " extracted Title metadata \"$title\" from $from\n"
1508 if ($self->{'verbosity'} > 2);
1509 }
1510
1511 # add FileFormat metadata
1512 $doc_obj->add_metadata($section,"FileFormat", "HTML");
1513
1514 # Special case for metadata names such as tagH1 - this extracts
1515 # the text between the first <H1> and </H1> tags into "H1" metadata.
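# e.g. (illustrative only): configuring the plugin with
#   -metadata_fields "Title,tagH1"
# stores the contents of the document's first <H1>...</H1> pair as "H1"
# metadata, in addition to the normal Title handling above.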
1516
1517 foreach my $field (keys %find_fields) {
1518 if ($field !~ m/^tag([a-z0-9]+)$/i) {next}
1519 my $tag = $1;
1520 if ($$textref =~ m@<$tag[^>]*>(.*?)</$tag[^>]*>@g) {
1521 my $content = $1;
1522 $content =~ s/&nbsp;/ /g;
1523 $content =~ s/<[^>]*>/ /g;
1524 $content =~ s/^\s+//;
1525 $content =~ s/\s+$//;
1526 $content =~ s/\s+/ /gs;
1527 if ($content) {
1528 $tag=$find_fields{"tag$tag"}; # get the user's capitalisation
1529 $tag =~ s/^tag//i;
1530 $doc_obj->add_utf8_metadata ($section, $tag, $content);
1531 print $outhandle " extracted \"$tag\" metadata \"$content\"\n"
1532 if ($self->{'verbosity'} > 2);
1533 }
1534 }
1535 }
1536}
1537
1538
1539 # evaluate any "../" as a step up to the parent directory
1540 # evaluate any "./" as the current directory
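# e.g. (illustrative only):
#   $self->eval_dir_dots("import/tmp/../html/./page.html")
# returns "import/html/page.html" (using the platform's directory separator)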
1541sub eval_dir_dots {
1542 my $self = shift (@_);
1543 my ($filename) = @_;
1544 my $dirsep_os = &util::get_os_dirsep();
1545 my @dirsep = split(/$dirsep_os/,$filename);
1546
1547 my @eval_dirs = ();
1548 foreach my $d (@dirsep) {
1549 if ($d eq "..") {
1550 pop(@eval_dirs);
1551
1552 } elsif ($d eq ".") {
1553 # do nothing!
1554
1555 } else {
1556 push(@eval_dirs,$d);
1557 }
1558 }
1559
1560 # Need to fiddle with number of elements in @eval_dirs if the
1561 # first one is the empty string. This is because of a
1562 # modification to FileUtils::filenameConcatenate that suppresses the addition
1563 # of a leading '/' character (or '\' if windows) (intended to help
1564 # filename concatenation with relative paths) if the first entry in the
1565 # array is the empty string. Making the array start with *two*
1566 # empty strings is a way to defeat this "smart" option.
1567 #
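    # e.g. (illustrative only): "/home/user/doc.html" splits into
    # ("", "home", "user", "doc.html"); prepending a second "" makes
    # filenameConcatenate put the leading "/" back on.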
1568 if (scalar(@eval_dirs) > 0) {
1569 if ($eval_dirs[0] eq ""){
1570 unshift(@eval_dirs,"");
1571 }
1572 }
1573
1574 my $evaluated_filename = (scalar @eval_dirs > 0) ? &FileUtils::filenameConcatenate(@eval_dirs) : "";
1575 return $evaluated_filename;
1576}
1577
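# Tidies up a usemap link: any surrounding quotes are stripped from the link
# and re-attached to the surrounding text, and a leading "./" is removed.
# e.g. (illustrative only): called with ('usemap=', '"./map1"', '>') it
# returns 'usemap="map1">'.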
1578sub replace_usemap_links {
1579 my $self = shift (@_);
1580 my ($front, $link, $back) = @_;
1581
1582 # remove quotes from link at start and end if necessary
1583 if ($link=~/^[\"\']/) {
1584 $link=~s/^[\"\']//;
1585 $link=~s/[\"\']$//;
1586 $front.='"';
1587 $back="\"$back";
1588 }
1589
1590 $link =~ s/^\.\///;
1591 return $front . $link . $back;
1592}
1593
1594sub inc_filecount {
1595 my $self = shift (@_);
1596
1597 if ($self->{'file_num'} == 1000) {
1598 $self->{'dir_num'} ++;
1599 $self->{'file_num'} = 0;
1600 } else {
1601 $self->{'file_num'} ++;
1602 }
1603}
1604
1605
1606# Extend read_file so that strings like &eacute; are
1607# converted to UTF8 internally.
1608#
1609 # We don't convert &lt; or &gt; or &amp; or &quot; or &nbsp; in case
1610# they interfere with the GML files
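# (The temporary &z... renaming below is just a guard so those entities
# survive the general conversion.)
# e.g. (illustrative only): "caf&eacute; &lt;b&gt;" comes back from this sub
# as "café &lt;b&gt;": the accented character is decoded, the markup
# entities are left alone.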
1611
1612sub read_file {
1613 my $self = shift(@_);
1614 my ($filename, $encoding, $language, $textref) = @_;
1615
1616 $self->SUPER::read_file($filename, $encoding, $language, $textref);
1617
1618 # Convert entities to their Unicode code-point equivalents
1619 $$textref =~ s/&(lt|gt|amp|quot|nbsp);/&z$1;/go;
1620 $$textref =~ s/&([^;]+);/&ghtml::getcharequiv($1,1,1)/gseo;
1621 $$textref =~ s/&z(lt|gt|amp|quot|nbsp);/&$1;/go;
1622
1623}
1624
1625sub HB_read_html_file {
1626 my $self = shift (@_);
1627 my ($htmlfile, $text) = @_;
1628
1629 # load in the file
1630 if (!open (FILE, $htmlfile)) {
1631 print STDERR "ERROR - could not open $htmlfile\n";
1632 return;
1633 }
1634
1635 my $foundbody = 0;
1636 $self->HB_gettext (\$foundbody, $text, "FILE");
1637 close FILE;
1638
1639 # just in case there was no <body> tag
1640 if (!$foundbody) {
1641 $foundbody = 1;
1642 open (FILE, $htmlfile) || return;
1643 $self->HB_gettext (\$foundbody, $text, "FILE");
1644 close FILE;
1645 }
1646 # text is in utf8
1647}
1648
1649 # converts the text to utf8, as ghtml does for &eacute; etc.
1650sub HB_gettext {
1651 my $self = shift (@_);
1652 my ($foundbody, $text, $handle) = @_;
1653
1654 my $line = "";
1655 while (defined ($line = <$handle>)) {
1656 # look for body tag
1657 if (!$$foundbody) {
1658 if ($line =~ s/^.*<body[^>]*>//i) {
1659 $$foundbody = 1;
1660 } else {
1661 next;
1662 }
1663 }
1664
1665 # check for symbol fonts
1666 if ($line =~ m/<font [^>]*?face\s*=\s*\"?(\w+)\"?/i) {
1667 my $font = $1;
1668 print STDERR "HBPlug::HB_gettext - warning removed font $font\n"
1669 if ($font !~ m/^arial$/i);
1670 }
1671
1672 $$text .= $line;
1673 }
1674
1675 if ($self->{'input_encoding'} eq "iso_8859_1") {
1676 # convert to utf-8
1677 $$text=&unicode::unicode2utf8(&unicode::convert2unicode("iso_8859_1", $text));
1678 }
1679 # convert any alphanumeric character entities to their utf-8
1680 # equivalent for indexing purposes
1681 #&ghtml::convertcharentities ($$text);
1682
1683 $$text =~ s/\s+/ /g; # collapse whitespace runs (removes \n's)
1684
1685 # At this point $$text is a binary byte string
1686 # => turn it into a Unicode aware string, so full
1687 # Unicode aware pattern matching can be used.
1688 # For instance: 's/\x{0101}//g' or '[[:upper:]]'
1689 #
1690
1691 $$text = decode("utf8",$$text);
1692}
1693
1694sub HB_clean_section {
1695 my $self = shift (@_);
1696 my ($section) = @_;
1697
1698 # remove tags without a starting tag from the section
1699 my ($tag, $tagstart);
1700 while ($section =~ m/<\/([^>]{1,10})>/) {
1701 $tag = $1;
1702 $tagstart = index($section, "<$tag");
1703 last if (($tagstart >= 0) && ($tagstart < index($section, "<\/$tag")));
1704 $section =~ s/<\/$tag>//;
1705 }
1706
1707 # remove extra paragraph tags
1708 while ($section =~ s/<p\b[^>]*>\s*<p\b/<p/ig) {}
1709
1710 # remove extra stuff at the end of the section
1711 while ($section =~ s/(<u>|<i>|<b>|<p\b[^>]*>|&nbsp;|\s)$//i) {}
1712
1713 # add a newline at the beginning of each paragraph
1714 $section =~ s/(.)\s*<p\b/$1\n\n<p/gi;
1715
1716 # add a newline every 80 characters at a word boundary
1717 # Note: this regular expression puts a line feed before
1718 # the last word in each section, even when it is not
1719 # needed.
1720 $section =~ s/(.{1,80})\s/$1\n/g;
1721
1722 # fix up the image links
1723 $section =~ s/<img[^>]*?src=\"?([^\">]+)\"?[^>]*>/
1724 <center><img src=\"$1\" \/><\/center><br\/>/ig;
1725 $section =~ s/&lt;&lt;I&gt;&gt;\s*([^\.]+\.(png|jpg|gif))/
1726 <center><img src=\"$1\" \/><\/center><br\/>/ig;
1727
1728 return $section;
1729}
1730
1731# Will convert the oldHDL format to the new HDL format (using the Section tag)
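# In the old format, section boundaries are marked in the body text with
# escaped tokens such as "&lt;&lt;TOC2&gt;&gt; Chapter One".
# e.g. (illustrative only) such a heading produces output along the lines of:
#   <!--
#   <Section>
#       <Description>
#           <Metadata name="Title">Chapter One</Metadata>
#       </Description>
#   -->
#   ...the section's cleaned-up text...
# with a matching </Section>, also inside an HTML comment, printed when the
# section is closed.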
1732sub convert_to_newHDLformat
1733{
1734 my $self = shift (@_);
1735 my ($file,$cnfile) = @_;
1736 my $input_filename = $file;
1737 my $tmp_filename = $cnfile;
1738
1739 # write HTML tmp file with new HDL format
1740 open (PROD, ">$tmp_filename") || die("Error Writing to File: $tmp_filename $!");
1741
1742 # read in the file and do basic html cleaning (removing header etc)
1743 my $html = "";
1744 $self->HB_read_html_file ($input_filename, \$html);
1745
1746 # process the file one section at a time
1747 my $curtoclevel = 1;
1748 my $firstsection = 1;
1749 my $toclevel = 0;
1750 while (length ($html) > 0) {
1751 if ($html =~ s/^.*?(?:<p\b[^>]*>)?((<b>|<i>|<u>|\s)*)&lt;&lt;TOC(\d+)&gt;&gt;\s*(.*?)<p\b/<p/i) {
1752 $toclevel = $3;
1753 my $title = $4;
1754 my $sectiontext = "";
1755 if ($html =~ s/^(.*?)((?:<p\b[^>]*>)?((<b>|<i>|<u>|\s)*)&lt;&lt;TOC\d+&gt;&gt;)/$2/i) {
1756 $sectiontext = $1;
1757 } else {
1758 $sectiontext = $html;
1759 $html = "";
1760 }
1761
1762 # remove tags and extra spaces from the title
1763 $title =~ s/<\/?[^>]+>//g;
1764 $title =~ s/^\s+|\s+$//g;
1765
1766 # close any sections below the current level and
1767 # create a new section (special case for the firstsection)
1768 print PROD "<!--\n";
1769 while (($curtoclevel > $toclevel) ||
1770 (!$firstsection && $curtoclevel == $toclevel)) {
1771 $curtoclevel--;
1772 print PROD "</Section>\n";
1773 }
1774 if ($curtoclevel+1 < $toclevel) {
1775 print STDERR "WARNING - jump in toc levels in $input_filename " .
1776 "from $curtoclevel to $toclevel\n";
1777 }
1778 while ($curtoclevel < $toclevel) {
1779 $curtoclevel++;
1780 }
1781
1782 if ($curtoclevel == 1) {
1783 # add the header tag
1784 print PROD "-->\n";
1785 print PROD "<HTML>\n<HEAD>\n<TITLE>$title</TITLE>\n</HEAD>\n<BODY>\n";
1786 print PROD "<!--\n";
1787 }
1788
1789 print PROD "<Section>\n\t<Description>\n\t\t<Metadata name=\"Title\">$title</Metadata>\n\t</Description>\n";
1790
1791 print PROD "-->\n";
1792
1793 # clean up the section html
1794 $sectiontext = $self->HB_clean_section($sectiontext);
1795
1796 print PROD "$sectiontext\n";
1797
1798 } else {
1799 print STDERR "WARNING - leftover text\n" , $self->shorten($html),
1800 "\nin $input_filename\n";
1801 last;
1802 }
1803 $firstsection = 0;
1804 }
1805
1806 print PROD "<!--\n";
1807 while ($curtoclevel > 0) {
1808 $curtoclevel--;
1809 print PROD "</Section>\n";
1810 }
1811 print PROD "-->\n";
1812
1813 close (PROD) || die("Error Closing File: $tmp_filename $!");
1814
1815 return $tmp_filename;
1816}
1817
1818sub shorten {
1819 my $self = shift (@_);
1820 my ($text) = @_;
1821
1822 return "\"$text\"" if (length($text) < 100);
1823
1824 return "\"" . substr ($text, 0, 50) . "\" ... \"" .
1825 substr ($text, length($text)-50) . "\"";
1826}
1827
1828sub convert_tidy_or_oldHDL_file
1829{
1830 my $self = shift (@_);
1831 my ($file) = @_;
1832 my $input_filename = $file;
1833
1834 if (-d $input_filename)
1835 {
1836 return $input_filename;
1837 }
1838
1839 # get the input filename
1840 my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
1841 my $base_dirname = $dirname;
1842 $suffix = lc($suffix);
1843
1844 # derive tmp filename from input filename
1845 # Remove any white space from the filename -- no risk of name collision, and
1846 # it makes later conversion by utils simpler. Leave spaces in the path...
1847 # also strip out any dots and hyphens from the filename tail
1848 $tailname =~ s/\s+//g;
1849 $tailname =~ s/\.+//g;
1850 $tailname =~ s/\-+//g;
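    # e.g. (illustrative only): a tail of "My Report-v1.2" becomes "MyReportv12"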
1851 # convert to utf-8 otherwise we have problems with the doc.xml file
1852 # later on
1853 &unicode::ensure_utf8(\$tailname);
1854
1855 # set up a collection specific tmp dir (tidytmp) to work in
1856 my $tmp_dirname = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, "tidytmp");
1857 &FileUtils::makeDirectory($tmp_dirname) if (!-e $tmp_dirname);
1858
1859 my $test_dirname = "";
1860 my $f_separator = &util::get_os_dirsep();
1861
1862 if ($dirname =~ m/import$f_separator/)
1863 {
1864 $test_dirname = $'; #'
1865
1866 #print STDERR "init $'\n";
1867
1868 while ($test_dirname =~ m/[$f_separator]/)
1869 {
1870 my $folderdirname = $`;
1871 $tmp_dirname = &FileUtils::filenameConcatenate($tmp_dirname,$folderdirname);
1872 &FileUtils::makeDirectory($tmp_dirname) if (!-e $tmp_dirname);
1873 $test_dirname = $'; #'
1874 }
1875 }
1876
1877 my $tmp_filename = &FileUtils::filenameConcatenate($tmp_dirname, "$tailname$suffix");
1878
1879 # tidy or convert the input file if it is an HTML-like file or it is accepted by the process_exp
1880 if (($suffix eq ".htm") || ($suffix eq ".html") || ($suffix eq ".shtml"))
1881 {
1882 # convert the input file to a new style HDL
1883 my $hdl_output_filename = $input_filename;
1884 if ($self->{'old_style_HDL'})
1885 {
1886 $hdl_output_filename = &FileUtils::filenameConcatenate($tmp_dirname, "$tailname$suffix");
1887 $hdl_output_filename = $self->convert_to_newHDLformat($input_filename,$hdl_output_filename);
1888 }
1889
1890 # copy all the other files from the base dir to the tmp dir if they do not already exist there
1891 opendir(DIR,$base_dirname) or die "Can't open base directory : $base_dirname!";
1892 my @files = grep {!/^\.+$/} readdir(DIR);
1893 closedir(DIR);
1894
1895 foreach my $file (@files)
1896 {
1897 my $src_file = &FileUtils::filenameConcatenate($base_dirname,$file);
1898 my $dest_file = &FileUtils::filenameConcatenate($tmp_dirname,$file);
1899 if ((!-e $dest_file) && (!-d $src_file))
1900 {
1901 # just copy the original file back to the tmp directory
1902 copy($src_file,$dest_file) or die "Can't copy file $src_file to $dest_file $!";
1903 }
1904 }
1905
1906 # tidy the input file
1907 my $tidy_output_filename = $hdl_output_filename;
1908 if ($self->{'use_realistic_book'})
1909 {
1910 $tidy_output_filename = &FileUtils::filenameConcatenate($tmp_dirname, "$tailname$suffix");
1911 $tidy_output_filename = $self->tmp_tidy_file($hdl_output_filename,$tidy_output_filename);
1912 }
1913 $tmp_filename = $tidy_output_filename;
1914 }
1915 else
1916 {
1917 if (!-e $tmp_filename)
1918 {
1919 # just copy the original file back to the tmp directory
1920 copy($input_filename,$tmp_filename) or die "Can't copy file $input_filename to $tmp_filename $!";
1921 }
1922 }
1923
1924 return $tmp_filename;
1925}
1926
1927
1928 # Will make the html input file into a proper XML file, with font tags removed and
1929 # image sizes added to the img tags.
1930# The tidying process takes place in a collection specific 'tmp' directory so
1931# that we don't accidentally damage the input.
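# e.g. (illustrative only): <font face="Symbol"><img src="fig1.png"></font>
# is rewritten (before tidying) as something like
#   <img src="fig1.png" width="320" height="240">
# with the real dimensions read from the image file via Image::Size, and the
# font tags dropped.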
1932sub tmp_tidy_file
1933{
1934 my $self = shift (@_);
1935 my ($file,$cnfile) = @_;
1936 my $input_filename = $file;
1937 my $tmp_filename = $cnfile;
1938
1939 # get the input filename
1940 my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
1941
1942 require HTML::TokeParser::Simple;
1943
1944 # create HTML parser to decode the input file
1945 my $parser = HTML::TokeParser::Simple->new($input_filename);
1946
1947 # write the HTML tmp file with font tags removed and image sizes added to the img tags
1948 open (PROD, ">$tmp_filename") || die("Error Writing to File: $tmp_filename $!");
1949 while (my $token = $parser->get_token())
1950 {
1951 # is it an img tag
1952 if ($token->is_start_tag('img'))
1953 {
1954 # get the attributes
1955 my $attr = $token->return_attr;
1956
1957 # get the full path to the image
1958 my $img_file = &FileUtils::filenameConcatenate($dirname,$attr->{src});
1959
1960 # set the width and height attribute
1961 ($attr->{width}, $attr->{height}) = imgsize($img_file);
1962
1963 # recreate the tag
1964 print PROD "<img";
1965 print PROD map { qq { $_="$attr->{$_}"} } keys %$attr;
1966 print PROD ">";
1967 }
1968 # is it a font tag
1969 else
1970 {
1971 if (($token->is_start_tag('font')) || ($token->is_end_tag('font')))
1972 {
1973 # remove font tag
1974 print PROD "";
1975 }
1976 else
1977 {
1978 # print without changes
1979 print PROD $token->as_is;
1980 }
1981 }
1982 }
1983 close (PROD) || die("Error Closing File: $tmp_filename $!");
1984
1985 # run html-tidy on the tmp file to make it a proper XML file
1986
1987 my $outhandle = $self->{'outhandle'};
1988 print $outhandle "Converting HTML to be XML compliant:\n";
1989
1990 my $tidy_cmd = "tidy";
1991 $tidy_cmd .= " -q" if ($self->{'verbosity'} <= 2);
1992 $tidy_cmd .= " -raw -wrap 0 -asxml \"$tmp_filename\"";
1993 if ($self->{'verbosity'} <= 2) {
1994 if ($ENV{'GSDLOS'} =~ m/^windows/i) {
1995 $tidy_cmd .= " 2>nul";
1996 }
1997 else {
1998 $tidy_cmd .= " 2>/dev/null";
1999 }
2000 print $outhandle " => $tidy_cmd\n";
2001 }
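    # e.g. (illustrative only) at low verbosity on linux the command ends up as:
    #   tidy -q -raw -wrap 0 -asxml "<tmp_filename>" 2>/dev/null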
2002
2003 my $tidyfile = `$tidy_cmd`;
2004
2005 # write result back to the tmp file
2006 open (PROD, ">$tmp_filename") || die("Error Writing to File: $tmp_filename $!");
2007 print PROD $tidyfile;
2008 close (PROD) || die("Error Closing File: $tmp_filename $!");
2009
2010 # return the output filename
2011 return $tmp_filename;
2012}
2013
2014sub associate_cover_image
2015{
2016 my $self = shift(@_);
2017 my ($doc_obj, $filename) = @_;
2018 if (($self->{'use_realistic_book'}) || ($self->{'old_style_HDL'}))
2019 {
2020 # the filename will point into tidytmp, but we want the cover image from import
2021 $filename =~ s/([\\\/])tidytmp([\\\/])/$1import$2/;
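    # e.g. (illustrative only): a path like collect/demo/tidytmp/doc1/doc1.html
    # is mapped back to collect/demo/import/doc1/doc1.html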
2022 }
2023 $self->SUPER::associate_cover_image($doc_obj, $filename);
2024}
2025
2026
2027 1;