Context Navigation

source: main/trunk/greenstone2/perllib/plugins/HTMLPlugin.pm@ 22951

Last change on this file since 22951 was 22951, checked in by davidb, 14 years ago
Encode::decode cannot be applied to all characters returned by ghtml::getcharequiv(). If getcharequiv does not recognize a character then it does not encode it, and so we cannot apply decode() to it. getcharequiv() upgraded to include an optional extra param that says whether or not to decode the equiv char should there be one it can map it to.
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 55.0 KB

Line
1	###########################################################################
2	#
3	# HTMLPlugin.pm -- basic html plugin
4	#
5	# A component of the Greenstone digital library software
6	# from the New Zealand Digital Library Project at the
7	# University of Waikato, New Zealand.
8	#
9	# Copyright (C) 1999 New Zealand Digital Library Project
10	#
11	# This program is free software; you can redistribute it and/or modify
12	# it under the terms of the GNU General Public License as published by
13	# the Free Software Foundation; either version 2 of the License, or
14	# (at your option) any later version.
15	#
16	# This program is distributed in the hope that it will be useful,
17	# but WITHOUT ANY WARRANTY; without even the implied warranty of
18	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	# GNU General Public License for more details.
20	#
21	# You should have received a copy of the GNU General Public License
22	# along with this program; if not, write to the Free Software
23	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24	#
25	###########################################################################
26
27	#
28	# Note that this plugin handles frames only in a very simple way
29	# i.e. each frame is treated as a separate document. This means
30	# search results will contain links to individual frames rather
31	# than linking to the top level frameset.
32	# There may also be some problems caused by the _parent target
33	# (it's removed by this plugin)
34	#
35
36	package HTMLPlugin;
37
38	use Encode;
39
40	use ReadTextFile;
41	use HBPlugin;
42	use ghtml;
43	use unicode;
44	use util;
45	use XMLParser;
46
47	use Image::Size;
48	use File::Copy;
49
50	sub BEGIN {
51	@HTMLPlugin::ISA = ('ReadTextFile', 'HBPlugin');
52	}
53
54	use strict; # every perl program should have this!
55	no strict 'refs'; # make an exception so we can use variables as filehandles
56
57	my $arguments =
58	[ { 'name' => "process_exp",
59	'desc' => "{BasePlugin.process_exp}",
60	'type' => "regexp",
61	'deft' => &get_default_process_exp() },
62	{ 'name' => "block_exp",
63	'desc' => "{BasePlugin.block_exp}",
64	'type' => 'regexp',
65	'deft' => &get_default_block_exp() },
66	{ 'name' => "nolinks",
67	'desc' => "{HTMLPlugin.nolinks}",
68	'type' => "flag" },
69	{ 'name' => "keep_head",
70	'desc' => "{HTMLPlugin.keep_head}",
71	'type' => "flag" },
72	{ 'name' => "no_metadata",
73	'desc' => "{HTMLPlugin.no_metadata}",
74	'type' => "flag" },
75	{ 'name' => "metadata_fields",
76	'desc' => "{HTMLPlugin.metadata_fields}",
77	'type' => "string",
78	'deft' => "Title" },
79	{ 'name' => "metadata_field_separator",
80	'desc' => "{HTMLPlugin.metadata_field_separator}",
81	'type' => "string",
82	'deft' => "" },
83	{ 'name' => "hunt_creator_metadata",
84	'desc' => "{HTMLPlugin.hunt_creator_metadata}",
85	'type' => "flag" },
86	{ 'name' => "file_is_url",
87	'desc' => "{HTMLPlugin.file_is_url}",
88	'type' => "flag" },
89	{ 'name' => "assoc_files",
90	'desc' => "{HTMLPlugin.assoc_files}",
91	'type' => "regexp",
92	'deft' => &get_default_block_exp() },
93	{ 'name' => "rename_assoc_files",
94	'desc' => "{HTMLPlugin.rename_assoc_files}",
95	'type' => "flag" },
96	{ 'name' => "title_sub",
97	'desc' => "{HTMLPlugin.title_sub}",
98	'type' => "string",
99	'deft' => "" },
100	{ 'name' => "description_tags",
101	'desc' => "{HTMLPlugin.description_tags}",
102	'type' => "flag" },
103	# retain this for backward compatibility (w3mir option was replaced by
104	# file_is_url)
105	{ 'name' => "w3mir",
106	# 'desc' => "{HTMLPlugin.w3mir}",
107	'type' => "flag",
108	'hiddengli' => "yes"},
109	{ 'name' => "no_strip_metadata_html",
110	'desc' => "{HTMLPlugin.no_strip_metadata_html}",
111	'type' => "string",
112	'deft' => "",
113	'reqd' => "no"},
114	{ 'name' => "sectionalise_using_h_tags",
115	'desc' => "{HTMLPlugin.sectionalise_using_h_tags}",
116	'type' => "flag" },
117	{ 'name' => "use_realistic_book",
118	'desc' => "{HTMLPlugin.tidy_html}",
119	'type' => "flag"},
120	{ 'name' => "old_style_HDL",
121	'desc' => "{HTMLPlugin.old_style_HDL}",
122	'type' => "flag"},
123	{'name' => "processing_tmp_files",
124	'desc' => "{BasePlugin.processing_tmp_files}",
125	'type' => "flag",
126	'hiddengli' => "yes"}
127	];
128
129	my $options = { 'name' => "HTMLPlugin",
130	'desc' => "{HTMLPlugin.desc}",
131	'abstract' => "no",
132	'inherits' => "yes",
133	'args' => $arguments };
134
135
136	sub new {
137	my ($class) = shift (@_);
138	my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
139	push(@$pluginlist, $class);
140
141	push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
142	push(@{$hashArgOptLists->{"OptList"}},$options);
143
144
145	my $self = new ReadTextFile($pluginlist,$inputargs,$hashArgOptLists);
146
147	if ($self->{'w3mir'}) {
148	$self->{'file_is_url'} = 1;
149	}
150	$self->{'aux_files'} = {};
151	$self->{'dir_num'} = 0;
152	$self->{'file_num'} = 0;
153
154	return bless $self, $class;
155	}
156
157	# may want to use (?i)\.(gif\|jpe?g\|jpe\|png\|css\|js(?:@.*)?)$
158	# if have eg <script language="javascript" src="img/lib.js@123">
159	# blocking is now done by reading through the file and recording all the
160	# images and other files
161	sub get_default_block_exp {
162	my $self = shift (@_);
163
164	#return q^(?i)\.(gif\|jpe?g\|jpe\|jpg\|png\|css)$^;
165	return "";
166	}
167
168	sub get_default_process_exp {
169	my $self = shift (@_);
170
171	# the last option is an attempt to encode the concept of an html query ...
172	return q^(?i)(\.html?\|\.shtml\|\.shm\|\.asp\|\.php\d?\|\.cgi\|.+\?.+=.*)$^;
173	}
174
175	sub store_block_files
176	{
177	my $self =shift (@_);
178	my ($filename_full_path, $block_hash) = @_;
179
180	my $html_fname = $filename_full_path;
181	my @file_blocks;
182
183	my ($language, $encoding) = $self->textcat_get_language_encoding ($filename_full_path);
184
185	# read in file ($text will be in utf8)
186	my $raw_text = "";
187	$self->read_file_no_decoding ($filename_full_path, \$raw_text);
188
189	my $textref = \$raw_text;
190	my $opencom = '(?:<!--\|<!(?:—\|\|--))';
191	my $closecom = '(?:-->\|(?:—\|\|--)>)';
192	$$textref =~ s/$opencom(.*?)$closecom//gs;
193
194	my $attval = "\\\"[^\\\"]+\\\"\|[^\\s>]+";
195	my @img_matches = ($$textref =~ m/<img[^>]?src\s=\s($attval)[^>]>/igs);
196	my @usemap_matches = ($$textref =~ m/<img[^>]?usemap\s=\s($attval)[^>]>/igs);
197	my @link_matches = ($$textref =~ m/<link[^>]?href\s=\s($attval)[^>]>/igs);
198	my @embed_matches = ($$textref =~ m/<embed[^>]?src\s=\s($attval)[^>]>/igs);
199	my @tabbg_matches = ($$textref =~ m/<(?:body\|table\|tr\|td)[^>]?background\s=\s($attval)[^>]>/igs);
200	my @script_matches = ($$textref =~ m/<script[^>]?src\s=\s($attval)[^>]>/igs);
201
202	if(!defined $self->{'utf8_to_original_filename'}) {
203	# maps from utf8 converted link name -> original filename referrred to by (possibly URL-encoded) src url
204	$self->{'utf8_to_original_filename'} = {};
205	}
206
207	foreach my $link (@img_matches, @usemap_matches, @link_matches, @embed_matches, @tabbg_matches, @script_matches) {
208
209	# remove quotes from link at start and end if necessary
210	if ($link=~/^\"/) {
211	$link=~s/^\"//;
212	$link=~s/\"$//;
213	}
214
215	$link =~ s/\#.*$//s; # remove any anchor names, e.g. foo.html#name becomes foo.html
216	# some links may just be anchor names
217	next unless ($link =~ /\S+/);
218
219	if ($link !~ m@^/@ && $link !~ m/^([A-Z]:?)\\/) {
220	# Turn relative file path into full path
221	my $dirname = &File::Basename::dirname($filename_full_path);
222	$link = &util::filename_cat($dirname, $link);
223	}
224	$link = $self->eval_dir_dots($link);
225
226	# this is the actual filename on the filesystem (that the link refers to)
227	my $url_original_filename = $self->opt_url_decode($link);
228
229	# Convert the url_original_filename into its utf8 version. Store the utf8 link along with the url_original_filename
230	my $utf8_link = "";
231	$self->decode_text($link,$encoding,$language,\$utf8_link);
232
233	$self->{'utf8_to_original_filename'}->{$utf8_link} = $url_original_filename;
234	# print STDERR "**** utf8_encoded_link to original src filename:\n\t$utf8_link\n\t".$self->{'utf8_to_original_filename'}->{$utf8_link}."\n";
235
236	if ($url_original_filename ne $utf8_link) {
237	my $outhandle = $self->{'outhandle'};
238
239	print $outhandle "URL Encoding $url_original_filename\n";
240	print $outhandle " ->$utf8_link\n";
241	}
242
243	$block_hash->{'file_blocks'}->{$url_original_filename} = 1;
244	}
245	}
246
247	# Given a filename in any encoding, will URL decode it to get back the original filename
248	# in the original encoding. Because this method is intended to work out the original
249	# filename, it does not URL decode any filename if a file by the name of the URL-encoded*
250	# string already exists in the local folder.
251	# Return the original filename corresponding to the parameter URL-encoded filename, and
252	# a decoded flag that is set to true iff URL-decoding had to be applied.
253	sub opt_url_decode {
254	my $self = shift (@_);
255	my ($link) = @_;
256
257	# Replace %XX's in URL with decoded value if required.
258	# Note that the filename may include the %XX in some situations
259	if ($link =~ m/\%[A-F0-9]{2}/i) {
260	if (!-e $link) {
261	$link = &unicode::url_decode($link);
262	}
263	}
264
265	return $link;
266	}
267
268	sub read_into_doc_obj
269	{
270	my $self = shift (@_);
271	my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
272
273	my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
274	# get the input file
275	my $input_filename = $file;
276	my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
277	$suffix = lc($suffix);
278	my $tidy_filename;
279	if (($self->{'use_realistic_book'}) \|\| ($self->{'old_style_HDL'}))
280	{
281	# because the document has to be sectionalized set the description tags
282	$self->{'description_tags'} = 1;
283
284	# set the file to be tidied
285	$input_filename = &util::filename_cat($base_dir,$file) if $base_dir =~ m/\w/;
286
287	# get the tidied file
288	#my $tidy_filename = $self->tmp_tidy_file($input_filename);
289	$tidy_filename = $self->convert_tidy_or_oldHDL_file($input_filename);
290
291	# derive tmp filename from input filename
292	my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($tidy_filename, "\\.[^\\.]+\$");
293
294	# set the new input file and base_dir to be from the tidied file
295	$file = "$tailname$suffix";
296	$base_dir = $dirname;
297	}
298
299	# call the parent read_into_doc_obj
300	my ($process_status,$doc_obj) = $self->SUPER::read_into_doc_obj($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli);
301	if (($self->{'use_realistic_book'}) \|\| ($self->{'old_style_HDL'}))
302	{
303	# now we need to reset the filenames in the doc obj so that the converted filenames are not used
304	my $collect_file = &util::filename_within_collection($filename_full_path);
305	$doc_obj->set_source_filename ($collect_file, $self->{'file_rename_method'});
306	## set_source_filename does not set the doc_obj source_path which is used in archives dbs for incremental
307	# build. so set it manually.
308	$doc_obj->{'source_path'} = $filename_full_path;
309	my $collect_conv_file = &util::filename_within_collection($tidy_filename);
310	$doc_obj->set_converted_filename($collect_conv_file);
311	$self->set_Source_metadata($doc_obj, $filename_no_path);
312	}
313	return ($process_status,$doc_obj);
314	}
315
316	# do plugin specific processing of doc_obj
317	sub process {
318	my $self = shift (@_);
319	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
320	my $outhandle = $self->{'outhandle'};
321
322	if ($ENV{'GSDLOS'} =~ m/^windows/i) {
323	# this makes life so much easier... perl can cope with unix-style '/'s.
324	$base_dir =~ s@(\\)+@/@g;
325	$file =~ s@(\\)+@/@g;
326	}
327
328	# reset per-doc stuff...
329	$self->{'aux_files'} = {};
330	$self->{'dir_num'} = 0;
331	$self->{'file_num'} = 0;
332
333	# process an HTML file where sections are divided by headings tags (H1, H2 ...)
334	# you can also include metadata in the format (X can be any number)
335	# <hX>Title<!--gsdl-metadata
336	# <Metadata name="name1">value1</Metadata>
337	# ...
338	# <Metadata name="nameN">valueN</Metadata>
339	#--></hX>
340	if ($self->{'sectionalise_using_h_tags'}) {
341	# description_tags should allways be activated because we convert headings to description tags
342	$self->{'description_tags'} = 1;
343
344	my $arrSections = [];
345	$$textref =~ s/<h([0-9]+)[^>]>(.?)<\/h[0-9]+>/$self->process_heading($1, $2, $arrSections, $file)/isge;
346
347	if (scalar(@$arrSections)) {
348	my $strMetadata = $self->update_section_data($arrSections, -1);
349	if (length($strMetadata)) {
350	$strMetadata = '<!--' . $strMetadata . "\n-->\n</body>";
351	$$textref =~ s/<\/body>/$strMetadata/ig;
352	}
353	}
354	}
355
356	my $cursection = $doc_obj->get_top_section();
357
358	$self->extract_metadata ($textref, $metadata, $doc_obj, $cursection)
359	unless $self->{'no_metadata'} \|\| $self->{'description_tags'};
360
361	# Store URL for page as metadata - this can be used for an
362	# altavista style search interface. The URL won't be valid
363	# unless the file structure contains the domain name (i.e.
364	# like when w3mir is used to download a website).
365
366	# URL metadata (even invalid ones) are used to support internal
367	# links, so even if 'file_is_url' is off, still need to store info
368
369	my ($tailname,$dirname,$suffix) = &File::Basename::fileparse($file, "\\.[^\\.]+\$");
370	my $utf8_file = $self->filename_to_utf8_metadata($file);
371	$utf8_file =~ s/&\#095;/_/g;
372	my $web_url = "http://";
373	if(defined $dirname) { # local directory
374	# Check for "ftp" in the domain name of the directory
375	# structure to determine if this URL should be a ftp:// URL
376	# This check is not infallible, but better than omitting the
377	# check, which would cause all files downloaded from ftp sites
378	# via mirroring with wget to have potentially erroneous http:// URLs
379	# assigned in their metadata
380	if ($dirname =~ /^[^\/]*ftp/i)
381	{
382	$web_url = "ftp://";
383	}
384	$dirname = $self->eval_dir_dots($dirname);
385	$dirname .= &util::get_dirsep() if $dirname ne ""; # if there's a directory, it should end on "/"
386	$web_url = $web_url.$dirname.$utf8_file;
387	} else {
388	$web_url = $web_url.$utf8_file;
389	}
390	$web_url =~ s/\\/\//g;
391	$doc_obj->add_utf8_metadata($cursection, "URL", $web_url);
392
393	if ($self->{'file_is_url'}) {
394	$doc_obj->add_metadata($cursection, "weblink", "<a href=\"$web_url\">");
395	$doc_obj->add_metadata($cursection, "webicon", "_iconworld_");
396	$doc_obj->add_metadata($cursection, "/weblink", "</a>");
397	}
398
399	if ($self->{'description_tags'}) {
400	# remove the html header - note that doing this here means any
401	# sections defined within the header will be lost (so all <Section>
402	# tags must appear within the body of the HTML)
403	my ($head_keep) = ($$textref =~ m/^(.?)<body[^>]>/is);
404
405	$$textref =~ s/^.?<body[^>]>//is;
406	$$textref =~ s/(<\/body[^>]>\|<\/html[^>]>)//isg;
407
408	my $opencom = '(?:<!--\|<!(?:—\|\|--))';
409	my $closecom = '(?:-->\|(?:—\|\|--)>)';
410
411	my $lt = '(?:<\|<)';
412	my $gt = '(?:>\|>)';
413	my $quot = '(?:"\|"\|”\|“)';
414
415	my $dont_strip = '';
416	if ($self->{'no_strip_metadata_html'}) {
417	($dont_strip = $self->{'no_strip_metadata_html'}) =~ s{,}{\|}g;
418	}
419
420	my $found_something = 0; my $top = 1;
421	while ($$textref =~ s/^(.?)$opencom(.?)$closecom//s) {
422	my $text = $1;
423	my $comment = $2;
424	if (defined $text) {
425	# text before a comment - note that getting to here
426	# doesn't necessarily mean there are Section tags in
427	# the document
428	$self->process_section(\$text, $base_dir, $file, $doc_obj, $cursection);
429	}
430	while ($comment =~ s/$lt(.*?)$gt//s) {
431	my $tag = $1;
432	if ($tag eq "Section") {
433	$found_something = 1;
434	$cursection = $doc_obj->insert_section($doc_obj->get_end_child($cursection)) unless $top;
435	$top = 0;
436	} elsif ($tag eq "/Section") {
437	$found_something = 1;
438	$cursection = $doc_obj->get_parent_section ($cursection);
439	} elsif ($tag =~ m/^Metadata name=$quot(.*?)$quot/s) {
440	my $metaname = $1;
441	my $accumulate = $tag =~ m/mode=${quot}accumulate${quot}/ ? 1 : 0;
442	$comment =~ s/^(.*?)$lt\/Metadata$gt//s;
443	my $metavalue = $1;
444	$metavalue =~ s/^\s+//;
445	$metavalue =~ s/\s+$//;
446	# assume that no metadata value intentionally includes
447	# carriage returns or HTML tags (if they're there they
448	# were probably introduced when converting to HTML from
449	# some other format).
450	# actually some people want to have html tags in their
451	# metadata.
452	$metavalue =~ s/[\cJ\cM]/ /sg;
453	$metavalue =~ s/<[^>]+>//sg
454	unless $dont_strip && ($dont_strip eq 'all' \|\| $metaname =~ m/^($dont_strip)$/);
455	$metavalue =~ s/\s+/ /sg;
456	if ($metaname =~ /\./) { # has a namespace
457	$metaname = "ex.$metaname";
458	}
459	if ($accumulate) {
460	$doc_obj->add_utf8_metadata($cursection, $metaname, $metavalue);
461	} else {
462	$doc_obj->set_utf8_metadata_element($cursection, $metaname, $metavalue);
463	}
464	} elsif ($tag eq "Description" \|\| $tag eq "/Description") {
465	# do nothing with containing Description tags
466	} else {
467	# simple HTML tag (probably created by the conversion
468	# to HTML from some other format) - we'll ignore it and
469	# hope for the best ;-)
470	}
471	}
472	}
473	if ($cursection ne "") {
474	print $outhandle "HTMLPlugin: WARNING: $file contains unmatched <Section></Section> tags\n";
475	}
476
477	$$textref =~ s/^.?<body[^>]>//is;
478	$$textref =~ s/(<\/body[^>]>\|<\/html[^>]>)//isg;
479	if ($$textref =~ m/\S/) {
480	if (!$found_something) {
481	if ($self->{'verbosity'} > 2) {
482	print $outhandle "HTMLPlugin: WARNING: $file appears to contain no Section tags so\n";
483	print $outhandle " will be processed as a single section document\n";
484	}
485
486	# go ahead and process single-section document
487	$self->process_section($textref, $base_dir, $file, $doc_obj, $cursection);
488
489	# if document contains no Section tags we'll go ahead
490	# and extract metadata (this won't have been done
491	# above as the -description_tags option prevents it)
492	my $complete_text = $head_keep.$doc_obj->get_text($cursection);
493	$self->extract_metadata (\$complete_text, $metadata, $doc_obj, $cursection)
494	unless $self->{'no_metadata'};
495
496	} else {
497	print $outhandle "HTMLPlugin: WARNING: $file contains the following text outside\n";
498	print $outhandle " of the final closing </Section> tag. This text will\n";
499	print $outhandle " be ignored.";
500
501	my ($text);
502	if (length($$textref) > 30) {
503	$text = substr($$textref, 0, 30) . "...";
504	} else {
505	$text = $$textref;
506	}
507	$text =~ s/\n/ /isg;
508	print $outhandle " ($text)\n";
509	}
510	} elsif (!$found_something) {
511
512	if ($self->{'verbosity'} > 2) {
513	# may get to here if document contained no valid Section
514	# tags but did contain some comments. The text will have
515	# been processed already but we should print the warning
516	# as above and extract metadata
517	print $outhandle "HTMLPlugin: WARNING: $file appears to contain no Section tags and\n";
518	print $outhandle " is blank or empty. Metadata will be assigned if present.\n";
519	}
520
521	my $complete_text = $head_keep.$doc_obj->get_text($cursection);
522	$self->extract_metadata (\$complete_text, $metadata, $doc_obj, $cursection)
523	unless $self->{'no_metadata'};
524	}
525
526	} else {
527
528	# remove header and footer
529	if (!$self->{'keep_head'} \|\| $self->{'description_tags'}) {
530	$$textref =~ s/^.?<body[^>]>//is;
531	$$textref =~ s/(<\/body[^>]>\|<\/html[^>]>)//isg;
532	}
533
534	# single section document
535	$self->process_section($textref, $base_dir, $file, $doc_obj, $cursection);
536	}
537	return 1;
538	}
539
540
541	sub process_heading
542	{
543	my ($self, $nHeadNo, $strHeadingText, $arrSections, $file) = @_;
544	$strHeadingText = '' if (!defined($strHeadingText));
545
546	my $strMetadata = $self->update_section_data($arrSections, int($nHeadNo));
547
548	my $strSecMetadata = '';
549	while ($strHeadingText =~ s/<!--gsdl-metadata(.*?)-->//is)
550	{
551	$strSecMetadata .= $1;
552	}
553
554	$strHeadingText =~ s/^\s+//g;
555	$strHeadingText =~ s/\s+$//g;
556	$strSecMetadata =~ s/^\s+//g;
557	$strSecMetadata =~ s/\s+$//g;
558
559	$strMetadata .= "\n<Section>\n\t<Description>\n\t\t<Metadata name=\"Title\">" . $strHeadingText . "</Metadata>\n";
560
561	if (length($strSecMetadata)) {
562	$strMetadata .= "\t\t" . $strSecMetadata . "\n";
563	}
564
565	$strMetadata .= "\t</Description>\n";
566
567	return "<!--" . $strMetadata . "-->";
568	}
569
570
571	sub update_section_data
572	{
573	my ($self, $arrSections, $nCurTocNo) = @_;
574	my ($strBuffer, $nLast, $nSections) = ('', 0, scalar(@$arrSections));
575
576	if ($nSections == 0) {
577	push @$arrSections, $nCurTocNo;
578	return $strBuffer;
579	}
580	$nLast = $arrSections->[$nSections - 1];
581	if ($nCurTocNo > $nLast) {
582	push @$arrSections, $nCurTocNo;
583	return $strBuffer;
584	}
585	for(my $i = $nSections - 1; $i >= 0; $i--) {
586	if ($nCurTocNo <= $arrSections->[$i]) {
587	$strBuffer .= "\n</Section>";
588	pop @$arrSections;
589	}
590	}
591	push @$arrSections, $nCurTocNo;
592	return $strBuffer;
593	}
594
595
596	# note that process_section may be called multiple times for a single
597	# section (relying on the fact that add_utf8_text appends the text to any
598	# that may exist already).
599	sub process_section {
600	my $self = shift (@_);
601	my ($textref, $base_dir, $file, $doc_obj, $cursection) = @_;
602	# trap links
603	if (!$self->{'nolinks'}) {
604	# usemap="./#index" not handled correctly => change to "#index"
605	## $$textref =~ s/(<img[^>]?usemap\s=\s[\"\']?)([^\"\'>\s]+)([\"\']?[^>]>)/
606
607	$$textref =~ s/(<img[^>]?usemap\s=\s)((?:[\"][^\"]+[\"])\|(?:[\'][^\']+[\'])\|(?:[^\s\/>]+))([^>]>)/
608	$self->replace_usemap_links($1, $2, $3)/isge;
609
610	## $$textref =~ s/(<(?:a\|area\|frame\|link\|script)\s+[^>]?\s(?:href\|src)\s=\s[\"\']?)([^\"\'>\s]+)([\"\']?[^>]*>)/
611
612	$$textref =~ s/(<(?:a\|area\|frame\|link\|script)\s+[^>]?\s(?:href\|src)\s=\s)((?:[\"][^\"]+[\"])\|(?:[\'][^\']+[\'])\|(?:[^\s\/>]+))([^>]*>)/
613	$self->replace_href_links ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
614	}
615
616	# trap images
617
618	# Previously, by default, HTMLPlugin would embed <img> tags inside anchor tags
619	# i.e. <a href="image><img src="image"></a> in order to overcome a problem that
620	# turned regular text succeeding images into links. That is, by embedding <imgs>
621	# inside <a href=""></a>, the text following images were no longer misbehaving.
622	# However, there would be many occasions whereby images were not meant to link
623	# to their source images but where the images would link to another web page.
624	# To allow this, the no_image_links option was introduced: it would prevent
625	# the behaviour of embedding images into links that referenced the source images.
626
627	# Somewhere along the line, the problem of normal text turning into links when
628	# such text followed images which were not embedded in <a href=""></a> ceased
629	# to occur. This is why the following lines have been commented out (as well as
630	# two lines in replace_images). They appear to no longer apply.
631
632	# If at any time, there is a need for having images embedded in <a> anchor tags,
633	# then it might be better to turn that into an HTMLPlugin option rather than make
634	# it the default behaviour. Also, eventually, no_image_links needs to become
635	# a deprecated option for HTMLPlugin as it has now become the default behaviour.
636
637	#if(!$self->{'no_image_links'}){
638	$$textref =~ s/(<(?:img\|embed\|table\|tr\|td)[^>]?(?:src\|background)\s=\s)((?:[\"][^\"]+[\"])\|(?:[\'][^\']+[\'])\|(?:[^\s\/>]+))([^>]>)/
639	$self->replace_images ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
640	#}
641
642	# add text to document object
643	# turn \ into \\ so that the rest of greenstone doesn't think there
644	# is an escape code following. (Macro parsing loses them...)
645	$$textref =~ s/\\/\\\\/go;
646
647	$doc_obj->add_utf8_text($cursection, $$textref);
648	}
649
650	sub replace_images {
651	my $self = shift (@_);
652	my ($front, $link, $back, $base_dir,
653	$file, $doc_obj, $section) = @_;
654
655	# remove quotes from link at start and end if necessary
656	if ($link=~/^[\"\']/) {
657	$link=~s/^[\"\']//;
658	$link=~s/[\"\']$//;
659	$front.='"';
660	$back="\"$back";
661	}
662
663	$link =~ s/\n/ /g;
664
665	# Hack to overcome Windows wv 0.7.1 bug that causes embedded images to be broken
666	# If the Word file path has spaces in it, wv messes up and you end up with
667	# absolute paths for the images, and without the "file://" prefix
668	# So check for this special case and massage the data to be correct
669	if ($ENV{'GSDLOS'} =~ m/^windows/i && $self->{'plugin_type'} eq "WordPlug" && $link =~ m/^[A-Za-z]\:\\/) {
670	$link =~ s/^.*\\([^\\]+)$/$1/;
671	}
672
673	my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
674
675	my $img_file = $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section);
676
677	# print STDERR "** link = $link\n href = $href\n** img_file = $img_file, rl = $rl\n";
678
679	my $anchor_name = $img_file;
680	#$anchor_name =~ s/^.*\///;
681	#$anchor_name = "<a name=\"$anchor_name\" ></a>";
682
683	my $image_link = $front . $img_file .$back;
684	return $image_link;
685
686	# The reasons for why the following two lines are no longer necessary can be
687	# found in subroutine process_section
688	#my $anchor_link = "<a href=\"$img_file\" >".$image_link."</a>";
689	#return $anchor_link;
690
691	#return $front . $img_file . $back . $anchor_name;
692	}
693
694	sub replace_href_links {
695	my $self = shift (@_);
696	my ($front, $link, $back, $base_dir, $file, $doc_obj, $section) = @_;
697
698	# remove quotes from link at start and end if necessary
699	if ($link=~/^[\"\']/) {
700	$link=~s/^[\"\']//;
701	$link=~s/[\"\']$//;
702	$front.='"';
703	$back="\"$back";
704	}
705
706	# attempt to sort out targets - frames are not handled
707	# well in this plugin and some cases will screw things
708	# up - e.g. the _parent target (so we'll just remove
709	# them all ;-)
710	$front =~ s/(target=\"?)_top(\"?)/$1_gsdltop_$2/is;
711	$back =~ s/(target=\"?)_top(\"?)/$1_gsdltop_$2/is;
712	$front =~ s/target=\"?_parent\"?//is;
713	$back =~ s/target=\"?_parent\"?//is;
714
715	return $front . $link . $back if $link =~ m/^\#/s;
716	$link =~ s/\n/ /g;
717
718	# Find file referred to by $link on file system
719	# This is more complicated than it sounds when char encodings
720	# is taken in to account
721	my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
722
723	# href may use '\'s where '/'s should be on Windows
724	$href =~ s/\\/\//g;
725	my ($filename) = $href =~ m/^(?:.?):(?:\/\/)?(.)/;
726
727
728	##### leave all these links alone (they won't be picked up by intermediate
729	##### pages). I think that's safest when dealing with frames, targets etc.
730	##### (at least until I think of a better way to do it). Problems occur with
731	##### mailto links from within small frames, the intermediate page is displayed
732	##### within that frame and can't be seen. There is still potential for this to
733	##### happen even with html pages - the solution seems to be to somehow tell
734	##### the browser from the server side to display the page being sent (i.e.
735	##### the intermediate page) in the top level window - I'm not sure if that's
736	##### possible - the following line should probably be deleted if that can be done
737	return $front . $link . $back if $href =~ m/^(mailto\|news\|gopher\|nntp\|telnet\|javascript):/is;
738
739
740	if (($rl == 0) \|\| ($filename =~ m/$self->{'process_exp'}/) \|\|
741	($href =~ m/\/$/) \|\| ($href =~ m/^(mailto\|news\|gopher\|nntp\|telnet\|javascript):/i)) {
742	&ghtml::urlsafe ($href);
743	return $front . "_httpextlink_&rl=" . $rl . "&href=" . $href . $hash_part . $back;
744	} else {
745	# link is to some other type of file (eg image) so we'll
746	# need to associate that file
747	return $front . $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section) . $back;
748	}
749	}
750
751	sub add_file {
752	my $self = shift (@_);
753	my ($href, $rl, $hash_part, $base_dir, $doc_obj, $section) = @_;
754	my ($newname);
755
756	my $filename = $href;
757	if ($base_dir eq "") {
758	# remove http:/ thereby leaving one slash at the start
759	$filename =~ s/^[^:]*:\///;
760	}
761	else {
762	# remove http://
763	$filename =~ s/^[^:]*:\/\///;
764	}
765
766	$filename = &util::filename_cat($base_dir, $filename);
767	if (($self->{'use_realistic_book'}) \|\| ($self->{'old_style_HDL'})) {
768	# we are processing a tidytmp file - want paths to be in import
769	$filename =~ s/([\\\/])tidytmp([\\\/])/$1import$2/;
770	}
771	# Replace %XX's in URL with decoded value if required. Note that the filename may include the %XX in some
772	# situations. If the original file's name was in URL encoding, the following method will not decode it.
773	my $utf8_filename = $filename;
774	$filename = $self->opt_url_decode($utf8_filename);
775
776	# some special processing if the intended filename was converted to utf8, but
777	# the actual file still needs to be renamed
778	if (!-e $filename) {
779	# try the original filename stored in map
780	my $original_filename = $self->{'utf8_to_original_filename'}->{$filename};
781	if (defined $original_filename && -e $original_filename) {
782	$filename = $original_filename;
783	}
784	}
785
786	my ($ext) = $filename =~ m/(\.[^\.]*)$/;
787
788	if ($rl == 0) {
789	if ((!defined $ext) \|\| ($ext !~ m/$self->{'assoc_files'}/)) {
790	return "_httpextlink_&rl=0&el=prompt&href=" . $href . $hash_part;
791	}
792	else {
793	return "_httpextlink_&rl=0&el=direct&href=" . $href . $hash_part;
794	}
795	}
796
797	if ((!defined $ext) \|\| ($ext !~ m/$self->{'assoc_files'}/)) {
798	return "_httpextlink_&rl=" . $rl . "&href=" . $href . $hash_part;
799	}
800	# add the original image file as a source file
801	if (!$self->{'processing_tmp_files'} ) {
802	$doc_obj->associate_source_file($filename);
803	}
804	if ($self->{'rename_assoc_files'}) {
805	if (defined $self->{'aux_files'}->{$href}) {
806	$newname = $self->{'aux_files'}->{$href}->{'dir_num'} . "/" .
807	$self->{'aux_files'}->{$href}->{'file_num'} . $ext;
808	} else {
809	$newname = $self->{'dir_num'} . "/" . $self->{'file_num'} . $ext;
810	$self->{'aux_files'}->{$href} = {'dir_num' => $self->{'dir_num'}, 'file_num' => $self->{'file_num'}};
811	$self->inc_filecount ();
812	}
813	$doc_obj->associate_file($filename, $newname, undef, $section);
814	return "_httpdocimg_/$newname";
815	} else {
816	if(&unicode::is_url_encoded($utf8_filename)) {
817	# use the possibly-decoded filename instead to avoid double URL encoding
818	($newname) = $filename =~ m/([^\/\\]*)$/;
819	} else {
820	($newname) = $utf8_filename =~ m/([^\/\\]*)$/;
821	}
822
823	# Make sure this name uses only ASCII characters.
824	# We use either base64 or URL encoding, as these preserve original encoding
825	$newname = &util::rename_file($newname, $self->{'file_rename_method'});
826
827	$doc_obj->associate_file($filename, $newname, undef, $section);
828
829	# Since the generated image will be URL-encoded to avoid file-system/browser mess-ups
830	# of filenames, URL-encode the additional percent signs of the URL-encoded filename
831	my $newname_url = $newname;
832	$newname_url = &unicode::filename_to_url($newname_url);
833	return "_httpdocimg_/$newname_url";
834	}
835	}
836
837
838	sub format_link {
839	my $self = shift (@_);
840	my ($link, $base_dir, $file) = @_;
841
842	my ($before_hash, $hash_part) = $link =~ m/^([^\#])(\#?.)$/;
843
844	$hash_part = "" if !defined $hash_part;
845	if (!defined $before_hash \|\| $before_hash !~ m/[\w\.\/]/) {
846	my $outhandle = $self->{'outhandle'};
847	print $outhandle "HTMLPlugin: ERROR - badly formatted tag ignored ($link)\n"
848	if $self->{'verbosity'};
849	return ($link, "", 0);
850	}
851
852	if ($before_hash =~ s@^((?:http\|https\|ftp\|file\|mms)://)@@i) {
853	my $type = $1;
854
855	if ($link =~ m/^(http\|ftp):/i) {
856	# Turn url (using /) into file name (possibly using \ on windows)
857	my @http_dir_split = split('/', $before_hash);
858	$before_hash = &util::filename_cat(@http_dir_split);
859	}
860
861	$before_hash = $self->eval_dir_dots($before_hash);
862
863	my $linkfilename = &util::filename_cat ($base_dir, $before_hash);
864
865	my $rl = 0;
866	$rl = 1 if (-e $linkfilename);
867
868	# make sure there's a slash on the end if it's a directory
869	if ($before_hash !~ m/\/$/) {
870	$before_hash .= "/" if (-d $linkfilename);
871	}
872	return ($type . $before_hash, $hash_part, $rl);
873
874	} elsif ($link !~ m/^(mailto\|news\|gopher\|nntp\|telnet\|javascript):/i && $link !~ m/^\//) {
875
876	if ($before_hash =~ s@^/@@ \|\| $before_hash =~ m/\\/) {
877
878	# the first directory will be the domain name if file_is_url
879	# to generate archives, otherwise we'll assume all files are
880	# from the same site and base_dir is the root
881
882	if ($self->{'file_is_url'}) {
883	my @dirs = split /[\/\\]/, $file;
884	my $domname = shift (@dirs);
885	$before_hash = &util::filename_cat($domname, $before_hash);
886	$before_hash =~ s@\\@/@g; # for windows
887	}
888	else
889	{
890	# see if link shares directory with source document
891	# => turn into relative link if this is so!
892
893	if ($ENV{'GSDLOS'} =~ m/^windows/i) {
894	# too difficult doing a pattern match with embedded '\'s...
895	my $win_before_hash=$before_hash;
896	$win_before_hash =~ s@(\\)+@/@g;
897	# $base_dir is already similarly "converted" on windows.
898	if ($win_before_hash =~ s@^$base_dir/@@o) {
899	# if this is true, we removed a prefix
900	$before_hash=$win_before_hash;
901	}
902	}
903	else {
904	# before_hash has lost leading slash by this point,
905	# -> add back in prior to substitution with $base_dir
906	$before_hash = "/$before_hash";
907
908	$before_hash = &util::filename_cat("",$before_hash);
909	$before_hash =~ s@^$base_dir/@@;
910	}
911	}
912	} else {
913	# Turn relative file path into full path
914	my $dirname = &File::Basename::dirname($file);
915	$before_hash = &util::filename_cat($dirname, $before_hash);
916	$before_hash = $self->eval_dir_dots($before_hash);
917	}
918
919	my $linkfilename = &util::filename_cat ($base_dir, $before_hash);
920	# make sure there's a slash on the end if it's a directory
921	if ($before_hash !~ m/\/$/) {
922	$before_hash .= "/" if (-d $linkfilename);
923	}
924	return ("http://" . $before_hash, $hash_part, 1);
925	} else {
926	# mailto, news, nntp, telnet, javascript or gopher link
927	return ($before_hash, "", 0);
928	}
929	}
930
931	sub extract_first_NNNN_characters {
932	my $self = shift (@_);
933	my ($textref, $doc_obj, $thissection) = @_;
934
935	foreach my $size (split /,/, $self->{'first'}) {
936	my $tmptext = $$textref;
937	# skip to the body
938	$tmptext =~ s/.<body[^>]>//i;
939	# remove javascript
940	$tmptext =~ s@<script.*?</script>@ @sig;
941	$tmptext =~ s/<[^>]*>/ /g;
942	$tmptext =~ s/ / /g;
943	$tmptext =~ s/^\s+//;
944	$tmptext =~ s/\s+$//;
945	$tmptext =~ s/\s+/ /gs;
946	$tmptext = &unicode::substr ($tmptext, 0, $size);
947	$tmptext =~ s/\s\S*$/…/; # adds an ellipse (...)
948	$doc_obj->add_utf8_metadata ($thissection, "First$size", $tmptext);
949	}
950	}
951
952
953	sub extract_metadata {
954	my $self = shift (@_);
955	my ($textref, $metadata, $doc_obj, $section) = @_;
956	my $outhandle = $self->{'outhandle'};
957	# if we don't want metadata, we may as well not be here ...
958	return if (!defined $self->{'metadata_fields'});
959
960	my $separator = $self->{'metadata_field_separator'};
961	if ($separator eq "") {
962	undef $separator;
963	}
964
965	# metadata fields to extract/save. 'key' is the (lowercase) name of the
966	# html meta, 'value' is the metadata name for greenstone to use
967	my %find_fields = ();
968
969	my %creator_fields = (); # short-cut for lookups
970
971
972	foreach my $field (split /,/, $self->{'metadata_fields'}) {
973	$field =~ s/^\s+//; # remove leading whitespace
974	$field =~ s/\s+$//; # remove trailing whitespace
975
976	# support tag<tagname>
977	if ($field =~ m/^(.?)\s<(.*?)>$/) {
978	# "$2" is the user's preferred gs metadata name
979	$find_fields{lc($1)}=$2; # lc = lowercase
980	} else { # no <tagname> for mapping
981	# "$field" is the user's preferred gs metadata name
982	$find_fields{lc($field)}=$field; # lc = lowercase
983	}
984	}
985
986	if (defined $self->{'hunt_creator_metadata'} &&
987	$self->{'hunt_creator_metadata'} == 1 ) {
988	my @extra_fields =
989	(
990	'author',
991	'author.email',
992	'creator',
993	'dc.creator',
994	'dc.creator.corporatename',
995	);
996
997	# add the creator_metadata fields to search for
998	foreach my $field (@extra_fields) {
999	$creator_fields{$field}=0; # add to lookup hash
1000	}
1001	}
1002
1003
1004	# find the header in the html file, which has the meta tags
1005	$$textref =~ m@<head>(.*?)</head>@si;
1006
1007	my $html_header=$1;
1008
1009	# go through every <meta... tag defined in the html and see if it is
1010	# one of the tags we want to match.
1011
1012	# special case for title - we want to remember if its been found
1013	my $found_title = 0;
1014	# this assumes that ">" won't appear. (I don't think it's allowed to...)
1015	$html_header =~ m/^/; # match the start of the string, for \G assertion
1016
1017	while ($html_header =~ m/\G.?<meta(.?)>/sig) {
1018	my $metatag=$1;
1019	my ($tag, $value);
1020
1021	# find the tag name
1022	$metatag =~ m/(?:name\|http-equiv)\s=\s([\"\'])?(.*?)\1/is;
1023	$tag=$2;
1024	# in case they're not using " or ', but they should...
1025	if (! $tag) {
1026	$metatag =~ m/(?:name\|http-equiv)\s=\s([^\s\>]+)/is;
1027	$tag=$1;
1028	}
1029
1030	if (!defined $tag) {
1031	print $outhandle "HTMLPlugin: can't find NAME in \"$metatag\"\n";
1032	next;
1033	}
1034
1035	# don't need to assign this field if it was passed in from a previous
1036	# (recursive) plugin
1037	if (defined $metadata->{$tag}) {next}
1038
1039	# find the tag content
1040	$metatag =~ m/content\s=\s([\"\'])?(.*?)\1/is;
1041	$value=$2;
1042
1043	if (! $value) {
1044	$metatag =~ m/(?:name\|http-equiv)\s=\s([^\s\>]+)/is;
1045	$value=$1;
1046	}
1047	if (!defined $value) {
1048	print $outhandle "HTMLPlugin: can't find VALUE in \"$metatag\"\n";
1049	next;
1050	}
1051
1052	# clean up and add
1053	$value =~ s/\s+/ /gs;
1054	chomp($value); # remove trailing \n, if any
1055	if (exists $creator_fields{lc($tag)}) {
1056	# map this value onto greenstone's "Creator" metadata
1057	$tag='Creator';
1058	} elsif (!exists $find_fields{lc($tag)}) {
1059	next; # don't want this tag
1060	} else {
1061	# get the user's preferred capitalisation
1062	$tag = $find_fields{lc($tag)};
1063	}
1064	if (lc($tag) eq "title") {
1065	$found_title = 1;
1066	}
1067
1068	if ($self->{'verbosity'} > 2) {
1069	print $outhandle " extracted \"$tag\" metadata \"$value\"\n";
1070	}
1071
1072	if ($tag =~ /\./) {
1073	# there is a . so has a namespace, add ex.
1074	$tag = "ex.$tag";
1075	}
1076	if (defined $separator) {
1077	my @values = split($separator, $value);
1078	foreach my $v (@values) {
1079	$doc_obj->add_utf8_metadata($section, $tag, $v) if $v =~ /\S/;
1080	}
1081	}
1082	else {
1083	$doc_obj->add_utf8_metadata($section, $tag, $value);
1084	}
1085	}
1086
1087	# TITLE: extract the document title
1088	if (exists $find_fields{'title'} && !$found_title) {
1089	# we want a title, and didn't find one in the meta tags
1090	# see if there's a <title> tag
1091	my $title;
1092	my $from = ""; # for debugging output only
1093	if ($html_header =~ m/<title[^>]>([^<]+)<\/title[^>]>/is) {
1094	$title = $1;
1095	$from = "<title> tags";
1096	}
1097
1098	if (!defined $title) {
1099	$from = "first 100 chars";
1100	# if no title use first 100 or so characters
1101	$title = $$textref;
1102	$title =~ s/^\xFE\xFF//; # Remove unicode byte order mark
1103	$title =~ s/^.*?<body>//si;
1104	# ignore javascript!
1105	$title =~ s@<script.*?</script>@ @sig;
1106	$title =~ s/<\/([^>]+)><\1>//g; # (eg) </b><b> - no space
1107	$title =~ s/<[^>]*>/ /g; # remove all HTML tags
1108	$title = substr ($title, 0, 100);
1109	$title =~ s/\s\S*$/.../;
1110	}
1111	$title =~ s/<[^>]*>/ /g; # remove html tags
1112	$title =~ s/ / /g;
1113	$title =~ s/(?: \|\xc2\xa0)/ /g; # utf-8 for nbsp...
1114	$title =~ s/\s+/ /gs; # collapse multiple spaces
1115	$title =~ s/^\s*//; # remove leading spaces
1116	$title =~ s/\s*$//; # remove trailing spaces
1117
1118	$title =~ s/^$self->{'title_sub'}// if ($self->{'title_sub'});
1119	$title =~ s/^\s+//s; # in case title_sub introduced any...
1120	$doc_obj->add_utf8_metadata ($section, 'Title', $title);
1121	print $outhandle " extracted Title metadata \"$title\" from $from\n"
1122	if ($self->{'verbosity'} > 2);
1123	}
1124
1125	# add FileFormat metadata
1126	$doc_obj->add_metadata($section,"FileFormat", "HTML");
1127
1128	# Special, for metadata names such as tagH1 - extracts
1129	# the text between the first <H1> and </H1> tags into "H1" metadata.
1130
1131	foreach my $field (keys %find_fields) {
1132	if ($field !~ m/^tag([a-z0-9]+)$/i) {next}
1133	my $tag = $1;
1134	if ($$textref =~ m@<$tag[^>]>(.?)</$tag[^>]*>@g) {
1135	my $content = $1;
1136	$content =~ s/ / /g;
1137	$content =~ s/<[^>]*>/ /g;
1138	$content =~ s/^\s+//;
1139	$content =~ s/\s+$//;
1140	$content =~ s/\s+/ /gs;
1141	if ($content) {
1142	$tag=$find_fields{"tag$tag"}; # get the user's capitalisation
1143	$tag =~ s/^tag//i;
1144	$doc_obj->add_utf8_metadata ($section, $tag, $content);
1145	print $outhandle " extracted \"$tag\" metadata \"$content\"\n"
1146	if ($self->{'verbosity'} > 2);
1147	}
1148	}
1149	}
1150	}
1151
1152
1153	# evaluate any "../" to next directory up
1154	# evaluate any "./" as here
1155	sub eval_dir_dots {
1156	my $self = shift (@_);
1157	my ($filename) = @_;
1158	my $dirsep_os = &util::get_os_dirsep();
1159	my @dirsep = split(/$dirsep_os/,$filename);
1160
1161	my @eval_dirs = ();
1162	foreach my $d (@dirsep) {
1163	if ($d eq "..") {
1164	pop(@eval_dirs);
1165
1166	} elsif ($d eq ".") {
1167	# do nothing!
1168
1169	} else {
1170	push(@eval_dirs,$d);
1171	}
1172	}
1173
1174	# Need to fiddle with number of elements in @eval_dirs if the
1175	# first one is the empty string. This is because of a
1176	# modification to util::filename_cat that supresses the addition
1177	# of a leading '/' character (or \ if windows) (intended to help
1178	# filename cat with relative paths) if the first entry in the
1179	# array is the empty string. Making the array start with two
1180	# empty strings is a way to defeat this "smart" option.
1181	#
1182	if (scalar(@eval_dirs) > 0) {
1183	if ($eval_dirs[0] eq ""){
1184	unshift(@eval_dirs,"");
1185	}
1186	}
1187
1188	my $evaluated_filename = (scalar @eval_dirs > 0) ? &util::filename_cat(@eval_dirs) : "";
1189	return $evaluated_filename;
1190	}
1191
1192	sub replace_usemap_links {
1193	my $self = shift (@_);
1194	my ($front, $link, $back) = @_;
1195
1196	# remove quotes from link at start and end if necessary
1197	if ($link=~/^[\"\']/) {
1198	$link=~s/^[\"\']//;
1199	$link=~s/[\"\']$//;
1200	$front.='"';
1201	$back="\"$back";
1202	}
1203
1204	$link =~ s/^\.\///;
1205	return $front . $link . $back;
1206	}
1207
1208	sub inc_filecount {
1209	my $self = shift (@_);
1210
1211	if ($self->{'file_num'} == 1000) {
1212	$self->{'dir_num'} ++;
1213	$self->{'file_num'} = 0;
1214	} else {
1215	$self->{'file_num'} ++;
1216	}
1217	}
1218
1219
1220	# Extend read_file so that strings like é are
1221	# converted to UTF8 internally.
1222	#
1223	# We don't convert < or > or & or " in case
1224	# they interfere with the GML files
1225
1226	sub read_file {
1227	my $self = shift(@_);
1228	my ($filename, $encoding, $language, $textref) = @_;
1229
1230	$self->SUPER::read_file($filename, $encoding, $language, $textref);
1231
1232	# Convert entities to their UTF8 equivalents
1233	$$textref =~ s/&(lt\|gt\|amp\|quot\|nbsp);/&z$1;/go;
1234	$$textref =~ s/&([^;]+);/&ghtml::getcharequiv($1,1,1)/gseo;
1235	$$textref =~ s/&z(lt\|gt\|amp\|quot\|nbsp);/&$1;/go;
1236
1237	}
1238
1239	sub HB_read_html_file {
1240	my $self = shift (@_);
1241	my ($htmlfile, $text) = @_;
1242
1243	# load in the file
1244	if (!open (FILE, $htmlfile)) {
1245	print STDERR "ERROR - could not open $htmlfile\n";
1246	return;
1247	}
1248
1249	my $foundbody = 0;
1250	$self->HB_gettext (\$foundbody, $text, "FILE");
1251	close FILE;
1252
1253	# just in case there was no <body> tag
1254	if (!$foundbody) {
1255	$foundbody = 1;
1256	open (FILE, $htmlfile) \|\| return;
1257	$self->HB_gettext (\$foundbody, $text, "FILE");
1258	close FILE;
1259	}
1260	# text is in utf8
1261	}
1262
1263	# converts the text to utf8, as ghtml does that for é etc.
1264	sub HB_gettext {
1265	my $self = shift (@_);
1266	my ($foundbody, $text, $handle) = @_;
1267
1268	my $line = "";
1269	while (defined ($line = <$handle>)) {
1270	# look for body tag
1271	if (!$$foundbody) {
1272	if ($line =~ s/^.<body[^>]>//i) {
1273	$$foundbody = 1;
1274	} else {
1275	next;
1276	}
1277	}
1278
1279	# check for symbol fonts
1280	if ($line =~ m/<font [^>]?face\s=\s*\"?(\w+)\"?/i) {
1281	my $font = $1;
1282	print STDERR "HBPlug::HB_gettext - warning removed font $font\n"
1283	if ($font !~ m/^arial$/i);
1284	}
1285
1286	$$text .= $line;
1287	}
1288
1289	if ($self->{'input_encoding'} eq "iso_8859_1") {
1290	# convert to utf-8
1291	$$text=&unicode::unicode2utf8(&unicode::convert2unicode("iso_8859_1", $text));
1292	}
1293	# convert any alphanumeric character entities to their utf-8
1294	# equivalent for indexing purposes
1295	#&ghtml::convertcharentities ($$text);
1296
1297	$$text =~ s/\s+/ /g; # remove \n's
1298
1299	# At this point $$text is a binary byte string
1300	# => turn it into a Unicode aware string, so full
1301	# Unicode aware pattern matching can be used.
1302	# For instance: 's/\x{0101}//g' or '[[:upper:]]'
1303	#
1304
1305	$$text = decode("utf8",$$text);
1306	}
1307
1308	sub HB_clean_section {
1309	my $self = shift (@_);
1310	my ($section) = @_;
1311
1312	# remove tags without a starting tag from the section
1313	my ($tag, $tagstart);
1314	while ($section =~ m/<\/([^>]{1,10})>/) {
1315	$tag = $1;
1316	$tagstart = index($section, "<$tag");
1317	last if (($tagstart >= 0) && ($tagstart < index($section, "<\/$tag")));
1318	$section =~ s/<\/$tag>//;
1319	}
1320
1321	# remove extra paragraph tags
1322	while ($section =~ s/<p\b[^>]>\s<p\b/<p/ig) {}
1323
1324	# remove extra stuff at the end of the section
1325	while ($section =~ s/(<u>\|<i>\|<b>\|<p\b[^>]*>\| \|\s)$//i) {}
1326
1327	# add a newline at the beginning of each paragraph
1328	$section =~ s/(.)\s*<p\b/$1\n\n<p/gi;
1329
1330	# add a newline every 80 characters at a word boundary
1331	# Note: this regular expression puts a line feed before
1332	# the last word in each section, even when it is not
1333	# needed.
1334	$section =~ s/(.{1,80})\s/$1\n/g;
1335
1336	# fix up the image links
1337	$section =~ s/<img[^>]?src=\"?([^\">]+)\"?[^>]>/
1338	<center><img src=\"$1\" \/><\/center><br\/>/ig;
1339	$section =~ s/<<I>>\s*([^\.]+\.(png\|jpg\|gif))/
1340	<center><img src=\"$1\" \/><\/center><br\/>/ig;
1341
1342	return $section;
1343	}
1344
1345	# Will convert the oldHDL format to the new HDL format (using the Section tag)
1346	sub convert_to_newHDLformat
1347	{
1348	my $self = shift (@_);
1349	my ($file,$cnfile) = @_;
1350	my $input_filename = $file;
1351	my $tmp_filename = $cnfile;
1352
1353	# write HTML tmp file with new HDL format
1354	open (PROD, ">$tmp_filename") \|\| die("Error Writing to File: $tmp_filename $!");
1355
1356	# read in the file and do basic html cleaning (removing header etc)
1357	my $html = "";
1358	$self->HB_read_html_file ($input_filename, \$html);
1359
1360	# process the file one section at a time
1361	my $curtoclevel = 1;
1362	my $firstsection = 1;
1363	my $toclevel = 0;
1364	while (length ($html) > 0) {
1365	if ($html =~ s/^.?(?:<p\b[^>]>)?((<b>\|<i>\|<u>\|\s))<<TOC(\d+)>>\s(.*?)<p\b/<p/i) {
1366	$toclevel = $3;
1367	my $title = $4;
1368	my $sectiontext = "";
1369	if ($html =~ s/^(.?)((?:<p\b[^>]>)?((<b>\|<i>\|<u>\|\s)*)<<TOC\d+>>)/$2/i) {
1370	$sectiontext = $1;
1371	} else {
1372	$sectiontext = $html;
1373	$html = "";
1374	}
1375
1376	# remove tags and extra spaces from the title
1377	$title =~ s/<\/?[^>]+>//g;
1378	$title =~ s/^\s+\|\s+$//g;
1379
1380	# close any sections below the current level and
1381	# create a new section (special case for the firstsection)
1382	print PROD "<!--\n";
1383	while (($curtoclevel > $toclevel) \|\|
1384	(!$firstsection && $curtoclevel == $toclevel)) {
1385	$curtoclevel--;
1386	print PROD "</Section>\n";
1387	}
1388	if ($curtoclevel+1 < $toclevel) {
1389	print STDERR "WARNING - jump in toc levels in $input_filename " .
1390	"from $curtoclevel to $toclevel\n";
1391	}
1392	while ($curtoclevel < $toclevel) {
1393	$curtoclevel++;
1394	}
1395
1396	if ($curtoclevel == 1) {
1397	# add the header tag
1398	print PROD "-->\n";
1399	print PROD "<HTML>\n<HEAD>\n<TITLE>$title</TITLE>\n</HEAD>\n<BODY>\n";
1400	print PROD "<!--\n";
1401	}
1402
1403	print PROD "<Section>\n\t<Description>\n\t\t<Metadata name=\"Title\">$title</Metadata>\n\t</Description>\n";
1404
1405	print PROD "-->\n";
1406
1407	# clean up the section html
1408	$sectiontext = $self->HB_clean_section($sectiontext);
1409
1410	print PROD "$sectiontext\n";
1411
1412	} else {
1413	print STDERR "WARNING - leftover text\n" , $self->shorten($html),
1414	"\nin $input_filename\n";
1415	last;
1416	}
1417	$firstsection = 0;
1418	}
1419
1420	print PROD "<!--\n";
1421	while ($curtoclevel > 0) {
1422	$curtoclevel--;
1423	print PROD "</Section>\n";
1424	}
1425	print PROD "-->\n";
1426
1427	close (PROD) \|\| die("Error Closing File: $tmp_filename $!");
1428
1429	return $tmp_filename;
1430	}
1431
1432	sub shorten {
1433	my $self = shift (@_);
1434	my ($text) = @_;
1435
1436	return "\"$text\"" if (length($text) < 100);
1437
1438	return "\"" . substr ($text, 0, 50) . "\" ... \"" .
1439	substr ($text, length($text)-50) . "\"";
1440	}
1441
1442	sub convert_tidy_or_oldHDL_file
1443	{
1444	my $self = shift (@_);
1445	my ($file) = @_;
1446	my $input_filename = $file;
1447
1448	if (-d $input_filename)
1449	{
1450	return $input_filename;
1451	}
1452
1453	# get the input filename
1454	my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
1455	my $base_dirname = $dirname;
1456	$suffix = lc($suffix);
1457
1458	# derive tmp filename from input filename
1459	# Remove any white space from filename -- no risk of name collision, and
1460	# makes later conversion by utils simpler. Leave spaces in path...
1461	# tidy up the filename with space, dot, hyphen between
1462	$tailname =~ s/\s+//g;
1463	$tailname =~ s/\.+//g;
1464	$tailname =~ s/\-+//g;
1465	# convert to utf-8 otherwise we have problems with the doc.xml file
1466	# later on
1467	&unicode::ensure_utf8(\$tailname);
1468
1469	# softlink to collection tmp dir
1470	my $tmp_dirname = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "tidytmp");
1471	&util::mk_dir($tmp_dirname) if (!-e $tmp_dirname);
1472
1473	my $test_dirname = "";
1474	my $f_separator = &util::get_os_dirsep();
1475
1476	if ($dirname =~ m/import$f_separator/)
1477	{
1478	$test_dirname = $'; #'
1479
1480	#print STDERR "init $'\n";
1481
1482	while ($test_dirname =~ m/[$f_separator]/)
1483	{
1484	my $folderdirname = $`;
1485	$tmp_dirname = &util::filename_cat($tmp_dirname,$folderdirname);
1486	&util::mk_dir($tmp_dirname) if (!-e $tmp_dirname);
1487	$test_dirname = $'; #'
1488	}
1489	}
1490
1491	my $tmp_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix");
1492
1493	# tidy or convert the input file if it is a HTML-like file or it is accepted by the process_exp
1494	if (($suffix eq ".htm") \|\| ($suffix eq ".html") \|\| ($suffix eq ".shtml"))
1495	{
1496	#convert the input file to a new style HDL
1497	my $hdl_output_filename = $input_filename;
1498	if ($self->{'old_style_HDL'})
1499	{
1500	$hdl_output_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix");
1501	$hdl_output_filename = $self->convert_to_newHDLformat($input_filename,$hdl_output_filename);
1502	}
1503
1504	#just for checking copy all other file from the base dir to tmp dir if it is not exists
1505	opendir(DIR,$base_dirname) or die "Can't open base directory : $base_dirname!";
1506	my @files = grep {!/^\.+$/} readdir(DIR);
1507	close(DIR);
1508
1509	foreach my $file (@files)
1510	{
1511	my $src_file = &util::filename_cat($base_dirname,$file);
1512	my $dest_file = &util::filename_cat($tmp_dirname,$file);
1513	if ((!-e $dest_file) && (!-d $src_file))
1514	{
1515	# just copy the original file back to the tmp directory
1516	copy($src_file,$dest_file) or die "Can't copy file $src_file to $dest_file $!";
1517	}
1518	}
1519
1520	# tidy the input file
1521	my $tidy_output_filename = $hdl_output_filename;
1522	if ($self->{'use_realistic_book'})
1523	{
1524	$tidy_output_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix");
1525	$tidy_output_filename = $self->tmp_tidy_file($hdl_output_filename,$tidy_output_filename);
1526	}
1527	$tmp_filename = $tidy_output_filename;
1528	}
1529	else
1530	{
1531	if (!-e $tmp_filename)
1532	{
1533	# just copy the original file back to the tmp directory
1534	copy($input_filename,$tmp_filename) or die "Can't copy file $input_filename to $tmp_filename $!";
1535	}
1536	}
1537
1538	return $tmp_filename;
1539	}
1540
1541
1542	# Will make the html input file as a proper XML file with removed font tag and
1543	# image size added to the img tag.
1544	# The tidying process takes place in a collection specific 'tmp' directory so
1545	# that we don't accidentally damage the input.
1546	sub tmp_tidy_file
1547	{
1548	my $self = shift (@_);
1549	my ($file,$cnfile) = @_;
1550	my $input_filename = $file;
1551	my $tmp_filename = $cnfile;
1552
1553	# get the input filename
1554	my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
1555
1556	require HTML::TokeParser::Simple;
1557
1558	# create HTML parser to decode the input file
1559	my $parser = HTML::TokeParser::Simple->new($input_filename);
1560
1561	# write HTML tmp file without the font tag and image size are added to the img tag
1562	open (PROD, ">$tmp_filename") \|\| die("Error Writing to File: $tmp_filename $!");
1563	while (my $token = $parser->get_token())
1564	{
1565	# is it an img tag
1566	if ($token->is_start_tag('img'))
1567	{
1568	# get the attributes
1569	my $attr = $token->return_attr;
1570
1571	# get the full path to the image
1572	my $img_file = &util::filename_cat($dirname,$attr->{src});
1573
1574	# set the width and height attribute
1575	($attr->{width}, $attr->{height}) = imgsize($img_file);
1576
1577	# recreate the tag
1578	print PROD "<img";
1579	print PROD map { qq { $_="$attr->{$_}"} } keys %$attr;
1580	print PROD ">";
1581	}
1582	# is it a font tag
1583	else
1584	{
1585	if (($token->is_start_tag('font')) \|\| ($token->is_end_tag('font')))
1586	{
1587	# remove font tag
1588	print PROD "";
1589	}
1590	else
1591	{
1592	# print without changes
1593	print PROD $token->as_is;
1594	}
1595	}
1596	}
1597	close (PROD) \|\| die("Error Closing File: $tmp_filename $!");
1598
1599	# run html-tidy on the tmp file to make it a proper XML file
1600
1601	my $outhandle = $self->{'outhandle'};
1602	print $outhandle "Converting HTML to be XML compliant:\n";
1603
1604	my $tidy_cmd = "tidy";
1605	$tidy_cmd .= " -q" if ($self->{'verbosity'} <= 2);
1606	$tidy_cmd .= " -raw -wrap 0 -asxml \"$tmp_filename\"";
1607	if ($self->{'verbosity'} <= 2) {
1608	if ($ENV{'GSDLOS'} =~ m/^windows/i) {
1609	$tidy_cmd .= " 2>nul";
1610	}
1611	else {
1612	$tidy_cmd .= " 2>/dev/null";
1613	}
1614	print $outhandle " => $tidy_cmd\n";
1615	}
1616
1617	my $tidyfile = `$tidy_cmd`;
1618
1619	# write result back to the tmp file
1620	open (PROD, ">$tmp_filename") \|\| die("Error Writing to File: $tmp_filename $!");
1621	print PROD $tidyfile;
1622	close (PROD) \|\| die("Error Closing File: $tmp_filename $!");
1623
1624	# return the output filename
1625	return $tmp_filename;
1626	}
1627
1628	sub associate_cover_image
1629	{
1630	my $self = shift(@_);
1631	my ($doc_obj, $filename) = @_;
1632	if (($self->{'use_realistic_book'}) \|\| ($self->{'old_style_HDL'}))
1633	{
1634	# we will have cover image in tidytmp, but want it from import
1635	$filename =~ s/([\\\/])tidytmp([\\\/])/$1import$2/;
1636	}
1637	$self->SUPER::associate_cover_image($doc_obj, $filename);
1638	}
1639
1640
1641	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: