Context Navigation

source: main/trunk/greenstone2/perllib/plugins/HTMLPlugin.pm@ 22355

Last change on this file since 22355 was 22355, checked in by kjdon, 14 years ago
previously, when use_realistic_book was set, all files listed in archivesinf dbs were in tidytmp - change paths to be import paths. This assumes that same directory structure is kept in tidytmp, which appears to be the case. No tidytmp paths should now appear in archiveinf dbs
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 53.9 KB

Line
1	###########################################################################
2	#
3	# HTMLPlugin.pm -- basic html plugin
4	#
5	# A component of the Greenstone digital library software
6	# from the New Zealand Digital Library Project at the
7	# University of Waikato, New Zealand.
8	#
9	# Copyright (C) 1999 New Zealand Digital Library Project
10	#
11	# This program is free software; you can redistribute it and/or modify
12	# it under the terms of the GNU General Public License as published by
13	# the Free Software Foundation; either version 2 of the License, or
14	# (at your option) any later version.
15	#
16	# This program is distributed in the hope that it will be useful,
17	# but WITHOUT ANY WARRANTY; without even the implied warranty of
18	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	# GNU General Public License for more details.
20	#
21	# You should have received a copy of the GNU General Public License
22	# along with this program; if not, write to the Free Software
23	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24	#
25	###########################################################################
26
27	#
28	# Note that this plugin handles frames only in a very simple way
29	# i.e. each frame is treated as a separate document. This means
30	# search results will contain links to individual frames rather
31	# than linking to the top level frameset.
32	# There may also be some problems caused by the _parent target
33	# (it's removed by this plugin)
34	#
35
36	package HTMLPlugin;
37
38	use ReadTextFile;
39	use HBPlugin;
40	use ghtml;
41	use unicode;
42	use util;
43	use XMLParser;
44
45	use Image::Size;
46	use File::Copy;
47
48	sub BEGIN {
49	@HTMLPlugin::ISA = ('ReadTextFile', 'HBPlugin');
50	}
51
52	use strict; # every perl program should have this!
53	no strict 'refs'; # make an exception so we can use variables as filehandles
54
55	my $arguments =
56	[ { 'name' => "process_exp",
57	'desc' => "{BasePlugin.process_exp}",
58	'type' => "regexp",
59	'deft' => &get_default_process_exp() },
60	{ 'name' => "block_exp",
61	'desc' => "{BasePlugin.block_exp}",
62	'type' => 'regexp',
63	'deft' => &get_default_block_exp() },
64	{ 'name' => "nolinks",
65	'desc' => "{HTMLPlugin.nolinks}",
66	'type' => "flag" },
67	{ 'name' => "keep_head",
68	'desc' => "{HTMLPlugin.keep_head}",
69	'type' => "flag" },
70	{ 'name' => "no_metadata",
71	'desc' => "{HTMLPlugin.no_metadata}",
72	'type' => "flag" },
73	{ 'name' => "metadata_fields",
74	'desc' => "{HTMLPlugin.metadata_fields}",
75	'type' => "string",
76	'deft' => "Title" },
77	{ 'name' => "metadata_field_separator",
78	'desc' => "{HTMLPlugin.metadata_field_separator}",
79	'type' => "string",
80	'deft' => "" },
81	{ 'name' => "hunt_creator_metadata",
82	'desc' => "{HTMLPlugin.hunt_creator_metadata}",
83	'type' => "flag" },
84	{ 'name' => "file_is_url",
85	'desc' => "{HTMLPlugin.file_is_url}",
86	'type' => "flag" },
87	{ 'name' => "assoc_files",
88	'desc' => "{HTMLPlugin.assoc_files}",
89	'type' => "regexp",
90	'deft' => &get_default_block_exp() },
91	{ 'name' => "rename_assoc_files",
92	'desc' => "{HTMLPlugin.rename_assoc_files}",
93	'type' => "flag" },
94	{ 'name' => "title_sub",
95	'desc' => "{HTMLPlugin.title_sub}",
96	'type' => "string",
97	'deft' => "" },
98	{ 'name' => "description_tags",
99	'desc' => "{HTMLPlugin.description_tags}",
100	'type' => "flag" },
101	# retain this for backward compatibility (w3mir option was replaced by
102	# file_is_url)
103	{ 'name' => "w3mir",
104	# 'desc' => "{HTMLPlugin.w3mir}",
105	'type' => "flag",
106	'hiddengli' => "yes"},
107	{ 'name' => "no_strip_metadata_html",
108	'desc' => "{HTMLPlugin.no_strip_metadata_html}",
109	'type' => "string",
110	'deft' => "",
111	'reqd' => "no"},
112	{ 'name' => "sectionalise_using_h_tags",
113	'desc' => "{HTMLPlugin.sectionalise_using_h_tags}",
114	'type' => "flag" },
115	{ 'name' => "use_realistic_book",
116	'desc' => "{HTMLPlugin.tidy_html}",
117	'type' => "flag"},
118	{ 'name' => "old_style_HDL",
119	'desc' => "{HTMLPlugin.old_style_HDL}",
120	'type' => "flag"},
121	{'name' => "processing_tmp_files",
122	'desc' => "{BasePlugin.processing_tmp_files}",
123	'type' => "flag",
124	'hiddengli' => "yes"}
125	];
126
127	my $options = { 'name' => "HTMLPlugin",
128	'desc' => "{HTMLPlugin.desc}",
129	'abstract' => "no",
130	'inherits' => "yes",
131	'args' => $arguments };
132
133
134	sub new {
135	my ($class) = shift (@_);
136	my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
137	push(@$pluginlist, $class);
138
139	push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
140	push(@{$hashArgOptLists->{"OptList"}},$options);
141
142
143	my $self = new ReadTextFile($pluginlist,$inputargs,$hashArgOptLists);
144
145	if ($self->{'w3mir'}) {
146	$self->{'file_is_url'} = 1;
147	}
148	$self->{'aux_files'} = {};
149	$self->{'dir_num'} = 0;
150	$self->{'file_num'} = 0;
151
152	return bless $self, $class;
153	}
154
155	# may want to use (?i)\.(gif\|jpe?g\|jpe\|png\|css\|js(?:@.*)?)$
156	# if have eg <script language="javascript" src="img/lib.js@123">
157	# blocking is now done by reading through the file and recording all the
158	# images and other files
159	sub get_default_block_exp {
160	my $self = shift (@_);
161
162	#return q^(?i)\.(gif\|jpe?g\|jpe\|jpg\|png\|css)$^;
163	return "";
164	}
165
166	sub get_default_process_exp {
167	my $self = shift (@_);
168
169	# the last option is an attempt to encode the concept of an html query ...
170	return q^(?i)(\.html?\|\.shtml\|\.shm\|\.asp\|\.php\d?\|\.cgi\|.+\?.+=.*)$^;
171	}
172
173	sub store_block_files
174	{
175	my $self =shift (@_);
176	my ($filename_full_path, $block_hash) = @_;
177
178	my $html_fname = $filename_full_path;
179	my @file_blocks;
180
181	my ($language, $encoding) = $self->textcat_get_language_encoding ($filename_full_path);
182
183	# read in file ($text will be in utf8)
184	my $raw_text = "";
185	$self->read_file_no_decoding ($filename_full_path, \$raw_text);
186
187	my $textref = \$raw_text;
188	my $opencom = '(?:<!--\|<!(?:—\|\|--))';
189	my $closecom = '(?:-->\|(?:—\|\|--)>)';
190	$$textref =~ s/$opencom(.*?)$closecom//gs;
191
192	my $attval = "\\\"[^\\\"]+\\\"\|[^\\s>]+";
193	my @img_matches = ($$textref =~ m/<img[^>]?src\s=\s($attval)[^>]>/igs);
194	my @usemap_matches = ($$textref =~ m/<img[^>]?usemap\s=\s($attval)[^>]>/igs);
195	my @link_matches = ($$textref =~ m/<link[^>]?href\s=\s($attval)[^>]>/igs);
196	my @embed_matches = ($$textref =~ m/<embed[^>]?src\s=\s($attval)[^>]>/igs);
197	my @tabbg_matches = ($$textref =~ m/<(?:body\|table\|tr\|td)[^>]?background\s=\s($attval)[^>]>/igs);
198	my @script_matches = ($$textref =~ m/<script[^>]?src\s=\s($attval)[^>]>/igs);
199
200	if(!defined $self->{'utf8_to_original_filename'}) {
201	# maps from utf8 converted link name -> original filename referrred to by (possibly URL-encoded) src url
202	$self->{'utf8_to_original_filename'} = {};
203	}
204
205	foreach my $link (@img_matches, @usemap_matches, @link_matches, @embed_matches, @tabbg_matches, @script_matches) {
206
207	# remove quotes from link at start and end if necessary
208	if ($link=~/^\"/) {
209	$link=~s/^\"//;
210	$link=~s/\"$//;
211	}
212
213	$link =~ s/\#.*$//s; # remove any anchor names, e.g. foo.html#name becomes foo.html
214	# some links may just be anchor names
215	next unless ($link =~ /\S+/);
216
217	if ($link !~ m@^/@ && $link !~ m/^([A-Z]:?)\\/) {
218	# Turn relative file path into full path
219	my $dirname = &File::Basename::dirname($filename_full_path);
220	$link = &util::filename_cat($dirname, $link);
221	}
222	$link = $self->eval_dir_dots($link);
223
224	# this is the actual filename on the filesystem (that the link refers to)
225	my $url_original_filename = $self->opt_url_decode($link);
226
227	# Convert the url_original_filename into its utf8 version. Store the utf8 link along with the url_original_filename
228	my $utf8_link = "";
229	$self->decode_text($link,$encoding,$language,\$utf8_link);
230
231	$self->{'utf8_to_original_filename'}->{$utf8_link} = $url_original_filename;
232	# print STDERR "**** utf8_encoded_link to original src filename:\n\t$utf8_link\n\t".$self->{'utf8_to_original_filename'}->{$utf8_link}."\n";
233
234	if ($url_original_filename ne $utf8_link) {
235	my $outhandle = $self->{'outhandle'};
236
237	print $outhandle "URL Encoding $url_original_filename\n";
238	print $outhandle " ->$utf8_link\n";
239	}
240
241	$block_hash->{'file_blocks'}->{$url_original_filename} = 1;
242	}
243	}
244
245	# Given a filename in any encoding, will URL decode it to get back the original filename
246	# in the original encoding. Because this method is intended to work out the original
247	# filename, it does not URL decode any filename if a file by the name of the URL-encoded*
248	# string already exists in the local folder.
249	# Return the original filename corresponding to the parameter URL-encoded filename, and
250	# a decoded flag that is set to true iff URL-decoding had to be applied.
251	sub opt_url_decode {
252	my $self = shift (@_);
253	my ($link) = @_;
254
255	# Replace %XX's in URL with decoded value if required.
256	# Note that the filename may include the %XX in some situations
257	if ($link =~ m/\%[A-F0-9]{2}/i) {
258	if (!-e $link) {
259	$link = &unicode::url_decode($link);
260	}
261	}
262
263	return $link;
264	}
265
266	sub read_into_doc_obj
267	{
268	my $self = shift (@_);
269	my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
270
271	my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
272	# get the input file
273	my $input_filename = $file;
274	my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
275	$suffix = lc($suffix);
276	my $tidy_filename;
277	if (($self->{'use_realistic_book'}) \|\| ($self->{'old_style_HDL'}))
278	{
279	# because the document has to be sectionalized set the description tags
280	$self->{'description_tags'} = 1;
281
282	# set the file to be tidied
283	$input_filename = &util::filename_cat($base_dir,$file) if $base_dir =~ m/\w/;
284
285	# get the tidied file
286	#my $tidy_filename = $self->tmp_tidy_file($input_filename);
287	$tidy_filename = $self->convert_tidy_or_oldHDL_file($input_filename);
288
289	# derive tmp filename from input filename
290	my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($tidy_filename, "\\.[^\\.]+\$");
291
292	# set the new input file and base_dir to be from the tidied file
293	$file = "$tailname$suffix";
294	$base_dir = $dirname;
295	}
296
297	# call the parent read_into_doc_obj
298	my ($process_status,$doc_obj) = $self->SUPER::read_into_doc_obj($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli);
299	if (($self->{'use_realistic_book'}) \|\| ($self->{'old_style_HDL'}))
300	{
301	# now we need to reset the filenames in the doc obj so that the converted filenames are not used
302	my $collect_file = &util::filename_within_collection($filename_full_path);
303	$doc_obj->set_source_filename ($collect_file, $self->{'file_rename_method'});
304	## set_source_filename does not set the doc_obj source_path which is used in archives dbs for incremental
305	# build. so set it manually.
306	$doc_obj->{'source_path'} = $filename_full_path;
307	my $collect_conv_file = &util::filename_within_collection($tidy_filename);
308	$doc_obj->set_converted_filename($collect_conv_file);
309	$self->set_Source_metadata($doc_obj, $filename_no_path);
310	}
311	return ($process_status,$doc_obj);
312	}
313
314	# do plugin specific processing of doc_obj
315	sub process {
316	my $self = shift (@_);
317	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
318	my $outhandle = $self->{'outhandle'};
319
320	if ($ENV{'GSDLOS'} =~ m/^windows/i) {
321	# this makes life so much easier... perl can cope with unix-style '/'s.
322	$base_dir =~ s@(\\)+@/@g;
323	$file =~ s@(\\)+@/@g;
324	}
325
326	# reset per-doc stuff...
327	$self->{'aux_files'} = {};
328	$self->{'dir_num'} = 0;
329	$self->{'file_num'} = 0;
330
331	# process an HTML file where sections are divided by headings tags (H1, H2 ...)
332	# you can also include metadata in the format (X can be any number)
333	# <hX>Title<!--gsdl-metadata
334	# <Metadata name="name1">value1</Metadata>
335	# ...
336	# <Metadata name="nameN">valueN</Metadata>
337	#--></hX>
338	if ($self->{'sectionalise_using_h_tags'}) {
339	# description_tags should allways be activated because we convert headings to description tags
340	$self->{'description_tags'} = 1;
341
342	my $arrSections = [];
343	$$textref =~ s/<h([0-9]+)[^>]>(.?)<\/h[0-9]+>/$self->process_heading($1, $2, $arrSections, $file)/isge;
344
345	if (scalar(@$arrSections)) {
346	my $strMetadata = $self->update_section_data($arrSections, -1);
347	if (length($strMetadata)) {
348	$strMetadata = '<!--' . $strMetadata . "\n-->\n</body>";
349	$$textref =~ s/<\/body>/$strMetadata/ig;
350	}
351	}
352	}
353
354	my $cursection = $doc_obj->get_top_section();
355
356	$self->extract_metadata ($textref, $metadata, $doc_obj, $cursection)
357	unless $self->{'no_metadata'} \|\| $self->{'description_tags'};
358
359	# Store URL for page as metadata - this can be used for an
360	# altavista style search interface. The URL won't be valid
361	# unless the file structure contains the domain name (i.e.
362	# like when w3mir is used to download a website).
363
364	# URL metadata (even invalid ones) are used to support internal
365	# links, so even if 'file_is_url' is off, still need to store info
366
367	my ($tailname,$dirname,$suffix) = &File::Basename::fileparse($file, "\\.[^\\.]+\$");
368	my $utf8_file = $self->filename_to_utf8_metadata($file);
369	$utf8_file =~ s/&\#095;/_/g;
370	my $web_url = "http://";
371	if(defined $dirname) { # local directory
372	$dirname = $self->eval_dir_dots($dirname);
373	$dirname .= &util::get_dirsep() if $dirname ne ""; # if there's a directory, it should end on "/"
374	$web_url = $web_url.$dirname.$utf8_file;
375	} else {
376	$web_url = $web_url.$utf8_file;
377	}
378	$web_url =~ s/\\/\//g;
379	$doc_obj->add_utf8_metadata($cursection, "URL", $web_url);
380
381	if ($self->{'file_is_url'}) {
382	$doc_obj->add_metadata($cursection, "weblink", "<a href=\"$web_url\">");
383	$doc_obj->add_metadata($cursection, "webicon", "_iconworld_");
384	$doc_obj->add_metadata($cursection, "/weblink", "</a>");
385	}
386
387	if ($self->{'description_tags'}) {
388	# remove the html header - note that doing this here means any
389	# sections defined within the header will be lost (so all <Section>
390	# tags must appear within the body of the HTML)
391	my ($head_keep) = ($$textref =~ m/^(.?)<body[^>]>/is);
392
393	$$textref =~ s/^.?<body[^>]>//is;
394	$$textref =~ s/(<\/body[^>]>\|<\/html[^>]>)//isg;
395
396	my $opencom = '(?:<!--\|<!(?:—\|\|--))';
397	my $closecom = '(?:-->\|(?:—\|\|--)>)';
398
399	my $lt = '(?:<\|<)';
400	my $gt = '(?:>\|>)';
401	my $quot = '(?:"\|"\|”\|“)';
402
403	my $dont_strip = '';
404	if ($self->{'no_strip_metadata_html'}) {
405	($dont_strip = $self->{'no_strip_metadata_html'}) =~ s{,}{\|}g;
406	}
407
408	my $found_something = 0; my $top = 1;
409	while ($$textref =~ s/^(.?)$opencom(.?)$closecom//s) {
410	my $text = $1;
411	my $comment = $2;
412	if (defined $text) {
413	# text before a comment - note that getting to here
414	# doesn't necessarily mean there are Section tags in
415	# the document
416	$self->process_section(\$text, $base_dir, $file, $doc_obj, $cursection);
417	}
418	while ($comment =~ s/$lt(.*?)$gt//s) {
419	my $tag = $1;
420	if ($tag eq "Section") {
421	$found_something = 1;
422	$cursection = $doc_obj->insert_section($doc_obj->get_end_child($cursection)) unless $top;
423	$top = 0;
424	} elsif ($tag eq "/Section") {
425	$found_something = 1;
426	$cursection = $doc_obj->get_parent_section ($cursection);
427	} elsif ($tag =~ m/^Metadata name=$quot(.*?)$quot/s) {
428	my $metaname = $1;
429	my $accumulate = $tag =~ m/mode=${quot}accumulate${quot}/ ? 1 : 0;
430	$comment =~ s/^(.*?)$lt\/Metadata$gt//s;
431	my $metavalue = $1;
432	$metavalue =~ s/^\s+//;
433	$metavalue =~ s/\s+$//;
434	# assume that no metadata value intentionally includes
435	# carriage returns or HTML tags (if they're there they
436	# were probably introduced when converting to HTML from
437	# some other format).
438	# actually some people want to have html tags in their
439	# metadata.
440	$metavalue =~ s/[\cJ\cM]/ /sg;
441	$metavalue =~ s/<[^>]+>//sg
442	unless $dont_strip && ($dont_strip eq 'all' \|\| $metaname =~ m/^($dont_strip)$/);
443	$metavalue =~ s/\s+/ /sg;
444	if ($metaname =~ /\./) { # has a namespace
445	$metaname = "ex.$metaname";
446	}
447	if ($accumulate) {
448	$doc_obj->add_utf8_metadata($cursection, $metaname, $metavalue);
449	} else {
450	$doc_obj->set_utf8_metadata_element($cursection, $metaname, $metavalue);
451	}
452	} elsif ($tag eq "Description" \|\| $tag eq "/Description") {
453	# do nothing with containing Description tags
454	} else {
455	# simple HTML tag (probably created by the conversion
456	# to HTML from some other format) - we'll ignore it and
457	# hope for the best ;-)
458	}
459	}
460	}
461	if ($cursection ne "") {
462	print $outhandle "HTMLPlugin: WARNING: $file contains unmatched <Section></Section> tags\n";
463	}
464
465	$$textref =~ s/^.?<body[^>]>//is;
466	$$textref =~ s/(<\/body[^>]>\|<\/html[^>]>)//isg;
467	if ($$textref =~ m/\S/) {
468	if (!$found_something) {
469	if ($self->{'verbosity'} > 2) {
470	print $outhandle "HTMLPlugin: WARNING: $file appears to contain no Section tags so\n";
471	print $outhandle " will be processed as a single section document\n";
472	}
473
474	# go ahead and process single-section document
475	$self->process_section($textref, $base_dir, $file, $doc_obj, $cursection);
476
477	# if document contains no Section tags we'll go ahead
478	# and extract metadata (this won't have been done
479	# above as the -description_tags option prevents it)
480	my $complete_text = $head_keep.$doc_obj->get_text($cursection);
481	$self->extract_metadata (\$complete_text, $metadata, $doc_obj, $cursection)
482	unless $self->{'no_metadata'};
483
484	} else {
485	print $outhandle "HTMLPlugin: WARNING: $file contains the following text outside\n";
486	print $outhandle " of the final closing </Section> tag. This text will\n";
487	print $outhandle " be ignored.";
488
489	my ($text);
490	if (length($$textref) > 30) {
491	$text = substr($$textref, 0, 30) . "...";
492	} else {
493	$text = $$textref;
494	}
495	$text =~ s/\n/ /isg;
496	print $outhandle " ($text)\n";
497	}
498	} elsif (!$found_something) {
499
500	if ($self->{'verbosity'} > 2) {
501	# may get to here if document contained no valid Section
502	# tags but did contain some comments. The text will have
503	# been processed already but we should print the warning
504	# as above and extract metadata
505	print $outhandle "HTMLPlugin: WARNING: $file appears to contain no Section tags and\n";
506	print $outhandle " is blank or empty. Metadata will be assigned if present.\n";
507	}
508
509	my $complete_text = $head_keep.$doc_obj->get_text($cursection);
510	$self->extract_metadata (\$complete_text, $metadata, $doc_obj, $cursection)
511	unless $self->{'no_metadata'};
512	}
513
514	} else {
515
516	# remove header and footer
517	if (!$self->{'keep_head'} \|\| $self->{'description_tags'}) {
518	$$textref =~ s/^.?<body[^>]>//is;
519	$$textref =~ s/(<\/body[^>]>\|<\/html[^>]>)//isg;
520	}
521
522	# single section document
523	$self->process_section($textref, $base_dir, $file, $doc_obj, $cursection);
524	}
525	return 1;
526	}
527
528
529	sub process_heading
530	{
531	my ($self, $nHeadNo, $strHeadingText, $arrSections, $file) = @_;
532	$strHeadingText = '' if (!defined($strHeadingText));
533
534	my $strMetadata = $self->update_section_data($arrSections, int($nHeadNo));
535
536	my $strSecMetadata = '';
537	while ($strHeadingText =~ s/<!--gsdl-metadata(.*?)-->//is)
538	{
539	$strSecMetadata .= $1;
540	}
541
542	$strHeadingText =~ s/^\s+//g;
543	$strHeadingText =~ s/\s+$//g;
544	$strSecMetadata =~ s/^\s+//g;
545	$strSecMetadata =~ s/\s+$//g;
546
547	$strMetadata .= "\n<Section>\n\t<Description>\n\t\t<Metadata name=\"Title\">" . $strHeadingText . "</Metadata>\n";
548
549	if (length($strSecMetadata)) {
550	$strMetadata .= "\t\t" . $strSecMetadata . "\n";
551	}
552
553	$strMetadata .= "\t</Description>\n";
554
555	return "<!--" . $strMetadata . "-->";
556	}
557
558
559	sub update_section_data
560	{
561	my ($self, $arrSections, $nCurTocNo) = @_;
562	my ($strBuffer, $nLast, $nSections) = ('', 0, scalar(@$arrSections));
563
564	if ($nSections == 0) {
565	push @$arrSections, $nCurTocNo;
566	return $strBuffer;
567	}
568	$nLast = $arrSections->[$nSections - 1];
569	if ($nCurTocNo > $nLast) {
570	push @$arrSections, $nCurTocNo;
571	return $strBuffer;
572	}
573	for(my $i = $nSections - 1; $i >= 0; $i--) {
574	if ($nCurTocNo <= $arrSections->[$i]) {
575	$strBuffer .= "\n</Section>";
576	pop @$arrSections;
577	}
578	}
579	push @$arrSections, $nCurTocNo;
580	return $strBuffer;
581	}
582
583
584	# note that process_section may be called multiple times for a single
585	# section (relying on the fact that add_utf8_text appends the text to any
586	# that may exist already).
587	sub process_section {
588	my $self = shift (@_);
589	my ($textref, $base_dir, $file, $doc_obj, $cursection) = @_;
590	# trap links
591	if (!$self->{'nolinks'}) {
592	# usemap="./#index" not handled correctly => change to "#index"
593	## $$textref =~ s/(<img[^>]?usemap\s=\s[\"\']?)([^\"\'>\s]+)([\"\']?[^>]>)/
594
595	$$textref =~ s/(<img[^>]?usemap\s=\s)((?:[\"][^\"]+[\"])\|(?:[\'][^\']+[\'])\|(?:[^\s\/>]+))([^>]>)/
596	$self->replace_usemap_links($1, $2, $3)/isge;
597
598	## $$textref =~ s/(<(?:a\|area\|frame\|link\|script)\s+[^>]?\s(?:href\|src)\s=\s[\"\']?)([^\"\'>\s]+)([\"\']?[^>]*>)/
599
600	$$textref =~ s/(<(?:a\|area\|frame\|link\|script)\s+[^>]?\s(?:href\|src)\s=\s)((?:[\"][^\"]+[\"])\|(?:[\'][^\']+[\'])\|(?:[^\s\/>]+))([^>]*>)/
601	$self->replace_href_links ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
602	}
603
604	# trap images
605
606	# Previously, by default, HTMLPlugin would embed <img> tags inside anchor tags
607	# i.e. <a href="image><img src="image"></a> in order to overcome a problem that
608	# turned regular text succeeding images into links. That is, by embedding <imgs>
609	# inside <a href=""></a>, the text following images were no longer misbehaving.
610	# However, there would be many occasions whereby images were not meant to link
611	# to their source images but where the images would link to another web page.
612	# To allow this, the no_image_links option was introduced: it would prevent
613	# the behaviour of embedding images into links that referenced the source images.
614
615	# Somewhere along the line, the problem of normal text turning into links when
616	# such text followed images which were not embedded in <a href=""></a> ceased
617	# to occur. This is why the following lines have been commented out (as well as
618	# two lines in replace_images). They appear to no longer apply.
619
620	# If at any time, there is a need for having images embedded in <a> anchor tags,
621	# then it might be better to turn that into an HTMLPlugin option rather than make
622	# it the default behaviour. Also, eventually, no_image_links needs to become
623	# a deprecated option for HTMLPlugin as it has now become the default behaviour.
624
625	#if(!$self->{'no_image_links'}){
626	$$textref =~ s/(<(?:img\|embed\|table\|tr\|td)[^>]?(?:src\|background)\s=\s)((?:[\"][^\"]+[\"])\|(?:[\'][^\']+[\'])\|(?:[^\s\/>]+))([^>]>)/
627	$self->replace_images ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
628	#}
629
630	# add text to document object
631	# turn \ into \\ so that the rest of greenstone doesn't think there
632	# is an escape code following. (Macro parsing loses them...)
633	$$textref =~ s/\\/\\\\/go;
634
635	$doc_obj->add_utf8_text($cursection, $$textref);
636	}
637
638	sub replace_images {
639	my $self = shift (@_);
640	my ($front, $link, $back, $base_dir,
641	$file, $doc_obj, $section) = @_;
642
643	# remove quotes from link at start and end if necessary
644	if ($link=~/^[\"\']/) {
645	$link=~s/^[\"\']//;
646	$link=~s/[\"\']$//;
647	$front.='"';
648	$back="\"$back";
649	}
650
651	$link =~ s/\n/ /g;
652
653	# Hack to overcome Windows wv 0.7.1 bug that causes embedded images to be broken
654	# If the Word file path has spaces in it, wv messes up and you end up with
655	# absolute paths for the images, and without the "file://" prefix
656	# So check for this special case and massage the data to be correct
657	if ($ENV{'GSDLOS'} =~ m/^windows/i && $self->{'plugin_type'} eq "WordPlug" && $link =~ m/^[A-Za-z]\:\\/) {
658	$link =~ s/^.*\\([^\\]+)$/$1/;
659	}
660
661	my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
662
663	my $img_file = $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section);
664
665	# print STDERR "** link = $link\n href = $href\n** img_file = $img_file, rl = $rl\n";
666
667	my $anchor_name = $img_file;
668	#$anchor_name =~ s/^.*\///;
669	#$anchor_name = "<a name=\"$anchor_name\" ></a>";
670
671	my $image_link = $front . $img_file .$back;
672	return $image_link;
673
674	# The reasons for why the following two lines are no longer necessary can be
675	# found in subroutine process_section
676	#my $anchor_link = "<a href=\"$img_file\" >".$image_link."</a>";
677	#return $anchor_link;
678
679	#return $front . $img_file . $back . $anchor_name;
680	}
681
682	sub replace_href_links {
683	my $self = shift (@_);
684	my ($front, $link, $back, $base_dir, $file, $doc_obj, $section) = @_;
685
686	# remove quotes from link at start and end if necessary
687	if ($link=~/^[\"\']/) {
688	$link=~s/^[\"\']//;
689	$link=~s/[\"\']$//;
690	$front.='"';
691	$back="\"$back";
692	}
693
694	# attempt to sort out targets - frames are not handled
695	# well in this plugin and some cases will screw things
696	# up - e.g. the _parent target (so we'll just remove
697	# them all ;-)
698	$front =~ s/(target=\"?)_top(\"?)/$1_gsdltop_$2/is;
699	$back =~ s/(target=\"?)_top(\"?)/$1_gsdltop_$2/is;
700	$front =~ s/target=\"?_parent\"?//is;
701	$back =~ s/target=\"?_parent\"?//is;
702
703	return $front . $link . $back if $link =~ m/^\#/s;
704	$link =~ s/\n/ /g;
705
706	# Find file referred to by $link on file system
707	# This is more complicated than it sounds when char encodings
708	# is taken in to account
709	my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
710
711	# href may use '\'s where '/'s should be on Windows
712	$href =~ s/\\/\//g;
713	my ($filename) = $href =~ m/^(?:.?):(?:\/\/)?(.)/;
714
715
716	##### leave all these links alone (they won't be picked up by intermediate
717	##### pages). I think that's safest when dealing with frames, targets etc.
718	##### (at least until I think of a better way to do it). Problems occur with
719	##### mailto links from within small frames, the intermediate page is displayed
720	##### within that frame and can't be seen. There is still potential for this to
721	##### happen even with html pages - the solution seems to be to somehow tell
722	##### the browser from the server side to display the page being sent (i.e.
723	##### the intermediate page) in the top level window - I'm not sure if that's
724	##### possible - the following line should probably be deleted if that can be done
725	return $front . $link . $back if $href =~ m/^(mailto\|news\|gopher\|nntp\|telnet\|javascript):/is;
726
727
728	if (($rl == 0) \|\| ($filename =~ m/$self->{'process_exp'}/) \|\|
729	($href =~ m/\/$/) \|\| ($href =~ m/^(mailto\|news\|gopher\|nntp\|telnet\|javascript):/i)) {
730	&ghtml::urlsafe ($href);
731	return $front . "_httpextlink_&rl=" . $rl . "&href=" . $href . $hash_part . $back;
732	} else {
733	# link is to some other type of file (eg image) so we'll
734	# need to associate that file
735	return $front . $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section) . $back;
736	}
737	}
738
739	sub add_file {
740	my $self = shift (@_);
741	my ($href, $rl, $hash_part, $base_dir, $doc_obj, $section) = @_;
742	my ($newname);
743
744	my $filename = $href;
745	if ($base_dir eq "") {
746	# remove http:/ thereby leaving one slash at the start
747	$filename =~ s/^[^:]*:\///;
748	}
749	else {
750	# remove http://
751	$filename =~ s/^[^:]*:\/\///;
752	}
753
754	$filename = &util::filename_cat($base_dir, $filename);
755	if (($self->{'use_realistic_book'}) \|\| ($self->{'old_style_HDL'})) {
756	# we are processing a tidytmp file - want paths to be in import
757	$filename =~ s/([\\\/])tidytmp([\\\/])/$1import$2/;
758	}
759	# Replace %XX's in URL with decoded value if required. Note that the filename may include the %XX in some
760	# situations. If the original file's name was in URL encoding, the following method will not decode it.
761	my $utf8_filename = $filename;
762	$filename = $self->opt_url_decode($utf8_filename);
763
764	# some special processing if the intended filename was converted to utf8, but
765	# the actual file still needs to be renamed
766	if (!-e $filename) {
767	# try the original filename stored in map
768	my $original_filename = $self->{'utf8_to_original_filename'}->{$filename};
769	if (defined $original_filename && -e $original_filename) {
770	$filename = $original_filename;
771	}
772	}
773
774	my ($ext) = $filename =~ m/(\.[^\.]*)$/;
775
776	if ($rl == 0) {
777	if ((!defined $ext) \|\| ($ext !~ m/$self->{'assoc_files'}/)) {
778	return "_httpextlink_&rl=0&el=prompt&href=" . $href . $hash_part;
779	}
780	else {
781	return "_httpextlink_&rl=0&el=direct&href=" . $href . $hash_part;
782	}
783	}
784
785	if ((!defined $ext) \|\| ($ext !~ m/$self->{'assoc_files'}/)) {
786	return "_httpextlink_&rl=" . $rl . "&href=" . $href . $hash_part;
787	}
788	# add the original image file as a source file
789	if (!$self->{'processing_tmp_files'} ) {
790	$doc_obj->associate_source_file($filename);
791	}
792	if ($self->{'rename_assoc_files'}) {
793	if (defined $self->{'aux_files'}->{$href}) {
794	$newname = $self->{'aux_files'}->{$href}->{'dir_num'} . "/" .
795	$self->{'aux_files'}->{$href}->{'file_num'} . $ext;
796	} else {
797	$newname = $self->{'dir_num'} . "/" . $self->{'file_num'} . $ext;
798	$self->{'aux_files'}->{$href} = {'dir_num' => $self->{'dir_num'}, 'file_num' => $self->{'file_num'}};
799	$self->inc_filecount ();
800	}
801	$doc_obj->associate_file($filename, $newname, undef, $section);
802	return "_httpdocimg_/$newname";
803	} else {
804	if(&unicode::is_url_encoded($utf8_filename)) {
805	# use the possibly-decoded filename instead to avoid double URL encoding
806	($newname) = $filename =~ m/([^\/\\]*)$/;
807	} else {
808	($newname) = $utf8_filename =~ m/([^\/\\]*)$/;
809	}
810
811	# Make sure this name uses only ASCII characters.
812	# We use either base64 or URL encoding, as these preserve original encoding
813	$newname = &util::rename_file($newname, $self->{'file_rename_method'});
814
815	$doc_obj->associate_file($filename, $newname, undef, $section);
816
817	# Since the generated image will be URL-encoded to avoid file-system/browser mess-ups
818	# of filenames, URL-encode the additional percent signs of the URL-encoded filename
819	my $newname_url = $newname;
820	$newname_url = &unicode::filename_to_url($newname_url);
821	return "_httpdocimg_/$newname_url";
822	}
823	}
824
825
826	sub format_link {
827	my $self = shift (@_);
828	my ($link, $base_dir, $file) = @_;
829
830	my ($before_hash, $hash_part) = $link =~ m/^([^\#])(\#?.)$/;
831
832	$hash_part = "" if !defined $hash_part;
833	if (!defined $before_hash \|\| $before_hash !~ m/[\w\.\/]/) {
834	my $outhandle = $self->{'outhandle'};
835	print $outhandle "HTMLPlugin: ERROR - badly formatted tag ignored ($link)\n"
836	if $self->{'verbosity'};
837	return ($link, "", 0);
838	}
839
840	if ($before_hash =~ s@^((?:http\|https\|ftp\|file\|mms)://)@@i) {
841	my $type = $1;
842
843	if ($link =~ m/^(http\|ftp):/i) {
844	# Turn url (using /) into file name (possibly using \ on windows)
845	my @http_dir_split = split('/', $before_hash);
846	$before_hash = &util::filename_cat(@http_dir_split);
847	}
848
849	$before_hash = $self->eval_dir_dots($before_hash);
850
851	my $linkfilename = &util::filename_cat ($base_dir, $before_hash);
852
853	my $rl = 0;
854	$rl = 1 if (-e $linkfilename);
855
856	# make sure there's a slash on the end if it's a directory
857	if ($before_hash !~ m/\/$/) {
858	$before_hash .= "/" if (-d $linkfilename);
859	}
860	return ($type . $before_hash, $hash_part, $rl);
861
862	} elsif ($link !~ m/^(mailto\|news\|gopher\|nntp\|telnet\|javascript):/i && $link !~ m/^\//) {
863
864	if ($before_hash =~ s@^/@@ \|\| $before_hash =~ m/\\/) {
865
866	# the first directory will be the domain name if file_is_url
867	# to generate archives, otherwise we'll assume all files are
868	# from the same site and base_dir is the root
869
870	if ($self->{'file_is_url'}) {
871	my @dirs = split /[\/\\]/, $file;
872	my $domname = shift (@dirs);
873	$before_hash = &util::filename_cat($domname, $before_hash);
874	$before_hash =~ s@\\@/@g; # for windows
875	}
876	else
877	{
878	# see if link shares directory with source document
879	# => turn into relative link if this is so!
880
881	if ($ENV{'GSDLOS'} =~ m/^windows/i) {
882	# too difficult doing a pattern match with embedded '\'s...
883	my $win_before_hash=$before_hash;
884	$win_before_hash =~ s@(\\)+@/@g;
885	# $base_dir is already similarly "converted" on windows.
886	if ($win_before_hash =~ s@^$base_dir/@@o) {
887	# if this is true, we removed a prefix
888	$before_hash=$win_before_hash;
889	}
890	}
891	else {
892	# before_hash has lost leading slash by this point,
893	# -> add back in prior to substitution with $base_dir
894	$before_hash = "/$before_hash";
895
896	$before_hash = &util::filename_cat("",$before_hash);
897	$before_hash =~ s@^$base_dir/@@;
898	}
899	}
900	} else {
901	# Turn relative file path into full path
902	my $dirname = &File::Basename::dirname($file);
903	$before_hash = &util::filename_cat($dirname, $before_hash);
904	$before_hash = $self->eval_dir_dots($before_hash);
905	}
906
907	my $linkfilename = &util::filename_cat ($base_dir, $before_hash);
908	# make sure there's a slash on the end if it's a directory
909	if ($before_hash !~ m/\/$/) {
910	$before_hash .= "/" if (-d $linkfilename);
911	}
912	return ("http://" . $before_hash, $hash_part, 1);
913	} else {
914	# mailto, news, nntp, telnet, javascript or gopher link
915	return ($before_hash, "", 0);
916	}
917	}
918
919	sub extract_first_NNNN_characters {
920	my $self = shift (@_);
921	my ($textref, $doc_obj, $thissection) = @_;
922
923	foreach my $size (split /,/, $self->{'first'}) {
924	my $tmptext = $$textref;
925	# skip to the body
926	$tmptext =~ s/.<body[^>]>//i;
927	# remove javascript
928	$tmptext =~ s@<script.*?</script>@ @sig;
929	$tmptext =~ s/<[^>]*>/ /g;
930	$tmptext =~ s/ / /g;
931	$tmptext =~ s/^\s+//;
932	$tmptext =~ s/\s+$//;
933	$tmptext =~ s/\s+/ /gs;
934	$tmptext = &unicode::substr ($tmptext, 0, $size);
935	$tmptext =~ s/\s\S*$/…/; # adds an ellipse (...)
936	$doc_obj->add_utf8_metadata ($thissection, "First$size", $tmptext);
937	}
938	}
939
940
941	sub extract_metadata {
942	my $self = shift (@_);
943	my ($textref, $metadata, $doc_obj, $section) = @_;
944	my $outhandle = $self->{'outhandle'};
945	# if we don't want metadata, we may as well not be here ...
946	return if (!defined $self->{'metadata_fields'});
947
948	my $separator = $self->{'metadata_field_separator'};
949	if ($separator eq "") {
950	undef $separator;
951	}
952
953	# metadata fields to extract/save. 'key' is the (lowercase) name of the
954	# html meta, 'value' is the metadata name for greenstone to use
955	my %find_fields = ();
956
957	my %creator_fields = (); # short-cut for lookups
958
959
960	foreach my $field (split /,/, $self->{'metadata_fields'}) {
961	$field =~ s/^\s+//; # remove leading whitespace
962	$field =~ s/\s+$//; # remove trailing whitespace
963
964	# support tag<tagname>
965	if ($field =~ m/^(.?)\s<(.*?)>$/) {
966	# "$2" is the user's preferred gs metadata name
967	$find_fields{lc($1)}=$2; # lc = lowercase
968	} else { # no <tagname> for mapping
969	# "$field" is the user's preferred gs metadata name
970	$find_fields{lc($field)}=$field; # lc = lowercase
971	}
972	}
973
974	if (defined $self->{'hunt_creator_metadata'} &&
975	$self->{'hunt_creator_metadata'} == 1 ) {
976	my @extra_fields =
977	(
978	'author',
979	'author.email',
980	'creator',
981	'dc.creator',
982	'dc.creator.corporatename',
983	);
984
985	# add the creator_metadata fields to search for
986	foreach my $field (@extra_fields) {
987	$creator_fields{$field}=0; # add to lookup hash
988	}
989	}
990
991
992	# find the header in the html file, which has the meta tags
993	$$textref =~ m@<head>(.*?)</head>@si;
994
995	my $html_header=$1;
996
997	# go through every <meta... tag defined in the html and see if it is
998	# one of the tags we want to match.
999
1000	# special case for title - we want to remember if its been found
1001	my $found_title = 0;
1002	# this assumes that ">" won't appear. (I don't think it's allowed to...)
1003	$html_header =~ m/^/; # match the start of the string, for \G assertion
1004
1005	while ($html_header =~ m/\G.?<meta(.?)>/sig) {
1006	my $metatag=$1;
1007	my ($tag, $value);
1008
1009	# find the tag name
1010	$metatag =~ m/(?:name\|http-equiv)\s=\s([\"\'])?(.*?)\1/is;
1011	$tag=$2;
1012	# in case they're not using " or ', but they should...
1013	if (! $tag) {
1014	$metatag =~ m/(?:name\|http-equiv)\s=\s([^\s\>]+)/is;
1015	$tag=$1;
1016	}
1017
1018	if (!defined $tag) {
1019	print $outhandle "HTMLPlugin: can't find NAME in \"$metatag\"\n";
1020	next;
1021	}
1022
1023	# don't need to assign this field if it was passed in from a previous
1024	# (recursive) plugin
1025	if (defined $metadata->{$tag}) {next}
1026
1027	# find the tag content
1028	$metatag =~ m/content\s=\s([\"\'])?(.*?)\1/is;
1029	$value=$2;
1030
1031	if (! $value) {
1032	$metatag =~ m/(?:name\|http-equiv)\s=\s([^\s\>]+)/is;
1033	$value=$1;
1034	}
1035	if (!defined $value) {
1036	print $outhandle "HTMLPlugin: can't find VALUE in \"$metatag\"\n";
1037	next;
1038	}
1039
1040	# clean up and add
1041	$value =~ s/\s+/ /gs;
1042	chomp($value); # remove trailing \n, if any
1043	if (exists $creator_fields{lc($tag)}) {
1044	# map this value onto greenstone's "Creator" metadata
1045	$tag='Creator';
1046	} elsif (!exists $find_fields{lc($tag)}) {
1047	next; # don't want this tag
1048	} else {
1049	# get the user's preferred capitalisation
1050	$tag = $find_fields{lc($tag)};
1051	}
1052	if (lc($tag) eq "title") {
1053	$found_title = 1;
1054	}
1055
1056	if ($self->{'verbosity'} > 2) {
1057	print $outhandle " extracted \"$tag\" metadata \"$value\"\n";
1058	}
1059
1060	if ($tag =~ /\./) {
1061	# there is a . so has a namespace, add ex.
1062	$tag = "ex.$tag";
1063	}
1064	if (defined $separator) {
1065	my @values = split($separator, $value);
1066	foreach my $v (@values) {
1067	$doc_obj->add_utf8_metadata($section, $tag, $v) if $v =~ /\S/;
1068	}
1069	}
1070	else {
1071	$doc_obj->add_utf8_metadata($section, $tag, $value);
1072	}
1073	}
1074
1075	# TITLE: extract the document title
1076	if (exists $find_fields{'title'} && !$found_title) {
1077	# we want a title, and didn't find one in the meta tags
1078	# see if there's a <title> tag
1079	my $title;
1080	my $from = ""; # for debugging output only
1081	if ($html_header =~ m/<title[^>]>([^<]+)<\/title[^>]>/is) {
1082	$title = $1;
1083	$from = "<title> tags";
1084	}
1085
1086	if (!defined $title) {
1087	$from = "first 100 chars";
1088	# if no title use first 100 or so characters
1089	$title = $$textref;
1090	$title =~ s/^\xFE\xFF//; # Remove unicode byte order mark
1091	$title =~ s/^.*?<body>//si;
1092	# ignore javascript!
1093	$title =~ s@<script.*?</script>@ @sig;
1094	$title =~ s/<\/([^>]+)><\1>//g; # (eg) </b><b> - no space
1095	$title =~ s/<[^>]*>/ /g; # remove all HTML tags
1096	$title = substr ($title, 0, 100);
1097	$title =~ s/\s\S*$/.../;
1098	}
1099	$title =~ s/<[^>]*>/ /g; # remove html tags
1100	$title =~ s/ / /g;
1101	$title =~ s/(?: \|\xc2\xa0)/ /g; # utf-8 for nbsp...
1102	$title =~ s/\s+/ /gs; # collapse multiple spaces
1103	$title =~ s/^\s*//; # remove leading spaces
1104	$title =~ s/\s*$//; # remove trailing spaces
1105
1106	$title =~ s/^$self->{'title_sub'}// if ($self->{'title_sub'});
1107	$title =~ s/^\s+//s; # in case title_sub introduced any...
1108	$doc_obj->add_utf8_metadata ($section, 'Title', $title);
1109	print $outhandle " extracted Title metadata \"$title\" from $from\n"
1110	if ($self->{'verbosity'} > 2);
1111	}
1112
1113	# add FileFormat metadata
1114	$doc_obj->add_metadata($section,"FileFormat", "HTML");
1115
1116	# Special, for metadata names such as tagH1 - extracts
1117	# the text between the first <H1> and </H1> tags into "H1" metadata.
1118
1119	foreach my $field (keys %find_fields) {
1120	if ($field !~ m/^tag([a-z0-9]+)$/i) {next}
1121	my $tag = $1;
1122	if ($$textref =~ m@<$tag[^>]>(.?)</$tag[^>]*>@g) {
1123	my $content = $1;
1124	$content =~ s/ / /g;
1125	$content =~ s/<[^>]*>/ /g;
1126	$content =~ s/^\s+//;
1127	$content =~ s/\s+$//;
1128	$content =~ s/\s+/ /gs;
1129	if ($content) {
1130	$tag=$find_fields{"tag$tag"}; # get the user's capitalisation
1131	$tag =~ s/^tag//i;
1132	$doc_obj->add_utf8_metadata ($section, $tag, $content);
1133	print $outhandle " extracted \"$tag\" metadata \"$content\"\n"
1134	if ($self->{'verbosity'} > 2);
1135	}
1136	}
1137	}
1138	}
1139
1140
1141	# evaluate any "../" to next directory up
1142	# evaluate any "./" as here
1143	sub eval_dir_dots {
1144	my $self = shift (@_);
1145	my ($filename) = @_;
1146	my $dirsep_os = &util::get_os_dirsep();
1147	my @dirsep = split(/$dirsep_os/,$filename);
1148
1149	my @eval_dirs = ();
1150	foreach my $d (@dirsep) {
1151	if ($d eq "..") {
1152	pop(@eval_dirs);
1153
1154	} elsif ($d eq ".") {
1155	# do nothing!
1156
1157	} else {
1158	push(@eval_dirs,$d);
1159	}
1160	}
1161
1162	# Need to fiddle with number of elements in @eval_dirs if the
1163	# first one is the empty string. This is because of a
1164	# modification to util::filename_cat that supresses the addition
1165	# of a leading '/' character (or \ if windows) (intended to help
1166	# filename cat with relative paths) if the first entry in the
1167	# array is the empty string. Making the array start with two
1168	# empty strings is a way to defeat this "smart" option.
1169	#
1170	if (scalar(@eval_dirs) > 0) {
1171	if ($eval_dirs[0] eq ""){
1172	unshift(@eval_dirs,"");
1173	}
1174	}
1175
1176	my $evaluated_filename = (scalar @eval_dirs > 0) ? &util::filename_cat(@eval_dirs) : "";
1177	return $evaluated_filename;
1178	}
1179
1180	sub replace_usemap_links {
1181	my $self = shift (@_);
1182	my ($front, $link, $back) = @_;
1183
1184	# remove quotes from link at start and end if necessary
1185	if ($link=~/^[\"\']/) {
1186	$link=~s/^[\"\']//;
1187	$link=~s/[\"\']$//;
1188	$front.='"';
1189	$back="\"$back";
1190	}
1191
1192	$link =~ s/^\.\///;
1193	return $front . $link . $back;
1194	}
1195
1196	sub inc_filecount {
1197	my $self = shift (@_);
1198
1199	if ($self->{'file_num'} == 1000) {
1200	$self->{'dir_num'} ++;
1201	$self->{'file_num'} = 0;
1202	} else {
1203	$self->{'file_num'} ++;
1204	}
1205	}
1206
1207
1208	# Extend read_file so that strings like é are
1209	# converted to UTF8 internally.
1210	#
1211	# We don't convert < or > or & or " in case
1212	# they interfere with the GML files
1213
1214	sub read_file {
1215	my $self = shift(@_);
1216	my ($filename, $encoding, $language, $textref) = @_;
1217
1218	$self->SUPER::read_file($filename, $encoding, $language, $textref);
1219
1220	# Convert entities to their UTF8 equivalents
1221	$$textref =~ s/&(lt\|gt\|amp\|quot\|nbsp);/&z$1;/go;
1222	$$textref =~ s/&([^;]+);/&ghtml::getcharequiv($1,1)/gseo;
1223	$$textref =~ s/&z(lt\|gt\|amp\|quot\|nbsp);/&$1;/go;
1224	}
1225
1226	sub HB_read_html_file {
1227	my $self = shift (@_);
1228	my ($htmlfile, $text) = @_;
1229
1230	# load in the file
1231	if (!open (FILE, $htmlfile)) {
1232	print STDERR "ERROR - could not open $htmlfile\n";
1233	return;
1234	}
1235
1236	my $foundbody = 0;
1237	$self->HB_gettext (\$foundbody, $text, "FILE");
1238	close FILE;
1239
1240	# just in case there was no <body> tag
1241	if (!$foundbody) {
1242	$foundbody = 1;
1243	open (FILE, $htmlfile) \|\| return;
1244	$self->HB_gettext (\$foundbody, $text, "FILE");
1245	close FILE;
1246	}
1247	# text is in utf8
1248	}
1249
1250	# converts the text to utf8, as ghtml does that for é etc.
1251	sub HB_gettext {
1252	my $self = shift (@_);
1253	my ($foundbody, $text, $handle) = @_;
1254
1255	my $line = "";
1256	while (defined ($line = <$handle>)) {
1257	# look for body tag
1258	if (!$$foundbody) {
1259	if ($line =~ s/^.<body[^>]>//i) {
1260	$$foundbody = 1;
1261	} else {
1262	next;
1263	}
1264	}
1265
1266	# check for symbol fonts
1267	if ($line =~ m/<font [^>]?face\s=\s*\"?(\w+)\"?/i) {
1268	my $font = $1;
1269	print STDERR "HBPlug::HB_gettext - warning removed font $font\n"
1270	if ($font !~ m/^arial$/i);
1271	}
1272
1273	$$text .= $line;
1274	}
1275
1276	if ($self->{'input_encoding'} eq "iso_8859_1") {
1277	# convert to utf-8
1278	$$text=&unicode::unicode2utf8(&unicode::convert2unicode("iso_8859_1", $text));
1279	}
1280	# convert any alphanumeric character entities to their utf-8
1281	# equivalent for indexing purposes
1282	#&ghtml::convertcharentities ($$text);
1283
1284	$$text =~ s/\s+/ /g; # remove \n's
1285	}
1286
1287	sub HB_clean_section {
1288	my $self = shift (@_);
1289	my ($section) = @_;
1290
1291	# remove tags without a starting tag from the section
1292	my ($tag, $tagstart);
1293	while ($section =~ m/<\/([^>]{1,10})>/) {
1294	$tag = $1;
1295	$tagstart = index($section, "<$tag");
1296	last if (($tagstart >= 0) && ($tagstart < index($section, "<\/$tag")));
1297	$section =~ s/<\/$tag>//;
1298	}
1299
1300	# remove extra paragraph tags
1301	while ($section =~ s/<p\b[^>]>\s<p\b/<p/ig) {}
1302
1303	# remove extra stuff at the end of the section
1304	while ($section =~ s/(<u>\|<i>\|<b>\|<p\b[^>]*>\| \|\s)$//i) {}
1305
1306	# add a newline at the beginning of each paragraph
1307	$section =~ s/(.)\s*<p\b/$1\n\n<p/gi;
1308
1309	# add a newline every 80 characters at a word boundary
1310	# Note: this regular expression puts a line feed before
1311	# the last word in each section, even when it is not
1312	# needed.
1313	$section =~ s/(.{1,80})\s/$1\n/g;
1314
1315	# fix up the image links
1316	$section =~ s/<img[^>]?src=\"?([^\">]+)\"?[^>]>/
1317	<center><img src=\"$1\" \/><\/center><br\/>/ig;
1318	$section =~ s/<<I>>\s*([^\.]+\.(png\|jpg\|gif))/
1319	<center><img src=\"$1\" \/><\/center><br\/>/ig;
1320
1321	return $section;
1322	}
1323
1324	# Will convert the oldHDL format to the new HDL format (using the Section tag)
1325	sub convert_to_newHDLformat
1326	{
1327	my $self = shift (@_);
1328	my ($file,$cnfile) = @_;
1329	my $input_filename = $file;
1330	my $tmp_filename = $cnfile;
1331
1332	# write HTML tmp file with new HDL format
1333	open (PROD, ">$tmp_filename") \|\| die("Error Writing to File: $tmp_filename $!");
1334
1335	# read in the file and do basic html cleaning (removing header etc)
1336	my $html = "";
1337	$self->HB_read_html_file ($input_filename, \$html);
1338
1339	# process the file one section at a time
1340	my $curtoclevel = 1;
1341	my $firstsection = 1;
1342	my $toclevel = 0;
1343	while (length ($html) > 0) {
1344	if ($html =~ s/^.?(?:<p\b[^>]>)?((<b>\|<i>\|<u>\|\s))<<TOC(\d+)>>\s(.*?)<p\b/<p/i) {
1345	$toclevel = $3;
1346	my $title = $4;
1347	my $sectiontext = "";
1348	if ($html =~ s/^(.?)((?:<p\b[^>]>)?((<b>\|<i>\|<u>\|\s)*)<<TOC\d+>>)/$2/i) {
1349	$sectiontext = $1;
1350	} else {
1351	$sectiontext = $html;
1352	$html = "";
1353	}
1354
1355	# remove tags and extra spaces from the title
1356	$title =~ s/<\/?[^>]+>//g;
1357	$title =~ s/^\s+\|\s+$//g;
1358
1359	# close any sections below the current level and
1360	# create a new section (special case for the firstsection)
1361	print PROD "<!--\n";
1362	while (($curtoclevel > $toclevel) \|\|
1363	(!$firstsection && $curtoclevel == $toclevel)) {
1364	$curtoclevel--;
1365	print PROD "</Section>\n";
1366	}
1367	if ($curtoclevel+1 < $toclevel) {
1368	print STDERR "WARNING - jump in toc levels in $input_filename " .
1369	"from $curtoclevel to $toclevel\n";
1370	}
1371	while ($curtoclevel < $toclevel) {
1372	$curtoclevel++;
1373	}
1374
1375	if ($curtoclevel == 1) {
1376	# add the header tag
1377	print PROD "-->\n";
1378	print PROD "<HTML>\n<HEAD>\n<TITLE>$title</TITLE>\n</HEAD>\n<BODY>\n";
1379	print PROD "<!--\n";
1380	}
1381
1382	print PROD "<Section>\n\t<Description>\n\t\t<Metadata name=\"Title\">$title</Metadata>\n\t</Description>\n";
1383
1384	print PROD "-->\n";
1385
1386	# clean up the section html
1387	$sectiontext = $self->HB_clean_section($sectiontext);
1388
1389	print PROD "$sectiontext\n";
1390
1391	} else {
1392	print STDERR "WARNING - leftover text\n" , $self->shorten($html),
1393	"\nin $input_filename\n";
1394	last;
1395	}
1396	$firstsection = 0;
1397	}
1398
1399	print PROD "<!--\n";
1400	while ($curtoclevel > 0) {
1401	$curtoclevel--;
1402	print PROD "</Section>\n";
1403	}
1404	print PROD "-->\n";
1405
1406	close (PROD) \|\| die("Error Closing File: $tmp_filename $!");
1407
1408	return $tmp_filename;
1409	}
1410
1411	sub shorten {
1412	my $self = shift (@_);
1413	my ($text) = @_;
1414
1415	return "\"$text\"" if (length($text) < 100);
1416
1417	return "\"" . substr ($text, 0, 50) . "\" ... \"" .
1418	substr ($text, length($text)-50) . "\"";
1419	}
1420
1421	sub convert_tidy_or_oldHDL_file
1422	{
1423	my $self = shift (@_);
1424	my ($file) = @_;
1425	my $input_filename = $file;
1426
1427	if (-d $input_filename)
1428	{
1429	return $input_filename;
1430	}
1431
1432	# get the input filename
1433	my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
1434	my $base_dirname = $dirname;
1435	$suffix = lc($suffix);
1436
1437	# derive tmp filename from input filename
1438	# Remove any white space from filename -- no risk of name collision, and
1439	# makes later conversion by utils simpler. Leave spaces in path...
1440	# tidy up the filename with space, dot, hyphen between
1441	$tailname =~ s/\s+//g;
1442	$tailname =~ s/\.+//g;
1443	$tailname =~ s/\-+//g;
1444	# convert to utf-8 otherwise we have problems with the doc.xml file
1445	# later on
1446	&unicode::ensure_utf8(\$tailname);
1447
1448	# softlink to collection tmp dir
1449	my $tmp_dirname = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "tidytmp");
1450	&util::mk_dir($tmp_dirname) if (!-e $tmp_dirname);
1451
1452	my $test_dirname = "";
1453	my $f_separator = &util::get_os_dirsep();
1454
1455	if ($dirname =~ m/import$f_separator/)
1456	{
1457	$test_dirname = $'; #'
1458
1459	#print STDERR "init $'\n";
1460
1461	while ($test_dirname =~ m/[$f_separator]/)
1462	{
1463	my $folderdirname = $`;
1464	$tmp_dirname = &util::filename_cat($tmp_dirname,$folderdirname);
1465	&util::mk_dir($tmp_dirname) if (!-e $tmp_dirname);
1466	$test_dirname = $'; #'
1467	}
1468	}
1469
1470	my $tmp_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix");
1471
1472	# tidy or convert the input file if it is a HTML-like file or it is accepted by the process_exp
1473	if (($suffix eq ".htm") \|\| ($suffix eq ".html") \|\| ($suffix eq ".shtml"))
1474	{
1475	#convert the input file to a new style HDL
1476	my $hdl_output_filename = $input_filename;
1477	if ($self->{'old_style_HDL'})
1478	{
1479	$hdl_output_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix");
1480	$hdl_output_filename = $self->convert_to_newHDLformat($input_filename,$hdl_output_filename);
1481	}
1482
1483	#just for checking copy all other file from the base dir to tmp dir if it is not exists
1484	opendir(DIR,$base_dirname) or die "Can't open base directory : $base_dirname!";
1485	my @files = grep {!/^\.+$/} readdir(DIR);
1486	close(DIR);
1487
1488	foreach my $file (@files)
1489	{
1490	my $src_file = &util::filename_cat($base_dirname,$file);
1491	my $dest_file = &util::filename_cat($tmp_dirname,$file);
1492	if ((!-e $dest_file) && (!-d $src_file))
1493	{
1494	# just copy the original file back to the tmp directory
1495	copy($src_file,$dest_file) or die "Can't copy file $src_file to $dest_file $!";
1496	}
1497	}
1498
1499	# tidy the input file
1500	my $tidy_output_filename = $hdl_output_filename;
1501	if ($self->{'use_realistic_book'})
1502	{
1503	$tidy_output_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix");
1504	$tidy_output_filename = $self->tmp_tidy_file($hdl_output_filename,$tidy_output_filename);
1505	}
1506	$tmp_filename = $tidy_output_filename;
1507	}
1508	else
1509	{
1510	if (!-e $tmp_filename)
1511	{
1512	# just copy the original file back to the tmp directory
1513	copy($input_filename,$tmp_filename) or die "Can't copy file $input_filename to $tmp_filename $!";
1514	}
1515	}
1516
1517	return $tmp_filename;
1518	}
1519
1520
1521	# Will make the html input file as a proper XML file with removed font tag and
1522	# image size added to the img tag.
1523	# The tidying process takes place in a collection specific 'tmp' directory so
1524	# that we don't accidentally damage the input.
1525	sub tmp_tidy_file
1526	{
1527	my $self = shift (@_);
1528	my ($file,$cnfile) = @_;
1529	my $input_filename = $file;
1530	my $tmp_filename = $cnfile;
1531
1532	# get the input filename
1533	my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
1534
1535	require HTML::TokeParser::Simple;
1536
1537	# create HTML parser to decode the input file
1538	my $parser = HTML::TokeParser::Simple->new($input_filename);
1539
1540	# write HTML tmp file without the font tag and image size are added to the img tag
1541	open (PROD, ">$tmp_filename") \|\| die("Error Writing to File: $tmp_filename $!");
1542	while (my $token = $parser->get_token())
1543	{
1544	# is it an img tag
1545	if ($token->is_start_tag('img'))
1546	{
1547	# get the attributes
1548	my $attr = $token->return_attr;
1549
1550	# get the full path to the image
1551	my $img_file = &util::filename_cat($dirname,$attr->{src});
1552
1553	# set the width and height attribute
1554	($attr->{width}, $attr->{height}) = imgsize($img_file);
1555
1556	# recreate the tag
1557	print PROD "<img";
1558	print PROD map { qq { $_="$attr->{$_}"} } keys %$attr;
1559	print PROD ">";
1560	}
1561	# is it a font tag
1562	else
1563	{
1564	if (($token->is_start_tag('font')) \|\| ($token->is_end_tag('font')))
1565	{
1566	# remove font tag
1567	print PROD "";
1568	}
1569	else
1570	{
1571	# print without changes
1572	print PROD $token->as_is;
1573	}
1574	}
1575	}
1576	close (PROD) \|\| die("Error Closing File: $tmp_filename $!");
1577
1578	# run html-tidy on the tmp file to make it a proper XML file
1579	my $tidyfile = `tidy -utf8 -wrap 0 -asxml "$tmp_filename"`;
1580
1581	# write result back to the tmp file
1582	open (PROD, ">$tmp_filename") \|\| die("Error Writing to File: $tmp_filename $!");
1583	print PROD $tidyfile;
1584	close (PROD) \|\| die("Error Closing File: $tmp_filename $!");
1585
1586	# return the output filename
1587	return $tmp_filename;
1588	}
1589
1590	sub associate_cover_image
1591	{
1592	my $self = shift(@_);
1593	my ($doc_obj, $filename) = @_;
1594	if (($self->{'use_realistic_book'}) \|\| ($self->{'old_style_HDL'}))
1595	{
1596	# we will have cover image in tidytmp, but want it from import
1597	$filename =~ s/([\\\/])tidytmp([\\\/])/$1import$2/;
1598	}
1599	$self->SUPER::associate_cover_image($doc_obj, $filename);
1600	}
1601
1602
1603	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: