Context Navigation

source: main/trunk/greenstone2/perllib/plugins/HTMLPlugin.pm@ 23352

Last change on this file since 23352 was 23352, checked in by davidb, 13 years ago
Modifications to code to support filename encoding issues when tested under Windows
Property svn:executable set to `*`; property svn:keywords set to `Author Date Id Revision`
File size: 57.3 KB

Line
1	###########################################################################
2	#
3	# HTMLPlugin.pm -- basic html plugin
4	#
5	# A component of the Greenstone digital library software
6	# from the New Zealand Digital Library Project at the
7	# University of Waikato, New Zealand.
8	#
9	# Copyright (C) 1999 New Zealand Digital Library Project
10	#
11	# This program is free software; you can redistribute it and/or modify
12	# it under the terms of the GNU General Public License as published by
13	# the Free Software Foundation; either version 2 of the License, or
14	# (at your option) any later version.
15	#
16	# This program is distributed in the hope that it will be useful,
17	# but WITHOUT ANY WARRANTY; without even the implied warranty of
18	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	# GNU General Public License for more details.
20	#
21	# You should have received a copy of the GNU General Public License
22	# along with this program; if not, write to the Free Software
23	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24	#
25	###########################################################################
26
27	#
28	# Note that this plugin handles frames only in a very simple way
29	# i.e. each frame is treated as a separate document. This means
30	# search results will contain links to individual frames rather
31	# than linking to the top level frameset.
32	# There may also be some problems caused by the _parent target
33	# (it's removed by this plugin)
34	#
35
36	package HTMLPlugin;
37
38	use Encode;
39
40	use ReadTextFile;
41	use HBPlugin;
42	use ghtml;
43	use unicode;
44	use util;
45	use XMLParser;
46
47	use Image::Size;
48	use File::Copy;
49
# Establish inheritance at compile time: HTMLPlugin derives from
# ReadTextFile first and HBPlugin second.
sub BEGIN {
    @HTMLPlugin::ISA = ('ReadTextFile', 'HBPlugin');
}
53
54	use strict; # every perl program should have this!
55	no strict 'refs'; # make an exception so we can use variables as filehandles
56
# Plugin argument descriptions. Each entry names an option, the resource key
# for its description ('desc'), its type, and (optionally) its default value.
# The list is appended to the shared "ArgList" by new().
my $arguments =
    [ { 'name' => "process_exp",
        'desc' => "{BasePlugin.process_exp}",
        'type' => "regexp",
        'deft' => &get_default_process_exp() },
      { 'name' => "block_exp",
        'desc' => "{BasePlugin.block_exp}",
        'type' => 'regexp',
        'deft' => &get_default_block_exp() },
      { 'name' => "nolinks",
        'desc' => "{HTMLPlugin.nolinks}",
        'type' => "flag" },
      { 'name' => "keep_head",
        'desc' => "{HTMLPlugin.keep_head}",
        'type' => "flag" },
      { 'name' => "no_metadata",
        'desc' => "{HTMLPlugin.no_metadata}",
        'type' => "flag" },
      { 'name' => "metadata_fields",
        'desc' => "{HTMLPlugin.metadata_fields}",
        'type' => "string",
        'deft' => "Title" },
      { 'name' => "metadata_field_separator",
        'desc' => "{HTMLPlugin.metadata_field_separator}",
        'type' => "string",
        'deft' => "" },
      { 'name' => "hunt_creator_metadata",
        'desc' => "{HTMLPlugin.hunt_creator_metadata}",
        'type' => "flag" },
      { 'name' => "file_is_url",
        'desc' => "{HTMLPlugin.file_is_url}",
        'type' => "flag" },
      { 'name' => "assoc_files",
        'desc' => "{HTMLPlugin.assoc_files}",
        'type' => "regexp",
        'deft' => &get_default_block_exp() },
      { 'name' => "rename_assoc_files",
        'desc' => "{HTMLPlugin.rename_assoc_files}",
        'type' => "flag" },
      { 'name' => "title_sub",
        'desc' => "{HTMLPlugin.title_sub}",
        'type' => "string",
        'deft' => "" },
      { 'name' => "description_tags",
        'desc' => "{HTMLPlugin.description_tags}",
        'type' => "flag" },
      # retain this for backward compatibility (w3mir option was replaced by
      # file_is_url)
      { 'name' => "w3mir",
#       'desc' => "{HTMLPlugin.w3mir}",
        'type' => "flag",
        'hiddengli' => "yes"},
      { 'name' => "no_strip_metadata_html",
        'desc' => "{HTMLPlugin.no_strip_metadata_html}",
        'type' => "string",
        'deft' => "",
        'reqd' => "no"},
      { 'name' => "sectionalise_using_h_tags",
        'desc' => "{HTMLPlugin.sectionalise_using_h_tags}",
        'type' => "flag" },
      { 'name' => "use_realistic_book",
        'desc' => "{HTMLPlugin.tidy_html}",
        'type' => "flag"},
      { 'name' => "old_style_HDL",
        'desc' => "{HTMLPlugin.old_style_HDL}",
        'type' => "flag"},
      { 'name' => "processing_tmp_files",
        'desc' => "{BasePlugin.processing_tmp_files}",
        'type' => "flag",
        'hiddengli' => "yes"}
    ];

# Plugin-level option record; new() appends it to the shared "OptList".
my $options = { 'name'     => "HTMLPlugin",
                'desc'     => "{HTMLPlugin.desc}",
                'abstract' => "no",
                'inherits' => "yes",
                'args'     => $arguments };
134
135
# Constructor.
#
# Parameters:
#   $class           - class name to bless the new object into
#   $pluginlist      - array ref; $class is appended to it
#   $inputargs       - passed through unchanged to ReadTextFile->new
#   $hashArgOptLists - hash ref; this plugin's argument list and option
#                      record are appended to its "ArgList" and "OptList"
#
# Returns the object built by ReadTextFile->new, re-blessed into $class.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    # Explicit class-method call; the indirect-object form
    # ("new ReadTextFile(...)") is parsed heuristically and can misfire.
    my $self = ReadTextFile->new($pluginlist, $inputargs, $hashArgOptLists);

    # w3mir is kept only for backward compatibility; it implies file_is_url.
    if ($self->{'w3mir'}) {
        $self->{'file_is_url'} = 1;
    }

    # Bookkeeping for associated files (reset again per document in process()).
    $self->{'aux_files'} = {};
    $self->{'dir_num'} = 0;
    $self->{'file_num'} = 0;

    return bless $self, $class;
}
156
# Default value for the block_exp (and assoc_files) options.
#
# Returns the empty string: blocking is now done by reading through the file
# and recording all the images and other files it refers to.
#
# One may want to use (?i)\.(gif|jpe?g|jpe|png|css|js(?:@.*)?)$
# if there are links such as <script language="javascript" src="img/lib.js@123">
sub get_default_block_exp {
    my $self = shift (@_);

    #return q^(?i)\.(gif|jpe?g|jpe|jpg|png|css)$^;
    return "";
}
167
# Default value for the process_exp option: a case-insensitive pattern
# matching file names this plugin should process.
#
# Returns the pattern as a string. It accepts names ending in .htm/.html,
# .shtml, .shm, .asp, .php (optionally followed by one digit) or .cgi; the
# last alternative is an attempt to encode the concept of an html query
# (something?name=value).
sub get_default_process_exp {
    my $self = shift (@_);

    return q^(?i)(\.html?|\.shtml|\.shm|\.asp|\.php\d?|\.cgi|.+\?.+=.*)$^;
}
174
# Scan an HTML file for the files it references (images, image maps,
# stylesheets/links, embeds, backgrounds and scripts) and record each one in
# $block_hash->{'file_blocks'}.
#
# Parameters:
#   $filename_full_path - path of the HTML file to scan
#   $block_hash         - hash ref; $block_hash->{'file_blocks'}->{<file>} is
#                         set to 1 for every referenced file
#
# Side effects on $self:
#   - {'store_content_encoding'}->{$filename_full_path} is set to the
#     encoding reported by textcat_get_language_encoding (read back later by
#     read_into_doc_obj).
#   - {'utf8_to_original_filename'} maps the decoded link text to the
#     (optionally URL-decoded) file name.
sub store_block_files
{
    my $self = shift (@_);
    my ($filename_full_path, $block_hash) = @_;

    my ($language, $content_encoding) = $self->textcat_get_language_encoding ($filename_full_path);
    $self->{'store_content_encoding'}->{$filename_full_path} = $content_encoding;

    # read in the file via read_file_no_decoding (presumably raw, undecoded
    # bytes -- confirm against ReadTextFile)
    my $raw_text = "";
    $self->read_file_no_decoding ($filename_full_path, \$raw_text);

    # strip comments first so that links inside them are not picked up;
    # comment delimiters may be literal or written with HTML entities
    my $textref = \$raw_text;
    my $opencom = '(?:<!--|&lt;!(?:&mdash;|&#151;|--))';
    my $closecom = '(?:-->|(?:&mdash;|&#151;|--)&gt;)';
    $$textref =~ s/$opencom(.*?)$closecom//gs;

    # an attribute value: either "double quoted" or a bare run of
    # non-space, non-'>' characters
    my $attval = "\\\"[^\\\"]+\\\"|[^\\s>]+";
    my @img_matches = ($$textref =~ m/<img[^>]*?src\s*=\s*($attval)[^>]*>/igs);
    my @usemap_matches = ($$textref =~ m/<img[^>]*?usemap\s*=\s*($attval)[^>]*>/igs);
    my @link_matches = ($$textref =~ m/<link[^>]*?href\s*=\s*($attval)[^>]*>/igs);
    my @embed_matches = ($$textref =~ m/<embed[^>]*?src\s*=\s*($attval)[^>]*>/igs);
    my @tabbg_matches = ($$textref =~ m/<(?:body|table|tr|td)[^>]*?background\s*=\s*($attval)[^>]*>/igs);
    my @script_matches = ($$textref =~ m/<script[^>]*?src\s*=\s*($attval)[^>]*>/igs);

    if (!defined $self->{'utf8_to_original_filename'}) {
        # maps from utf8 converted link name -> original filename referred
        # to by (possibly URL-encoded) src url
        $self->{'utf8_to_original_filename'} = {};
    }

    foreach my $link (@img_matches, @usemap_matches, @link_matches, @embed_matches, @tabbg_matches, @script_matches) {

        # remove quotes from link at start and end if necessary
        if ($link =~ /^\"/) {
            $link =~ s/^\"//;
            $link =~ s/\"$//;
        }

        $link =~ s/\#.*$//s; # remove any anchor names, e.g. foo.html#name becomes foo.html
        # some links may just be anchor names
        next unless ($link =~ /\S+/);

        if ($link !~ m@^/@ && $link !~ m/^([A-Z]:?)\\/) {
            # Turn relative file path into full path
            my $dirname = &File::Basename::dirname($filename_full_path);
            $link = &util::filename_cat($dirname, $link);
        }
        $link = $self->eval_dir_dots($link);

        # this is the actual filename on the filesystem (that the link refers to)
        my $url_original_filename = $self->opt_url_decode($link);

        # Convert the link into its utf8 version and remember which
        # filesystem name it corresponds to
        my $utf8_link = "";
        $self->decode_text($link, $content_encoding, $language, \$utf8_link);

        $self->{'utf8_to_original_filename'}->{$utf8_link} = $url_original_filename;
        # print STDERR "**** utf8_encoded_link to original src filename:\n\t$utf8_link\n\t".$self->{'utf8_to_original_filename'}->{$utf8_link}."\n";

        if ($url_original_filename ne $utf8_link) {
            my $outhandle = $self->{'outhandle'};

            print $outhandle "URL Encoding $url_original_filename\n";
            print $outhandle " ->$utf8_link\n";
        }

        $block_hash->{'file_blocks'}->{$url_original_filename} = 1;
    }
}
247
# Given a filename in any encoding, optionally URL-decode it to get back the
# original filename in the original encoding.
#
# Because this method is intended to work out the original filename, it does
# NOT decode the name when a file called exactly the URL-encoded string
# already exists (the filename itself may legitimately contain %XX).
#
# Parameters:
#   $link - the (possibly URL-encoded) file name
#
# Returns the file name, decoded with unicode::url_decode only if it contains
# a %XX sequence and no file of that literal name exists. (Only the name is
# returned; no separate "decoded" flag.)
sub opt_url_decode {
    my $self = shift (@_);
    my ($link) = @_;

    # Replace %XX's in URL with decoded value if required.
    if ($link =~ m/\%[A-F0-9]{2}/i && !-e $link) {
        $link = &unicode::url_decode($link);
    }

    return $link;
}
268
# Wrapper around the parent read_into_doc_obj.
#
# Looks up the content encoding recorded for this file by store_block_files
# and exposes it as $self->{'content_encoding'} for the duration of the call.
# When use_realistic_book or old_style_HDL is set, the file is first converted
# with convert_tidy_or_oldHDL_file and the converted copy is what the parent
# reads; afterwards the doc object's source filename, source_path, converted
# filename and Source metadata are pointed back at the original file.
#
# Parameters are passed through to SUPER::read_into_doc_obj (with $base_dir
# and $file replaced by the converted file's location when converting).
#
# Returns ($process_status, $doc_obj) as produced by the parent.
sub read_into_doc_obj
{
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;

    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);

    # Lookup content_encoding worked out in file_block pass for this file
    # Store it under the local name 'content_encoding' so its nice and
    # easy to access
    $self->{'content_encoding'} = $self->{'store_content_encoding'}->{$filename_full_path};

    # get the input file
    my $input_filename = $file;
    my $tidy_filename;
    if (($self->{'use_realistic_book'}) || ($self->{'old_style_HDL'}))
    {
        # because the document has to be sectionalized set the description tags
        $self->{'description_tags'} = 1;

        # set the file to be tidied
        $input_filename = &util::filename_cat($base_dir, $file) if $base_dir =~ m/\w/;

        # get the tidied file
        $tidy_filename = $self->convert_tidy_or_oldHDL_file($input_filename);

        # derive tmp filename from input filename
        my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($tidy_filename, "\\.[^\\.]+\$");

        # set the new input file and base_dir to be from the tidied file
        $file = "$tailname$suffix";
        $base_dir = $dirname;
    }

    # call the parent read_into_doc_obj
    my ($process_status, $doc_obj) = $self->SUPER::read_into_doc_obj($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli);
    if (($self->{'use_realistic_book'}) || ($self->{'old_style_HDL'}))
    {
        # now we need to reset the filenames in the doc obj so that the converted filenames are not used
        my $collect_file = &util::filename_within_collection($filename_full_path);
        $doc_obj->set_source_filename ($collect_file, $self->{'file_rename_method'});
        # set_source_filename does not set the doc_obj source_path which is
        # used in archives dbs for incremental build, so set it manually.
        $doc_obj->{'source_path'} = $filename_full_path;
        my $collect_conv_file = &util::filename_within_collection($tidy_filename);
        $doc_obj->set_converted_filename($collect_conv_file);

        my $plugin_filename_encoding = $self->{'filename_encoding'};
        my $filename_encoding = $self->deduce_filename_encoding($file, $metadata, $plugin_filename_encoding);
        $self->set_Source_metadata($doc_obj, $filename_full_path, $filename_encoding);
    }

    # the per-file encoding is no longer needed once the file has been read
    delete $self->{'store_content_encoding'}->{$filename_full_path};
    $self->{'content_encoding'} = undef;

    return ($process_status, $doc_obj);
}
329
# Do plugin specific processing of doc_obj.
#
# Parameters:
#   $textref  - reference to the document text; modified in place
#   $pluginfo - not used by this method
#   $base_dir - base directory of the import
#   $file     - file name relative to $base_dir
#   $metadata - metadata passed on to extract_metadata
#   $doc_obj  - document object that receives text and metadata
#   $gli      - not used by this method
#
# Behaviour, in order:
#   1. resets the per-document associated-file counters;
#   2. with sectionalise_using_h_tags, rewrites <hN>..</hN> headings into
#      <Section>/<Description> comment blocks (via process_heading);
#   3. extracts metadata unless no_metadata or description_tags is set;
#   4. stores "URL" metadata (and weblink metadata when file_is_url is set);
#   5. with description_tags, walks the comments in the body interpreting
#      <Section>, </Section> and <Metadata> tags; otherwise processes the
#      text as a single section.
#
# Always returns 1.
sub process {
    my $self = shift (@_);
    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
    my $outhandle = $self->{'outhandle'};

    if (defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} =~ m/^windows/i) {
        # this makes life so much easier... perl can cope with unix-style '/'s.
        $base_dir =~ s@(\\)+@/@g;
        $file =~ s@(\\)+@/@g;
    }

    # reset per-doc stuff...
    $self->{'aux_files'} = {};
    $self->{'dir_num'} = 0;
    $self->{'file_num'} = 0;

    # process an HTML file where sections are divided by headings tags (H1, H2 ...)
    # you can also include metadata in the format (X can be any number)
    # <hX>Title<!--gsdl-metadata
    #	<Metadata name="name1">value1</Metadata>
    #	...
    #	<Metadata name="nameN">valueN</Metadata>
    #--></hX>
    if ($self->{'sectionalise_using_h_tags'}) {
        # description_tags should always be activated because we convert headings to description tags
        $self->{'description_tags'} = 1;

        my $arrSections = [];
        $$textref =~ s/<h([0-9]+)[^>]*>(.*?)<\/h[0-9]+>/$self->process_heading($1, $2, $arrSections, $file)/isge;

        if (scalar(@$arrSections)) {
            # close whatever sections are still open just before </body>
            my $strMetadata = $self->update_section_data($arrSections, -1);
            if (length($strMetadata)) {
                $strMetadata = '<!--' . $strMetadata . "\n-->\n</body>";
                $$textref =~ s/<\/body>/$strMetadata/ig;
            }
        }
    }

    my $cursection = $doc_obj->get_top_section();

    $self->extract_metadata ($textref, $metadata, $doc_obj, $cursection)
        unless $self->{'no_metadata'} || $self->{'description_tags'};

    # Store URL for page as metadata - this can be used for an
    # altavista style search interface. The URL won't be valid
    # unless the file structure contains the domain name (i.e.
    # like when w3mir is used to download a website).

    # URL metadata (even invalid ones) are used to support internal
    # links, so even if 'file_is_url' is off, still need to store info

    my ($tailname, $dirname) = &File::Basename::fileparse($file);

    # my $utf8_file = $self->filename_to_utf8_metadata($file);
    # $utf8_file =~ s/&\#095;/_/g;
    my $utf8_file = &unicode::raw_filename_to_url_encoded($tailname);

    if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
        print STDERR "***!! file = $file\n";
        print STDERR "***!! utf8_file = $utf8_file\n";
    }

    my $web_url = "http://";
    if (defined $dirname) { # local directory
        # Check for "ftp" in the domain name of the directory
        # structure to determine if this URL should be a ftp:// URL
        # This check is not infallible, but better than omitting the
        # check, which would cause all files downloaded from ftp sites
        # via mirroring with wget to have potentially erroneous http:// URLs
        # assigned in their metadata
        if ($dirname =~ /^[^\/]*ftp/i)
        {
            $web_url = "ftp://";
        }
        $dirname = $self->eval_dir_dots($dirname);
        $dirname .= &util::get_dirsep() if $dirname ne ""; # if there's a directory, it should end on "/"
        $web_url = $web_url.$dirname.$utf8_file;
    } else {
        $web_url = $web_url.$utf8_file;
    }
    $web_url =~ s/\\/\//g;
    $doc_obj->add_utf8_metadata($cursection, "URL", $web_url);

    if ($self->{'file_is_url'}) {
        $doc_obj->add_metadata($cursection, "weblink", "<a href=\"$web_url\">");
        $doc_obj->add_metadata($cursection, "webicon", "_iconworld_");
        $doc_obj->add_metadata($cursection, "/weblink", "</a>");
    }

    if ($self->{'description_tags'}) {
        # remove the html header - note that doing this here means any
        # sections defined within the header will be lost (so all <Section>
        # tags must appear within the body of the HTML)
        my ($head_keep) = ($$textref =~ m/^(.*?)<body[^>]*>/is);
        # no <body> tag at all: nothing was captured, treat the head as empty
        $head_keep = '' unless defined $head_keep;

        $$textref =~ s/^.*?<body[^>]*>//is;
        $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;

        # comment delimiters and tag punctuation may be literal or written
        # with HTML entities
        my $opencom = '(?:<!--|&lt;!(?:&mdash;|&#151;|--))';
        my $closecom = '(?:-->|(?:&mdash;|&#151;|--)&gt;)';

        my $lt = '(?:<|&lt;)';
        my $gt = '(?:>|&gt;)';
        my $quot = '(?:"|&quot;|&rdquo;|&ldquo;)';

        # comma separated list of metadata names -> regex alternation
        my $dont_strip = '';
        if ($self->{'no_strip_metadata_html'}) {
            ($dont_strip = $self->{'no_strip_metadata_html'}) =~ s{,}{|}g;
        }

        my $found_something = 0; my $top = 1;
        while ($$textref =~ s/^(.*?)$opencom(.*?)$closecom//s) {
            my $text = $1;
            my $comment = $2;
            if (defined $text) {
                # text before a comment - note that getting to here
                # doesn't necessarily mean there are Section tags in
                # the document
                $self->process_section(\$text, $base_dir, $file, $doc_obj, $cursection);
            }
            while ($comment =~ s/$lt(.*?)$gt//s) {
                my $tag = $1;
                if ($tag eq "Section") {
                    $found_something = 1;
                    $cursection = $doc_obj->insert_section($doc_obj->get_end_child($cursection)) unless $top;
                    $top = 0;
                } elsif ($tag eq "/Section") {
                    $found_something = 1;
                    $cursection = $doc_obj->get_parent_section ($cursection);
                } elsif ($tag =~ m/^Metadata name=$quot(.*?)$quot/s) {
                    my $metaname = $1;
                    my $accumulate = $tag =~ m/mode=${quot}accumulate${quot}/ ? 1 : 0;
                    # Only trust $1 when the closing tag was actually found;
                    # otherwise $1 would still hold the metadata *name* from
                    # the match above.
                    my $metavalue = ($comment =~ s/^(.*?)$lt\/Metadata$gt//s) ? $1 : "";
                    $metavalue =~ s/^\s+//;
                    $metavalue =~ s/\s+$//;
                    # assume that no metadata value intentionally includes
                    # carriage returns or HTML tags (if they're there they
                    # were probably introduced when converting to HTML from
                    # some other format).
                    # actually some people want to have html tags in their
                    # metadata.
                    $metavalue =~ s/[\cJ\cM]/ /sg;
                    $metavalue =~ s/<[^>]+>//sg
                        unless $dont_strip && ($dont_strip eq 'all' || $metaname =~ m/^($dont_strip)$/);
                    $metavalue =~ s/\s+/ /sg;
                    if ($metaname =~ /\./) { # has a namespace
                        $metaname = "ex.$metaname";
                    }
                    if ($accumulate) {
                        $doc_obj->add_utf8_metadata($cursection, $metaname, $metavalue);
                    } else {
                        $doc_obj->set_utf8_metadata_element($cursection, $metaname, $metavalue);
                    }
                } elsif ($tag eq "Description" || $tag eq "/Description") {
                    # do nothing with containing Description tags
                } else {
                    # simple HTML tag (probably created by the conversion
                    # to HTML from some other format) - we'll ignore it and
                    # hope for the best ;-)
                }
            }
        }
        if ($cursection ne "") {
            print $outhandle "HTMLPlugin: WARNING: $file contains unmatched <Section></Section> tags\n";
        }

        $$textref =~ s/^.*?<body[^>]*>//is;
        $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
        if ($$textref =~ m/\S/) {
            if (!$found_something) {
                if ($self->{'verbosity'} > 2) {
                    print $outhandle "HTMLPlugin: WARNING: $file appears to contain no Section tags so\n";
                    print $outhandle " will be processed as a single section document\n";
                }

                # go ahead and process single-section document
                $self->process_section($textref, $base_dir, $file, $doc_obj, $cursection);

                # if document contains no Section tags we'll go ahead
                # and extract metadata (this won't have been done
                # above as the -description_tags option prevents it)
                my $complete_text = $head_keep.$doc_obj->get_text($cursection);
                $self->extract_metadata (\$complete_text, $metadata, $doc_obj, $cursection)
                    unless $self->{'no_metadata'};

            } else {
                print $outhandle "HTMLPlugin: WARNING: $file contains the following text outside\n";
                print $outhandle " of the final closing </Section> tag. This text will\n";
                print $outhandle " be ignored.";

                my ($text);
                if (length($$textref) > 30) {
                    $text = substr($$textref, 0, 30) . "...";
                } else {
                    $text = $$textref;
                }
                $text =~ s/\n/ /isg;
                print $outhandle " ($text)\n";
            }
        } elsif (!$found_something) {

            if ($self->{'verbosity'} > 2) {
                # may get to here if document contained no valid Section
                # tags but did contain some comments. The text will have
                # been processed already but we should print the warning
                # as above and extract metadata
                print $outhandle "HTMLPlugin: WARNING: $file appears to contain no Section tags and\n";
                print $outhandle " is blank or empty. Metadata will be assigned if present.\n";
            }

            my $complete_text = $head_keep.$doc_obj->get_text($cursection);
            $self->extract_metadata (\$complete_text, $metadata, $doc_obj, $cursection)
                unless $self->{'no_metadata'};
        }

    } else {

        # remove header and footer
        if (!$self->{'keep_head'} || $self->{'description_tags'}) {
            $$textref =~ s/^.*?<body[^>]*>//is;
            $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
        }

        # single section document
        $self->process_section($textref, $base_dir, $file, $doc_obj, $cursection);
    }

    return 1;
}
563
564
# Turn one <hN>...</hN> heading into a comment block that opens a new
# <Section> with the heading text as its Title metadata.
#
# Parameters:
#   $nHeadNo        - heading level (the N of <hN>)
#   $strHeadingText - text between the heading tags; may contain
#                     <!--gsdl-metadata ... --> blocks, which are moved into
#                     the section's <Description>
#   $arrSections    - array ref holding the stack of currently open heading
#                     levels (maintained by update_section_data)
#   $file           - not used by this method
#
# Returns the replacement string: "<!--", any </Section> tags needed to close
# deeper or equal sections, the new <Section><Description> block, then "-->".
sub process_heading
{
    my ($self, $nHeadNo, $strHeadingText, $arrSections, $file) = @_;
    $strHeadingText = '' if (!defined($strHeadingText));

    my $strMetadata = $self->update_section_data($arrSections, int($nHeadNo));

    # pull every embedded gsdl-metadata block out of the heading text
    my $strSecMetadata = '';
    while ($strHeadingText =~ s/<!--gsdl-metadata(.*?)-->//is)
    {
        $strSecMetadata .= $1;
    }

    # trim leading/trailing whitespace from both pieces
    $strHeadingText =~ s/^\s+//;
    $strHeadingText =~ s/\s+$//;
    $strSecMetadata =~ s/^\s+//;
    $strSecMetadata =~ s/\s+$//;

    $strMetadata .= "\n<Section>\n\t<Description>\n\t\t<Metadata name=\"Title\">" . $strHeadingText . "</Metadata>\n";

    if (length($strSecMetadata)) {
        $strMetadata .= "\t\t" . $strSecMetadata . "\n";
    }

    $strMetadata .= "\t</Description>\n";

    return "<!--" . $strMetadata . "-->";
}
593
594
# Maintain the stack of open heading levels and work out which sections must
# be closed before a heading of level $nCurTocNo starts.
#
# Parameters:
#   $arrSections - array ref used as a stack of open heading levels; updated
#                  in place ($nCurTocNo is always pushed before returning)
#   $nCurTocNo   - level of the heading being opened (process() passes -1 at
#                  the end of the document to close everything)
#
# Returns a string with one "\n</Section>" for every open level that is
# greater than or equal to $nCurTocNo (empty when nothing has to be closed).
sub update_section_data
{
    my ($self, $arrSections, $nCurTocNo) = @_;
    my $strBuffer = '';

    # Levels are only pushed when deeper than the current top, so the stack
    # is strictly increasing and every level to close sits at the top.
    while (scalar(@$arrSections) && $nCurTocNo <= $arrSections->[-1]) {
        $strBuffer .= "\n</Section>";
        pop @$arrSections;
    }

    push @$arrSections, $nCurTocNo;
    return $strBuffer;
}
618
619
# Rewrite the links and image references in a piece of text and append the
# result to a section of the document.
#
# Note that process_section may be called multiple times for a single
# section (relying on the fact that add_utf8_text appends the text to any
# that may exist already).
#
# Parameters:
#   $textref    - reference to the text; modified in place
#   $base_dir   - passed to the link/image replacement callbacks
#   $file       - passed to the link/image replacement callbacks
#   $doc_obj    - document object that receives the text
#   $cursection - section of $doc_obj to append to
sub process_section {
    my $self = shift (@_);
    my ($textref, $base_dir, $file, $doc_obj, $cursection) = @_;

    # In each pattern below: $1 is the tag up to and including "attr=",
    # $2 is the attribute value ("double quoted", 'single quoted' or a bare
    # token), and $3 is the remainder of the tag.

    # trap links
    if (!$self->{'nolinks'}) {
        # usemap="./#index" not handled correctly => change to "#index"
        $$textref =~ s/(<img[^>]*?usemap\s*=\s*)((?:[\"][^\"]+[\"])|(?:[\'][^\']+[\'])|(?:[^\s\/>]+))([^>]*>)/
            $self->replace_usemap_links($1, $2, $3)/isge;

        $$textref =~ s/(<(?:a|area|frame|link|script)\s+[^>]*?\s*(?:href|src)\s*=\s*)((?:[\"][^\"]+[\"])|(?:[\'][^\']+[\'])|(?:[^\s\/>]+))([^>]*>)/
            $self->replace_href_links ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
    }

    # trap images

    # Previously, by default, HTMLPlugin would embed <img> tags inside anchor tags
    # i.e. <a href="image><img src="image"></a> in order to overcome a problem that
    # turned regular text succeeding images into links. That is, by embedding <imgs>
    # inside <a href=""></a>, the text following images were no longer misbehaving.
    # However, there would be many occasions whereby images were not meant to link
    # to their source images but where the images would link to another web page.
    # To allow this, the no_image_links option was introduced: it would prevent
    # the behaviour of embedding images into links that referenced the source images.

    # Somewhere along the line, the problem of normal text turning into links when
    # such text followed images which were not embedded in <a href=""></a> ceased
    # to occur. This is why the following lines have been commented out (as well as
    # two lines in replace_images). They appear to no longer apply.

    # If at any time, there is a need for having images embedded in <a> anchor tags,
    # then it might be better to turn that into an HTMLPlugin option rather than make
    # it the default behaviour. Also, eventually, no_image_links needs to become
    # a deprecated option for HTMLPlugin as it has now become the default behaviour.

    #if(!$self->{'no_image_links'}){
    $$textref =~ s/(<(?:img|embed|table|tr|td)[^>]*?(?:src|background)\s*=\s*)((?:[\"][^\"]+[\"])|(?:[\'][^\']+[\'])|(?:[^\s\/>]+))([^>]*>)/
        $self->replace_images ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
    #}

    # add text to document object
    # turn \ into \\ so that the rest of greenstone doesn't think there
    # is an escape code following. (Macro parsing loses them...)
    $$textref =~ s/\\/\\\\/go;

    $doc_obj->add_utf8_text($cursection, $$textref);
}
673
# Callback for the image substitution in process_section: rewrite one
# src/background attribute value so that it points at the associated file.
#
# Parameters:
#   $front    - the tag text up to and including "attr="
#   $link     - the attribute value, possibly still quoted
#   $back     - the remainder of the tag
#   $base_dir, $file           - passed to format_link
#   $base_dir, $doc_obj, $section - passed to add_file
#
# Returns the rebuilt tag: $front, the value returned by add_file, $back.
# If the value was quoted (single or double), it is re-quoted with double
# quotes.
sub replace_images {
    my $self = shift (@_);
    my ($front, $link, $back, $base_dir,
        $file, $doc_obj, $section) = @_;

    # remove quotes from link at start and end if necessary
    if ($link =~ /^[\"\']/) {
        $link =~ s/^[\"\']//;
        $link =~ s/[\"\']$//;
        $front .= '"';
        $back = "\"$back";
    }

    $link =~ s/\n/ /g;

    # Hack to overcome Windows wv 0.7.1 bug that causes embedded images to be broken
    # If the Word file path has spaces in it, wv messes up and you end up with
    # absolute paths for the images, and without the "file://" prefix
    # So check for this special case and massage the data to be correct
    my $on_windows = defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} =~ m/^windows/i;
    my $from_word = defined $self->{'plugin_type'} && $self->{'plugin_type'} eq "WordPlug";
    if ($on_windows && $from_word && $link =~ m/^[A-Za-z]\:\\/) {
        # keep only the file name after the last backslash
        $link =~ s/^.*\\([^\\]+)$/$1/;
    }

    my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);

    my $img_file = $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section);

    # print STDERR "** link = $link\n href = $href\n** img_file = $img_file, rl = $rl\n";

    # The image used to be wrapped in an <a href="$img_file"> anchor; the
    # reasons why that is no longer necessary are given in process_section.
    return $front . $img_file . $back;
}
717
# Callback for the link substitution in process_section: rewrite one
# href/src attribute value.
#
# Parameters:
#   $front    - the tag text up to and including "attr="
#   $link     - the attribute value, possibly still quoted
#   $back     - the remainder of the tag
#   $base_dir, $file    - passed to format_link
#   $doc_obj, $section  - passed to add_file for links to associated files
#
# Returns the rebuilt tag. Pure anchor links (#name) and
# mailto/news/gopher/nntp/telnet/javascript links are returned with the link
# unchanged; external links, links matching process_exp and links ending in
# "/" become _httpextlink_ macros; anything else is handed to add_file.
# In every case target="_top" is renamed to "_gsdltop_" and
# target="_parent" is removed.
sub replace_href_links {
    my $self = shift (@_);
    my ($front, $link, $back, $base_dir, $file, $doc_obj, $section) = @_;

    # remove quotes from link at start and end if necessary
    if ($link =~ /^[\"\']/) {
        $link =~ s/^[\"\']//;
        $link =~ s/[\"\']$//;
        $front .= '"';
        $back = "\"$back";
    }

    # attempt to sort out targets - frames are not handled
    # well in this plugin and some cases will screw things
    # up - e.g. the _parent target (so we'll just remove
    # them all ;-)
    $front =~ s/(target=\"?)_top(\"?)/${1}_gsdltop_$2/is;
    $back =~ s/(target=\"?)_top(\"?)/${1}_gsdltop_$2/is;
    $front =~ s/target=\"?_parent\"?//is;
    $back =~ s/target=\"?_parent\"?//is;

    return $front . $link . $back if $link =~ m/^\#/s;
    $link =~ s/\n/ /g;

    # Find file referred to by $link on file system
    # This is more complicated than it sounds when char encodings
    # is taken in to account
    my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);

    # href may use '\'s where '/'s should be on Windows
    $href =~ s/\\/\//g;
    # everything after the "scheme:" (and optional "//") prefix
    my ($filename) = $href =~ m/^(?:.*?):(?:\/\/)?(.*)/;
    # no scheme in $href: nothing captured, so match against an empty name
    $filename = '' unless defined $filename;

    ##### leave all these links alone (they won't be picked up by intermediate
    ##### pages). I think that's safest when dealing with frames, targets etc.
    ##### (at least until I think of a better way to do it). Problems occur with
    ##### mailto links from within small frames, the intermediate page is displayed
    ##### within that frame and can't be seen. There is still potential for this to
    ##### happen even with html pages - the solution seems to be to somehow tell
    ##### the browser from the server side to display the page being sent (i.e.
    ##### the intermediate page) in the top level window - I'm not sure if that's
    ##### possible - the following line should probably be deleted if that can be done
    return $front . $link . $back if $href =~ m/^(mailto|news|gopher|nntp|telnet|javascript):/is;

    # (the special schemes were already returned above, so they need no
    # second test here)
    if (($rl == 0) || ($filename =~ m/$self->{'process_exp'}/) || ($href =~ m/\/$/)) {

        # If web page didn't give encoding, then default to utf8
        if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
            print STDERR "*** Web page didn't give encoding, defaulting to UTF8!\n";
            print STDERR "***** looking up $file\n";
        }

        my $content_encoding = $self->{'content_encoding'} || "utf8";
        $href = encode($content_encoding, $href);

        $href = &unicode::raw_filename_to_url_encoded($href);
        $href = &unicode::filename_to_url($href);

        &ghtml::urlsafe ($href);
        if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
            print STDERR "***!!! href=$href\n";
        }

        return $front . "_httpextlink_&rl=" . $rl . "&href=" . $href . $hash_part . $back;
    } else {
        # link is to some other type of file (e.g., an image) so we'll
        # need to associate that file
        return $front . $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section) . $back;
    }
}
792
# Decide how a link to a non-HTML resource (e.g. an image) is rewritten
# and, where appropriate, associate the linked file with the document.
#
# Arguments:
#   $href      - link target; its leading "scheme:/" (empty $base_dir) or
#                "scheme://" is stripped to obtain a path under $base_dir
#   $rl        - 0 forces an external-link result; any other value allows
#                the file to be associated with the document
#   $hash_part - text appended after $href in external-link results
#   $base_dir  - directory the stripped link is joined onto
#   $doc_obj   - document object that receives the associated file(s)
#   $section   - section handed through to $doc_obj->associate_file
#
# Returns the replacement link text: an "_httpextlink_..." string when the
# file is not associated, otherwise "_httpdocimg_/<name>".
sub add_file {
    my $self = shift (@_);
    my ($href, $rl, $hash_part, $base_dir, $doc_obj, $section) = @_;
    my ($newname);

    my $debug_unicode = (defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"});

    my $filename = $href;
    if ($base_dir eq "") {
        # remove http:/ thereby leaving one slash at the start
        $filename =~ s/^[^:]*:\///;
    }
    else {
        # remove http://
        $filename =~ s/^[^:]*:\/\///;
    }

    $filename = &util::filename_cat($base_dir, $filename);
    if (($self->{'use_realistic_book'}) || ($self->{'old_style_HDL'})) {
        # we are processing a tidytmp file - want paths to be in import
        $filename =~ s/([\\\/])tidytmp([\\\/])/$1import$2/;
    }

    # Replace %XX's in URL with decoded value if required. Note that the
    # filename may include the %XX in some situations. If the original
    # file's name was in URL encoding, the following method will not decode
    # it.
    my $utf8_filename = $filename;
    my $opt_decode_utf8_filename = $self->opt_url_decode($utf8_filename);

    my $content_encoding = $self->{'content_encoding'} || "utf8";

    # The filenames that come through the HTML file have been decoded
    # into Unicode aware Perl strings. Need to convert them back
    # to their initial raw-byte encoding to match the file that
    # exists on the file system
    $filename = encode($content_encoding, $opt_decode_utf8_filename);

    # some special processing if the intended filename was converted to utf8, but
    # the actual file still needs to be renamed
    if (!-e $filename) {
        # try the original filename stored in map
        if ($debug_unicode) {
            print STDERR "***###!! orig filename did not exist: $filename\n";
        }

        my $original_filename = $self->{'utf8_to_original_filename'}->{$utf8_filename};

        if ($debug_unicode) {
            # the map may have no entry; print an empty name rather than
            # interpolating undef
            my $shown = (defined $original_filename) ? $original_filename : "";
            print STDERR "**** Trying for $shown\n";
        }

        if (defined $original_filename && -e $original_filename) {
            if ($debug_unicode) {
                print STDERR "*** found match\n";
            }
            $filename = $original_filename;
        }
    }

    my ($ext) = $filename =~ m/(\.[^\.]*)$/;

    # only files whose extension matches the assoc_files pattern are
    # ever associated with the document
    my $is_assoc_type = (defined $ext) && ($ext =~ m/$self->{'assoc_files'}/);

    if ($rl == 0) {
        my $el = $is_assoc_type ? "direct" : "prompt";
        return "_httpextlink_&rl=0&el=" . $el . "&href=" . $href . $hash_part;
    }

    if (!$is_assoc_type) {
        return "_httpextlink_&rl=" . $rl . "&href=" . $href . $hash_part;
    }

    # add the original image file as a source file
    if (!$self->{'processing_tmp_files'} ) {
        $doc_obj->associate_source_file($filename);
    }

    if ($self->{'rename_assoc_files'}) {
        # associated files are renamed to <dir_num>/<file_num><ext>; the
        # same $href always maps onto the same generated name
        if (defined $self->{'aux_files'}->{$href}) {
            $newname = $self->{'aux_files'}->{$href}->{'dir_num'} . "/" .
                $self->{'aux_files'}->{$href}->{'file_num'} . $ext;
        } else {
            $newname = $self->{'dir_num'} . "/" . $self->{'file_num'} . $ext;
            $self->{'aux_files'}->{$href} = {'dir_num' => $self->{'dir_num'}, 'file_num' => $self->{'file_num'}};
            $self->inc_filecount ();
        }
        $doc_obj->associate_file($filename, $newname, undef, $section);
        return "_httpdocimg_/$newname";
    }

    if (&unicode::is_url_encoded($utf8_filename)) {
        # use the possibly-decoded filename instead to avoid double URL encoding
        ($newname) = $filename =~ m/([^\/\\]*)$/;
    } else {
        ($newname) = $utf8_filename =~ m/([^\/\\]*)$/;
    }

    # Make sure this name uses only ASCII characters.
    # We use either base64 or URL encoding, as these preserve original encoding
    $newname = &util::rename_file($newname, $self->{'file_rename_method'});

    $doc_obj->associate_file($filename, $newname, undef, $section);

    # Since the generated image will be URL-encoded to avoid file-system/browser mess-ups
    # of filenames, URL-encode the additional percent signs of the URL-encoded filename
    my $newname_url = &unicode::filename_to_url($newname);
    return "_httpdocimg_/$newname_url";
}
902
903
# Normalise a link found in an HTML document.
#
# Arguments:
#   $link     - the raw link text
#   $base_dir - directory that local link targets are resolved against
#   $file     - name of the document containing the link
#
# Returns a three element list: (normalised link, fragment part, flag).
# The flag is 1 when an absolute http/https/ftp/file/mms link names
# something that exists under $base_dir, always 1 for links treated as
# relative/local, and 0 for everything else (badly formed links, mailto,
# news, gopher, nntp, telnet, javascript, and links starting with "/").
sub format_link {
    my $self = shift (@_);
    my ($link, $base_dir, $file) = @_;

    # split the link into the part before any '#' and the '#...' fragment
    my ($before_hash, $hash_part) = $link =~ m/^([^\#]*)(\#?.*)$/;

    $hash_part = "" if !defined $hash_part;
    if (!defined $before_hash || $before_hash !~ m/[\w\.\/]/) {
        my $outhandle = $self->{'outhandle'};
        print $outhandle "HTMLPlugin: ERROR - badly formatted tag ignored ($link)\n"
            if $self->{'verbosity'};
        return ($link, "", 0);
    }

    if ($before_hash =~ s@^((?:http|https|ftp|file|mms)://)@@i) {
        my $type = $1;

        if ($link =~ m/^(http|ftp):/i) {
            # Turn url (using /) into file name (possibly using \ on windows)
            my @http_dir_split = split('/', $before_hash);
            $before_hash = &util::filename_cat(@http_dir_split);
        }

        $before_hash = $self->eval_dir_dots($before_hash);

        my $linkfilename = &util::filename_cat ($base_dir, $before_hash);

        my $rl = 0;
        $rl = 1 if (-e $linkfilename);

        # make sure there's a slash on the end if it's a directory
        if ($before_hash !~ m/\/$/) {
            $before_hash .= "/" if (-d $linkfilename);
        }
        return ($type . $before_hash, $hash_part, $rl);

    } elsif ($link !~ m/^(mailto|news|gopher|nntp|telnet|javascript):/i && $link !~ m/^\//) {

        if ($before_hash =~ s@^/@@ || $before_hash =~ m/\\/) {

            # the first directory will be the domain name if file_is_url
            # to generate archives, otherwise we'll assume all files are
            # from the same site and base_dir is the root

            if ($self->{'file_is_url'}) {
                my @dirs = split /[\/\\]/, $file;
                my $domname = shift (@dirs);
                $before_hash = &util::filename_cat($domname, $before_hash);
                $before_hash =~ s@\\@/@g; # for windows
            }
            else
            {
                # see if link shares directory with source document
                # => turn into relative link if this is so!

                if ($ENV{'GSDLOS'} =~ m/^windows/i) {
                    # too difficult doing a pattern match with embedded '\'s...
                    my $win_before_hash = $before_hash;
                    $win_before_hash =~ s@(\\)+@/@g;
                    # $base_dir is already similarly "converted" on windows.
                    # \Q..\E makes $base_dir match literally; no /o, so a
                    # different $base_dir on a later call is honoured.
                    if ($win_before_hash =~ s@^\Q$base_dir\E/@@) {
                        # if this is true, we removed a prefix
                        $before_hash = $win_before_hash;
                    }
                }
                else {
                    # before_hash has lost leading slash by this point,
                    # -> add back in prior to substitution with $base_dir
                    $before_hash = "/$before_hash";

                    $before_hash = &util::filename_cat("",$before_hash);
                    # match $base_dir literally, not as a regular expression
                    $before_hash =~ s@^\Q$base_dir\E/@@;
                }
            }
        } else {
            # Turn relative file path into full path
            my $dirname = &File::Basename::dirname($file);
            $before_hash = &util::filename_cat($dirname, $before_hash);
            $before_hash = $self->eval_dir_dots($before_hash);
        }

        my $linkfilename = &util::filename_cat ($base_dir, $before_hash);
        # make sure there's a slash on the end if it's a directory
        if ($before_hash !~ m/\/$/) {
            $before_hash .= "/" if (-d $linkfilename);
        }
        return ("http://" . $before_hash, $hash_part, 1);
    } else {
        # mailto, news, nntp, telnet, javascript or gopher link
        # (links that start with "/" also end up here)
        return ($before_hash, "", 0);
    }
}
996
# Add "First<N>" metadata holding roughly the first N characters of the
# document's visible text, for every N in the comma separated 'first'
# setting.
#
# Arguments:
#   $textref     - reference to the document's HTML text (not modified)
#   $doc_obj     - document object that receives the metadata
#   $thissection - section the metadata is attached to
sub extract_first_NNNN_characters {
    my $self = shift (@_);
    my ($textref, $doc_obj, $thissection) = @_;

    foreach my $size (split /,/, $self->{'first'}) {
        my $tmptext = $$textref;
        # skip to the body
        $tmptext =~ s/.*<body[^>]*>//i;
        # remove javascript
        $tmptext =~ s@<script.*?</script>@ @sig;
        # remove the remaining tags and non-breaking space entities
        $tmptext =~ s/<[^>]*>/ /g;
        $tmptext =~ s/&nbsp;/ /g;
        # trim, then collapse runs of whitespace
        $tmptext =~ s/^\s+//;
        $tmptext =~ s/\s+$//;
        $tmptext =~ s/\s+/ /gs;
        $tmptext = &unicode::substr ($tmptext, 0, $size);
        # replace the last, possibly cut off, word
        $tmptext =~ s/\s\S*$/&#8230;/; # adds an ellipse (...)
        $doc_obj->add_utf8_metadata ($thissection, "First$size", $tmptext);
    }
}
1017
1018
# Extract metadata from an HTML document and add it to $doc_obj.
#
# Three sources are used:
#   * <meta name=... content=...> (or http-equiv) tags in the <head>,
#     for the names listed in the 'metadata_fields' setting;
#   * the document title (the <title> element, else the first 100 or so
#     characters of the body) when "title" is wanted and no meta tag
#     supplied it;
#   * fields named tagXX, which take the text of the first <XX> element.
# "FileFormat" => "HTML" metadata is always added.
#
# Arguments:
#   $textref  - reference to the document's HTML text
#   $metadata - hash ref of metadata already assigned; meta tags whose
#               name is defined in it are skipped
#   $doc_obj  - document object that receives the metadata
#   $section  - section the metadata is attached to
#
# Returns nothing useful. Returns immediately when 'metadata_fields' is
# not defined.
sub extract_metadata {
    my $self = shift (@_);
    my ($textref, $metadata, $doc_obj, $section) = @_;
    my $outhandle = $self->{'outhandle'};
    # if we don't want metadata, we may as well not be here ...
    return if (!defined $self->{'metadata_fields'});

    my $separator = $self->{'metadata_field_separator'};
    if (defined $separator && $separator eq "") {
        undef $separator;
    }

    # metadata fields to extract/save. 'key' is the (lowercase) name of the
    # html meta, 'value' is the metadata name for greenstone to use
    my %find_fields = ();

    my %creator_fields = (); # short-cut for lookups

    foreach my $field (split /,/, $self->{'metadata_fields'}) {
        $field =~ s/^\s+//; # remove leading whitespace
        $field =~ s/\s+$//; # remove trailing whitespace

        # support tag<tagname>
        if ($field =~ m/^(.*?)\s*<(.*?)>$/) {
            # "$2" is the user's preferred gs metadata name
            $find_fields{lc($1)}=$2; # lc = lowercase
        } else { # no <tagname> for mapping
            # "$field" is the user's preferred gs metadata name
            $find_fields{lc($field)}=$field; # lc = lowercase
        }
    }

    if (defined $self->{'hunt_creator_metadata'} &&
        $self->{'hunt_creator_metadata'} == 1 ) {
        my @extra_fields =
            (
             'author',
             'author.email',
             'creator',
             'dc.creator',
             'dc.creator.corporatename',
            );

        # add the creator_metadata fields to search for
        foreach my $field (@extra_fields) {
            $creator_fields{$field}=0; # add to lookup hash
        }
    }

    # find the header in the html file, which has the meta tags.
    # Capture in list context so that a document without <head>...</head>
    # gives an empty header instead of a stale or undefined $1.
    my ($html_header) = $$textref =~ m@<head>(.*?)</head>@si;
    $html_header = "" if !defined $html_header;

    # go through every <meta... tag defined in the html and see if it is
    # one of the tags we want to match.

    # special case for title - we want to remember if its been found
    my $found_title = 0;
    # this assumes that ">" won't appear. (I don't think it's allowed to...)
    while ($html_header =~ m/\G.*?<meta(.*?)>/sig) {
        my $metatag = $1;

        # find the tag name: quoted form first, then (in case they're not
        # using " or ', but they should...) a bare word. Captures are only
        # read after a successful match, so nothing leaks in from an
        # earlier match.
        my $tag;
        if ($metatag =~ m/(?:name|http-equiv)\s*=\s*([\"\'])(.*?)\1/is) {
            $tag = $2;
        }
        if ((!defined $tag || $tag eq "")
            && $metatag =~ m/(?:name|http-equiv)\s*=\s*([^\s\>]+)/is) {
            $tag = $1;
        }

        if (!defined $tag || $tag eq "") {
            # e.g. <meta charset=...>; only reported at high verbosity
            print $outhandle "HTMLPlugin: can't find NAME in \"$metatag\"\n"
                if ($self->{'verbosity'} > 2);
            next;
        }

        # don't need to assign this field if it was passed in from a previous
        # (recursive) plugin
        if (defined $metadata->{$tag}) {next}

        # find the tag content: quoted form first, then a bare word
        my $value;
        if ($metatag =~ m/content\s*=\s*([\"\'])(.*?)\1/is) {
            $value = $2;
        }
        elsif ($metatag =~ m/content\s*=\s*([^\s\>]+)/is) {
            $value = $1;
        }

        if (!defined $value || $value eq "") {
            print $outhandle "HTMLPlugin: can't find VALUE in \"$metatag\"\n"
                if ($self->{'verbosity'} > 2);
            next;
        }

        # clean up and add
        $value =~ s/\s+/ /gs;
        if (exists $creator_fields{lc($tag)}) {
            # map this value onto greenstone's "Creator" metadata
            $tag='Creator';
        } elsif (!exists $find_fields{lc($tag)}) {
            next; # don't want this tag
        } else {
            # get the user's preferred capitalisation
            $tag = $find_fields{lc($tag)};
        }
        if (lc($tag) eq "title") {
            $found_title = 1;
        }

        if ($self->{'verbosity'} > 2) {
            print $outhandle " extracted \"$tag\" metadata \"$value\"\n";
        }

        if ($tag =~ /\./) {
            # there is a . so has a namespace, add ex.
            $tag = "ex.$tag";
        }
        if (defined $separator) {
            my @values = split($separator, $value);
            foreach my $v (@values) {
                $doc_obj->add_utf8_metadata($section, $tag, $v) if $v =~ /\S/;
            }
        }
        else {
            $doc_obj->add_utf8_metadata($section, $tag, $value);
        }
    }

    # TITLE: extract the document title
    if (exists $find_fields{'title'} && !$found_title) {
        # we want a title, and didn't find one in the meta tags
        # see if there's a <title> tag
        my $title;
        my $from = ""; # for debugging output only
        if ($html_header =~ m/<title[^>]*>([^<]+)<\/title[^>]*>/is) {
            $title = $1;
            $from = "<title> tags";
        }

        if (!defined $title) {
            $from = "first 100 chars";
            # if no title use first 100 or so characters
            $title = $$textref;
            $title =~ s/^\xFE\xFF//; # Remove unicode byte order mark
            $title =~ s/^.*?<body>//si;
            # ignore javascript!
            $title =~ s@<script.*?</script>@ @sig;
            $title =~ s/<\/([^>]+)><\1>//g; # (eg) </b><b> - no space
            $title =~ s/<[^>]*>/ /g; # remove all HTML tags
            $title = substr ($title, 0, 100);
            $title =~ s/\s\S*$/.../;
        }
        $title =~ s/<[^>]*>/ /g; # remove html tags
        $title =~ s/&nbsp;/ /g;
        $title =~ s/(?:&nbsp;|\xc2\xa0)/ /g; # utf-8 for nbsp...
        $title =~ s/\s+/ /gs; # collapse multiple spaces
        $title =~ s/^\s*//; # remove leading spaces
        $title =~ s/\s*$//; # remove trailing spaces

        $title =~ s/^$self->{'title_sub'}// if ($self->{'title_sub'});
        $title =~ s/^\s+//s; # in case title_sub introduced any...
        $doc_obj->add_utf8_metadata ($section, "Title", $title);
        print $outhandle " extracted Title metadata \"$title\" from $from\n"
            if ($self->{'verbosity'} > 2);
    }

    # add FileFormat metadata
    $doc_obj->add_metadata($section,"FileFormat", "HTML");

    # Special, for metadata names such as tagH1 - extracts
    # the text between the first <H1> and </H1> tags into "H1" metadata.

    foreach my $field (keys %find_fields) {
        if ($field !~ m/^tag([a-z0-9]+)$/i) {next}
        my $tag = $1;
        # No /g here: in scalar context it would leave pos() set on the
        # text, so the search for the next tagXX field would start after
        # this match. /i because the keys are lowercased while the HTML
        # may use <H1>; /s so the element may span lines.
        if ($$textref =~ m@<$tag[^>]*>(.*?)</$tag[^>]*>@si) {
            my $content = $1;
            $content =~ s/&nbsp;/ /g;
            $content =~ s/<[^>]*>/ /g;
            $content =~ s/^\s+//;
            $content =~ s/\s+$//;
            $content =~ s/\s+/ /gs;
            if ($content) {
                $tag = $find_fields{$field}; # get the user's capitalisation
                $tag =~ s/^tag//i;
                $doc_obj->add_utf8_metadata ($section, $tag, $content);
                print $outhandle " extracted \"$tag\" metadata \"$content\"\n"
                    if ($self->{'verbosity'} > 2);
            }
        }
    }
}
1217
1218
# Collapse "." and ".." path components:
#   ".." drops the component before it (if there is one),
#   "."  is simply discarded.
# The path is split on the pattern returned by util::get_os_dirsep and
# re-joined with util::filename_cat. Returns "" when nothing is left.
sub eval_dir_dots {
    my $self = shift (@_);
    my ($filename) = @_;

    my $sep_pattern = &util::get_os_dirsep();

    my @kept = ();
    for my $part (split(/$sep_pattern/, $filename)) {
        next if ($part eq ".");

        if ($part eq "..") {
            pop(@kept);
            next;
        }

        push(@kept, $part);
    }

    my $result = "";
    if (@kept) {
        # util::filename_cat does not add a leading separator when its
        # first argument is the empty string (a feature meant to help
        # with relative paths). Starting the list with two empty strings
        # defeats that, so a path that began with a separator keeps it.
        unshift(@kept, "") if ($kept[0] eq "");

        $result = &util::filename_cat(@kept);
    }

    return $result;
}
1257
# Rebuild a usemap attribute from its three pieces ($front, $link, $back).
# A link that starts with a quote has its surrounding quotes removed and
# double quotes put back around it; a leading "./" is dropped from the
# link. Returns the reassembled string.
sub replace_usemap_links {
    my $self = shift (@_);
    my ($front, $link, $back) = @_;

    # the substitution succeeds exactly when the link starts with a quote
    if ($link =~ s/^[\"\']//) {
        $link =~ s/[\"\']$//;
        $front = $front . '"';
        $back = '"' . $back;
    }

    $link =~ s/^\.\///;

    return join("", $front, $link, $back);
}
1273
# Advance the dir_num/file_num counter pair: file_num counts up until it
# reaches 1000, at which point dir_num moves on and file_num restarts at 0.
sub inc_filecount {
    my $self = shift (@_);

    if ($self->{'file_num'} != 1000) {
        $self->{'file_num'} ++;
    }
    else {
        $self->{'dir_num'} ++;
        $self->{'file_num'} = 0;
    }
}
1284
1285
# Extend read_file so that strings like &eacute; are
# converted to UTF8 internally.
#
# We don't convert &lt; or &gt; or &amp; or &quot; in case
# they interfere with the GML files
#
# Arguments are passed straight through to the parent class's read_file;
# the text is then rewritten in place through $textref.
sub read_file {
    my $self = shift(@_);
    my ($filename, $encoding, $language, $textref) = @_;

    $self->SUPER::read_file($filename, $encoding, $language, $textref);

    # Convert entities to their UTF8 equivalents.
    # The five protected entities are temporarily renamed (&lt; -> &zlt;)
    # so that the general conversion leaves them alone, then renamed back.
    $$textref =~ s/&(lt|gt|amp|quot|nbsp);/&z$1;/go;
    $$textref =~ s/&([^;]+);/&ghtml::getcharequiv($1,1,1)/gseo;
    $$textref =~ s/&z(lt|gt|amp|quot|nbsp);/&$1;/go;

}
1304
# Read an HTML file and append its body text to the string referenced by
# $text, using HB_gettext. If the first pass finds no <body> tag the file
# is read a second time with the whole content treated as body.
#
# Arguments:
#   $htmlfile - name of the file to read
#   $text     - reference to the string the text is appended to
#
# Prints an error to STDERR and returns when the file cannot be opened.
sub HB_read_html_file {
    my $self = shift (@_);
    my ($htmlfile, $text) = @_;

    # load in the file (three-argument open: the name is never
    # interpreted as a mode or a command)
    my $fh;
    if (!open ($fh, '<', $htmlfile)) {
        print STDERR "ERROR - could not open $htmlfile\n";
        return;
    }

    my $foundbody = 0;
    $self->HB_gettext (\$foundbody, $text, $fh);
    close ($fh);

    # just in case there was no <body> tag
    if (!$foundbody) {
        $foundbody = 1;
        open ($fh, '<', $htmlfile) || return;
        $self->HB_gettext (\$foundbody, $text, $fh);
        close ($fh);
    }
    # text is in utf8
}
1328
# Read lines from $handle and append them to the string referenced by
# $text. Lines are skipped until one containing a <body ...> tag is seen
# (unless $$foundbody is already true); that line has everything up to
# and including the tag removed.
#
# converts the text to utf8, as ghtml does that for &eacute; etc.
#
# Arguments:
#   $foundbody - reference to a flag; set to 1 once <body> has been seen
#   $text      - reference to the string that accumulates the text
#   $handle    - file handle to read from
#
# On return $$text has runs of whitespace collapsed to single spaces and
# has been passed through decode("utf8", ...).
sub HB_gettext {
    my $self = shift (@_);
    my ($foundbody, $text, $handle) = @_;

    my $line = "";
    while (defined ($line = <$handle>)) {
        # look for body tag
        if (!$$foundbody) {
            if ($line =~ s/^.*<body[^>]*>//i) {
                $$foundbody = 1;
            } else {
                next;
            }
        }

        # check for symbol fonts
        if ($line =~ m/<font [^>]*?face\s*=\s*\"?(\w+)\"?/i) {
            my $font = $1;
            print STDERR "HBPlug::HB_gettext - warning removed font $font\n"
                if ($font !~ m/^arial$/i);
        }

        $$text .= $line;
    }

    if ($self->{'input_encoding'} eq "iso_8859_1") {
        # convert to utf-8
        $$text=&unicode::unicode2utf8(&unicode::convert2unicode("iso_8859_1", $text));
    }
    # convert any alphanumeric character entities to their utf-8
    # equivalent for indexing purposes
    #&ghtml::convertcharentities ($$text);

    $$text =~ s/\s+/ /g; # remove \n's

    # At this point $$text is a binary byte string
    # => turn it into a Unicode aware string, so full
    # Unicode aware pattern matching can be used.
    # For instance: 's/\x{0101}//g' or '[[:upper:]]'
    #

    $$text = decode("utf8",$$text);
}
1373
# Tidy the HTML of one section and return the cleaned text.
#
# Steps: drop closing tags that have no opening tag before them, merge
# doubled <p> tags, strip trailing <u>/<i>/<b>/<p>/&nbsp;/whitespace,
# put a blank line before each paragraph, wrap at about 80 characters,
# and turn <img> tags and &lt;&lt;I&gt;&gt; image markers into centred
# <img> elements.
sub HB_clean_section {
    my $self = shift (@_);
    my ($section) = @_;

    # remove tags without a starting tag from the section
    my ($tag, $tagstart);
    while ($section =~ m/<\/([^>]{1,10})>/) {
        $tag = $1;
        $tagstart = index($section, "<$tag");
        last if (($tagstart >= 0) && ($tagstart < index($section, "<\/$tag")));
        $section =~ s/<\/$tag>//;
    }

    # remove extra paragraph tags
    while ($section =~ s/<p\b[^>]*>\s*<p\b/<p/ig) {}

    # remove extra stuff at the end of the section
    while ($section =~ s/(<u>|<i>|<b>|<p\b[^>]*>|&nbsp;|\s)$//i) {}

    # add a newline at the beginning of each paragraph
    $section =~ s/(.)\s*<p\b/$1\n\n<p/gi;

    # add a newline every 80 characters at a word boundary
    # Note: this regular expression puts a line feed before
    # the last word in each section, even when it is not
    # needed.
    $section =~ s/(.{1,80})\s/$1\n/g;

    # fix up the image links (each replacement starts on a new line)
    $section =~ s/<img[^>]*?src=\"?([^\">]+)\"?[^>]*>/\n<center><img src=\"$1\" \/><\/center><br\/>/ig;
    $section =~ s/&lt;&lt;I&gt;&gt;\s*([^\.]+\.(png|jpg|gif))/\n<center><img src=\"$1\" \/><\/center><br\/>/ig;

    return $section;
}
1410
# Will convert the oldHDL format to the new HDL format (using the Section tag)
#
# The input is cut into sections at &lt;&lt;TOCn&gt;&gt; markers (n is the
# table-of-contents level). Each section is written to the output file as
# a <Section> block, carried inside HTML comments, followed by the cleaned
# section text.
#
# Arguments:
#   $file   - name of the old-style HDL input file
#   $cnfile - name of the file to write
#
# Returns $cnfile. Dies if the output file cannot be opened or closed.
sub convert_to_newHDLformat
{
    my $self = shift (@_);
    my ($file,$cnfile) = @_;
    my $input_filename = $file;
    my $tmp_filename = $cnfile;

    # write HTML tmp file with new HDL format
    open (my $prod, '>', $tmp_filename) || die("Error Writing to File: $tmp_filename $!");

    # read in the file and do basic html cleaning (removing header etc)
    my $html = "";
    $self->HB_read_html_file ($input_filename, \$html);

    # process the file one section at a time
    my $curtoclevel = 1;
    my $firstsection = 1;
    my $toclevel = 0;
    while (length ($html) > 0) {
        if ($html =~ s/^.*?(?:<p\b[^>]*>)?((<b>|<i>|<u>|\s)*)&lt;&lt;TOC(\d+)&gt;&gt;\s*(.*?)<p\b/<p/i) {
            $toclevel = $3;
            my $title = $4;
            my $sectiontext = "";
            # everything up to the next TOC marker belongs to this section
            if ($html =~ s/^(.*?)((?:<p\b[^>]*>)?((<b>|<i>|<u>|\s)*)&lt;&lt;TOC\d+&gt;&gt;)/$2/i) {
                $sectiontext = $1;
            } else {
                $sectiontext = $html;
                $html = "";
            }

            # remove tags and extra spaces from the title
            $title =~ s/<\/?[^>]+>//g;
            $title =~ s/^\s+|\s+$//g;

            # close any sections below the current level and
            # create a new section (special case for the firstsection)
            print $prod "<!--\n";
            while (($curtoclevel > $toclevel) ||
                   (!$firstsection && $curtoclevel == $toclevel)) {
                $curtoclevel--;
                print $prod "</Section>\n";
            }
            if ($curtoclevel+1 < $toclevel) {
                print STDERR "WARNING - jump in toc levels in $input_filename " .
                    "from $curtoclevel to $toclevel\n";
            }
            while ($curtoclevel < $toclevel) {
                $curtoclevel++;
            }

            if ($curtoclevel == 1) {
                # add the header tag
                print $prod "-->\n";
                print $prod "<HTML>\n<HEAD>\n<TITLE>$title</TITLE>\n</HEAD>\n<BODY>\n";
                print $prod "<!--\n";
            }

            print $prod "<Section>\n\t<Description>\n\t\t<Metadata name=\"Title\">$title</Metadata>\n\t</Description>\n";

            print $prod "-->\n";

            # clean up the section html
            $sectiontext = $self->HB_clean_section($sectiontext);

            print $prod "$sectiontext\n";

        } else {
            print STDERR "WARNING - leftover text\n" , $self->shorten($html),
                "\nin $input_filename\n";
            last;
        }
        $firstsection = 0;
    }

    # close whatever sections are still open
    print $prod "<!--\n";
    while ($curtoclevel > 0) {
        $curtoclevel--;
        print $prod "</Section>\n";
    }
    print $prod "-->\n";

    close ($prod) || die("Error Closing File: $tmp_filename $!");

    return $tmp_filename;
}
1497
# Return $text in double quotes. Text of 100 characters or more is
# abbreviated to its first 50 and last 50 characters, each quoted, with
# " ... " in between.
sub shorten {
    my $self = shift (@_);
    my ($text) = @_;

    my $length = length($text);

    if ($length >= 100) {
        my $head = substr ($text, 0, 50);
        my $tail = substr ($text, $length - 50);
        return "\"" . $head . "\" ... \"" . $tail . "\"";
    }

    return "\"$text\"";
}
1507
# Prepare a working copy of $file under the collection's "tidytmp"
# directory and return the name of the file to process.
#
# For .htm/.html/.shtml files the input is optionally converted from the
# old HDL format ('old_style_HDL') and/or tidied ('use_realistic_book'),
# and the other plain files in the input's directory are copied alongside
# it. Any other file is simply copied if no copy exists yet.
#
# Argument:
#   $file - input file name; a directory is returned unchanged
#
# Dies when the input directory cannot be opened or a copy fails.
sub convert_tidy_or_oldHDL_file
{
    my $self = shift (@_);
    my ($file) = @_;
    my $input_filename = $file;

    if (-d $input_filename)
    {
        return $input_filename;
    }

    # get the input filename
    my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
    my $base_dirname = $dirname;
    $suffix = lc($suffix);

    # derive tmp filename from input filename
    # Remove any white space from filename -- no risk of name collision, and
    # makes later conversion by utils simpler. Leave spaces in path...
    # tidy up the filename with space, dot, hyphen between
    $tailname =~ s/\s+//g;
    $tailname =~ s/\.+//g;
    $tailname =~ s/\-+//g;
    # convert to utf-8 otherwise we have problems with the doc.xml file
    # later on
    &unicode::ensure_utf8(\$tailname);

    # softlink to collection tmp dir
    my $tmp_dirname = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "tidytmp");
    &util::mk_dir($tmp_dirname) if (!-e $tmp_dirname);

    my $f_separator = &util::get_os_dirsep();

    # recreate, below tidytmp, the folders that follow "import" in the
    # input path
    if ($dirname =~ m/import$f_separator/)
    {
        # text after the match (what $' would hold)
        my $test_dirname = substr($dirname, $+[0]);

        while ($test_dirname =~ m/[$f_separator]/)
        {
            # text before / after the separator (what $` and $' would hold)
            my $folderdirname = substr($test_dirname, 0, $-[0]);
            my $remainder = substr($test_dirname, $+[0]);

            $tmp_dirname = &util::filename_cat($tmp_dirname,$folderdirname);
            &util::mk_dir($tmp_dirname) if (!-e $tmp_dirname);
            $test_dirname = $remainder;
        }
    }

    my $tmp_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix");

    # tidy or convert the input file if it is a HTML-like file or it is accepted by the process_exp
    if (($suffix eq ".htm") || ($suffix eq ".html") || ($suffix eq ".shtml"))
    {
        #convert the input file to a new style HDL
        my $hdl_output_filename = $input_filename;
        if ($self->{'old_style_HDL'})
        {
            $hdl_output_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix");
            $hdl_output_filename = $self->convert_to_newHDLformat($input_filename,$hdl_output_filename);
        }

        #just for checking copy all other file from the base dir to tmp dir if it is not exists
        opendir(my $dh, $base_dirname) or die "Can't open base directory : $base_dirname!";
        my @files = grep {!/^\.+$/} readdir($dh);
        # a directory handle has to be released with closedir, not close
        closedir($dh);

        foreach my $file (@files)
        {
            my $src_file = &util::filename_cat($base_dirname,$file);
            my $dest_file = &util::filename_cat($tmp_dirname,$file);
            if ((!-e $dest_file) && (!-d $src_file))
            {
                # just copy the original file back to the tmp directory
                copy($src_file,$dest_file) or die "Can't copy file $src_file to $dest_file $!";
            }
        }

        # tidy the input file
        my $tidy_output_filename = $hdl_output_filename;
        if ($self->{'use_realistic_book'})
        {
            $tidy_output_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix");
            $tidy_output_filename = $self->tmp_tidy_file($hdl_output_filename,$tidy_output_filename);
        }
        $tmp_filename = $tidy_output_filename;
    }
    else
    {
        if (!-e $tmp_filename)
        {
            # just copy the original file back to the tmp directory
            copy($input_filename,$tmp_filename) or die "Can't copy file $input_filename to $tmp_filename $!";
        }
    }

    return $tmp_filename;
}
1606
1607
# Will make the html input file as a proper XML file with removed font tag and
# image size added to the img tag.
# The tidying process takes place in a collection specific 'tmp' directory so
# that we don't accidentally damage the input.
#
# Arguments:
#   $file   - HTML file to read
#   $cnfile - file to write; it is written twice, first with the
#             rewritten tags and then with the output of "tidy"
#
# Returns $cnfile. Dies if the output file cannot be opened or closed.
sub tmp_tidy_file
{
    my $self = shift (@_);
    my ($file,$cnfile) = @_;
    my $input_filename = $file;
    my $tmp_filename = $cnfile;

    # get the input filename
    my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");

    require HTML::TokeParser::Simple;

    # create HTML parser to decode the input file
    my $parser = HTML::TokeParser::Simple->new($input_filename);

    # write HTML tmp file without the font tag and image size are added to the img tag
    open (my $prod, '>', $tmp_filename) || die("Error Writing to File: $tmp_filename $!");
    while (my $token = $parser->get_token())
    {
        # is it an img tag
        if ($token->is_start_tag('img'))
        {
            # get the attributes
            my $attr = $token->return_attr;

            # get the full path to the image
            my $img_file = &util::filename_cat($dirname,$attr->{src});

            # set the width and height attribute
            ($attr->{width}, $attr->{height}) = imgsize($img_file);

            # recreate the tag
            print {$prod} "<img";
            print {$prod} map { qq { $_="$attr->{$_}"} } keys %$attr;
            print {$prod} ">";
        }
        # is it a font tag
        else
        {
            if (($token->is_start_tag('font')) || ($token->is_end_tag('font')))
            {
                # remove font tag
                print {$prod} "";
            }
            else
            {
                # print without changes
                print {$prod} $token->as_is;
            }
        }
    }
    close ($prod) || die("Error Closing File: $tmp_filename $!");

    # run html-tidy on the tmp file to make it a proper XML file

    my $outhandle = $self->{'outhandle'};
    print $outhandle "Converting HTML to be XML compliant:\n";

    # NOTE(review): the file name is interpolated into a shell command
    # line; a name containing a double quote or other shell syntax would
    # be interpreted by the shell. Left as is because the stderr
    # redirection relies on the shell.
    my $tidy_cmd = "tidy";
    $tidy_cmd .= " -q" if ($self->{'verbosity'} <= 2);
    $tidy_cmd .= " -raw -wrap 0 -asxml \"$tmp_filename\"";
    if ($self->{'verbosity'} <= 2) {
        if ($ENV{'GSDLOS'} =~ m/^windows/i) {
            $tidy_cmd .= " 2>nul";
        }
        else {
            $tidy_cmd .= " 2>/dev/null";
        }
        print $outhandle " => $tidy_cmd\n";
    }

    my $tidyfile = `$tidy_cmd`;

    # write result back to the tmp file
    open ($prod, '>', $tmp_filename) || die("Error Writing to File: $tmp_filename $!");
    print {$prod} $tidyfile;
    close ($prod) || die("Error Closing File: $tmp_filename $!");

    # return the output filename
    return $tmp_filename;
}
1693
# Associate a cover image with the document via the parent class.
# When working on converted copies ('use_realistic_book' or
# 'old_style_HDL') the first "tidytmp" path component of $filename is
# replaced by "import" beforehand.
sub associate_cover_image
{
    my $self = shift(@_);
    my ($doc_obj, $filename) = @_;
    if (($self->{'use_realistic_book'}) || ($self->{'old_style_HDL'}))
    {
        # we will have cover image in tidytmp, but want it from import
        $filename =~ s/([\\\/])tidytmp([\\\/])/$1import$2/;
    }
    $self->SUPER::associate_cover_image($doc_obj, $filename);
}
1705
1706
# A Perl module file has to end with a true value for use/require to succeed.
1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: