Context Navigation

source: main/trunk/greenstone2/perllib/plugins/HTMLPlugin.pm@ 23335

Last change on this file since 23335 was 23335, checked in by davidb, 13 years ago
Work done on improving handing of filenames when the actualy filename encoding used is not necesarrily known. Tested for Linux. Work currently includes some debug statements that will be removed once testing for Windows and Mac is done.
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 56.7 KB

Line
1	###########################################################################
2	#
3	# HTMLPlugin.pm -- basic html plugin
4	#
5	# A component of the Greenstone digital library software
6	# from the New Zealand Digital Library Project at the
7	# University of Waikato, New Zealand.
8	#
9	# Copyright (C) 1999 New Zealand Digital Library Project
10	#
11	# This program is free software; you can redistribute it and/or modify
12	# it under the terms of the GNU General Public License as published by
13	# the Free Software Foundation; either version 2 of the License, or
14	# (at your option) any later version.
15	#
16	# This program is distributed in the hope that it will be useful,
17	# but WITHOUT ANY WARRANTY; without even the implied warranty of
18	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	# GNU General Public License for more details.
20	#
21	# You should have received a copy of the GNU General Public License
22	# along with this program; if not, write to the Free Software
23	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24	#
25	###########################################################################
26
27	#
28	# Note that this plugin handles frames only in a very simple way
29	# i.e. each frame is treated as a separate document. This means
30	# search results will contain links to individual frames rather
31	# than linking to the top level frameset.
32	# There may also be some problems caused by the _parent target
33	# (it's removed by this plugin)
34	#
35
36	package HTMLPlugin;
37
38	use Encode;
39
40	use ReadTextFile;
41	use HBPlugin;
42	use ghtml;
43	use unicode;
44	use util;
45	use XMLParser;
46
47	use Image::Size;
48	use File::Copy;
49
50	sub BEGIN {
51	@HTMLPlugin::ISA = ('ReadTextFile', 'HBPlugin');
52	}
53
54	use strict; # every perl program should have this!
55	no strict 'refs'; # make an exception so we can use variables as filehandles
56
57	my $arguments =
58	[ { 'name' => "process_exp",
59	'desc' => "{BasePlugin.process_exp}",
60	'type' => "regexp",
61	'deft' => &get_default_process_exp() },
62	{ 'name' => "block_exp",
63	'desc' => "{BasePlugin.block_exp}",
64	'type' => 'regexp',
65	'deft' => &get_default_block_exp() },
66	{ 'name' => "nolinks",
67	'desc' => "{HTMLPlugin.nolinks}",
68	'type' => "flag" },
69	{ 'name' => "keep_head",
70	'desc' => "{HTMLPlugin.keep_head}",
71	'type' => "flag" },
72	{ 'name' => "no_metadata",
73	'desc' => "{HTMLPlugin.no_metadata}",
74	'type' => "flag" },
75	{ 'name' => "metadata_fields",
76	'desc' => "{HTMLPlugin.metadata_fields}",
77	'type' => "string",
78	'deft' => "Title" },
79	{ 'name' => "metadata_field_separator",
80	'desc' => "{HTMLPlugin.metadata_field_separator}",
81	'type' => "string",
82	'deft' => "" },
83	{ 'name' => "hunt_creator_metadata",
84	'desc' => "{HTMLPlugin.hunt_creator_metadata}",
85	'type' => "flag" },
86	{ 'name' => "file_is_url",
87	'desc' => "{HTMLPlugin.file_is_url}",
88	'type' => "flag" },
89	{ 'name' => "assoc_files",
90	'desc' => "{HTMLPlugin.assoc_files}",
91	'type' => "regexp",
92	'deft' => &get_default_block_exp() },
93	{ 'name' => "rename_assoc_files",
94	'desc' => "{HTMLPlugin.rename_assoc_files}",
95	'type' => "flag" },
96	{ 'name' => "title_sub",
97	'desc' => "{HTMLPlugin.title_sub}",
98	'type' => "string",
99	'deft' => "" },
100	{ 'name' => "description_tags",
101	'desc' => "{HTMLPlugin.description_tags}",
102	'type' => "flag" },
103	# retain this for backward compatibility (w3mir option was replaced by
104	# file_is_url)
105	{ 'name' => "w3mir",
106	# 'desc' => "{HTMLPlugin.w3mir}",
107	'type' => "flag",
108	'hiddengli' => "yes"},
109	{ 'name' => "no_strip_metadata_html",
110	'desc' => "{HTMLPlugin.no_strip_metadata_html}",
111	'type' => "string",
112	'deft' => "",
113	'reqd' => "no"},
114	{ 'name' => "sectionalise_using_h_tags",
115	'desc' => "{HTMLPlugin.sectionalise_using_h_tags}",
116	'type' => "flag" },
117	{ 'name' => "use_realistic_book",
118	'desc' => "{HTMLPlugin.tidy_html}",
119	'type' => "flag"},
120	{ 'name' => "old_style_HDL",
121	'desc' => "{HTMLPlugin.old_style_HDL}",
122	'type' => "flag"},
123	{'name' => "processing_tmp_files",
124	'desc' => "{BasePlugin.processing_tmp_files}",
125	'type' => "flag",
126	'hiddengli' => "yes"}
127	];
128
129	my $options = { 'name' => "HTMLPlugin",
130	'desc' => "{HTMLPlugin.desc}",
131	'abstract' => "no",
132	'inherits' => "yes",
133	'args' => $arguments };
134
135
136	sub new {
137	my ($class) = shift (@_);
138	my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
139	push(@$pluginlist, $class);
140
141	push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
142	push(@{$hashArgOptLists->{"OptList"}},$options);
143
144
145	my $self = new ReadTextFile($pluginlist,$inputargs,$hashArgOptLists);
146
147	if ($self->{'w3mir'}) {
148	$self->{'file_is_url'} = 1;
149	}
150	$self->{'aux_files'} = {};
151	$self->{'dir_num'} = 0;
152	$self->{'file_num'} = 0;
153
154	return bless $self, $class;
155	}
156
157	# may want to use (?i)\.(gif\|jpe?g\|jpe\|png\|css\|js(?:@.*)?)$
158	# if have eg <script language="javascript" src="img/lib.js@123">
159	# blocking is now done by reading through the file and recording all the
160	# images and other files
161	sub get_default_block_exp {
162	my $self = shift (@_);
163
164	#return q^(?i)\.(gif\|jpe?g\|jpe\|jpg\|png\|css)$^;
165	return "";
166	}
167
168	sub get_default_process_exp {
169	my $self = shift (@_);
170
171	# the last option is an attempt to encode the concept of an html query ...
172	return q^(?i)(\.html?\|\.shtml\|\.shm\|\.asp\|\.php\d?\|\.cgi\|.+\?.+=.*)$^;
173	}
174
175	sub store_block_files
176	{
177	my $self =shift (@_);
178	my ($filename_full_path, $block_hash) = @_;
179
180	my $html_fname = $filename_full_path;
181	my @file_blocks;
182
183	my ($language, $content_encoding) = $self->textcat_get_language_encoding ($filename_full_path);
184	$self->{'store_content_encoding'}->{$filename_full_path} = $content_encoding;
185
186	# read in file ($text will be in utf8)
187	my $raw_text = "";
188	$self->read_file_no_decoding ($filename_full_path, \$raw_text);
189
190	my $textref = \$raw_text;
191	my $opencom = '(?:<!--\|<!(?:—\|\|--))';
192	my $closecom = '(?:-->\|(?:—\|\|--)>)';
193	$$textref =~ s/$opencom(.*?)$closecom//gs;
194
195	my $attval = "\\\"[^\\\"]+\\\"\|[^\\s>]+";
196	my @img_matches = ($$textref =~ m/<img[^>]?src\s=\s($attval)[^>]>/igs);
197	my @usemap_matches = ($$textref =~ m/<img[^>]?usemap\s=\s($attval)[^>]>/igs);
198	my @link_matches = ($$textref =~ m/<link[^>]?href\s=\s($attval)[^>]>/igs);
199	my @embed_matches = ($$textref =~ m/<embed[^>]?src\s=\s($attval)[^>]>/igs);
200	my @tabbg_matches = ($$textref =~ m/<(?:body\|table\|tr\|td)[^>]?background\s=\s($attval)[^>]>/igs);
201	my @script_matches = ($$textref =~ m/<script[^>]?src\s=\s($attval)[^>]>/igs);
202
203	if(!defined $self->{'utf8_to_original_filename'}) {
204	# maps from utf8 converted link name -> original filename referrred to by (possibly URL-encoded) src url
205	$self->{'utf8_to_original_filename'} = {};
206	}
207
208	foreach my $link (@img_matches, @usemap_matches, @link_matches, @embed_matches, @tabbg_matches, @script_matches) {
209
210	# remove quotes from link at start and end if necessary
211	if ($link=~/^\"/) {
212	$link=~s/^\"//;
213	$link=~s/\"$//;
214	}
215
216	$link =~ s/\#.*$//s; # remove any anchor names, e.g. foo.html#name becomes foo.html
217	# some links may just be anchor names
218	next unless ($link =~ /\S+/);
219
220	if ($link !~ m@^/@ && $link !~ m/^([A-Z]:?)\\/) {
221	# Turn relative file path into full path
222	my $dirname = &File::Basename::dirname($filename_full_path);
223	$link = &util::filename_cat($dirname, $link);
224	}
225	$link = $self->eval_dir_dots($link);
226
227	# this is the actual filename on the filesystem (that the link refers to)
228	my $url_original_filename = $self->opt_url_decode($link);
229
230	# Convert the url_original_filename into its utf8 version. Store the utf8 link along with the url_original_filename
231	my $utf8_link = "";
232	$self->decode_text($link,$content_encoding,$language,\$utf8_link);
233
234	$self->{'utf8_to_original_filename'}->{$utf8_link} = $url_original_filename;
235	# print STDERR "**** utf8_encoded_link to original src filename:\n\t$utf8_link\n\t".$self->{'utf8_to_original_filename'}->{$utf8_link}."\n";
236
237	if ($url_original_filename ne $utf8_link) {
238	my $outhandle = $self->{'outhandle'};
239
240	print $outhandle "URL Encoding $url_original_filename\n";
241	print $outhandle " ->$utf8_link\n";
242	}
243
244	$block_hash->{'file_blocks'}->{$url_original_filename} = 1;
245	}
246	}
247
248	# Given a filename in any encoding, will URL decode it to get back the original filename
249	# in the original encoding. Because this method is intended to work out the original
250	# filename, it does not URL decode any filename if a file by the name of the URL-encoded*
251	# string already exists in the local folder.
252	# Return the original filename corresponding to the parameter URL-encoded filename, and
253	# a decoded flag that is set to true iff URL-decoding had to be applied.
254	sub opt_url_decode {
255	my $self = shift (@_);
256	my ($link) = @_;
257
258	# Replace %XX's in URL with decoded value if required.
259	# Note that the filename may include the %XX in some situations
260	if ($link =~ m/\%[A-F0-9]{2}/i) {
261	if (!-e $link) {
262	$link = &unicode::url_decode($link);
263	}
264	}
265
266	return $link;
267	}
268
269	sub read_into_doc_obj
270	{
271	my $self = shift (@_);
272	my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
273
274	my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
275
276	# Lookup content_encoding worked out in file_block pass for this file
277	# Store it under the local name 'content_encoding' so its nice and
278	# easy to access
279	$self->{'content_encoding'} = $self->{'store_content_encoding'}->{$filename_full_path};
280
281	# get the input file
282	my $input_filename = $file;
283	my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
284	$suffix = lc($suffix);
285	my $tidy_filename;
286	if (($self->{'use_realistic_book'}) \|\| ($self->{'old_style_HDL'}))
287	{
288	# because the document has to be sectionalized set the description tags
289	$self->{'description_tags'} = 1;
290
291	# set the file to be tidied
292	$input_filename = &util::filename_cat($base_dir,$file) if $base_dir =~ m/\w/;
293
294	# get the tidied file
295	#my $tidy_filename = $self->tmp_tidy_file($input_filename);
296	$tidy_filename = $self->convert_tidy_or_oldHDL_file($input_filename);
297
298	# derive tmp filename from input filename
299	my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($tidy_filename, "\\.[^\\.]+\$");
300
301	# set the new input file and base_dir to be from the tidied file
302	$file = "$tailname$suffix";
303	$base_dir = $dirname;
304	}
305
306	# call the parent read_into_doc_obj
307	my ($process_status,$doc_obj) = $self->SUPER::read_into_doc_obj($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli);
308	if (($self->{'use_realistic_book'}) \|\| ($self->{'old_style_HDL'}))
309	{
310	# now we need to reset the filenames in the doc obj so that the converted filenames are not used
311	my $collect_file = &util::filename_within_collection($filename_full_path);
312	$doc_obj->set_source_filename ($collect_file, $self->{'file_rename_method'});
313	## set_source_filename does not set the doc_obj source_path which is used in archives dbs for incremental
314	# build. so set it manually.
315	$doc_obj->{'source_path'} = $filename_full_path;
316	my $collect_conv_file = &util::filename_within_collection($tidy_filename);
317	$doc_obj->set_converted_filename($collect_conv_file);
318	$self->set_Source_metadata($doc_obj, $filename_no_path);
319	}
320
321	delete $self->{'store_content_encoding'}->{$filename_full_path};
322	$self->{'content_encoding'} = undef;
323
324	return ($process_status,$doc_obj);
325	}
326
327	# do plugin specific processing of doc_obj
328	sub process {
329	my $self = shift (@_);
330	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
331	my $outhandle = $self->{'outhandle'};
332
333	if ($ENV{'GSDLOS'} =~ m/^windows/i) {
334	# this makes life so much easier... perl can cope with unix-style '/'s.
335	$base_dir =~ s@(\\)+@/@g;
336	$file =~ s@(\\)+@/@g;
337	}
338
339	# reset per-doc stuff...
340	$self->{'aux_files'} = {};
341	$self->{'dir_num'} = 0;
342	$self->{'file_num'} = 0;
343
344	# process an HTML file where sections are divided by headings tags (H1, H2 ...)
345	# you can also include metadata in the format (X can be any number)
346	# <hX>Title<!--gsdl-metadata
347	# <Metadata name="name1">value1</Metadata>
348	# ...
349	# <Metadata name="nameN">valueN</Metadata>
350	#--></hX>
351	if ($self->{'sectionalise_using_h_tags'}) {
352	# description_tags should allways be activated because we convert headings to description tags
353	$self->{'description_tags'} = 1;
354
355	my $arrSections = [];
356	$$textref =~ s/<h([0-9]+)[^>]>(.?)<\/h[0-9]+>/$self->process_heading($1, $2, $arrSections, $file)/isge;
357
358	if (scalar(@$arrSections)) {
359	my $strMetadata = $self->update_section_data($arrSections, -1);
360	if (length($strMetadata)) {
361	$strMetadata = '<!--' . $strMetadata . "\n-->\n</body>";
362	$$textref =~ s/<\/body>/$strMetadata/ig;
363	}
364	}
365	}
366
367	my $cursection = $doc_obj->get_top_section();
368
369	$self->extract_metadata ($textref, $metadata, $doc_obj, $cursection)
370	unless $self->{'no_metadata'} \|\| $self->{'description_tags'};
371
372	# Store URL for page as metadata - this can be used for an
373	# altavista style search interface. The URL won't be valid
374	# unless the file structure contains the domain name (i.e.
375	# like when w3mir is used to download a website).
376
377	# URL metadata (even invalid ones) are used to support internal
378	# links, so even if 'file_is_url' is off, still need to store info
379
380	my ($tailname,$dirname) = &File::Basename::fileparse($file);
381	print STDERR "***!! file = $file\n";
382	# my $utf8_file = $self->filename_to_utf8_metadata($file);
383	# $utf8_file =~ s/&\#095;/_/g;
384	my $utf8_file = &unicode::raw_filename_to_url_encoded($tailname);
385	print STDERR "***!! utf8_file = $utf8_file\n";
386
387	my $web_url = "http://";
388	if(defined $dirname) { # local directory
389	# Check for "ftp" in the domain name of the directory
390	# structure to determine if this URL should be a ftp:// URL
391	# This check is not infallible, but better than omitting the
392	# check, which would cause all files downloaded from ftp sites
393	# via mirroring with wget to have potentially erroneous http:// URLs
394	# assigned in their metadata
395	if ($dirname =~ /^[^\/]*ftp/i)
396	{
397	$web_url = "ftp://";
398	}
399	$dirname = $self->eval_dir_dots($dirname);
400	$dirname .= &util::get_dirsep() if $dirname ne ""; # if there's a directory, it should end on "/"
401	$web_url = $web_url.$dirname.$utf8_file;
402	} else {
403	$web_url = $web_url.$utf8_file;
404	}
405	$web_url =~ s/\\/\//g;
406	$doc_obj->add_utf8_metadata($cursection, "URL", $web_url);
407
408	if ($self->{'file_is_url'}) {
409	$doc_obj->add_metadata($cursection, "weblink", "<a href=\"$web_url\">");
410	$doc_obj->add_metadata($cursection, "webicon", "_iconworld_");
411	$doc_obj->add_metadata($cursection, "/weblink", "</a>");
412	}
413
414	if ($self->{'description_tags'}) {
415	# remove the html header - note that doing this here means any
416	# sections defined within the header will be lost (so all <Section>
417	# tags must appear within the body of the HTML)
418	my ($head_keep) = ($$textref =~ m/^(.?)<body[^>]>/is);
419
420	$$textref =~ s/^.?<body[^>]>//is;
421	$$textref =~ s/(<\/body[^>]>\|<\/html[^>]>)//isg;
422
423	my $opencom = '(?:<!--\|<!(?:—\|\|--))';
424	my $closecom = '(?:-->\|(?:—\|\|--)>)';
425
426	my $lt = '(?:<\|<)';
427	my $gt = '(?:>\|>)';
428	my $quot = '(?:"\|"\|”\|“)';
429
430	my $dont_strip = '';
431	if ($self->{'no_strip_metadata_html'}) {
432	($dont_strip = $self->{'no_strip_metadata_html'}) =~ s{,}{\|}g;
433	}
434
435	my $found_something = 0; my $top = 1;
436	while ($$textref =~ s/^(.?)$opencom(.?)$closecom//s) {
437	my $text = $1;
438	my $comment = $2;
439	if (defined $text) {
440	# text before a comment - note that getting to here
441	# doesn't necessarily mean there are Section tags in
442	# the document
443	$self->process_section(\$text, $base_dir, $file, $doc_obj, $cursection);
444	}
445	while ($comment =~ s/$lt(.*?)$gt//s) {
446	my $tag = $1;
447	if ($tag eq "Section") {
448	$found_something = 1;
449	$cursection = $doc_obj->insert_section($doc_obj->get_end_child($cursection)) unless $top;
450	$top = 0;
451	} elsif ($tag eq "/Section") {
452	$found_something = 1;
453	$cursection = $doc_obj->get_parent_section ($cursection);
454	} elsif ($tag =~ m/^Metadata name=$quot(.*?)$quot/s) {
455	my $metaname = $1;
456	my $accumulate = $tag =~ m/mode=${quot}accumulate${quot}/ ? 1 : 0;
457	$comment =~ s/^(.*?)$lt\/Metadata$gt//s;
458	my $metavalue = $1;
459	$metavalue =~ s/^\s+//;
460	$metavalue =~ s/\s+$//;
461	# assume that no metadata value intentionally includes
462	# carriage returns or HTML tags (if they're there they
463	# were probably introduced when converting to HTML from
464	# some other format).
465	# actually some people want to have html tags in their
466	# metadata.
467	$metavalue =~ s/[\cJ\cM]/ /sg;
468	$metavalue =~ s/<[^>]+>//sg
469	unless $dont_strip && ($dont_strip eq 'all' \|\| $metaname =~ m/^($dont_strip)$/);
470	$metavalue =~ s/\s+/ /sg;
471	if ($metaname =~ /\./) { # has a namespace
472	$metaname = "ex.$metaname";
473	}
474	if ($accumulate) {
475	$doc_obj->add_utf8_metadata($cursection, $metaname, $metavalue);
476	} else {
477	$doc_obj->set_utf8_metadata_element($cursection, $metaname, $metavalue);
478	}
479	} elsif ($tag eq "Description" \|\| $tag eq "/Description") {
480	# do nothing with containing Description tags
481	} else {
482	# simple HTML tag (probably created by the conversion
483	# to HTML from some other format) - we'll ignore it and
484	# hope for the best ;-)
485	}
486	}
487	}
488	if ($cursection ne "") {
489	print $outhandle "HTMLPlugin: WARNING: $file contains unmatched <Section></Section> tags\n";
490	}
491
492	$$textref =~ s/^.?<body[^>]>//is;
493	$$textref =~ s/(<\/body[^>]>\|<\/html[^>]>)//isg;
494	if ($$textref =~ m/\S/) {
495	if (!$found_something) {
496	if ($self->{'verbosity'} > 2) {
497	print $outhandle "HTMLPlugin: WARNING: $file appears to contain no Section tags so\n";
498	print $outhandle " will be processed as a single section document\n";
499	}
500
501	# go ahead and process single-section document
502	$self->process_section($textref, $base_dir, $file, $doc_obj, $cursection);
503
504	# if document contains no Section tags we'll go ahead
505	# and extract metadata (this won't have been done
506	# above as the -description_tags option prevents it)
507	my $complete_text = $head_keep.$doc_obj->get_text($cursection);
508	$self->extract_metadata (\$complete_text, $metadata, $doc_obj, $cursection)
509	unless $self->{'no_metadata'};
510
511	} else {
512	print $outhandle "HTMLPlugin: WARNING: $file contains the following text outside\n";
513	print $outhandle " of the final closing </Section> tag. This text will\n";
514	print $outhandle " be ignored.";
515
516	my ($text);
517	if (length($$textref) > 30) {
518	$text = substr($$textref, 0, 30) . "...";
519	} else {
520	$text = $$textref;
521	}
522	$text =~ s/\n/ /isg;
523	print $outhandle " ($text)\n";
524	}
525	} elsif (!$found_something) {
526
527	if ($self->{'verbosity'} > 2) {
528	# may get to here if document contained no valid Section
529	# tags but did contain some comments. The text will have
530	# been processed already but we should print the warning
531	# as above and extract metadata
532	print $outhandle "HTMLPlugin: WARNING: $file appears to contain no Section tags and\n";
533	print $outhandle " is blank or empty. Metadata will be assigned if present.\n";
534	}
535
536	my $complete_text = $head_keep.$doc_obj->get_text($cursection);
537	$self->extract_metadata (\$complete_text, $metadata, $doc_obj, $cursection)
538	unless $self->{'no_metadata'};
539	}
540
541	} else {
542
543	# remove header and footer
544	if (!$self->{'keep_head'} \|\| $self->{'description_tags'}) {
545	$$textref =~ s/^.?<body[^>]>//is;
546	$$textref =~ s/(<\/body[^>]>\|<\/html[^>]>)//isg;
547	}
548
549	# single section document
550	$self->process_section($textref, $base_dir, $file, $doc_obj, $cursection);
551	}
552
553	return 1;
554	}
555
556
557	sub process_heading
558	{
559	my ($self, $nHeadNo, $strHeadingText, $arrSections, $file) = @_;
560	$strHeadingText = '' if (!defined($strHeadingText));
561
562	my $strMetadata = $self->update_section_data($arrSections, int($nHeadNo));
563
564	my $strSecMetadata = '';
565	while ($strHeadingText =~ s/<!--gsdl-metadata(.*?)-->//is)
566	{
567	$strSecMetadata .= $1;
568	}
569
570	$strHeadingText =~ s/^\s+//g;
571	$strHeadingText =~ s/\s+$//g;
572	$strSecMetadata =~ s/^\s+//g;
573	$strSecMetadata =~ s/\s+$//g;
574
575	$strMetadata .= "\n<Section>\n\t<Description>\n\t\t<Metadata name=\"Title\">" . $strHeadingText . "</Metadata>\n";
576
577	if (length($strSecMetadata)) {
578	$strMetadata .= "\t\t" . $strSecMetadata . "\n";
579	}
580
581	$strMetadata .= "\t</Description>\n";
582
583	return "<!--" . $strMetadata . "-->";
584	}
585
586
587	sub update_section_data
588	{
589	my ($self, $arrSections, $nCurTocNo) = @_;
590	my ($strBuffer, $nLast, $nSections) = ('', 0, scalar(@$arrSections));
591
592	if ($nSections == 0) {
593	push @$arrSections, $nCurTocNo;
594	return $strBuffer;
595	}
596	$nLast = $arrSections->[$nSections - 1];
597	if ($nCurTocNo > $nLast) {
598	push @$arrSections, $nCurTocNo;
599	return $strBuffer;
600	}
601	for(my $i = $nSections - 1; $i >= 0; $i--) {
602	if ($nCurTocNo <= $arrSections->[$i]) {
603	$strBuffer .= "\n</Section>";
604	pop @$arrSections;
605	}
606	}
607	push @$arrSections, $nCurTocNo;
608	return $strBuffer;
609	}
610
611
612	# note that process_section may be called multiple times for a single
613	# section (relying on the fact that add_utf8_text appends the text to any
614	# that may exist already).
615	sub process_section {
616	my $self = shift (@_);
617	my ($textref, $base_dir, $file, $doc_obj, $cursection) = @_;
618	# trap links
619	if (!$self->{'nolinks'}) {
620	# usemap="./#index" not handled correctly => change to "#index"
621	## $$textref =~ s/(<img[^>]?usemap\s=\s[\"\']?)([^\"\'>\s]+)([\"\']?[^>]>)/
622
623	$$textref =~ s/(<img[^>]?usemap\s=\s)((?:[\"][^\"]+[\"])\|(?:[\'][^\']+[\'])\|(?:[^\s\/>]+))([^>]>)/
624	$self->replace_usemap_links($1, $2, $3)/isge;
625
626	## $$textref =~ s/(<(?:a\|area\|frame\|link\|script)\s+[^>]?\s(?:href\|src)\s=\s[\"\']?)([^\"\'>\s]+)([\"\']?[^>]*>)/
627
628	$$textref =~ s/(<(?:a\|area\|frame\|link\|script)\s+[^>]?\s(?:href\|src)\s=\s)((?:[\"][^\"]+[\"])\|(?:[\'][^\']+[\'])\|(?:[^\s\/>]+))([^>]*>)/
629	$self->replace_href_links ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
630	}
631
632	# trap images
633
634	# Previously, by default, HTMLPlugin would embed <img> tags inside anchor tags
635	# i.e. <a href="image><img src="image"></a> in order to overcome a problem that
636	# turned regular text succeeding images into links. That is, by embedding <imgs>
637	# inside <a href=""></a>, the text following images were no longer misbehaving.
638	# However, there would be many occasions whereby images were not meant to link
639	# to their source images but where the images would link to another web page.
640	# To allow this, the no_image_links option was introduced: it would prevent
641	# the behaviour of embedding images into links that referenced the source images.
642
643	# Somewhere along the line, the problem of normal text turning into links when
644	# such text followed images which were not embedded in <a href=""></a> ceased
645	# to occur. This is why the following lines have been commented out (as well as
646	# two lines in replace_images). They appear to no longer apply.
647
648	# If at any time, there is a need for having images embedded in <a> anchor tags,
649	# then it might be better to turn that into an HTMLPlugin option rather than make
650	# it the default behaviour. Also, eventually, no_image_links needs to become
651	# a deprecated option for HTMLPlugin as it has now become the default behaviour.
652
653	#if(!$self->{'no_image_links'}){
654	$$textref =~ s/(<(?:img\|embed\|table\|tr\|td)[^>]?(?:src\|background)\s=\s)((?:[\"][^\"]+[\"])\|(?:[\'][^\']+[\'])\|(?:[^\s\/>]+))([^>]>)/
655	$self->replace_images ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
656	#}
657
658	# add text to document object
659	# turn \ into \\ so that the rest of greenstone doesn't think there
660	# is an escape code following. (Macro parsing loses them...)
661	$$textref =~ s/\\/\\\\/go;
662
663	$doc_obj->add_utf8_text($cursection, $$textref);
664	}
665
666	sub replace_images {
667	my $self = shift (@_);
668	my ($front, $link, $back, $base_dir,
669	$file, $doc_obj, $section) = @_;
670
671	# remove quotes from link at start and end if necessary
672	if ($link=~/^[\"\']/) {
673	$link=~s/^[\"\']//;
674	$link=~s/[\"\']$//;
675	$front.='"';
676	$back="\"$back";
677	}
678
679	$link =~ s/\n/ /g;
680
681	# Hack to overcome Windows wv 0.7.1 bug that causes embedded images to be broken
682	# If the Word file path has spaces in it, wv messes up and you end up with
683	# absolute paths for the images, and without the "file://" prefix
684	# So check for this special case and massage the data to be correct
685	if ($ENV{'GSDLOS'} =~ m/^windows/i && $self->{'plugin_type'} eq "WordPlug" && $link =~ m/^[A-Za-z]\:\\/) {
686	$link =~ s/^.*\\([^\\]+)$/$1/;
687	}
688
689	my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
690
691	my $img_file = $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section);
692
693	# print STDERR "** link = $link\n href = $href\n** img_file = $img_file, rl = $rl\n";
694
695	my $anchor_name = $img_file;
696	#$anchor_name =~ s/^.*\///;
697	#$anchor_name = "<a name=\"$anchor_name\" ></a>";
698
699	my $image_link = $front . $img_file .$back;
700	return $image_link;
701
702	# The reasons for why the following two lines are no longer necessary can be
703	# found in subroutine process_section
704	#my $anchor_link = "<a href=\"$img_file\" >".$image_link."</a>";
705	#return $anchor_link;
706
707	#return $front . $img_file . $back . $anchor_name;
708	}
709
710	sub replace_href_links {
711	my $self = shift (@_);
712	my ($front, $link, $back, $base_dir, $file, $doc_obj, $section) = @_;
713
714	# remove quotes from link at start and end if necessary
715	if ($link=~/^[\"\']/) {
716	$link=~s/^[\"\']//;
717	$link=~s/[\"\']$//;
718	$front.='"';
719	$back="\"$back";
720	}
721
722	# attempt to sort out targets - frames are not handled
723	# well in this plugin and some cases will screw things
724	# up - e.g. the _parent target (so we'll just remove
725	# them all ;-)
726	$front =~ s/(target=\"?)_top(\"?)/$1_gsdltop_$2/is;
727	$back =~ s/(target=\"?)_top(\"?)/$1_gsdltop_$2/is;
728	$front =~ s/target=\"?_parent\"?//is;
729	$back =~ s/target=\"?_parent\"?//is;
730
731	return $front . $link . $back if $link =~ m/^\#/s;
732	$link =~ s/\n/ /g;
733
734	# Find file referred to by $link on file system
735	# This is more complicated than it sounds when char encodings
736	# is taken in to account
737	my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
738
739	# href may use '\'s where '/'s should be on Windows
740	$href =~ s/\\/\//g;
741	my ($filename) = $href =~ m/^(?:.?):(?:\/\/)?(.)/;
742
743
744	##### leave all these links alone (they won't be picked up by intermediate
745	##### pages). I think that's safest when dealing with frames, targets etc.
746	##### (at least until I think of a better way to do it). Problems occur with
747	##### mailto links from within small frames, the intermediate page is displayed
748	##### within that frame and can't be seen. There is still potential for this to
749	##### happen even with html pages - the solution seems to be to somehow tell
750	##### the browser from the server side to display the page being sent (i.e.
751	##### the intermediate page) in the top level window - I'm not sure if that's
752	##### possible - the following line should probably be deleted if that can be done
753	return $front . $link . $back if $href =~ m/^(mailto\|news\|gopher\|nntp\|telnet\|javascript):/is;
754
755	if (($rl == 0) \|\| ($filename =~ m/$self->{'process_exp'}/) \|\|
756	($href =~ m/\/$/) \|\| ($href =~ m/^(mailto\|news\|gopher\|nntp\|telnet\|javascript):/i)) {
757
758
759	# If web page didn't give encoding, then default to utf8
760	print "*************** looking up $file\n";
761
762	my $content_encoding= $self->{'content_encoding'} \|\| "utf8";
763	$href = encode($content_encoding,$href);
764
765	$href = &unicode::raw_filename_to_url_encoded($href);
766	$href = &unicode::filename_to_url($href);
767
768	&ghtml::urlsafe ($href);
769	print STDERR "***!!! href=$href\n";
770
771	return $front . "_httpextlink_&rl=" . $rl . "&href=" . $href . $hash_part . $back;
772	} else {
773	# link is to some other type of file (e.g., an image) so we'll
774	# need to associate that file
775	return $front . $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section) . $back;
776	}
777	}
778
779	sub add_file {
780	my $self = shift (@_);
781	my ($href, $rl, $hash_part, $base_dir, $doc_obj, $section) = @_;
782	my ($newname);
783
784	my $filename = $href;
785	if ($base_dir eq "") {
786	# remove http:/ thereby leaving one slash at the start
787	$filename =~ s/^[^:]*:\///;
788	}
789	else {
790	# remove http://
791	$filename =~ s/^[^:]*:\/\///;
792	}
793
794	$filename = &util::filename_cat($base_dir, $filename);
795	if (($self->{'use_realistic_book'}) \|\| ($self->{'old_style_HDL'})) {
796	# we are processing a tidytmp file - want paths to be in import
797	$filename =~ s/([\\\/])tidytmp([\\\/])/$1import$2/;
798	}
799
800	# Replace %XX's in URL with decoded value if required. Note that the
801	# filename may include the %XX in some situations. If the original
802	# file's name was in URL encoding, the following method will not decode
803	# it.
804	my $utf8_filename = $filename;
805	my $opt_decode_utf8_filename = $self->opt_url_decode($utf8_filename);
806
807	my $content_encoding= $self->{'content_encoding'} \|\| "utf8";
808
809	# The filenames that come through the HTML file have been decoded
810	# into Unicode aware Perl strings. Need to convert them back
811	# to their initial raw-byte encoding to match the file that
812	# exists on the file system
813	$filename = encode($content_encoding, $opt_decode_utf8_filename);
814
815
816	# some special processing if the intended filename was converted to utf8, but
817	# the actual file still needs to be renamed
818	if (!-e $filename) {
819	# try the original filename stored in map
820	print STDERR "***###!! orig filename did not exist: $filename\n";
821
822	my $original_filename = $self->{'utf8_to_original_filename'}->{$utf8_filename};
823
824	print STDERR "**** Trying for $original_filename\n";
825
826	if (defined $original_filename && -e $original_filename) {
827	print STDERR "*** found match\n";
828	$filename = $original_filename;
829	}
830	}
831
832	my ($ext) = $filename =~ m/(\.[^\.]*)$/;
833
834	if ($rl == 0) {
835	if ((!defined $ext) \|\| ($ext !~ m/$self->{'assoc_files'}/)) {
836	return "_httpextlink_&rl=0&el=prompt&href=" . $href . $hash_part;
837	}
838	else {
839	return "_httpextlink_&rl=0&el=direct&href=" . $href . $hash_part;
840	}
841	}
842
843	if ((!defined $ext) \|\| ($ext !~ m/$self->{'assoc_files'}/)) {
844	return "_httpextlink_&rl=" . $rl . "&href=" . $href . $hash_part;
845	}
846	# add the original image file as a source file
847	if (!$self->{'processing_tmp_files'} ) {
848	$doc_obj->associate_source_file($filename);
849	}
850	if ($self->{'rename_assoc_files'}) {
851	if (defined $self->{'aux_files'}->{$href}) {
852	$newname = $self->{'aux_files'}->{$href}->{'dir_num'} . "/" .
853	$self->{'aux_files'}->{$href}->{'file_num'} . $ext;
854	} else {
855	$newname = $self->{'dir_num'} . "/" . $self->{'file_num'} . $ext;
856	$self->{'aux_files'}->{$href} = {'dir_num' => $self->{'dir_num'}, 'file_num' => $self->{'file_num'}};
857	$self->inc_filecount ();
858	}
859	$doc_obj->associate_file($filename, $newname, undef, $section);
860	return "_httpdocimg_/$newname";
861	} else {
862	if(&unicode::is_url_encoded($utf8_filename)) {
863	# use the possibly-decoded filename instead to avoid double URL encoding
864	($newname) = $filename =~ m/([^\/\\]*)$/;
865	} else {
866	($newname) = $utf8_filename =~ m/([^\/\\]*)$/;
867	}
868
869	# Make sure this name uses only ASCII characters.
870	# We use either base64 or URL encoding, as these preserve original encoding
871	$newname = &util::rename_file($newname, $self->{'file_rename_method'});
872
873	$doc_obj->associate_file($filename, $newname, undef, $section);
874
875	# Since the generated image will be URL-encoded to avoid file-system/browser mess-ups
876	# of filenames, URL-encode the additional percent signs of the URL-encoded filename
877	my $newname_url = $newname;
878	$newname_url = &unicode::filename_to_url($newname_url);
879	return "_httpdocimg_/$newname_url";
880	}
881	}
882
883
884	sub format_link {
885	my $self = shift (@_);
886	my ($link, $base_dir, $file) = @_;
887
888	my ($before_hash, $hash_part) = $link =~ m/^([^\#])(\#?.)$/;
889
890	$hash_part = "" if !defined $hash_part;
891	if (!defined $before_hash \|\| $before_hash !~ m/[\w\.\/]/) {
892	my $outhandle = $self->{'outhandle'};
893	print $outhandle "HTMLPlugin: ERROR - badly formatted tag ignored ($link)\n"
894	if $self->{'verbosity'};
895	return ($link, "", 0);
896	}
897
898	if ($before_hash =~ s@^((?:http\|https\|ftp\|file\|mms)://)@@i) {
899	my $type = $1;
900
901	if ($link =~ m/^(http\|ftp):/i) {
902	# Turn url (using /) into file name (possibly using \ on windows)
903	my @http_dir_split = split('/', $before_hash);
904	$before_hash = &util::filename_cat(@http_dir_split);
905	}
906
907	$before_hash = $self->eval_dir_dots($before_hash);
908
909	my $linkfilename = &util::filename_cat ($base_dir, $before_hash);
910
911	my $rl = 0;
912	$rl = 1 if (-e $linkfilename);
913
914	# make sure there's a slash on the end if it's a directory
915	if ($before_hash !~ m/\/$/) {
916	$before_hash .= "/" if (-d $linkfilename);
917	}
918	return ($type . $before_hash, $hash_part, $rl);
919
920	} elsif ($link !~ m/^(mailto\|news\|gopher\|nntp\|telnet\|javascript):/i && $link !~ m/^\//) {
921
922	if ($before_hash =~ s@^/@@ \|\| $before_hash =~ m/\\/) {
923
924	# the first directory will be the domain name if file_is_url
925	# to generate archives, otherwise we'll assume all files are
926	# from the same site and base_dir is the root
927
928	if ($self->{'file_is_url'}) {
929	my @dirs = split /[\/\\]/, $file;
930	my $domname = shift (@dirs);
931	$before_hash = &util::filename_cat($domname, $before_hash);
932	$before_hash =~ s@\\@/@g; # for windows
933	}
934	else
935	{
936	# see if link shares directory with source document
937	# => turn into relative link if this is so!
938
939	if ($ENV{'GSDLOS'} =~ m/^windows/i) {
940	# too difficult doing a pattern match with embedded '\'s...
941	my $win_before_hash=$before_hash;
942	$win_before_hash =~ s@(\\)+@/@g;
943	# $base_dir is already similarly "converted" on windows.
944	if ($win_before_hash =~ s@^$base_dir/@@o) {
945	# if this is true, we removed a prefix
946	$before_hash=$win_before_hash;
947	}
948	}
949	else {
950	# before_hash has lost leading slash by this point,
951	# -> add back in prior to substitution with $base_dir
952	$before_hash = "/$before_hash";
953
954	$before_hash = &util::filename_cat("",$before_hash);
955	$before_hash =~ s@^$base_dir/@@;
956	}
957	}
958	} else {
959	# Turn relative file path into full path
960	my $dirname = &File::Basename::dirname($file);
961	$before_hash = &util::filename_cat($dirname, $before_hash);
962	$before_hash = $self->eval_dir_dots($before_hash);
963	}
964
965	my $linkfilename = &util::filename_cat ($base_dir, $before_hash);
966	# make sure there's a slash on the end if it's a directory
967	if ($before_hash !~ m/\/$/) {
968	$before_hash .= "/" if (-d $linkfilename);
969	}
970	return ("http://" . $before_hash, $hash_part, 1);
971	} else {
972	# mailto, news, nntp, telnet, javascript or gopher link
973	return ($before_hash, "", 0);
974	}
975	}
976
977	sub extract_first_NNNN_characters {
978	my $self = shift (@_);
979	my ($textref, $doc_obj, $thissection) = @_;
980
981	foreach my $size (split /,/, $self->{'first'}) {
982	my $tmptext = $$textref;
983	# skip to the body
984	$tmptext =~ s/.<body[^>]>//i;
985	# remove javascript
986	$tmptext =~ s@<script.*?</script>@ @sig;
987	$tmptext =~ s/<[^>]*>/ /g;
988	$tmptext =~ s/ / /g;
989	$tmptext =~ s/^\s+//;
990	$tmptext =~ s/\s+$//;
991	$tmptext =~ s/\s+/ /gs;
992	$tmptext = &unicode::substr ($tmptext, 0, $size);
993	$tmptext =~ s/\s\S*$/…/; # adds an ellipse (...)
994	$doc_obj->add_utf8_metadata ($thissection, "First$size", $tmptext);
995	}
996	}
997
998
999	sub extract_metadata {
1000	my $self = shift (@_);
1001	my ($textref, $metadata, $doc_obj, $section) = @_;
1002	my $outhandle = $self->{'outhandle'};
1003	# if we don't want metadata, we may as well not be here ...
1004	return if (!defined $self->{'metadata_fields'});
1005
1006	my $separator = $self->{'metadata_field_separator'};
1007	if ($separator eq "") {
1008	undef $separator;
1009	}
1010
1011	# metadata fields to extract/save. 'key' is the (lowercase) name of the
1012	# html meta, 'value' is the metadata name for greenstone to use
1013	my %find_fields = ();
1014
1015	my %creator_fields = (); # short-cut for lookups
1016
1017
1018	foreach my $field (split /,/, $self->{'metadata_fields'}) {
1019	$field =~ s/^\s+//; # remove leading whitespace
1020	$field =~ s/\s+$//; # remove trailing whitespace
1021
1022	# support tag<tagname>
1023	if ($field =~ m/^(.?)\s<(.*?)>$/) {
1024	# "$2" is the user's preferred gs metadata name
1025	$find_fields{lc($1)}=$2; # lc = lowercase
1026	} else { # no <tagname> for mapping
1027	# "$field" is the user's preferred gs metadata name
1028	$find_fields{lc($field)}=$field; # lc = lowercase
1029	}
1030	}
1031
1032	if (defined $self->{'hunt_creator_metadata'} &&
1033	$self->{'hunt_creator_metadata'} == 1 ) {
1034	my @extra_fields =
1035	(
1036	'author',
1037	'author.email',
1038	'creator',
1039	'dc.creator',
1040	'dc.creator.corporatename',
1041	);
1042
1043	# add the creator_metadata fields to search for
1044	foreach my $field (@extra_fields) {
1045	$creator_fields{$field}=0; # add to lookup hash
1046	}
1047	}
1048
1049
1050	# find the header in the html file, which has the meta tags
1051	$$textref =~ m@<head>(.*?)</head>@si;
1052
1053	my $html_header=$1;
1054
1055	# go through every <meta... tag defined in the html and see if it is
1056	# one of the tags we want to match.
1057
1058	# special case for title - we want to remember if its been found
1059	my $found_title = 0;
1060	# this assumes that ">" won't appear. (I don't think it's allowed to...)
1061	$html_header =~ m/^/; # match the start of the string, for \G assertion
1062
1063	while ($html_header =~ m/\G.?<meta(.?)>/sig) {
1064	my $metatag=$1;
1065	my ($tag, $value);
1066
1067	# find the tag name
1068	$metatag =~ m/(?:name\|http-equiv)\s=\s([\"\'])?(.*?)\1/is;
1069	$tag=$2;
1070	# in case they're not using " or ', but they should...
1071	if (! $tag) {
1072	$metatag =~ m/(?:name\|http-equiv)\s=\s([^\s\>]+)/is;
1073	$tag=$1;
1074	}
1075
1076	if (!defined $tag) {
1077	print $outhandle "HTMLPlugin: can't find NAME in \"$metatag\"\n";
1078	next;
1079	}
1080
1081	# don't need to assign this field if it was passed in from a previous
1082	# (recursive) plugin
1083	if (defined $metadata->{$tag}) {next}
1084
1085	# find the tag content
1086	$metatag =~ m/content\s=\s([\"\'])?(.*?)\1/is;
1087	$value=$2;
1088
1089	if (! $value) {
1090	$metatag =~ m/(?:name\|http-equiv)\s=\s([^\s\>]+)/is;
1091	$value=$1;
1092	}
1093	if (!defined $value) {
1094	print $outhandle "HTMLPlugin: can't find VALUE in \"$metatag\"\n";
1095	next;
1096	}
1097
1098	# clean up and add
1099	$value =~ s/\s+/ /gs;
1100	chomp($value); # remove trailing \n, if any
1101	if (exists $creator_fields{lc($tag)}) {
1102	# map this value onto greenstone's "Creator" metadata
1103	$tag='Creator';
1104	} elsif (!exists $find_fields{lc($tag)}) {
1105	next; # don't want this tag
1106	} else {
1107	# get the user's preferred capitalisation
1108	$tag = $find_fields{lc($tag)};
1109	}
1110	if (lc($tag) eq "title") {
1111	$found_title = 1;
1112	}
1113
1114	if ($self->{'verbosity'} > 2) {
1115	print $outhandle " extracted \"$tag\" metadata \"$value\"\n";
1116	}
1117
1118	if ($tag =~ /\./) {
1119	# there is a . so has a namespace, add ex.
1120	$tag = "ex.$tag";
1121	}
1122	if (defined $separator) {
1123	my @values = split($separator, $value);
1124	foreach my $v (@values) {
1125	$doc_obj->add_utf8_metadata($section, $tag, $v) if $v =~ /\S/;
1126	}
1127	}
1128	else {
1129	$doc_obj->add_utf8_metadata($section, $tag, $value);
1130	}
1131	}
1132
1133	# TITLE: extract the document title
1134	if (exists $find_fields{'title'} && !$found_title) {
1135	# we want a title, and didn't find one in the meta tags
1136	# see if there's a <title> tag
1137	my $title;
1138	my $from = ""; # for debugging output only
1139	if ($html_header =~ m/<title[^>]>([^<]+)<\/title[^>]>/is) {
1140	$title = $1;
1141	$from = "<title> tags";
1142	}
1143
1144	if (!defined $title) {
1145	$from = "first 100 chars";
1146	# if no title use first 100 or so characters
1147	$title = $$textref;
1148	$title =~ s/^\xFE\xFF//; # Remove unicode byte order mark
1149	$title =~ s/^.*?<body>//si;
1150	# ignore javascript!
1151	$title =~ s@<script.*?</script>@ @sig;
1152	$title =~ s/<\/([^>]+)><\1>//g; # (eg) </b><b> - no space
1153	$title =~ s/<[^>]*>/ /g; # remove all HTML tags
1154	$title = substr ($title, 0, 100);
1155	$title =~ s/\s\S*$/.../;
1156	}
1157	$title =~ s/<[^>]*>/ /g; # remove html tags
1158	$title =~ s/ / /g;
1159	$title =~ s/(?: \|\xc2\xa0)/ /g; # utf-8 for nbsp...
1160	$title =~ s/\s+/ /gs; # collapse multiple spaces
1161	$title =~ s/^\s*//; # remove leading spaces
1162	$title =~ s/\s*$//; # remove trailing spaces
1163
1164	$title =~ s/^$self->{'title_sub'}// if ($self->{'title_sub'});
1165	$title =~ s/^\s+//s; # in case title_sub introduced any...
1166	print STDERR "**** adding Title: ", Encode::encode("utf8",$title), "\n";
1167	$doc_obj->add_utf8_metadata ($section, "Title", $title);
1168	print $outhandle " extracted Title metadata \"$title\" from $from\n"
1169	if ($self->{'verbosity'} > 2);
1170	}
1171
1172	# add FileFormat metadata
1173	$doc_obj->add_metadata($section,"FileFormat", "HTML");
1174
1175	# Special, for metadata names such as tagH1 - extracts
1176	# the text between the first <H1> and </H1> tags into "H1" metadata.
1177
1178	foreach my $field (keys %find_fields) {
1179	if ($field !~ m/^tag([a-z0-9]+)$/i) {next}
1180	my $tag = $1;
1181	if ($$textref =~ m@<$tag[^>]>(.?)</$tag[^>]*>@g) {
1182	my $content = $1;
1183	$content =~ s/ / /g;
1184	$content =~ s/<[^>]*>/ /g;
1185	$content =~ s/^\s+//;
1186	$content =~ s/\s+$//;
1187	$content =~ s/\s+/ /gs;
1188	if ($content) {
1189	$tag=$find_fields{"tag$tag"}; # get the user's capitalisation
1190	$tag =~ s/^tag//i;
1191	$doc_obj->add_utf8_metadata ($section, $tag, $content);
1192	print $outhandle " extracted \"$tag\" metadata \"$content\"\n"
1193	if ($self->{'verbosity'} > 2);
1194	}
1195	}
1196	}
1197	}
1198
1199
1200	# evaluate any "../" to next directory up
1201	# evaluate any "./" as here
1202	sub eval_dir_dots {
1203	my $self = shift (@_);
1204	my ($filename) = @_;
1205	my $dirsep_os = &util::get_os_dirsep();
1206	my @dirsep = split(/$dirsep_os/,$filename);
1207
1208	my @eval_dirs = ();
1209	foreach my $d (@dirsep) {
1210	if ($d eq "..") {
1211	pop(@eval_dirs);
1212
1213	} elsif ($d eq ".") {
1214	# do nothing!
1215
1216	} else {
1217	push(@eval_dirs,$d);
1218	}
1219	}
1220
1221	# Need to fiddle with number of elements in @eval_dirs if the
1222	# first one is the empty string. This is because of a
1223	# modification to util::filename_cat that supresses the addition
1224	# of a leading '/' character (or \ if windows) (intended to help
1225	# filename cat with relative paths) if the first entry in the
1226	# array is the empty string. Making the array start with two
1227	# empty strings is a way to defeat this "smart" option.
1228	#
1229	if (scalar(@eval_dirs) > 0) {
1230	if ($eval_dirs[0] eq ""){
1231	unshift(@eval_dirs,"");
1232	}
1233	}
1234
1235	my $evaluated_filename = (scalar @eval_dirs > 0) ? &util::filename_cat(@eval_dirs) : "";
1236	return $evaluated_filename;
1237	}
1238
1239	sub replace_usemap_links {
1240	my $self = shift (@_);
1241	my ($front, $link, $back) = @_;
1242
1243	# remove quotes from link at start and end if necessary
1244	if ($link=~/^[\"\']/) {
1245	$link=~s/^[\"\']//;
1246	$link=~s/[\"\']$//;
1247	$front.='"';
1248	$back="\"$back";
1249	}
1250
1251	$link =~ s/^\.\///;
1252	return $front . $link . $back;
1253	}
1254
1255	sub inc_filecount {
1256	my $self = shift (@_);
1257
1258	if ($self->{'file_num'} == 1000) {
1259	$self->{'dir_num'} ++;
1260	$self->{'file_num'} = 0;
1261	} else {
1262	$self->{'file_num'} ++;
1263	}
1264	}
1265
1266
1267	# Extend read_file so that strings like é are
1268	# converted to UTF8 internally.
1269	#
1270	# We don't convert < or > or & or " in case
1271	# they interfere with the GML files
1272
1273	sub read_file {
1274	my $self = shift(@_);
1275	my ($filename, $encoding, $language, $textref) = @_;
1276
1277	$self->SUPER::read_file($filename, $encoding, $language, $textref);
1278
1279	# Convert entities to their UTF8 equivalents
1280	$$textref =~ s/&(lt\|gt\|amp\|quot\|nbsp);/&z$1;/go;
1281	$$textref =~ s/&([^;]+);/&ghtml::getcharequiv($1,1,1)/gseo;
1282	$$textref =~ s/&z(lt\|gt\|amp\|quot\|nbsp);/&$1;/go;
1283
1284	}
1285
1286	sub HB_read_html_file {
1287	my $self = shift (@_);
1288	my ($htmlfile, $text) = @_;
1289
1290	# load in the file
1291	if (!open (FILE, $htmlfile)) {
1292	print STDERR "ERROR - could not open $htmlfile\n";
1293	return;
1294	}
1295
1296	my $foundbody = 0;
1297	$self->HB_gettext (\$foundbody, $text, "FILE");
1298	close FILE;
1299
1300	# just in case there was no <body> tag
1301	if (!$foundbody) {
1302	$foundbody = 1;
1303	open (FILE, $htmlfile) \|\| return;
1304	$self->HB_gettext (\$foundbody, $text, "FILE");
1305	close FILE;
1306	}
1307	# text is in utf8
1308	}
1309
1310	# converts the text to utf8, as ghtml does that for é etc.
1311	sub HB_gettext {
1312	my $self = shift (@_);
1313	my ($foundbody, $text, $handle) = @_;
1314
1315	my $line = "";
1316	while (defined ($line = <$handle>)) {
1317	# look for body tag
1318	if (!$$foundbody) {
1319	if ($line =~ s/^.<body[^>]>//i) {
1320	$$foundbody = 1;
1321	} else {
1322	next;
1323	}
1324	}
1325
1326	# check for symbol fonts
1327	if ($line =~ m/<font [^>]?face\s=\s*\"?(\w+)\"?/i) {
1328	my $font = $1;
1329	print STDERR "HBPlug::HB_gettext - warning removed font $font\n"
1330	if ($font !~ m/^arial$/i);
1331	}
1332
1333	$$text .= $line;
1334	}
1335
1336	if ($self->{'input_encoding'} eq "iso_8859_1") {
1337	# convert to utf-8
1338	$$text=&unicode::unicode2utf8(&unicode::convert2unicode("iso_8859_1", $text));
1339	}
1340	# convert any alphanumeric character entities to their utf-8
1341	# equivalent for indexing purposes
1342	#&ghtml::convertcharentities ($$text);
1343
1344	$$text =~ s/\s+/ /g; # remove \n's
1345
1346	# At this point $$text is a binary byte string
1347	# => turn it into a Unicode aware string, so full
1348	# Unicode aware pattern matching can be used.
1349	# For instance: 's/\x{0101}//g' or '[[:upper:]]'
1350	#
1351
1352	$$text = decode("utf8",$$text);
1353	}
1354
1355	sub HB_clean_section {
1356	my $self = shift (@_);
1357	my ($section) = @_;
1358
1359	# remove tags without a starting tag from the section
1360	my ($tag, $tagstart);
1361	while ($section =~ m/<\/([^>]{1,10})>/) {
1362	$tag = $1;
1363	$tagstart = index($section, "<$tag");
1364	last if (($tagstart >= 0) && ($tagstart < index($section, "<\/$tag")));
1365	$section =~ s/<\/$tag>//;
1366	}
1367
1368	# remove extra paragraph tags
1369	while ($section =~ s/<p\b[^>]>\s<p\b/<p/ig) {}
1370
1371	# remove extra stuff at the end of the section
1372	while ($section =~ s/(<u>\|<i>\|<b>\|<p\b[^>]*>\| \|\s)$//i) {}
1373
1374	# add a newline at the beginning of each paragraph
1375	$section =~ s/(.)\s*<p\b/$1\n\n<p/gi;
1376
1377	# add a newline every 80 characters at a word boundary
1378	# Note: this regular expression puts a line feed before
1379	# the last word in each section, even when it is not
1380	# needed.
1381	$section =~ s/(.{1,80})\s/$1\n/g;
1382
1383	# fix up the image links
1384	$section =~ s/<img[^>]?src=\"?([^\">]+)\"?[^>]>/
1385	<center><img src=\"$1\" \/><\/center><br\/>/ig;
1386	$section =~ s/<<I>>\s*([^\.]+\.(png\|jpg\|gif))/
1387	<center><img src=\"$1\" \/><\/center><br\/>/ig;
1388
1389	return $section;
1390	}
1391
1392	# Will convert the oldHDL format to the new HDL format (using the Section tag)
1393	sub convert_to_newHDLformat
1394	{
1395	my $self = shift (@_);
1396	my ($file,$cnfile) = @_;
1397	my $input_filename = $file;
1398	my $tmp_filename = $cnfile;
1399
1400	# write HTML tmp file with new HDL format
1401	open (PROD, ">$tmp_filename") \|\| die("Error Writing to File: $tmp_filename $!");
1402
1403	# read in the file and do basic html cleaning (removing header etc)
1404	my $html = "";
1405	$self->HB_read_html_file ($input_filename, \$html);
1406
1407	# process the file one section at a time
1408	my $curtoclevel = 1;
1409	my $firstsection = 1;
1410	my $toclevel = 0;
1411	while (length ($html) > 0) {
1412	if ($html =~ s/^.?(?:<p\b[^>]>)?((<b>\|<i>\|<u>\|\s))<<TOC(\d+)>>\s(.*?)<p\b/<p/i) {
1413	$toclevel = $3;
1414	my $title = $4;
1415	my $sectiontext = "";
1416	if ($html =~ s/^(.?)((?:<p\b[^>]>)?((<b>\|<i>\|<u>\|\s)*)<<TOC\d+>>)/$2/i) {
1417	$sectiontext = $1;
1418	} else {
1419	$sectiontext = $html;
1420	$html = "";
1421	}
1422
1423	# remove tags and extra spaces from the title
1424	$title =~ s/<\/?[^>]+>//g;
1425	$title =~ s/^\s+\|\s+$//g;
1426
1427	# close any sections below the current level and
1428	# create a new section (special case for the firstsection)
1429	print PROD "<!--\n";
1430	while (($curtoclevel > $toclevel) \|\|
1431	(!$firstsection && $curtoclevel == $toclevel)) {
1432	$curtoclevel--;
1433	print PROD "</Section>\n";
1434	}
1435	if ($curtoclevel+1 < $toclevel) {
1436	print STDERR "WARNING - jump in toc levels in $input_filename " .
1437	"from $curtoclevel to $toclevel\n";
1438	}
1439	while ($curtoclevel < $toclevel) {
1440	$curtoclevel++;
1441	}
1442
1443	if ($curtoclevel == 1) {
1444	# add the header tag
1445	print PROD "-->\n";
1446	print PROD "<HTML>\n<HEAD>\n<TITLE>$title</TITLE>\n</HEAD>\n<BODY>\n";
1447	print PROD "<!--\n";
1448	}
1449
1450	print PROD "<Section>\n\t<Description>\n\t\t<Metadata name=\"Title\">$title</Metadata>\n\t</Description>\n";
1451
1452	print PROD "-->\n";
1453
1454	# clean up the section html
1455	$sectiontext = $self->HB_clean_section($sectiontext);
1456
1457	print PROD "$sectiontext\n";
1458
1459	} else {
1460	print STDERR "WARNING - leftover text\n" , $self->shorten($html),
1461	"\nin $input_filename\n";
1462	last;
1463	}
1464	$firstsection = 0;
1465	}
1466
1467	print PROD "<!--\n";
1468	while ($curtoclevel > 0) {
1469	$curtoclevel--;
1470	print PROD "</Section>\n";
1471	}
1472	print PROD "-->\n";
1473
1474	close (PROD) \|\| die("Error Closing File: $tmp_filename $!");
1475
1476	return $tmp_filename;
1477	}
1478
1479	sub shorten {
1480	my $self = shift (@_);
1481	my ($text) = @_;
1482
1483	return "\"$text\"" if (length($text) < 100);
1484
1485	return "\"" . substr ($text, 0, 50) . "\" ... \"" .
1486	substr ($text, length($text)-50) . "\"";
1487	}
1488
1489	sub convert_tidy_or_oldHDL_file
1490	{
1491	my $self = shift (@_);
1492	my ($file) = @_;
1493	my $input_filename = $file;
1494
1495	if (-d $input_filename)
1496	{
1497	return $input_filename;
1498	}
1499
1500	# get the input filename
1501	my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
1502	my $base_dirname = $dirname;
1503	$suffix = lc($suffix);
1504
1505	# derive tmp filename from input filename
1506	# Remove any white space from filename -- no risk of name collision, and
1507	# makes later conversion by utils simpler. Leave spaces in path...
1508	# tidy up the filename with space, dot, hyphen between
1509	$tailname =~ s/\s+//g;
1510	$tailname =~ s/\.+//g;
1511	$tailname =~ s/\-+//g;
1512	# convert to utf-8 otherwise we have problems with the doc.xml file
1513	# later on
1514	&unicode::ensure_utf8(\$tailname);
1515
1516	# softlink to collection tmp dir
1517	my $tmp_dirname = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "tidytmp");
1518	&util::mk_dir($tmp_dirname) if (!-e $tmp_dirname);
1519
1520	my $test_dirname = "";
1521	my $f_separator = &util::get_os_dirsep();
1522
1523	if ($dirname =~ m/import$f_separator/)
1524	{
1525	$test_dirname = $'; #'
1526
1527	#print STDERR "init $'\n";
1528
1529	while ($test_dirname =~ m/[$f_separator]/)
1530	{
1531	my $folderdirname = $`;
1532	$tmp_dirname = &util::filename_cat($tmp_dirname,$folderdirname);
1533	&util::mk_dir($tmp_dirname) if (!-e $tmp_dirname);
1534	$test_dirname = $'; #'
1535	}
1536	}
1537
1538	my $tmp_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix");
1539
1540	# tidy or convert the input file if it is a HTML-like file or it is accepted by the process_exp
1541	if (($suffix eq ".htm") \|\| ($suffix eq ".html") \|\| ($suffix eq ".shtml"))
1542	{
1543	#convert the input file to a new style HDL
1544	my $hdl_output_filename = $input_filename;
1545	if ($self->{'old_style_HDL'})
1546	{
1547	$hdl_output_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix");
1548	$hdl_output_filename = $self->convert_to_newHDLformat($input_filename,$hdl_output_filename);
1549	}
1550
1551	#just for checking copy all other file from the base dir to tmp dir if it is not exists
1552	opendir(DIR,$base_dirname) or die "Can't open base directory : $base_dirname!";
1553	my @files = grep {!/^\.+$/} readdir(DIR);
1554	close(DIR);
1555
1556	foreach my $file (@files)
1557	{
1558	my $src_file = &util::filename_cat($base_dirname,$file);
1559	my $dest_file = &util::filename_cat($tmp_dirname,$file);
1560	if ((!-e $dest_file) && (!-d $src_file))
1561	{
1562	# just copy the original file back to the tmp directory
1563	copy($src_file,$dest_file) or die "Can't copy file $src_file to $dest_file $!";
1564	}
1565	}
1566
1567	# tidy the input file
1568	my $tidy_output_filename = $hdl_output_filename;
1569	if ($self->{'use_realistic_book'})
1570	{
1571	$tidy_output_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix");
1572	$tidy_output_filename = $self->tmp_tidy_file($hdl_output_filename,$tidy_output_filename);
1573	}
1574	$tmp_filename = $tidy_output_filename;
1575	}
1576	else
1577	{
1578	if (!-e $tmp_filename)
1579	{
1580	# just copy the original file back to the tmp directory
1581	copy($input_filename,$tmp_filename) or die "Can't copy file $input_filename to $tmp_filename $!";
1582	}
1583	}
1584
1585	return $tmp_filename;
1586	}
1587
1588
1589	# Will make the html input file as a proper XML file with removed font tag and
1590	# image size added to the img tag.
1591	# The tidying process takes place in a collection specific 'tmp' directory so
1592	# that we don't accidentally damage the input.
1593	sub tmp_tidy_file
1594	{
1595	my $self = shift (@_);
1596	my ($file,$cnfile) = @_;
1597	my $input_filename = $file;
1598	my $tmp_filename = $cnfile;
1599
1600	# get the input filename
1601	my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
1602
1603	require HTML::TokeParser::Simple;
1604
1605	# create HTML parser to decode the input file
1606	my $parser = HTML::TokeParser::Simple->new($input_filename);
1607
1608	# write HTML tmp file without the font tag and image size are added to the img tag
1609	open (PROD, ">$tmp_filename") \|\| die("Error Writing to File: $tmp_filename $!");
1610	while (my $token = $parser->get_token())
1611	{
1612	# is it an img tag
1613	if ($token->is_start_tag('img'))
1614	{
1615	# get the attributes
1616	my $attr = $token->return_attr;
1617
1618	# get the full path to the image
1619	my $img_file = &util::filename_cat($dirname,$attr->{src});
1620
1621	# set the width and height attribute
1622	($attr->{width}, $attr->{height}) = imgsize($img_file);
1623
1624	# recreate the tag
1625	print PROD "<img";
1626	print PROD map { qq { $_="$attr->{$_}"} } keys %$attr;
1627	print PROD ">";
1628	}
1629	# is it a font tag
1630	else
1631	{
1632	if (($token->is_start_tag('font')) \|\| ($token->is_end_tag('font')))
1633	{
1634	# remove font tag
1635	print PROD "";
1636	}
1637	else
1638	{
1639	# print without changes
1640	print PROD $token->as_is;
1641	}
1642	}
1643	}
1644	close (PROD) \|\| die("Error Closing File: $tmp_filename $!");
1645
1646	# run html-tidy on the tmp file to make it a proper XML file
1647
1648	my $outhandle = $self->{'outhandle'};
1649	print $outhandle "Converting HTML to be XML compliant:\n";
1650
1651	my $tidy_cmd = "tidy";
1652	$tidy_cmd .= " -q" if ($self->{'verbosity'} <= 2);
1653	$tidy_cmd .= " -raw -wrap 0 -asxml \"$tmp_filename\"";
1654	if ($self->{'verbosity'} <= 2) {
1655	if ($ENV{'GSDLOS'} =~ m/^windows/i) {
1656	$tidy_cmd .= " 2>nul";
1657	}
1658	else {
1659	$tidy_cmd .= " 2>/dev/null";
1660	}
1661	print $outhandle " => $tidy_cmd\n";
1662	}
1663
1664	my $tidyfile = `$tidy_cmd`;
1665
1666	# write result back to the tmp file
1667	open (PROD, ">$tmp_filename") \|\| die("Error Writing to File: $tmp_filename $!");
1668	print PROD $tidyfile;
1669	close (PROD) \|\| die("Error Closing File: $tmp_filename $!");
1670
1671	# return the output filename
1672	return $tmp_filename;
1673	}
1674
1675	sub associate_cover_image
1676	{
1677	my $self = shift(@_);
1678	my ($doc_obj, $filename) = @_;
1679	if (($self->{'use_realistic_book'}) \|\| ($self->{'old_style_HDL'}))
1680	{
1681	# we will have cover image in tidytmp, but want it from import
1682	$filename =~ s/([\\\/])tidytmp([\\\/])/$1import$2/;
1683	}
1684	$self->SUPER::associate_cover_image($doc_obj, $filename);
1685	}
1686
1687
1688	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: