source: main/trunk/greenstone2/perllib/plugins/HTMLPlugin.pm@23363

Last change on this file since 23363 was 23363, checked in by davidb, 13 years ago

Plugin code upgrade to support Greenstone working with filenames under Windows when they go beyond Latin-1 and start turning up in their DOS abbreviated form (e.g. Test~1.txt)

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 58.1 KB
1###########################################################################
2#
3# HTMLPlugin.pm -- basic html plugin
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 1999 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27#
28# Note that this plugin handles frames only in a very simple way,
29# i.e. each frame is treated as a separate document. This means
30# search results will contain links to individual frames rather
31# than linking to the top-level frameset.
32# There may also be some problems caused by the _parent target
33# (it's removed by this plugin).
34#
35
36package HTMLPlugin;
37
38use Encode;
39
40use ReadTextFile;
41use HBPlugin;
42use ghtml;
43use unicode;
44use util;
45use XMLParser;
46
47use Image::Size;
48use File::Copy;
49
50sub BEGIN {
51 @HTMLPlugin::ISA = ('ReadTextFile', 'HBPlugin');
52}
53
54use strict; # every perl program should have this!
55no strict 'refs'; # make an exception so we can use variables as filehandles
56
57my $arguments =
58 [ { 'name' => "process_exp",
59 'desc' => "{BasePlugin.process_exp}",
60 'type' => "regexp",
61 'deft' => &get_default_process_exp() },
62 { 'name' => "block_exp",
63 'desc' => "{BasePlugin.block_exp}",
64 'type' => 'regexp',
65 'deft' => &get_default_block_exp() },
66 { 'name' => "nolinks",
67 'desc' => "{HTMLPlugin.nolinks}",
68 'type' => "flag" },
69 { 'name' => "keep_head",
70 'desc' => "{HTMLPlugin.keep_head}",
71 'type' => "flag" },
72 { 'name' => "no_metadata",
73 'desc' => "{HTMLPlugin.no_metadata}",
74 'type' => "flag" },
75 { 'name' => "metadata_fields",
76 'desc' => "{HTMLPlugin.metadata_fields}",
77 'type' => "string",
78 'deft' => "Title" },
79 { 'name' => "metadata_field_separator",
80 'desc' => "{HTMLPlugin.metadata_field_separator}",
81 'type' => "string",
82 'deft' => "" },
83 { 'name' => "hunt_creator_metadata",
84 'desc' => "{HTMLPlugin.hunt_creator_metadata}",
85 'type' => "flag" },
86 { 'name' => "file_is_url",
87 'desc' => "{HTMLPlugin.file_is_url}",
88 'type' => "flag" },
89 { 'name' => "assoc_files",
90 'desc' => "{HTMLPlugin.assoc_files}",
91 'type' => "regexp",
92 'deft' => &get_default_block_exp() },
93 { 'name' => "rename_assoc_files",
94 'desc' => "{HTMLPlugin.rename_assoc_files}",
95 'type' => "flag" },
96 { 'name' => "title_sub",
97 'desc' => "{HTMLPlugin.title_sub}",
98 'type' => "string",
99 'deft' => "" },
100 { 'name' => "description_tags",
101 'desc' => "{HTMLPlugin.description_tags}",
102 'type' => "flag" },
103 # retain this for backward compatibility (w3mir option was replaced by
104 # file_is_url)
105 { 'name' => "w3mir",
106# 'desc' => "{HTMLPlugin.w3mir}",
107 'type' => "flag",
108 'hiddengli' => "yes"},
109 { 'name' => "no_strip_metadata_html",
110 'desc' => "{HTMLPlugin.no_strip_metadata_html}",
111 'type' => "string",
112 'deft' => "",
113 'reqd' => "no"},
114 { 'name' => "sectionalise_using_h_tags",
115 'desc' => "{HTMLPlugin.sectionalise_using_h_tags}",
116 'type' => "flag" },
117 { 'name' => "use_realistic_book",
118 'desc' => "{HTMLPlugin.tidy_html}",
119 'type' => "flag"},
120 { 'name' => "old_style_HDL",
121 'desc' => "{HTMLPlugin.old_style_HDL}",
122 'type' => "flag"},
123 {'name' => "processing_tmp_files",
124 'desc' => "{BasePlugin.processing_tmp_files}",
125 'type' => "flag",
126 'hiddengli' => "yes"}
127 ];
128
129my $options = { 'name' => "HTMLPlugin",
130 'desc' => "{HTMLPlugin.desc}",
131 'abstract' => "no",
132 'inherits' => "yes",
133 'args' => $arguments };
134
135
136sub new {
137 my ($class) = shift (@_);
138 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
139 push(@$pluginlist, $class);
140
141 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
142 push(@{$hashArgOptLists->{"OptList"}},$options);
143
144
145 my $self = new ReadTextFile($pluginlist,$inputargs,$hashArgOptLists);
146
147 if ($self->{'w3mir'}) {
148 $self->{'file_is_url'} = 1;
149 }
150 $self->{'aux_files'} = {};
151 $self->{'dir_num'} = 0;
152 $self->{'file_num'} = 0;
153
154 return bless $self, $class;
155}
156
157# may want to use (?i)\.(gif|jpe?g|jpe|png|css|js(?:@.*)?)$
158# if have eg <script language="javascript" src="img/lib.js@123">
159# blocking is now done by reading through the file and recording all the
160# images and other files
161sub get_default_block_exp {
162 my $self = shift (@_);
163
164 #return q^(?i)\.(gif|jpe?g|jpe|jpg|png|css)$^;
165 return "";
166}
167
168sub get_default_process_exp {
169 my $self = shift (@_);
170
171 # the last option is an attempt to encode the concept of an html query ...
172 return q^(?i)(\.html?|\.shtml|\.shm|\.asp|\.php\d?|\.cgi|.+\?.+=.*)$^;
173}
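
# Illustrative sketch (not part of the plugin): which hypothetical filenames
# the default process_exp returned above would accept.
#
#   my $re = &HTMLPlugin::get_default_process_exp();
#   # "index.html", "page.shtml" and "search.cgi?q=kiwi" all match $re
#   # (the last one via the trailing "query" alternative); "notes.txt" does not.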
174
175sub store_block_files
176{
177 my $self =shift (@_);
178 my ($filename_full_path, $block_hash) = @_;
179
180 my $html_fname = $filename_full_path;
181 my @file_blocks;
182
183 my ($language, $content_encoding) = $self->textcat_get_language_encoding ($filename_full_path);
184 $self->{'store_content_encoding'}->{$filename_full_path} = $content_encoding;
185
186 # read in the file as raw bytes (no decoding is applied at this stage)
187 my $raw_text = "";
188 $self->read_file_no_decoding($filename_full_path, \$raw_text);
189
190 my $textref = \$raw_text;
191 my $opencom = '(?:<!--|&lt;!(?:&mdash;|&#151;|--))';
192 my $closecom = '(?:-->|(?:&mdash;|&#151;|--)&gt;)';
193 $$textref =~ s/$opencom(.*?)$closecom//gs;
194
195 # Convert entities to their UTF8 equivalents
196 $$textref =~ s/&(lt|gt|amp|quot|nbsp);/&z$1;/go;
197 $$textref =~ s/&([^;]+);/&ghtml::getcharequiv($1,1,0)/gseo; # on this occasion, want it left as utf8
198 $$textref =~ s/&z(lt|gt|amp|quot|nbsp);/&$1;/go;
199
200 my $attval = "\\\"[^\\\"]+\\\"|[^\\s>]+";
201 my @img_matches = ($$textref =~ m/<img[^>]*?src\s*=\s*($attval)[^>]*>/igs);
202 my @usemap_matches = ($$textref =~ m/<img[^>]*?usemap\s*=\s*($attval)[^>]*>/igs);
203 my @link_matches = ($$textref =~ m/<link[^>]*?href\s*=\s*($attval)[^>]*>/igs);
204 my @embed_matches = ($$textref =~ m/<embed[^>]*?src\s*=\s*($attval)[^>]*>/igs);
205 my @tabbg_matches = ($$textref =~ m/<(?:body|table|tr|td)[^>]*?background\s*=\s*($attval)[^>]*>/igs);
206 my @script_matches = ($$textref =~ m/<script[^>]*?src\s*=\s*($attval)[^>]*>/igs);
207
208 if(!defined $self->{'utf8_to_original_filename'}) {
209 # maps from utf8 converted link name -> original filename referred to by (possibly URL-encoded) src url
210 $self->{'utf8_to_original_filename'} = {};
211 }
212
213 foreach my $link (@img_matches, @usemap_matches, @link_matches, @embed_matches, @tabbg_matches, @script_matches) {
214
215 # remove quotes from link at start and end if necessary
216 if ($link =~ m/^\"/) {
217 $link =~ s/^\"//;
218 $link =~ s/\"$//;
219 }
220
221 $link =~ s/\#.*$//s; # remove any anchor names, e.g. foo.html#name becomes foo.html
222 # some links may just be anchor names
223 next unless ($link =~ /\S+/);
224
225 if ($link !~ m@^/@ && $link !~ m/^([A-Z]:?)\\/) {
226 # Turn relative file path into full path
227 my $dirname = &File::Basename::dirname($filename_full_path);
228 $link = &util::filename_cat($dirname, $link);
229 }
230 $link = $self->eval_dir_dots($link);
231
232 # this is the actual filename on the filesystem (that the link refers to)
233 my $url_original_filename = $self->opt_url_decode($link);
234
235 # Convert the url_original_filename into its utf8 version. Store the utf8 link along with the url_original_filename
236 my $utf8_link = "";
237 $self->decode_text($link,$content_encoding,$language,\$utf8_link);
238
239 $self->{'utf8_to_original_filename'}->{$utf8_link} = $url_original_filename;
240# print STDERR "**** utf8_encoded_link to original src filename:\n\t$utf8_link\n\t".$self->{'utf8_to_original_filename'}->{$utf8_link}."\n";
241
242 if ($url_original_filename ne $utf8_link) {
243 my $outhandle = $self->{'outhandle'};
244
245 print $outhandle "URL Encoding $url_original_filename\n";
246 print $outhandle " ->$utf8_link\n";
247 }
248
249
250 my $unicode_url_original_filename = decode("utf8",$url_original_filename);
251
252## print STDERR "*****!!! Blocking url original filename = $unicode_url_original_filename\n";
253
254 # Allow for both the raw-byte (UTF-8) and Unicode-string versions of the filename
255 $block_hash->{'file_blocks'}->{$url_original_filename} = 1;
256 $block_hash->{'file_blocks'}->{$unicode_url_original_filename} = 1;
257 }
258}
259
260# Given a filename in any encoding, will URL decode it to get back the original filename
261# in the original encoding. Because this method is intended to work out the *original*
262# filename, it does not URL decode any filename if a file by the name of the *URL-encoded*
263# string already exists in the local folder.
264#
265# Is the following still true??
266# Return the original filename corresponding to the parameter URL-encoded filename, and
267# a decoded flag that is set to true iff URL-decoding had to be applied.
268sub opt_url_decode {
269 my $self = shift (@_);
270 my ($link) = @_;
271
272 # Replace %XX's in URL with decoded value if required.
273 # Note that the filename may include the %XX in some situations
274 if ($link =~ m/\%[A-F0-9]{2}/i) {
275 if (!-e $link) {
276 $link = &unicode::url_decode($link);
277 }
278 }
279
280 return $link;
281}
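
# Illustrative sketch (not part of the plugin): how opt_url_decode behaves
# for a hypothetical link.
#
#   my $link = "images/caf%C3%A9.png";
#   my $resolved = $self->opt_url_decode($link);
#   # If a file literally named "images/caf%C3%A9.png" exists on disk, the
#   # link is returned untouched; otherwise the %XX sequences are decoded,
#   # giving the raw-byte name "images/café.png" (UTF-8).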
282
283sub read_into_doc_obj
284{
285 my $self = shift (@_);
286 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
287
288 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
289
290 # Lookup content_encoding worked out in file_block pass for this file
291 # Store it under the local name 'content_encoding' so it's nice and
292 # easy to access
293 $self->{'content_encoding'} = $self->{'store_content_encoding'}->{$filename_full_path};
294
295 # get the input file
296 my $input_filename = $file;
297 my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
298 $suffix = lc($suffix);
299 my $tidy_filename;
300 if (($self->{'use_realistic_book'}) || ($self->{'old_style_HDL'}))
301 {
302 # because the document has to be sectionalized, set the description tags
303 $self->{'description_tags'} = 1;
304
305 # set the file to be tidied
306 $input_filename = &util::filename_cat($base_dir,$file) if $base_dir =~ m/\w/;
307
308 # get the tidied file
309 #my $tidy_filename = $self->tmp_tidy_file($input_filename);
310 $tidy_filename = $self->convert_tidy_or_oldHDL_file($input_filename);
311
312 # derive tmp filename from input filename
313 my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($tidy_filename, "\\.[^\\.]+\$");
314
315 # set the new input file and base_dir to be from the tidied file
316 $file = "$tailname$suffix";
317 $base_dir = $dirname;
318 }
319
320 # call the parent read_into_doc_obj
321 my ($process_status,$doc_obj) = $self->SUPER::read_into_doc_obj($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli);
322 if (($self->{'use_realistic_book'}) || ($self->{'old_style_HDL'}))
323 {
324 # now we need to reset the filenames in the doc obj so that the converted filenames are not used
325 my $collect_file = &util::filename_within_collection($filename_full_path);
326 $doc_obj->set_source_filename ($collect_file, $self->{'file_rename_method'});
327 ## set_source_filename does not set the doc_obj source_path which is used in archives dbs for incremental
328 # build. So set it manually.
329 $doc_obj->set_source_path($filename_full_path);
330 my $collect_conv_file = &util::filename_within_collection($tidy_filename);
331 $doc_obj->set_converted_filename($collect_conv_file);
332
333 my $plugin_filename_encoding = $self->{'filename_encoding'};
334 my $filename_encoding = $self->deduce_filename_encoding($file,$metadata,$plugin_filename_encoding);
335 $self->set_Source_metadata($doc_obj, $filename_full_path, $filename_encoding);
336 }
337
338 delete $self->{'store_content_encoding'}->{$filename_full_path};
339 $self->{'content_encoding'} = undef;
340
341 return ($process_status,$doc_obj);
342}
343
344# do plugin specific processing of doc_obj
345sub process {
346 my $self = shift (@_);
347 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
348 my $outhandle = $self->{'outhandle'};
349
350 if ($ENV{'GSDLOS'} =~ m/^windows/i) {
351 # this makes life so much easier... perl can cope with unix-style '/'s.
352 $base_dir =~ s@(\\)+@/@g;
353 $file =~ s@(\\)+@/@g;
354 }
355
356 # reset per-doc stuff...
357 $self->{'aux_files'} = {};
358 $self->{'dir_num'} = 0;
359 $self->{'file_num'} = 0;
360
361 # process an HTML file where sections are divided by headings tags (H1, H2 ...)
362 # you can also include metadata in the format (X can be any number)
363 # <hX>Title<!--gsdl-metadata
364 # <Metadata name="name1">value1</Metadata>
365 # ...
366 # <Metadata name="nameN">valueN</Metadata>
367 #--></hX>
368 if ($self->{'sectionalise_using_h_tags'}) {
369 # description_tags should always be activated because we convert headings to description tags
370 $self->{'description_tags'} = 1;
371
372 my $arrSections = [];
373 $$textref =~ s/<h([0-9]+)[^>]*>(.*?)<\/h[0-9]+>/$self->process_heading($1, $2, $arrSections, $file)/isge;
374
375 if (scalar(@$arrSections)) {
376 my $strMetadata = $self->update_section_data($arrSections, -1);
377 if (length($strMetadata)) {
378 $strMetadata = '<!--' . $strMetadata . "\n-->\n</body>";
379 $$textref =~ s/<\/body>/$strMetadata/ig;
380 }
381 }
382 }
383
384 my $cursection = $doc_obj->get_top_section();
385
386 $self->extract_metadata ($textref, $metadata, $doc_obj, $cursection)
387 unless $self->{'no_metadata'} || $self->{'description_tags'};
388
389 # Store URL for page as metadata - this can be used for an
390 # altavista style search interface. The URL won't be valid
391 # unless the file structure contains the domain name (i.e.
392 # like when w3mir is used to download a website).
393
394 # URL metadata (even if invalid) is used to support internal
395 # links, so even if 'file_is_url' is off, we still need to store the info
396
397 my ($tailname,$dirname) = &File::Basename::fileparse($file);
398
399# my $utf8_file = $self->filename_to_utf8_metadata($file);
400# $utf8_file =~ s/&\#095;/_/g;
401 my $utf8_file = &unicode::raw_filename_to_url_encoded($tailname);
402
403# if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
404# print STDERR "***!! file = $file\n";
405# print STDERR "***!! utf8_file = $utf8_file\n";
406# }
407
408
409 my $web_url = "http://";
410 if(defined $dirname) { # local directory
411 # Check for "ftp" in the domain name of the directory
412 # structure to determine if this URL should be a ftp:// URL
413 # This check is not infallible, but better than omitting the
414 # check, which would cause all files downloaded from ftp sites
415 # via mirroring with wget to have potentially erroneous http:// URLs
416 # assigned in their metadata
417 if ($dirname =~ /^[^\/]*ftp/i)
418 {
419 $web_url = "ftp://";
420 }
421 $dirname = $self->eval_dir_dots($dirname);
422 $dirname .= &util::get_dirsep() if $dirname ne ""; # if there's a directory, it should end in a "/"
423 $web_url = $web_url.$dirname.$utf8_file;
424 } else {
425 $web_url = $web_url.$utf8_file;
426 }
427 $web_url =~ s/\\/\//g;
428 $doc_obj->add_utf8_metadata($cursection, "URL", $web_url);
429
430 if ($self->{'file_is_url'}) {
431 $doc_obj->add_metadata($cursection, "weblink", "<a href=\"$web_url\">");
432 $doc_obj->add_metadata($cursection, "webicon", "_iconworld_");
433 $doc_obj->add_metadata($cursection, "/weblink", "</a>");
434 }
435
436 if ($self->{'description_tags'}) {
437 # remove the html header - note that doing this here means any
438 # sections defined within the header will be lost (so all <Section>
439 # tags must appear within the body of the HTML)
440 my ($head_keep) = ($$textref =~ m/^(.*?)<body[^>]*>/is);
441
442 $$textref =~ s/^.*?<body[^>]*>//is;
443 $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
444
445 my $opencom = '(?:<!--|&lt;!(?:&mdash;|&#151;|--))';
446 my $closecom = '(?:-->|(?:&mdash;|&#151;|--)&gt;)';
447
448 my $lt = '(?:<|&lt;)';
449 my $gt = '(?:>|&gt;)';
450 my $quot = '(?:"|&quot;|&rdquo;|&ldquo;)';
451
452 my $dont_strip = '';
453 if ($self->{'no_strip_metadata_html'}) {
454 ($dont_strip = $self->{'no_strip_metadata_html'}) =~ s{,}{|}g;
455 }
456
457 my $found_something = 0; my $top = 1;
458 while ($$textref =~ s/^(.*?)$opencom(.*?)$closecom//s) {
459 my $text = $1;
460 my $comment = $2;
461 if (defined $text) {
462 # text before a comment - note that getting to here
463 # doesn't necessarily mean there are Section tags in
464 # the document
465 $self->process_section(\$text, $base_dir, $file, $doc_obj, $cursection);
466 }
467 while ($comment =~ s/$lt(.*?)$gt//s) {
468 my $tag = $1;
469 if ($tag eq "Section") {
470 $found_something = 1;
471 $cursection = $doc_obj->insert_section($doc_obj->get_end_child($cursection)) unless $top;
472 $top = 0;
473 } elsif ($tag eq "/Section") {
474 $found_something = 1;
475 $cursection = $doc_obj->get_parent_section ($cursection);
476 } elsif ($tag =~ m/^Metadata name=$quot(.*?)$quot/s) {
477 my $metaname = $1;
478 my $accumulate = $tag =~ m/mode=${quot}accumulate${quot}/ ? 1 : 0;
479 $comment =~ s/^(.*?)$lt\/Metadata$gt//s;
480 my $metavalue = $1;
481 $metavalue =~ s/^\s+//;
482 $metavalue =~ s/\s+$//;
483 # assume that no metadata value intentionally includes
484 # carriage returns or HTML tags (if they're there they
485 # were probably introduced when converting to HTML from
486 # some other format).
487 # actually some people want to have html tags in their
488 # metadata.
489 $metavalue =~ s/[\cJ\cM]/ /sg;
490 $metavalue =~ s/<[^>]+>//sg
491 unless $dont_strip && ($dont_strip eq 'all' || $metaname =~ m/^($dont_strip)$/);
492 $metavalue =~ s/\s+/ /sg;
493 if ($metaname =~ /\./) { # has a namespace
494 $metaname = "ex.$metaname";
495 }
496 if ($accumulate) {
497 $doc_obj->add_utf8_metadata($cursection, $metaname, $metavalue);
498 } else {
499 $doc_obj->set_utf8_metadata_element($cursection, $metaname, $metavalue);
500 }
501 } elsif ($tag eq "Description" || $tag eq "/Description") {
502 # do nothing with containing Description tags
503 } else {
504 # simple HTML tag (probably created by the conversion
505 # to HTML from some other format) - we'll ignore it and
506 # hope for the best ;-)
507 }
508 }
509 }
510 if ($cursection ne "") {
511 print $outhandle "HTMLPlugin: WARNING: $file contains unmatched <Section></Section> tags\n";
512 }
513
514 $$textref =~ s/^.*?<body[^>]*>//is;
515 $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
516 if ($$textref =~ m/\S/) {
517 if (!$found_something) {
518 if ($self->{'verbosity'} > 2) {
519 print $outhandle "HTMLPlugin: WARNING: $file appears to contain no Section tags so\n";
520 print $outhandle " will be processed as a single section document\n";
521 }
522
523 # go ahead and process single-section document
524 $self->process_section($textref, $base_dir, $file, $doc_obj, $cursection);
525
526 # if document contains no Section tags we'll go ahead
527 # and extract metadata (this won't have been done
528 # above as the -description_tags option prevents it)
529 my $complete_text = $head_keep.$doc_obj->get_text($cursection);
530 $self->extract_metadata (\$complete_text, $metadata, $doc_obj, $cursection)
531 unless $self->{'no_metadata'};
532
533 } else {
534 print $outhandle "HTMLPlugin: WARNING: $file contains the following text outside\n";
535 print $outhandle " of the final closing </Section> tag. This text will\n";
536 print $outhandle " be ignored.";
537
538 my ($text);
539 if (length($$textref) > 30) {
540 $text = substr($$textref, 0, 30) . "...";
541 } else {
542 $text = $$textref;
543 }
544 $text =~ s/\n/ /isg;
545 print $outhandle " ($text)\n";
546 }
547 } elsif (!$found_something) {
548
549 if ($self->{'verbosity'} > 2) {
550 # may get to here if document contained no valid Section
551 # tags but did contain some comments. The text will have
552 # been processed already but we should print the warning
553 # as above and extract metadata
554 print $outhandle "HTMLPlugin: WARNING: $file appears to contain no Section tags and\n";
555 print $outhandle " is blank or empty. Metadata will be assigned if present.\n";
556 }
557
558 my $complete_text = $head_keep.$doc_obj->get_text($cursection);
559 $self->extract_metadata (\$complete_text, $metadata, $doc_obj, $cursection)
560 unless $self->{'no_metadata'};
561 }
562
563 } else {
564
565 # remove header and footer
566 if (!$self->{'keep_head'} || $self->{'description_tags'}) {
567 $$textref =~ s/^.*?<body[^>]*>//is;
568 $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
569 }
570
571 # single section document
572 $self->process_section($textref, $base_dir, $file, $doc_obj, $cursection);
573 }
574
575 return 1;
576}
577
578
579sub process_heading
580{
581 my ($self, $nHeadNo, $strHeadingText, $arrSections, $file) = @_;
582 $strHeadingText = '' if (!defined($strHeadingText));
583
584 my $strMetadata = $self->update_section_data($arrSections, int($nHeadNo));
585
586 my $strSecMetadata = '';
587 while ($strHeadingText =~ s/<!--gsdl-metadata(.*?)-->//is)
588 {
589 $strSecMetadata .= $1;
590 }
591
592 $strHeadingText =~ s/^\s+//g;
593 $strHeadingText =~ s/\s+$//g;
594 $strSecMetadata =~ s/^\s+//g;
595 $strSecMetadata =~ s/\s+$//g;
596
597 $strMetadata .= "\n<Section>\n\t<Description>\n\t\t<Metadata name=\"Title\">" . $strHeadingText . "</Metadata>\n";
598
599 if (length($strSecMetadata)) {
600 $strMetadata .= "\t\t" . $strSecMetadata . "\n";
601 }
602
603 $strMetadata .= "\t</Description>\n";
604
605 return "<!--" . $strMetadata . "-->";
606}
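
# Illustrative sketch (not part of the plugin): given the hypothetical heading
#
#   <h2>Chapter One<!--gsdl-metadata
#   <Metadata name="Author">A. Writer</Metadata>
#   --></h2>
#
# process_heading(2, ...) returns a comment block along the lines of
#
#   <!--
#   <Section>
#       <Description>
#           <Metadata name="Title">Chapter One</Metadata>
#           <Metadata name="Author">A. Writer</Metadata>
#       </Description>
#   -->
#
# with any </Section> tags that update_section_data needs to close appearing
# first, inside the same comment.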
607
608
609sub update_section_data
610{
611 my ($self, $arrSections, $nCurTocNo) = @_;
612 my ($strBuffer, $nLast, $nSections) = ('', 0, scalar(@$arrSections));
613
614 if ($nSections == 0) {
615 push @$arrSections, $nCurTocNo;
616 return $strBuffer;
617 }
618 $nLast = $arrSections->[$nSections - 1];
619 if ($nCurTocNo > $nLast) {
620 push @$arrSections, $nCurTocNo;
621 return $strBuffer;
622 }
623 for(my $i = $nSections - 1; $i >= 0; $i--) {
624 if ($nCurTocNo <= $arrSections->[$i]) {
625 $strBuffer .= "\n</Section>";
626 pop @$arrSections;
627 }
628 }
629 push @$arrSections, $nCurTocNo;
630 return $strBuffer;
631}
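
# Illustrative sketch (not part of the plugin): how update_section_data
# maintains the stack of open heading levels (hypothetical call sequence).
#
#   my $arrSections = [];
#   $self->update_section_data($arrSections, 1); # returns ""              stack: [1]
#   $self->update_section_data($arrSections, 2); # returns ""              stack: [1,2]
#   $self->update_section_data($arrSections, 2); # returns "\n</Section>"  stack: [1,2]
#   $self->update_section_data($arrSections, 1); # closes both open levels stack: [1]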
632
633
634# note that process_section may be called multiple times for a single
635# section (relying on the fact that add_utf8_text appends the text to any
636# that may exist already).
637sub process_section {
638 my $self = shift (@_);
639 my ($textref, $base_dir, $file, $doc_obj, $cursection) = @_;
640 # trap links
641 if (!$self->{'nolinks'}) {
642 # usemap="./#index" not handled correctly => change to "#index"
643## $$textref =~ s/(<img[^>]*?usemap\s*=\s*[\"\']?)([^\"\'>\s]+)([\"\']?[^>]*>)/
644
645 $$textref =~ s/(<img[^>]*?usemap\s*=\s*)((?:[\"][^\"]+[\"])|(?:[\'][^\']+[\'])|(?:[^\s\/>]+))([^>]*>)/
646 $self->replace_usemap_links($1, $2, $3)/isge;
647
648## $$textref =~ s/(<(?:a|area|frame|link|script)\s+[^>]*?\s*(?:href|src)\s*=\s*[\"\']?)([^\"\'>\s]+)([\"\']?[^>]*>)/
649
650 $$textref =~ s/(<(?:a|area|frame|link|script)\s+[^>]*?\s*(?:href|src)\s*=\s*)((?:[\"][^\"]+[\"])|(?:[\'][^\']+[\'])|(?:[^\s\/>]+))([^>]*>)/
651 $self->replace_href_links ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
652 }
653
654 # trap images
655
656 # Previously, by default, HTMLPlugin would embed <img> tags inside anchor tags,
657 # i.e. <a href="image"><img src="image"></a>, in order to overcome a problem that
658 # turned regular text following images into links. That is, by embedding <img>s
659 # inside <a href=""></a>, the text following images was no longer misbehaving.
660 # However, there would be many occasions whereby images were not meant to link
661 # to their source images but where the images would link to another web page.
662 # To allow this, the no_image_links option was introduced: it would prevent
663 # the behaviour of embedding images into links that referenced the source images.
664
665 # Somewhere along the line, the problem of normal text turning into links when
666 # such text followed images which were not embedded in <a href=""></a> ceased
667 # to occur. This is why the following lines have been commented out (as well as
668 # two lines in replace_images). They appear to no longer apply.
669
670 # If at any time, there is a need for having images embedded in <a> anchor tags,
671 # then it might be better to turn that into an HTMLPlugin option rather than make
672 # it the default behaviour. Also, eventually, no_image_links needs to become
673 # a deprecated option for HTMLPlugin as it has now become the default behaviour.
674
675 #if(!$self->{'no_image_links'}){
676 $$textref =~ s/(<(?:img|embed|table|tr|td)[^>]*?(?:src|background)\s*=\s*)((?:[\"][^\"]+[\"])|(?:[\'][^\']+[\'])|(?:[^\s\/>]+))([^>]*>)/
677 $self->replace_images ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
678 #}
679
680 # add text to document object
681 # turn \ into \\ so that the rest of greenstone doesn't think there
682 # is an escape code following. (Macro parsing loses them...)
683 $$textref =~ s/\\/\\\\/go;
684
685 $doc_obj->add_utf8_text($cursection, $$textref);
686}
687
688sub replace_images {
689 my $self = shift (@_);
690 my ($front, $link, $back, $base_dir,
691 $file, $doc_obj, $section) = @_;
692
693 # remove quotes from link at start and end if necessary
694 if ($link=~/^[\"\']/) {
695 $link=~s/^[\"\']//;
696 $link=~s/[\"\']$//;
697 $front.='"';
698 $back="\"$back";
699 }
700
701 $link =~ s/\n/ /g;
702
703 # Hack to overcome Windows wv 0.7.1 bug that causes embedded images to be broken
704 # If the Word file path has spaces in it, wv messes up and you end up with
705 # absolute paths for the images, and without the "file://" prefix
706 # So check for this special case and massage the data to be correct
707 if ($ENV{'GSDLOS'} =~ m/^windows/i && $self->{'plugin_type'} eq "WordPlug" && $link =~ m/^[A-Za-z]\:\\/) {
708 $link =~ s/^.*\\([^\\]+)$/$1/;
709 }
710
711 my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
712
713 my $img_file = $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section);
714
715# print STDERR "**** link = $link\n**** href = $href\n**** img_file = $img_file, rl = $rl\n";
716
717 my $anchor_name = $img_file;
718 #$anchor_name =~ s/^.*\///;
719 #$anchor_name = "<a name=\"$anchor_name\" ></a>";
720
721 my $image_link = $front . $img_file .$back;
722 return $image_link;
723
724 # The reasons for why the following two lines are no longer necessary can be
725 # found in subroutine process_section
726 #my $anchor_link = "<a href=\"$img_file\" >".$image_link."</a>";
727 #return $anchor_link;
728
729 #return $front . $img_file . $back . $anchor_name;
730}
731
732sub replace_href_links {
733 my $self = shift (@_);
734 my ($front, $link, $back, $base_dir, $file, $doc_obj, $section) = @_;
735
736 # remove quotes from link at start and end if necessary
737 if ($link=~/^[\"\']/) {
738 $link=~s/^[\"\']//;
739 $link=~s/[\"\']$//;
740 $front.='"';
741 $back="\"$back";
742 }
743
744 # attempt to sort out targets - frames are not handled
745 # well in this plugin and some cases will screw things
746 # up - e.g. the _parent target (so we'll just remove
747 # them all ;-)
748 $front =~ s/(target=\"?)_top(\"?)/$1_gsdltop_$2/is;
749 $back =~ s/(target=\"?)_top(\"?)/$1_gsdltop_$2/is;
750 $front =~ s/target=\"?_parent\"?//is;
751 $back =~ s/target=\"?_parent\"?//is;
752
753 return $front . $link . $back if $link =~ m/^\#/s;
754 $link =~ s/\n/ /g;
755
756 # Find file referred to by $link on file system
757 # This is more complicated than it sounds once character encodings
758 # are taken into account
759 my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
760
761 # href may use '\'s where '/'s should be on Windows
762 $href =~ s/\\/\//g;
763 my ($filename) = $href =~ m/^(?:.*?):(?:\/\/)?(.*)/;
764
765
766 ##### leave all these links alone (they won't be picked up by intermediate
767 ##### pages). I think that's safest when dealing with frames, targets etc.
768 ##### (at least until I think of a better way to do it). Problems occur with
769 ##### mailto links from within small frames: the intermediate page is displayed
770 ##### within that frame and can't be seen. There is still potential for this to
771 ##### happen even with html pages - the solution seems to be to somehow tell
772 ##### the browser from the server side to display the page being sent (i.e.
773 ##### the intermediate page) in the top level window - I'm not sure if that's
774 ##### possible - the following line should probably be deleted if that can be done
775 return $front . $link . $back if $href =~ m/^(mailto|news|gopher|nntp|telnet|javascript):/is;
776
777 if (($rl == 0) || ($filename =~ m/$self->{'process_exp'}/) ||
778 ($href =~ m/\/$/) || ($href =~ m/^(mailto|news|gopher|nntp|telnet|javascript):/i)) {
779
780 # If web page didn't give encoding, then default to utf8
781 my $content_encoding= $self->{'content_encoding'} || "utf8";
782
783 if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
784 print STDERR "*** Encoding with $content_encoding href: $href\n";
785 }
786
787 $href = encode($content_encoding,$href);
788
789 $href = &unicode::raw_filename_to_url_encoded($href);
790 $href = &unicode::filename_to_url($href);
791
792 &ghtml::urlsafe ($href);
793 if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
794 print STDERR "***!!! href=$href\n";
795 }
796
797
798 return $front . "_httpextlink_&amp;rl=" . $rl . "&amp;href=" . $href . $hash_part . $back;
799 } else {
800 # link is to some other type of file (e.g., an image) so we'll
801 # need to associate that file
802 return $front . $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section) . $back;
803 }
804}
805
806sub add_file {
807 my $self = shift (@_);
808 my ($href, $rl, $hash_part, $base_dir, $doc_obj, $section) = @_;
809 my ($newname);
810
811 my $filename = $href;
812 if ($base_dir eq "") {
813 # remove http:/ thereby leaving one slash at the start
814 $filename =~ s/^[^:]*:\///;
815 }
816 else {
817 # remove http://
818 $filename =~ s/^[^:]*:\/\///;
819 }
820
821 $filename = &util::filename_cat($base_dir, $filename);
822
823 if (($self->{'use_realistic_book'}) || ($self->{'old_style_HDL'})) {
824 # we are processing a tidytmp file - want paths to be in import
825 $filename =~ s/([\\\/])tidytmp([\\\/])/$1import$2/;
826 }
827
828 # Replace %XX's in URL with decoded value if required. Note that the
829 # filename may include the %XX in some situations. If the *original*
830 # file's name was in URL encoding, the following method will not decode
831 # it.
832 my $utf8_filename = $filename;
833 my $opt_decode_utf8_filename = $self->opt_url_decode($utf8_filename);
834
835 my $content_encoding= $self->{'content_encoding'} || "utf8";
836
837 # The filenames that come through the HTML file have been decoded
838 # into Unicode aware Perl strings. Need to convert them back
839 # to their initial raw-byte encoding to match the file that
840 # exists on the file system
841 $filename = encode($content_encoding, $opt_decode_utf8_filename);
842
843 # some special processing if the intended filename was converted to utf8, but
844 # the actual file still needs to be renamed
845 if (!&util::fd_exists($filename)) {
846 # try the original filename stored in map
847 if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
848 print STDERR "******!! orig filename did not exist: $filename\n";
849 }
850
851## print STDERR "**** trying to look up utf8_filename: $utf8_filename\n";
852
853 my $original_filename = $self->{'utf8_to_original_filename'}->{$utf8_filename};
854
855 if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
856 print STDERR "****** From lookup utf8_filename, now trying for: $original_filename\n";
857 }
858
859 if (defined $original_filename && -e $original_filename) {
860 if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
861 print STDERR "****** Found match!\n";
862 }
863 $filename = $original_filename;
864 }
865 }
866
867 my ($ext) = $filename =~ m/(\.[^\.]*)$/;
868
869 if ($rl == 0) {
870 if ((!defined $ext) || ($ext !~ m/$self->{'assoc_files'}/)) {
871 return "_httpextlink_&amp;rl=0&amp;el=prompt&amp;href=" . $href . $hash_part;
872 }
873 else {
874 return "_httpextlink_&amp;rl=0&amp;el=direct&amp;href=" . $href . $hash_part;
875 }
876 }
877
878 if ((!defined $ext) || ($ext !~ m/$self->{'assoc_files'}/)) {
879 return "_httpextlink_&amp;rl=" . $rl . "&amp;href=" . $href . $hash_part;
880 }
881 # add the original image file as a source file
882 if (!$self->{'processing_tmp_files'} ) {
883 $doc_obj->associate_source_file($filename);
884 }
885 if ($self->{'rename_assoc_files'}) {
886 if (defined $self->{'aux_files'}->{$href}) {
887 $newname = $self->{'aux_files'}->{$href}->{'dir_num'} . "/" .
888 $self->{'aux_files'}->{$href}->{'file_num'} . $ext;
889 } else {
890 $newname = $self->{'dir_num'} . "/" . $self->{'file_num'} . $ext;
891 $self->{'aux_files'}->{$href} = {'dir_num' => $self->{'dir_num'}, 'file_num' => $self->{'file_num'}};
892 $self->inc_filecount ();
893 }
894 $doc_obj->associate_file($filename, $newname, undef, $section);
895 return "_httpdocimg_/$newname";
896 } else {
897 if(&unicode::is_url_encoded($utf8_filename)) {
898 # use the possibly-decoded filename instead to avoid double URL encoding
899 ($newname) = $filename =~ m/([^\/\\]*)$/;
900 } else {
901 ($newname) = $utf8_filename =~ m/([^\/\\]*)$/;
902 }
903
904 # Make sure this name uses only ASCII characters.
905 # We use either base64 or URL encoding, as these preserve original encoding
906 $newname = &util::rename_file($newname, $self->{'file_rename_method'});
907
908### print STDERR "***** associating $filename (raw-byte/utf8)-> $newname\n";
909 $doc_obj->associate_file($filename, $newname, undef, $section);
910
911 # Since the generated image will be URL-encoded to avoid file-system/browser mess-ups
912 # of filenames, URL-encode the additional percent signs of the URL-encoded filename
913 my $newname_url = $newname;
914 $newname_url = &unicode::filename_to_url($newname_url);
915 return "_httpdocimg_/$newname_url";
916 }
917}
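
# Illustrative sketch (not part of the plugin): the two kinds of value
# add_file can return (hypothetical hrefs).
#
#   - an href whose extension is not covered by -assoc_files is rewritten
#     as an external-style link, e.g.
#         _httpextlink_&amp;rl=1&amp;href=report.doc
#   - an href whose extension is covered is associated with the document
#     and the link is rewritten to point at the associated copy, e.g.
#         _httpdocimg_/site%20map.jpg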
918
919
920sub format_link {
921 my $self = shift (@_);
922 my ($link, $base_dir, $file) = @_;
923
924 my ($before_hash, $hash_part) = $link =~ m/^([^\#]*)(\#?.*)$/;
925
926 $hash_part = "" if !defined $hash_part;
927 if (!defined $before_hash || $before_hash !~ m/[\w\.\/]/) {
928 my $outhandle = $self->{'outhandle'};
929 print $outhandle "HTMLPlugin: ERROR - badly formatted tag ignored ($link)\n"
930 if $self->{'verbosity'};
931 return ($link, "", 0);
932 }
933
934 if ($before_hash =~ s@^((?:http|https|ftp|file|mms)://)@@i) {
935 my $type = $1;
936
937 if ($link =~ m/^(http|ftp):/i) {
938 # Turn url (using /) into file name (possibly using \ on windows)
939 my @http_dir_split = split('/', $before_hash);
940 $before_hash = &util::filename_cat(@http_dir_split);
941 }
942
943 $before_hash = $self->eval_dir_dots($before_hash);
944
945 my $linkfilename = &util::filename_cat ($base_dir, $before_hash);
946
947 my $rl = 0;
948 $rl = 1 if (-e $linkfilename);
949
950 # make sure there's a slash on the end if it's a directory
951 if ($before_hash !~ m/\/$/) {
952 $before_hash .= "/" if (-d $linkfilename);
953 }
954 return ($type . $before_hash, $hash_part, $rl);
955
956 } elsif ($link !~ m/^(mailto|news|gopher|nntp|telnet|javascript):/i && $link !~ m/^\//) {
957
958 if ($before_hash =~ s@^/@@ || $before_hash =~ m/\\/) {
959
960 # the first directory will be the domain name if file_is_url
961 # was used to generate the archives, otherwise we'll assume all files
962 # are from the same site and base_dir is the root
963
964 if ($self->{'file_is_url'}) {
965 my @dirs = split /[\/\\]/, $file;
966 my $domname = shift (@dirs);
967 $before_hash = &util::filename_cat($domname, $before_hash);
968 $before_hash =~ s@\\@/@g; # for windows
969 }
970 else
971 {
972 # see if link shares directory with source document
973 # => turn into relative link if this is so!
974
975 if ($ENV{'GSDLOS'} =~ m/^windows/i) {
976 # too difficult doing a pattern match with embedded '\'s...
977 my $win_before_hash=$before_hash;
978 $win_before_hash =~ s@(\\)+@/@g;
979 # $base_dir is already similarly "converted" on windows.
980 if ($win_before_hash =~ s@^$base_dir/@@o) {
981 # if this is true, we removed a prefix
982 $before_hash=$win_before_hash;
983 }
984 }
985 else {
986 # before_hash has lost leading slash by this point,
987 # -> add back in prior to substitution with $base_dir
988 $before_hash = "/$before_hash";
989
990 $before_hash = &util::filename_cat("",$before_hash);
991 $before_hash =~ s@^$base_dir/@@;
992 }
993 }
994 } else {
995 # Turn relative file path into full path
996 my $dirname = &File::Basename::dirname($file);
997 $before_hash = &util::filename_cat($dirname, $before_hash);
998 $before_hash = $self->eval_dir_dots($before_hash);
999 }
1000
1001 my $linkfilename = &util::filename_cat ($base_dir, $before_hash);
1002 # make sure there's a slash on the end if it's a directory
1003 if ($before_hash !~ m/\/$/) {
1004 $before_hash .= "/" if (-d $linkfilename);
1005 }
1006 return ("http://" . $before_hash, $hash_part, 1);
1007 } else {
1008 # mailto, news, nntp, telnet, javascript or gopher link
1009 return ($before_hash, "", 0);
1010 }
1011}
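
# Illustrative sketch (not part of the plugin): typical (href, hash_part, rl)
# triples returned by format_link, using hypothetical paths.
#
#   # relative link that resolves against the directory of $file:
#   my ($href, $hash, $rl) = $self->format_link("pics/map.gif#area1", $base_dir, $file);
#   # -> ("http://<dir-of-$file>/pics/map.gif", "#area1", 1)
#
#   # absolute link to an external site that was not mirrored locally:
#   ($href, $hash, $rl) = $self->format_link("http://example.org/page.html", $base_dir, $file);
#   # -> ("http://example.org/page.html", "", 0)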
1012
1013sub extract_first_NNNN_characters {
1014 my $self = shift (@_);
1015 my ($textref, $doc_obj, $thissection) = @_;
1016
1017 foreach my $size (split /,/, $self->{'first'}) {
1018 my $tmptext = $$textref;
1019 # skip to the body
1020 $tmptext =~ s/.*<body[^>]*>//i;
1021 # remove javascript
1022 $tmptext =~ s@<script.*?</script>@ @sig;
1023 $tmptext =~ s/<[^>]*>/ /g;
1024 $tmptext =~ s/&nbsp;/ /g;
1025 $tmptext =~ s/^\s+//;
1026 $tmptext =~ s/\s+$//;
1027 $tmptext =~ s/\s+/ /gs;
1028 $tmptext = &unicode::substr ($tmptext, 0, $size);
1029 $tmptext =~ s/\s\S*$/&#8230;/; # adds an ellipsis (...)
1030 $doc_obj->add_utf8_metadata ($thissection, "First$size", $tmptext);
1031 }
1032}
1033
1034
1035sub extract_metadata {
1036 my $self = shift (@_);
1037 my ($textref, $metadata, $doc_obj, $section) = @_;
1038 my $outhandle = $self->{'outhandle'};
1039 # if we don't want metadata, we may as well not be here ...
1040 return if (!defined $self->{'metadata_fields'});
1041
1042 my $separator = $self->{'metadata_field_separator'};
1043 if ($separator eq "") {
1044 undef $separator;
1045 }
1046
1047 # metadata fields to extract/save. 'key' is the (lowercase) name of the
1048 # html meta, 'value' is the metadata name for greenstone to use
1049 my %find_fields = ();
1050
1051 my %creator_fields = (); # short-cut for lookups
1052
1053
1054 foreach my $field (split /,/, $self->{'metadata_fields'}) {
1055 $field =~ s/^\s+//; # remove leading whitespace
1056 $field =~ s/\s+$//; # remove trailing whitespace
1057
1058 # support tag<tagname>
1059 if ($field =~ m/^(.*?)\s*<(.*?)>$/) {
1060 # "$2" is the user's preferred gs metadata name
1061 $find_fields{lc($1)}=$2; # lc = lowercase
1062 } else { # no <tagname> for mapping
1063 # "$field" is the user's preferred gs metadata name
1064 $find_fields{lc($field)}=$field; # lc = lowercase
1065 }
1066 }
1067
1068 if (defined $self->{'hunt_creator_metadata'} &&
1069 $self->{'hunt_creator_metadata'} == 1 ) {
1070 my @extra_fields =
1071 (
1072 'author',
1073 'author.email',
1074 'creator',
1075 'dc.creator',
1076 'dc.creator.corporatename',
1077 );
1078
1079 # add the creator_metadata fields to search for
1080 foreach my $field (@extra_fields) {
1081 $creator_fields{$field}=0; # add to lookup hash
1082 }
1083 }
1084
1085
1086 # find the header in the html file, which has the meta tags
1087 $$textref =~ m@<head>(.*?)</head>@si;
1088
1089 my $html_header=$1;
1090
1091 # go through every <meta... tag defined in the html and see if it is
1092 # one of the tags we want to match.
1093
1094 # special case for title - we want to remember if it's been found
1095 my $found_title = 0;
1096 # this assumes that ">" won't appear. (I don't think it's allowed to...)
1097 $html_header =~ m/^/; # match the start of the string, for \G assertion
1098
1099 while ($html_header =~ m/\G.*?<meta(.*?)>/sig) {
1100 my $metatag=$1;
1101 my ($tag, $value);
1102
1103 # find the tag name
1104 $metatag =~ m/(?:name|http-equiv)\s*=\s*([\"\'])?(.*?)\1/is;
1105 $tag=$2;
1106 # in case they're not using " or ', but they should...
1107 if (! $tag) {
1108 $metatag =~ m/(?:name|http-equiv)\s*=\s*([^\s\>]+)/is;
1109 $tag=$1;
1110 }
1111
1112 if (!defined $tag) {
1113 print $outhandle "HTMLPlugin: can't find NAME in \"$metatag\"\n";
1114 next;
1115 }
1116
1117 # don't need to assign this field if it was passed in from a previous
1118 # (recursive) plugin
1119 if (defined $metadata->{$tag}) {next}
1120
1121 # find the tag content
1122 $metatag =~ m/content\s*=\s*([\"\'])?(.*?)\1/is;
1123 $value=$2;
1124
1125 if (! $value) {
1126 $metatag =~ m/(?:name|http-equiv)\s*=\s*([^\s\>]+)/is;
1127 $value=$1;
1128 }
1129 if (!defined $value) {
1130 print $outhandle "HTMLPlugin: can't find VALUE in \"$metatag\"\n";
1131 next;
1132 }
1133
1134 # clean up and add
1135 $value =~ s/\s+/ /gs;
1136 chomp($value); # remove trailing \n, if any
1137 if (exists $creator_fields{lc($tag)}) {
1138 # map this value onto greenstone's "Creator" metadata
1139 $tag='Creator';
1140 } elsif (!exists $find_fields{lc($tag)}) {
1141 next; # don't want this tag
1142 } else {
1143 # get the user's preferred capitalisation
1144 $tag = $find_fields{lc($tag)};
1145 }
1146 if (lc($tag) eq "title") {
1147 $found_title = 1;
1148 }
1149
1150 if ($self->{'verbosity'} > 2) {
1151 print $outhandle " extracted \"$tag\" metadata \"$value\"\n";
1152 }
1153
1154 if ($tag =~ /\./) {
1155 # there is a . so has a namespace, add ex.
1156 $tag = "ex.$tag";
1157 }
1158 if (defined $separator) {
1159 my @values = split($separator, $value);
1160 foreach my $v (@values) {
1161 $doc_obj->add_utf8_metadata($section, $tag, $v) if $v =~ /\S/;
1162 }
1163 }
1164 else {
1165 $doc_obj->add_utf8_metadata($section, $tag, $value);
1166 }
1167 }
1168
1169 # TITLE: extract the document title
1170 if (exists $find_fields{'title'} && !$found_title) {
1171 # we want a title, and didn't find one in the meta tags
1172 # see if there's a <title> tag
1173 my $title;
1174 my $from = ""; # for debugging output only
1175 if ($html_header =~ m/<title[^>]*>([^<]+)<\/title[^>]*>/is) {
1176 $title = $1;
1177 $from = "<title> tags";
1178 }
1179
1180 if (!defined $title) {
1181 $from = "first 100 chars";
1182 # if no title use first 100 or so characters
1183 $title = $$textref;
1184 $title =~ s/^\xFE\xFF//; # Remove unicode byte order mark
1185 $title =~ s/^.*?<body>//si;
1186 # ignore javascript!
1187 $title =~ s@<script.*?</script>@ @sig;
1188 $title =~ s/<\/([^>]+)><\1>//g; # (eg) </b><b> - no space
1189 $title =~ s/<[^>]*>/ /g; # remove all HTML tags
1190 $title = substr ($title, 0, 100);
1191 $title =~ s/\s\S*$/.../;
1192 }
1193 $title =~ s/<[^>]*>/ /g; # remove html tags
1194 $title =~ s/&nbsp;/ /g;
1195 $title =~ s/(?:&nbsp;|\xc2\xa0)/ /g; # utf-8 for nbsp...
1196 $title =~ s/\s+/ /gs; # collapse multiple spaces
1197 $title =~ s/^\s*//; # remove leading spaces
1198 $title =~ s/\s*$//; # remove trailing spaces
1199
1200 $title =~ s/^$self->{'title_sub'}// if ($self->{'title_sub'});
1201 $title =~ s/^\s+//s; # in case title_sub introduced any...
1202 $doc_obj->add_utf8_metadata ($section, "Title", $title);
1203 print $outhandle " extracted Title metadata \"$title\" from $from\n"
1204 if ($self->{'verbosity'} > 2);
1205 }
1206
1207 # add FileFormat metadata
1208 $doc_obj->add_metadata($section,"FileFormat", "HTML");
1209
1210 # Special, for metadata names such as tagH1 - extracts
1211 # the text between the first <H1> and </H1> tags into "H1" metadata.
1212
1213 foreach my $field (keys %find_fields) {
1214 if ($field !~ m/^tag([a-z0-9]+)$/i) {next}
1215 my $tag = $1;
1216 if ($$textref =~ m@<$tag[^>]*>(.*?)</$tag[^>]*>@g) {
1217 my $content = $1;
1218 $content =~ s/&nbsp;/ /g;
1219 $content =~ s/<[^>]*>/ /g;
1220 $content =~ s/^\s+//;
1221 $content =~ s/\s+$//;
1222 $content =~ s/\s+/ /gs;
1223 if ($content) {
1224 $tag=$find_fields{"tag$tag"}; # get the user's capitalisation
1225 $tag =~ s/^tag//i;
1226 $doc_obj->add_utf8_metadata ($section, $tag, $content);
1227 print $outhandle " extracted \"$tag\" metadata \"$content\"\n"
1228 if ($self->{'verbosity'} > 2);
1229 }
1230 }
1231 }
1232}
1233
1234
1235# evaluate any "../" to next directory up
1236# evaluate any "./" as here
1237sub eval_dir_dots {
1238 my $self = shift (@_);
1239 my ($filename) = @_;
1240 my $dirsep_os = &util::get_os_dirsep();
1241 my @dirsep = split(/$dirsep_os/,$filename);
1242
1243 my @eval_dirs = ();
1244 foreach my $d (@dirsep) {
1245 if ($d eq "..") {
1246 pop(@eval_dirs);
1247
1248 } elsif ($d eq ".") {
1249 # do nothing!
1250
1251 } else {
1252 push(@eval_dirs,$d);
1253 }
1254 }
1255
1256 # Need to fiddle with number of elements in @eval_dirs if the
1257 # first one is the empty string. This is because of a
1258 # modification to util::filename_cat that suppresses the addition
1259 # of a leading '/' character (or \ if windows) (intended to help
1260 # filename cat with relative paths) if the first entry in the
1261 # array is the empty string. Making the array start with *two*
1262 # empty strings is a way to defeat this "smart" option.
1263 #
1264 if (scalar(@eval_dirs) > 0) {
1265 if ($eval_dirs[0] eq ""){
1266 unshift(@eval_dirs,"");
1267 }
1268 }
1269
1270 my $evaluated_filename = (scalar @eval_dirs > 0) ? &util::filename_cat(@eval_dirs) : "";
1271 return $evaluated_filename;
1272}
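
# Illustrative sketch (not part of the plugin): what eval_dir_dots does with
# a couple of hypothetical paths (shown with '/' as the directory separator).
#
#   $self->eval_dir_dots("docs/./img/../pics/logo.gif"); # -> "docs/pics/logo.gif"
#   $self->eval_dir_dots("/var/www/../data/file.html");  # -> "/var/data/file.html"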
1273
1274sub replace_usemap_links {
1275 my $self = shift (@_);
1276 my ($front, $link, $back) = @_;
1277
1278 # remove quotes from link at start and end if necessary
1279 if ($link=~/^[\"\']/) {
1280 $link=~s/^[\"\']//;
1281 $link=~s/[\"\']$//;
1282 $front.='"';
1283 $back="\"$back";
1284 }
1285
1286 $link =~ s/^\.\///;
1287 return $front . $link . $back;
1288}
1289
1290sub inc_filecount {
1291 my $self = shift (@_);
1292
1293 if ($self->{'file_num'} == 1000) {
1294 $self->{'dir_num'} ++;
1295 $self->{'file_num'} = 0;
1296 } else {
1297 $self->{'file_num'} ++;
1298 }
1299}
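
# Illustrative sketch (not part of the plugin): with -rename_assoc_files set,
# add_file names associated files "<dir_num>/<file_num><ext>" and calls
# inc_filecount after each one, so a hypothetical run produces
#
#   0/0.gif, 0/1.jpg, ..., 0/1000.png, 1/0.css, 1/1.gif, ...
#
# i.e. dir_num rolls over once file_num reaches 1000.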
1300
1301
1302# Extend read_file so that strings like &eacute; are
1303# converted to UTF8 internally.
1304#
1305# We don't convert &lt; or &gt; or &amp; or &quot; in case
1306# they interfere with the GML files
1307
1308sub read_file {
1309 my $self = shift(@_);
1310 my ($filename, $encoding, $language, $textref) = @_;
1311
1312 $self->SUPER::read_file($filename, $encoding, $language, $textref);
1313
1314 # Convert entities to their Unicode code-point equivalents
1315 $$textref =~ s/&(lt|gt|amp|quot|nbsp);/&z$1;/go;
1316 $$textref =~ s/&([^;]+);/&ghtml::getcharequiv($1,1,1)/gseo;
1317 $$textref =~ s/&z(lt|gt|amp|quot|nbsp);/&$1;/go;
1318
1319}
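
# Illustrative sketch (not part of the plugin): effect of the substitutions
# in read_file on a hypothetical fragment.
#
#   "caf&eacute; &amp; bar &lt;b&gt;"   becomes   "café &amp; bar &lt;b&gt;"
#
# &eacute; is resolved to its Unicode character, while &lt;, &gt;, &amp;,
# &quot; and &nbsp; are protected by the &z... round-trip and left unchanged.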
1320
1321sub HB_read_html_file {
1322 my $self = shift (@_);
1323 my ($htmlfile, $text) = @_;
1324
1325 # load in the file
1326 if (!open (FILE, $htmlfile)) {
1327 print STDERR "ERROR - could not open $htmlfile\n";
1328 return;
1329 }
1330
1331 my $foundbody = 0;
1332 $self->HB_gettext (\$foundbody, $text, "FILE");
1333 close FILE;
1334
1335 # just in case there was no <body> tag
1336 if (!$foundbody) {
1337 $foundbody = 1;
1338 open (FILE, $htmlfile) || return;
1339 $self->HB_gettext (\$foundbody, $text, "FILE");
1340 close FILE;
1341 }
1342 # text is in utf8
1343}
1344
1345# converts the text to utf8, as ghtml does that for &eacute; etc.
1346sub HB_gettext {
1347 my $self = shift (@_);
1348 my ($foundbody, $text, $handle) = @_;
1349
1350 my $line = "";
1351 while (defined ($line = <$handle>)) {
1352 # look for body tag
1353 if (!$$foundbody) {
1354 if ($line =~ s/^.*<body[^>]*>//i) {
1355 $$foundbody = 1;
1356 } else {
1357 next;
1358 }
1359 }
1360
1361 # check for symbol fonts
1362 if ($line =~ m/<font [^>]*?face\s*=\s*\"?(\w+)\"?/i) {
1363 my $font = $1;
1364 print STDERR "HBPlug::HB_gettext - warning removed font $font\n"
1365 if ($font !~ m/^arial$/i);
1366 }
1367
1368 $$text .= $line;
1369 }
1370
1371 if ($self->{'input_encoding'} eq "iso_8859_1") {
1372 # convert to utf-8
1373 $$text=&unicode::unicode2utf8(&unicode::convert2unicode("iso_8859_1", $text));
1374 }
1375 # convert any alphanumeric character entities to their utf-8
1376 # equivalent for indexing purposes
1377 #&ghtml::convertcharentities ($$text);
1378
1379 $$text =~ s/\s+/ /g; # remove \n's
1380
1381 # At this point $$text is a binary byte string
1382 # => turn it into a Unicode aware string, so full
1383 # Unicode aware pattern matching can be used.
1384 # For instance: 's/\x{0101}//g' or '[[:upper:]]'
1385 #
1386
1387 $$text = decode("utf8",$$text);
1388}
1389
1390sub HB_clean_section {
1391 my $self = shift (@_);
1392 my ($section) = @_;
1393
1394 # remove tags without a starting tag from the section
1395 my ($tag, $tagstart);
1396 while ($section =~ m/<\/([^>]{1,10})>/) {
1397 $tag = $1;
1398 $tagstart = index($section, "<$tag");
1399 last if (($tagstart >= 0) && ($tagstart < index($section, "<\/$tag")));
1400 $section =~ s/<\/$tag>//;
1401 }
1402
1403 # remove extra paragraph tags
1404 while ($section =~ s/<p\b[^>]*>\s*<p\b/<p/ig) {}
1405
1406 # remove extra stuff at the end of the section
1407 while ($section =~ s/(<u>|<i>|<b>|<p\b[^>]*>|&nbsp;|\s)$//i) {}
1408
1409 # add a newline at the beginning of each paragraph
1410 $section =~ s/(.)\s*<p\b/$1\n\n<p/gi;
1411
1412 # add a newline every 80 characters at a word boundary
1413 # Note: this regular expression puts a line feed before
1414 # the last word in each section, even when it is not
1415 # needed.
1416 $section =~ s/(.{1,80})\s/$1\n/g;
1417
1418 # fix up the image links
1419 $section =~ s/<img[^>]*?src=\"?([^\">]+)\"?[^>]*>/
1420 <center><img src=\"$1\" \/><\/center><br\/>/ig;
1421 $section =~ s/&lt;&lt;I&gt;&gt;\s*([^\.]+\.(png|jpg|gif))/
1422 <center><img src=\"$1\" \/><\/center><br\/>/ig;
1423
1424 return $section;
1425}
1426
1427# Will convert the oldHDL format to the new HDL format (using the Section tag)
1428sub convert_to_newHDLformat
1429{
1430 my $self = shift (@_);
1431 my ($file,$cnfile) = @_;
1432 my $input_filename = $file;
1433 my $tmp_filename = $cnfile;
1434
1435 # write HTML tmp file with new HDL format
1436 open (PROD, ">$tmp_filename") || die("Error Writing to File: $tmp_filename $!");
1437
1438 # read in the file and do basic html cleaning (removing header etc)
1439 my $html = "";
1440 $self->HB_read_html_file ($input_filename, \$html);
1441
1442 # process the file one section at a time
1443 my $curtoclevel = 1;
1444 my $firstsection = 1;
1445 my $toclevel = 0;
1446 while (length ($html) > 0) {
1447 if ($html =~ s/^.*?(?:<p\b[^>]*>)?((<b>|<i>|<u>|\s)*)&lt;&lt;TOC(\d+)&gt;&gt;\s*(.*?)<p\b/<p/i) {
1448 $toclevel = $3;
1449 my $title = $4;
1450 my $sectiontext = "";
1451 if ($html =~ s/^(.*?)((?:<p\b[^>]*>)?((<b>|<i>|<u>|\s)*)&lt;&lt;TOC\d+&gt;&gt;)/$2/i) {
1452 $sectiontext = $1;
1453 } else {
1454 $sectiontext = $html;
1455 $html = "";
1456 }
1457
1458 # remove tags and extra spaces from the title
1459 $title =~ s/<\/?[^>]+>//g;
1460 $title =~ s/^\s+|\s+$//g;
1461
1462 # close any sections below the current level and
1463 # create a new section (special case for the firstsection)
1464 print PROD "<!--\n";
1465 while (($curtoclevel > $toclevel) ||
1466 (!$firstsection && $curtoclevel == $toclevel)) {
1467 $curtoclevel--;
1468 print PROD "</Section>\n";
1469 }
1470 if ($curtoclevel+1 < $toclevel) {
1471 print STDERR "WARNING - jump in toc levels in $input_filename " .
1472 "from $curtoclevel to $toclevel\n";
1473 }
1474 while ($curtoclevel < $toclevel) {
1475 $curtoclevel++;
1476 }
1477
1478 if ($curtoclevel == 1) {
1479 # add the header tag
1480 print PROD "-->\n";
1481 print PROD "<HTML>\n<HEAD>\n<TITLE>$title</TITLE>\n</HEAD>\n<BODY>\n";
1482 print PROD "<!--\n";
1483 }
1484
1485 print PROD "<Section>\n\t<Description>\n\t\t<Metadata name=\"Title\">$title</Metadata>\n\t</Description>\n";
1486
1487 print PROD "-->\n";
1488
1489 # clean up the section html
1490 $sectiontext = $self->HB_clean_section($sectiontext);
1491
1492 print PROD "$sectiontext\n";
1493
1494 } else {
1495 print STDERR "WARNING - leftover text\n" , $self->shorten($html),
1496 "\nin $input_filename\n";
1497 last;
1498 }
1499 $firstsection = 0;
1500 }
1501
1502 print PROD "<!--\n";
1503 while ($curtoclevel > 0) {
1504 $curtoclevel--;
1505 print PROD "</Section>\n";
1506 }
1507 print PROD "-->\n";
1508
1509 close (PROD) || die("Error Closing File: $tmp_filename $!");
1510
1511 return $tmp_filename;
1512}
1513
1514sub shorten {
1515 my $self = shift (@_);
1516 my ($text) = @_;
1517
1518 return "\"$text\"" if (length($text) < 100);
1519
1520 return "\"" . substr ($text, 0, 50) . "\" ... \"" .
1521 substr ($text, length($text)-50) . "\"";
1522}
1523
1524sub convert_tidy_or_oldHDL_file
1525{
1526 my $self = shift (@_);
1527 my ($file) = @_;
1528 my $input_filename = $file;
1529
1530 if (-d $input_filename)
1531 {
1532 return $input_filename;
1533 }
1534
1535 # get the input filename
1536 my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
1537 my $base_dirname = $dirname;
1538 $suffix = lc($suffix);
1539
1540 # derive tmp filename from input filename
1541 # Remove any white space from filename -- no risk of name collision, and
1542 # makes later conversion by utils simpler. Leave spaces in path...
1543 # i.e. strip out any spaces, dots and hyphens from the filename
1544 $tailname =~ s/\s+//g;
1545 $tailname =~ s/\.+//g;
1546 $tailname =~ s/\-+//g;
1547 # convert to utf-8 otherwise we have problems with the doc.xml file
1548 # later on
1549 &unicode::ensure_utf8(\$tailname);
1550
1551 # softlink to collection tmp dir
1552 my $tmp_dirname = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "tidytmp");
1553 &util::mk_dir($tmp_dirname) if (!-e $tmp_dirname);
1554
1555 my $test_dirname = "";
1556 my $f_separator = &util::get_os_dirsep();
1557
1558 if ($dirname =~ m/import$f_separator/)
1559 {
1560 $test_dirname = $'; #'
1561
1562 #print STDERR "init $'\n";
1563
1564 while ($test_dirname =~ m/[$f_separator]/)
1565 {
1566 my $folderdirname = $`;
1567 $tmp_dirname = &util::filename_cat($tmp_dirname,$folderdirname);
1568 &util::mk_dir($tmp_dirname) if (!-e $tmp_dirname);
1569 $test_dirname = $'; #'
1570 }
1571 }
1572
1573 my $tmp_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix");
1574
1575 # tidy or convert the input file if it is an HTML-like file or it is accepted by the process_exp
1576 if (($suffix eq ".htm") || ($suffix eq ".html") || ($suffix eq ".shtml"))
1577 {
1578 #convert the input file to a new style HDL
1579 my $hdl_output_filename = $input_filename;
1580 if ($self->{'old_style_HDL'})
1581 {
1582 $hdl_output_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix");
1583 $hdl_output_filename = $self->convert_to_newHDLformat($input_filename,$hdl_output_filename);
1584 }
1585
1586 # just to be safe, copy all other files from the base dir to the tmp dir if they do not already exist
1587 opendir(DIR,$base_dirname) or die "Can't open base directory : $base_dirname!";
1588 my @files = grep {!/^\.+$/} readdir(DIR);
1589 closedir(DIR);
1590
1591 foreach my $file (@files)
1592 {
1593 my $src_file = &util::filename_cat($base_dirname,$file);
1594 my $dest_file = &util::filename_cat($tmp_dirname,$file);
1595 if ((!-e $dest_file) && (!-d $src_file))
1596 {
1597 # just copy the original file back to the tmp directory
1598 copy($src_file,$dest_file) or die "Can't copy file $src_file to $dest_file $!";
1599 }
1600 }
1601
1602 # tidy the input file
1603 my $tidy_output_filename = $hdl_output_filename;
1604 if ($self->{'use_realistic_book'})
1605 {
1606 $tidy_output_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix");
1607 $tidy_output_filename = $self->tmp_tidy_file($hdl_output_filename,$tidy_output_filename);
1608 }
1609 $tmp_filename = $tidy_output_filename;
1610 }
1611 else
1612 {
1613 if (!-e $tmp_filename)
1614 {
1615 # just copy the original file back to the tmp directory
1616 copy($input_filename,$tmp_filename) or die "Can't copy file $input_filename to $tmp_filename $!";
1617 }
1618 }
1619
1620 return $tmp_filename;
1621}
1622
1623
1624# Will turn the html input file into a proper XML file, with font tags removed and
1625# image sizes added to the img tags.
1626# The tidying process takes place in a collection specific 'tmp' directory so
1627# that we don't accidentally damage the input.
1628sub tmp_tidy_file
1629{
1630 my $self = shift (@_);
1631 my ($file,$cnfile) = @_;
1632 my $input_filename = $file;
1633 my $tmp_filename = $cnfile;
1634
1635 # get the input filename
1636 my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
1637
1638 require HTML::TokeParser::Simple;
1639
1640 # create HTML parser to decode the input file
1641 my $parser = HTML::TokeParser::Simple->new($input_filename);
1642
1643 # write the HTML tmp file with font tags removed and image sizes added to the img tags
1644 open (PROD, ">$tmp_filename") || die("Error Writing to File: $tmp_filename $!");
1645 while (my $token = $parser->get_token())
1646 {
1647 # is it an img tag
1648 if ($token->is_start_tag('img'))
1649 {
1650 # get the attributes
1651 my $attr = $token->return_attr;
1652
1653 # get the full path to the image
1654 my $img_file = &util::filename_cat($dirname,$attr->{src});
1655
1656 # set the width and height attribute
1657 ($attr->{width}, $attr->{height}) = imgsize($img_file);
1658
1659 # recreate the tag
1660 print PROD "<img";
1661 print PROD map { qq { $_="$attr->{$_}"} } keys %$attr;
1662 print PROD ">";
1663 }
1664 # is it a font tag
1665 else
1666 {
1667 if (($token->is_start_tag('font')) || ($token->is_end_tag('font')))
1668 {
1669 # remove font tag
1670 print PROD "";
1671 }
1672 else
1673 {
1674 # print without changes
1675 print PROD $token->as_is;
1676 }
1677 }
1678 }
1679 close (PROD) || die("Error Closing File: $tmp_filename $!");
1680
1681 # run html-tidy on the tmp file to make it a proper XML file
1682
1683 my $outhandle = $self->{'outhandle'};
1684 print $outhandle "Converting HTML to be XML compliant:\n";
1685
1686 my $tidy_cmd = "tidy";
1687 $tidy_cmd .= " -q" if ($self->{'verbosity'} <= 2);
1688 $tidy_cmd .= " -raw -wrap 0 -asxml \"$tmp_filename\"";
1689 if ($self->{'verbosity'} <= 2) {
1690 if ($ENV{'GSDLOS'} =~ m/^windows/i) {
1691 $tidy_cmd .= " 2>nul";
1692 }
1693 else {
1694 $tidy_cmd .= " 2>/dev/null";
1695 }
1696 print $outhandle " => $tidy_cmd\n";
1697 }
1698
1699 my $tidyfile = `$tidy_cmd`;
1700
1701 # write result back to the tmp file
1702 open (PROD, ">$tmp_filename") || die("Error Writing to File: $tmp_filename $!");
1703 print PROD $tidyfile;
1704 close (PROD) || die("Error Closing File: $tmp_filename $!");
1705
1706 # return the output filename
1707 return $tmp_filename;
1708}
1709
1710sub associate_cover_image
1711{
1712 my $self = shift(@_);
1713 my ($doc_obj, $filename) = @_;
1714 if (($self->{'use_realistic_book'}) || ($self->{'old_style_HDL'}))
1715 {
1716 # we will have cover image in tidytmp, but want it from import
1717 $filename =~ s/([\\\/])tidytmp([\\\/])/$1import$2/;
1718 }
1719 $self->SUPER::associate_cover_image($doc_obj, $filename);
1720}
1721
1722
17231;