Context Navigation

source: main/trunk/greenstone2/perllib/plugins/HTMLPlugin.pm@ 23347

Last change on this file since 23347 was 23347, checked in by davidb, 13 years ago
Tidy up of debugging statements for handline filename encodings, plus finishing off the 'deduce_filename_encoding' routine
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 57.1 KB

Line
1	###########################################################################
2	#
3	# HTMLPlugin.pm -- basic html plugin
4	#
5	# A component of the Greenstone digital library software
6	# from the New Zealand Digital Library Project at the
7	# University of Waikato, New Zealand.
8	#
9	# Copyright (C) 1999 New Zealand Digital Library Project
10	#
11	# This program is free software; you can redistribute it and/or modify
12	# it under the terms of the GNU General Public License as published by
13	# the Free Software Foundation; either version 2 of the License, or
14	# (at your option) any later version.
15	#
16	# This program is distributed in the hope that it will be useful,
17	# but WITHOUT ANY WARRANTY; without even the implied warranty of
18	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	# GNU General Public License for more details.
20	#
21	# You should have received a copy of the GNU General Public License
22	# along with this program; if not, write to the Free Software
23	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24	#
25	###########################################################################
26
27	#
28	# Note that this plugin handles frames only in a very simple way
29	# i.e. each frame is treated as a separate document. This means
30	# search results will contain links to individual frames rather
31	# than linking to the top level frameset.
32	# There may also be some problems caused by the _parent target
33	# (it's removed by this plugin)
34	#
35
36	package HTMLPlugin;
37
38	use Encode;
39
40	use ReadTextFile;
41	use HBPlugin;
42	use ghtml;
43	use unicode;
44	use util;
45	use XMLParser;
46
47	use Image::Size;
48	use File::Copy;
49
50	sub BEGIN {
51	@HTMLPlugin::ISA = ('ReadTextFile', 'HBPlugin');
52	}
53
54	use strict; # every perl program should have this!
55	no strict 'refs'; # make an exception so we can use variables as filehandles
56
57	my $arguments =
58	[ { 'name' => "process_exp",
59	'desc' => "{BasePlugin.process_exp}",
60	'type' => "regexp",
61	'deft' => &get_default_process_exp() },
62	{ 'name' => "block_exp",
63	'desc' => "{BasePlugin.block_exp}",
64	'type' => 'regexp',
65	'deft' => &get_default_block_exp() },
66	{ 'name' => "nolinks",
67	'desc' => "{HTMLPlugin.nolinks}",
68	'type' => "flag" },
69	{ 'name' => "keep_head",
70	'desc' => "{HTMLPlugin.keep_head}",
71	'type' => "flag" },
72	{ 'name' => "no_metadata",
73	'desc' => "{HTMLPlugin.no_metadata}",
74	'type' => "flag" },
75	{ 'name' => "metadata_fields",
76	'desc' => "{HTMLPlugin.metadata_fields}",
77	'type' => "string",
78	'deft' => "Title" },
79	{ 'name' => "metadata_field_separator",
80	'desc' => "{HTMLPlugin.metadata_field_separator}",
81	'type' => "string",
82	'deft' => "" },
83	{ 'name' => "hunt_creator_metadata",
84	'desc' => "{HTMLPlugin.hunt_creator_metadata}",
85	'type' => "flag" },
86	{ 'name' => "file_is_url",
87	'desc' => "{HTMLPlugin.file_is_url}",
88	'type' => "flag" },
89	{ 'name' => "assoc_files",
90	'desc' => "{HTMLPlugin.assoc_files}",
91	'type' => "regexp",
92	'deft' => &get_default_block_exp() },
93	{ 'name' => "rename_assoc_files",
94	'desc' => "{HTMLPlugin.rename_assoc_files}",
95	'type' => "flag" },
96	{ 'name' => "title_sub",
97	'desc' => "{HTMLPlugin.title_sub}",
98	'type' => "string",
99	'deft' => "" },
100	{ 'name' => "description_tags",
101	'desc' => "{HTMLPlugin.description_tags}",
102	'type' => "flag" },
103	# retain this for backward compatibility (w3mir option was replaced by
104	# file_is_url)
105	{ 'name' => "w3mir",
106	# 'desc' => "{HTMLPlugin.w3mir}",
107	'type' => "flag",
108	'hiddengli' => "yes"},
109	{ 'name' => "no_strip_metadata_html",
110	'desc' => "{HTMLPlugin.no_strip_metadata_html}",
111	'type' => "string",
112	'deft' => "",
113	'reqd' => "no"},
114	{ 'name' => "sectionalise_using_h_tags",
115	'desc' => "{HTMLPlugin.sectionalise_using_h_tags}",
116	'type' => "flag" },
117	{ 'name' => "use_realistic_book",
118	'desc' => "{HTMLPlugin.tidy_html}",
119	'type' => "flag"},
120	{ 'name' => "old_style_HDL",
121	'desc' => "{HTMLPlugin.old_style_HDL}",
122	'type' => "flag"},
123	{'name' => "processing_tmp_files",
124	'desc' => "{BasePlugin.processing_tmp_files}",
125	'type' => "flag",
126	'hiddengli' => "yes"}
127	];
128
129	my $options = { 'name' => "HTMLPlugin",
130	'desc' => "{HTMLPlugin.desc}",
131	'abstract' => "no",
132	'inherits' => "yes",
133	'args' => $arguments };
134
135
136	sub new {
137	my ($class) = shift (@_);
138	my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
139	push(@$pluginlist, $class);
140
141	push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
142	push(@{$hashArgOptLists->{"OptList"}},$options);
143
144
145	my $self = new ReadTextFile($pluginlist,$inputargs,$hashArgOptLists);
146
147	if ($self->{'w3mir'}) {
148	$self->{'file_is_url'} = 1;
149	}
150	$self->{'aux_files'} = {};
151	$self->{'dir_num'} = 0;
152	$self->{'file_num'} = 0;
153
154	return bless $self, $class;
155	}
156
157	# may want to use (?i)\.(gif\|jpe?g\|jpe\|png\|css\|js(?:@.*)?)$
158	# if have eg <script language="javascript" src="img/lib.js@123">
159	# blocking is now done by reading through the file and recording all the
160	# images and other files
161	sub get_default_block_exp {
162	my $self = shift (@_);
163
164	#return q^(?i)\.(gif\|jpe?g\|jpe\|jpg\|png\|css)$^;
165	return "";
166	}
167
168	sub get_default_process_exp {
169	my $self = shift (@_);
170
171	# the last option is an attempt to encode the concept of an html query ...
172	return q^(?i)(\.html?\|\.shtml\|\.shm\|\.asp\|\.php\d?\|\.cgi\|.+\?.+=.*)$^;
173	}
174
175	sub store_block_files
176	{
177	my $self =shift (@_);
178	my ($filename_full_path, $block_hash) = @_;
179
180	my $html_fname = $filename_full_path;
181	my @file_blocks;
182
183	my ($language, $content_encoding) = $self->textcat_get_language_encoding ($filename_full_path);
184	$self->{'store_content_encoding'}->{$filename_full_path} = $content_encoding;
185
186	# read in file ($text will be in utf8)
187	my $raw_text = "";
188	$self->read_file_no_decoding ($filename_full_path, \$raw_text);
189
190	my $textref = \$raw_text;
191	my $opencom = '(?:<!--\|<!(?:—\|\|--))';
192	my $closecom = '(?:-->\|(?:—\|\|--)>)';
193	$$textref =~ s/$opencom(.*?)$closecom//gs;
194
195	my $attval = "\\\"[^\\\"]+\\\"\|[^\\s>]+";
196	my @img_matches = ($$textref =~ m/<img[^>]?src\s=\s($attval)[^>]>/igs);
197	my @usemap_matches = ($$textref =~ m/<img[^>]?usemap\s=\s($attval)[^>]>/igs);
198	my @link_matches = ($$textref =~ m/<link[^>]?href\s=\s($attval)[^>]>/igs);
199	my @embed_matches = ($$textref =~ m/<embed[^>]?src\s=\s($attval)[^>]>/igs);
200	my @tabbg_matches = ($$textref =~ m/<(?:body\|table\|tr\|td)[^>]?background\s=\s($attval)[^>]>/igs);
201	my @script_matches = ($$textref =~ m/<script[^>]?src\s=\s($attval)[^>]>/igs);
202
203	if(!defined $self->{'utf8_to_original_filename'}) {
204	# maps from utf8 converted link name -> original filename referrred to by (possibly URL-encoded) src url
205	$self->{'utf8_to_original_filename'} = {};
206	}
207
208	foreach my $link (@img_matches, @usemap_matches, @link_matches, @embed_matches, @tabbg_matches, @script_matches) {
209
210	# remove quotes from link at start and end if necessary
211	if ($link=~/^\"/) {
212	$link=~s/^\"//;
213	$link=~s/\"$//;
214	}
215
216	$link =~ s/\#.*$//s; # remove any anchor names, e.g. foo.html#name becomes foo.html
217	# some links may just be anchor names
218	next unless ($link =~ /\S+/);
219
220	if ($link !~ m@^/@ && $link !~ m/^([A-Z]:?)\\/) {
221	# Turn relative file path into full path
222	my $dirname = &File::Basename::dirname($filename_full_path);
223	$link = &util::filename_cat($dirname, $link);
224	}
225	$link = $self->eval_dir_dots($link);
226
227	# this is the actual filename on the filesystem (that the link refers to)
228	my $url_original_filename = $self->opt_url_decode($link);
229
230	# Convert the url_original_filename into its utf8 version. Store the utf8 link along with the url_original_filename
231	my $utf8_link = "";
232	$self->decode_text($link,$content_encoding,$language,\$utf8_link);
233
234	$self->{'utf8_to_original_filename'}->{$utf8_link} = $url_original_filename;
235	# print STDERR "**** utf8_encoded_link to original src filename:\n\t$utf8_link\n\t".$self->{'utf8_to_original_filename'}->{$utf8_link}."\n";
236
237	if ($url_original_filename ne $utf8_link) {
238	my $outhandle = $self->{'outhandle'};
239
240	print $outhandle "URL Encoding $url_original_filename\n";
241	print $outhandle " ->$utf8_link\n";
242	}
243
244	$block_hash->{'file_blocks'}->{$url_original_filename} = 1;
245	}
246	}
247
248	# Given a filename in any encoding, will URL decode it to get back the original filename
249	# in the original encoding. Because this method is intended to work out the original
250	# filename, it does not URL decode any filename if a file by the name of the URL-encoded*
251	# string already exists in the local folder.
252	# Return the original filename corresponding to the parameter URL-encoded filename, and
253	# a decoded flag that is set to true iff URL-decoding had to be applied.
254	sub opt_url_decode {
255	my $self = shift (@_);
256	my ($link) = @_;
257
258	# Replace %XX's in URL with decoded value if required.
259	# Note that the filename may include the %XX in some situations
260	if ($link =~ m/\%[A-F0-9]{2}/i) {
261	if (!-e $link) {
262	$link = &unicode::url_decode($link);
263	}
264	}
265
266	return $link;
267	}
268
269	sub read_into_doc_obj
270	{
271	my $self = shift (@_);
272	my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
273
274	my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
275
276	# Lookup content_encoding worked out in file_block pass for this file
277	# Store it under the local name 'content_encoding' so its nice and
278	# easy to access
279	$self->{'content_encoding'} = $self->{'store_content_encoding'}->{$filename_full_path};
280
281	# get the input file
282	my $input_filename = $file;
283	my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
284	$suffix = lc($suffix);
285	my $tidy_filename;
286	if (($self->{'use_realistic_book'}) \|\| ($self->{'old_style_HDL'}))
287	{
288	# because the document has to be sectionalized set the description tags
289	$self->{'description_tags'} = 1;
290
291	# set the file to be tidied
292	$input_filename = &util::filename_cat($base_dir,$file) if $base_dir =~ m/\w/;
293
294	# get the tidied file
295	#my $tidy_filename = $self->tmp_tidy_file($input_filename);
296	$tidy_filename = $self->convert_tidy_or_oldHDL_file($input_filename);
297
298	# derive tmp filename from input filename
299	my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($tidy_filename, "\\.[^\\.]+\$");
300
301	# set the new input file and base_dir to be from the tidied file
302	$file = "$tailname$suffix";
303	$base_dir = $dirname;
304	}
305
306	# call the parent read_into_doc_obj
307	my ($process_status,$doc_obj) = $self->SUPER::read_into_doc_obj($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli);
308	if (($self->{'use_realistic_book'}) \|\| ($self->{'old_style_HDL'}))
309	{
310	# now we need to reset the filenames in the doc obj so that the converted filenames are not used
311	my $collect_file = &util::filename_within_collection($filename_full_path);
312	$doc_obj->set_source_filename ($collect_file, $self->{'file_rename_method'});
313	## set_source_filename does not set the doc_obj source_path which is used in archives dbs for incremental
314	# build. so set it manually.
315	$doc_obj->{'source_path'} = $filename_full_path;
316	my $collect_conv_file = &util::filename_within_collection($tidy_filename);
317	$doc_obj->set_converted_filename($collect_conv_file);
318	$self->set_Source_metadata($doc_obj, $filename_no_path);
319	}
320
321	delete $self->{'store_content_encoding'}->{$filename_full_path};
322	$self->{'content_encoding'} = undef;
323
324	return ($process_status,$doc_obj);
325	}
326
327	# do plugin specific processing of doc_obj
328	sub process {
329	my $self = shift (@_);
330	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
331	my $outhandle = $self->{'outhandle'};
332
333	if ($ENV{'GSDLOS'} =~ m/^windows/i) {
334	# this makes life so much easier... perl can cope with unix-style '/'s.
335	$base_dir =~ s@(\\)+@/@g;
336	$file =~ s@(\\)+@/@g;
337	}
338
339	# reset per-doc stuff...
340	$self->{'aux_files'} = {};
341	$self->{'dir_num'} = 0;
342	$self->{'file_num'} = 0;
343
344	# process an HTML file where sections are divided by headings tags (H1, H2 ...)
345	# you can also include metadata in the format (X can be any number)
346	# <hX>Title<!--gsdl-metadata
347	# <Metadata name="name1">value1</Metadata>
348	# ...
349	# <Metadata name="nameN">valueN</Metadata>
350	#--></hX>
351	if ($self->{'sectionalise_using_h_tags'}) {
352	# description_tags should allways be activated because we convert headings to description tags
353	$self->{'description_tags'} = 1;
354
355	my $arrSections = [];
356	$$textref =~ s/<h([0-9]+)[^>]>(.?)<\/h[0-9]+>/$self->process_heading($1, $2, $arrSections, $file)/isge;
357
358	if (scalar(@$arrSections)) {
359	my $strMetadata = $self->update_section_data($arrSections, -1);
360	if (length($strMetadata)) {
361	$strMetadata = '<!--' . $strMetadata . "\n-->\n</body>";
362	$$textref =~ s/<\/body>/$strMetadata/ig;
363	}
364	}
365	}
366
367	my $cursection = $doc_obj->get_top_section();
368
369	$self->extract_metadata ($textref, $metadata, $doc_obj, $cursection)
370	unless $self->{'no_metadata'} \|\| $self->{'description_tags'};
371
372	# Store URL for page as metadata - this can be used for an
373	# altavista style search interface. The URL won't be valid
374	# unless the file structure contains the domain name (i.e.
375	# like when w3mir is used to download a website).
376
377	# URL metadata (even invalid ones) are used to support internal
378	# links, so even if 'file_is_url' is off, still need to store info
379
380	my ($tailname,$dirname) = &File::Basename::fileparse($file);
381
382	# my $utf8_file = $self->filename_to_utf8_metadata($file);
383	# $utf8_file =~ s/&\#095;/_/g;
384	my $utf8_file = &unicode::raw_filename_to_url_encoded($tailname);
385
386	if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
387	print STDERR "***!! file = $file\n";
388	print STDERR "***!! utf8_file = $utf8_file\n";
389	}
390
391
392	my $web_url = "http://";
393	if(defined $dirname) { # local directory
394	# Check for "ftp" in the domain name of the directory
395	# structure to determine if this URL should be a ftp:// URL
396	# This check is not infallible, but better than omitting the
397	# check, which would cause all files downloaded from ftp sites
398	# via mirroring with wget to have potentially erroneous http:// URLs
399	# assigned in their metadata
400	if ($dirname =~ /^[^\/]*ftp/i)
401	{
402	$web_url = "ftp://";
403	}
404	$dirname = $self->eval_dir_dots($dirname);
405	$dirname .= &util::get_dirsep() if $dirname ne ""; # if there's a directory, it should end on "/"
406	$web_url = $web_url.$dirname.$utf8_file;
407	} else {
408	$web_url = $web_url.$utf8_file;
409	}
410	$web_url =~ s/\\/\//g;
411	$doc_obj->add_utf8_metadata($cursection, "URL", $web_url);
412
413	if ($self->{'file_is_url'}) {
414	$doc_obj->add_metadata($cursection, "weblink", "<a href=\"$web_url\">");
415	$doc_obj->add_metadata($cursection, "webicon", "_iconworld_");
416	$doc_obj->add_metadata($cursection, "/weblink", "</a>");
417	}
418
419	if ($self->{'description_tags'}) {
420	# remove the html header - note that doing this here means any
421	# sections defined within the header will be lost (so all <Section>
422	# tags must appear within the body of the HTML)
423	my ($head_keep) = ($$textref =~ m/^(.?)<body[^>]>/is);
424
425	$$textref =~ s/^.?<body[^>]>//is;
426	$$textref =~ s/(<\/body[^>]>\|<\/html[^>]>)//isg;
427
428	my $opencom = '(?:<!--\|<!(?:—\|\|--))';
429	my $closecom = '(?:-->\|(?:—\|\|--)>)';
430
431	my $lt = '(?:<\|<)';
432	my $gt = '(?:>\|>)';
433	my $quot = '(?:"\|"\|”\|“)';
434
435	my $dont_strip = '';
436	if ($self->{'no_strip_metadata_html'}) {
437	($dont_strip = $self->{'no_strip_metadata_html'}) =~ s{,}{\|}g;
438	}
439
440	my $found_something = 0; my $top = 1;
441	while ($$textref =~ s/^(.?)$opencom(.?)$closecom//s) {
442	my $text = $1;
443	my $comment = $2;
444	if (defined $text) {
445	# text before a comment - note that getting to here
446	# doesn't necessarily mean there are Section tags in
447	# the document
448	$self->process_section(\$text, $base_dir, $file, $doc_obj, $cursection);
449	}
450	while ($comment =~ s/$lt(.*?)$gt//s) {
451	my $tag = $1;
452	if ($tag eq "Section") {
453	$found_something = 1;
454	$cursection = $doc_obj->insert_section($doc_obj->get_end_child($cursection)) unless $top;
455	$top = 0;
456	} elsif ($tag eq "/Section") {
457	$found_something = 1;
458	$cursection = $doc_obj->get_parent_section ($cursection);
459	} elsif ($tag =~ m/^Metadata name=$quot(.*?)$quot/s) {
460	my $metaname = $1;
461	my $accumulate = $tag =~ m/mode=${quot}accumulate${quot}/ ? 1 : 0;
462	$comment =~ s/^(.*?)$lt\/Metadata$gt//s;
463	my $metavalue = $1;
464	$metavalue =~ s/^\s+//;
465	$metavalue =~ s/\s+$//;
466	# assume that no metadata value intentionally includes
467	# carriage returns or HTML tags (if they're there they
468	# were probably introduced when converting to HTML from
469	# some other format).
470	# actually some people want to have html tags in their
471	# metadata.
472	$metavalue =~ s/[\cJ\cM]/ /sg;
473	$metavalue =~ s/<[^>]+>//sg
474	unless $dont_strip && ($dont_strip eq 'all' \|\| $metaname =~ m/^($dont_strip)$/);
475	$metavalue =~ s/\s+/ /sg;
476	if ($metaname =~ /\./) { # has a namespace
477	$metaname = "ex.$metaname";
478	}
479	if ($accumulate) {
480	$doc_obj->add_utf8_metadata($cursection, $metaname, $metavalue);
481	} else {
482	$doc_obj->set_utf8_metadata_element($cursection, $metaname, $metavalue);
483	}
484	} elsif ($tag eq "Description" \|\| $tag eq "/Description") {
485	# do nothing with containing Description tags
486	} else {
487	# simple HTML tag (probably created by the conversion
488	# to HTML from some other format) - we'll ignore it and
489	# hope for the best ;-)
490	}
491	}
492	}
493	if ($cursection ne "") {
494	print $outhandle "HTMLPlugin: WARNING: $file contains unmatched <Section></Section> tags\n";
495	}
496
497	$$textref =~ s/^.?<body[^>]>//is;
498	$$textref =~ s/(<\/body[^>]>\|<\/html[^>]>)//isg;
499	if ($$textref =~ m/\S/) {
500	if (!$found_something) {
501	if ($self->{'verbosity'} > 2) {
502	print $outhandle "HTMLPlugin: WARNING: $file appears to contain no Section tags so\n";
503	print $outhandle " will be processed as a single section document\n";
504	}
505
506	# go ahead and process single-section document
507	$self->process_section($textref, $base_dir, $file, $doc_obj, $cursection);
508
509	# if document contains no Section tags we'll go ahead
510	# and extract metadata (this won't have been done
511	# above as the -description_tags option prevents it)
512	my $complete_text = $head_keep.$doc_obj->get_text($cursection);
513	$self->extract_metadata (\$complete_text, $metadata, $doc_obj, $cursection)
514	unless $self->{'no_metadata'};
515
516	} else {
517	print $outhandle "HTMLPlugin: WARNING: $file contains the following text outside\n";
518	print $outhandle " of the final closing </Section> tag. This text will\n";
519	print $outhandle " be ignored.";
520
521	my ($text);
522	if (length($$textref) > 30) {
523	$text = substr($$textref, 0, 30) . "...";
524	} else {
525	$text = $$textref;
526	}
527	$text =~ s/\n/ /isg;
528	print $outhandle " ($text)\n";
529	}
530	} elsif (!$found_something) {
531
532	if ($self->{'verbosity'} > 2) {
533	# may get to here if document contained no valid Section
534	# tags but did contain some comments. The text will have
535	# been processed already but we should print the warning
536	# as above and extract metadata
537	print $outhandle "HTMLPlugin: WARNING: $file appears to contain no Section tags and\n";
538	print $outhandle " is blank or empty. Metadata will be assigned if present.\n";
539	}
540
541	my $complete_text = $head_keep.$doc_obj->get_text($cursection);
542	$self->extract_metadata (\$complete_text, $metadata, $doc_obj, $cursection)
543	unless $self->{'no_metadata'};
544	}
545
546	} else {
547
548	# remove header and footer
549	if (!$self->{'keep_head'} \|\| $self->{'description_tags'}) {
550	$$textref =~ s/^.?<body[^>]>//is;
551	$$textref =~ s/(<\/body[^>]>\|<\/html[^>]>)//isg;
552	}
553
554	# single section document
555	$self->process_section($textref, $base_dir, $file, $doc_obj, $cursection);
556	}
557
558	return 1;
559	}
560
561
562	sub process_heading
563	{
564	my ($self, $nHeadNo, $strHeadingText, $arrSections, $file) = @_;
565	$strHeadingText = '' if (!defined($strHeadingText));
566
567	my $strMetadata = $self->update_section_data($arrSections, int($nHeadNo));
568
569	my $strSecMetadata = '';
570	while ($strHeadingText =~ s/<!--gsdl-metadata(.*?)-->//is)
571	{
572	$strSecMetadata .= $1;
573	}
574
575	$strHeadingText =~ s/^\s+//g;
576	$strHeadingText =~ s/\s+$//g;
577	$strSecMetadata =~ s/^\s+//g;
578	$strSecMetadata =~ s/\s+$//g;
579
580	$strMetadata .= "\n<Section>\n\t<Description>\n\t\t<Metadata name=\"Title\">" . $strHeadingText . "</Metadata>\n";
581
582	if (length($strSecMetadata)) {
583	$strMetadata .= "\t\t" . $strSecMetadata . "\n";
584	}
585
586	$strMetadata .= "\t</Description>\n";
587
588	return "<!--" . $strMetadata . "-->";
589	}
590
591
592	sub update_section_data
593	{
594	my ($self, $arrSections, $nCurTocNo) = @_;
595	my ($strBuffer, $nLast, $nSections) = ('', 0, scalar(@$arrSections));
596
597	if ($nSections == 0) {
598	push @$arrSections, $nCurTocNo;
599	return $strBuffer;
600	}
601	$nLast = $arrSections->[$nSections - 1];
602	if ($nCurTocNo > $nLast) {
603	push @$arrSections, $nCurTocNo;
604	return $strBuffer;
605	}
606	for(my $i = $nSections - 1; $i >= 0; $i--) {
607	if ($nCurTocNo <= $arrSections->[$i]) {
608	$strBuffer .= "\n</Section>";
609	pop @$arrSections;
610	}
611	}
612	push @$arrSections, $nCurTocNo;
613	return $strBuffer;
614	}
615
616
617	# note that process_section may be called multiple times for a single
618	# section (relying on the fact that add_utf8_text appends the text to any
619	# that may exist already).
620	sub process_section {
621	my $self = shift (@_);
622	my ($textref, $base_dir, $file, $doc_obj, $cursection) = @_;
623	# trap links
624	if (!$self->{'nolinks'}) {
625	# usemap="./#index" not handled correctly => change to "#index"
626	## $$textref =~ s/(<img[^>]?usemap\s=\s[\"\']?)([^\"\'>\s]+)([\"\']?[^>]>)/
627
628	$$textref =~ s/(<img[^>]?usemap\s=\s)((?:[\"][^\"]+[\"])\|(?:[\'][^\']+[\'])\|(?:[^\s\/>]+))([^>]>)/
629	$self->replace_usemap_links($1, $2, $3)/isge;
630
631	## $$textref =~ s/(<(?:a\|area\|frame\|link\|script)\s+[^>]?\s(?:href\|src)\s=\s[\"\']?)([^\"\'>\s]+)([\"\']?[^>]*>)/
632
633	$$textref =~ s/(<(?:a\|area\|frame\|link\|script)\s+[^>]?\s(?:href\|src)\s=\s)((?:[\"][^\"]+[\"])\|(?:[\'][^\']+[\'])\|(?:[^\s\/>]+))([^>]*>)/
634	$self->replace_href_links ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
635	}
636
637	# trap images
638
639	# Previously, by default, HTMLPlugin would embed <img> tags inside anchor tags
640	# i.e. <a href="image><img src="image"></a> in order to overcome a problem that
641	# turned regular text succeeding images into links. That is, by embedding <imgs>
642	# inside <a href=""></a>, the text following images were no longer misbehaving.
643	# However, there would be many occasions whereby images were not meant to link
644	# to their source images but where the images would link to another web page.
645	# To allow this, the no_image_links option was introduced: it would prevent
646	# the behaviour of embedding images into links that referenced the source images.
647
648	# Somewhere along the line, the problem of normal text turning into links when
649	# such text followed images which were not embedded in <a href=""></a> ceased
650	# to occur. This is why the following lines have been commented out (as well as
651	# two lines in replace_images). They appear to no longer apply.
652
653	# If at any time, there is a need for having images embedded in <a> anchor tags,
654	# then it might be better to turn that into an HTMLPlugin option rather than make
655	# it the default behaviour. Also, eventually, no_image_links needs to become
656	# a deprecated option for HTMLPlugin as it has now become the default behaviour.
657
658	#if(!$self->{'no_image_links'}){
659	$$textref =~ s/(<(?:img\|embed\|table\|tr\|td)[^>]?(?:src\|background)\s=\s)((?:[\"][^\"]+[\"])\|(?:[\'][^\']+[\'])\|(?:[^\s\/>]+))([^>]>)/
660	$self->replace_images ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
661	#}
662
663	# add text to document object
664	# turn \ into \\ so that the rest of greenstone doesn't think there
665	# is an escape code following. (Macro parsing loses them...)
666	$$textref =~ s/\\/\\\\/go;
667
668	$doc_obj->add_utf8_text($cursection, $$textref);
669	}
670
671	sub replace_images {
672	my $self = shift (@_);
673	my ($front, $link, $back, $base_dir,
674	$file, $doc_obj, $section) = @_;
675
676	# remove quotes from link at start and end if necessary
677	if ($link=~/^[\"\']/) {
678	$link=~s/^[\"\']//;
679	$link=~s/[\"\']$//;
680	$front.='"';
681	$back="\"$back";
682	}
683
684	$link =~ s/\n/ /g;
685
686	# Hack to overcome Windows wv 0.7.1 bug that causes embedded images to be broken
687	# If the Word file path has spaces in it, wv messes up and you end up with
688	# absolute paths for the images, and without the "file://" prefix
689	# So check for this special case and massage the data to be correct
690	if ($ENV{'GSDLOS'} =~ m/^windows/i && $self->{'plugin_type'} eq "WordPlug" && $link =~ m/^[A-Za-z]\:\\/) {
691	$link =~ s/^.*\\([^\\]+)$/$1/;
692	}
693
694	my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
695
696	my $img_file = $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section);
697
698	# print STDERR "** link = $link\n href = $href\n** img_file = $img_file, rl = $rl\n";
699
700	my $anchor_name = $img_file;
701	#$anchor_name =~ s/^.*\///;
702	#$anchor_name = "<a name=\"$anchor_name\" ></a>";
703
704	my $image_link = $front . $img_file .$back;
705	return $image_link;
706
707	# The reasons for why the following two lines are no longer necessary can be
708	# found in subroutine process_section
709	#my $anchor_link = "<a href=\"$img_file\" >".$image_link."</a>";
710	#return $anchor_link;
711
712	#return $front . $img_file . $back . $anchor_name;
713	}
714
715	sub replace_href_links {
716	my $self = shift (@_);
717	my ($front, $link, $back, $base_dir, $file, $doc_obj, $section) = @_;
718
719	# remove quotes from link at start and end if necessary
720	if ($link=~/^[\"\']/) {
721	$link=~s/^[\"\']//;
722	$link=~s/[\"\']$//;
723	$front.='"';
724	$back="\"$back";
725	}
726
727	# attempt to sort out targets - frames are not handled
728	# well in this plugin and some cases will screw things
729	# up - e.g. the _parent target (so we'll just remove
730	# them all ;-)
731	$front =~ s/(target=\"?)_top(\"?)/$1_gsdltop_$2/is;
732	$back =~ s/(target=\"?)_top(\"?)/$1_gsdltop_$2/is;
733	$front =~ s/target=\"?_parent\"?//is;
734	$back =~ s/target=\"?_parent\"?//is;
735
736	return $front . $link . $back if $link =~ m/^\#/s;
737	$link =~ s/\n/ /g;
738
739	# Find file referred to by $link on file system
740	# This is more complicated than it sounds when char encodings
741	# is taken in to account
742	my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
743
744	# href may use '\'s where '/'s should be on Windows
745	$href =~ s/\\/\//g;
746	my ($filename) = $href =~ m/^(?:.?):(?:\/\/)?(.)/;
747
748
749	##### leave all these links alone (they won't be picked up by intermediate
750	##### pages). I think that's safest when dealing with frames, targets etc.
751	##### (at least until I think of a better way to do it). Problems occur with
752	##### mailto links from within small frames, the intermediate page is displayed
753	##### within that frame and can't be seen. There is still potential for this to
754	##### happen even with html pages - the solution seems to be to somehow tell
755	##### the browser from the server side to display the page being sent (i.e.
756	##### the intermediate page) in the top level window - I'm not sure if that's
757	##### possible - the following line should probably be deleted if that can be done
758	return $front . $link . $back if $href =~ m/^(mailto\|news\|gopher\|nntp\|telnet\|javascript):/is;
759
760	if (($rl == 0) \|\| ($filename =~ m/$self->{'process_exp'}/) \|\|
761	($href =~ m/\/$/) \|\| ($href =~ m/^(mailto\|news\|gopher\|nntp\|telnet\|javascript):/i)) {
762
763
764	# If web page didn't give encoding, then default to utf8
765	if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
766	print STDERR "*** Web page didn't give encoding, defaulting to UTF8!\n";
767	print STDERR "***** looking up $file\n";
768	}
769
770	my $content_encoding= $self->{'content_encoding'} \|\| "utf8";
771	$href = encode($content_encoding,$href);
772
773	$href = &unicode::raw_filename_to_url_encoded($href);
774	$href = &unicode::filename_to_url($href);
775
776	&ghtml::urlsafe ($href);
777	if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
778	print STDERR "***!!! href=$href\n";
779	}
780
781
782	return $front . "_httpextlink_&rl=" . $rl . "&href=" . $href . $hash_part . $back;
783	} else {
784	# link is to some other type of file (e.g., an image) so we'll
785	# need to associate that file
786	return $front . $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section) . $back;
787	}
788	}
789
790	sub add_file {
791	my $self = shift (@_);
792	my ($href, $rl, $hash_part, $base_dir, $doc_obj, $section) = @_;
793	my ($newname);
794
795	my $filename = $href;
796	if ($base_dir eq "") {
797	# remove http:/ thereby leaving one slash at the start
798	$filename =~ s/^[^:]*:\///;
799	}
800	else {
801	# remove http://
802	$filename =~ s/^[^:]*:\/\///;
803	}
804
805	$filename = &util::filename_cat($base_dir, $filename);
806	if (($self->{'use_realistic_book'}) \|\| ($self->{'old_style_HDL'})) {
807	# we are processing a tidytmp file - want paths to be in import
808	$filename =~ s/([\\\/])tidytmp([\\\/])/$1import$2/;
809	}
810
811	# Replace %XX's in URL with decoded value if required. Note that the
812	# filename may include the %XX in some situations. If the original
813	# file's name was in URL encoding, the following method will not decode
814	# it.
815	my $utf8_filename = $filename;
816	my $opt_decode_utf8_filename = $self->opt_url_decode($utf8_filename);
817
818	my $content_encoding= $self->{'content_encoding'} \|\| "utf8";
819
820	# The filenames that come through the HTML file have been decoded
821	# into Unicode aware Perl strings. Need to convert them back
822	# to their initial raw-byte encoding to match the file that
823	# exists on the file system
824	$filename = encode($content_encoding, $opt_decode_utf8_filename);
825
826
827	# some special processing if the intended filename was converted to utf8, but
828	# the actual file still needs to be renamed
829	if (!-e $filename) {
830	# try the original filename stored in map
831	if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
832	print STDERR "***###!! orig filename did not exist: $filename\n";
833	}
834
835	my $original_filename = $self->{'utf8_to_original_filename'}->{$utf8_filename};
836
837	if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
838	print STDERR "**** Trying for $original_filename\n";
839	}
840
841	if (defined $original_filename && -e $original_filename) {
842	if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
843	print STDERR "*** found match\n";
844	}
845	$filename = $original_filename;
846	}
847	}
848
849	my ($ext) = $filename =~ m/(\.[^\.]*)$/;
850
851	if ($rl == 0) {
852	if ((!defined $ext) \|\| ($ext !~ m/$self->{'assoc_files'}/)) {
853	return "_httpextlink_&rl=0&el=prompt&href=" . $href . $hash_part;
854	}
855	else {
856	return "_httpextlink_&rl=0&el=direct&href=" . $href . $hash_part;
857	}
858	}
859
860	if ((!defined $ext) \|\| ($ext !~ m/$self->{'assoc_files'}/)) {
861	return "_httpextlink_&rl=" . $rl . "&href=" . $href . $hash_part;
862	}
863	# add the original image file as a source file
864	if (!$self->{'processing_tmp_files'} ) {
865	$doc_obj->associate_source_file($filename);
866	}
867	if ($self->{'rename_assoc_files'}) {
868	if (defined $self->{'aux_files'}->{$href}) {
869	$newname = $self->{'aux_files'}->{$href}->{'dir_num'} . "/" .
870	$self->{'aux_files'}->{$href}->{'file_num'} . $ext;
871	} else {
872	$newname = $self->{'dir_num'} . "/" . $self->{'file_num'} . $ext;
873	$self->{'aux_files'}->{$href} = {'dir_num' => $self->{'dir_num'}, 'file_num' => $self->{'file_num'}};
874	$self->inc_filecount ();
875	}
876	$doc_obj->associate_file($filename, $newname, undef, $section);
877	return "_httpdocimg_/$newname";
878	} else {
879	if(&unicode::is_url_encoded($utf8_filename)) {
880	# use the possibly-decoded filename instead to avoid double URL encoding
881	($newname) = $filename =~ m/([^\/\\]*)$/;
882	} else {
883	($newname) = $utf8_filename =~ m/([^\/\\]*)$/;
884	}
885
886	# Make sure this name uses only ASCII characters.
887	# We use either base64 or URL encoding, as these preserve original encoding
888	$newname = &util::rename_file($newname, $self->{'file_rename_method'});
889
890	$doc_obj->associate_file($filename, $newname, undef, $section);
891
892	# Since the generated image will be URL-encoded to avoid file-system/browser mess-ups
893	# of filenames, URL-encode the additional percent signs of the URL-encoded filename
894	my $newname_url = $newname;
895	$newname_url = &unicode::filename_to_url($newname_url);
896	return "_httpdocimg_/$newname_url";
897	}
898	}
899
900
901	sub format_link {
902	my $self = shift (@_);
903	my ($link, $base_dir, $file) = @_;
904
905	my ($before_hash, $hash_part) = $link =~ m/^([^\#])(\#?.)$/;
906
907	$hash_part = "" if !defined $hash_part;
908	if (!defined $before_hash \|\| $before_hash !~ m/[\w\.\/]/) {
909	my $outhandle = $self->{'outhandle'};
910	print $outhandle "HTMLPlugin: ERROR - badly formatted tag ignored ($link)\n"
911	if $self->{'verbosity'};
912	return ($link, "", 0);
913	}
914
915	if ($before_hash =~ s@^((?:http\|https\|ftp\|file\|mms)://)@@i) {
916	my $type = $1;
917
918	if ($link =~ m/^(http\|ftp):/i) {
919	# Turn url (using /) into file name (possibly using \ on windows)
920	my @http_dir_split = split('/', $before_hash);
921	$before_hash = &util::filename_cat(@http_dir_split);
922	}
923
924	$before_hash = $self->eval_dir_dots($before_hash);
925
926	my $linkfilename = &util::filename_cat ($base_dir, $before_hash);
927
928	my $rl = 0;
929	$rl = 1 if (-e $linkfilename);
930
931	# make sure there's a slash on the end if it's a directory
932	if ($before_hash !~ m/\/$/) {
933	$before_hash .= "/" if (-d $linkfilename);
934	}
935	return ($type . $before_hash, $hash_part, $rl);
936
937	} elsif ($link !~ m/^(mailto\|news\|gopher\|nntp\|telnet\|javascript):/i && $link !~ m/^\//) {
938
939	if ($before_hash =~ s@^/@@ \|\| $before_hash =~ m/\\/) {
940
941	# the first directory will be the domain name if file_is_url
942	# to generate archives, otherwise we'll assume all files are
943	# from the same site and base_dir is the root
944
945	if ($self->{'file_is_url'}) {
946	my @dirs = split /[\/\\]/, $file;
947	my $domname = shift (@dirs);
948	$before_hash = &util::filename_cat($domname, $before_hash);
949	$before_hash =~ s@\\@/@g; # for windows
950	}
951	else
952	{
953	# see if link shares directory with source document
954	# => turn into relative link if this is so!
955
956	if ($ENV{'GSDLOS'} =~ m/^windows/i) {
957	# too difficult doing a pattern match with embedded '\'s...
958	my $win_before_hash=$before_hash;
959	$win_before_hash =~ s@(\\)+@/@g;
960	# $base_dir is already similarly "converted" on windows.
961	if ($win_before_hash =~ s@^$base_dir/@@o) {
962	# if this is true, we removed a prefix
963	$before_hash=$win_before_hash;
964	}
965	}
966	else {
967	# before_hash has lost leading slash by this point,
968	# -> add back in prior to substitution with $base_dir
969	$before_hash = "/$before_hash";
970
971	$before_hash = &util::filename_cat("",$before_hash);
972	$before_hash =~ s@^$base_dir/@@;
973	}
974	}
975	} else {
976	# Turn relative file path into full path
977	my $dirname = &File::Basename::dirname($file);
978	$before_hash = &util::filename_cat($dirname, $before_hash);
979	$before_hash = $self->eval_dir_dots($before_hash);
980	}
981
982	my $linkfilename = &util::filename_cat ($base_dir, $before_hash);
983	# make sure there's a slash on the end if it's a directory
984	if ($before_hash !~ m/\/$/) {
985	$before_hash .= "/" if (-d $linkfilename);
986	}
987	return ("http://" . $before_hash, $hash_part, 1);
988	} else {
989	# mailto, news, nntp, telnet, javascript or gopher link
990	return ($before_hash, "", 0);
991	}
992	}
993
994	sub extract_first_NNNN_characters {
995	my $self = shift (@_);
996	my ($textref, $doc_obj, $thissection) = @_;
997
998	foreach my $size (split /,/, $self->{'first'}) {
999	my $tmptext = $$textref;
1000	# skip to the body
1001	$tmptext =~ s/.<body[^>]>//i;
1002	# remove javascript
1003	$tmptext =~ s@<script.*?</script>@ @sig;
1004	$tmptext =~ s/<[^>]*>/ /g;
1005	$tmptext =~ s/ / /g;
1006	$tmptext =~ s/^\s+//;
1007	$tmptext =~ s/\s+$//;
1008	$tmptext =~ s/\s+/ /gs;
1009	$tmptext = &unicode::substr ($tmptext, 0, $size);
1010	$tmptext =~ s/\s\S*$/…/; # adds an ellipse (...)
1011	$doc_obj->add_utf8_metadata ($thissection, "First$size", $tmptext);
1012	}
1013	}
1014
1015
1016	sub extract_metadata {
1017	my $self = shift (@_);
1018	my ($textref, $metadata, $doc_obj, $section) = @_;
1019	my $outhandle = $self->{'outhandle'};
1020	# if we don't want metadata, we may as well not be here ...
1021	return if (!defined $self->{'metadata_fields'});
1022
1023	my $separator = $self->{'metadata_field_separator'};
1024	if ($separator eq "") {
1025	undef $separator;
1026	}
1027
1028	# metadata fields to extract/save. 'key' is the (lowercase) name of the
1029	# html meta, 'value' is the metadata name for greenstone to use
1030	my %find_fields = ();
1031
1032	my %creator_fields = (); # short-cut for lookups
1033
1034
1035	foreach my $field (split /,/, $self->{'metadata_fields'}) {
1036	$field =~ s/^\s+//; # remove leading whitespace
1037	$field =~ s/\s+$//; # remove trailing whitespace
1038
1039	# support tag<tagname>
1040	if ($field =~ m/^(.?)\s<(.*?)>$/) {
1041	# "$2" is the user's preferred gs metadata name
1042	$find_fields{lc($1)}=$2; # lc = lowercase
1043	} else { # no <tagname> for mapping
1044	# "$field" is the user's preferred gs metadata name
1045	$find_fields{lc($field)}=$field; # lc = lowercase
1046	}
1047	}
1048
1049	if (defined $self->{'hunt_creator_metadata'} &&
1050	$self->{'hunt_creator_metadata'} == 1 ) {
1051	my @extra_fields =
1052	(
1053	'author',
1054	'author.email',
1055	'creator',
1056	'dc.creator',
1057	'dc.creator.corporatename',
1058	);
1059
1060	# add the creator_metadata fields to search for
1061	foreach my $field (@extra_fields) {
1062	$creator_fields{$field}=0; # add to lookup hash
1063	}
1064	}
1065
1066
1067	# find the header in the html file, which has the meta tags
1068	$$textref =~ m@<head>(.*?)</head>@si;
1069
1070	my $html_header=$1;
1071
1072	# go through every <meta... tag defined in the html and see if it is
1073	# one of the tags we want to match.
1074
1075	# special case for title - we want to remember if its been found
1076	my $found_title = 0;
1077	# this assumes that ">" won't appear. (I don't think it's allowed to...)
1078	$html_header =~ m/^/; # match the start of the string, for \G assertion
1079
1080	while ($html_header =~ m/\G.?<meta(.?)>/sig) {
1081	my $metatag=$1;
1082	my ($tag, $value);
1083
1084	# find the tag name
1085	$metatag =~ m/(?:name\|http-equiv)\s=\s([\"\'])?(.*?)\1/is;
1086	$tag=$2;
1087	# in case they're not using " or ', but they should...
1088	if (! $tag) {
1089	$metatag =~ m/(?:name\|http-equiv)\s=\s([^\s\>]+)/is;
1090	$tag=$1;
1091	}
1092
1093	if (!defined $tag) {
1094	print $outhandle "HTMLPlugin: can't find NAME in \"$metatag\"\n";
1095	next;
1096	}
1097
1098	# don't need to assign this field if it was passed in from a previous
1099	# (recursive) plugin
1100	if (defined $metadata->{$tag}) {next}
1101
1102	# find the tag content
1103	$metatag =~ m/content\s=\s([\"\'])?(.*?)\1/is;
1104	$value=$2;
1105
1106	if (! $value) {
1107	$metatag =~ m/(?:name\|http-equiv)\s=\s([^\s\>]+)/is;
1108	$value=$1;
1109	}
1110	if (!defined $value) {
1111	print $outhandle "HTMLPlugin: can't find VALUE in \"$metatag\"\n";
1112	next;
1113	}
1114
1115	# clean up and add
1116	$value =~ s/\s+/ /gs;
1117	chomp($value); # remove trailing \n, if any
1118	if (exists $creator_fields{lc($tag)}) {
1119	# map this value onto greenstone's "Creator" metadata
1120	$tag='Creator';
1121	} elsif (!exists $find_fields{lc($tag)}) {
1122	next; # don't want this tag
1123	} else {
1124	# get the user's preferred capitalisation
1125	$tag = $find_fields{lc($tag)};
1126	}
1127	if (lc($tag) eq "title") {
1128	$found_title = 1;
1129	}
1130
1131	if ($self->{'verbosity'} > 2) {
1132	print $outhandle " extracted \"$tag\" metadata \"$value\"\n";
1133	}
1134
1135	if ($tag =~ /\./) {
1136	# there is a . so has a namespace, add ex.
1137	$tag = "ex.$tag";
1138	}
1139	if (defined $separator) {
1140	my @values = split($separator, $value);
1141	foreach my $v (@values) {
1142	$doc_obj->add_utf8_metadata($section, $tag, $v) if $v =~ /\S/;
1143	}
1144	}
1145	else {
1146	$doc_obj->add_utf8_metadata($section, $tag, $value);
1147	}
1148	}
1149
1150	# TITLE: extract the document title
1151	if (exists $find_fields{'title'} && !$found_title) {
1152	# we want a title, and didn't find one in the meta tags
1153	# see if there's a <title> tag
1154	my $title;
1155	my $from = ""; # for debugging output only
1156	if ($html_header =~ m/<title[^>]>([^<]+)<\/title[^>]>/is) {
1157	$title = $1;
1158	$from = "<title> tags";
1159	}
1160
1161	if (!defined $title) {
1162	$from = "first 100 chars";
1163	# if no title use first 100 or so characters
1164	$title = $$textref;
1165	$title =~ s/^\xFE\xFF//; # Remove unicode byte order mark
1166	$title =~ s/^.*?<body>//si;
1167	# ignore javascript!
1168	$title =~ s@<script.*?</script>@ @sig;
1169	$title =~ s/<\/([^>]+)><\1>//g; # (eg) </b><b> - no space
1170	$title =~ s/<[^>]*>/ /g; # remove all HTML tags
1171	$title = substr ($title, 0, 100);
1172	$title =~ s/\s\S*$/.../;
1173	}
1174	$title =~ s/<[^>]*>/ /g; # remove html tags
1175	$title =~ s/ / /g;
1176	$title =~ s/(?: \|\xc2\xa0)/ /g; # utf-8 for nbsp...
1177	$title =~ s/\s+/ /gs; # collapse multiple spaces
1178	$title =~ s/^\s*//; # remove leading spaces
1179	$title =~ s/\s*$//; # remove trailing spaces
1180
1181	$title =~ s/^$self->{'title_sub'}// if ($self->{'title_sub'});
1182	$title =~ s/^\s+//s; # in case title_sub introduced any...
1183	$doc_obj->add_utf8_metadata ($section, "Title", $title);
1184	print $outhandle " extracted Title metadata \"$title\" from $from\n"
1185	if ($self->{'verbosity'} > 2);
1186	}
1187
1188	# add FileFormat metadata
1189	$doc_obj->add_metadata($section,"FileFormat", "HTML");
1190
1191	# Special, for metadata names such as tagH1 - extracts
1192	# the text between the first <H1> and </H1> tags into "H1" metadata.
1193
1194	foreach my $field (keys %find_fields) {
1195	if ($field !~ m/^tag([a-z0-9]+)$/i) {next}
1196	my $tag = $1;
1197	if ($$textref =~ m@<$tag[^>]>(.?)</$tag[^>]*>@g) {
1198	my $content = $1;
1199	$content =~ s/ / /g;
1200	$content =~ s/<[^>]*>/ /g;
1201	$content =~ s/^\s+//;
1202	$content =~ s/\s+$//;
1203	$content =~ s/\s+/ /gs;
1204	if ($content) {
1205	$tag=$find_fields{"tag$tag"}; # get the user's capitalisation
1206	$tag =~ s/^tag//i;
1207	$doc_obj->add_utf8_metadata ($section, $tag, $content);
1208	print $outhandle " extracted \"$tag\" metadata \"$content\"\n"
1209	if ($self->{'verbosity'} > 2);
1210	}
1211	}
1212	}
1213	}
1214
1215
1216	# evaluate any "../" to next directory up
1217	# evaluate any "./" as here
1218	sub eval_dir_dots {
1219	my $self = shift (@_);
1220	my ($filename) = @_;
1221	my $dirsep_os = &util::get_os_dirsep();
1222	my @dirsep = split(/$dirsep_os/,$filename);
1223
1224	my @eval_dirs = ();
1225	foreach my $d (@dirsep) {
1226	if ($d eq "..") {
1227	pop(@eval_dirs);
1228
1229	} elsif ($d eq ".") {
1230	# do nothing!
1231
1232	} else {
1233	push(@eval_dirs,$d);
1234	}
1235	}
1236
1237	# Need to fiddle with number of elements in @eval_dirs if the
1238	# first one is the empty string. This is because of a
1239	# modification to util::filename_cat that supresses the addition
1240	# of a leading '/' character (or \ if windows) (intended to help
1241	# filename cat with relative paths) if the first entry in the
1242	# array is the empty string. Making the array start with two
1243	# empty strings is a way to defeat this "smart" option.
1244	#
1245	if (scalar(@eval_dirs) > 0) {
1246	if ($eval_dirs[0] eq ""){
1247	unshift(@eval_dirs,"");
1248	}
1249	}
1250
1251	my $evaluated_filename = (scalar @eval_dirs > 0) ? &util::filename_cat(@eval_dirs) : "";
1252	return $evaluated_filename;
1253	}
1254
1255	sub replace_usemap_links {
1256	my $self = shift (@_);
1257	my ($front, $link, $back) = @_;
1258
1259	# remove quotes from link at start and end if necessary
1260	if ($link=~/^[\"\']/) {
1261	$link=~s/^[\"\']//;
1262	$link=~s/[\"\']$//;
1263	$front.='"';
1264	$back="\"$back";
1265	}
1266
1267	$link =~ s/^\.\///;
1268	return $front . $link . $back;
1269	}
1270
1271	sub inc_filecount {
1272	my $self = shift (@_);
1273
1274	if ($self->{'file_num'} == 1000) {
1275	$self->{'dir_num'} ++;
1276	$self->{'file_num'} = 0;
1277	} else {
1278	$self->{'file_num'} ++;
1279	}
1280	}
1281
1282
1283	# Extend read_file so that strings like é are
1284	# converted to UTF8 internally.
1285	#
1286	# We don't convert < or > or & or " in case
1287	# they interfere with the GML files
1288
1289	sub read_file {
1290	my $self = shift(@_);
1291	my ($filename, $encoding, $language, $textref) = @_;
1292
1293	$self->SUPER::read_file($filename, $encoding, $language, $textref);
1294
1295	# Convert entities to their UTF8 equivalents
1296	$$textref =~ s/&(lt\|gt\|amp\|quot\|nbsp);/&z$1;/go;
1297	$$textref =~ s/&([^;]+);/&ghtml::getcharequiv($1,1,1)/gseo;
1298	$$textref =~ s/&z(lt\|gt\|amp\|quot\|nbsp);/&$1;/go;
1299
1300	}
1301
1302	sub HB_read_html_file {
1303	my $self = shift (@_);
1304	my ($htmlfile, $text) = @_;
1305
1306	# load in the file
1307	if (!open (FILE, $htmlfile)) {
1308	print STDERR "ERROR - could not open $htmlfile\n";
1309	return;
1310	}
1311
1312	my $foundbody = 0;
1313	$self->HB_gettext (\$foundbody, $text, "FILE");
1314	close FILE;
1315
1316	# just in case there was no <body> tag
1317	if (!$foundbody) {
1318	$foundbody = 1;
1319	open (FILE, $htmlfile) \|\| return;
1320	$self->HB_gettext (\$foundbody, $text, "FILE");
1321	close FILE;
1322	}
1323	# text is in utf8
1324	}
1325
1326	# converts the text to utf8, as ghtml does that for é etc.
1327	sub HB_gettext {
1328	my $self = shift (@_);
1329	my ($foundbody, $text, $handle) = @_;
1330
1331	my $line = "";
1332	while (defined ($line = <$handle>)) {
1333	# look for body tag
1334	if (!$$foundbody) {
1335	if ($line =~ s/^.<body[^>]>//i) {
1336	$$foundbody = 1;
1337	} else {
1338	next;
1339	}
1340	}
1341
1342	# check for symbol fonts
1343	if ($line =~ m/<font [^>]?face\s=\s*\"?(\w+)\"?/i) {
1344	my $font = $1;
1345	print STDERR "HBPlug::HB_gettext - warning removed font $font\n"
1346	if ($font !~ m/^arial$/i);
1347	}
1348
1349	$$text .= $line;
1350	}
1351
1352	if ($self->{'input_encoding'} eq "iso_8859_1") {
1353	# convert to utf-8
1354	$$text=&unicode::unicode2utf8(&unicode::convert2unicode("iso_8859_1", $text));
1355	}
1356	# convert any alphanumeric character entities to their utf-8
1357	# equivalent for indexing purposes
1358	#&ghtml::convertcharentities ($$text);
1359
1360	$$text =~ s/\s+/ /g; # remove \n's
1361
1362	# At this point $$text is a binary byte string
1363	# => turn it into a Unicode aware string, so full
1364	# Unicode aware pattern matching can be used.
1365	# For instance: 's/\x{0101}//g' or '[[:upper:]]'
1366	#
1367
1368	$$text = decode("utf8",$$text);
1369	}
1370
1371	sub HB_clean_section {
1372	my $self = shift (@_);
1373	my ($section) = @_;
1374
1375	# remove tags without a starting tag from the section
1376	my ($tag, $tagstart);
1377	while ($section =~ m/<\/([^>]{1,10})>/) {
1378	$tag = $1;
1379	$tagstart = index($section, "<$tag");
1380	last if (($tagstart >= 0) && ($tagstart < index($section, "<\/$tag")));
1381	$section =~ s/<\/$tag>//;
1382	}
1383
1384	# remove extra paragraph tags
1385	while ($section =~ s/<p\b[^>]>\s<p\b/<p/ig) {}
1386
1387	# remove extra stuff at the end of the section
1388	while ($section =~ s/(<u>\|<i>\|<b>\|<p\b[^>]*>\| \|\s)$//i) {}
1389
1390	# add a newline at the beginning of each paragraph
1391	$section =~ s/(.)\s*<p\b/$1\n\n<p/gi;
1392
1393	# add a newline every 80 characters at a word boundary
1394	# Note: this regular expression puts a line feed before
1395	# the last word in each section, even when it is not
1396	# needed.
1397	$section =~ s/(.{1,80})\s/$1\n/g;
1398
1399	# fix up the image links
1400	$section =~ s/<img[^>]?src=\"?([^\">]+)\"?[^>]>/
1401	<center><img src=\"$1\" \/><\/center><br\/>/ig;
1402	$section =~ s/<<I>>\s*([^\.]+\.(png\|jpg\|gif))/
1403	<center><img src=\"$1\" \/><\/center><br\/>/ig;
1404
1405	return $section;
1406	}
1407
1408	# Will convert the oldHDL format to the new HDL format (using the Section tag)
1409	sub convert_to_newHDLformat
1410	{
1411	my $self = shift (@_);
1412	my ($file,$cnfile) = @_;
1413	my $input_filename = $file;
1414	my $tmp_filename = $cnfile;
1415
1416	# write HTML tmp file with new HDL format
1417	open (PROD, ">$tmp_filename") \|\| die("Error Writing to File: $tmp_filename $!");
1418
1419	# read in the file and do basic html cleaning (removing header etc)
1420	my $html = "";
1421	$self->HB_read_html_file ($input_filename, \$html);
1422
1423	# process the file one section at a time
1424	my $curtoclevel = 1;
1425	my $firstsection = 1;
1426	my $toclevel = 0;
1427	while (length ($html) > 0) {
1428	if ($html =~ s/^.?(?:<p\b[^>]>)?((<b>\|<i>\|<u>\|\s))<<TOC(\d+)>>\s(.*?)<p\b/<p/i) {
1429	$toclevel = $3;
1430	my $title = $4;
1431	my $sectiontext = "";
1432	if ($html =~ s/^(.?)((?:<p\b[^>]>)?((<b>\|<i>\|<u>\|\s)*)<<TOC\d+>>)/$2/i) {
1433	$sectiontext = $1;
1434	} else {
1435	$sectiontext = $html;
1436	$html = "";
1437	}
1438
1439	# remove tags and extra spaces from the title
1440	$title =~ s/<\/?[^>]+>//g;
1441	$title =~ s/^\s+\|\s+$//g;
1442
1443	# close any sections below the current level and
1444	# create a new section (special case for the firstsection)
1445	print PROD "<!--\n";
1446	while (($curtoclevel > $toclevel) \|\|
1447	(!$firstsection && $curtoclevel == $toclevel)) {
1448	$curtoclevel--;
1449	print PROD "</Section>\n";
1450	}
1451	if ($curtoclevel+1 < $toclevel) {
1452	print STDERR "WARNING - jump in toc levels in $input_filename " .
1453	"from $curtoclevel to $toclevel\n";
1454	}
1455	while ($curtoclevel < $toclevel) {
1456	$curtoclevel++;
1457	}
1458
1459	if ($curtoclevel == 1) {
1460	# add the header tag
1461	print PROD "-->\n";
1462	print PROD "<HTML>\n<HEAD>\n<TITLE>$title</TITLE>\n</HEAD>\n<BODY>\n";
1463	print PROD "<!--\n";
1464	}
1465
1466	print PROD "<Section>\n\t<Description>\n\t\t<Metadata name=\"Title\">$title</Metadata>\n\t</Description>\n";
1467
1468	print PROD "-->\n";
1469
1470	# clean up the section html
1471	$sectiontext = $self->HB_clean_section($sectiontext);
1472
1473	print PROD "$sectiontext\n";
1474
1475	} else {
1476	print STDERR "WARNING - leftover text\n" , $self->shorten($html),
1477	"\nin $input_filename\n";
1478	last;
1479	}
1480	$firstsection = 0;
1481	}
1482
1483	print PROD "<!--\n";
1484	while ($curtoclevel > 0) {
1485	$curtoclevel--;
1486	print PROD "</Section>\n";
1487	}
1488	print PROD "-->\n";
1489
1490	close (PROD) \|\| die("Error Closing File: $tmp_filename $!");
1491
1492	return $tmp_filename;
1493	}
1494
1495	sub shorten {
1496	my $self = shift (@_);
1497	my ($text) = @_;
1498
1499	return "\"$text\"" if (length($text) < 100);
1500
1501	return "\"" . substr ($text, 0, 50) . "\" ... \"" .
1502	substr ($text, length($text)-50) . "\"";
1503	}
1504
1505	sub convert_tidy_or_oldHDL_file
1506	{
1507	my $self = shift (@_);
1508	my ($file) = @_;
1509	my $input_filename = $file;
1510
1511	if (-d $input_filename)
1512	{
1513	return $input_filename;
1514	}
1515
1516	# get the input filename
1517	my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
1518	my $base_dirname = $dirname;
1519	$suffix = lc($suffix);
1520
1521	# derive tmp filename from input filename
1522	# Remove any white space from filename -- no risk of name collision, and
1523	# makes later conversion by utils simpler. Leave spaces in path...
1524	# tidy up the filename with space, dot, hyphen between
1525	$tailname =~ s/\s+//g;
1526	$tailname =~ s/\.+//g;
1527	$tailname =~ s/\-+//g;
1528	# convert to utf-8 otherwise we have problems with the doc.xml file
1529	# later on
1530	&unicode::ensure_utf8(\$tailname);
1531
1532	# softlink to collection tmp dir
1533	my $tmp_dirname = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "tidytmp");
1534	&util::mk_dir($tmp_dirname) if (!-e $tmp_dirname);
1535
1536	my $test_dirname = "";
1537	my $f_separator = &util::get_os_dirsep();
1538
1539	if ($dirname =~ m/import$f_separator/)
1540	{
1541	$test_dirname = $'; #'
1542
1543	#print STDERR "init $'\n";
1544
1545	while ($test_dirname =~ m/[$f_separator]/)
1546	{
1547	my $folderdirname = $`;
1548	$tmp_dirname = &util::filename_cat($tmp_dirname,$folderdirname);
1549	&util::mk_dir($tmp_dirname) if (!-e $tmp_dirname);
1550	$test_dirname = $'; #'
1551	}
1552	}
1553
1554	my $tmp_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix");
1555
1556	# tidy or convert the input file if it is a HTML-like file or it is accepted by the process_exp
1557	if (($suffix eq ".htm") \|\| ($suffix eq ".html") \|\| ($suffix eq ".shtml"))
1558	{
1559	#convert the input file to a new style HDL
1560	my $hdl_output_filename = $input_filename;
1561	if ($self->{'old_style_HDL'})
1562	{
1563	$hdl_output_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix");
1564	$hdl_output_filename = $self->convert_to_newHDLformat($input_filename,$hdl_output_filename);
1565	}
1566
1567	#just for checking copy all other file from the base dir to tmp dir if it is not exists
1568	opendir(DIR,$base_dirname) or die "Can't open base directory : $base_dirname!";
1569	my @files = grep {!/^\.+$/} readdir(DIR);
1570	close(DIR);
1571
1572	foreach my $file (@files)
1573	{
1574	my $src_file = &util::filename_cat($base_dirname,$file);
1575	my $dest_file = &util::filename_cat($tmp_dirname,$file);
1576	if ((!-e $dest_file) && (!-d $src_file))
1577	{
1578	# just copy the original file back to the tmp directory
1579	copy($src_file,$dest_file) or die "Can't copy file $src_file to $dest_file $!";
1580	}
1581	}
1582
1583	# tidy the input file
1584	my $tidy_output_filename = $hdl_output_filename;
1585	if ($self->{'use_realistic_book'})
1586	{
1587	$tidy_output_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix");
1588	$tidy_output_filename = $self->tmp_tidy_file($hdl_output_filename,$tidy_output_filename);
1589	}
1590	$tmp_filename = $tidy_output_filename;
1591	}
1592	else
1593	{
1594	if (!-e $tmp_filename)
1595	{
1596	# just copy the original file back to the tmp directory
1597	copy($input_filename,$tmp_filename) or die "Can't copy file $input_filename to $tmp_filename $!";
1598	}
1599	}
1600
1601	return $tmp_filename;
1602	}
1603
1604
1605	# Will make the html input file as a proper XML file with removed font tag and
1606	# image size added to the img tag.
1607	# The tidying process takes place in a collection specific 'tmp' directory so
1608	# that we don't accidentally damage the input.
1609	sub tmp_tidy_file
1610	{
1611	my $self = shift (@_);
1612	my ($file,$cnfile) = @_;
1613	my $input_filename = $file;
1614	my $tmp_filename = $cnfile;
1615
1616	# get the input filename
1617	my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
1618
1619	require HTML::TokeParser::Simple;
1620
1621	# create HTML parser to decode the input file
1622	my $parser = HTML::TokeParser::Simple->new($input_filename);
1623
1624	# write HTML tmp file without the font tag and image size are added to the img tag
1625	open (PROD, ">$tmp_filename") \|\| die("Error Writing to File: $tmp_filename $!");
1626	while (my $token = $parser->get_token())
1627	{
1628	# is it an img tag
1629	if ($token->is_start_tag('img'))
1630	{
1631	# get the attributes
1632	my $attr = $token->return_attr;
1633
1634	# get the full path to the image
1635	my $img_file = &util::filename_cat($dirname,$attr->{src});
1636
1637	# set the width and height attribute
1638	($attr->{width}, $attr->{height}) = imgsize($img_file);
1639
1640	# recreate the tag
1641	print PROD "<img";
1642	print PROD map { qq { $_="$attr->{$_}"} } keys %$attr;
1643	print PROD ">";
1644	}
1645	# is it a font tag
1646	else
1647	{
1648	if (($token->is_start_tag('font')) \|\| ($token->is_end_tag('font')))
1649	{
1650	# remove font tag
1651	print PROD "";
1652	}
1653	else
1654	{
1655	# print without changes
1656	print PROD $token->as_is;
1657	}
1658	}
1659	}
1660	close (PROD) \|\| die("Error Closing File: $tmp_filename $!");
1661
1662	# run html-tidy on the tmp file to make it a proper XML file
1663
1664	my $outhandle = $self->{'outhandle'};
1665	print $outhandle "Converting HTML to be XML compliant:\n";
1666
1667	my $tidy_cmd = "tidy";
1668	$tidy_cmd .= " -q" if ($self->{'verbosity'} <= 2);
1669	$tidy_cmd .= " -raw -wrap 0 -asxml \"$tmp_filename\"";
1670	if ($self->{'verbosity'} <= 2) {
1671	if ($ENV{'GSDLOS'} =~ m/^windows/i) {
1672	$tidy_cmd .= " 2>nul";
1673	}
1674	else {
1675	$tidy_cmd .= " 2>/dev/null";
1676	}
1677	print $outhandle " => $tidy_cmd\n";
1678	}
1679
1680	my $tidyfile = `$tidy_cmd`;
1681
1682	# write result back to the tmp file
1683	open (PROD, ">$tmp_filename") \|\| die("Error Writing to File: $tmp_filename $!");
1684	print PROD $tidyfile;
1685	close (PROD) \|\| die("Error Closing File: $tmp_filename $!");
1686
1687	# return the output filename
1688	return $tmp_filename;
1689	}
1690
1691	sub associate_cover_image
1692	{
1693	my $self = shift(@_);
1694	my ($doc_obj, $filename) = @_;
1695	if (($self->{'use_realistic_book'}) \|\| ($self->{'old_style_HDL'}))
1696	{
1697	# we will have cover image in tidytmp, but want it from import
1698	$filename =~ s/([\\\/])tidytmp([\\\/])/$1import$2/;
1699	}
1700	$self->SUPER::associate_cover_image($doc_obj, $filename);
1701	}
1702
1703
1704	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: