Context Navigation

source: gsdl/trunk/perllib/plugins/HTMLPlugin.pm@ 20778

Last change on this file since 20778 was 20778, checked in by kjdon, 15 years ago
plugins now need to add any auxiliary source files as source assoc files, so we know when to reimport for incremental import. Have started this, but not finished and not tested :-)
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 51.8 KB

Line
1	###########################################################################
2	#
3	# HTMLPlugin.pm -- basic html plugin
4	#
5	# A component of the Greenstone digital library software
6	# from the New Zealand Digital Library Project at the
7	# University of Waikato, New Zealand.
8	#
9	# Copyright (C) 1999 New Zealand Digital Library Project
10	#
11	# This program is free software; you can redistribute it and/or modify
12	# it under the terms of the GNU General Public License as published by
13	# the Free Software Foundation; either version 2 of the License, or
14	# (at your option) any later version.
15	#
16	# This program is distributed in the hope that it will be useful,
17	# but WITHOUT ANY WARRANTY; without even the implied warranty of
18	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	# GNU General Public License for more details.
20	#
21	# You should have received a copy of the GNU General Public License
22	# along with this program; if not, write to the Free Software
23	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24	#
25	###########################################################################
26
27	#
28	# Note that this plugin handles frames only in a very simple way
29	# i.e. each frame is treated as a separate document. This means
30	# search results will contain links to individual frames rather
31	# than linking to the top level frameset.
32	# There may also be some problems caused by the _parent target
33	# (it's removed by this plugin)
34	#
35
36	package HTMLPlugin;
37
38	use ReadTextFile;
39	use HBPlugin;
40	use ghtml;
41	use unicode;
42	use util;
43	use XMLParser;
44
45	use Image::Size;
46	use File::Copy;
47
48	sub BEGIN {
49	@HTMLPlugin::ISA = ('ReadTextFile', 'HBPlugin');
50	}
51
52	use strict; # every perl program should have this!
53	no strict 'refs'; # make an exception so we can use variables as filehandles
54
55	my $arguments =
56	[ { 'name' => "process_exp",
57	'desc' => "{BasePlugin.process_exp}",
58	'type' => "regexp",
59	'deft' => &get_default_process_exp() },
60	{ 'name' => "block_exp",
61	'desc' => "{BasePlugin.block_exp}",
62	'type' => 'regexp',
63	'deft' => &get_default_block_exp() },
64	{ 'name' => "nolinks",
65	'desc' => "{HTMLPlugin.nolinks}",
66	'type' => "flag" },
67	{ 'name' => "keep_head",
68	'desc' => "{HTMLPlugin.keep_head}",
69	'type' => "flag" },
70	{ 'name' => "no_metadata",
71	'desc' => "{HTMLPlugin.no_metadata}",
72	'type' => "flag" },
73	{ 'name' => "metadata_fields",
74	'desc' => "{HTMLPlugin.metadata_fields}",
75	'type' => "string",
76	'deft' => "Title" },
77	{ 'name' => "hunt_creator_metadata",
78	'desc' => "{HTMLPlugin.hunt_creator_metadata}",
79	'type' => "flag" },
80	{ 'name' => "file_is_url",
81	'desc' => "{HTMLPlugin.file_is_url}",
82	'type' => "flag" },
83	{ 'name' => "assoc_files",
84	'desc' => "{HTMLPlugin.assoc_files}",
85	'type' => "regexp",
86	'deft' => &get_default_block_exp() },
87	{ 'name' => "rename_assoc_files",
88	'desc' => "{HTMLPlugin.rename_assoc_files}",
89	'type' => "flag" },
90	{ 'name' => "title_sub",
91	'desc' => "{HTMLPlugin.title_sub}",
92	'type' => "string",
93	'deft' => "" },
94	{ 'name' => "description_tags",
95	'desc' => "{HTMLPlugin.description_tags}",
96	'type' => "flag" },
97	# retain this for backward compatibility (w3mir option was replaced by
98	# file_is_url)
99	{ 'name' => "w3mir",
100	# 'desc' => "{HTMLPlugin.w3mir}",
101	'type' => "flag",
102	'hiddengli' => "yes"},
103	{ 'name' => "no_strip_metadata_html",
104	'desc' => "{HTMLPlugin.no_strip_metadata_html}",
105	'type' => "string",
106	'deft' => "",
107	'reqd' => "no"},
108	{ 'name' => "sectionalise_using_h_tags",
109	'desc' => "{HTMLPlugin.sectionalise_using_h_tags}",
110	'type' => "flag" },
111	{ 'name' => "use_realistic_book",
112	'desc' => "{HTMLPlugin.tidy_html}",
113	'type' => "flag"},
114	{ 'name' => "old_style_HDL",
115	'desc' => "{HTMLPlugin.old_style_HDL}",
116	'type' => "flag"}
117	];
118
119	my $options = { 'name' => "HTMLPlugin",
120	'desc' => "{HTMLPlugin.desc}",
121	'abstract' => "no",
122	'inherits' => "yes",
123	'args' => $arguments };
124
125
126	sub new {
127	my ($class) = shift (@_);
128	my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
129	push(@$pluginlist, $class);
130
131	push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
132	push(@{$hashArgOptLists->{"OptList"}},$options);
133
134
135	my $self = new ReadTextFile($pluginlist,$inputargs,$hashArgOptLists);
136
137	if ($self->{'w3mir'}) {
138	$self->{'file_is_url'} = 1;
139	}
140	$self->{'aux_files'} = {};
141	$self->{'dir_num'} = 0;
142	$self->{'file_num'} = 0;
143
144	return bless $self, $class;
145	}
146
147	# may want to use (?i)\.(gif\|jpe?g\|jpe\|png\|css\|js(?:@.*)?)$
148	# if have eg <script language="javascript" src="img/lib.js@123">
149	sub get_default_block_exp {
150	my $self = shift (@_);
151
152	#return q^(?i)\.(gif\|jpe?g\|jpe\|jpg\|png\|css)$^;
153	return "";
154	}
155
156	sub get_default_process_exp {
157	my $self = shift (@_);
158
159	# the last option is an attempt to encode the concept of an html query ...
160	return q^(?i)(\.html?\|\.shtml\|\.shm\|\.asp\|\.php\d?\|\.cgi\|.+\?.+=.*)$^;
161	}
162
163	sub store_block_files
164	{
165	my $self =shift (@_);
166	my ($filename_full_path, $block_hash) = @_;
167
168	my $html_fname = $filename_full_path;
169	my @file_blocks;
170
171	my ($language, $encoding) = $self->textcat_get_language_encoding ($filename_full_path);
172
173	# read in file ($text will be in utf8)
174	my $raw_text = "";
175	$self->read_file_no_decoding ($filename_full_path, \$raw_text);
176
177	my $textref = \$raw_text;
178	my $opencom = '(?:<!--\|<!(?:—\|\|--))';
179	my $closecom = '(?:-->\|(?:—\|\|--)>)';
180	$$textref =~ s/$opencom(.*?)$closecom//gs;
181
182	my $attval = "\\\"[^\\\"]+\\\"\|[^\\s>]+";
183	my @img_matches = ($$textref =~ m/<img[^>]?src\s=\s($attval)[^>]>/igs);
184	my @usemap_matches = ($$textref =~ m/<img[^>]?usemap\s=\s($attval)[^>]>/igs);
185	my @link_matches = ($$textref =~ m/<link[^>]?href\s=\s($attval)[^>]>/igs);
186	my @embed_matches = ($$textref =~ m/<embed[^>]?src\s=\s($attval)[^>]>/igs);
187	my @tabbg_matches = ($$textref =~ m/<(?:body\|table\|tr\|td)[^>]?background\s=\s($attval)[^>]>/igs);
188	my @script_matches = ($$textref =~ m/<script[^>]?src\s=\s($attval)[^>]>/igs);
189
190	if(!defined $self->{'utf8_to_original_filename'}) {
191	# maps from utf8 converted link name -> original filename referrred to by (possibly URL-encoded) src url
192	$self->{'utf8_to_original_filename'} = {};
193	}
194
195	foreach my $link (@img_matches, @usemap_matches, @link_matches, @embed_matches, @tabbg_matches, @script_matches) {
196
197	# remove quotes from link at start and end if necessary
198	if ($link=~/^\"/) {
199	$link=~s/^\"//;
200	$link=~s/\"$//;
201	}
202
203	$link =~ s/\#.*$//s; # remove any anchor names, e.g. foo.html#name becomes foo.html
204	# some links may just be anchor names
205	next unless ($link =~ /\S+/);
206
207	if ($link !~ m@^/@ && $link !~ m/^([A-Z]:?)\\/) {
208	# Turn relative file path into full path
209	my $dirname = &File::Basename::dirname($filename_full_path);
210	$link = &util::filename_cat($dirname, $link);
211	}
212	$link = $self->eval_dir_dots($link);
213
214	# this is the actual filename on the filesystem (that the link refers to)
215	my $url_original_filename = $self->opt_url_decode($link);
216
217	# Convert the url_original_filename into its utf8 version. Store the utf8 link along with the url_original_filename
218	my $utf8_link = "";
219	$self->decode_text($link,$encoding,$language,\$utf8_link);
220
221	$self->{'utf8_to_original_filename'}->{$utf8_link} = $url_original_filename;
222	# print STDERR "**** utf8_encoded_link to original src filename:\n\t$utf8_link\n\t".$self->{'utf8_to_original_filename'}->{$utf8_link}."\n";
223
224	if ($url_original_filename ne $utf8_link) {
225	my $outhandle = $self->{'outhandle'};
226
227	print $outhandle "URL Encoding $url_original_filename\n";
228	print $outhandle " ->$utf8_link\n";
229	}
230
231	$block_hash->{'file_blocks'}->{$url_original_filename} = 1;
232	}
233	}
234
235	# Given a filename in any encoding, will URL decode it to get back the original filename
236	# in the original encoding. Because this method is intended to work out the original
237	# filename, it does not URL decode any filename if a file by the name of the URL-encoded*
238	# string already exists in the local folder.
239	# Return the original filename corresponding to the parameter URL-encoded filename, and
240	# a decoded flag that is set to true iff URL-decoding had to be applied.
241	sub opt_url_decode {
242	my $self = shift (@_);
243	my ($link) = @_;
244
245	# Replace %XX's in URL with decoded value if required.
246	# Note that the filename may include the %XX in some situations
247	if ($link =~ m/\%[A-F0-9]{2}/i) {
248	if (!-e $link) {
249	$link = &unicode::url_decode($link);
250	}
251	}
252
253	return $link;
254	}
255
256	sub read_into_doc_obj
257	{
258	my $self = shift (@_);
259	my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
260
261	# get the input file
262	my $input_filename = $file;
263	my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
264	$suffix = lc($suffix);
265
266	if (($self->{'use_realistic_book'}) \|\| ($self->{'old_style_HDL'}))
267	{
268	# because the document has to be sectionalized set the description tags
269	$self->{'description_tags'} = 1;
270
271	# set the file to be tidied
272	$input_filename = &util::filename_cat($base_dir,$file) if $base_dir =~ m/\w/;
273
274	# get the tidied file
275	#my $tidy_filename = $self->tmp_tidy_file($input_filename);
276	my $tidy_filename = $self->convert_tidy_or_oldHDL_file($input_filename);
277
278	# derive tmp filename from input filename
279	my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($tidy_filename, "\\.[^\\.]+\$");
280
281	# set the new input file and base_dir to be from the tidied file
282	$file = "$tailname$suffix";
283	$base_dir = $dirname;
284	}
285
286	# call the parent read_into_doc_obj
287	my ($process_status,$doc_obj) = $self->SUPER::read_into_doc_obj($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli);
288
289	return ($process_status,$doc_obj);
290	}
291
292	# do plugin specific processing of doc_obj
293	sub process {
294	my $self = shift (@_);
295	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
296	my $outhandle = $self->{'outhandle'};
297
298	if ($ENV{'GSDLOS'} =~ m/^windows/i) {
299	# this makes life so much easier... perl can cope with unix-style '/'s.
300	$base_dir =~ s@(\\)+@/@g;
301	$file =~ s@(\\)+@/@g;
302	}
303
304	# reset per-doc stuff...
305	$self->{'aux_files'} = {};
306	$self->{'dir_num'} = 0;
307	$self->{'file_num'} = 0;
308
309	# process an HTML file where sections are divided by headings tags (H1, H2 ...)
310	# you can also include metadata in the format (X can be any number)
311	# <hX>Title<!--gsdl-metadata
312	# <Metadata name="name1">value1</Metadata>
313	# ...
314	# <Metadata name="nameN">valueN</Metadata>
315	#--></hX>
316	if ($self->{'sectionalise_using_h_tags'}) {
317	# description_tags should allways be activated because we convert headings to description tags
318	$self->{'description_tags'} = 1;
319
320	my $arrSections = [];
321	$$textref =~ s/<h([0-9]+)[^>]>(.?)<\/h[0-9]+>/$self->process_heading($1, $2, $arrSections, $file)/isge;
322
323	if (scalar(@$arrSections)) {
324	my $strMetadata = $self->update_section_data($arrSections, -1);
325	if (length($strMetadata)) {
326	$strMetadata = '<!--' . $strMetadata . "\n-->\n</body>";
327	$$textref =~ s/<\/body>/$strMetadata/ig;
328	}
329	}
330	}
331
332	my $cursection = $doc_obj->get_top_section();
333
334	$self->extract_metadata ($textref, $metadata, $doc_obj, $cursection)
335	unless $self->{'no_metadata'} \|\| $self->{'description_tags'};
336
337	# Store URL for page as metadata - this can be used for an
338	# altavista style search interface. The URL won't be valid
339	# unless the file structure contains the domain name (i.e.
340	# like when w3mir is used to download a website).
341
342	# URL metadata (even invalid ones) are used to support internal
343	# links, so even if 'file_is_url' is off, still need to store info
344
345	my ($tailname,$dirname,$suffix) = &File::Basename::fileparse($file, "\\.[^\\.]+\$");
346	my $utf8_file = $self->filename_to_utf8_metadata($file);
347	$utf8_file =~ s/&\#095;/_/g;
348	my $web_url = "http://";
349	if(defined $dirname) { # local directory
350	$dirname = $self->eval_dir_dots($dirname);
351	$dirname .= &util::get_dirsep() if $dirname ne ""; # if there's a directory, it should end on "/"
352	$web_url = $web_url.$dirname.$utf8_file;
353	} else {
354	$web_url = $web_url.$utf8_file;
355	}
356	$web_url =~ s/\\/\//g;
357	$doc_obj->add_utf8_metadata($cursection, "URL", $web_url);
358
359	if ($self->{'file_is_url'}) {
360	$doc_obj->add_metadata($cursection, "weblink", "<a href=\"$web_url\">");
361	$doc_obj->add_metadata($cursection, "webicon", "_iconworld_");
362	$doc_obj->add_metadata($cursection, "/weblink", "</a>");
363	}
364
365	if ($self->{'description_tags'}) {
366	# remove the html header - note that doing this here means any
367	# sections defined within the header will be lost (so all <Section>
368	# tags must appear within the body of the HTML)
369	my ($head_keep) = ($$textref =~ m/^(.?)<body[^>]>/is);
370
371	$$textref =~ s/^.?<body[^>]>//is;
372	$$textref =~ s/(<\/body[^>]>\|<\/html[^>]>)//isg;
373
374	my $opencom = '(?:<!--\|<!(?:—\|\|--))';
375	my $closecom = '(?:-->\|(?:—\|\|--)>)';
376
377	my $lt = '(?:<\|<)';
378	my $gt = '(?:>\|>)';
379	my $quot = '(?:"\|"\|”\|“)';
380
381	my $dont_strip = '';
382	if ($self->{'no_strip_metadata_html'}) {
383	($dont_strip = $self->{'no_strip_metadata_html'}) =~ s{,}{\|}g;
384	}
385
386	my $found_something = 0; my $top = 1;
387	while ($$textref =~ s/^(.?)$opencom(.?)$closecom//s) {
388	my $text = $1;
389	my $comment = $2;
390	if (defined $text) {
391	# text before a comment - note that getting to here
392	# doesn't necessarily mean there are Section tags in
393	# the document
394	$self->process_section(\$text, $base_dir, $file, $doc_obj, $cursection);
395	}
396	while ($comment =~ s/$lt(.*?)$gt//s) {
397	my $tag = $1;
398	if ($tag eq "Section") {
399	$found_something = 1;
400	$cursection = $doc_obj->insert_section($doc_obj->get_end_child($cursection)) unless $top;
401	$top = 0;
402	} elsif ($tag eq "/Section") {
403	$found_something = 1;
404	$cursection = $doc_obj->get_parent_section ($cursection);
405	} elsif ($tag =~ m/^Metadata name=$quot(.*?)$quot/s) {
406	my $metaname = $1;
407	my $accumulate = $tag =~ m/mode=${quot}accumulate${quot}/ ? 1 : 0;
408	$comment =~ s/^(.*?)$lt\/Metadata$gt//s;
409	my $metavalue = $1;
410	$metavalue =~ s/^\s+//;
411	$metavalue =~ s/\s+$//;
412	# assume that no metadata value intentionally includes
413	# carriage returns or HTML tags (if they're there they
414	# were probably introduced when converting to HTML from
415	# some other format).
416	# actually some people want to have html tags in their
417	# metadata.
418	$metavalue =~ s/[\cJ\cM]/ /sg;
419	$metavalue =~ s/<[^>]+>//sg
420	unless $dont_strip && ($dont_strip eq 'all' \|\| $metaname =~ m/^($dont_strip)$/);
421	$metavalue =~ s/\s+/ /sg;
422	if ($accumulate) {
423	$doc_obj->add_utf8_metadata($cursection, $metaname, $metavalue);
424	} else {
425	$doc_obj->set_utf8_metadata_element($cursection, $metaname, $metavalue);
426	}
427	} elsif ($tag eq "Description" \|\| $tag eq "/Description") {
428	# do nothing with containing Description tags
429	} else {
430	# simple HTML tag (probably created by the conversion
431	# to HTML from some other format) - we'll ignore it and
432	# hope for the best ;-)
433	}
434	}
435	}
436	if ($cursection ne "") {
437	print $outhandle "HTMLPlugin: WARNING: $file contains unmatched <Section></Section> tags\n";
438	}
439
440	$$textref =~ s/^.?<body[^>]>//is;
441	$$textref =~ s/(<\/body[^>]>\|<\/html[^>]>)//isg;
442	if ($$textref =~ m/\S/) {
443	if (!$found_something) {
444	if ($self->{'verbosity'} > 2) {
445	print $outhandle "HTMLPlugin: WARNING: $file appears to contain no Section tags so\n";
446	print $outhandle " will be processed as a single section document\n";
447	}
448
449	# go ahead and process single-section document
450	$self->process_section($textref, $base_dir, $file, $doc_obj, $cursection);
451
452	# if document contains no Section tags we'll go ahead
453	# and extract metadata (this won't have been done
454	# above as the -description_tags option prevents it)
455	my $complete_text = $head_keep.$doc_obj->get_text($cursection);
456	$self->extract_metadata (\$complete_text, $metadata, $doc_obj, $cursection)
457	unless $self->{'no_metadata'};
458
459	} else {
460	print $outhandle "HTMLPlugin: WARNING: $file contains the following text outside\n";
461	print $outhandle " of the final closing </Section> tag. This text will\n";
462	print $outhandle " be ignored.";
463
464	my ($text);
465	if (length($$textref) > 30) {
466	$text = substr($$textref, 0, 30) . "...";
467	} else {
468	$text = $$textref;
469	}
470	$text =~ s/\n/ /isg;
471	print $outhandle " ($text)\n";
472	}
473	} elsif (!$found_something) {
474
475	if ($self->{'verbosity'} > 2) {
476	# may get to here if document contained no valid Section
477	# tags but did contain some comments. The text will have
478	# been processed already but we should print the warning
479	# as above and extract metadata
480	print $outhandle "HTMLPlugin: WARNING: $file appears to contain no Section tags and\n";
481	print $outhandle " is blank or empty. Metadata will be assigned if present.\n";
482	}
483
484	my $complete_text = $head_keep.$doc_obj->get_text($cursection);
485	$self->extract_metadata (\$complete_text, $metadata, $doc_obj, $cursection)
486	unless $self->{'no_metadata'};
487	}
488
489	} else {
490
491	# remove header and footer
492	if (!$self->{'keep_head'} \|\| $self->{'description_tags'}) {
493	$$textref =~ s/^.?<body[^>]>//is;
494	$$textref =~ s/(<\/body[^>]>\|<\/html[^>]>)//isg;
495	}
496
497	# single section document
498	$self->process_section($textref, $base_dir, $file, $doc_obj, $cursection);
499	}
500	return 1;
501	}
502
503
504	sub process_heading
505	{
506	my ($self, $nHeadNo, $strHeadingText, $arrSections, $file) = @_;
507	$strHeadingText = '' if (!defined($strHeadingText));
508
509	my $strMetadata = $self->update_section_data($arrSections, int($nHeadNo));
510
511	my $strSecMetadata = '';
512	while ($strHeadingText =~ s/<!--gsdl-metadata(.*?)-->//is)
513	{
514	$strSecMetadata .= $1;
515	}
516
517	$strHeadingText =~ s/^\s+//g;
518	$strHeadingText =~ s/\s+$//g;
519	$strSecMetadata =~ s/^\s+//g;
520	$strSecMetadata =~ s/\s+$//g;
521
522	$strMetadata .= "\n<Section>\n\t<Description>\n\t\t<Metadata name=\"Title\">" . $strHeadingText . "</Metadata>\n";
523
524	if (length($strSecMetadata)) {
525	$strMetadata .= "\t\t" . $strSecMetadata . "\n";
526	}
527
528	$strMetadata .= "\t</Description>\n";
529
530	return "<!--" . $strMetadata . "-->";
531	}
532
533
534	sub update_section_data
535	{
536	my ($self, $arrSections, $nCurTocNo) = @_;
537	my ($strBuffer, $nLast, $nSections) = ('', 0, scalar(@$arrSections));
538
539	if ($nSections == 0) {
540	push @$arrSections, $nCurTocNo;
541	return $strBuffer;
542	}
543	$nLast = $arrSections->[$nSections - 1];
544	if ($nCurTocNo > $nLast) {
545	push @$arrSections, $nCurTocNo;
546	return $strBuffer;
547	}
548	for(my $i = $nSections - 1; $i >= 0; $i--) {
549	if ($nCurTocNo <= $arrSections->[$i]) {
550	$strBuffer .= "\n</Section>";
551	pop @$arrSections;
552	}
553	}
554	push @$arrSections, $nCurTocNo;
555	return $strBuffer;
556	}
557
558
559	# note that process_section may be called multiple times for a single
560	# section (relying on the fact that add_utf8_text appends the text to any
561	# that may exist already).
562	sub process_section {
563	my $self = shift (@_);
564	my ($textref, $base_dir, $file, $doc_obj, $cursection) = @_;
565	# trap links
566	if (!$self->{'nolinks'}) {
567	# usemap="./#index" not handled correctly => change to "#index"
568	## $$textref =~ s/(<img[^>]?usemap\s=\s[\"\']?)([^\"\'>\s]+)([\"\']?[^>]>)/
569
570	$$textref =~ s/(<img[^>]?usemap\s=\s)((?:[\"][^\"]+[\"])\|(?:[\'][^\']+[\'])\|(?:[^\s\/>]+))([^>]>)/
571	$self->replace_usemap_links($1, $2, $3)/isge;
572
573	## $$textref =~ s/(<(?:a\|area\|frame\|link\|script)\s+[^>]?\s(?:href\|src)\s=\s[\"\']?)([^\"\'>\s]+)([\"\']?[^>]*>)/
574
575	$$textref =~ s/(<(?:a\|area\|frame\|link\|script)\s+[^>]?\s(?:href\|src)\s=\s)((?:[\"][^\"]+[\"])\|(?:[\'][^\']+[\'])\|(?:[^\s\/>]+))([^>]*>)/
576	$self->replace_href_links ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
577	}
578
579	# trap images
580
581	# Previously, by default, HTMLPlugin would embed <img> tags inside anchor tags
582	# i.e. <a href="image><img src="image"></a> in order to overcome a problem that
583	# turned regular text succeeding images into links. That is, by embedding <imgs>
584	# inside <a href=""></a>, the text following images were no longer misbehaving.
585	# However, there would be many occasions whereby images were not meant to link
586	# to their source images but where the images would link to another web page.
587	# To allow this, the no_image_links option was introduced: it would prevent
588	# the behaviour of embedding images into links that referenced the source images.
589
590	# Somewhere along the line, the problem of normal text turning into links when
591	# such text followed images which were not embedded in <a href=""></a> ceased
592	# to occur. This is why the following lines have been commented out (as well as
593	# two lines in replace_images). They appear to no longer apply.
594
595	# If at any time, there is a need for having images embedded in <a> anchor tags,
596	# then it might be better to turn that into an HTMLPlugin option rather than make
597	# it the default behaviour. Also, eventually, no_image_links needs to become
598	# a deprecated option for HTMLPlugin as it has now become the default behaviour.
599
600	#if(!$self->{'no_image_links'}){
601	$$textref =~ s/(<(?:img\|embed\|table\|tr\|td)[^>]?(?:src\|background)\s=\s)((?:[\"][^\"]+[\"])\|(?:[\'][^\']+[\'])\|(?:[^\s\/>]+))([^>]>)/
602	$self->replace_images ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
603	#}
604
605	# add text to document object
606	# turn \ into \\ so that the rest of greenstone doesn't think there
607	# is an escape code following. (Macro parsing loses them...)
608	$$textref =~ s/\\/\\\\/go;
609
610	$doc_obj->add_utf8_text($cursection, $$textref);
611	}
612
613	sub replace_images {
614	my $self = shift (@_);
615	my ($front, $link, $back, $base_dir,
616	$file, $doc_obj, $section) = @_;
617
618	# remove quotes from link at start and end if necessary
619	if ($link=~/^[\"\']/) {
620	$link=~s/^[\"\']//;
621	$link=~s/[\"\']$//;
622	$front.='"';
623	$back="\"$back";
624	}
625
626	$link =~ s/\n/ /g;
627
628	# Hack to overcome Windows wv 0.7.1 bug that causes embedded images to be broken
629	# If the Word file path has spaces in it, wv messes up and you end up with
630	# absolute paths for the images, and without the "file://" prefix
631	# So check for this special case and massage the data to be correct
632	if ($ENV{'GSDLOS'} =~ m/^windows/i && $self->{'plugin_type'} eq "WordPlug" && $link =~ m/^[A-Za-z]\:\\/) {
633	$link =~ s/^.*\\([^\\]+)$/$1/;
634	}
635
636	my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
637
638	my $img_file = $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section);
639
640	# print STDERR "** link = $link\n href = $href\n** img_file = $img_file, rl = $rl\n";
641
642	my $anchor_name = $img_file;
643	#$anchor_name =~ s/^.*\///;
644	#$anchor_name = "<a name=\"$anchor_name\" ></a>";
645
646	my $image_link = $front . $img_file .$back;
647	return $image_link;
648
649	# The reasons for why the following two lines are no longer necessary can be
650	# found in subroutine process_section
651	#my $anchor_link = "<a href=\"$img_file\" >".$image_link."</a>";
652	#return $anchor_link;
653
654	#return $front . $img_file . $back . $anchor_name;
655	}
656
657	sub replace_href_links {
658	my $self = shift (@_);
659	my ($front, $link, $back, $base_dir, $file, $doc_obj, $section) = @_;
660
661	# remove quotes from link at start and end if necessary
662	if ($link=~/^[\"\']/) {
663	$link=~s/^[\"\']//;
664	$link=~s/[\"\']$//;
665	$front.='"';
666	$back="\"$back";
667	}
668
669	# attempt to sort out targets - frames are not handled
670	# well in this plugin and some cases will screw things
671	# up - e.g. the _parent target (so we'll just remove
672	# them all ;-)
673	$front =~ s/(target=\"?)_top(\"?)/$1_gsdltop_$2/is;
674	$back =~ s/(target=\"?)_top(\"?)/$1_gsdltop_$2/is;
675	$front =~ s/target=\"?_parent\"?//is;
676	$back =~ s/target=\"?_parent\"?//is;
677
678	return $front . $link . $back if $link =~ m/^\#/s;
679	$link =~ s/\n/ /g;
680
681	# Find file referred to by $link on file system
682	# This is more complicated than it sounds when char encodings
683	# is taken in to account
684	my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
685
686	# href may use '\'s where '/'s should be on Windows
687	$href =~ s/\\/\//g;
688	my ($filename) = $href =~ m/^(?:.?):(?:\/\/)?(.)/;
689
690
691	##### leave all these links alone (they won't be picked up by intermediate
692	##### pages). I think that's safest when dealing with frames, targets etc.
693	##### (at least until I think of a better way to do it). Problems occur with
694	##### mailto links from within small frames, the intermediate page is displayed
695	##### within that frame and can't be seen. There is still potential for this to
696	##### happen even with html pages - the solution seems to be to somehow tell
697	##### the browser from the server side to display the page being sent (i.e.
698	##### the intermediate page) in the top level window - I'm not sure if that's
699	##### possible - the following line should probably be deleted if that can be done
700	return $front . $link . $back if $href =~ m/^(mailto\|news\|gopher\|nntp\|telnet\|javascript):/is;
701
702
703	if (($rl == 0) \|\| ($filename =~ m/$self->{'process_exp'}/) \|\|
704	($href =~ m/\/$/) \|\| ($href =~ m/^(mailto\|news\|gopher\|nntp\|telnet\|javascript):/i)) {
705	&ghtml::urlsafe ($href);
706	return $front . "_httpextlink_&rl=" . $rl . "&href=" . $href . $hash_part . $back;
707	} else {
708	# link is to some other type of file (eg image) so we'll
709	# need to associate that file
710	return $front . $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section) . $back;
711	}
712	}
713
714	sub add_file {
715	my $self = shift (@_);
716	my ($href, $rl, $hash_part, $base_dir, $doc_obj, $section) = @_;
717	my ($newname);
718
719	my $filename = $href;
720	if ($base_dir eq "") {
721	# remove http:/ thereby leaving one slash at the start
722	$filename =~ s/^[^:]*:\///;
723	}
724	else {
725	# remove http://
726	$filename =~ s/^[^:]*:\/\///;
727	}
728
729	$filename = &util::filename_cat($base_dir, $filename);
730	# Replace %XX's in URL with decoded value if required. Note that the filename may include the %XX in some
731	# situations. If the original file's name was in URL encoding, the following method will not decode it.
732	my $utf8_filename = $filename;
733	$filename = $self->opt_url_decode($utf8_filename);
734
735	# some special processing if the intended filename was converted to utf8, but
736	# the actual file still needs to be renamed
737	if (!-e $filename) {
738	# try the original filename stored in map
739	my $original_filename = $self->{'utf8_to_original_filename'}->{$filename};
740	if (defined $original_filename && -e $original_filename) {
741	$filename = $original_filename;
742	}
743	}
744
745	my ($ext) = $filename =~ m/(\.[^\.]*)$/;
746
747	if ($rl == 0) {
748	if ((!defined $ext) \|\| ($ext !~ m/$self->{'assoc_files'}/)) {
749	return "_httpextlink_&rl=0&el=prompt&href=" . $href . $hash_part;
750	}
751	else {
752	return "_httpextlink_&rl=0&el=direct&href=" . $href . $hash_part;
753	}
754	}
755
756	if ((!defined $ext) \|\| ($ext !~ m/$self->{'assoc_files'}/)) {
757	return "_httpextlink_&rl=" . $rl . "&href=" . $href . $hash_part;
758	}
759	# add the original image file as a source file
760	$doc_obj->associate_source_file($filename);
761	if ($self->{'rename_assoc_files'}) {
762	if (defined $self->{'aux_files'}->{$href}) {
763	$newname = $self->{'aux_files'}->{$href}->{'dir_num'} . "/" .
764	$self->{'aux_files'}->{$href}->{'file_num'} . $ext;
765	} else {
766	$newname = $self->{'dir_num'} . "/" . $self->{'file_num'} . $ext;
767	$self->{'aux_files'}->{$href} = {'dir_num' => $self->{'dir_num'}, 'file_num' => $self->{'file_num'}};
768	$self->inc_filecount ();
769	}
770	$doc_obj->associate_file($filename, $newname, undef, $section);
771	return "_httpdocimg_/$newname";
772	} else {
773	if(&unicode::is_url_encoded($utf8_filename)) {
774	# use the possibly-decoded filename instead to avoid double URL encoding
775	($newname) = $filename =~ m/([^\/\\]*)$/;
776	} else {
777	($newname) = $utf8_filename =~ m/([^\/\\]*)$/;
778	}
779
780	# Make sure this name uses only ASCII characters.
781	# We use either base64 or URL encoding, as these preserve original encoding
782	$newname = &util::rename_file($newname, $self->{'file_rename_method'});
783
784	$doc_obj->associate_file($filename, $newname, undef, $section);
785
786	# Since the generated image will be URL-encoded to avoid file-system/browser mess-ups
787	# of filenames, URL-encode the additional percent signs of the URL-encoded filename
788	my $newname_url = $newname;
789	$newname_url = &unicode::filename_to_url($newname_url);
790	return "_httpdocimg_/$newname_url";
791	}
792	}
793
794
795	sub format_link {
796	my $self = shift (@_);
797	my ($link, $base_dir, $file) = @_;
798
799	my ($before_hash, $hash_part) = $link =~ m/^([^\#])(\#?.)$/;
800
801	$hash_part = "" if !defined $hash_part;
802	if (!defined $before_hash \|\| $before_hash !~ m/[\w\.\/]/) {
803	my $outhandle = $self->{'outhandle'};
804	print $outhandle "HTMLPlugin: ERROR - badly formatted tag ignored ($link)\n"
805	if $self->{'verbosity'};
806	return ($link, "", 0);
807	}
808
809	if ($before_hash =~ s@^((?:http\|https\|ftp\|file\|mms)://)@@i) {
810	my $type = $1;
811
812	if ($link =~ m/^(http\|ftp):/i) {
813	# Turn url (using /) into file name (possibly using \ on windows)
814	my @http_dir_split = split('/', $before_hash);
815	$before_hash = &util::filename_cat(@http_dir_split);
816	}
817
818	$before_hash = $self->eval_dir_dots($before_hash);
819
820	my $linkfilename = &util::filename_cat ($base_dir, $before_hash);
821
822	my $rl = 0;
823	$rl = 1 if (-e $linkfilename);
824
825	# make sure there's a slash on the end if it's a directory
826	if ($before_hash !~ m/\/$/) {
827	$before_hash .= "/" if (-d $linkfilename);
828	}
829	return ($type . $before_hash, $hash_part, $rl);
830
831	} elsif ($link !~ m/^(mailto\|news\|gopher\|nntp\|telnet\|javascript):/i && $link !~ m/^\//) {
832
833	if ($before_hash =~ s@^/@@ \|\| $before_hash =~ m/\\/) {
834
835	# the first directory will be the domain name if file_is_url
836	# to generate archives, otherwise we'll assume all files are
837	# from the same site and base_dir is the root
838
839	if ($self->{'file_is_url'}) {
840	my @dirs = split /[\/\\]/, $file;
841	my $domname = shift (@dirs);
842	$before_hash = &util::filename_cat($domname, $before_hash);
843	$before_hash =~ s@\\@/@g; # for windows
844	}
845	else
846	{
847	# see if link shares directory with source document
848	# => turn into relative link if this is so!
849
850	if ($ENV{'GSDLOS'} =~ m/^windows/i) {
851	# too difficult doing a pattern match with embedded '\'s...
852	my $win_before_hash=$before_hash;
853	$win_before_hash =~ s@(\\)+@/@g;
854	# $base_dir is already similarly "converted" on windows.
855	if ($win_before_hash =~ s@^$base_dir/@@o) {
856	# if this is true, we removed a prefix
857	$before_hash=$win_before_hash;
858	}
859	}
860	else {
861	# before_hash has lost leading slash by this point,
862	# -> add back in prior to substitution with $base_dir
863	$before_hash = "/$before_hash";
864
865	$before_hash = &util::filename_cat("",$before_hash);
866	$before_hash =~ s@^$base_dir/@@;
867	}
868	}
869	} else {
870	# Turn relative file path into full path
871	my $dirname = &File::Basename::dirname($file);
872	$before_hash = &util::filename_cat($dirname, $before_hash);
873	$before_hash = $self->eval_dir_dots($before_hash);
874	}
875
876	my $linkfilename = &util::filename_cat ($base_dir, $before_hash);
877	# make sure there's a slash on the end if it's a directory
878	if ($before_hash !~ m/\/$/) {
879	$before_hash .= "/" if (-d $linkfilename);
880	}
881	return ("http://" . $before_hash, $hash_part, 1);
882	} else {
883	# mailto, news, nntp, telnet, javascript or gopher link
884	return ($before_hash, "", 0);
885	}
886	}
887
888	sub extract_first_NNNN_characters {
889	my $self = shift (@_);
890	my ($textref, $doc_obj, $thissection) = @_;
891
892	foreach my $size (split /,/, $self->{'first'}) {
893	my $tmptext = $$textref;
894	# skip to the body
895	$tmptext =~ s/.<body[^>]>//i;
896	# remove javascript
897	$tmptext =~ s@<script.*?</script>@ @sig;
898	$tmptext =~ s/<[^>]*>/ /g;
899	$tmptext =~ s/ / /g;
900	$tmptext =~ s/^\s+//;
901	$tmptext =~ s/\s+$//;
902	$tmptext =~ s/\s+/ /gs;
903	$tmptext = &unicode::substr ($tmptext, 0, $size);
904	$tmptext =~ s/\s\S*$/…/; # adds an ellipse (...)
905	$doc_obj->add_utf8_metadata ($thissection, "First$size", $tmptext);
906	}
907	}
908
909
910	sub extract_metadata {
911	my $self = shift (@_);
912	my ($textref, $metadata, $doc_obj, $section) = @_;
913	my $outhandle = $self->{'outhandle'};
914	# if we don't want metadata, we may as well not be here ...
915	return if (!defined $self->{'metadata_fields'});
916
917	# metadata fields to extract/save. 'key' is the (lowercase) name of the
918	# html meta, 'value' is the metadata name for greenstone to use
919	my %find_fields = ();
920
921	my %creator_fields = (); # short-cut for lookups
922
923
924	foreach my $field (split /,/, $self->{'metadata_fields'}) {
925	$field =~ s/^\s+//; # remove leading whitespace
926	$field =~ s/\s+$//; # remove trailing whitespace
927
928	# support tag<tagname>
929	if ($field =~ m/^(.?)\s<(.*?)>$/) {
930	# "$2" is the user's preferred gs metadata name
931	$find_fields{lc($1)}=$2; # lc = lowercase
932	} else { # no <tagname> for mapping
933	# "$field" is the user's preferred gs metadata name
934	$find_fields{lc($field)}=$field; # lc = lowercase
935	}
936	}
937
938	if (defined $self->{'hunt_creator_metadata'} &&
939	$self->{'hunt_creator_metadata'} == 1 ) {
940	my @extra_fields =
941	(
942	'author',
943	'author.email',
944	'creator',
945	'dc.creator',
946	'dc.creator.corporatename',
947	);
948
949	# add the creator_metadata fields to search for
950	foreach my $field (@extra_fields) {
951	$creator_fields{$field}=0; # add to lookup hash
952	}
953	}
954
955
956	# find the header in the html file, which has the meta tags
957	$$textref =~ m@<head>(.*?)</head>@si;
958
959	my $html_header=$1;
960
961	# go through every <meta... tag defined in the html and see if it is
962	# one of the tags we want to match.
963
964	# special case for title - we want to remember if its been found
965	my $found_title = 0;
966	# this assumes that ">" won't appear. (I don't think it's allowed to...)
967	$html_header =~ m/^/; # match the start of the string, for \G assertion
968
969	while ($html_header =~ m/\G.?<meta(.?)>/sig) {
970	my $metatag=$1;
971	my ($tag, $value);
972
973	# find the tag name
974	$metatag =~ m/(?:name\|http-equiv)\s=\s([\"\'])?(.*?)\1/is;
975	$tag=$2;
976	# in case they're not using " or ', but they should...
977	if (! $tag) {
978	$metatag =~ m/(?:name\|http-equiv)\s=\s([^\s\>]+)/is;
979	$tag=$1;
980	}
981
982	if (!defined $tag) {
983	print $outhandle "HTMLPlugin: can't find NAME in \"$metatag\"\n";
984	next;
985	}
986
987	# don't need to assign this field if it was passed in from a previous
988	# (recursive) plugin
989	if (defined $metadata->{$tag}) {next}
990
991	# find the tag content
992	$metatag =~ m/content\s=\s([\"\'])?(.*?)\1/is;
993	$value=$2;
994
995	if (! $value) {
996	$metatag =~ m/(?:name\|http-equiv)\s=\s([^\s\>]+)/is;
997	$value=$1;
998	}
999	if (!defined $value) {
1000	print $outhandle "HTMLPlugin: can't find VALUE in \"$metatag\"\n";
1001	next;
1002	}
1003
1004	# clean up and add
1005	$value =~ s/\s+/ /gs;
1006	chomp($value); # remove trailing \n, if any
1007	if (exists $creator_fields{lc($tag)}) {
1008	# map this value onto greenstone's "Creator" metadata
1009	$tag='Creator';
1010	} elsif (!exists $find_fields{lc($tag)}) {
1011	next; # don't want this tag
1012	} else {
1013	# get the user's preferred capitalisation
1014	$tag = $find_fields{lc($tag)};
1015	}
1016	if (lc($tag) eq "title") {
1017	$found_title = 1;
1018	}
1019
1020	if ($self->{'verbosity'} > 2) {
1021	print $outhandle " extracted \"$tag\" metadata \"$value\"\n";
1022	}
1023
1024	# Do we still reply on the following? Surely there must
1025	# be a better way to go about this outside of the plugin?
1026	#
1027	#if ($tag =~ m/date.*/i){
1028	# $tag = lc($tag);
1029	#}
1030
1031	$doc_obj->add_utf8_metadata($section, $tag, $value);
1032
1033	}
1034
1035	# TITLE: extract the document title
1036	if (exists $find_fields{'title'} && !$found_title) {
1037	# we want a title, and didn't find one in the meta tags
1038	# see if there's a <title> tag
1039	my $title;
1040	my $from = ""; # for debugging output only
1041	if ($html_header =~ m/<title[^>]>([^<]+)<\/title[^>]>/is) {
1042	$title = $1;
1043	$from = "<title> tags";
1044	}
1045
1046	if (!defined $title) {
1047	$from = "first 100 chars";
1048	# if no title use first 100 or so characters
1049	$title = $$textref;
1050	$title =~ s/^\xFE\xFF//; # Remove unicode byte order mark
1051	$title =~ s/^.*?<body>//si;
1052	# ignore javascript!
1053	$title =~ s@<script.*?</script>@ @sig;
1054	$title =~ s/<\/([^>]+)><\1>//g; # (eg) </b><b> - no space
1055	$title =~ s/<[^>]*>/ /g; # remove all HTML tags
1056	$title = substr ($title, 0, 100);
1057	$title =~ s/\s\S*$/.../;
1058	}
1059	$title =~ s/<[^>]*>/ /g; # remove html tags
1060	$title =~ s/ / /g;
1061	$title =~ s/(?: \|\xc2\xa0)/ /g; # utf-8 for nbsp...
1062	$title =~ s/\s+/ /gs; # collapse multiple spaces
1063	$title =~ s/^\s*//; # remove leading spaces
1064	$title =~ s/\s*$//; # remove trailing spaces
1065
1066	$title =~ s/^$self->{'title_sub'}// if ($self->{'title_sub'});
1067	$title =~ s/^\s+//s; # in case title_sub introduced any...
1068	$doc_obj->add_utf8_metadata ($section, 'Title', $title);
1069	print $outhandle " extracted Title metadata \"$title\" from $from\n"
1070	if ($self->{'verbosity'} > 2);
1071	}
1072
1073	# add FileFormat metadata
1074	$doc_obj->add_metadata($section,"FileFormat", "HTML");
1075
1076	# Special, for metadata names such as tagH1 - extracts
1077	# the text between the first <H1> and </H1> tags into "H1" metadata.
1078
1079	foreach my $field (keys %find_fields) {
1080	if ($field !~ m/^tag([a-z0-9]+)$/i) {next}
1081	my $tag = $1;
1082	if ($$textref =~ m@<$tag[^>]>(.?)</$tag[^>]*>@g) {
1083	my $content = $1;
1084	$content =~ s/ / /g;
1085	$content =~ s/<[^>]*>/ /g;
1086	$content =~ s/^\s+//;
1087	$content =~ s/\s+$//;
1088	$content =~ s/\s+/ /gs;
1089	if ($content) {
1090	$tag=$find_fields{"tag$tag"}; # get the user's capitalisation
1091	$tag =~ s/^tag//i;
1092	$doc_obj->add_utf8_metadata ($section, $tag, $content);
1093	print $outhandle " extracted \"$tag\" metadata \"$content\"\n"
1094	if ($self->{'verbosity'} > 2);
1095	}
1096	}
1097	}
1098	}
1099
1100
1101	# evaluate any "../" to next directory up
1102	# evaluate any "./" as here
1103	sub eval_dir_dots {
1104	my $self = shift (@_);
1105	my ($filename) = @_;
1106	my $dirsep_os = &util::get_os_dirsep();
1107	my @dirsep = split(/$dirsep_os/,$filename);
1108
1109	my @eval_dirs = ();
1110	foreach my $d (@dirsep) {
1111	if ($d eq "..") {
1112	pop(@eval_dirs);
1113
1114	} elsif ($d eq ".") {
1115	# do nothing!
1116
1117	} else {
1118	push(@eval_dirs,$d);
1119	}
1120	}
1121
1122	# Need to fiddle with number of elements in @eval_dirs if the
1123	# first one is the empty string. This is because of a
1124	# modification to util::filename_cat that supresses the addition
1125	# of a leading '/' character (or \ if windows) (intended to help
1126	# filename cat with relative paths) if the first entry in the
1127	# array is the empty string. Making the array start with two
1128	# empty strings is a way to defeat this "smart" option.
1129	#
1130	if (scalar(@eval_dirs) > 0) {
1131	if ($eval_dirs[0] eq ""){
1132	unshift(@eval_dirs,"");
1133	}
1134	}
1135
1136	my $evaluated_filename = (scalar @eval_dirs > 0) ? &util::filename_cat(@eval_dirs) : "";
1137	return $evaluated_filename;
1138	}
1139
1140	sub replace_usemap_links {
1141	my $self = shift (@_);
1142	my ($front, $link, $back) = @_;
1143
1144	# remove quotes from link at start and end if necessary
1145	if ($link=~/^[\"\']/) {
1146	$link=~s/^[\"\']//;
1147	$link=~s/[\"\']$//;
1148	$front.='"';
1149	$back="\"$back";
1150	}
1151
1152	$link =~ s/^\.\///;
1153	return $front . $link . $back;
1154	}
1155
1156	sub inc_filecount {
1157	my $self = shift (@_);
1158
1159	if ($self->{'file_num'} == 1000) {
1160	$self->{'dir_num'} ++;
1161	$self->{'file_num'} = 0;
1162	} else {
1163	$self->{'file_num'} ++;
1164	}
1165	}
1166
1167
1168	# Extend read_file so that strings like é are
1169	# converted to UTF8 internally.
1170	#
1171	# We don't convert < or > or & or " in case
1172	# they interfere with the GML files
1173
1174	sub read_file {
1175	my $self = shift(@_);
1176	my ($filename, $encoding, $language, $textref) = @_;
1177
1178	$self->SUPER::read_file($filename, $encoding, $language, $textref);
1179
1180	# Convert entities to their UTF8 equivalents
1181	$$textref =~ s/&(lt\|gt\|amp\|quot\|nbsp);/&z$1;/go;
1182	$$textref =~ s/&([^;]+);/&ghtml::getcharequiv($1,1)/gseo;
1183	$$textref =~ s/&z(lt\|gt\|amp\|quot\|nbsp);/&$1;/go;
1184	}
1185
1186	sub HB_read_html_file {
1187	my $self = shift (@_);
1188	my ($htmlfile, $text) = @_;
1189
1190	# load in the file
1191	if (!open (FILE, $htmlfile)) {
1192	print STDERR "ERROR - could not open $htmlfile\n";
1193	return;
1194	}
1195
1196	my $foundbody = 0;
1197	$self->HB_gettext (\$foundbody, $text, "FILE");
1198	close FILE;
1199
1200	# just in case there was no <body> tag
1201	if (!$foundbody) {
1202	$foundbody = 1;
1203	open (FILE, $htmlfile) \|\| return;
1204	$self->HB_gettext (\$foundbody, $text, "FILE");
1205	close FILE;
1206	}
1207	# text is in utf8
1208	}
1209
1210	# converts the text to utf8, as ghtml does that for é etc.
1211	sub HB_gettext {
1212	my $self = shift (@_);
1213	my ($foundbody, $text, $handle) = @_;
1214
1215	my $line = "";
1216	while (defined ($line = <$handle>)) {
1217	# look for body tag
1218	if (!$$foundbody) {
1219	if ($line =~ s/^.<body[^>]>//i) {
1220	$$foundbody = 1;
1221	} else {
1222	next;
1223	}
1224	}
1225
1226	# check for symbol fonts
1227	if ($line =~ m/<font [^>]?face\s=\s*\"?(\w+)\"?/i) {
1228	my $font = $1;
1229	print STDERR "HBPlug::HB_gettext - warning removed font $font\n"
1230	if ($font !~ m/^arial$/i);
1231	}
1232
1233	$$text .= $line;
1234	}
1235
1236	if ($self->{'input_encoding'} eq "iso_8859_1") {
1237	# convert to utf-8
1238	$$text=&unicode::unicode2utf8(&unicode::convert2unicode("iso_8859_1", $text));
1239	}
1240	# convert any alphanumeric character entities to their utf-8
1241	# equivalent for indexing purposes
1242	#&ghtml::convertcharentities ($$text);
1243
1244	$$text =~ s/\s+/ /g; # remove \n's
1245	}
1246
1247	sub HB_clean_section {
1248	my $self = shift (@_);
1249	my ($section) = @_;
1250
1251	# remove tags without a starting tag from the section
1252	my ($tag, $tagstart);
1253	while ($section =~ m/<\/([^>]{1,10})>/) {
1254	$tag = $1;
1255	$tagstart = index($section, "<$tag");
1256	last if (($tagstart >= 0) && ($tagstart < index($section, "<\/$tag")));
1257	$section =~ s/<\/$tag>//;
1258	}
1259
1260	# remove extra paragraph tags
1261	while ($section =~ s/<p\b[^>]>\s<p\b/<p/ig) {}
1262
1263	# remove extra stuff at the end of the section
1264	while ($section =~ s/(<u>\|<i>\|<b>\|<p\b[^>]*>\| \|\s)$//i) {}
1265
1266	# add a newline at the beginning of each paragraph
1267	$section =~ s/(.)\s*<p\b/$1\n\n<p/gi;
1268
1269	# add a newline every 80 characters at a word boundary
1270	# Note: this regular expression puts a line feed before
1271	# the last word in each section, even when it is not
1272	# needed.
1273	$section =~ s/(.{1,80})\s/$1\n/g;
1274
1275	# fix up the image links
1276	$section =~ s/<img[^>]?src=\"?([^\">]+)\"?[^>]>/
1277	<center><img src=\"$1\" \/><\/center><br\/>/ig;
1278	$section =~ s/<<I>>\s*([^\.]+\.(png\|jpg\|gif))/
1279	<center><img src=\"$1\" \/><\/center><br\/>/ig;
1280
1281	return $section;
1282	}
1283
1284	# Will convert the oldHDL format to the new HDL format (using the Section tag)
1285	sub convert_to_newHDLformat
1286	{
1287	my $self = shift (@_);
1288	my ($file,$cnfile) = @_;
1289	my $input_filename = $file;
1290	my $tmp_filename = $cnfile;
1291
1292	# write HTML tmp file with new HDL format
1293	open (PROD, ">$tmp_filename") \|\| die("Error Writing to File: $tmp_filename $!");
1294
1295	# read in the file and do basic html cleaning (removing header etc)
1296	my $html = "";
1297	$self->HB_read_html_file ($input_filename, \$html);
1298
1299	# process the file one section at a time
1300	my $curtoclevel = 1;
1301	my $firstsection = 1;
1302	my $toclevel = 0;
1303	while (length ($html) > 0) {
1304	if ($html =~ s/^.?(?:<p\b[^>]>)?((<b>\|<i>\|<u>\|\s))<<TOC(\d+)>>\s(.*?)<p\b/<p/i) {
1305	$toclevel = $3;
1306	my $title = $4;
1307	my $sectiontext = "";
1308	if ($html =~ s/^(.?)((?:<p\b[^>]>)?((<b>\|<i>\|<u>\|\s)*)<<TOC\d+>>)/$2/i) {
1309	$sectiontext = $1;
1310	} else {
1311	$sectiontext = $html;
1312	$html = "";
1313	}
1314
1315	# remove tags and extra spaces from the title
1316	$title =~ s/<\/?[^>]+>//g;
1317	$title =~ s/^\s+\|\s+$//g;
1318
1319	# close any sections below the current level and
1320	# create a new section (special case for the firstsection)
1321	print PROD "<!--\n";
1322	while (($curtoclevel > $toclevel) \|\|
1323	(!$firstsection && $curtoclevel == $toclevel)) {
1324	$curtoclevel--;
1325	print PROD "</Section>\n";
1326	}
1327	if ($curtoclevel+1 < $toclevel) {
1328	print STDERR "WARNING - jump in toc levels in $input_filename " .
1329	"from $curtoclevel to $toclevel\n";
1330	}
1331	while ($curtoclevel < $toclevel) {
1332	$curtoclevel++;
1333	}
1334
1335	if ($curtoclevel == 1) {
1336	# add the header tag
1337	print PROD "-->\n";
1338	print PROD "<HTML>\n<HEAD>\n<TITLE>$title</TITLE>\n</HEAD>\n<BODY>\n";
1339	print PROD "<!--\n";
1340	}
1341
1342	print PROD "<Section>\n\t<Description>\n\t\t<Metadata name=\"Title\">$title</Metadata>\n\t</Description>\n";
1343
1344	print PROD "-->\n";
1345
1346	# clean up the section html
1347	$sectiontext = $self->HB_clean_section($sectiontext);
1348
1349	print PROD "$sectiontext\n";
1350
1351	} else {
1352	print STDERR "WARNING - leftover text\n" , $self->shorten($html),
1353	"\nin $input_filename\n";
1354	last;
1355	}
1356	$firstsection = 0;
1357	}
1358
1359	print PROD "<!--\n";
1360	while ($curtoclevel > 0) {
1361	$curtoclevel--;
1362	print PROD "</Section>\n";
1363	}
1364	print PROD "-->\n";
1365
1366	close (PROD) \|\| die("Error Closing File: $tmp_filename $!");
1367
1368	return $tmp_filename;
1369	}
1370
1371	sub shorten {
1372	my $self = shift (@_);
1373	my ($text) = @_;
1374
1375	return "\"$text\"" if (length($text) < 100);
1376
1377	return "\"" . substr ($text, 0, 50) . "\" ... \"" .
1378	substr ($text, length($text)-50) . "\"";
1379	}
1380
1381	sub convert_tidy_or_oldHDL_file
1382	{
1383	my $self = shift (@_);
1384	my ($file) = @_;
1385	my $input_filename = $file;
1386
1387	if (-d $input_filename)
1388	{
1389	return $input_filename;
1390	}
1391
1392	# get the input filename
1393	my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
1394	my $base_dirname = $dirname;
1395	$suffix = lc($suffix);
1396
1397	# derive tmp filename from input filename
1398	# Remove any white space from filename -- no risk of name collision, and
1399	# makes later conversion by utils simpler. Leave spaces in path...
1400	# tidy up the filename with space, dot, hyphen between
1401	$tailname =~ s/\s+//g;
1402	$tailname =~ s/\.+//g;
1403	$tailname =~ s/\-+//g;
1404	# convert to utf-8 otherwise we have problems with the doc.xml file
1405	# later on
1406	&unicode::ensure_utf8(\$tailname);
1407
1408	# softlink to collection tmp dir
1409	my $tmp_dirname = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "tidytmp");
1410	&util::mk_dir($tmp_dirname) if (!-e $tmp_dirname);
1411
1412	my $test_dirname = "";
1413	my $f_separator = &util::get_os_dirsep();
1414
1415	if ($dirname =~ m/import$f_separator/)
1416	{
1417	$test_dirname = $'; #'
1418
1419	#print STDERR "init $'\n";
1420
1421	while ($test_dirname =~ m/[$f_separator]/)
1422	{
1423	my $folderdirname = $`;
1424	$tmp_dirname = &util::filename_cat($tmp_dirname,$folderdirname);
1425	&util::mk_dir($tmp_dirname) if (!-e $tmp_dirname);
1426	$test_dirname = $'; #'
1427	}
1428	}
1429
1430	my $tmp_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix");
1431
1432	# tidy or convert the input file if it is a HTML-like file or it is accepted by the process_exp
1433	if (($suffix eq ".htm") \|\| ($suffix eq ".html") \|\| ($suffix eq ".shtml"))
1434	{
1435	#convert the input file to a new style HDL
1436	my $hdl_output_filename = $input_filename;
1437	if ($self->{'old_style_HDL'})
1438	{
1439	$hdl_output_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix");
1440	$hdl_output_filename = $self->convert_to_newHDLformat($input_filename,$hdl_output_filename);
1441	}
1442
1443	#just for checking copy all other file from the base dir to tmp dir if it is not exists
1444	opendir(DIR,$base_dirname) or die "Can't open base directory : $base_dirname!";
1445	my @files = grep {!/^\.+$/} readdir(DIR);
1446	close(DIR);
1447
1448	foreach my $file (@files)
1449	{
1450	my $src_file = &util::filename_cat($base_dirname,$file);
1451	my $dest_file = &util::filename_cat($tmp_dirname,$file);
1452	if ((!-e $dest_file) && (!-d $src_file))
1453	{
1454	# just copy the original file back to the tmp directory
1455	copy($src_file,$dest_file) or die "Can't copy file $src_file to $dest_file $!";
1456	}
1457	}
1458
1459	# tidy the input file
1460	my $tidy_output_filename = $hdl_output_filename;
1461	if ($self->{'use_realistic_book'})
1462	{
1463	$tidy_output_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix");
1464	$tidy_output_filename = $self->tmp_tidy_file($hdl_output_filename,$tidy_output_filename);
1465	}
1466	$tmp_filename = $tidy_output_filename;
1467	}
1468	else
1469	{
1470	if (!-e $tmp_filename)
1471	{
1472	# just copy the original file back to the tmp directory
1473	copy($input_filename,$tmp_filename) or die "Can't copy file $input_filename to $tmp_filename $!";
1474	}
1475	}
1476
1477	return $tmp_filename;
1478	}
1479
1480
1481	# Will make the html input file as a proper XML file with removed font tag and
1482	# image size added to the img tag.
1483	# The tidying process takes place in a collection specific 'tmp' directory so
1484	# that we don't accidentally damage the input.
1485	sub tmp_tidy_file
1486	{
1487	my $self = shift (@_);
1488	my ($file,$cnfile) = @_;
1489	my $input_filename = $file;
1490	my $tmp_filename = $cnfile;
1491
1492	# get the input filename
1493	my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
1494
1495	require HTML::TokeParser::Simple;
1496
1497	# create HTML parser to decode the input file
1498	my $parser = HTML::TokeParser::Simple->new($input_filename);
1499
1500	# write HTML tmp file without the font tag and image size are added to the img tag
1501	open (PROD, ">$tmp_filename") \|\| die("Error Writing to File: $tmp_filename $!");
1502	while (my $token = $parser->get_token())
1503	{
1504	# is it an img tag
1505	if ($token->is_start_tag('img'))
1506	{
1507	# get the attributes
1508	my $attr = $token->return_attr;
1509
1510	# get the full path to the image
1511	my $img_file = &util::filename_cat($dirname,$attr->{src});
1512
1513	# set the width and height attribute
1514	($attr->{width}, $attr->{height}) = imgsize($img_file);
1515
1516	# recreate the tag
1517	print PROD "<img";
1518	print PROD map { qq { $_="$attr->{$_}"} } keys %$attr;
1519	print PROD ">";
1520	}
1521	# is it a font tag
1522	else
1523	{
1524	if (($token->is_start_tag('font')) \|\| ($token->is_end_tag('font')))
1525	{
1526	# remove font tag
1527	print PROD "";
1528	}
1529	else
1530	{
1531	# print without changes
1532	print PROD $token->as_is;
1533	}
1534	}
1535	}
1536	close (PROD) \|\| die("Error Closing File: $tmp_filename $!");
1537
1538	# run html-tidy on the tmp file to make it a proper XML file
1539	my $tidyfile = `tidy -utf8 -wrap 0 -asxml "$tmp_filename"`;
1540
1541	# write result back to the tmp file
1542	open (PROD, ">$tmp_filename") \|\| die("Error Writing to File: $tmp_filename $!");
1543	print PROD $tidyfile;
1544	close (PROD) \|\| die("Error Closing File: $tmp_filename $!");
1545
1546	# return the output filename
1547	return $tmp_filename;
1548	}
1549
1550	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: