source: main/trunk/greenstone2/perllib/plugins/HTMLPlugin.pm@ 23335

Last change on this file since 23335 was 23335, checked in by davidb, 13 years ago

Work done on improving handing of filenames when the actualy filename encoding used is not necesarrily known. Tested for Linux. Work currently includes some debug statements that will be removed once testing for Windows and Mac is done.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 56.7 KB
Line 
1###########################################################################
2#
3# HTMLPlugin.pm -- basic html plugin
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 1999 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27#
28# Note that this plugin handles frames only in a very simple way
29# i.e. each frame is treated as a separate document. This means
30# search results will contain links to individual frames rather
31# than linking to the top level frameset.
32# There may also be some problems caused by the _parent target
33# (it's removed by this plugin)
34#
35
36package HTMLPlugin;
37
38use Encode;
39
40use ReadTextFile;
41use HBPlugin;
42use ghtml;
43use unicode;
44use util;
45use XMLParser;
46
47use Image::Size;
48use File::Copy;
49
50sub BEGIN {
51 @HTMLPlugin::ISA = ('ReadTextFile', 'HBPlugin');
52}
53
54use strict; # every perl program should have this!
55no strict 'refs'; # make an exception so we can use variables as filehandles
56
57my $arguments =
58 [ { 'name' => "process_exp",
59 'desc' => "{BasePlugin.process_exp}",
60 'type' => "regexp",
61 'deft' => &get_default_process_exp() },
62 { 'name' => "block_exp",
63 'desc' => "{BasePlugin.block_exp}",
64 'type' => 'regexp',
65 'deft' => &get_default_block_exp() },
66 { 'name' => "nolinks",
67 'desc' => "{HTMLPlugin.nolinks}",
68 'type' => "flag" },
69 { 'name' => "keep_head",
70 'desc' => "{HTMLPlugin.keep_head}",
71 'type' => "flag" },
72 { 'name' => "no_metadata",
73 'desc' => "{HTMLPlugin.no_metadata}",
74 'type' => "flag" },
75 { 'name' => "metadata_fields",
76 'desc' => "{HTMLPlugin.metadata_fields}",
77 'type' => "string",
78 'deft' => "Title" },
79 { 'name' => "metadata_field_separator",
80 'desc' => "{HTMLPlugin.metadata_field_separator}",
81 'type' => "string",
82 'deft' => "" },
83 { 'name' => "hunt_creator_metadata",
84 'desc' => "{HTMLPlugin.hunt_creator_metadata}",
85 'type' => "flag" },
86 { 'name' => "file_is_url",
87 'desc' => "{HTMLPlugin.file_is_url}",
88 'type' => "flag" },
89 { 'name' => "assoc_files",
90 'desc' => "{HTMLPlugin.assoc_files}",
91 'type' => "regexp",
92 'deft' => &get_default_block_exp() },
93 { 'name' => "rename_assoc_files",
94 'desc' => "{HTMLPlugin.rename_assoc_files}",
95 'type' => "flag" },
96 { 'name' => "title_sub",
97 'desc' => "{HTMLPlugin.title_sub}",
98 'type' => "string",
99 'deft' => "" },
100 { 'name' => "description_tags",
101 'desc' => "{HTMLPlugin.description_tags}",
102 'type' => "flag" },
103 # retain this for backward compatibility (w3mir option was replaced by
104 # file_is_url)
105 { 'name' => "w3mir",
106# 'desc' => "{HTMLPlugin.w3mir}",
107 'type' => "flag",
108 'hiddengli' => "yes"},
109 { 'name' => "no_strip_metadata_html",
110 'desc' => "{HTMLPlugin.no_strip_metadata_html}",
111 'type' => "string",
112 'deft' => "",
113 'reqd' => "no"},
114 { 'name' => "sectionalise_using_h_tags",
115 'desc' => "{HTMLPlugin.sectionalise_using_h_tags}",
116 'type' => "flag" },
117 { 'name' => "use_realistic_book",
118 'desc' => "{HTMLPlugin.tidy_html}",
119 'type' => "flag"},
120 { 'name' => "old_style_HDL",
121 'desc' => "{HTMLPlugin.old_style_HDL}",
122 'type' => "flag"},
123 {'name' => "processing_tmp_files",
124 'desc' => "{BasePlugin.processing_tmp_files}",
125 'type' => "flag",
126 'hiddengli' => "yes"}
127 ];
128
129my $options = { 'name' => "HTMLPlugin",
130 'desc' => "{HTMLPlugin.desc}",
131 'abstract' => "no",
132 'inherits' => "yes",
133 'args' => $arguments };
134
135
136sub new {
137 my ($class) = shift (@_);
138 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
139 push(@$pluginlist, $class);
140
141 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
142 push(@{$hashArgOptLists->{"OptList"}},$options);
143
144
145 my $self = new ReadTextFile($pluginlist,$inputargs,$hashArgOptLists);
146
147 if ($self->{'w3mir'}) {
148 $self->{'file_is_url'} = 1;
149 }
150 $self->{'aux_files'} = {};
151 $self->{'dir_num'} = 0;
152 $self->{'file_num'} = 0;
153
154 return bless $self, $class;
155}
156
157# may want to use (?i)\.(gif|jpe?g|jpe|png|css|js(?:@.*)?)$
158# if have eg <script language="javascript" src="img/lib.js@123">
159# blocking is now done by reading through the file and recording all the
160# images and other files
161sub get_default_block_exp {
162 my $self = shift (@_);
163
164 #return q^(?i)\.(gif|jpe?g|jpe|jpg|png|css)$^;
165 return "";
166}
167
168sub get_default_process_exp {
169 my $self = shift (@_);
170
171 # the last option is an attempt to encode the concept of an html query ...
172 return q^(?i)(\.html?|\.shtml|\.shm|\.asp|\.php\d?|\.cgi|.+\?.+=.*)$^;
173}
174
175sub store_block_files
176{
177 my $self =shift (@_);
178 my ($filename_full_path, $block_hash) = @_;
179
180 my $html_fname = $filename_full_path;
181 my @file_blocks;
182
183 my ($language, $content_encoding) = $self->textcat_get_language_encoding ($filename_full_path);
184 $self->{'store_content_encoding'}->{$filename_full_path} = $content_encoding;
185
186 # read in file ($text will be in utf8)
187 my $raw_text = "";
188 $self->read_file_no_decoding ($filename_full_path, \$raw_text);
189
190 my $textref = \$raw_text;
191 my $opencom = '(?:<!--|&lt;!(?:&mdash;|&#151;|--))';
192 my $closecom = '(?:-->|(?:&mdash;|&#151;|--)&gt;)';
193 $$textref =~ s/$opencom(.*?)$closecom//gs;
194
195 my $attval = "\\\"[^\\\"]+\\\"|[^\\s>]+";
196 my @img_matches = ($$textref =~ m/<img[^>]*?src\s*=\s*($attval)[^>]*>/igs);
197 my @usemap_matches = ($$textref =~ m/<img[^>]*?usemap\s*=\s*($attval)[^>]*>/igs);
198 my @link_matches = ($$textref =~ m/<link[^>]*?href\s*=\s*($attval)[^>]*>/igs);
199 my @embed_matches = ($$textref =~ m/<embed[^>]*?src\s*=\s*($attval)[^>]*>/igs);
200 my @tabbg_matches = ($$textref =~ m/<(?:body|table|tr|td)[^>]*?background\s*=\s*($attval)[^>]*>/igs);
201 my @script_matches = ($$textref =~ m/<script[^>]*?src\s*=\s*($attval)[^>]*>/igs);
202
203 if(!defined $self->{'utf8_to_original_filename'}) {
204 # maps from utf8 converted link name -> original filename referrred to by (possibly URL-encoded) src url
205 $self->{'utf8_to_original_filename'} = {};
206 }
207
208 foreach my $link (@img_matches, @usemap_matches, @link_matches, @embed_matches, @tabbg_matches, @script_matches) {
209
210 # remove quotes from link at start and end if necessary
211 if ($link=~/^\"/) {
212 $link=~s/^\"//;
213 $link=~s/\"$//;
214 }
215
216 $link =~ s/\#.*$//s; # remove any anchor names, e.g. foo.html#name becomes foo.html
217 # some links may just be anchor names
218 next unless ($link =~ /\S+/);
219
220 if ($link !~ m@^/@ && $link !~ m/^([A-Z]:?)\\/) {
221 # Turn relative file path into full path
222 my $dirname = &File::Basename::dirname($filename_full_path);
223 $link = &util::filename_cat($dirname, $link);
224 }
225 $link = $self->eval_dir_dots($link);
226
227 # this is the actual filename on the filesystem (that the link refers to)
228 my $url_original_filename = $self->opt_url_decode($link);
229
230 # Convert the url_original_filename into its utf8 version. Store the utf8 link along with the url_original_filename
231 my $utf8_link = "";
232 $self->decode_text($link,$content_encoding,$language,\$utf8_link);
233
234 $self->{'utf8_to_original_filename'}->{$utf8_link} = $url_original_filename;
235# print STDERR "**** utf8_encoded_link to original src filename:\n\t$utf8_link\n\t".$self->{'utf8_to_original_filename'}->{$utf8_link}."\n";
236
237 if ($url_original_filename ne $utf8_link) {
238 my $outhandle = $self->{'outhandle'};
239
240 print $outhandle "URL Encoding $url_original_filename\n";
241 print $outhandle " ->$utf8_link\n";
242 }
243
244 $block_hash->{'file_blocks'}->{$url_original_filename} = 1;
245 }
246}
247
248# Given a filename in any encoding, will URL decode it to get back the original filename
249# in the original encoding. Because this method is intended to work out the *original*
250# filename*, it does not URL decode any filename if a file by the name of the *URL-encoded*
251# string already exists in the local folder.
252# Return the original filename corresponding to the parameter URL-encoded filename, and
253# a decoded flag that is set to true iff URL-decoding had to be applied.
254sub opt_url_decode {
255 my $self = shift (@_);
256 my ($link) = @_;
257
258 # Replace %XX's in URL with decoded value if required.
259 # Note that the filename may include the %XX in some situations
260 if ($link =~ m/\%[A-F0-9]{2}/i) {
261 if (!-e $link) {
262 $link = &unicode::url_decode($link);
263 }
264 }
265
266 return $link;
267}
268
269sub read_into_doc_obj
270{
271 my $self = shift (@_);
272 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
273
274 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
275
276 # Lookup content_encoding worked out in file_block pass for this file
277 # Store it under the local name 'content_encoding' so its nice and
278 # easy to access
279 $self->{'content_encoding'} = $self->{'store_content_encoding'}->{$filename_full_path};
280
281 # get the input file
282 my $input_filename = $file;
283 my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
284 $suffix = lc($suffix);
285 my $tidy_filename;
286 if (($self->{'use_realistic_book'}) || ($self->{'old_style_HDL'}))
287 {
288 # because the document has to be sectionalized set the description tags
289 $self->{'description_tags'} = 1;
290
291 # set the file to be tidied
292 $input_filename = &util::filename_cat($base_dir,$file) if $base_dir =~ m/\w/;
293
294 # get the tidied file
295 #my $tidy_filename = $self->tmp_tidy_file($input_filename);
296 $tidy_filename = $self->convert_tidy_or_oldHDL_file($input_filename);
297
298 # derive tmp filename from input filename
299 my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($tidy_filename, "\\.[^\\.]+\$");
300
301 # set the new input file and base_dir to be from the tidied file
302 $file = "$tailname$suffix";
303 $base_dir = $dirname;
304 }
305
306 # call the parent read_into_doc_obj
307 my ($process_status,$doc_obj) = $self->SUPER::read_into_doc_obj($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli);
308 if (($self->{'use_realistic_book'}) || ($self->{'old_style_HDL'}))
309 {
310 # now we need to reset the filenames in the doc obj so that the converted filenames are not used
311 my $collect_file = &util::filename_within_collection($filename_full_path);
312 $doc_obj->set_source_filename ($collect_file, $self->{'file_rename_method'});
313 ## set_source_filename does not set the doc_obj source_path which is used in archives dbs for incremental
314 # build. so set it manually.
315 $doc_obj->{'source_path'} = $filename_full_path;
316 my $collect_conv_file = &util::filename_within_collection($tidy_filename);
317 $doc_obj->set_converted_filename($collect_conv_file);
318 $self->set_Source_metadata($doc_obj, $filename_no_path);
319 }
320
321 delete $self->{'store_content_encoding'}->{$filename_full_path};
322 $self->{'content_encoding'} = undef;
323
324 return ($process_status,$doc_obj);
325}
326
327# do plugin specific processing of doc_obj
328sub process {
329 my $self = shift (@_);
330 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
331 my $outhandle = $self->{'outhandle'};
332
333 if ($ENV{'GSDLOS'} =~ m/^windows/i) {
334 # this makes life so much easier... perl can cope with unix-style '/'s.
335 $base_dir =~ s@(\\)+@/@g;
336 $file =~ s@(\\)+@/@g;
337 }
338
339 # reset per-doc stuff...
340 $self->{'aux_files'} = {};
341 $self->{'dir_num'} = 0;
342 $self->{'file_num'} = 0;
343
344 # process an HTML file where sections are divided by headings tags (H1, H2 ...)
345 # you can also include metadata in the format (X can be any number)
346 # <hX>Title<!--gsdl-metadata
347 # <Metadata name="name1">value1</Metadata>
348 # ...
349 # <Metadata name="nameN">valueN</Metadata>
350 #--></hX>
351 if ($self->{'sectionalise_using_h_tags'}) {
352 # description_tags should allways be activated because we convert headings to description tags
353 $self->{'description_tags'} = 1;
354
355 my $arrSections = [];
356 $$textref =~ s/<h([0-9]+)[^>]*>(.*?)<\/h[0-9]+>/$self->process_heading($1, $2, $arrSections, $file)/isge;
357
358 if (scalar(@$arrSections)) {
359 my $strMetadata = $self->update_section_data($arrSections, -1);
360 if (length($strMetadata)) {
361 $strMetadata = '<!--' . $strMetadata . "\n-->\n</body>";
362 $$textref =~ s/<\/body>/$strMetadata/ig;
363 }
364 }
365 }
366
367 my $cursection = $doc_obj->get_top_section();
368
369 $self->extract_metadata ($textref, $metadata, $doc_obj, $cursection)
370 unless $self->{'no_metadata'} || $self->{'description_tags'};
371
372 # Store URL for page as metadata - this can be used for an
373 # altavista style search interface. The URL won't be valid
374 # unless the file structure contains the domain name (i.e.
375 # like when w3mir is used to download a website).
376
377 # URL metadata (even invalid ones) are used to support internal
378 # links, so even if 'file_is_url' is off, still need to store info
379
380 my ($tailname,$dirname) = &File::Basename::fileparse($file);
381 print STDERR "***!! file = $file\n";
382# my $utf8_file = $self->filename_to_utf8_metadata($file);
383# $utf8_file =~ s/&\#095;/_/g;
384 my $utf8_file = &unicode::raw_filename_to_url_encoded($tailname);
385 print STDERR "***!! utf8_file = $utf8_file\n";
386
387 my $web_url = "http://";
388 if(defined $dirname) { # local directory
389 # Check for "ftp" in the domain name of the directory
390 # structure to determine if this URL should be a ftp:// URL
391 # This check is not infallible, but better than omitting the
392 # check, which would cause all files downloaded from ftp sites
393 # via mirroring with wget to have potentially erroneous http:// URLs
394 # assigned in their metadata
395 if ($dirname =~ /^[^\/]*ftp/i)
396 {
397 $web_url = "ftp://";
398 }
399 $dirname = $self->eval_dir_dots($dirname);
400 $dirname .= &util::get_dirsep() if $dirname ne ""; # if there's a directory, it should end on "/"
401 $web_url = $web_url.$dirname.$utf8_file;
402 } else {
403 $web_url = $web_url.$utf8_file;
404 }
405 $web_url =~ s/\\/\//g;
406 $doc_obj->add_utf8_metadata($cursection, "URL", $web_url);
407
408 if ($self->{'file_is_url'}) {
409 $doc_obj->add_metadata($cursection, "weblink", "<a href=\"$web_url\">");
410 $doc_obj->add_metadata($cursection, "webicon", "_iconworld_");
411 $doc_obj->add_metadata($cursection, "/weblink", "</a>");
412 }
413
414 if ($self->{'description_tags'}) {
415 # remove the html header - note that doing this here means any
416 # sections defined within the header will be lost (so all <Section>
417 # tags must appear within the body of the HTML)
418 my ($head_keep) = ($$textref =~ m/^(.*?)<body[^>]*>/is);
419
420 $$textref =~ s/^.*?<body[^>]*>//is;
421 $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
422
423 my $opencom = '(?:<!--|&lt;!(?:&mdash;|&#151;|--))';
424 my $closecom = '(?:-->|(?:&mdash;|&#151;|--)&gt;)';
425
426 my $lt = '(?:<|&lt;)';
427 my $gt = '(?:>|&gt;)';
428 my $quot = '(?:"|&quot;|&rdquo;|&ldquo;)';
429
430 my $dont_strip = '';
431 if ($self->{'no_strip_metadata_html'}) {
432 ($dont_strip = $self->{'no_strip_metadata_html'}) =~ s{,}{|}g;
433 }
434
435 my $found_something = 0; my $top = 1;
436 while ($$textref =~ s/^(.*?)$opencom(.*?)$closecom//s) {
437 my $text = $1;
438 my $comment = $2;
439 if (defined $text) {
440 # text before a comment - note that getting to here
441 # doesn't necessarily mean there are Section tags in
442 # the document
443 $self->process_section(\$text, $base_dir, $file, $doc_obj, $cursection);
444 }
445 while ($comment =~ s/$lt(.*?)$gt//s) {
446 my $tag = $1;
447 if ($tag eq "Section") {
448 $found_something = 1;
449 $cursection = $doc_obj->insert_section($doc_obj->get_end_child($cursection)) unless $top;
450 $top = 0;
451 } elsif ($tag eq "/Section") {
452 $found_something = 1;
453 $cursection = $doc_obj->get_parent_section ($cursection);
454 } elsif ($tag =~ m/^Metadata name=$quot(.*?)$quot/s) {
455 my $metaname = $1;
456 my $accumulate = $tag =~ m/mode=${quot}accumulate${quot}/ ? 1 : 0;
457 $comment =~ s/^(.*?)$lt\/Metadata$gt//s;
458 my $metavalue = $1;
459 $metavalue =~ s/^\s+//;
460 $metavalue =~ s/\s+$//;
461 # assume that no metadata value intentionally includes
462 # carriage returns or HTML tags (if they're there they
463 # were probably introduced when converting to HTML from
464 # some other format).
465 # actually some people want to have html tags in their
466 # metadata.
467 $metavalue =~ s/[\cJ\cM]/ /sg;
468 $metavalue =~ s/<[^>]+>//sg
469 unless $dont_strip && ($dont_strip eq 'all' || $metaname =~ m/^($dont_strip)$/);
470 $metavalue =~ s/\s+/ /sg;
471 if ($metaname =~ /\./) { # has a namespace
472 $metaname = "ex.$metaname";
473 }
474 if ($accumulate) {
475 $doc_obj->add_utf8_metadata($cursection, $metaname, $metavalue);
476 } else {
477 $doc_obj->set_utf8_metadata_element($cursection, $metaname, $metavalue);
478 }
479 } elsif ($tag eq "Description" || $tag eq "/Description") {
480 # do nothing with containing Description tags
481 } else {
482 # simple HTML tag (probably created by the conversion
483 # to HTML from some other format) - we'll ignore it and
484 # hope for the best ;-)
485 }
486 }
487 }
488 if ($cursection ne "") {
489 print $outhandle "HTMLPlugin: WARNING: $file contains unmatched <Section></Section> tags\n";
490 }
491
492 $$textref =~ s/^.*?<body[^>]*>//is;
493 $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
494 if ($$textref =~ m/\S/) {
495 if (!$found_something) {
496 if ($self->{'verbosity'} > 2) {
497 print $outhandle "HTMLPlugin: WARNING: $file appears to contain no Section tags so\n";
498 print $outhandle " will be processed as a single section document\n";
499 }
500
501 # go ahead and process single-section document
502 $self->process_section($textref, $base_dir, $file, $doc_obj, $cursection);
503
504 # if document contains no Section tags we'll go ahead
505 # and extract metadata (this won't have been done
506 # above as the -description_tags option prevents it)
507 my $complete_text = $head_keep.$doc_obj->get_text($cursection);
508 $self->extract_metadata (\$complete_text, $metadata, $doc_obj, $cursection)
509 unless $self->{'no_metadata'};
510
511 } else {
512 print $outhandle "HTMLPlugin: WARNING: $file contains the following text outside\n";
513 print $outhandle " of the final closing </Section> tag. This text will\n";
514 print $outhandle " be ignored.";
515
516 my ($text);
517 if (length($$textref) > 30) {
518 $text = substr($$textref, 0, 30) . "...";
519 } else {
520 $text = $$textref;
521 }
522 $text =~ s/\n/ /isg;
523 print $outhandle " ($text)\n";
524 }
525 } elsif (!$found_something) {
526
527 if ($self->{'verbosity'} > 2) {
528 # may get to here if document contained no valid Section
529 # tags but did contain some comments. The text will have
530 # been processed already but we should print the warning
531 # as above and extract metadata
532 print $outhandle "HTMLPlugin: WARNING: $file appears to contain no Section tags and\n";
533 print $outhandle " is blank or empty. Metadata will be assigned if present.\n";
534 }
535
536 my $complete_text = $head_keep.$doc_obj->get_text($cursection);
537 $self->extract_metadata (\$complete_text, $metadata, $doc_obj, $cursection)
538 unless $self->{'no_metadata'};
539 }
540
541 } else {
542
543 # remove header and footer
544 if (!$self->{'keep_head'} || $self->{'description_tags'}) {
545 $$textref =~ s/^.*?<body[^>]*>//is;
546 $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
547 }
548
549 # single section document
550 $self->process_section($textref, $base_dir, $file, $doc_obj, $cursection);
551 }
552
553 return 1;
554}
555
556
557sub process_heading
558{
559 my ($self, $nHeadNo, $strHeadingText, $arrSections, $file) = @_;
560 $strHeadingText = '' if (!defined($strHeadingText));
561
562 my $strMetadata = $self->update_section_data($arrSections, int($nHeadNo));
563
564 my $strSecMetadata = '';
565 while ($strHeadingText =~ s/<!--gsdl-metadata(.*?)-->//is)
566 {
567 $strSecMetadata .= $1;
568 }
569
570 $strHeadingText =~ s/^\s+//g;
571 $strHeadingText =~ s/\s+$//g;
572 $strSecMetadata =~ s/^\s+//g;
573 $strSecMetadata =~ s/\s+$//g;
574
575 $strMetadata .= "\n<Section>\n\t<Description>\n\t\t<Metadata name=\"Title\">" . $strHeadingText . "</Metadata>\n";
576
577 if (length($strSecMetadata)) {
578 $strMetadata .= "\t\t" . $strSecMetadata . "\n";
579 }
580
581 $strMetadata .= "\t</Description>\n";
582
583 return "<!--" . $strMetadata . "-->";
584}
585
586
587sub update_section_data
588{
589 my ($self, $arrSections, $nCurTocNo) = @_;
590 my ($strBuffer, $nLast, $nSections) = ('', 0, scalar(@$arrSections));
591
592 if ($nSections == 0) {
593 push @$arrSections, $nCurTocNo;
594 return $strBuffer;
595 }
596 $nLast = $arrSections->[$nSections - 1];
597 if ($nCurTocNo > $nLast) {
598 push @$arrSections, $nCurTocNo;
599 return $strBuffer;
600 }
601 for(my $i = $nSections - 1; $i >= 0; $i--) {
602 if ($nCurTocNo <= $arrSections->[$i]) {
603 $strBuffer .= "\n</Section>";
604 pop @$arrSections;
605 }
606 }
607 push @$arrSections, $nCurTocNo;
608 return $strBuffer;
609}
610
611
612# note that process_section may be called multiple times for a single
613# section (relying on the fact that add_utf8_text appends the text to any
614# that may exist already).
615sub process_section {
616 my $self = shift (@_);
617 my ($textref, $base_dir, $file, $doc_obj, $cursection) = @_;
618 # trap links
619 if (!$self->{'nolinks'}) {
620 # usemap="./#index" not handled correctly => change to "#index"
621## $$textref =~ s/(<img[^>]*?usemap\s*=\s*[\"\']?)([^\"\'>\s]+)([\"\']?[^>]*>)/
622
623 $$textref =~ s/(<img[^>]*?usemap\s*=\s*)((?:[\"][^\"]+[\"])|(?:[\'][^\']+[\'])|(?:[^\s\/>]+))([^>]*>)/
624 $self->replace_usemap_links($1, $2, $3)/isge;
625
626## $$textref =~ s/(<(?:a|area|frame|link|script)\s+[^>]*?\s*(?:href|src)\s*=\s*[\"\']?)([^\"\'>\s]+)([\"\']?[^>]*>)/
627
628 $$textref =~ s/(<(?:a|area|frame|link|script)\s+[^>]*?\s*(?:href|src)\s*=\s*)((?:[\"][^\"]+[\"])|(?:[\'][^\']+[\'])|(?:[^\s\/>]+))([^>]*>)/
629 $self->replace_href_links ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
630 }
631
632 # trap images
633
634 # Previously, by default, HTMLPlugin would embed <img> tags inside anchor tags
635 # i.e. <a href="image><img src="image"></a> in order to overcome a problem that
636 # turned regular text succeeding images into links. That is, by embedding <imgs>
637 # inside <a href=""></a>, the text following images were no longer misbehaving.
638 # However, there would be many occasions whereby images were not meant to link
639 # to their source images but where the images would link to another web page.
640 # To allow this, the no_image_links option was introduced: it would prevent
641 # the behaviour of embedding images into links that referenced the source images.
642
643 # Somewhere along the line, the problem of normal text turning into links when
644 # such text followed images which were not embedded in <a href=""></a> ceased
645 # to occur. This is why the following lines have been commented out (as well as
646 # two lines in replace_images). They appear to no longer apply.
647
648 # If at any time, there is a need for having images embedded in <a> anchor tags,
649 # then it might be better to turn that into an HTMLPlugin option rather than make
650 # it the default behaviour. Also, eventually, no_image_links needs to become
651 # a deprecated option for HTMLPlugin as it has now become the default behaviour.
652
653 #if(!$self->{'no_image_links'}){
654 $$textref =~ s/(<(?:img|embed|table|tr|td)[^>]*?(?:src|background)\s*=\s*)((?:[\"][^\"]+[\"])|(?:[\'][^\']+[\'])|(?:[^\s\/>]+))([^>]*>)/
655 $self->replace_images ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
656 #}
657
658 # add text to document object
659 # turn \ into \\ so that the rest of greenstone doesn't think there
660 # is an escape code following. (Macro parsing loses them...)
661 $$textref =~ s/\\/\\\\/go;
662
663 $doc_obj->add_utf8_text($cursection, $$textref);
664}
665
666sub replace_images {
667 my $self = shift (@_);
668 my ($front, $link, $back, $base_dir,
669 $file, $doc_obj, $section) = @_;
670
671 # remove quotes from link at start and end if necessary
672 if ($link=~/^[\"\']/) {
673 $link=~s/^[\"\']//;
674 $link=~s/[\"\']$//;
675 $front.='"';
676 $back="\"$back";
677 }
678
679 $link =~ s/\n/ /g;
680
681 # Hack to overcome Windows wv 0.7.1 bug that causes embedded images to be broken
682 # If the Word file path has spaces in it, wv messes up and you end up with
683 # absolute paths for the images, and without the "file://" prefix
684 # So check for this special case and massage the data to be correct
685 if ($ENV{'GSDLOS'} =~ m/^windows/i && $self->{'plugin_type'} eq "WordPlug" && $link =~ m/^[A-Za-z]\:\\/) {
686 $link =~ s/^.*\\([^\\]+)$/$1/;
687 }
688
689 my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
690
691 my $img_file = $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section);
692
693# print STDERR "**** link = $link\n**** href = $href\n**** img_file = $img_file, rl = $rl\n";
694
695 my $anchor_name = $img_file;
696 #$anchor_name =~ s/^.*\///;
697 #$anchor_name = "<a name=\"$anchor_name\" ></a>";
698
699 my $image_link = $front . $img_file .$back;
700 return $image_link;
701
702 # The reasons for why the following two lines are no longer necessary can be
703 # found in subroutine process_section
704 #my $anchor_link = "<a href=\"$img_file\" >".$image_link."</a>";
705 #return $anchor_link;
706
707 #return $front . $img_file . $back . $anchor_name;
708}
709
710sub replace_href_links {
711 my $self = shift (@_);
712 my ($front, $link, $back, $base_dir, $file, $doc_obj, $section) = @_;
713
714 # remove quotes from link at start and end if necessary
715 if ($link=~/^[\"\']/) {
716 $link=~s/^[\"\']//;
717 $link=~s/[\"\']$//;
718 $front.='"';
719 $back="\"$back";
720 }
721
722 # attempt to sort out targets - frames are not handled
723 # well in this plugin and some cases will screw things
724 # up - e.g. the _parent target (so we'll just remove
725 # them all ;-)
726 $front =~ s/(target=\"?)_top(\"?)/$1_gsdltop_$2/is;
727 $back =~ s/(target=\"?)_top(\"?)/$1_gsdltop_$2/is;
728 $front =~ s/target=\"?_parent\"?//is;
729 $back =~ s/target=\"?_parent\"?//is;
730
731 return $front . $link . $back if $link =~ m/^\#/s;
732 $link =~ s/\n/ /g;
733
734 # Find file referred to by $link on file system
735 # This is more complicated than it sounds when char encodings
736 # is taken in to account
737 my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
738
739 # href may use '\'s where '/'s should be on Windows
740 $href =~ s/\\/\//g;
741 my ($filename) = $href =~ m/^(?:.*?):(?:\/\/)?(.*)/;
742
743
744 ##### leave all these links alone (they won't be picked up by intermediate
745 ##### pages). I think that's safest when dealing with frames, targets etc.
746 ##### (at least until I think of a better way to do it). Problems occur with
747 ##### mailto links from within small frames, the intermediate page is displayed
748 ##### within that frame and can't be seen. There is still potential for this to
749 ##### happen even with html pages - the solution seems to be to somehow tell
750 ##### the browser from the server side to display the page being sent (i.e.
751 ##### the intermediate page) in the top level window - I'm not sure if that's
752 ##### possible - the following line should probably be deleted if that can be done
753 return $front . $link . $back if $href =~ m/^(mailto|news|gopher|nntp|telnet|javascript):/is;
754
755 if (($rl == 0) || ($filename =~ m/$self->{'process_exp'}/) ||
756 ($href =~ m/\/$/) || ($href =~ m/^(mailto|news|gopher|nntp|telnet|javascript):/i)) {
757
758
759 # If web page didn't give encoding, then default to utf8
760 print "*************** looking up $file\n";
761
762 my $content_encoding= $self->{'content_encoding'} || "utf8";
763 $href = encode($content_encoding,$href);
764
765 $href = &unicode::raw_filename_to_url_encoded($href);
766 $href = &unicode::filename_to_url($href);
767
768 &ghtml::urlsafe ($href);
769 print STDERR "***!!! href=$href\n";
770
771 return $front . "_httpextlink_&amp;rl=" . $rl . "&amp;href=" . $href . $hash_part . $back;
772 } else {
773 # link is to some other type of file (e.g., an image) so we'll
774 # need to associate that file
775 return $front . $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section) . $back;
776 }
777}
778
779sub add_file {
780 my $self = shift (@_);
781 my ($href, $rl, $hash_part, $base_dir, $doc_obj, $section) = @_;
782 my ($newname);
783
784 my $filename = $href;
785 if ($base_dir eq "") {
786 # remove http:/ thereby leaving one slash at the start
787 $filename =~ s/^[^:]*:\///;
788 }
789 else {
790 # remove http://
791 $filename =~ s/^[^:]*:\/\///;
792 }
793
794 $filename = &util::filename_cat($base_dir, $filename);
795 if (($self->{'use_realistic_book'}) || ($self->{'old_style_HDL'})) {
796 # we are processing a tidytmp file - want paths to be in import
797 $filename =~ s/([\\\/])tidytmp([\\\/])/$1import$2/;
798 }
799
800 # Replace %XX's in URL with decoded value if required. Note that the
801 # filename may include the %XX in some situations. If the *original*
802 # file's name was in URL encoding, the following method will not decode
803 # it.
804 my $utf8_filename = $filename;
805 my $opt_decode_utf8_filename = $self->opt_url_decode($utf8_filename);
806
807 my $content_encoding= $self->{'content_encoding'} || "utf8";
808
809 # The filenames that come through the HTML file have been decoded
810 # into Unicode aware Perl strings. Need to convert them back
811 # to their initial raw-byte encoding to match the file that
812 # exists on the file system
813 $filename = encode($content_encoding, $opt_decode_utf8_filename);
814
815
816 # some special processing if the intended filename was converted to utf8, but
817 # the actual file still needs to be renamed
818 if (!-e $filename) {
819 # try the original filename stored in map
820 print STDERR "***###!! orig filename did not exist: $filename\n";
821
822 my $original_filename = $self->{'utf8_to_original_filename'}->{$utf8_filename};
823
824 print STDERR "**** Trying for $original_filename\n";
825
826 if (defined $original_filename && -e $original_filename) {
827 print STDERR "*** found match\n";
828 $filename = $original_filename;
829 }
830 }
831
832 my ($ext) = $filename =~ m/(\.[^\.]*)$/;
833
834 if ($rl == 0) {
835 if ((!defined $ext) || ($ext !~ m/$self->{'assoc_files'}/)) {
836 return "_httpextlink_&amp;rl=0&amp;el=prompt&amp;href=" . $href . $hash_part;
837 }
838 else {
839 return "_httpextlink_&amp;rl=0&amp;el=direct&amp;href=" . $href . $hash_part;
840 }
841 }
842
843 if ((!defined $ext) || ($ext !~ m/$self->{'assoc_files'}/)) {
844 return "_httpextlink_&amp;rl=" . $rl . "&amp;href=" . $href . $hash_part;
845 }
846 # add the original image file as a source file
847 if (!$self->{'processing_tmp_files'} ) {
848 $doc_obj->associate_source_file($filename);
849 }
850 if ($self->{'rename_assoc_files'}) {
851 if (defined $self->{'aux_files'}->{$href}) {
852 $newname = $self->{'aux_files'}->{$href}->{'dir_num'} . "/" .
853 $self->{'aux_files'}->{$href}->{'file_num'} . $ext;
854 } else {
855 $newname = $self->{'dir_num'} . "/" . $self->{'file_num'} . $ext;
856 $self->{'aux_files'}->{$href} = {'dir_num' => $self->{'dir_num'}, 'file_num' => $self->{'file_num'}};
857 $self->inc_filecount ();
858 }
859 $doc_obj->associate_file($filename, $newname, undef, $section);
860 return "_httpdocimg_/$newname";
861 } else {
862 if(&unicode::is_url_encoded($utf8_filename)) {
863 # use the possibly-decoded filename instead to avoid double URL encoding
864 ($newname) = $filename =~ m/([^\/\\]*)$/;
865 } else {
866 ($newname) = $utf8_filename =~ m/([^\/\\]*)$/;
867 }
868
869 # Make sure this name uses only ASCII characters.
870 # We use either base64 or URL encoding, as these preserve original encoding
871 $newname = &util::rename_file($newname, $self->{'file_rename_method'});
872
873 $doc_obj->associate_file($filename, $newname, undef, $section);
874
875 # Since the generated image will be URL-encoded to avoid file-system/browser mess-ups
876 # of filenames, URL-encode the additional percent signs of the URL-encoded filename
877 my $newname_url = $newname;
878 $newname_url = &unicode::filename_to_url($newname_url);
879 return "_httpdocimg_/$newname_url";
880 }
881}
882
883
884sub format_link {
885 my $self = shift (@_);
886 my ($link, $base_dir, $file) = @_;
887
888 my ($before_hash, $hash_part) = $link =~ m/^([^\#]*)(\#?.*)$/;
889
890 $hash_part = "" if !defined $hash_part;
891 if (!defined $before_hash || $before_hash !~ m/[\w\.\/]/) {
892 my $outhandle = $self->{'outhandle'};
893 print $outhandle "HTMLPlugin: ERROR - badly formatted tag ignored ($link)\n"
894 if $self->{'verbosity'};
895 return ($link, "", 0);
896 }
897
898 if ($before_hash =~ s@^((?:http|https|ftp|file|mms)://)@@i) {
899 my $type = $1;
900
901 if ($link =~ m/^(http|ftp):/i) {
902 # Turn url (using /) into file name (possibly using \ on windows)
903 my @http_dir_split = split('/', $before_hash);
904 $before_hash = &util::filename_cat(@http_dir_split);
905 }
906
907 $before_hash = $self->eval_dir_dots($before_hash);
908
909 my $linkfilename = &util::filename_cat ($base_dir, $before_hash);
910
911 my $rl = 0;
912 $rl = 1 if (-e $linkfilename);
913
914 # make sure there's a slash on the end if it's a directory
915 if ($before_hash !~ m/\/$/) {
916 $before_hash .= "/" if (-d $linkfilename);
917 }
918 return ($type . $before_hash, $hash_part, $rl);
919
920 } elsif ($link !~ m/^(mailto|news|gopher|nntp|telnet|javascript):/i && $link !~ m/^\//) {
921
922 if ($before_hash =~ s@^/@@ || $before_hash =~ m/\\/) {
923
924 # the first directory will be the domain name if file_is_url
925 # to generate archives, otherwise we'll assume all files are
926 # from the same site and base_dir is the root
927
928 if ($self->{'file_is_url'}) {
929 my @dirs = split /[\/\\]/, $file;
930 my $domname = shift (@dirs);
931 $before_hash = &util::filename_cat($domname, $before_hash);
932 $before_hash =~ s@\\@/@g; # for windows
933 }
934 else
935 {
936 # see if link shares directory with source document
937 # => turn into relative link if this is so!
938
939 if ($ENV{'GSDLOS'} =~ m/^windows/i) {
940 # too difficult doing a pattern match with embedded '\'s...
941 my $win_before_hash=$before_hash;
942 $win_before_hash =~ s@(\\)+@/@g;
943 # $base_dir is already similarly "converted" on windows.
944 if ($win_before_hash =~ s@^$base_dir/@@o) {
945 # if this is true, we removed a prefix
946 $before_hash=$win_before_hash;
947 }
948 }
949 else {
950 # before_hash has lost leading slash by this point,
951 # -> add back in prior to substitution with $base_dir
952 $before_hash = "/$before_hash";
953
954 $before_hash = &util::filename_cat("",$before_hash);
955 $before_hash =~ s@^$base_dir/@@;
956 }
957 }
958 } else {
959 # Turn relative file path into full path
960 my $dirname = &File::Basename::dirname($file);
961 $before_hash = &util::filename_cat($dirname, $before_hash);
962 $before_hash = $self->eval_dir_dots($before_hash);
963 }
964
965 my $linkfilename = &util::filename_cat ($base_dir, $before_hash);
966 # make sure there's a slash on the end if it's a directory
967 if ($before_hash !~ m/\/$/) {
968 $before_hash .= "/" if (-d $linkfilename);
969 }
970 return ("http://" . $before_hash, $hash_part, 1);
971 } else {
972 # mailto, news, nntp, telnet, javascript or gopher link
973 return ($before_hash, "", 0);
974 }
975}
976
977sub extract_first_NNNN_characters {
978 my $self = shift (@_);
979 my ($textref, $doc_obj, $thissection) = @_;
980
981 foreach my $size (split /,/, $self->{'first'}) {
982 my $tmptext = $$textref;
983 # skip to the body
984 $tmptext =~ s/.*<body[^>]*>//i;
985 # remove javascript
986 $tmptext =~ s@<script.*?</script>@ @sig;
987 $tmptext =~ s/<[^>]*>/ /g;
988 $tmptext =~ s/&nbsp;/ /g;
989 $tmptext =~ s/^\s+//;
990 $tmptext =~ s/\s+$//;
991 $tmptext =~ s/\s+/ /gs;
992 $tmptext = &unicode::substr ($tmptext, 0, $size);
993 $tmptext =~ s/\s\S*$/&#8230;/; # adds an ellipse (...)
994 $doc_obj->add_utf8_metadata ($thissection, "First$size", $tmptext);
995 }
996}
997
998
999sub extract_metadata {
1000 my $self = shift (@_);
1001 my ($textref, $metadata, $doc_obj, $section) = @_;
1002 my $outhandle = $self->{'outhandle'};
1003 # if we don't want metadata, we may as well not be here ...
1004 return if (!defined $self->{'metadata_fields'});
1005
1006 my $separator = $self->{'metadata_field_separator'};
1007 if ($separator eq "") {
1008 undef $separator;
1009 }
1010
1011 # metadata fields to extract/save. 'key' is the (lowercase) name of the
1012 # html meta, 'value' is the metadata name for greenstone to use
1013 my %find_fields = ();
1014
1015 my %creator_fields = (); # short-cut for lookups
1016
1017
1018 foreach my $field (split /,/, $self->{'metadata_fields'}) {
1019 $field =~ s/^\s+//; # remove leading whitespace
1020 $field =~ s/\s+$//; # remove trailing whitespace
1021
1022 # support tag<tagname>
1023 if ($field =~ m/^(.*?)\s*<(.*?)>$/) {
1024 # "$2" is the user's preferred gs metadata name
1025 $find_fields{lc($1)}=$2; # lc = lowercase
1026 } else { # no <tagname> for mapping
1027 # "$field" is the user's preferred gs metadata name
1028 $find_fields{lc($field)}=$field; # lc = lowercase
1029 }
1030 }
1031
1032 if (defined $self->{'hunt_creator_metadata'} &&
1033 $self->{'hunt_creator_metadata'} == 1 ) {
1034 my @extra_fields =
1035 (
1036 'author',
1037 'author.email',
1038 'creator',
1039 'dc.creator',
1040 'dc.creator.corporatename',
1041 );
1042
1043 # add the creator_metadata fields to search for
1044 foreach my $field (@extra_fields) {
1045 $creator_fields{$field}=0; # add to lookup hash
1046 }
1047 }
1048
1049
1050 # find the header in the html file, which has the meta tags
1051 $$textref =~ m@<head>(.*?)</head>@si;
1052
1053 my $html_header=$1;
1054
1055 # go through every <meta... tag defined in the html and see if it is
1056 # one of the tags we want to match.
1057
1058 # special case for title - we want to remember if its been found
1059 my $found_title = 0;
1060 # this assumes that ">" won't appear. (I don't think it's allowed to...)
1061 $html_header =~ m/^/; # match the start of the string, for \G assertion
1062
1063 while ($html_header =~ m/\G.*?<meta(.*?)>/sig) {
1064 my $metatag=$1;
1065 my ($tag, $value);
1066
1067 # find the tag name
1068 $metatag =~ m/(?:name|http-equiv)\s*=\s*([\"\'])?(.*?)\1/is;
1069 $tag=$2;
1070 # in case they're not using " or ', but they should...
1071 if (! $tag) {
1072 $metatag =~ m/(?:name|http-equiv)\s*=\s*([^\s\>]+)/is;
1073 $tag=$1;
1074 }
1075
1076 if (!defined $tag) {
1077 print $outhandle "HTMLPlugin: can't find NAME in \"$metatag\"\n";
1078 next;
1079 }
1080
1081 # don't need to assign this field if it was passed in from a previous
1082 # (recursive) plugin
1083 if (defined $metadata->{$tag}) {next}
1084
1085 # find the tag content
1086 $metatag =~ m/content\s*=\s*([\"\'])?(.*?)\1/is;
1087 $value=$2;
1088
1089 if (! $value) {
1090 $metatag =~ m/(?:name|http-equiv)\s*=\s*([^\s\>]+)/is;
1091 $value=$1;
1092 }
1093 if (!defined $value) {
1094 print $outhandle "HTMLPlugin: can't find VALUE in \"$metatag\"\n";
1095 next;
1096 }
1097
1098 # clean up and add
1099 $value =~ s/\s+/ /gs;
1100 chomp($value); # remove trailing \n, if any
1101 if (exists $creator_fields{lc($tag)}) {
1102 # map this value onto greenstone's "Creator" metadata
1103 $tag='Creator';
1104 } elsif (!exists $find_fields{lc($tag)}) {
1105 next; # don't want this tag
1106 } else {
1107 # get the user's preferred capitalisation
1108 $tag = $find_fields{lc($tag)};
1109 }
1110 if (lc($tag) eq "title") {
1111 $found_title = 1;
1112 }
1113
1114 if ($self->{'verbosity'} > 2) {
1115 print $outhandle " extracted \"$tag\" metadata \"$value\"\n";
1116 }
1117
1118 if ($tag =~ /\./) {
1119 # there is a . so has a namespace, add ex.
1120 $tag = "ex.$tag";
1121 }
1122 if (defined $separator) {
1123 my @values = split($separator, $value);
1124 foreach my $v (@values) {
1125 $doc_obj->add_utf8_metadata($section, $tag, $v) if $v =~ /\S/;
1126 }
1127 }
1128 else {
1129 $doc_obj->add_utf8_metadata($section, $tag, $value);
1130 }
1131 }
1132
1133 # TITLE: extract the document title
1134 if (exists $find_fields{'title'} && !$found_title) {
1135 # we want a title, and didn't find one in the meta tags
1136 # see if there's a <title> tag
1137 my $title;
1138 my $from = ""; # for debugging output only
1139 if ($html_header =~ m/<title[^>]*>([^<]+)<\/title[^>]*>/is) {
1140 $title = $1;
1141 $from = "<title> tags";
1142 }
1143
1144 if (!defined $title) {
1145 $from = "first 100 chars";
1146 # if no title use first 100 or so characters
1147 $title = $$textref;
1148 $title =~ s/^\xFE\xFF//; # Remove unicode byte order mark
1149 $title =~ s/^.*?<body>//si;
1150 # ignore javascript!
1151 $title =~ s@<script.*?</script>@ @sig;
1152 $title =~ s/<\/([^>]+)><\1>//g; # (eg) </b><b> - no space
1153 $title =~ s/<[^>]*>/ /g; # remove all HTML tags
1154 $title = substr ($title, 0, 100);
1155 $title =~ s/\s\S*$/.../;
1156 }
1157 $title =~ s/<[^>]*>/ /g; # remove html tags
1158 $title =~ s/&nbsp;/ /g;
1159 $title =~ s/(?:&nbsp;|\xc2\xa0)/ /g; # utf-8 for nbsp...
1160 $title =~ s/\s+/ /gs; # collapse multiple spaces
1161 $title =~ s/^\s*//; # remove leading spaces
1162 $title =~ s/\s*$//; # remove trailing spaces
1163
1164 $title =~ s/^$self->{'title_sub'}// if ($self->{'title_sub'});
1165 $title =~ s/^\s+//s; # in case title_sub introduced any...
1166 print STDERR "**** adding Title: ", Encode::encode("utf8",$title), "\n";
1167 $doc_obj->add_utf8_metadata ($section, "Title", $title);
1168 print $outhandle " extracted Title metadata \"$title\" from $from\n"
1169 if ($self->{'verbosity'} > 2);
1170 }
1171
1172 # add FileFormat metadata
1173 $doc_obj->add_metadata($section,"FileFormat", "HTML");
1174
1175 # Special, for metadata names such as tagH1 - extracts
1176 # the text between the first <H1> and </H1> tags into "H1" metadata.
1177
1178 foreach my $field (keys %find_fields) {
1179 if ($field !~ m/^tag([a-z0-9]+)$/i) {next}
1180 my $tag = $1;
1181 if ($$textref =~ m@<$tag[^>]*>(.*?)</$tag[^>]*>@g) {
1182 my $content = $1;
1183 $content =~ s/&nbsp;/ /g;
1184 $content =~ s/<[^>]*>/ /g;
1185 $content =~ s/^\s+//;
1186 $content =~ s/\s+$//;
1187 $content =~ s/\s+/ /gs;
1188 if ($content) {
1189 $tag=$find_fields{"tag$tag"}; # get the user's capitalisation
1190 $tag =~ s/^tag//i;
1191 $doc_obj->add_utf8_metadata ($section, $tag, $content);
1192 print $outhandle " extracted \"$tag\" metadata \"$content\"\n"
1193 if ($self->{'verbosity'} > 2);
1194 }
1195 }
1196 }
1197}
1198
1199
1200# evaluate any "../" to next directory up
1201# evaluate any "./" as here
1202sub eval_dir_dots {
1203 my $self = shift (@_);
1204 my ($filename) = @_;
1205 my $dirsep_os = &util::get_os_dirsep();
1206 my @dirsep = split(/$dirsep_os/,$filename);
1207
1208 my @eval_dirs = ();
1209 foreach my $d (@dirsep) {
1210 if ($d eq "..") {
1211 pop(@eval_dirs);
1212
1213 } elsif ($d eq ".") {
1214 # do nothing!
1215
1216 } else {
1217 push(@eval_dirs,$d);
1218 }
1219 }
1220
1221 # Need to fiddle with number of elements in @eval_dirs if the
1222 # first one is the empty string. This is because of a
1223 # modification to util::filename_cat that supresses the addition
1224 # of a leading '/' character (or \ if windows) (intended to help
1225 # filename cat with relative paths) if the first entry in the
1226 # array is the empty string. Making the array start with *two*
1227 # empty strings is a way to defeat this "smart" option.
1228 #
1229 if (scalar(@eval_dirs) > 0) {
1230 if ($eval_dirs[0] eq ""){
1231 unshift(@eval_dirs,"");
1232 }
1233 }
1234
1235 my $evaluated_filename = (scalar @eval_dirs > 0) ? &util::filename_cat(@eval_dirs) : "";
1236 return $evaluated_filename;
1237}
1238
1239sub replace_usemap_links {
1240 my $self = shift (@_);
1241 my ($front, $link, $back) = @_;
1242
1243 # remove quotes from link at start and end if necessary
1244 if ($link=~/^[\"\']/) {
1245 $link=~s/^[\"\']//;
1246 $link=~s/[\"\']$//;
1247 $front.='"';
1248 $back="\"$back";
1249 }
1250
1251 $link =~ s/^\.\///;
1252 return $front . $link . $back;
1253}
1254
1255sub inc_filecount {
1256 my $self = shift (@_);
1257
1258 if ($self->{'file_num'} == 1000) {
1259 $self->{'dir_num'} ++;
1260 $self->{'file_num'} = 0;
1261 } else {
1262 $self->{'file_num'} ++;
1263 }
1264}
1265
1266
1267# Extend read_file so that strings like &eacute; are
1268# converted to UTF8 internally.
1269#
1270# We don't convert &lt; or &gt; or &amp; or &quot; in case
1271# they interfere with the GML files
1272
1273sub read_file {
1274 my $self = shift(@_);
1275 my ($filename, $encoding, $language, $textref) = @_;
1276
1277 $self->SUPER::read_file($filename, $encoding, $language, $textref);
1278
1279 # Convert entities to their UTF8 equivalents
1280 $$textref =~ s/&(lt|gt|amp|quot|nbsp);/&z$1;/go;
1281 $$textref =~ s/&([^;]+);/&ghtml::getcharequiv($1,1,1)/gseo;
1282 $$textref =~ s/&z(lt|gt|amp|quot|nbsp);/&$1;/go;
1283
1284}
1285
1286sub HB_read_html_file {
1287 my $self = shift (@_);
1288 my ($htmlfile, $text) = @_;
1289
1290 # load in the file
1291 if (!open (FILE, $htmlfile)) {
1292 print STDERR "ERROR - could not open $htmlfile\n";
1293 return;
1294 }
1295
1296 my $foundbody = 0;
1297 $self->HB_gettext (\$foundbody, $text, "FILE");
1298 close FILE;
1299
1300 # just in case there was no <body> tag
1301 if (!$foundbody) {
1302 $foundbody = 1;
1303 open (FILE, $htmlfile) || return;
1304 $self->HB_gettext (\$foundbody, $text, "FILE");
1305 close FILE;
1306 }
1307 # text is in utf8
1308}
1309
1310# converts the text to utf8, as ghtml does that for &eacute; etc.
1311sub HB_gettext {
1312 my $self = shift (@_);
1313 my ($foundbody, $text, $handle) = @_;
1314
1315 my $line = "";
1316 while (defined ($line = <$handle>)) {
1317 # look for body tag
1318 if (!$$foundbody) {
1319 if ($line =~ s/^.*<body[^>]*>//i) {
1320 $$foundbody = 1;
1321 } else {
1322 next;
1323 }
1324 }
1325
1326 # check for symbol fonts
1327 if ($line =~ m/<font [^>]*?face\s*=\s*\"?(\w+)\"?/i) {
1328 my $font = $1;
1329 print STDERR "HBPlug::HB_gettext - warning removed font $font\n"
1330 if ($font !~ m/^arial$/i);
1331 }
1332
1333 $$text .= $line;
1334 }
1335
1336 if ($self->{'input_encoding'} eq "iso_8859_1") {
1337 # convert to utf-8
1338 $$text=&unicode::unicode2utf8(&unicode::convert2unicode("iso_8859_1", $text));
1339 }
1340 # convert any alphanumeric character entities to their utf-8
1341 # equivalent for indexing purposes
1342 #&ghtml::convertcharentities ($$text);
1343
1344 $$text =~ s/\s+/ /g; # remove \n's
1345
1346 # At this point $$text is a binary byte string
1347 # => turn it into a Unicode aware string, so full
1348 # Unicode aware pattern matching can be used.
1349 # For instance: 's/\x{0101}//g' or '[[:upper:]]'
1350 #
1351
1352 $$text = decode("utf8",$$text);
1353}
1354
1355sub HB_clean_section {
1356 my $self = shift (@_);
1357 my ($section) = @_;
1358
1359 # remove tags without a starting tag from the section
1360 my ($tag, $tagstart);
1361 while ($section =~ m/<\/([^>]{1,10})>/) {
1362 $tag = $1;
1363 $tagstart = index($section, "<$tag");
1364 last if (($tagstart >= 0) && ($tagstart < index($section, "<\/$tag")));
1365 $section =~ s/<\/$tag>//;
1366 }
1367
1368 # remove extra paragraph tags
1369 while ($section =~ s/<p\b[^>]*>\s*<p\b/<p/ig) {}
1370
1371 # remove extra stuff at the end of the section
1372 while ($section =~ s/(<u>|<i>|<b>|<p\b[^>]*>|&nbsp;|\s)$//i) {}
1373
1374 # add a newline at the beginning of each paragraph
1375 $section =~ s/(.)\s*<p\b/$1\n\n<p/gi;
1376
1377 # add a newline every 80 characters at a word boundary
1378 # Note: this regular expression puts a line feed before
1379 # the last word in each section, even when it is not
1380 # needed.
1381 $section =~ s/(.{1,80})\s/$1\n/g;
1382
1383 # fix up the image links
1384 $section =~ s/<img[^>]*?src=\"?([^\">]+)\"?[^>]*>/
1385 <center><img src=\"$1\" \/><\/center><br\/>/ig;
1386 $section =~ s/&lt;&lt;I&gt;&gt;\s*([^\.]+\.(png|jpg|gif))/
1387 <center><img src=\"$1\" \/><\/center><br\/>/ig;
1388
1389 return $section;
1390}
1391
1392# Will convert the oldHDL format to the new HDL format (using the Section tag)
1393sub convert_to_newHDLformat
1394{
1395 my $self = shift (@_);
1396 my ($file,$cnfile) = @_;
1397 my $input_filename = $file;
1398 my $tmp_filename = $cnfile;
1399
1400 # write HTML tmp file with new HDL format
1401 open (PROD, ">$tmp_filename") || die("Error Writing to File: $tmp_filename $!");
1402
1403 # read in the file and do basic html cleaning (removing header etc)
1404 my $html = "";
1405 $self->HB_read_html_file ($input_filename, \$html);
1406
1407 # process the file one section at a time
1408 my $curtoclevel = 1;
1409 my $firstsection = 1;
1410 my $toclevel = 0;
1411 while (length ($html) > 0) {
1412 if ($html =~ s/^.*?(?:<p\b[^>]*>)?((<b>|<i>|<u>|\s)*)&lt;&lt;TOC(\d+)&gt;&gt;\s*(.*?)<p\b/<p/i) {
1413 $toclevel = $3;
1414 my $title = $4;
1415 my $sectiontext = "";
1416 if ($html =~ s/^(.*?)((?:<p\b[^>]*>)?((<b>|<i>|<u>|\s)*)&lt;&lt;TOC\d+&gt;&gt;)/$2/i) {
1417 $sectiontext = $1;
1418 } else {
1419 $sectiontext = $html;
1420 $html = "";
1421 }
1422
1423 # remove tags and extra spaces from the title
1424 $title =~ s/<\/?[^>]+>//g;
1425 $title =~ s/^\s+|\s+$//g;
1426
1427 # close any sections below the current level and
1428 # create a new section (special case for the firstsection)
1429 print PROD "<!--\n";
1430 while (($curtoclevel > $toclevel) ||
1431 (!$firstsection && $curtoclevel == $toclevel)) {
1432 $curtoclevel--;
1433 print PROD "</Section>\n";
1434 }
1435 if ($curtoclevel+1 < $toclevel) {
1436 print STDERR "WARNING - jump in toc levels in $input_filename " .
1437 "from $curtoclevel to $toclevel\n";
1438 }
1439 while ($curtoclevel < $toclevel) {
1440 $curtoclevel++;
1441 }
1442
1443 if ($curtoclevel == 1) {
1444 # add the header tag
1445 print PROD "-->\n";
1446 print PROD "<HTML>\n<HEAD>\n<TITLE>$title</TITLE>\n</HEAD>\n<BODY>\n";
1447 print PROD "<!--\n";
1448 }
1449
1450 print PROD "<Section>\n\t<Description>\n\t\t<Metadata name=\"Title\">$title</Metadata>\n\t</Description>\n";
1451
1452 print PROD "-->\n";
1453
1454 # clean up the section html
1455 $sectiontext = $self->HB_clean_section($sectiontext);
1456
1457 print PROD "$sectiontext\n";
1458
1459 } else {
1460 print STDERR "WARNING - leftover text\n" , $self->shorten($html),
1461 "\nin $input_filename\n";
1462 last;
1463 }
1464 $firstsection = 0;
1465 }
1466
1467 print PROD "<!--\n";
1468 while ($curtoclevel > 0) {
1469 $curtoclevel--;
1470 print PROD "</Section>\n";
1471 }
1472 print PROD "-->\n";
1473
1474 close (PROD) || die("Error Closing File: $tmp_filename $!");
1475
1476 return $tmp_filename;
1477}
1478
1479sub shorten {
1480 my $self = shift (@_);
1481 my ($text) = @_;
1482
1483 return "\"$text\"" if (length($text) < 100);
1484
1485 return "\"" . substr ($text, 0, 50) . "\" ... \"" .
1486 substr ($text, length($text)-50) . "\"";
1487}
1488
1489sub convert_tidy_or_oldHDL_file
1490{
1491 my $self = shift (@_);
1492 my ($file) = @_;
1493 my $input_filename = $file;
1494
1495 if (-d $input_filename)
1496 {
1497 return $input_filename;
1498 }
1499
1500 # get the input filename
1501 my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
1502 my $base_dirname = $dirname;
1503 $suffix = lc($suffix);
1504
1505 # derive tmp filename from input filename
1506 # Remove any white space from filename -- no risk of name collision, and
1507 # makes later conversion by utils simpler. Leave spaces in path...
1508 # tidy up the filename with space, dot, hyphen between
1509 $tailname =~ s/\s+//g;
1510 $tailname =~ s/\.+//g;
1511 $tailname =~ s/\-+//g;
1512 # convert to utf-8 otherwise we have problems with the doc.xml file
1513 # later on
1514 &unicode::ensure_utf8(\$tailname);
1515
1516 # softlink to collection tmp dir
1517 my $tmp_dirname = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "tidytmp");
1518 &util::mk_dir($tmp_dirname) if (!-e $tmp_dirname);
1519
1520 my $test_dirname = "";
1521 my $f_separator = &util::get_os_dirsep();
1522
1523 if ($dirname =~ m/import$f_separator/)
1524 {
1525 $test_dirname = $'; #'
1526
1527 #print STDERR "init $'\n";
1528
1529 while ($test_dirname =~ m/[$f_separator]/)
1530 {
1531 my $folderdirname = $`;
1532 $tmp_dirname = &util::filename_cat($tmp_dirname,$folderdirname);
1533 &util::mk_dir($tmp_dirname) if (!-e $tmp_dirname);
1534 $test_dirname = $'; #'
1535 }
1536 }
1537
1538 my $tmp_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix");
1539
1540 # tidy or convert the input file if it is a HTML-like file or it is accepted by the process_exp
1541 if (($suffix eq ".htm") || ($suffix eq ".html") || ($suffix eq ".shtml"))
1542 {
1543 #convert the input file to a new style HDL
1544 my $hdl_output_filename = $input_filename;
1545 if ($self->{'old_style_HDL'})
1546 {
1547 $hdl_output_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix");
1548 $hdl_output_filename = $self->convert_to_newHDLformat($input_filename,$hdl_output_filename);
1549 }
1550
1551 #just for checking copy all other file from the base dir to tmp dir if it is not exists
1552 opendir(DIR,$base_dirname) or die "Can't open base directory : $base_dirname!";
1553 my @files = grep {!/^\.+$/} readdir(DIR);
1554 close(DIR);
1555
1556 foreach my $file (@files)
1557 {
1558 my $src_file = &util::filename_cat($base_dirname,$file);
1559 my $dest_file = &util::filename_cat($tmp_dirname,$file);
1560 if ((!-e $dest_file) && (!-d $src_file))
1561 {
1562 # just copy the original file back to the tmp directory
1563 copy($src_file,$dest_file) or die "Can't copy file $src_file to $dest_file $!";
1564 }
1565 }
1566
1567 # tidy the input file
1568 my $tidy_output_filename = $hdl_output_filename;
1569 if ($self->{'use_realistic_book'})
1570 {
1571 $tidy_output_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix");
1572 $tidy_output_filename = $self->tmp_tidy_file($hdl_output_filename,$tidy_output_filename);
1573 }
1574 $tmp_filename = $tidy_output_filename;
1575 }
1576 else
1577 {
1578 if (!-e $tmp_filename)
1579 {
1580 # just copy the original file back to the tmp directory
1581 copy($input_filename,$tmp_filename) or die "Can't copy file $input_filename to $tmp_filename $!";
1582 }
1583 }
1584
1585 return $tmp_filename;
1586}
1587
1588
1589# Will make the html input file as a proper XML file with removed font tag and
1590# image size added to the img tag.
1591# The tidying process takes place in a collection specific 'tmp' directory so
1592# that we don't accidentally damage the input.
1593sub tmp_tidy_file
1594{
1595 my $self = shift (@_);
1596 my ($file,$cnfile) = @_;
1597 my $input_filename = $file;
1598 my $tmp_filename = $cnfile;
1599
1600 # get the input filename
1601 my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
1602
1603 require HTML::TokeParser::Simple;
1604
1605 # create HTML parser to decode the input file
1606 my $parser = HTML::TokeParser::Simple->new($input_filename);
1607
1608 # write HTML tmp file without the font tag and image size are added to the img tag
1609 open (PROD, ">$tmp_filename") || die("Error Writing to File: $tmp_filename $!");
1610 while (my $token = $parser->get_token())
1611 {
1612 # is it an img tag
1613 if ($token->is_start_tag('img'))
1614 {
1615 # get the attributes
1616 my $attr = $token->return_attr;
1617
1618 # get the full path to the image
1619 my $img_file = &util::filename_cat($dirname,$attr->{src});
1620
1621 # set the width and height attribute
1622 ($attr->{width}, $attr->{height}) = imgsize($img_file);
1623
1624 # recreate the tag
1625 print PROD "<img";
1626 print PROD map { qq { $_="$attr->{$_}"} } keys %$attr;
1627 print PROD ">";
1628 }
1629 # is it a font tag
1630 else
1631 {
1632 if (($token->is_start_tag('font')) || ($token->is_end_tag('font')))
1633 {
1634 # remove font tag
1635 print PROD "";
1636 }
1637 else
1638 {
1639 # print without changes
1640 print PROD $token->as_is;
1641 }
1642 }
1643 }
1644 close (PROD) || die("Error Closing File: $tmp_filename $!");
1645
1646 # run html-tidy on the tmp file to make it a proper XML file
1647
1648 my $outhandle = $self->{'outhandle'};
1649 print $outhandle "Converting HTML to be XML compliant:\n";
1650
1651 my $tidy_cmd = "tidy";
1652 $tidy_cmd .= " -q" if ($self->{'verbosity'} <= 2);
1653 $tidy_cmd .= " -raw -wrap 0 -asxml \"$tmp_filename\"";
1654 if ($self->{'verbosity'} <= 2) {
1655 if ($ENV{'GSDLOS'} =~ m/^windows/i) {
1656 $tidy_cmd .= " 2>nul";
1657 }
1658 else {
1659 $tidy_cmd .= " 2>/dev/null";
1660 }
1661 print $outhandle " => $tidy_cmd\n";
1662 }
1663
1664 my $tidyfile = `$tidy_cmd`;
1665
1666 # write result back to the tmp file
1667 open (PROD, ">$tmp_filename") || die("Error Writing to File: $tmp_filename $!");
1668 print PROD $tidyfile;
1669 close (PROD) || die("Error Closing File: $tmp_filename $!");
1670
1671 # return the output filename
1672 return $tmp_filename;
1673}
1674
1675sub associate_cover_image
1676{
1677 my $self = shift(@_);
1678 my ($doc_obj, $filename) = @_;
1679 if (($self->{'use_realistic_book'}) || ($self->{'old_style_HDL'}))
1680 {
1681 # we will have cover image in tidytmp, but want it from import
1682 $filename =~ s/([\\\/])tidytmp([\\\/])/$1import$2/;
1683 }
1684 $self->SUPER::associate_cover_image($doc_obj, $filename);
1685}
1686
1687
16881;
Note: See TracBrowser for help on using the repository browser.