source: main/trunk/greenstone2/perllib/plugins/HTMLPlugin.pm@ 23347

Last change on this file since 23347 was 23347, checked in by davidb, 13 years ago

Tidy up of debugging statements for handline filename encodings, plus finishing off the 'deduce_filename_encoding' routine

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 57.1 KB
Line 
1###########################################################################
2#
3# HTMLPlugin.pm -- basic html plugin
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 1999 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27#
28# Note that this plugin handles frames only in a very simple way
29# i.e. each frame is treated as a separate document. This means
30# search results will contain links to individual frames rather
31# than linking to the top level frameset.
32# There may also be some problems caused by the _parent target
33# (it's removed by this plugin)
34#
35
36package HTMLPlugin;
37
38use Encode;
39
40use ReadTextFile;
41use HBPlugin;
42use ghtml;
43use unicode;
44use util;
45use XMLParser;
46
47use Image::Size;
48use File::Copy;
49
50sub BEGIN {
51 @HTMLPlugin::ISA = ('ReadTextFile', 'HBPlugin');
52}
53
54use strict; # every perl program should have this!
55no strict 'refs'; # make an exception so we can use variables as filehandles
56
57my $arguments =
58 [ { 'name' => "process_exp",
59 'desc' => "{BasePlugin.process_exp}",
60 'type' => "regexp",
61 'deft' => &get_default_process_exp() },
62 { 'name' => "block_exp",
63 'desc' => "{BasePlugin.block_exp}",
64 'type' => 'regexp',
65 'deft' => &get_default_block_exp() },
66 { 'name' => "nolinks",
67 'desc' => "{HTMLPlugin.nolinks}",
68 'type' => "flag" },
69 { 'name' => "keep_head",
70 'desc' => "{HTMLPlugin.keep_head}",
71 'type' => "flag" },
72 { 'name' => "no_metadata",
73 'desc' => "{HTMLPlugin.no_metadata}",
74 'type' => "flag" },
75 { 'name' => "metadata_fields",
76 'desc' => "{HTMLPlugin.metadata_fields}",
77 'type' => "string",
78 'deft' => "Title" },
79 { 'name' => "metadata_field_separator",
80 'desc' => "{HTMLPlugin.metadata_field_separator}",
81 'type' => "string",
82 'deft' => "" },
83 { 'name' => "hunt_creator_metadata",
84 'desc' => "{HTMLPlugin.hunt_creator_metadata}",
85 'type' => "flag" },
86 { 'name' => "file_is_url",
87 'desc' => "{HTMLPlugin.file_is_url}",
88 'type' => "flag" },
89 { 'name' => "assoc_files",
90 'desc' => "{HTMLPlugin.assoc_files}",
91 'type' => "regexp",
92 'deft' => &get_default_block_exp() },
93 { 'name' => "rename_assoc_files",
94 'desc' => "{HTMLPlugin.rename_assoc_files}",
95 'type' => "flag" },
96 { 'name' => "title_sub",
97 'desc' => "{HTMLPlugin.title_sub}",
98 'type' => "string",
99 'deft' => "" },
100 { 'name' => "description_tags",
101 'desc' => "{HTMLPlugin.description_tags}",
102 'type' => "flag" },
103 # retain this for backward compatibility (w3mir option was replaced by
104 # file_is_url)
105 { 'name' => "w3mir",
106# 'desc' => "{HTMLPlugin.w3mir}",
107 'type' => "flag",
108 'hiddengli' => "yes"},
109 { 'name' => "no_strip_metadata_html",
110 'desc' => "{HTMLPlugin.no_strip_metadata_html}",
111 'type' => "string",
112 'deft' => "",
113 'reqd' => "no"},
114 { 'name' => "sectionalise_using_h_tags",
115 'desc' => "{HTMLPlugin.sectionalise_using_h_tags}",
116 'type' => "flag" },
117 { 'name' => "use_realistic_book",
118 'desc' => "{HTMLPlugin.tidy_html}",
119 'type' => "flag"},
120 { 'name' => "old_style_HDL",
121 'desc' => "{HTMLPlugin.old_style_HDL}",
122 'type' => "flag"},
123 {'name' => "processing_tmp_files",
124 'desc' => "{BasePlugin.processing_tmp_files}",
125 'type' => "flag",
126 'hiddengli' => "yes"}
127 ];
128
129my $options = { 'name' => "HTMLPlugin",
130 'desc' => "{HTMLPlugin.desc}",
131 'abstract' => "no",
132 'inherits' => "yes",
133 'args' => $arguments };
134
135
136sub new {
137 my ($class) = shift (@_);
138 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
139 push(@$pluginlist, $class);
140
141 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
142 push(@{$hashArgOptLists->{"OptList"}},$options);
143
144
145 my $self = new ReadTextFile($pluginlist,$inputargs,$hashArgOptLists);
146
147 if ($self->{'w3mir'}) {
148 $self->{'file_is_url'} = 1;
149 }
150 $self->{'aux_files'} = {};
151 $self->{'dir_num'} = 0;
152 $self->{'file_num'} = 0;
153
154 return bless $self, $class;
155}
156
157# may want to use (?i)\.(gif|jpe?g|jpe|png|css|js(?:@.*)?)$
158# if have eg <script language="javascript" src="img/lib.js@123">
159# blocking is now done by reading through the file and recording all the
160# images and other files
161sub get_default_block_exp {
162 my $self = shift (@_);
163
164 #return q^(?i)\.(gif|jpe?g|jpe|jpg|png|css)$^;
165 return "";
166}
167
168sub get_default_process_exp {
169 my $self = shift (@_);
170
171 # the last option is an attempt to encode the concept of an html query ...
172 return q^(?i)(\.html?|\.shtml|\.shm|\.asp|\.php\d?|\.cgi|.+\?.+=.*)$^;
173}
174
175sub store_block_files
176{
177 my $self =shift (@_);
178 my ($filename_full_path, $block_hash) = @_;
179
180 my $html_fname = $filename_full_path;
181 my @file_blocks;
182
183 my ($language, $content_encoding) = $self->textcat_get_language_encoding ($filename_full_path);
184 $self->{'store_content_encoding'}->{$filename_full_path} = $content_encoding;
185
186 # read in file ($text will be in utf8)
187 my $raw_text = "";
188 $self->read_file_no_decoding ($filename_full_path, \$raw_text);
189
190 my $textref = \$raw_text;
191 my $opencom = '(?:<!--|&lt;!(?:&mdash;|&#151;|--))';
192 my $closecom = '(?:-->|(?:&mdash;|&#151;|--)&gt;)';
193 $$textref =~ s/$opencom(.*?)$closecom//gs;
194
195 my $attval = "\\\"[^\\\"]+\\\"|[^\\s>]+";
196 my @img_matches = ($$textref =~ m/<img[^>]*?src\s*=\s*($attval)[^>]*>/igs);
197 my @usemap_matches = ($$textref =~ m/<img[^>]*?usemap\s*=\s*($attval)[^>]*>/igs);
198 my @link_matches = ($$textref =~ m/<link[^>]*?href\s*=\s*($attval)[^>]*>/igs);
199 my @embed_matches = ($$textref =~ m/<embed[^>]*?src\s*=\s*($attval)[^>]*>/igs);
200 my @tabbg_matches = ($$textref =~ m/<(?:body|table|tr|td)[^>]*?background\s*=\s*($attval)[^>]*>/igs);
201 my @script_matches = ($$textref =~ m/<script[^>]*?src\s*=\s*($attval)[^>]*>/igs);
202
203 if(!defined $self->{'utf8_to_original_filename'}) {
204 # maps from utf8 converted link name -> original filename referrred to by (possibly URL-encoded) src url
205 $self->{'utf8_to_original_filename'} = {};
206 }
207
208 foreach my $link (@img_matches, @usemap_matches, @link_matches, @embed_matches, @tabbg_matches, @script_matches) {
209
210 # remove quotes from link at start and end if necessary
211 if ($link=~/^\"/) {
212 $link=~s/^\"//;
213 $link=~s/\"$//;
214 }
215
216 $link =~ s/\#.*$//s; # remove any anchor names, e.g. foo.html#name becomes foo.html
217 # some links may just be anchor names
218 next unless ($link =~ /\S+/);
219
220 if ($link !~ m@^/@ && $link !~ m/^([A-Z]:?)\\/) {
221 # Turn relative file path into full path
222 my $dirname = &File::Basename::dirname($filename_full_path);
223 $link = &util::filename_cat($dirname, $link);
224 }
225 $link = $self->eval_dir_dots($link);
226
227 # this is the actual filename on the filesystem (that the link refers to)
228 my $url_original_filename = $self->opt_url_decode($link);
229
230 # Convert the url_original_filename into its utf8 version. Store the utf8 link along with the url_original_filename
231 my $utf8_link = "";
232 $self->decode_text($link,$content_encoding,$language,\$utf8_link);
233
234 $self->{'utf8_to_original_filename'}->{$utf8_link} = $url_original_filename;
235# print STDERR "**** utf8_encoded_link to original src filename:\n\t$utf8_link\n\t".$self->{'utf8_to_original_filename'}->{$utf8_link}."\n";
236
237 if ($url_original_filename ne $utf8_link) {
238 my $outhandle = $self->{'outhandle'};
239
240 print $outhandle "URL Encoding $url_original_filename\n";
241 print $outhandle " ->$utf8_link\n";
242 }
243
244 $block_hash->{'file_blocks'}->{$url_original_filename} = 1;
245 }
246}
247
248# Given a filename in any encoding, will URL decode it to get back the original filename
249# in the original encoding. Because this method is intended to work out the *original*
250# filename*, it does not URL decode any filename if a file by the name of the *URL-encoded*
251# string already exists in the local folder.
252# Return the original filename corresponding to the parameter URL-encoded filename, and
253# a decoded flag that is set to true iff URL-decoding had to be applied.
254sub opt_url_decode {
255 my $self = shift (@_);
256 my ($link) = @_;
257
258 # Replace %XX's in URL with decoded value if required.
259 # Note that the filename may include the %XX in some situations
260 if ($link =~ m/\%[A-F0-9]{2}/i) {
261 if (!-e $link) {
262 $link = &unicode::url_decode($link);
263 }
264 }
265
266 return $link;
267}
268
269sub read_into_doc_obj
270{
271 my $self = shift (@_);
272 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
273
274 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
275
276 # Lookup content_encoding worked out in file_block pass for this file
277 # Store it under the local name 'content_encoding' so its nice and
278 # easy to access
279 $self->{'content_encoding'} = $self->{'store_content_encoding'}->{$filename_full_path};
280
281 # get the input file
282 my $input_filename = $file;
283 my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
284 $suffix = lc($suffix);
285 my $tidy_filename;
286 if (($self->{'use_realistic_book'}) || ($self->{'old_style_HDL'}))
287 {
288 # because the document has to be sectionalized set the description tags
289 $self->{'description_tags'} = 1;
290
291 # set the file to be tidied
292 $input_filename = &util::filename_cat($base_dir,$file) if $base_dir =~ m/\w/;
293
294 # get the tidied file
295 #my $tidy_filename = $self->tmp_tidy_file($input_filename);
296 $tidy_filename = $self->convert_tidy_or_oldHDL_file($input_filename);
297
298 # derive tmp filename from input filename
299 my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($tidy_filename, "\\.[^\\.]+\$");
300
301 # set the new input file and base_dir to be from the tidied file
302 $file = "$tailname$suffix";
303 $base_dir = $dirname;
304 }
305
306 # call the parent read_into_doc_obj
307 my ($process_status,$doc_obj) = $self->SUPER::read_into_doc_obj($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli);
308 if (($self->{'use_realistic_book'}) || ($self->{'old_style_HDL'}))
309 {
310 # now we need to reset the filenames in the doc obj so that the converted filenames are not used
311 my $collect_file = &util::filename_within_collection($filename_full_path);
312 $doc_obj->set_source_filename ($collect_file, $self->{'file_rename_method'});
313 ## set_source_filename does not set the doc_obj source_path which is used in archives dbs for incremental
314 # build. so set it manually.
315 $doc_obj->{'source_path'} = $filename_full_path;
316 my $collect_conv_file = &util::filename_within_collection($tidy_filename);
317 $doc_obj->set_converted_filename($collect_conv_file);
318 $self->set_Source_metadata($doc_obj, $filename_no_path);
319 }
320
321 delete $self->{'store_content_encoding'}->{$filename_full_path};
322 $self->{'content_encoding'} = undef;
323
324 return ($process_status,$doc_obj);
325}
326
327# do plugin specific processing of doc_obj
328sub process {
329 my $self = shift (@_);
330 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
331 my $outhandle = $self->{'outhandle'};
332
333 if ($ENV{'GSDLOS'} =~ m/^windows/i) {
334 # this makes life so much easier... perl can cope with unix-style '/'s.
335 $base_dir =~ s@(\\)+@/@g;
336 $file =~ s@(\\)+@/@g;
337 }
338
339 # reset per-doc stuff...
340 $self->{'aux_files'} = {};
341 $self->{'dir_num'} = 0;
342 $self->{'file_num'} = 0;
343
344 # process an HTML file where sections are divided by headings tags (H1, H2 ...)
345 # you can also include metadata in the format (X can be any number)
346 # <hX>Title<!--gsdl-metadata
347 # <Metadata name="name1">value1</Metadata>
348 # ...
349 # <Metadata name="nameN">valueN</Metadata>
350 #--></hX>
351 if ($self->{'sectionalise_using_h_tags'}) {
352 # description_tags should allways be activated because we convert headings to description tags
353 $self->{'description_tags'} = 1;
354
355 my $arrSections = [];
356 $$textref =~ s/<h([0-9]+)[^>]*>(.*?)<\/h[0-9]+>/$self->process_heading($1, $2, $arrSections, $file)/isge;
357
358 if (scalar(@$arrSections)) {
359 my $strMetadata = $self->update_section_data($arrSections, -1);
360 if (length($strMetadata)) {
361 $strMetadata = '<!--' . $strMetadata . "\n-->\n</body>";
362 $$textref =~ s/<\/body>/$strMetadata/ig;
363 }
364 }
365 }
366
367 my $cursection = $doc_obj->get_top_section();
368
369 $self->extract_metadata ($textref, $metadata, $doc_obj, $cursection)
370 unless $self->{'no_metadata'} || $self->{'description_tags'};
371
372 # Store URL for page as metadata - this can be used for an
373 # altavista style search interface. The URL won't be valid
374 # unless the file structure contains the domain name (i.e.
375 # like when w3mir is used to download a website).
376
377 # URL metadata (even invalid ones) are used to support internal
378 # links, so even if 'file_is_url' is off, still need to store info
379
380 my ($tailname,$dirname) = &File::Basename::fileparse($file);
381
382# my $utf8_file = $self->filename_to_utf8_metadata($file);
383# $utf8_file =~ s/&\#095;/_/g;
384 my $utf8_file = &unicode::raw_filename_to_url_encoded($tailname);
385
386 if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
387 print STDERR "***!! file = $file\n";
388 print STDERR "***!! utf8_file = $utf8_file\n";
389 }
390
391
392 my $web_url = "http://";
393 if(defined $dirname) { # local directory
394 # Check for "ftp" in the domain name of the directory
395 # structure to determine if this URL should be a ftp:// URL
396 # This check is not infallible, but better than omitting the
397 # check, which would cause all files downloaded from ftp sites
398 # via mirroring with wget to have potentially erroneous http:// URLs
399 # assigned in their metadata
400 if ($dirname =~ /^[^\/]*ftp/i)
401 {
402 $web_url = "ftp://";
403 }
404 $dirname = $self->eval_dir_dots($dirname);
405 $dirname .= &util::get_dirsep() if $dirname ne ""; # if there's a directory, it should end on "/"
406 $web_url = $web_url.$dirname.$utf8_file;
407 } else {
408 $web_url = $web_url.$utf8_file;
409 }
410 $web_url =~ s/\\/\//g;
411 $doc_obj->add_utf8_metadata($cursection, "URL", $web_url);
412
413 if ($self->{'file_is_url'}) {
414 $doc_obj->add_metadata($cursection, "weblink", "<a href=\"$web_url\">");
415 $doc_obj->add_metadata($cursection, "webicon", "_iconworld_");
416 $doc_obj->add_metadata($cursection, "/weblink", "</a>");
417 }
418
419 if ($self->{'description_tags'}) {
420 # remove the html header - note that doing this here means any
421 # sections defined within the header will be lost (so all <Section>
422 # tags must appear within the body of the HTML)
423 my ($head_keep) = ($$textref =~ m/^(.*?)<body[^>]*>/is);
424
425 $$textref =~ s/^.*?<body[^>]*>//is;
426 $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
427
428 my $opencom = '(?:<!--|&lt;!(?:&mdash;|&#151;|--))';
429 my $closecom = '(?:-->|(?:&mdash;|&#151;|--)&gt;)';
430
431 my $lt = '(?:<|&lt;)';
432 my $gt = '(?:>|&gt;)';
433 my $quot = '(?:"|&quot;|&rdquo;|&ldquo;)';
434
435 my $dont_strip = '';
436 if ($self->{'no_strip_metadata_html'}) {
437 ($dont_strip = $self->{'no_strip_metadata_html'}) =~ s{,}{|}g;
438 }
439
440 my $found_something = 0; my $top = 1;
441 while ($$textref =~ s/^(.*?)$opencom(.*?)$closecom//s) {
442 my $text = $1;
443 my $comment = $2;
444 if (defined $text) {
445 # text before a comment - note that getting to here
446 # doesn't necessarily mean there are Section tags in
447 # the document
448 $self->process_section(\$text, $base_dir, $file, $doc_obj, $cursection);
449 }
450 while ($comment =~ s/$lt(.*?)$gt//s) {
451 my $tag = $1;
452 if ($tag eq "Section") {
453 $found_something = 1;
454 $cursection = $doc_obj->insert_section($doc_obj->get_end_child($cursection)) unless $top;
455 $top = 0;
456 } elsif ($tag eq "/Section") {
457 $found_something = 1;
458 $cursection = $doc_obj->get_parent_section ($cursection);
459 } elsif ($tag =~ m/^Metadata name=$quot(.*?)$quot/s) {
460 my $metaname = $1;
461 my $accumulate = $tag =~ m/mode=${quot}accumulate${quot}/ ? 1 : 0;
462 $comment =~ s/^(.*?)$lt\/Metadata$gt//s;
463 my $metavalue = $1;
464 $metavalue =~ s/^\s+//;
465 $metavalue =~ s/\s+$//;
466 # assume that no metadata value intentionally includes
467 # carriage returns or HTML tags (if they're there they
468 # were probably introduced when converting to HTML from
469 # some other format).
470 # actually some people want to have html tags in their
471 # metadata.
472 $metavalue =~ s/[\cJ\cM]/ /sg;
473 $metavalue =~ s/<[^>]+>//sg
474 unless $dont_strip && ($dont_strip eq 'all' || $metaname =~ m/^($dont_strip)$/);
475 $metavalue =~ s/\s+/ /sg;
476 if ($metaname =~ /\./) { # has a namespace
477 $metaname = "ex.$metaname";
478 }
479 if ($accumulate) {
480 $doc_obj->add_utf8_metadata($cursection, $metaname, $metavalue);
481 } else {
482 $doc_obj->set_utf8_metadata_element($cursection, $metaname, $metavalue);
483 }
484 } elsif ($tag eq "Description" || $tag eq "/Description") {
485 # do nothing with containing Description tags
486 } else {
487 # simple HTML tag (probably created by the conversion
488 # to HTML from some other format) - we'll ignore it and
489 # hope for the best ;-)
490 }
491 }
492 }
493 if ($cursection ne "") {
494 print $outhandle "HTMLPlugin: WARNING: $file contains unmatched <Section></Section> tags\n";
495 }
496
497 $$textref =~ s/^.*?<body[^>]*>//is;
498 $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
499 if ($$textref =~ m/\S/) {
500 if (!$found_something) {
501 if ($self->{'verbosity'} > 2) {
502 print $outhandle "HTMLPlugin: WARNING: $file appears to contain no Section tags so\n";
503 print $outhandle " will be processed as a single section document\n";
504 }
505
506 # go ahead and process single-section document
507 $self->process_section($textref, $base_dir, $file, $doc_obj, $cursection);
508
509 # if document contains no Section tags we'll go ahead
510 # and extract metadata (this won't have been done
511 # above as the -description_tags option prevents it)
512 my $complete_text = $head_keep.$doc_obj->get_text($cursection);
513 $self->extract_metadata (\$complete_text, $metadata, $doc_obj, $cursection)
514 unless $self->{'no_metadata'};
515
516 } else {
517 print $outhandle "HTMLPlugin: WARNING: $file contains the following text outside\n";
518 print $outhandle " of the final closing </Section> tag. This text will\n";
519 print $outhandle " be ignored.";
520
521 my ($text);
522 if (length($$textref) > 30) {
523 $text = substr($$textref, 0, 30) . "...";
524 } else {
525 $text = $$textref;
526 }
527 $text =~ s/\n/ /isg;
528 print $outhandle " ($text)\n";
529 }
530 } elsif (!$found_something) {
531
532 if ($self->{'verbosity'} > 2) {
533 # may get to here if document contained no valid Section
534 # tags but did contain some comments. The text will have
535 # been processed already but we should print the warning
536 # as above and extract metadata
537 print $outhandle "HTMLPlugin: WARNING: $file appears to contain no Section tags and\n";
538 print $outhandle " is blank or empty. Metadata will be assigned if present.\n";
539 }
540
541 my $complete_text = $head_keep.$doc_obj->get_text($cursection);
542 $self->extract_metadata (\$complete_text, $metadata, $doc_obj, $cursection)
543 unless $self->{'no_metadata'};
544 }
545
546 } else {
547
548 # remove header and footer
549 if (!$self->{'keep_head'} || $self->{'description_tags'}) {
550 $$textref =~ s/^.*?<body[^>]*>//is;
551 $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
552 }
553
554 # single section document
555 $self->process_section($textref, $base_dir, $file, $doc_obj, $cursection);
556 }
557
558 return 1;
559}
560
561
562sub process_heading
563{
564 my ($self, $nHeadNo, $strHeadingText, $arrSections, $file) = @_;
565 $strHeadingText = '' if (!defined($strHeadingText));
566
567 my $strMetadata = $self->update_section_data($arrSections, int($nHeadNo));
568
569 my $strSecMetadata = '';
570 while ($strHeadingText =~ s/<!--gsdl-metadata(.*?)-->//is)
571 {
572 $strSecMetadata .= $1;
573 }
574
575 $strHeadingText =~ s/^\s+//g;
576 $strHeadingText =~ s/\s+$//g;
577 $strSecMetadata =~ s/^\s+//g;
578 $strSecMetadata =~ s/\s+$//g;
579
580 $strMetadata .= "\n<Section>\n\t<Description>\n\t\t<Metadata name=\"Title\">" . $strHeadingText . "</Metadata>\n";
581
582 if (length($strSecMetadata)) {
583 $strMetadata .= "\t\t" . $strSecMetadata . "\n";
584 }
585
586 $strMetadata .= "\t</Description>\n";
587
588 return "<!--" . $strMetadata . "-->";
589}
590
591
592sub update_section_data
593{
594 my ($self, $arrSections, $nCurTocNo) = @_;
595 my ($strBuffer, $nLast, $nSections) = ('', 0, scalar(@$arrSections));
596
597 if ($nSections == 0) {
598 push @$arrSections, $nCurTocNo;
599 return $strBuffer;
600 }
601 $nLast = $arrSections->[$nSections - 1];
602 if ($nCurTocNo > $nLast) {
603 push @$arrSections, $nCurTocNo;
604 return $strBuffer;
605 }
606 for(my $i = $nSections - 1; $i >= 0; $i--) {
607 if ($nCurTocNo <= $arrSections->[$i]) {
608 $strBuffer .= "\n</Section>";
609 pop @$arrSections;
610 }
611 }
612 push @$arrSections, $nCurTocNo;
613 return $strBuffer;
614}
615
616
617# note that process_section may be called multiple times for a single
618# section (relying on the fact that add_utf8_text appends the text to any
619# that may exist already).
620sub process_section {
621 my $self = shift (@_);
622 my ($textref, $base_dir, $file, $doc_obj, $cursection) = @_;
623 # trap links
624 if (!$self->{'nolinks'}) {
625 # usemap="./#index" not handled correctly => change to "#index"
626## $$textref =~ s/(<img[^>]*?usemap\s*=\s*[\"\']?)([^\"\'>\s]+)([\"\']?[^>]*>)/
627
628 $$textref =~ s/(<img[^>]*?usemap\s*=\s*)((?:[\"][^\"]+[\"])|(?:[\'][^\']+[\'])|(?:[^\s\/>]+))([^>]*>)/
629 $self->replace_usemap_links($1, $2, $3)/isge;
630
631## $$textref =~ s/(<(?:a|area|frame|link|script)\s+[^>]*?\s*(?:href|src)\s*=\s*[\"\']?)([^\"\'>\s]+)([\"\']?[^>]*>)/
632
633 $$textref =~ s/(<(?:a|area|frame|link|script)\s+[^>]*?\s*(?:href|src)\s*=\s*)((?:[\"][^\"]+[\"])|(?:[\'][^\']+[\'])|(?:[^\s\/>]+))([^>]*>)/
634 $self->replace_href_links ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
635 }
636
637 # trap images
638
639 # Previously, by default, HTMLPlugin would embed <img> tags inside anchor tags
640 # i.e. <a href="image><img src="image"></a> in order to overcome a problem that
641 # turned regular text succeeding images into links. That is, by embedding <imgs>
642 # inside <a href=""></a>, the text following images were no longer misbehaving.
643 # However, there would be many occasions whereby images were not meant to link
644 # to their source images but where the images would link to another web page.
645 # To allow this, the no_image_links option was introduced: it would prevent
646 # the behaviour of embedding images into links that referenced the source images.
647
648 # Somewhere along the line, the problem of normal text turning into links when
649 # such text followed images which were not embedded in <a href=""></a> ceased
650 # to occur. This is why the following lines have been commented out (as well as
651 # two lines in replace_images). They appear to no longer apply.
652
653 # If at any time, there is a need for having images embedded in <a> anchor tags,
654 # then it might be better to turn that into an HTMLPlugin option rather than make
655 # it the default behaviour. Also, eventually, no_image_links needs to become
656 # a deprecated option for HTMLPlugin as it has now become the default behaviour.
657
658 #if(!$self->{'no_image_links'}){
659 $$textref =~ s/(<(?:img|embed|table|tr|td)[^>]*?(?:src|background)\s*=\s*)((?:[\"][^\"]+[\"])|(?:[\'][^\']+[\'])|(?:[^\s\/>]+))([^>]*>)/
660 $self->replace_images ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
661 #}
662
663 # add text to document object
664 # turn \ into \\ so that the rest of greenstone doesn't think there
665 # is an escape code following. (Macro parsing loses them...)
666 $$textref =~ s/\\/\\\\/go;
667
668 $doc_obj->add_utf8_text($cursection, $$textref);
669}
670
671sub replace_images {
672 my $self = shift (@_);
673 my ($front, $link, $back, $base_dir,
674 $file, $doc_obj, $section) = @_;
675
676 # remove quotes from link at start and end if necessary
677 if ($link=~/^[\"\']/) {
678 $link=~s/^[\"\']//;
679 $link=~s/[\"\']$//;
680 $front.='"';
681 $back="\"$back";
682 }
683
684 $link =~ s/\n/ /g;
685
686 # Hack to overcome Windows wv 0.7.1 bug that causes embedded images to be broken
687 # If the Word file path has spaces in it, wv messes up and you end up with
688 # absolute paths for the images, and without the "file://" prefix
689 # So check for this special case and massage the data to be correct
690 if ($ENV{'GSDLOS'} =~ m/^windows/i && $self->{'plugin_type'} eq "WordPlug" && $link =~ m/^[A-Za-z]\:\\/) {
691 $link =~ s/^.*\\([^\\]+)$/$1/;
692 }
693
694 my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
695
696 my $img_file = $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section);
697
698# print STDERR "**** link = $link\n**** href = $href\n**** img_file = $img_file, rl = $rl\n";
699
700 my $anchor_name = $img_file;
701 #$anchor_name =~ s/^.*\///;
702 #$anchor_name = "<a name=\"$anchor_name\" ></a>";
703
704 my $image_link = $front . $img_file .$back;
705 return $image_link;
706
707 # The reasons for why the following two lines are no longer necessary can be
708 # found in subroutine process_section
709 #my $anchor_link = "<a href=\"$img_file\" >".$image_link."</a>";
710 #return $anchor_link;
711
712 #return $front . $img_file . $back . $anchor_name;
713}
714
715sub replace_href_links {
716 my $self = shift (@_);
717 my ($front, $link, $back, $base_dir, $file, $doc_obj, $section) = @_;
718
719 # remove quotes from link at start and end if necessary
720 if ($link=~/^[\"\']/) {
721 $link=~s/^[\"\']//;
722 $link=~s/[\"\']$//;
723 $front.='"';
724 $back="\"$back";
725 }
726
727 # attempt to sort out targets - frames are not handled
728 # well in this plugin and some cases will screw things
729 # up - e.g. the _parent target (so we'll just remove
730 # them all ;-)
731 $front =~ s/(target=\"?)_top(\"?)/$1_gsdltop_$2/is;
732 $back =~ s/(target=\"?)_top(\"?)/$1_gsdltop_$2/is;
733 $front =~ s/target=\"?_parent\"?//is;
734 $back =~ s/target=\"?_parent\"?//is;
735
736 return $front . $link . $back if $link =~ m/^\#/s;
737 $link =~ s/\n/ /g;
738
739 # Find file referred to by $link on file system
740 # This is more complicated than it sounds when char encodings
741 # is taken in to account
742 my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
743
744 # href may use '\'s where '/'s should be on Windows
745 $href =~ s/\\/\//g;
746 my ($filename) = $href =~ m/^(?:.*?):(?:\/\/)?(.*)/;
747
748
749 ##### leave all these links alone (they won't be picked up by intermediate
750 ##### pages). I think that's safest when dealing with frames, targets etc.
751 ##### (at least until I think of a better way to do it). Problems occur with
752 ##### mailto links from within small frames, the intermediate page is displayed
753 ##### within that frame and can't be seen. There is still potential for this to
754 ##### happen even with html pages - the solution seems to be to somehow tell
755 ##### the browser from the server side to display the page being sent (i.e.
756 ##### the intermediate page) in the top level window - I'm not sure if that's
757 ##### possible - the following line should probably be deleted if that can be done
758 return $front . $link . $back if $href =~ m/^(mailto|news|gopher|nntp|telnet|javascript):/is;
759
760 if (($rl == 0) || ($filename =~ m/$self->{'process_exp'}/) ||
761 ($href =~ m/\/$/) || ($href =~ m/^(mailto|news|gopher|nntp|telnet|javascript):/i)) {
762
763
764 # If web page didn't give encoding, then default to utf8
765 if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
766 print STDERR "*** Web page didn't give encoding, defaulting to UTF8!\n";
767 print STDERR "***** looking up $file\n";
768 }
769
770 my $content_encoding= $self->{'content_encoding'} || "utf8";
771 $href = encode($content_encoding,$href);
772
773 $href = &unicode::raw_filename_to_url_encoded($href);
774 $href = &unicode::filename_to_url($href);
775
776 &ghtml::urlsafe ($href);
777 if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
778 print STDERR "***!!! href=$href\n";
779 }
780
781
782 return $front . "_httpextlink_&amp;rl=" . $rl . "&amp;href=" . $href . $hash_part . $back;
783 } else {
784 # link is to some other type of file (e.g., an image) so we'll
785 # need to associate that file
786 return $front . $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section) . $back;
787 }
788}
789
790sub add_file {
791 my $self = shift (@_);
792 my ($href, $rl, $hash_part, $base_dir, $doc_obj, $section) = @_;
793 my ($newname);
794
795 my $filename = $href;
796 if ($base_dir eq "") {
797 # remove http:/ thereby leaving one slash at the start
798 $filename =~ s/^[^:]*:\///;
799 }
800 else {
801 # remove http://
802 $filename =~ s/^[^:]*:\/\///;
803 }
804
805 $filename = &util::filename_cat($base_dir, $filename);
806 if (($self->{'use_realistic_book'}) || ($self->{'old_style_HDL'})) {
807 # we are processing a tidytmp file - want paths to be in import
808 $filename =~ s/([\\\/])tidytmp([\\\/])/$1import$2/;
809 }
810
811 # Replace %XX's in URL with decoded value if required. Note that the
812 # filename may include the %XX in some situations. If the *original*
813 # file's name was in URL encoding, the following method will not decode
814 # it.
815 my $utf8_filename = $filename;
816 my $opt_decode_utf8_filename = $self->opt_url_decode($utf8_filename);
817
818 my $content_encoding= $self->{'content_encoding'} || "utf8";
819
820 # The filenames that come through the HTML file have been decoded
821 # into Unicode aware Perl strings. Need to convert them back
822 # to their initial raw-byte encoding to match the file that
823 # exists on the file system
824 $filename = encode($content_encoding, $opt_decode_utf8_filename);
825
826
827 # some special processing if the intended filename was converted to utf8, but
828 # the actual file still needs to be renamed
829 if (!-e $filename) {
830 # try the original filename stored in map
831 if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
832 print STDERR "***###!! orig filename did not exist: $filename\n";
833 }
834
835 my $original_filename = $self->{'utf8_to_original_filename'}->{$utf8_filename};
836
837 if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
838 print STDERR "**** Trying for $original_filename\n";
839 }
840
841 if (defined $original_filename && -e $original_filename) {
842 if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
843 print STDERR "*** found match\n";
844 }
845 $filename = $original_filename;
846 }
847 }
848
849 my ($ext) = $filename =~ m/(\.[^\.]*)$/;
850
851 if ($rl == 0) {
852 if ((!defined $ext) || ($ext !~ m/$self->{'assoc_files'}/)) {
853 return "_httpextlink_&amp;rl=0&amp;el=prompt&amp;href=" . $href . $hash_part;
854 }
855 else {
856 return "_httpextlink_&amp;rl=0&amp;el=direct&amp;href=" . $href . $hash_part;
857 }
858 }
859
860 if ((!defined $ext) || ($ext !~ m/$self->{'assoc_files'}/)) {
861 return "_httpextlink_&amp;rl=" . $rl . "&amp;href=" . $href . $hash_part;
862 }
863 # add the original image file as a source file
864 if (!$self->{'processing_tmp_files'} ) {
865 $doc_obj->associate_source_file($filename);
866 }
867 if ($self->{'rename_assoc_files'}) {
868 if (defined $self->{'aux_files'}->{$href}) {
869 $newname = $self->{'aux_files'}->{$href}->{'dir_num'} . "/" .
870 $self->{'aux_files'}->{$href}->{'file_num'} . $ext;
871 } else {
872 $newname = $self->{'dir_num'} . "/" . $self->{'file_num'} . $ext;
873 $self->{'aux_files'}->{$href} = {'dir_num' => $self->{'dir_num'}, 'file_num' => $self->{'file_num'}};
874 $self->inc_filecount ();
875 }
876 $doc_obj->associate_file($filename, $newname, undef, $section);
877 return "_httpdocimg_/$newname";
878 } else {
879 if(&unicode::is_url_encoded($utf8_filename)) {
880 # use the possibly-decoded filename instead to avoid double URL encoding
881 ($newname) = $filename =~ m/([^\/\\]*)$/;
882 } else {
883 ($newname) = $utf8_filename =~ m/([^\/\\]*)$/;
884 }
885
886 # Make sure this name uses only ASCII characters.
887 # We use either base64 or URL encoding, as these preserve original encoding
888 $newname = &util::rename_file($newname, $self->{'file_rename_method'});
889
890 $doc_obj->associate_file($filename, $newname, undef, $section);
891
892 # Since the generated image will be URL-encoded to avoid file-system/browser mess-ups
893 # of filenames, URL-encode the additional percent signs of the URL-encoded filename
894 my $newname_url = $newname;
895 $newname_url = &unicode::filename_to_url($newname_url);
896 return "_httpdocimg_/$newname_url";
897 }
898}
899
900
901sub format_link {
902 my $self = shift (@_);
903 my ($link, $base_dir, $file) = @_;
904
905 my ($before_hash, $hash_part) = $link =~ m/^([^\#]*)(\#?.*)$/;
906
907 $hash_part = "" if !defined $hash_part;
908 if (!defined $before_hash || $before_hash !~ m/[\w\.\/]/) {
909 my $outhandle = $self->{'outhandle'};
910 print $outhandle "HTMLPlugin: ERROR - badly formatted tag ignored ($link)\n"
911 if $self->{'verbosity'};
912 return ($link, "", 0);
913 }
914
915 if ($before_hash =~ s@^((?:http|https|ftp|file|mms)://)@@i) {
916 my $type = $1;
917
918 if ($link =~ m/^(http|ftp):/i) {
919 # Turn url (using /) into file name (possibly using \ on windows)
920 my @http_dir_split = split('/', $before_hash);
921 $before_hash = &util::filename_cat(@http_dir_split);
922 }
923
924 $before_hash = $self->eval_dir_dots($before_hash);
925
926 my $linkfilename = &util::filename_cat ($base_dir, $before_hash);
927
928 my $rl = 0;
929 $rl = 1 if (-e $linkfilename);
930
931 # make sure there's a slash on the end if it's a directory
932 if ($before_hash !~ m/\/$/) {
933 $before_hash .= "/" if (-d $linkfilename);
934 }
935 return ($type . $before_hash, $hash_part, $rl);
936
937 } elsif ($link !~ m/^(mailto|news|gopher|nntp|telnet|javascript):/i && $link !~ m/^\//) {
938
939 if ($before_hash =~ s@^/@@ || $before_hash =~ m/\\/) {
940
941 # the first directory will be the domain name if file_is_url
942 # to generate archives, otherwise we'll assume all files are
943 # from the same site and base_dir is the root
944
945 if ($self->{'file_is_url'}) {
946 my @dirs = split /[\/\\]/, $file;
947 my $domname = shift (@dirs);
948 $before_hash = &util::filename_cat($domname, $before_hash);
949 $before_hash =~ s@\\@/@g; # for windows
950 }
951 else
952 {
953 # see if link shares directory with source document
954 # => turn into relative link if this is so!
955
956 if ($ENV{'GSDLOS'} =~ m/^windows/i) {
957 # too difficult doing a pattern match with embedded '\'s...
958 my $win_before_hash=$before_hash;
959 $win_before_hash =~ s@(\\)+@/@g;
960 # $base_dir is already similarly "converted" on windows.
961 if ($win_before_hash =~ s@^$base_dir/@@o) {
962 # if this is true, we removed a prefix
963 $before_hash=$win_before_hash;
964 }
965 }
966 else {
967 # before_hash has lost leading slash by this point,
968 # -> add back in prior to substitution with $base_dir
969 $before_hash = "/$before_hash";
970
971 $before_hash = &util::filename_cat("",$before_hash);
972 $before_hash =~ s@^$base_dir/@@;
973 }
974 }
975 } else {
976 # Turn relative file path into full path
977 my $dirname = &File::Basename::dirname($file);
978 $before_hash = &util::filename_cat($dirname, $before_hash);
979 $before_hash = $self->eval_dir_dots($before_hash);
980 }
981
982 my $linkfilename = &util::filename_cat ($base_dir, $before_hash);
983 # make sure there's a slash on the end if it's a directory
984 if ($before_hash !~ m/\/$/) {
985 $before_hash .= "/" if (-d $linkfilename);
986 }
987 return ("http://" . $before_hash, $hash_part, 1);
988 } else {
989 # mailto, news, nntp, telnet, javascript or gopher link
990 return ($before_hash, "", 0);
991 }
992}
993
994sub extract_first_NNNN_characters {
995 my $self = shift (@_);
996 my ($textref, $doc_obj, $thissection) = @_;
997
998 foreach my $size (split /,/, $self->{'first'}) {
999 my $tmptext = $$textref;
1000 # skip to the body
1001 $tmptext =~ s/.*<body[^>]*>//i;
1002 # remove javascript
1003 $tmptext =~ s@<script.*?</script>@ @sig;
1004 $tmptext =~ s/<[^>]*>/ /g;
1005 $tmptext =~ s/&nbsp;/ /g;
1006 $tmptext =~ s/^\s+//;
1007 $tmptext =~ s/\s+$//;
1008 $tmptext =~ s/\s+/ /gs;
1009 $tmptext = &unicode::substr ($tmptext, 0, $size);
1010 $tmptext =~ s/\s\S*$/&#8230;/; # adds an ellipse (...)
1011 $doc_obj->add_utf8_metadata ($thissection, "First$size", $tmptext);
1012 }
1013}
1014
1015
1016sub extract_metadata {
1017 my $self = shift (@_);
1018 my ($textref, $metadata, $doc_obj, $section) = @_;
1019 my $outhandle = $self->{'outhandle'};
1020 # if we don't want metadata, we may as well not be here ...
1021 return if (!defined $self->{'metadata_fields'});
1022
1023 my $separator = $self->{'metadata_field_separator'};
1024 if ($separator eq "") {
1025 undef $separator;
1026 }
1027
1028 # metadata fields to extract/save. 'key' is the (lowercase) name of the
1029 # html meta, 'value' is the metadata name for greenstone to use
1030 my %find_fields = ();
1031
1032 my %creator_fields = (); # short-cut for lookups
1033
1034
1035 foreach my $field (split /,/, $self->{'metadata_fields'}) {
1036 $field =~ s/^\s+//; # remove leading whitespace
1037 $field =~ s/\s+$//; # remove trailing whitespace
1038
1039 # support tag<tagname>
1040 if ($field =~ m/^(.*?)\s*<(.*?)>$/) {
1041 # "$2" is the user's preferred gs metadata name
1042 $find_fields{lc($1)}=$2; # lc = lowercase
1043 } else { # no <tagname> for mapping
1044 # "$field" is the user's preferred gs metadata name
1045 $find_fields{lc($field)}=$field; # lc = lowercase
1046 }
1047 }
1048
1049 if (defined $self->{'hunt_creator_metadata'} &&
1050 $self->{'hunt_creator_metadata'} == 1 ) {
1051 my @extra_fields =
1052 (
1053 'author',
1054 'author.email',
1055 'creator',
1056 'dc.creator',
1057 'dc.creator.corporatename',
1058 );
1059
1060 # add the creator_metadata fields to search for
1061 foreach my $field (@extra_fields) {
1062 $creator_fields{$field}=0; # add to lookup hash
1063 }
1064 }
1065
1066
1067 # find the header in the html file, which has the meta tags
1068 $$textref =~ m@<head>(.*?)</head>@si;
1069
1070 my $html_header=$1;
1071
1072 # go through every <meta... tag defined in the html and see if it is
1073 # one of the tags we want to match.
1074
1075 # special case for title - we want to remember if its been found
1076 my $found_title = 0;
1077 # this assumes that ">" won't appear. (I don't think it's allowed to...)
1078 $html_header =~ m/^/; # match the start of the string, for \G assertion
1079
1080 while ($html_header =~ m/\G.*?<meta(.*?)>/sig) {
1081 my $metatag=$1;
1082 my ($tag, $value);
1083
1084 # find the tag name
1085 $metatag =~ m/(?:name|http-equiv)\s*=\s*([\"\'])?(.*?)\1/is;
1086 $tag=$2;
1087 # in case they're not using " or ', but they should...
1088 if (! $tag) {
1089 $metatag =~ m/(?:name|http-equiv)\s*=\s*([^\s\>]+)/is;
1090 $tag=$1;
1091 }
1092
1093 if (!defined $tag) {
1094 print $outhandle "HTMLPlugin: can't find NAME in \"$metatag\"\n";
1095 next;
1096 }
1097
1098 # don't need to assign this field if it was passed in from a previous
1099 # (recursive) plugin
1100 if (defined $metadata->{$tag}) {next}
1101
1102 # find the tag content
1103 $metatag =~ m/content\s*=\s*([\"\'])?(.*?)\1/is;
1104 $value=$2;
1105
1106 if (! $value) {
1107 $metatag =~ m/(?:name|http-equiv)\s*=\s*([^\s\>]+)/is;
1108 $value=$1;
1109 }
1110 if (!defined $value) {
1111 print $outhandle "HTMLPlugin: can't find VALUE in \"$metatag\"\n";
1112 next;
1113 }
1114
1115 # clean up and add
1116 $value =~ s/\s+/ /gs;
1117 chomp($value); # remove trailing \n, if any
1118 if (exists $creator_fields{lc($tag)}) {
1119 # map this value onto greenstone's "Creator" metadata
1120 $tag='Creator';
1121 } elsif (!exists $find_fields{lc($tag)}) {
1122 next; # don't want this tag
1123 } else {
1124 # get the user's preferred capitalisation
1125 $tag = $find_fields{lc($tag)};
1126 }
1127 if (lc($tag) eq "title") {
1128 $found_title = 1;
1129 }
1130
1131 if ($self->{'verbosity'} > 2) {
1132 print $outhandle " extracted \"$tag\" metadata \"$value\"\n";
1133 }
1134
1135 if ($tag =~ /\./) {
1136 # there is a . so has a namespace, add ex.
1137 $tag = "ex.$tag";
1138 }
1139 if (defined $separator) {
1140 my @values = split($separator, $value);
1141 foreach my $v (@values) {
1142 $doc_obj->add_utf8_metadata($section, $tag, $v) if $v =~ /\S/;
1143 }
1144 }
1145 else {
1146 $doc_obj->add_utf8_metadata($section, $tag, $value);
1147 }
1148 }
1149
1150 # TITLE: extract the document title
1151 if (exists $find_fields{'title'} && !$found_title) {
1152 # we want a title, and didn't find one in the meta tags
1153 # see if there's a <title> tag
1154 my $title;
1155 my $from = ""; # for debugging output only
1156 if ($html_header =~ m/<title[^>]*>([^<]+)<\/title[^>]*>/is) {
1157 $title = $1;
1158 $from = "<title> tags";
1159 }
1160
1161 if (!defined $title) {
1162 $from = "first 100 chars";
1163 # if no title use first 100 or so characters
1164 $title = $$textref;
1165 $title =~ s/^\xFE\xFF//; # Remove unicode byte order mark
1166 $title =~ s/^.*?<body>//si;
1167 # ignore javascript!
1168 $title =~ s@<script.*?</script>@ @sig;
1169 $title =~ s/<\/([^>]+)><\1>//g; # (eg) </b><b> - no space
1170 $title =~ s/<[^>]*>/ /g; # remove all HTML tags
1171 $title = substr ($title, 0, 100);
1172 $title =~ s/\s\S*$/.../;
1173 }
1174 $title =~ s/<[^>]*>/ /g; # remove html tags
1175 $title =~ s/&nbsp;/ /g;
1176 $title =~ s/(?:&nbsp;|\xc2\xa0)/ /g; # utf-8 for nbsp...
1177 $title =~ s/\s+/ /gs; # collapse multiple spaces
1178 $title =~ s/^\s*//; # remove leading spaces
1179 $title =~ s/\s*$//; # remove trailing spaces
1180
1181 $title =~ s/^$self->{'title_sub'}// if ($self->{'title_sub'});
1182 $title =~ s/^\s+//s; # in case title_sub introduced any...
1183 $doc_obj->add_utf8_metadata ($section, "Title", $title);
1184 print $outhandle " extracted Title metadata \"$title\" from $from\n"
1185 if ($self->{'verbosity'} > 2);
1186 }
1187
1188 # add FileFormat metadata
1189 $doc_obj->add_metadata($section,"FileFormat", "HTML");
1190
1191 # Special, for metadata names such as tagH1 - extracts
1192 # the text between the first <H1> and </H1> tags into "H1" metadata.
1193
1194 foreach my $field (keys %find_fields) {
1195 if ($field !~ m/^tag([a-z0-9]+)$/i) {next}
1196 my $tag = $1;
1197 if ($$textref =~ m@<$tag[^>]*>(.*?)</$tag[^>]*>@g) {
1198 my $content = $1;
1199 $content =~ s/&nbsp;/ /g;
1200 $content =~ s/<[^>]*>/ /g;
1201 $content =~ s/^\s+//;
1202 $content =~ s/\s+$//;
1203 $content =~ s/\s+/ /gs;
1204 if ($content) {
1205 $tag=$find_fields{"tag$tag"}; # get the user's capitalisation
1206 $tag =~ s/^tag//i;
1207 $doc_obj->add_utf8_metadata ($section, $tag, $content);
1208 print $outhandle " extracted \"$tag\" metadata \"$content\"\n"
1209 if ($self->{'verbosity'} > 2);
1210 }
1211 }
1212 }
1213}
1214
1215
1216# evaluate any "../" to next directory up
1217# evaluate any "./" as here
1218sub eval_dir_dots {
1219 my $self = shift (@_);
1220 my ($filename) = @_;
1221 my $dirsep_os = &util::get_os_dirsep();
1222 my @dirsep = split(/$dirsep_os/,$filename);
1223
1224 my @eval_dirs = ();
1225 foreach my $d (@dirsep) {
1226 if ($d eq "..") {
1227 pop(@eval_dirs);
1228
1229 } elsif ($d eq ".") {
1230 # do nothing!
1231
1232 } else {
1233 push(@eval_dirs,$d);
1234 }
1235 }
1236
1237 # Need to fiddle with number of elements in @eval_dirs if the
1238 # first one is the empty string. This is because of a
1239 # modification to util::filename_cat that supresses the addition
1240 # of a leading '/' character (or \ if windows) (intended to help
1241 # filename cat with relative paths) if the first entry in the
1242 # array is the empty string. Making the array start with *two*
1243 # empty strings is a way to defeat this "smart" option.
1244 #
1245 if (scalar(@eval_dirs) > 0) {
1246 if ($eval_dirs[0] eq ""){
1247 unshift(@eval_dirs,"");
1248 }
1249 }
1250
1251 my $evaluated_filename = (scalar @eval_dirs > 0) ? &util::filename_cat(@eval_dirs) : "";
1252 return $evaluated_filename;
1253}
1254
1255sub replace_usemap_links {
1256 my $self = shift (@_);
1257 my ($front, $link, $back) = @_;
1258
1259 # remove quotes from link at start and end if necessary
1260 if ($link=~/^[\"\']/) {
1261 $link=~s/^[\"\']//;
1262 $link=~s/[\"\']$//;
1263 $front.='"';
1264 $back="\"$back";
1265 }
1266
1267 $link =~ s/^\.\///;
1268 return $front . $link . $back;
1269}
1270
1271sub inc_filecount {
1272 my $self = shift (@_);
1273
1274 if ($self->{'file_num'} == 1000) {
1275 $self->{'dir_num'} ++;
1276 $self->{'file_num'} = 0;
1277 } else {
1278 $self->{'file_num'} ++;
1279 }
1280}
1281
1282
1283# Extend read_file so that strings like &eacute; are
1284# converted to UTF8 internally.
1285#
1286# We don't convert &lt; or &gt; or &amp; or &quot; in case
1287# they interfere with the GML files
1288
1289sub read_file {
1290 my $self = shift(@_);
1291 my ($filename, $encoding, $language, $textref) = @_;
1292
1293 $self->SUPER::read_file($filename, $encoding, $language, $textref);
1294
1295 # Convert entities to their UTF8 equivalents
1296 $$textref =~ s/&(lt|gt|amp|quot|nbsp);/&z$1;/go;
1297 $$textref =~ s/&([^;]+);/&ghtml::getcharequiv($1,1,1)/gseo;
1298 $$textref =~ s/&z(lt|gt|amp|quot|nbsp);/&$1;/go;
1299
1300}
1301
1302sub HB_read_html_file {
1303 my $self = shift (@_);
1304 my ($htmlfile, $text) = @_;
1305
1306 # load in the file
1307 if (!open (FILE, $htmlfile)) {
1308 print STDERR "ERROR - could not open $htmlfile\n";
1309 return;
1310 }
1311
1312 my $foundbody = 0;
1313 $self->HB_gettext (\$foundbody, $text, "FILE");
1314 close FILE;
1315
1316 # just in case there was no <body> tag
1317 if (!$foundbody) {
1318 $foundbody = 1;
1319 open (FILE, $htmlfile) || return;
1320 $self->HB_gettext (\$foundbody, $text, "FILE");
1321 close FILE;
1322 }
1323 # text is in utf8
1324}
1325
1326# converts the text to utf8, as ghtml does that for &eacute; etc.
1327sub HB_gettext {
1328 my $self = shift (@_);
1329 my ($foundbody, $text, $handle) = @_;
1330
1331 my $line = "";
1332 while (defined ($line = <$handle>)) {
1333 # look for body tag
1334 if (!$$foundbody) {
1335 if ($line =~ s/^.*<body[^>]*>//i) {
1336 $$foundbody = 1;
1337 } else {
1338 next;
1339 }
1340 }
1341
1342 # check for symbol fonts
1343 if ($line =~ m/<font [^>]*?face\s*=\s*\"?(\w+)\"?/i) {
1344 my $font = $1;
1345 print STDERR "HBPlug::HB_gettext - warning removed font $font\n"
1346 if ($font !~ m/^arial$/i);
1347 }
1348
1349 $$text .= $line;
1350 }
1351
1352 if ($self->{'input_encoding'} eq "iso_8859_1") {
1353 # convert to utf-8
1354 $$text=&unicode::unicode2utf8(&unicode::convert2unicode("iso_8859_1", $text));
1355 }
1356 # convert any alphanumeric character entities to their utf-8
1357 # equivalent for indexing purposes
1358 #&ghtml::convertcharentities ($$text);
1359
1360 $$text =~ s/\s+/ /g; # remove \n's
1361
1362 # At this point $$text is a binary byte string
1363 # => turn it into a Unicode aware string, so full
1364 # Unicode aware pattern matching can be used.
1365 # For instance: 's/\x{0101}//g' or '[[:upper:]]'
1366 #
1367
1368 $$text = decode("utf8",$$text);
1369}
1370
1371sub HB_clean_section {
1372 my $self = shift (@_);
1373 my ($section) = @_;
1374
1375 # remove tags without a starting tag from the section
1376 my ($tag, $tagstart);
1377 while ($section =~ m/<\/([^>]{1,10})>/) {
1378 $tag = $1;
1379 $tagstart = index($section, "<$tag");
1380 last if (($tagstart >= 0) && ($tagstart < index($section, "<\/$tag")));
1381 $section =~ s/<\/$tag>//;
1382 }
1383
1384 # remove extra paragraph tags
1385 while ($section =~ s/<p\b[^>]*>\s*<p\b/<p/ig) {}
1386
1387 # remove extra stuff at the end of the section
1388 while ($section =~ s/(<u>|<i>|<b>|<p\b[^>]*>|&nbsp;|\s)$//i) {}
1389
1390 # add a newline at the beginning of each paragraph
1391 $section =~ s/(.)\s*<p\b/$1\n\n<p/gi;
1392
1393 # add a newline every 80 characters at a word boundary
1394 # Note: this regular expression puts a line feed before
1395 # the last word in each section, even when it is not
1396 # needed.
1397 $section =~ s/(.{1,80})\s/$1\n/g;
1398
1399 # fix up the image links
1400 $section =~ s/<img[^>]*?src=\"?([^\">]+)\"?[^>]*>/
1401 <center><img src=\"$1\" \/><\/center><br\/>/ig;
1402 $section =~ s/&lt;&lt;I&gt;&gt;\s*([^\.]+\.(png|jpg|gif))/
1403 <center><img src=\"$1\" \/><\/center><br\/>/ig;
1404
1405 return $section;
1406}
1407
1408# Will convert the oldHDL format to the new HDL format (using the Section tag)
1409sub convert_to_newHDLformat
1410{
1411 my $self = shift (@_);
1412 my ($file,$cnfile) = @_;
1413 my $input_filename = $file;
1414 my $tmp_filename = $cnfile;
1415
1416 # write HTML tmp file with new HDL format
1417 open (PROD, ">$tmp_filename") || die("Error Writing to File: $tmp_filename $!");
1418
1419 # read in the file and do basic html cleaning (removing header etc)
1420 my $html = "";
1421 $self->HB_read_html_file ($input_filename, \$html);
1422
1423 # process the file one section at a time
1424 my $curtoclevel = 1;
1425 my $firstsection = 1;
1426 my $toclevel = 0;
1427 while (length ($html) > 0) {
1428 if ($html =~ s/^.*?(?:<p\b[^>]*>)?((<b>|<i>|<u>|\s)*)&lt;&lt;TOC(\d+)&gt;&gt;\s*(.*?)<p\b/<p/i) {
1429 $toclevel = $3;
1430 my $title = $4;
1431 my $sectiontext = "";
1432 if ($html =~ s/^(.*?)((?:<p\b[^>]*>)?((<b>|<i>|<u>|\s)*)&lt;&lt;TOC\d+&gt;&gt;)/$2/i) {
1433 $sectiontext = $1;
1434 } else {
1435 $sectiontext = $html;
1436 $html = "";
1437 }
1438
1439 # remove tags and extra spaces from the title
1440 $title =~ s/<\/?[^>]+>//g;
1441 $title =~ s/^\s+|\s+$//g;
1442
1443 # close any sections below the current level and
1444 # create a new section (special case for the firstsection)
1445 print PROD "<!--\n";
1446 while (($curtoclevel > $toclevel) ||
1447 (!$firstsection && $curtoclevel == $toclevel)) {
1448 $curtoclevel--;
1449 print PROD "</Section>\n";
1450 }
1451 if ($curtoclevel+1 < $toclevel) {
1452 print STDERR "WARNING - jump in toc levels in $input_filename " .
1453 "from $curtoclevel to $toclevel\n";
1454 }
1455 while ($curtoclevel < $toclevel) {
1456 $curtoclevel++;
1457 }
1458
1459 if ($curtoclevel == 1) {
1460 # add the header tag
1461 print PROD "-->\n";
1462 print PROD "<HTML>\n<HEAD>\n<TITLE>$title</TITLE>\n</HEAD>\n<BODY>\n";
1463 print PROD "<!--\n";
1464 }
1465
1466 print PROD "<Section>\n\t<Description>\n\t\t<Metadata name=\"Title\">$title</Metadata>\n\t</Description>\n";
1467
1468 print PROD "-->\n";
1469
1470 # clean up the section html
1471 $sectiontext = $self->HB_clean_section($sectiontext);
1472
1473 print PROD "$sectiontext\n";
1474
1475 } else {
1476 print STDERR "WARNING - leftover text\n" , $self->shorten($html),
1477 "\nin $input_filename\n";
1478 last;
1479 }
1480 $firstsection = 0;
1481 }
1482
1483 print PROD "<!--\n";
1484 while ($curtoclevel > 0) {
1485 $curtoclevel--;
1486 print PROD "</Section>\n";
1487 }
1488 print PROD "-->\n";
1489
1490 close (PROD) || die("Error Closing File: $tmp_filename $!");
1491
1492 return $tmp_filename;
1493}
1494
1495sub shorten {
1496 my $self = shift (@_);
1497 my ($text) = @_;
1498
1499 return "\"$text\"" if (length($text) < 100);
1500
1501 return "\"" . substr ($text, 0, 50) . "\" ... \"" .
1502 substr ($text, length($text)-50) . "\"";
1503}
1504
1505sub convert_tidy_or_oldHDL_file
1506{
1507 my $self = shift (@_);
1508 my ($file) = @_;
1509 my $input_filename = $file;
1510
1511 if (-d $input_filename)
1512 {
1513 return $input_filename;
1514 }
1515
1516 # get the input filename
1517 my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
1518 my $base_dirname = $dirname;
1519 $suffix = lc($suffix);
1520
1521 # derive tmp filename from input filename
1522 # Remove any white space from filename -- no risk of name collision, and
1523 # makes later conversion by utils simpler. Leave spaces in path...
1524 # tidy up the filename with space, dot, hyphen between
1525 $tailname =~ s/\s+//g;
1526 $tailname =~ s/\.+//g;
1527 $tailname =~ s/\-+//g;
1528 # convert to utf-8 otherwise we have problems with the doc.xml file
1529 # later on
1530 &unicode::ensure_utf8(\$tailname);
1531
1532 # softlink to collection tmp dir
1533 my $tmp_dirname = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "tidytmp");
1534 &util::mk_dir($tmp_dirname) if (!-e $tmp_dirname);
1535
1536 my $test_dirname = "";
1537 my $f_separator = &util::get_os_dirsep();
1538
1539 if ($dirname =~ m/import$f_separator/)
1540 {
1541 $test_dirname = $'; #'
1542
1543 #print STDERR "init $'\n";
1544
1545 while ($test_dirname =~ m/[$f_separator]/)
1546 {
1547 my $folderdirname = $`;
1548 $tmp_dirname = &util::filename_cat($tmp_dirname,$folderdirname);
1549 &util::mk_dir($tmp_dirname) if (!-e $tmp_dirname);
1550 $test_dirname = $'; #'
1551 }
1552 }
1553
1554 my $tmp_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix");
1555
1556 # tidy or convert the input file if it is a HTML-like file or it is accepted by the process_exp
1557 if (($suffix eq ".htm") || ($suffix eq ".html") || ($suffix eq ".shtml"))
1558 {
1559 #convert the input file to a new style HDL
1560 my $hdl_output_filename = $input_filename;
1561 if ($self->{'old_style_HDL'})
1562 {
1563 $hdl_output_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix");
1564 $hdl_output_filename = $self->convert_to_newHDLformat($input_filename,$hdl_output_filename);
1565 }
1566
1567 #just for checking copy all other file from the base dir to tmp dir if it is not exists
1568 opendir(DIR,$base_dirname) or die "Can't open base directory : $base_dirname!";
1569 my @files = grep {!/^\.+$/} readdir(DIR);
1570 close(DIR);
1571
1572 foreach my $file (@files)
1573 {
1574 my $src_file = &util::filename_cat($base_dirname,$file);
1575 my $dest_file = &util::filename_cat($tmp_dirname,$file);
1576 if ((!-e $dest_file) && (!-d $src_file))
1577 {
1578 # just copy the original file back to the tmp directory
1579 copy($src_file,$dest_file) or die "Can't copy file $src_file to $dest_file $!";
1580 }
1581 }
1582
1583 # tidy the input file
1584 my $tidy_output_filename = $hdl_output_filename;
1585 if ($self->{'use_realistic_book'})
1586 {
1587 $tidy_output_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix");
1588 $tidy_output_filename = $self->tmp_tidy_file($hdl_output_filename,$tidy_output_filename);
1589 }
1590 $tmp_filename = $tidy_output_filename;
1591 }
1592 else
1593 {
1594 if (!-e $tmp_filename)
1595 {
1596 # just copy the original file back to the tmp directory
1597 copy($input_filename,$tmp_filename) or die "Can't copy file $input_filename to $tmp_filename $!";
1598 }
1599 }
1600
1601 return $tmp_filename;
1602}
1603
1604
1605# Will make the html input file as a proper XML file with removed font tag and
1606# image size added to the img tag.
1607# The tidying process takes place in a collection specific 'tmp' directory so
1608# that we don't accidentally damage the input.
1609sub tmp_tidy_file
1610{
1611 my $self = shift (@_);
1612 my ($file,$cnfile) = @_;
1613 my $input_filename = $file;
1614 my $tmp_filename = $cnfile;
1615
1616 # get the input filename
1617 my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
1618
1619 require HTML::TokeParser::Simple;
1620
1621 # create HTML parser to decode the input file
1622 my $parser = HTML::TokeParser::Simple->new($input_filename);
1623
1624 # write HTML tmp file without the font tag and image size are added to the img tag
1625 open (PROD, ">$tmp_filename") || die("Error Writing to File: $tmp_filename $!");
1626 while (my $token = $parser->get_token())
1627 {
1628 # is it an img tag
1629 if ($token->is_start_tag('img'))
1630 {
1631 # get the attributes
1632 my $attr = $token->return_attr;
1633
1634 # get the full path to the image
1635 my $img_file = &util::filename_cat($dirname,$attr->{src});
1636
1637 # set the width and height attribute
1638 ($attr->{width}, $attr->{height}) = imgsize($img_file);
1639
1640 # recreate the tag
1641 print PROD "<img";
1642 print PROD map { qq { $_="$attr->{$_}"} } keys %$attr;
1643 print PROD ">";
1644 }
1645 # is it a font tag
1646 else
1647 {
1648 if (($token->is_start_tag('font')) || ($token->is_end_tag('font')))
1649 {
1650 # remove font tag
1651 print PROD "";
1652 }
1653 else
1654 {
1655 # print without changes
1656 print PROD $token->as_is;
1657 }
1658 }
1659 }
1660 close (PROD) || die("Error Closing File: $tmp_filename $!");
1661
1662 # run html-tidy on the tmp file to make it a proper XML file
1663
1664 my $outhandle = $self->{'outhandle'};
1665 print $outhandle "Converting HTML to be XML compliant:\n";
1666
1667 my $tidy_cmd = "tidy";
1668 $tidy_cmd .= " -q" if ($self->{'verbosity'} <= 2);
1669 $tidy_cmd .= " -raw -wrap 0 -asxml \"$tmp_filename\"";
1670 if ($self->{'verbosity'} <= 2) {
1671 if ($ENV{'GSDLOS'} =~ m/^windows/i) {
1672 $tidy_cmd .= " 2>nul";
1673 }
1674 else {
1675 $tidy_cmd .= " 2>/dev/null";
1676 }
1677 print $outhandle " => $tidy_cmd\n";
1678 }
1679
1680 my $tidyfile = `$tidy_cmd`;
1681
1682 # write result back to the tmp file
1683 open (PROD, ">$tmp_filename") || die("Error Writing to File: $tmp_filename $!");
1684 print PROD $tidyfile;
1685 close (PROD) || die("Error Closing File: $tmp_filename $!");
1686
1687 # return the output filename
1688 return $tmp_filename;
1689}
1690
1691sub associate_cover_image
1692{
1693 my $self = shift(@_);
1694 my ($doc_obj, $filename) = @_;
1695 if (($self->{'use_realistic_book'}) || ($self->{'old_style_HDL'}))
1696 {
1697 # we will have cover image in tidytmp, but want it from import
1698 $filename =~ s/([\\\/])tidytmp([\\\/])/$1import$2/;
1699 }
1700 $self->SUPER::associate_cover_image($doc_obj, $filename);
1701}
1702
1703
17041;
Note: See TracBrowser for help on using the repository browser.