source: main/trunk/greenstone2/perllib/plugins/HTMLPlugin.pm@ 22355

Last change on this file since 22355 was 22355, checked in by kjdon, 14 years ago

previously, when use_realistic_book was set, all files listed in archivesinf dbs were in tidytmp - change paths to be import paths. This assumes that same directory structure is kept in tidytmp, which appears to be the case. No tidytmp paths should now appear in archiveinf dbs

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 53.9 KB
Line 
1###########################################################################
2#
3# HTMLPlugin.pm -- basic html plugin
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 1999 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27#
28# Note that this plugin handles frames only in a very simple way
29# i.e. each frame is treated as a separate document. This means
30# search results will contain links to individual frames rather
31# than linking to the top level frameset.
32# There may also be some problems caused by the _parent target
33# (it's removed by this plugin)
34#
35
36package HTMLPlugin;
37
38use ReadTextFile;
39use HBPlugin;
40use ghtml;
41use unicode;
42use util;
43use XMLParser;
44
45use Image::Size;
46use File::Copy;
47
48sub BEGIN {
49 @HTMLPlugin::ISA = ('ReadTextFile', 'HBPlugin');
50}
51
52use strict; # every perl program should have this!
53no strict 'refs'; # make an exception so we can use variables as filehandles
54
55my $arguments =
56 [ { 'name' => "process_exp",
57 'desc' => "{BasePlugin.process_exp}",
58 'type' => "regexp",
59 'deft' => &get_default_process_exp() },
60 { 'name' => "block_exp",
61 'desc' => "{BasePlugin.block_exp}",
62 'type' => 'regexp',
63 'deft' => &get_default_block_exp() },
64 { 'name' => "nolinks",
65 'desc' => "{HTMLPlugin.nolinks}",
66 'type' => "flag" },
67 { 'name' => "keep_head",
68 'desc' => "{HTMLPlugin.keep_head}",
69 'type' => "flag" },
70 { 'name' => "no_metadata",
71 'desc' => "{HTMLPlugin.no_metadata}",
72 'type' => "flag" },
73 { 'name' => "metadata_fields",
74 'desc' => "{HTMLPlugin.metadata_fields}",
75 'type' => "string",
76 'deft' => "Title" },
77 { 'name' => "metadata_field_separator",
78 'desc' => "{HTMLPlugin.metadata_field_separator}",
79 'type' => "string",
80 'deft' => "" },
81 { 'name' => "hunt_creator_metadata",
82 'desc' => "{HTMLPlugin.hunt_creator_metadata}",
83 'type' => "flag" },
84 { 'name' => "file_is_url",
85 'desc' => "{HTMLPlugin.file_is_url}",
86 'type' => "flag" },
87 { 'name' => "assoc_files",
88 'desc' => "{HTMLPlugin.assoc_files}",
89 'type' => "regexp",
90 'deft' => &get_default_block_exp() },
91 { 'name' => "rename_assoc_files",
92 'desc' => "{HTMLPlugin.rename_assoc_files}",
93 'type' => "flag" },
94 { 'name' => "title_sub",
95 'desc' => "{HTMLPlugin.title_sub}",
96 'type' => "string",
97 'deft' => "" },
98 { 'name' => "description_tags",
99 'desc' => "{HTMLPlugin.description_tags}",
100 'type' => "flag" },
101 # retain this for backward compatibility (w3mir option was replaced by
102 # file_is_url)
103 { 'name' => "w3mir",
104# 'desc' => "{HTMLPlugin.w3mir}",
105 'type' => "flag",
106 'hiddengli' => "yes"},
107 { 'name' => "no_strip_metadata_html",
108 'desc' => "{HTMLPlugin.no_strip_metadata_html}",
109 'type' => "string",
110 'deft' => "",
111 'reqd' => "no"},
112 { 'name' => "sectionalise_using_h_tags",
113 'desc' => "{HTMLPlugin.sectionalise_using_h_tags}",
114 'type' => "flag" },
115 { 'name' => "use_realistic_book",
116 'desc' => "{HTMLPlugin.tidy_html}",
117 'type' => "flag"},
118 { 'name' => "old_style_HDL",
119 'desc' => "{HTMLPlugin.old_style_HDL}",
120 'type' => "flag"},
121 {'name' => "processing_tmp_files",
122 'desc' => "{BasePlugin.processing_tmp_files}",
123 'type' => "flag",
124 'hiddengli' => "yes"}
125 ];
126
127my $options = { 'name' => "HTMLPlugin",
128 'desc' => "{HTMLPlugin.desc}",
129 'abstract' => "no",
130 'inherits' => "yes",
131 'args' => $arguments };
132
133
134sub new {
135 my ($class) = shift (@_);
136 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
137 push(@$pluginlist, $class);
138
139 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
140 push(@{$hashArgOptLists->{"OptList"}},$options);
141
142
143 my $self = new ReadTextFile($pluginlist,$inputargs,$hashArgOptLists);
144
145 if ($self->{'w3mir'}) {
146 $self->{'file_is_url'} = 1;
147 }
148 $self->{'aux_files'} = {};
149 $self->{'dir_num'} = 0;
150 $self->{'file_num'} = 0;
151
152 return bless $self, $class;
153}
154
155# may want to use (?i)\.(gif|jpe?g|jpe|png|css|js(?:@.*)?)$
156# if have eg <script language="javascript" src="img/lib.js@123">
157# blocking is now done by reading through the file and recording all the
158# images and other files
159sub get_default_block_exp {
160 my $self = shift (@_);
161
162 #return q^(?i)\.(gif|jpe?g|jpe|jpg|png|css)$^;
163 return "";
164}
165
166sub get_default_process_exp {
167 my $self = shift (@_);
168
169 # the last option is an attempt to encode the concept of an html query ...
170 return q^(?i)(\.html?|\.shtml|\.shm|\.asp|\.php\d?|\.cgi|.+\?.+=.*)$^;
171}
172
173sub store_block_files
174{
175 my $self =shift (@_);
176 my ($filename_full_path, $block_hash) = @_;
177
178 my $html_fname = $filename_full_path;
179 my @file_blocks;
180
181 my ($language, $encoding) = $self->textcat_get_language_encoding ($filename_full_path);
182
183 # read in file ($text will be in utf8)
184 my $raw_text = "";
185 $self->read_file_no_decoding ($filename_full_path, \$raw_text);
186
187 my $textref = \$raw_text;
188 my $opencom = '(?:<!--|&lt;!(?:&mdash;|&#151;|--))';
189 my $closecom = '(?:-->|(?:&mdash;|&#151;|--)&gt;)';
190 $$textref =~ s/$opencom(.*?)$closecom//gs;
191
192 my $attval = "\\\"[^\\\"]+\\\"|[^\\s>]+";
193 my @img_matches = ($$textref =~ m/<img[^>]*?src\s*=\s*($attval)[^>]*>/igs);
194 my @usemap_matches = ($$textref =~ m/<img[^>]*?usemap\s*=\s*($attval)[^>]*>/igs);
195 my @link_matches = ($$textref =~ m/<link[^>]*?href\s*=\s*($attval)[^>]*>/igs);
196 my @embed_matches = ($$textref =~ m/<embed[^>]*?src\s*=\s*($attval)[^>]*>/igs);
197 my @tabbg_matches = ($$textref =~ m/<(?:body|table|tr|td)[^>]*?background\s*=\s*($attval)[^>]*>/igs);
198 my @script_matches = ($$textref =~ m/<script[^>]*?src\s*=\s*($attval)[^>]*>/igs);
199
200 if(!defined $self->{'utf8_to_original_filename'}) {
201 # maps from utf8 converted link name -> original filename referrred to by (possibly URL-encoded) src url
202 $self->{'utf8_to_original_filename'} = {};
203 }
204
205 foreach my $link (@img_matches, @usemap_matches, @link_matches, @embed_matches, @tabbg_matches, @script_matches) {
206
207 # remove quotes from link at start and end if necessary
208 if ($link=~/^\"/) {
209 $link=~s/^\"//;
210 $link=~s/\"$//;
211 }
212
213 $link =~ s/\#.*$//s; # remove any anchor names, e.g. foo.html#name becomes foo.html
214 # some links may just be anchor names
215 next unless ($link =~ /\S+/);
216
217 if ($link !~ m@^/@ && $link !~ m/^([A-Z]:?)\\/) {
218 # Turn relative file path into full path
219 my $dirname = &File::Basename::dirname($filename_full_path);
220 $link = &util::filename_cat($dirname, $link);
221 }
222 $link = $self->eval_dir_dots($link);
223
224 # this is the actual filename on the filesystem (that the link refers to)
225 my $url_original_filename = $self->opt_url_decode($link);
226
227 # Convert the url_original_filename into its utf8 version. Store the utf8 link along with the url_original_filename
228 my $utf8_link = "";
229 $self->decode_text($link,$encoding,$language,\$utf8_link);
230
231 $self->{'utf8_to_original_filename'}->{$utf8_link} = $url_original_filename;
232# print STDERR "**** utf8_encoded_link to original src filename:\n\t$utf8_link\n\t".$self->{'utf8_to_original_filename'}->{$utf8_link}."\n";
233
234 if ($url_original_filename ne $utf8_link) {
235 my $outhandle = $self->{'outhandle'};
236
237 print $outhandle "URL Encoding $url_original_filename\n";
238 print $outhandle " ->$utf8_link\n";
239 }
240
241 $block_hash->{'file_blocks'}->{$url_original_filename} = 1;
242 }
243}
244
245# Given a filename in any encoding, will URL decode it to get back the original filename
246# in the original encoding. Because this method is intended to work out the *original*
247# filename*, it does not URL decode any filename if a file by the name of the *URL-encoded*
248# string already exists in the local folder.
249# Return the original filename corresponding to the parameter URL-encoded filename, and
250# a decoded flag that is set to true iff URL-decoding had to be applied.
251sub opt_url_decode {
252 my $self = shift (@_);
253 my ($link) = @_;
254
255 # Replace %XX's in URL with decoded value if required.
256 # Note that the filename may include the %XX in some situations
257 if ($link =~ m/\%[A-F0-9]{2}/i) {
258 if (!-e $link) {
259 $link = &unicode::url_decode($link);
260 }
261 }
262
263 return $link;
264}
265
266sub read_into_doc_obj
267{
268 my $self = shift (@_);
269 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
270
271 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
272 # get the input file
273 my $input_filename = $file;
274 my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
275 $suffix = lc($suffix);
276 my $tidy_filename;
277 if (($self->{'use_realistic_book'}) || ($self->{'old_style_HDL'}))
278 {
279 # because the document has to be sectionalized set the description tags
280 $self->{'description_tags'} = 1;
281
282 # set the file to be tidied
283 $input_filename = &util::filename_cat($base_dir,$file) if $base_dir =~ m/\w/;
284
285 # get the tidied file
286 #my $tidy_filename = $self->tmp_tidy_file($input_filename);
287 $tidy_filename = $self->convert_tidy_or_oldHDL_file($input_filename);
288
289 # derive tmp filename from input filename
290 my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($tidy_filename, "\\.[^\\.]+\$");
291
292 # set the new input file and base_dir to be from the tidied file
293 $file = "$tailname$suffix";
294 $base_dir = $dirname;
295 }
296
297 # call the parent read_into_doc_obj
298 my ($process_status,$doc_obj) = $self->SUPER::read_into_doc_obj($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli);
299 if (($self->{'use_realistic_book'}) || ($self->{'old_style_HDL'}))
300 {
301 # now we need to reset the filenames in the doc obj so that the converted filenames are not used
302 my $collect_file = &util::filename_within_collection($filename_full_path);
303 $doc_obj->set_source_filename ($collect_file, $self->{'file_rename_method'});
304 ## set_source_filename does not set the doc_obj source_path which is used in archives dbs for incremental
305 # build. so set it manually.
306 $doc_obj->{'source_path'} = $filename_full_path;
307 my $collect_conv_file = &util::filename_within_collection($tidy_filename);
308 $doc_obj->set_converted_filename($collect_conv_file);
309 $self->set_Source_metadata($doc_obj, $filename_no_path);
310 }
311 return ($process_status,$doc_obj);
312}
313
314# do plugin specific processing of doc_obj
315sub process {
316 my $self = shift (@_);
317 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
318 my $outhandle = $self->{'outhandle'};
319
320 if ($ENV{'GSDLOS'} =~ m/^windows/i) {
321 # this makes life so much easier... perl can cope with unix-style '/'s.
322 $base_dir =~ s@(\\)+@/@g;
323 $file =~ s@(\\)+@/@g;
324 }
325
326 # reset per-doc stuff...
327 $self->{'aux_files'} = {};
328 $self->{'dir_num'} = 0;
329 $self->{'file_num'} = 0;
330
331 # process an HTML file where sections are divided by headings tags (H1, H2 ...)
332 # you can also include metadata in the format (X can be any number)
333 # <hX>Title<!--gsdl-metadata
334 # <Metadata name="name1">value1</Metadata>
335 # ...
336 # <Metadata name="nameN">valueN</Metadata>
337 #--></hX>
338 if ($self->{'sectionalise_using_h_tags'}) {
339 # description_tags should allways be activated because we convert headings to description tags
340 $self->{'description_tags'} = 1;
341
342 my $arrSections = [];
343 $$textref =~ s/<h([0-9]+)[^>]*>(.*?)<\/h[0-9]+>/$self->process_heading($1, $2, $arrSections, $file)/isge;
344
345 if (scalar(@$arrSections)) {
346 my $strMetadata = $self->update_section_data($arrSections, -1);
347 if (length($strMetadata)) {
348 $strMetadata = '<!--' . $strMetadata . "\n-->\n</body>";
349 $$textref =~ s/<\/body>/$strMetadata/ig;
350 }
351 }
352 }
353
354 my $cursection = $doc_obj->get_top_section();
355
356 $self->extract_metadata ($textref, $metadata, $doc_obj, $cursection)
357 unless $self->{'no_metadata'} || $self->{'description_tags'};
358
359 # Store URL for page as metadata - this can be used for an
360 # altavista style search interface. The URL won't be valid
361 # unless the file structure contains the domain name (i.e.
362 # like when w3mir is used to download a website).
363
364 # URL metadata (even invalid ones) are used to support internal
365 # links, so even if 'file_is_url' is off, still need to store info
366
367 my ($tailname,$dirname,$suffix) = &File::Basename::fileparse($file, "\\.[^\\.]+\$");
368 my $utf8_file = $self->filename_to_utf8_metadata($file);
369 $utf8_file =~ s/&\#095;/_/g;
370 my $web_url = "http://";
371 if(defined $dirname) { # local directory
372 $dirname = $self->eval_dir_dots($dirname);
373 $dirname .= &util::get_dirsep() if $dirname ne ""; # if there's a directory, it should end on "/"
374 $web_url = $web_url.$dirname.$utf8_file;
375 } else {
376 $web_url = $web_url.$utf8_file;
377 }
378 $web_url =~ s/\\/\//g;
379 $doc_obj->add_utf8_metadata($cursection, "URL", $web_url);
380
381 if ($self->{'file_is_url'}) {
382 $doc_obj->add_metadata($cursection, "weblink", "<a href=\"$web_url\">");
383 $doc_obj->add_metadata($cursection, "webicon", "_iconworld_");
384 $doc_obj->add_metadata($cursection, "/weblink", "</a>");
385 }
386
387 if ($self->{'description_tags'}) {
388 # remove the html header - note that doing this here means any
389 # sections defined within the header will be lost (so all <Section>
390 # tags must appear within the body of the HTML)
391 my ($head_keep) = ($$textref =~ m/^(.*?)<body[^>]*>/is);
392
393 $$textref =~ s/^.*?<body[^>]*>//is;
394 $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
395
396 my $opencom = '(?:<!--|&lt;!(?:&mdash;|&#151;|--))';
397 my $closecom = '(?:-->|(?:&mdash;|&#151;|--)&gt;)';
398
399 my $lt = '(?:<|&lt;)';
400 my $gt = '(?:>|&gt;)';
401 my $quot = '(?:"|&quot;|&rdquo;|&ldquo;)';
402
403 my $dont_strip = '';
404 if ($self->{'no_strip_metadata_html'}) {
405 ($dont_strip = $self->{'no_strip_metadata_html'}) =~ s{,}{|}g;
406 }
407
408 my $found_something = 0; my $top = 1;
409 while ($$textref =~ s/^(.*?)$opencom(.*?)$closecom//s) {
410 my $text = $1;
411 my $comment = $2;
412 if (defined $text) {
413 # text before a comment - note that getting to here
414 # doesn't necessarily mean there are Section tags in
415 # the document
416 $self->process_section(\$text, $base_dir, $file, $doc_obj, $cursection);
417 }
418 while ($comment =~ s/$lt(.*?)$gt//s) {
419 my $tag = $1;
420 if ($tag eq "Section") {
421 $found_something = 1;
422 $cursection = $doc_obj->insert_section($doc_obj->get_end_child($cursection)) unless $top;
423 $top = 0;
424 } elsif ($tag eq "/Section") {
425 $found_something = 1;
426 $cursection = $doc_obj->get_parent_section ($cursection);
427 } elsif ($tag =~ m/^Metadata name=$quot(.*?)$quot/s) {
428 my $metaname = $1;
429 my $accumulate = $tag =~ m/mode=${quot}accumulate${quot}/ ? 1 : 0;
430 $comment =~ s/^(.*?)$lt\/Metadata$gt//s;
431 my $metavalue = $1;
432 $metavalue =~ s/^\s+//;
433 $metavalue =~ s/\s+$//;
434 # assume that no metadata value intentionally includes
435 # carriage returns or HTML tags (if they're there they
436 # were probably introduced when converting to HTML from
437 # some other format).
438 # actually some people want to have html tags in their
439 # metadata.
440 $metavalue =~ s/[\cJ\cM]/ /sg;
441 $metavalue =~ s/<[^>]+>//sg
442 unless $dont_strip && ($dont_strip eq 'all' || $metaname =~ m/^($dont_strip)$/);
443 $metavalue =~ s/\s+/ /sg;
444 if ($metaname =~ /\./) { # has a namespace
445 $metaname = "ex.$metaname";
446 }
447 if ($accumulate) {
448 $doc_obj->add_utf8_metadata($cursection, $metaname, $metavalue);
449 } else {
450 $doc_obj->set_utf8_metadata_element($cursection, $metaname, $metavalue);
451 }
452 } elsif ($tag eq "Description" || $tag eq "/Description") {
453 # do nothing with containing Description tags
454 } else {
455 # simple HTML tag (probably created by the conversion
456 # to HTML from some other format) - we'll ignore it and
457 # hope for the best ;-)
458 }
459 }
460 }
461 if ($cursection ne "") {
462 print $outhandle "HTMLPlugin: WARNING: $file contains unmatched <Section></Section> tags\n";
463 }
464
465 $$textref =~ s/^.*?<body[^>]*>//is;
466 $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
467 if ($$textref =~ m/\S/) {
468 if (!$found_something) {
469 if ($self->{'verbosity'} > 2) {
470 print $outhandle "HTMLPlugin: WARNING: $file appears to contain no Section tags so\n";
471 print $outhandle " will be processed as a single section document\n";
472 }
473
474 # go ahead and process single-section document
475 $self->process_section($textref, $base_dir, $file, $doc_obj, $cursection);
476
477 # if document contains no Section tags we'll go ahead
478 # and extract metadata (this won't have been done
479 # above as the -description_tags option prevents it)
480 my $complete_text = $head_keep.$doc_obj->get_text($cursection);
481 $self->extract_metadata (\$complete_text, $metadata, $doc_obj, $cursection)
482 unless $self->{'no_metadata'};
483
484 } else {
485 print $outhandle "HTMLPlugin: WARNING: $file contains the following text outside\n";
486 print $outhandle " of the final closing </Section> tag. This text will\n";
487 print $outhandle " be ignored.";
488
489 my ($text);
490 if (length($$textref) > 30) {
491 $text = substr($$textref, 0, 30) . "...";
492 } else {
493 $text = $$textref;
494 }
495 $text =~ s/\n/ /isg;
496 print $outhandle " ($text)\n";
497 }
498 } elsif (!$found_something) {
499
500 if ($self->{'verbosity'} > 2) {
501 # may get to here if document contained no valid Section
502 # tags but did contain some comments. The text will have
503 # been processed already but we should print the warning
504 # as above and extract metadata
505 print $outhandle "HTMLPlugin: WARNING: $file appears to contain no Section tags and\n";
506 print $outhandle " is blank or empty. Metadata will be assigned if present.\n";
507 }
508
509 my $complete_text = $head_keep.$doc_obj->get_text($cursection);
510 $self->extract_metadata (\$complete_text, $metadata, $doc_obj, $cursection)
511 unless $self->{'no_metadata'};
512 }
513
514 } else {
515
516 # remove header and footer
517 if (!$self->{'keep_head'} || $self->{'description_tags'}) {
518 $$textref =~ s/^.*?<body[^>]*>//is;
519 $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
520 }
521
522 # single section document
523 $self->process_section($textref, $base_dir, $file, $doc_obj, $cursection);
524 }
525 return 1;
526}
527
528
529sub process_heading
530{
531 my ($self, $nHeadNo, $strHeadingText, $arrSections, $file) = @_;
532 $strHeadingText = '' if (!defined($strHeadingText));
533
534 my $strMetadata = $self->update_section_data($arrSections, int($nHeadNo));
535
536 my $strSecMetadata = '';
537 while ($strHeadingText =~ s/<!--gsdl-metadata(.*?)-->//is)
538 {
539 $strSecMetadata .= $1;
540 }
541
542 $strHeadingText =~ s/^\s+//g;
543 $strHeadingText =~ s/\s+$//g;
544 $strSecMetadata =~ s/^\s+//g;
545 $strSecMetadata =~ s/\s+$//g;
546
547 $strMetadata .= "\n<Section>\n\t<Description>\n\t\t<Metadata name=\"Title\">" . $strHeadingText . "</Metadata>\n";
548
549 if (length($strSecMetadata)) {
550 $strMetadata .= "\t\t" . $strSecMetadata . "\n";
551 }
552
553 $strMetadata .= "\t</Description>\n";
554
555 return "<!--" . $strMetadata . "-->";
556}
557
558
559sub update_section_data
560{
561 my ($self, $arrSections, $nCurTocNo) = @_;
562 my ($strBuffer, $nLast, $nSections) = ('', 0, scalar(@$arrSections));
563
564 if ($nSections == 0) {
565 push @$arrSections, $nCurTocNo;
566 return $strBuffer;
567 }
568 $nLast = $arrSections->[$nSections - 1];
569 if ($nCurTocNo > $nLast) {
570 push @$arrSections, $nCurTocNo;
571 return $strBuffer;
572 }
573 for(my $i = $nSections - 1; $i >= 0; $i--) {
574 if ($nCurTocNo <= $arrSections->[$i]) {
575 $strBuffer .= "\n</Section>";
576 pop @$arrSections;
577 }
578 }
579 push @$arrSections, $nCurTocNo;
580 return $strBuffer;
581}
582
583
584# note that process_section may be called multiple times for a single
585# section (relying on the fact that add_utf8_text appends the text to any
586# that may exist already).
587sub process_section {
588 my $self = shift (@_);
589 my ($textref, $base_dir, $file, $doc_obj, $cursection) = @_;
590 # trap links
591 if (!$self->{'nolinks'}) {
592 # usemap="./#index" not handled correctly => change to "#index"
593## $$textref =~ s/(<img[^>]*?usemap\s*=\s*[\"\']?)([^\"\'>\s]+)([\"\']?[^>]*>)/
594
595 $$textref =~ s/(<img[^>]*?usemap\s*=\s*)((?:[\"][^\"]+[\"])|(?:[\'][^\']+[\'])|(?:[^\s\/>]+))([^>]*>)/
596 $self->replace_usemap_links($1, $2, $3)/isge;
597
598## $$textref =~ s/(<(?:a|area|frame|link|script)\s+[^>]*?\s*(?:href|src)\s*=\s*[\"\']?)([^\"\'>\s]+)([\"\']?[^>]*>)/
599
600 $$textref =~ s/(<(?:a|area|frame|link|script)\s+[^>]*?\s*(?:href|src)\s*=\s*)((?:[\"][^\"]+[\"])|(?:[\'][^\']+[\'])|(?:[^\s\/>]+))([^>]*>)/
601 $self->replace_href_links ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
602 }
603
604 # trap images
605
606 # Previously, by default, HTMLPlugin would embed <img> tags inside anchor tags
607 # i.e. <a href="image><img src="image"></a> in order to overcome a problem that
608 # turned regular text succeeding images into links. That is, by embedding <imgs>
609 # inside <a href=""></a>, the text following images were no longer misbehaving.
610 # However, there would be many occasions whereby images were not meant to link
611 # to their source images but where the images would link to another web page.
612 # To allow this, the no_image_links option was introduced: it would prevent
613 # the behaviour of embedding images into links that referenced the source images.
614
615 # Somewhere along the line, the problem of normal text turning into links when
616 # such text followed images which were not embedded in <a href=""></a> ceased
617 # to occur. This is why the following lines have been commented out (as well as
618 # two lines in replace_images). They appear to no longer apply.
619
620 # If at any time, there is a need for having images embedded in <a> anchor tags,
621 # then it might be better to turn that into an HTMLPlugin option rather than make
622 # it the default behaviour. Also, eventually, no_image_links needs to become
623 # a deprecated option for HTMLPlugin as it has now become the default behaviour.
624
625 #if(!$self->{'no_image_links'}){
626 $$textref =~ s/(<(?:img|embed|table|tr|td)[^>]*?(?:src|background)\s*=\s*)((?:[\"][^\"]+[\"])|(?:[\'][^\']+[\'])|(?:[^\s\/>]+))([^>]*>)/
627 $self->replace_images ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
628 #}
629
630 # add text to document object
631 # turn \ into \\ so that the rest of greenstone doesn't think there
632 # is an escape code following. (Macro parsing loses them...)
633 $$textref =~ s/\\/\\\\/go;
634
635 $doc_obj->add_utf8_text($cursection, $$textref);
636}
637
638sub replace_images {
639 my $self = shift (@_);
640 my ($front, $link, $back, $base_dir,
641 $file, $doc_obj, $section) = @_;
642
643 # remove quotes from link at start and end if necessary
644 if ($link=~/^[\"\']/) {
645 $link=~s/^[\"\']//;
646 $link=~s/[\"\']$//;
647 $front.='"';
648 $back="\"$back";
649 }
650
651 $link =~ s/\n/ /g;
652
653 # Hack to overcome Windows wv 0.7.1 bug that causes embedded images to be broken
654 # If the Word file path has spaces in it, wv messes up and you end up with
655 # absolute paths for the images, and without the "file://" prefix
656 # So check for this special case and massage the data to be correct
657 if ($ENV{'GSDLOS'} =~ m/^windows/i && $self->{'plugin_type'} eq "WordPlug" && $link =~ m/^[A-Za-z]\:\\/) {
658 $link =~ s/^.*\\([^\\]+)$/$1/;
659 }
660
661 my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
662
663 my $img_file = $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section);
664
665# print STDERR "**** link = $link\n**** href = $href\n**** img_file = $img_file, rl = $rl\n";
666
667 my $anchor_name = $img_file;
668 #$anchor_name =~ s/^.*\///;
669 #$anchor_name = "<a name=\"$anchor_name\" ></a>";
670
671 my $image_link = $front . $img_file .$back;
672 return $image_link;
673
674 # The reasons for why the following two lines are no longer necessary can be
675 # found in subroutine process_section
676 #my $anchor_link = "<a href=\"$img_file\" >".$image_link."</a>";
677 #return $anchor_link;
678
679 #return $front . $img_file . $back . $anchor_name;
680}
681
682sub replace_href_links {
683 my $self = shift (@_);
684 my ($front, $link, $back, $base_dir, $file, $doc_obj, $section) = @_;
685
686 # remove quotes from link at start and end if necessary
687 if ($link=~/^[\"\']/) {
688 $link=~s/^[\"\']//;
689 $link=~s/[\"\']$//;
690 $front.='"';
691 $back="\"$back";
692 }
693
694 # attempt to sort out targets - frames are not handled
695 # well in this plugin and some cases will screw things
696 # up - e.g. the _parent target (so we'll just remove
697 # them all ;-)
698 $front =~ s/(target=\"?)_top(\"?)/$1_gsdltop_$2/is;
699 $back =~ s/(target=\"?)_top(\"?)/$1_gsdltop_$2/is;
700 $front =~ s/target=\"?_parent\"?//is;
701 $back =~ s/target=\"?_parent\"?//is;
702
703 return $front . $link . $back if $link =~ m/^\#/s;
704 $link =~ s/\n/ /g;
705
706 # Find file referred to by $link on file system
707 # This is more complicated than it sounds when char encodings
708 # is taken in to account
709 my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
710
711 # href may use '\'s where '/'s should be on Windows
712 $href =~ s/\\/\//g;
713 my ($filename) = $href =~ m/^(?:.*?):(?:\/\/)?(.*)/;
714
715
716 ##### leave all these links alone (they won't be picked up by intermediate
717 ##### pages). I think that's safest when dealing with frames, targets etc.
718 ##### (at least until I think of a better way to do it). Problems occur with
719 ##### mailto links from within small frames, the intermediate page is displayed
720 ##### within that frame and can't be seen. There is still potential for this to
721 ##### happen even with html pages - the solution seems to be to somehow tell
722 ##### the browser from the server side to display the page being sent (i.e.
723 ##### the intermediate page) in the top level window - I'm not sure if that's
724 ##### possible - the following line should probably be deleted if that can be done
725 return $front . $link . $back if $href =~ m/^(mailto|news|gopher|nntp|telnet|javascript):/is;
726
727
728 if (($rl == 0) || ($filename =~ m/$self->{'process_exp'}/) ||
729 ($href =~ m/\/$/) || ($href =~ m/^(mailto|news|gopher|nntp|telnet|javascript):/i)) {
730 &ghtml::urlsafe ($href);
731 return $front . "_httpextlink_&amp;rl=" . $rl . "&amp;href=" . $href . $hash_part . $back;
732 } else {
733 # link is to some other type of file (eg image) so we'll
734 # need to associate that file
735 return $front . $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section) . $back;
736 }
737}
738
739sub add_file {
740 my $self = shift (@_);
741 my ($href, $rl, $hash_part, $base_dir, $doc_obj, $section) = @_;
742 my ($newname);
743
744 my $filename = $href;
745 if ($base_dir eq "") {
746 # remove http:/ thereby leaving one slash at the start
747 $filename =~ s/^[^:]*:\///;
748 }
749 else {
750 # remove http://
751 $filename =~ s/^[^:]*:\/\///;
752 }
753
754 $filename = &util::filename_cat($base_dir, $filename);
755 if (($self->{'use_realistic_book'}) || ($self->{'old_style_HDL'})) {
756 # we are processing a tidytmp file - want paths to be in import
757 $filename =~ s/([\\\/])tidytmp([\\\/])/$1import$2/;
758 }
759 # Replace %XX's in URL with decoded value if required. Note that the filename may include the %XX in some
760 # situations. If the *original* file's name was in URL encoding, the following method will not decode it.
761 my $utf8_filename = $filename;
762 $filename = $self->opt_url_decode($utf8_filename);
763
764 # some special processing if the intended filename was converted to utf8, but
765 # the actual file still needs to be renamed
766 if (!-e $filename) {
767 # try the original filename stored in map
768 my $original_filename = $self->{'utf8_to_original_filename'}->{$filename};
769 if (defined $original_filename && -e $original_filename) {
770 $filename = $original_filename;
771 }
772 }
773
774 my ($ext) = $filename =~ m/(\.[^\.]*)$/;
775
776 if ($rl == 0) {
777 if ((!defined $ext) || ($ext !~ m/$self->{'assoc_files'}/)) {
778 return "_httpextlink_&amp;rl=0&amp;el=prompt&amp;href=" . $href . $hash_part;
779 }
780 else {
781 return "_httpextlink_&amp;rl=0&amp;el=direct&amp;href=" . $href . $hash_part;
782 }
783 }
784
785 if ((!defined $ext) || ($ext !~ m/$self->{'assoc_files'}/)) {
786 return "_httpextlink_&amp;rl=" . $rl . "&amp;href=" . $href . $hash_part;
787 }
788 # add the original image file as a source file
789 if (!$self->{'processing_tmp_files'} ) {
790 $doc_obj->associate_source_file($filename);
791 }
792 if ($self->{'rename_assoc_files'}) {
793 if (defined $self->{'aux_files'}->{$href}) {
794 $newname = $self->{'aux_files'}->{$href}->{'dir_num'} . "/" .
795 $self->{'aux_files'}->{$href}->{'file_num'} . $ext;
796 } else {
797 $newname = $self->{'dir_num'} . "/" . $self->{'file_num'} . $ext;
798 $self->{'aux_files'}->{$href} = {'dir_num' => $self->{'dir_num'}, 'file_num' => $self->{'file_num'}};
799 $self->inc_filecount ();
800 }
801 $doc_obj->associate_file($filename, $newname, undef, $section);
802 return "_httpdocimg_/$newname";
803 } else {
804 if(&unicode::is_url_encoded($utf8_filename)) {
805 # use the possibly-decoded filename instead to avoid double URL encoding
806 ($newname) = $filename =~ m/([^\/\\]*)$/;
807 } else {
808 ($newname) = $utf8_filename =~ m/([^\/\\]*)$/;
809 }
810
811 # Make sure this name uses only ASCII characters.
812 # We use either base64 or URL encoding, as these preserve original encoding
813 $newname = &util::rename_file($newname, $self->{'file_rename_method'});
814
815 $doc_obj->associate_file($filename, $newname, undef, $section);
816
817 # Since the generated image will be URL-encoded to avoid file-system/browser mess-ups
818 # of filenames, URL-encode the additional percent signs of the URL-encoded filename
819 my $newname_url = $newname;
820 $newname_url = &unicode::filename_to_url($newname_url);
821 return "_httpdocimg_/$newname_url";
822 }
823}
824
825
826sub format_link {
827 my $self = shift (@_);
828 my ($link, $base_dir, $file) = @_;
829
830 my ($before_hash, $hash_part) = $link =~ m/^([^\#]*)(\#?.*)$/;
831
832 $hash_part = "" if !defined $hash_part;
833 if (!defined $before_hash || $before_hash !~ m/[\w\.\/]/) {
834 my $outhandle = $self->{'outhandle'};
835 print $outhandle "HTMLPlugin: ERROR - badly formatted tag ignored ($link)\n"
836 if $self->{'verbosity'};
837 return ($link, "", 0);
838 }
839
840 if ($before_hash =~ s@^((?:http|https|ftp|file|mms)://)@@i) {
841 my $type = $1;
842
843 if ($link =~ m/^(http|ftp):/i) {
844 # Turn url (using /) into file name (possibly using \ on windows)
845 my @http_dir_split = split('/', $before_hash);
846 $before_hash = &util::filename_cat(@http_dir_split);
847 }
848
849 $before_hash = $self->eval_dir_dots($before_hash);
850
851 my $linkfilename = &util::filename_cat ($base_dir, $before_hash);
852
853 my $rl = 0;
854 $rl = 1 if (-e $linkfilename);
855
856 # make sure there's a slash on the end if it's a directory
857 if ($before_hash !~ m/\/$/) {
858 $before_hash .= "/" if (-d $linkfilename);
859 }
860 return ($type . $before_hash, $hash_part, $rl);
861
862 } elsif ($link !~ m/^(mailto|news|gopher|nntp|telnet|javascript):/i && $link !~ m/^\//) {
863
864 if ($before_hash =~ s@^/@@ || $before_hash =~ m/\\/) {
865
866 # the first directory will be the domain name if file_is_url
867 # to generate archives, otherwise we'll assume all files are
868 # from the same site and base_dir is the root
869
870 if ($self->{'file_is_url'}) {
871 my @dirs = split /[\/\\]/, $file;
872 my $domname = shift (@dirs);
873 $before_hash = &util::filename_cat($domname, $before_hash);
874 $before_hash =~ s@\\@/@g; # for windows
875 }
876 else
877 {
878 # see if link shares directory with source document
879 # => turn into relative link if this is so!
880
881 if ($ENV{'GSDLOS'} =~ m/^windows/i) {
882 # too difficult doing a pattern match with embedded '\'s...
883 my $win_before_hash=$before_hash;
884 $win_before_hash =~ s@(\\)+@/@g;
885 # $base_dir is already similarly "converted" on windows.
886 if ($win_before_hash =~ s@^$base_dir/@@o) {
887 # if this is true, we removed a prefix
888 $before_hash=$win_before_hash;
889 }
890 }
891 else {
892 # before_hash has lost leading slash by this point,
893 # -> add back in prior to substitution with $base_dir
894 $before_hash = "/$before_hash";
895
896 $before_hash = &util::filename_cat("",$before_hash);
897 $before_hash =~ s@^$base_dir/@@;
898 }
899 }
900 } else {
901 # Turn relative file path into full path
902 my $dirname = &File::Basename::dirname($file);
903 $before_hash = &util::filename_cat($dirname, $before_hash);
904 $before_hash = $self->eval_dir_dots($before_hash);
905 }
906
907 my $linkfilename = &util::filename_cat ($base_dir, $before_hash);
908 # make sure there's a slash on the end if it's a directory
909 if ($before_hash !~ m/\/$/) {
910 $before_hash .= "/" if (-d $linkfilename);
911 }
912 return ("http://" . $before_hash, $hash_part, 1);
913 } else {
914 # mailto, news, nntp, telnet, javascript or gopher link
915 return ($before_hash, "", 0);
916 }
917}
918
919sub extract_first_NNNN_characters {
920 my $self = shift (@_);
921 my ($textref, $doc_obj, $thissection) = @_;
922
923 foreach my $size (split /,/, $self->{'first'}) {
924 my $tmptext = $$textref;
925 # skip to the body
926 $tmptext =~ s/.*<body[^>]*>//i;
927 # remove javascript
928 $tmptext =~ s@<script.*?</script>@ @sig;
929 $tmptext =~ s/<[^>]*>/ /g;
930 $tmptext =~ s/&nbsp;/ /g;
931 $tmptext =~ s/^\s+//;
932 $tmptext =~ s/\s+$//;
933 $tmptext =~ s/\s+/ /gs;
934 $tmptext = &unicode::substr ($tmptext, 0, $size);
935 $tmptext =~ s/\s\S*$/&#8230;/; # adds an ellipse (...)
936 $doc_obj->add_utf8_metadata ($thissection, "First$size", $tmptext);
937 }
938}
939
940
941sub extract_metadata {
942 my $self = shift (@_);
943 my ($textref, $metadata, $doc_obj, $section) = @_;
944 my $outhandle = $self->{'outhandle'};
945 # if we don't want metadata, we may as well not be here ...
946 return if (!defined $self->{'metadata_fields'});
947
948 my $separator = $self->{'metadata_field_separator'};
949 if ($separator eq "") {
950 undef $separator;
951 }
952
953 # metadata fields to extract/save. 'key' is the (lowercase) name of the
954 # html meta, 'value' is the metadata name for greenstone to use
955 my %find_fields = ();
956
957 my %creator_fields = (); # short-cut for lookups
958
959
960 foreach my $field (split /,/, $self->{'metadata_fields'}) {
961 $field =~ s/^\s+//; # remove leading whitespace
962 $field =~ s/\s+$//; # remove trailing whitespace
963
964 # support tag<tagname>
965 if ($field =~ m/^(.*?)\s*<(.*?)>$/) {
966 # "$2" is the user's preferred gs metadata name
967 $find_fields{lc($1)}=$2; # lc = lowercase
968 } else { # no <tagname> for mapping
969 # "$field" is the user's preferred gs metadata name
970 $find_fields{lc($field)}=$field; # lc = lowercase
971 }
972 }
973
974 if (defined $self->{'hunt_creator_metadata'} &&
975 $self->{'hunt_creator_metadata'} == 1 ) {
976 my @extra_fields =
977 (
978 'author',
979 'author.email',
980 'creator',
981 'dc.creator',
982 'dc.creator.corporatename',
983 );
984
985 # add the creator_metadata fields to search for
986 foreach my $field (@extra_fields) {
987 $creator_fields{$field}=0; # add to lookup hash
988 }
989 }
990
991
992 # find the header in the html file, which has the meta tags
993 $$textref =~ m@<head>(.*?)</head>@si;
994
995 my $html_header=$1;
996
997 # go through every <meta... tag defined in the html and see if it is
998 # one of the tags we want to match.
999
1000 # special case for title - we want to remember if its been found
1001 my $found_title = 0;
1002 # this assumes that ">" won't appear. (I don't think it's allowed to...)
1003 $html_header =~ m/^/; # match the start of the string, for \G assertion
1004
1005 while ($html_header =~ m/\G.*?<meta(.*?)>/sig) {
1006 my $metatag=$1;
1007 my ($tag, $value);
1008
1009 # find the tag name
1010 $metatag =~ m/(?:name|http-equiv)\s*=\s*([\"\'])?(.*?)\1/is;
1011 $tag=$2;
1012 # in case they're not using " or ', but they should...
1013 if (! $tag) {
1014 $metatag =~ m/(?:name|http-equiv)\s*=\s*([^\s\>]+)/is;
1015 $tag=$1;
1016 }
1017
1018 if (!defined $tag) {
1019 print $outhandle "HTMLPlugin: can't find NAME in \"$metatag\"\n";
1020 next;
1021 }
1022
1023 # don't need to assign this field if it was passed in from a previous
1024 # (recursive) plugin
1025 if (defined $metadata->{$tag}) {next}
1026
1027 # find the tag content
1028 $metatag =~ m/content\s*=\s*([\"\'])?(.*?)\1/is;
1029 $value=$2;
1030
1031 if (! $value) {
1032 $metatag =~ m/(?:name|http-equiv)\s*=\s*([^\s\>]+)/is;
1033 $value=$1;
1034 }
1035 if (!defined $value) {
1036 print $outhandle "HTMLPlugin: can't find VALUE in \"$metatag\"\n";
1037 next;
1038 }
1039
1040 # clean up and add
1041 $value =~ s/\s+/ /gs;
1042 chomp($value); # remove trailing \n, if any
1043 if (exists $creator_fields{lc($tag)}) {
1044 # map this value onto greenstone's "Creator" metadata
1045 $tag='Creator';
1046 } elsif (!exists $find_fields{lc($tag)}) {
1047 next; # don't want this tag
1048 } else {
1049 # get the user's preferred capitalisation
1050 $tag = $find_fields{lc($tag)};
1051 }
1052 if (lc($tag) eq "title") {
1053 $found_title = 1;
1054 }
1055
1056 if ($self->{'verbosity'} > 2) {
1057 print $outhandle " extracted \"$tag\" metadata \"$value\"\n";
1058 }
1059
1060 if ($tag =~ /\./) {
1061 # there is a . so has a namespace, add ex.
1062 $tag = "ex.$tag";
1063 }
1064 if (defined $separator) {
1065 my @values = split($separator, $value);
1066 foreach my $v (@values) {
1067 $doc_obj->add_utf8_metadata($section, $tag, $v) if $v =~ /\S/;
1068 }
1069 }
1070 else {
1071 $doc_obj->add_utf8_metadata($section, $tag, $value);
1072 }
1073 }
1074
1075 # TITLE: extract the document title
1076 if (exists $find_fields{'title'} && !$found_title) {
1077 # we want a title, and didn't find one in the meta tags
1078 # see if there's a <title> tag
1079 my $title;
1080 my $from = ""; # for debugging output only
1081 if ($html_header =~ m/<title[^>]*>([^<]+)<\/title[^>]*>/is) {
1082 $title = $1;
1083 $from = "<title> tags";
1084 }
1085
1086 if (!defined $title) {
1087 $from = "first 100 chars";
1088 # if no title use first 100 or so characters
1089 $title = $$textref;
1090 $title =~ s/^\xFE\xFF//; # Remove unicode byte order mark
1091 $title =~ s/^.*?<body>//si;
1092 # ignore javascript!
1093 $title =~ s@<script.*?</script>@ @sig;
1094 $title =~ s/<\/([^>]+)><\1>//g; # (eg) </b><b> - no space
1095 $title =~ s/<[^>]*>/ /g; # remove all HTML tags
1096 $title = substr ($title, 0, 100);
1097 $title =~ s/\s\S*$/.../;
1098 }
1099 $title =~ s/<[^>]*>/ /g; # remove html tags
1100 $title =~ s/&nbsp;/ /g;
1101 $title =~ s/(?:&nbsp;|\xc2\xa0)/ /g; # utf-8 for nbsp...
1102 $title =~ s/\s+/ /gs; # collapse multiple spaces
1103 $title =~ s/^\s*//; # remove leading spaces
1104 $title =~ s/\s*$//; # remove trailing spaces
1105
1106 $title =~ s/^$self->{'title_sub'}// if ($self->{'title_sub'});
1107 $title =~ s/^\s+//s; # in case title_sub introduced any...
1108 $doc_obj->add_utf8_metadata ($section, 'Title', $title);
1109 print $outhandle " extracted Title metadata \"$title\" from $from\n"
1110 if ($self->{'verbosity'} > 2);
1111 }
1112
1113 # add FileFormat metadata
1114 $doc_obj->add_metadata($section,"FileFormat", "HTML");
1115
1116 # Special, for metadata names such as tagH1 - extracts
1117 # the text between the first <H1> and </H1> tags into "H1" metadata.
1118
1119 foreach my $field (keys %find_fields) {
1120 if ($field !~ m/^tag([a-z0-9]+)$/i) {next}
1121 my $tag = $1;
1122 if ($$textref =~ m@<$tag[^>]*>(.*?)</$tag[^>]*>@g) {
1123 my $content = $1;
1124 $content =~ s/&nbsp;/ /g;
1125 $content =~ s/<[^>]*>/ /g;
1126 $content =~ s/^\s+//;
1127 $content =~ s/\s+$//;
1128 $content =~ s/\s+/ /gs;
1129 if ($content) {
1130 $tag=$find_fields{"tag$tag"}; # get the user's capitalisation
1131 $tag =~ s/^tag//i;
1132 $doc_obj->add_utf8_metadata ($section, $tag, $content);
1133 print $outhandle " extracted \"$tag\" metadata \"$content\"\n"
1134 if ($self->{'verbosity'} > 2);
1135 }
1136 }
1137 }
1138}
1139
1140
1141# evaluate any "../" to next directory up
1142# evaluate any "./" as here
1143sub eval_dir_dots {
1144 my $self = shift (@_);
1145 my ($filename) = @_;
1146 my $dirsep_os = &util::get_os_dirsep();
1147 my @dirsep = split(/$dirsep_os/,$filename);
1148
1149 my @eval_dirs = ();
1150 foreach my $d (@dirsep) {
1151 if ($d eq "..") {
1152 pop(@eval_dirs);
1153
1154 } elsif ($d eq ".") {
1155 # do nothing!
1156
1157 } else {
1158 push(@eval_dirs,$d);
1159 }
1160 }
1161
1162 # Need to fiddle with number of elements in @eval_dirs if the
1163 # first one is the empty string. This is because of a
1164 # modification to util::filename_cat that supresses the addition
1165 # of a leading '/' character (or \ if windows) (intended to help
1166 # filename cat with relative paths) if the first entry in the
1167 # array is the empty string. Making the array start with *two*
1168 # empty strings is a way to defeat this "smart" option.
1169 #
1170 if (scalar(@eval_dirs) > 0) {
1171 if ($eval_dirs[0] eq ""){
1172 unshift(@eval_dirs,"");
1173 }
1174 }
1175
1176 my $evaluated_filename = (scalar @eval_dirs > 0) ? &util::filename_cat(@eval_dirs) : "";
1177 return $evaluated_filename;
1178}
1179
1180sub replace_usemap_links {
1181 my $self = shift (@_);
1182 my ($front, $link, $back) = @_;
1183
1184 # remove quotes from link at start and end if necessary
1185 if ($link=~/^[\"\']/) {
1186 $link=~s/^[\"\']//;
1187 $link=~s/[\"\']$//;
1188 $front.='"';
1189 $back="\"$back";
1190 }
1191
1192 $link =~ s/^\.\///;
1193 return $front . $link . $back;
1194}
1195
1196sub inc_filecount {
1197 my $self = shift (@_);
1198
1199 if ($self->{'file_num'} == 1000) {
1200 $self->{'dir_num'} ++;
1201 $self->{'file_num'} = 0;
1202 } else {
1203 $self->{'file_num'} ++;
1204 }
1205}
1206
1207
1208# Extend read_file so that strings like &eacute; are
1209# converted to UTF8 internally.
1210#
1211# We don't convert &lt; or &gt; or &amp; or &quot; in case
1212# they interfere with the GML files
1213
1214sub read_file {
1215 my $self = shift(@_);
1216 my ($filename, $encoding, $language, $textref) = @_;
1217
1218 $self->SUPER::read_file($filename, $encoding, $language, $textref);
1219
1220 # Convert entities to their UTF8 equivalents
1221 $$textref =~ s/&(lt|gt|amp|quot|nbsp);/&z$1;/go;
1222 $$textref =~ s/&([^;]+);/&ghtml::getcharequiv($1,1)/gseo;
1223 $$textref =~ s/&z(lt|gt|amp|quot|nbsp);/&$1;/go;
1224}
1225
1226sub HB_read_html_file {
1227 my $self = shift (@_);
1228 my ($htmlfile, $text) = @_;
1229
1230 # load in the file
1231 if (!open (FILE, $htmlfile)) {
1232 print STDERR "ERROR - could not open $htmlfile\n";
1233 return;
1234 }
1235
1236 my $foundbody = 0;
1237 $self->HB_gettext (\$foundbody, $text, "FILE");
1238 close FILE;
1239
1240 # just in case there was no <body> tag
1241 if (!$foundbody) {
1242 $foundbody = 1;
1243 open (FILE, $htmlfile) || return;
1244 $self->HB_gettext (\$foundbody, $text, "FILE");
1245 close FILE;
1246 }
1247 # text is in utf8
1248}
1249
1250# converts the text to utf8, as ghtml does that for &eacute; etc.
1251sub HB_gettext {
1252 my $self = shift (@_);
1253 my ($foundbody, $text, $handle) = @_;
1254
1255 my $line = "";
1256 while (defined ($line = <$handle>)) {
1257 # look for body tag
1258 if (!$$foundbody) {
1259 if ($line =~ s/^.*<body[^>]*>//i) {
1260 $$foundbody = 1;
1261 } else {
1262 next;
1263 }
1264 }
1265
1266 # check for symbol fonts
1267 if ($line =~ m/<font [^>]*?face\s*=\s*\"?(\w+)\"?/i) {
1268 my $font = $1;
1269 print STDERR "HBPlug::HB_gettext - warning removed font $font\n"
1270 if ($font !~ m/^arial$/i);
1271 }
1272
1273 $$text .= $line;
1274 }
1275
1276 if ($self->{'input_encoding'} eq "iso_8859_1") {
1277 # convert to utf-8
1278 $$text=&unicode::unicode2utf8(&unicode::convert2unicode("iso_8859_1", $text));
1279 }
1280 # convert any alphanumeric character entities to their utf-8
1281 # equivalent for indexing purposes
1282 #&ghtml::convertcharentities ($$text);
1283
1284 $$text =~ s/\s+/ /g; # remove \n's
1285}
1286
1287sub HB_clean_section {
1288 my $self = shift (@_);
1289 my ($section) = @_;
1290
1291 # remove tags without a starting tag from the section
1292 my ($tag, $tagstart);
1293 while ($section =~ m/<\/([^>]{1,10})>/) {
1294 $tag = $1;
1295 $tagstart = index($section, "<$tag");
1296 last if (($tagstart >= 0) && ($tagstart < index($section, "<\/$tag")));
1297 $section =~ s/<\/$tag>//;
1298 }
1299
1300 # remove extra paragraph tags
1301 while ($section =~ s/<p\b[^>]*>\s*<p\b/<p/ig) {}
1302
1303 # remove extra stuff at the end of the section
1304 while ($section =~ s/(<u>|<i>|<b>|<p\b[^>]*>|&nbsp;|\s)$//i) {}
1305
1306 # add a newline at the beginning of each paragraph
1307 $section =~ s/(.)\s*<p\b/$1\n\n<p/gi;
1308
1309 # add a newline every 80 characters at a word boundary
1310 # Note: this regular expression puts a line feed before
1311 # the last word in each section, even when it is not
1312 # needed.
1313 $section =~ s/(.{1,80})\s/$1\n/g;
1314
1315 # fix up the image links
1316 $section =~ s/<img[^>]*?src=\"?([^\">]+)\"?[^>]*>/
1317 <center><img src=\"$1\" \/><\/center><br\/>/ig;
1318 $section =~ s/&lt;&lt;I&gt;&gt;\s*([^\.]+\.(png|jpg|gif))/
1319 <center><img src=\"$1\" \/><\/center><br\/>/ig;
1320
1321 return $section;
1322}
1323
1324# Will convert the oldHDL format to the new HDL format (using the Section tag)
1325sub convert_to_newHDLformat
1326{
1327 my $self = shift (@_);
1328 my ($file,$cnfile) = @_;
1329 my $input_filename = $file;
1330 my $tmp_filename = $cnfile;
1331
1332 # write HTML tmp file with new HDL format
1333 open (PROD, ">$tmp_filename") || die("Error Writing to File: $tmp_filename $!");
1334
1335 # read in the file and do basic html cleaning (removing header etc)
1336 my $html = "";
1337 $self->HB_read_html_file ($input_filename, \$html);
1338
1339 # process the file one section at a time
1340 my $curtoclevel = 1;
1341 my $firstsection = 1;
1342 my $toclevel = 0;
1343 while (length ($html) > 0) {
1344 if ($html =~ s/^.*?(?:<p\b[^>]*>)?((<b>|<i>|<u>|\s)*)&lt;&lt;TOC(\d+)&gt;&gt;\s*(.*?)<p\b/<p/i) {
1345 $toclevel = $3;
1346 my $title = $4;
1347 my $sectiontext = "";
1348 if ($html =~ s/^(.*?)((?:<p\b[^>]*>)?((<b>|<i>|<u>|\s)*)&lt;&lt;TOC\d+&gt;&gt;)/$2/i) {
1349 $sectiontext = $1;
1350 } else {
1351 $sectiontext = $html;
1352 $html = "";
1353 }
1354
1355 # remove tags and extra spaces from the title
1356 $title =~ s/<\/?[^>]+>//g;
1357 $title =~ s/^\s+|\s+$//g;
1358
1359 # close any sections below the current level and
1360 # create a new section (special case for the firstsection)
1361 print PROD "<!--\n";
1362 while (($curtoclevel > $toclevel) ||
1363 (!$firstsection && $curtoclevel == $toclevel)) {
1364 $curtoclevel--;
1365 print PROD "</Section>\n";
1366 }
1367 if ($curtoclevel+1 < $toclevel) {
1368 print STDERR "WARNING - jump in toc levels in $input_filename " .
1369 "from $curtoclevel to $toclevel\n";
1370 }
1371 while ($curtoclevel < $toclevel) {
1372 $curtoclevel++;
1373 }
1374
1375 if ($curtoclevel == 1) {
1376 # add the header tag
1377 print PROD "-->\n";
1378 print PROD "<HTML>\n<HEAD>\n<TITLE>$title</TITLE>\n</HEAD>\n<BODY>\n";
1379 print PROD "<!--\n";
1380 }
1381
1382 print PROD "<Section>\n\t<Description>\n\t\t<Metadata name=\"Title\">$title</Metadata>\n\t</Description>\n";
1383
1384 print PROD "-->\n";
1385
1386 # clean up the section html
1387 $sectiontext = $self->HB_clean_section($sectiontext);
1388
1389 print PROD "$sectiontext\n";
1390
1391 } else {
1392 print STDERR "WARNING - leftover text\n" , $self->shorten($html),
1393 "\nin $input_filename\n";
1394 last;
1395 }
1396 $firstsection = 0;
1397 }
1398
1399 print PROD "<!--\n";
1400 while ($curtoclevel > 0) {
1401 $curtoclevel--;
1402 print PROD "</Section>\n";
1403 }
1404 print PROD "-->\n";
1405
1406 close (PROD) || die("Error Closing File: $tmp_filename $!");
1407
1408 return $tmp_filename;
1409}
1410
1411sub shorten {
1412 my $self = shift (@_);
1413 my ($text) = @_;
1414
1415 return "\"$text\"" if (length($text) < 100);
1416
1417 return "\"" . substr ($text, 0, 50) . "\" ... \"" .
1418 substr ($text, length($text)-50) . "\"";
1419}
1420
1421sub convert_tidy_or_oldHDL_file
1422{
1423 my $self = shift (@_);
1424 my ($file) = @_;
1425 my $input_filename = $file;
1426
1427 if (-d $input_filename)
1428 {
1429 return $input_filename;
1430 }
1431
1432 # get the input filename
1433 my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
1434 my $base_dirname = $dirname;
1435 $suffix = lc($suffix);
1436
1437 # derive tmp filename from input filename
1438 # Remove any white space from filename -- no risk of name collision, and
1439 # makes later conversion by utils simpler. Leave spaces in path...
1440 # tidy up the filename with space, dot, hyphen between
1441 $tailname =~ s/\s+//g;
1442 $tailname =~ s/\.+//g;
1443 $tailname =~ s/\-+//g;
1444 # convert to utf-8 otherwise we have problems with the doc.xml file
1445 # later on
1446 &unicode::ensure_utf8(\$tailname);
1447
1448 # softlink to collection tmp dir
1449 my $tmp_dirname = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "tidytmp");
1450 &util::mk_dir($tmp_dirname) if (!-e $tmp_dirname);
1451
1452 my $test_dirname = "";
1453 my $f_separator = &util::get_os_dirsep();
1454
1455 if ($dirname =~ m/import$f_separator/)
1456 {
1457 $test_dirname = $'; #'
1458
1459 #print STDERR "init $'\n";
1460
1461 while ($test_dirname =~ m/[$f_separator]/)
1462 {
1463 my $folderdirname = $`;
1464 $tmp_dirname = &util::filename_cat($tmp_dirname,$folderdirname);
1465 &util::mk_dir($tmp_dirname) if (!-e $tmp_dirname);
1466 $test_dirname = $'; #'
1467 }
1468 }
1469
1470 my $tmp_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix");
1471
1472 # tidy or convert the input file if it is a HTML-like file or it is accepted by the process_exp
1473 if (($suffix eq ".htm") || ($suffix eq ".html") || ($suffix eq ".shtml"))
1474 {
1475 #convert the input file to a new style HDL
1476 my $hdl_output_filename = $input_filename;
1477 if ($self->{'old_style_HDL'})
1478 {
1479 $hdl_output_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix");
1480 $hdl_output_filename = $self->convert_to_newHDLformat($input_filename,$hdl_output_filename);
1481 }
1482
1483 #just for checking copy all other file from the base dir to tmp dir if it is not exists
1484 opendir(DIR,$base_dirname) or die "Can't open base directory : $base_dirname!";
1485 my @files = grep {!/^\.+$/} readdir(DIR);
1486 close(DIR);
1487
1488 foreach my $file (@files)
1489 {
1490 my $src_file = &util::filename_cat($base_dirname,$file);
1491 my $dest_file = &util::filename_cat($tmp_dirname,$file);
1492 if ((!-e $dest_file) && (!-d $src_file))
1493 {
1494 # just copy the original file back to the tmp directory
1495 copy($src_file,$dest_file) or die "Can't copy file $src_file to $dest_file $!";
1496 }
1497 }
1498
1499 # tidy the input file
1500 my $tidy_output_filename = $hdl_output_filename;
1501 if ($self->{'use_realistic_book'})
1502 {
1503 $tidy_output_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix");
1504 $tidy_output_filename = $self->tmp_tidy_file($hdl_output_filename,$tidy_output_filename);
1505 }
1506 $tmp_filename = $tidy_output_filename;
1507 }
1508 else
1509 {
1510 if (!-e $tmp_filename)
1511 {
1512 # just copy the original file back to the tmp directory
1513 copy($input_filename,$tmp_filename) or die "Can't copy file $input_filename to $tmp_filename $!";
1514 }
1515 }
1516
1517 return $tmp_filename;
1518}
1519
1520
1521# Will make the html input file as a proper XML file with removed font tag and
1522# image size added to the img tag.
1523# The tidying process takes place in a collection specific 'tmp' directory so
1524# that we don't accidentally damage the input.
1525sub tmp_tidy_file
1526{
1527 my $self = shift (@_);
1528 my ($file,$cnfile) = @_;
1529 my $input_filename = $file;
1530 my $tmp_filename = $cnfile;
1531
1532 # get the input filename
1533 my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
1534
1535 require HTML::TokeParser::Simple;
1536
1537 # create HTML parser to decode the input file
1538 my $parser = HTML::TokeParser::Simple->new($input_filename);
1539
1540 # write HTML tmp file without the font tag and image size are added to the img tag
1541 open (PROD, ">$tmp_filename") || die("Error Writing to File: $tmp_filename $!");
1542 while (my $token = $parser->get_token())
1543 {
1544 # is it an img tag
1545 if ($token->is_start_tag('img'))
1546 {
1547 # get the attributes
1548 my $attr = $token->return_attr;
1549
1550 # get the full path to the image
1551 my $img_file = &util::filename_cat($dirname,$attr->{src});
1552
1553 # set the width and height attribute
1554 ($attr->{width}, $attr->{height}) = imgsize($img_file);
1555
1556 # recreate the tag
1557 print PROD "<img";
1558 print PROD map { qq { $_="$attr->{$_}"} } keys %$attr;
1559 print PROD ">";
1560 }
1561 # is it a font tag
1562 else
1563 {
1564 if (($token->is_start_tag('font')) || ($token->is_end_tag('font')))
1565 {
1566 # remove font tag
1567 print PROD "";
1568 }
1569 else
1570 {
1571 # print without changes
1572 print PROD $token->as_is;
1573 }
1574 }
1575 }
1576 close (PROD) || die("Error Closing File: $tmp_filename $!");
1577
1578 # run html-tidy on the tmp file to make it a proper XML file
1579 my $tidyfile = `tidy -utf8 -wrap 0 -asxml "$tmp_filename"`;
1580
1581 # write result back to the tmp file
1582 open (PROD, ">$tmp_filename") || die("Error Writing to File: $tmp_filename $!");
1583 print PROD $tidyfile;
1584 close (PROD) || die("Error Closing File: $tmp_filename $!");
1585
1586 # return the output filename
1587 return $tmp_filename;
1588}
1589
1590sub associate_cover_image
1591{
1592 my $self = shift(@_);
1593 my ($doc_obj, $filename) = @_;
1594 if (($self->{'use_realistic_book'}) || ($self->{'old_style_HDL'}))
1595 {
1596 # we will have cover image in tidytmp, but want it from import
1597 $filename =~ s/([\\\/])tidytmp([\\\/])/$1import$2/;
1598 }
1599 $self->SUPER::associate_cover_image($doc_obj, $filename);
1600}
1601
1602
16031;
Note: See TracBrowser for help on using the repository browser.