source: main/trunk/greenstone2/perllib/plugins/HTMLPlugin.pm@ 22951

Last change on this file since 22951 was 22951, checked in by davidb, 14 years ago

Encode::decode cannot be applied to all characters returned by ghtml::getcharequiv(). If getcharequiv does not recognize a character then it does not encode it, and so we cannot apply decode() to it. getcharequiv() upgraded to include an optional extra param that says whether or not to decode the equiv char should there be one it can map it to.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 55.0 KB
Line 
1###########################################################################
2#
3# HTMLPlugin.pm -- basic html plugin
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 1999 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27#
28# Note that this plugin handles frames only in a very simple way
29# i.e. each frame is treated as a separate document. This means
30# search results will contain links to individual frames rather
31# than linking to the top level frameset.
32# There may also be some problems caused by the _parent target
33# (it's removed by this plugin)
34#
35
36package HTMLPlugin;
37
38use Encode;
39
40use ReadTextFile;
41use HBPlugin;
42use ghtml;
43use unicode;
44use util;
45use XMLParser;
46
47use Image::Size;
48use File::Copy;
49
50sub BEGIN {
51 @HTMLPlugin::ISA = ('ReadTextFile', 'HBPlugin');
52}
53
54use strict; # every perl program should have this!
55no strict 'refs'; # make an exception so we can use variables as filehandles
56
57my $arguments =
58 [ { 'name' => "process_exp",
59 'desc' => "{BasePlugin.process_exp}",
60 'type' => "regexp",
61 'deft' => &get_default_process_exp() },
62 { 'name' => "block_exp",
63 'desc' => "{BasePlugin.block_exp}",
64 'type' => 'regexp',
65 'deft' => &get_default_block_exp() },
66 { 'name' => "nolinks",
67 'desc' => "{HTMLPlugin.nolinks}",
68 'type' => "flag" },
69 { 'name' => "keep_head",
70 'desc' => "{HTMLPlugin.keep_head}",
71 'type' => "flag" },
72 { 'name' => "no_metadata",
73 'desc' => "{HTMLPlugin.no_metadata}",
74 'type' => "flag" },
75 { 'name' => "metadata_fields",
76 'desc' => "{HTMLPlugin.metadata_fields}",
77 'type' => "string",
78 'deft' => "Title" },
79 { 'name' => "metadata_field_separator",
80 'desc' => "{HTMLPlugin.metadata_field_separator}",
81 'type' => "string",
82 'deft' => "" },
83 { 'name' => "hunt_creator_metadata",
84 'desc' => "{HTMLPlugin.hunt_creator_metadata}",
85 'type' => "flag" },
86 { 'name' => "file_is_url",
87 'desc' => "{HTMLPlugin.file_is_url}",
88 'type' => "flag" },
89 { 'name' => "assoc_files",
90 'desc' => "{HTMLPlugin.assoc_files}",
91 'type' => "regexp",
92 'deft' => &get_default_block_exp() },
93 { 'name' => "rename_assoc_files",
94 'desc' => "{HTMLPlugin.rename_assoc_files}",
95 'type' => "flag" },
96 { 'name' => "title_sub",
97 'desc' => "{HTMLPlugin.title_sub}",
98 'type' => "string",
99 'deft' => "" },
100 { 'name' => "description_tags",
101 'desc' => "{HTMLPlugin.description_tags}",
102 'type' => "flag" },
103 # retain this for backward compatibility (w3mir option was replaced by
104 # file_is_url)
105 { 'name' => "w3mir",
106# 'desc' => "{HTMLPlugin.w3mir}",
107 'type' => "flag",
108 'hiddengli' => "yes"},
109 { 'name' => "no_strip_metadata_html",
110 'desc' => "{HTMLPlugin.no_strip_metadata_html}",
111 'type' => "string",
112 'deft' => "",
113 'reqd' => "no"},
114 { 'name' => "sectionalise_using_h_tags",
115 'desc' => "{HTMLPlugin.sectionalise_using_h_tags}",
116 'type' => "flag" },
117 { 'name' => "use_realistic_book",
118 'desc' => "{HTMLPlugin.tidy_html}",
119 'type' => "flag"},
120 { 'name' => "old_style_HDL",
121 'desc' => "{HTMLPlugin.old_style_HDL}",
122 'type' => "flag"},
123 {'name' => "processing_tmp_files",
124 'desc' => "{BasePlugin.processing_tmp_files}",
125 'type' => "flag",
126 'hiddengli' => "yes"}
127 ];
128
129my $options = { 'name' => "HTMLPlugin",
130 'desc' => "{HTMLPlugin.desc}",
131 'abstract' => "no",
132 'inherits' => "yes",
133 'args' => $arguments };
134
135
136sub new {
137 my ($class) = shift (@_);
138 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
139 push(@$pluginlist, $class);
140
141 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
142 push(@{$hashArgOptLists->{"OptList"}},$options);
143
144
145 my $self = new ReadTextFile($pluginlist,$inputargs,$hashArgOptLists);
146
147 if ($self->{'w3mir'}) {
148 $self->{'file_is_url'} = 1;
149 }
150 $self->{'aux_files'} = {};
151 $self->{'dir_num'} = 0;
152 $self->{'file_num'} = 0;
153
154 return bless $self, $class;
155}
156
157# may want to use (?i)\.(gif|jpe?g|jpe|png|css|js(?:@.*)?)$
158# if have eg <script language="javascript" src="img/lib.js@123">
159# blocking is now done by reading through the file and recording all the
160# images and other files
161sub get_default_block_exp {
162 my $self = shift (@_);
163
164 #return q^(?i)\.(gif|jpe?g|jpe|jpg|png|css)$^;
165 return "";
166}
167
168sub get_default_process_exp {
169 my $self = shift (@_);
170
171 # the last option is an attempt to encode the concept of an html query ...
172 return q^(?i)(\.html?|\.shtml|\.shm|\.asp|\.php\d?|\.cgi|.+\?.+=.*)$^;
173}
174
175sub store_block_files
176{
177 my $self =shift (@_);
178 my ($filename_full_path, $block_hash) = @_;
179
180 my $html_fname = $filename_full_path;
181 my @file_blocks;
182
183 my ($language, $encoding) = $self->textcat_get_language_encoding ($filename_full_path);
184
185 # read in file ($text will be in utf8)
186 my $raw_text = "";
187 $self->read_file_no_decoding ($filename_full_path, \$raw_text);
188
189 my $textref = \$raw_text;
190 my $opencom = '(?:<!--|&lt;!(?:&mdash;|&#151;|--))';
191 my $closecom = '(?:-->|(?:&mdash;|&#151;|--)&gt;)';
192 $$textref =~ s/$opencom(.*?)$closecom//gs;
193
194 my $attval = "\\\"[^\\\"]+\\\"|[^\\s>]+";
195 my @img_matches = ($$textref =~ m/<img[^>]*?src\s*=\s*($attval)[^>]*>/igs);
196 my @usemap_matches = ($$textref =~ m/<img[^>]*?usemap\s*=\s*($attval)[^>]*>/igs);
197 my @link_matches = ($$textref =~ m/<link[^>]*?href\s*=\s*($attval)[^>]*>/igs);
198 my @embed_matches = ($$textref =~ m/<embed[^>]*?src\s*=\s*($attval)[^>]*>/igs);
199 my @tabbg_matches = ($$textref =~ m/<(?:body|table|tr|td)[^>]*?background\s*=\s*($attval)[^>]*>/igs);
200 my @script_matches = ($$textref =~ m/<script[^>]*?src\s*=\s*($attval)[^>]*>/igs);
201
202 if(!defined $self->{'utf8_to_original_filename'}) {
203 # maps from utf8 converted link name -> original filename referrred to by (possibly URL-encoded) src url
204 $self->{'utf8_to_original_filename'} = {};
205 }
206
207 foreach my $link (@img_matches, @usemap_matches, @link_matches, @embed_matches, @tabbg_matches, @script_matches) {
208
209 # remove quotes from link at start and end if necessary
210 if ($link=~/^\"/) {
211 $link=~s/^\"//;
212 $link=~s/\"$//;
213 }
214
215 $link =~ s/\#.*$//s; # remove any anchor names, e.g. foo.html#name becomes foo.html
216 # some links may just be anchor names
217 next unless ($link =~ /\S+/);
218
219 if ($link !~ m@^/@ && $link !~ m/^([A-Z]:?)\\/) {
220 # Turn relative file path into full path
221 my $dirname = &File::Basename::dirname($filename_full_path);
222 $link = &util::filename_cat($dirname, $link);
223 }
224 $link = $self->eval_dir_dots($link);
225
226 # this is the actual filename on the filesystem (that the link refers to)
227 my $url_original_filename = $self->opt_url_decode($link);
228
229 # Convert the url_original_filename into its utf8 version. Store the utf8 link along with the url_original_filename
230 my $utf8_link = "";
231 $self->decode_text($link,$encoding,$language,\$utf8_link);
232
233 $self->{'utf8_to_original_filename'}->{$utf8_link} = $url_original_filename;
234# print STDERR "**** utf8_encoded_link to original src filename:\n\t$utf8_link\n\t".$self->{'utf8_to_original_filename'}->{$utf8_link}."\n";
235
236 if ($url_original_filename ne $utf8_link) {
237 my $outhandle = $self->{'outhandle'};
238
239 print $outhandle "URL Encoding $url_original_filename\n";
240 print $outhandle " ->$utf8_link\n";
241 }
242
243 $block_hash->{'file_blocks'}->{$url_original_filename} = 1;
244 }
245}
246
247# Given a filename in any encoding, will URL decode it to get back the original filename
248# in the original encoding. Because this method is intended to work out the *original*
249# filename*, it does not URL decode any filename if a file by the name of the *URL-encoded*
250# string already exists in the local folder.
251# Return the original filename corresponding to the parameter URL-encoded filename, and
252# a decoded flag that is set to true iff URL-decoding had to be applied.
253sub opt_url_decode {
254 my $self = shift (@_);
255 my ($link) = @_;
256
257 # Replace %XX's in URL with decoded value if required.
258 # Note that the filename may include the %XX in some situations
259 if ($link =~ m/\%[A-F0-9]{2}/i) {
260 if (!-e $link) {
261 $link = &unicode::url_decode($link);
262 }
263 }
264
265 return $link;
266}
267
268sub read_into_doc_obj
269{
270 my $self = shift (@_);
271 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
272
273 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
274 # get the input file
275 my $input_filename = $file;
276 my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
277 $suffix = lc($suffix);
278 my $tidy_filename;
279 if (($self->{'use_realistic_book'}) || ($self->{'old_style_HDL'}))
280 {
281 # because the document has to be sectionalized set the description tags
282 $self->{'description_tags'} = 1;
283
284 # set the file to be tidied
285 $input_filename = &util::filename_cat($base_dir,$file) if $base_dir =~ m/\w/;
286
287 # get the tidied file
288 #my $tidy_filename = $self->tmp_tidy_file($input_filename);
289 $tidy_filename = $self->convert_tidy_or_oldHDL_file($input_filename);
290
291 # derive tmp filename from input filename
292 my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($tidy_filename, "\\.[^\\.]+\$");
293
294 # set the new input file and base_dir to be from the tidied file
295 $file = "$tailname$suffix";
296 $base_dir = $dirname;
297 }
298
299 # call the parent read_into_doc_obj
300 my ($process_status,$doc_obj) = $self->SUPER::read_into_doc_obj($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli);
301 if (($self->{'use_realistic_book'}) || ($self->{'old_style_HDL'}))
302 {
303 # now we need to reset the filenames in the doc obj so that the converted filenames are not used
304 my $collect_file = &util::filename_within_collection($filename_full_path);
305 $doc_obj->set_source_filename ($collect_file, $self->{'file_rename_method'});
306 ## set_source_filename does not set the doc_obj source_path which is used in archives dbs for incremental
307 # build. so set it manually.
308 $doc_obj->{'source_path'} = $filename_full_path;
309 my $collect_conv_file = &util::filename_within_collection($tidy_filename);
310 $doc_obj->set_converted_filename($collect_conv_file);
311 $self->set_Source_metadata($doc_obj, $filename_no_path);
312 }
313 return ($process_status,$doc_obj);
314}
315
316# do plugin specific processing of doc_obj
317sub process {
318 my $self = shift (@_);
319 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
320 my $outhandle = $self->{'outhandle'};
321
322 if ($ENV{'GSDLOS'} =~ m/^windows/i) {
323 # this makes life so much easier... perl can cope with unix-style '/'s.
324 $base_dir =~ s@(\\)+@/@g;
325 $file =~ s@(\\)+@/@g;
326 }
327
328 # reset per-doc stuff...
329 $self->{'aux_files'} = {};
330 $self->{'dir_num'} = 0;
331 $self->{'file_num'} = 0;
332
333 # process an HTML file where sections are divided by headings tags (H1, H2 ...)
334 # you can also include metadata in the format (X can be any number)
335 # <hX>Title<!--gsdl-metadata
336 # <Metadata name="name1">value1</Metadata>
337 # ...
338 # <Metadata name="nameN">valueN</Metadata>
339 #--></hX>
340 if ($self->{'sectionalise_using_h_tags'}) {
341 # description_tags should allways be activated because we convert headings to description tags
342 $self->{'description_tags'} = 1;
343
344 my $arrSections = [];
345 $$textref =~ s/<h([0-9]+)[^>]*>(.*?)<\/h[0-9]+>/$self->process_heading($1, $2, $arrSections, $file)/isge;
346
347 if (scalar(@$arrSections)) {
348 my $strMetadata = $self->update_section_data($arrSections, -1);
349 if (length($strMetadata)) {
350 $strMetadata = '<!--' . $strMetadata . "\n-->\n</body>";
351 $$textref =~ s/<\/body>/$strMetadata/ig;
352 }
353 }
354 }
355
356 my $cursection = $doc_obj->get_top_section();
357
358 $self->extract_metadata ($textref, $metadata, $doc_obj, $cursection)
359 unless $self->{'no_metadata'} || $self->{'description_tags'};
360
361 # Store URL for page as metadata - this can be used for an
362 # altavista style search interface. The URL won't be valid
363 # unless the file structure contains the domain name (i.e.
364 # like when w3mir is used to download a website).
365
366 # URL metadata (even invalid ones) are used to support internal
367 # links, so even if 'file_is_url' is off, still need to store info
368
369 my ($tailname,$dirname,$suffix) = &File::Basename::fileparse($file, "\\.[^\\.]+\$");
370 my $utf8_file = $self->filename_to_utf8_metadata($file);
371 $utf8_file =~ s/&\#095;/_/g;
372 my $web_url = "http://";
373 if(defined $dirname) { # local directory
374 # Check for "ftp" in the domain name of the directory
375 # structure to determine if this URL should be a ftp:// URL
376 # This check is not infallible, but better than omitting the
377 # check, which would cause all files downloaded from ftp sites
378 # via mirroring with wget to have potentially erroneous http:// URLs
379 # assigned in their metadata
380 if ($dirname =~ /^[^\/]*ftp/i)
381 {
382 $web_url = "ftp://";
383 }
384 $dirname = $self->eval_dir_dots($dirname);
385 $dirname .= &util::get_dirsep() if $dirname ne ""; # if there's a directory, it should end on "/"
386 $web_url = $web_url.$dirname.$utf8_file;
387 } else {
388 $web_url = $web_url.$utf8_file;
389 }
390 $web_url =~ s/\\/\//g;
391 $doc_obj->add_utf8_metadata($cursection, "URL", $web_url);
392
393 if ($self->{'file_is_url'}) {
394 $doc_obj->add_metadata($cursection, "weblink", "<a href=\"$web_url\">");
395 $doc_obj->add_metadata($cursection, "webicon", "_iconworld_");
396 $doc_obj->add_metadata($cursection, "/weblink", "</a>");
397 }
398
399 if ($self->{'description_tags'}) {
400 # remove the html header - note that doing this here means any
401 # sections defined within the header will be lost (so all <Section>
402 # tags must appear within the body of the HTML)
403 my ($head_keep) = ($$textref =~ m/^(.*?)<body[^>]*>/is);
404
405 $$textref =~ s/^.*?<body[^>]*>//is;
406 $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
407
408 my $opencom = '(?:<!--|&lt;!(?:&mdash;|&#151;|--))';
409 my $closecom = '(?:-->|(?:&mdash;|&#151;|--)&gt;)';
410
411 my $lt = '(?:<|&lt;)';
412 my $gt = '(?:>|&gt;)';
413 my $quot = '(?:"|&quot;|&rdquo;|&ldquo;)';
414
415 my $dont_strip = '';
416 if ($self->{'no_strip_metadata_html'}) {
417 ($dont_strip = $self->{'no_strip_metadata_html'}) =~ s{,}{|}g;
418 }
419
420 my $found_something = 0; my $top = 1;
421 while ($$textref =~ s/^(.*?)$opencom(.*?)$closecom//s) {
422 my $text = $1;
423 my $comment = $2;
424 if (defined $text) {
425 # text before a comment - note that getting to here
426 # doesn't necessarily mean there are Section tags in
427 # the document
428 $self->process_section(\$text, $base_dir, $file, $doc_obj, $cursection);
429 }
430 while ($comment =~ s/$lt(.*?)$gt//s) {
431 my $tag = $1;
432 if ($tag eq "Section") {
433 $found_something = 1;
434 $cursection = $doc_obj->insert_section($doc_obj->get_end_child($cursection)) unless $top;
435 $top = 0;
436 } elsif ($tag eq "/Section") {
437 $found_something = 1;
438 $cursection = $doc_obj->get_parent_section ($cursection);
439 } elsif ($tag =~ m/^Metadata name=$quot(.*?)$quot/s) {
440 my $metaname = $1;
441 my $accumulate = $tag =~ m/mode=${quot}accumulate${quot}/ ? 1 : 0;
442 $comment =~ s/^(.*?)$lt\/Metadata$gt//s;
443 my $metavalue = $1;
444 $metavalue =~ s/^\s+//;
445 $metavalue =~ s/\s+$//;
446 # assume that no metadata value intentionally includes
447 # carriage returns or HTML tags (if they're there they
448 # were probably introduced when converting to HTML from
449 # some other format).
450 # actually some people want to have html tags in their
451 # metadata.
452 $metavalue =~ s/[\cJ\cM]/ /sg;
453 $metavalue =~ s/<[^>]+>//sg
454 unless $dont_strip && ($dont_strip eq 'all' || $metaname =~ m/^($dont_strip)$/);
455 $metavalue =~ s/\s+/ /sg;
456 if ($metaname =~ /\./) { # has a namespace
457 $metaname = "ex.$metaname";
458 }
459 if ($accumulate) {
460 $doc_obj->add_utf8_metadata($cursection, $metaname, $metavalue);
461 } else {
462 $doc_obj->set_utf8_metadata_element($cursection, $metaname, $metavalue);
463 }
464 } elsif ($tag eq "Description" || $tag eq "/Description") {
465 # do nothing with containing Description tags
466 } else {
467 # simple HTML tag (probably created by the conversion
468 # to HTML from some other format) - we'll ignore it and
469 # hope for the best ;-)
470 }
471 }
472 }
473 if ($cursection ne "") {
474 print $outhandle "HTMLPlugin: WARNING: $file contains unmatched <Section></Section> tags\n";
475 }
476
477 $$textref =~ s/^.*?<body[^>]*>//is;
478 $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
479 if ($$textref =~ m/\S/) {
480 if (!$found_something) {
481 if ($self->{'verbosity'} > 2) {
482 print $outhandle "HTMLPlugin: WARNING: $file appears to contain no Section tags so\n";
483 print $outhandle " will be processed as a single section document\n";
484 }
485
486 # go ahead and process single-section document
487 $self->process_section($textref, $base_dir, $file, $doc_obj, $cursection);
488
489 # if document contains no Section tags we'll go ahead
490 # and extract metadata (this won't have been done
491 # above as the -description_tags option prevents it)
492 my $complete_text = $head_keep.$doc_obj->get_text($cursection);
493 $self->extract_metadata (\$complete_text, $metadata, $doc_obj, $cursection)
494 unless $self->{'no_metadata'};
495
496 } else {
497 print $outhandle "HTMLPlugin: WARNING: $file contains the following text outside\n";
498 print $outhandle " of the final closing </Section> tag. This text will\n";
499 print $outhandle " be ignored.";
500
501 my ($text);
502 if (length($$textref) > 30) {
503 $text = substr($$textref, 0, 30) . "...";
504 } else {
505 $text = $$textref;
506 }
507 $text =~ s/\n/ /isg;
508 print $outhandle " ($text)\n";
509 }
510 } elsif (!$found_something) {
511
512 if ($self->{'verbosity'} > 2) {
513 # may get to here if document contained no valid Section
514 # tags but did contain some comments. The text will have
515 # been processed already but we should print the warning
516 # as above and extract metadata
517 print $outhandle "HTMLPlugin: WARNING: $file appears to contain no Section tags and\n";
518 print $outhandle " is blank or empty. Metadata will be assigned if present.\n";
519 }
520
521 my $complete_text = $head_keep.$doc_obj->get_text($cursection);
522 $self->extract_metadata (\$complete_text, $metadata, $doc_obj, $cursection)
523 unless $self->{'no_metadata'};
524 }
525
526 } else {
527
528 # remove header and footer
529 if (!$self->{'keep_head'} || $self->{'description_tags'}) {
530 $$textref =~ s/^.*?<body[^>]*>//is;
531 $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
532 }
533
534 # single section document
535 $self->process_section($textref, $base_dir, $file, $doc_obj, $cursection);
536 }
537 return 1;
538}
539
540
541sub process_heading
542{
543 my ($self, $nHeadNo, $strHeadingText, $arrSections, $file) = @_;
544 $strHeadingText = '' if (!defined($strHeadingText));
545
546 my $strMetadata = $self->update_section_data($arrSections, int($nHeadNo));
547
548 my $strSecMetadata = '';
549 while ($strHeadingText =~ s/<!--gsdl-metadata(.*?)-->//is)
550 {
551 $strSecMetadata .= $1;
552 }
553
554 $strHeadingText =~ s/^\s+//g;
555 $strHeadingText =~ s/\s+$//g;
556 $strSecMetadata =~ s/^\s+//g;
557 $strSecMetadata =~ s/\s+$//g;
558
559 $strMetadata .= "\n<Section>\n\t<Description>\n\t\t<Metadata name=\"Title\">" . $strHeadingText . "</Metadata>\n";
560
561 if (length($strSecMetadata)) {
562 $strMetadata .= "\t\t" . $strSecMetadata . "\n";
563 }
564
565 $strMetadata .= "\t</Description>\n";
566
567 return "<!--" . $strMetadata . "-->";
568}
569
570
571sub update_section_data
572{
573 my ($self, $arrSections, $nCurTocNo) = @_;
574 my ($strBuffer, $nLast, $nSections) = ('', 0, scalar(@$arrSections));
575
576 if ($nSections == 0) {
577 push @$arrSections, $nCurTocNo;
578 return $strBuffer;
579 }
580 $nLast = $arrSections->[$nSections - 1];
581 if ($nCurTocNo > $nLast) {
582 push @$arrSections, $nCurTocNo;
583 return $strBuffer;
584 }
585 for(my $i = $nSections - 1; $i >= 0; $i--) {
586 if ($nCurTocNo <= $arrSections->[$i]) {
587 $strBuffer .= "\n</Section>";
588 pop @$arrSections;
589 }
590 }
591 push @$arrSections, $nCurTocNo;
592 return $strBuffer;
593}
594
595
596# note that process_section may be called multiple times for a single
597# section (relying on the fact that add_utf8_text appends the text to any
598# that may exist already).
599sub process_section {
600 my $self = shift (@_);
601 my ($textref, $base_dir, $file, $doc_obj, $cursection) = @_;
602 # trap links
603 if (!$self->{'nolinks'}) {
604 # usemap="./#index" not handled correctly => change to "#index"
605## $$textref =~ s/(<img[^>]*?usemap\s*=\s*[\"\']?)([^\"\'>\s]+)([\"\']?[^>]*>)/
606
607 $$textref =~ s/(<img[^>]*?usemap\s*=\s*)((?:[\"][^\"]+[\"])|(?:[\'][^\']+[\'])|(?:[^\s\/>]+))([^>]*>)/
608 $self->replace_usemap_links($1, $2, $3)/isge;
609
610## $$textref =~ s/(<(?:a|area|frame|link|script)\s+[^>]*?\s*(?:href|src)\s*=\s*[\"\']?)([^\"\'>\s]+)([\"\']?[^>]*>)/
611
612 $$textref =~ s/(<(?:a|area|frame|link|script)\s+[^>]*?\s*(?:href|src)\s*=\s*)((?:[\"][^\"]+[\"])|(?:[\'][^\']+[\'])|(?:[^\s\/>]+))([^>]*>)/
613 $self->replace_href_links ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
614 }
615
616 # trap images
617
618 # Previously, by default, HTMLPlugin would embed <img> tags inside anchor tags
619 # i.e. <a href="image><img src="image"></a> in order to overcome a problem that
620 # turned regular text succeeding images into links. That is, by embedding <imgs>
621 # inside <a href=""></a>, the text following images were no longer misbehaving.
622 # However, there would be many occasions whereby images were not meant to link
623 # to their source images but where the images would link to another web page.
624 # To allow this, the no_image_links option was introduced: it would prevent
625 # the behaviour of embedding images into links that referenced the source images.
626
627 # Somewhere along the line, the problem of normal text turning into links when
628 # such text followed images which were not embedded in <a href=""></a> ceased
629 # to occur. This is why the following lines have been commented out (as well as
630 # two lines in replace_images). They appear to no longer apply.
631
632 # If at any time, there is a need for having images embedded in <a> anchor tags,
633 # then it might be better to turn that into an HTMLPlugin option rather than make
634 # it the default behaviour. Also, eventually, no_image_links needs to become
635 # a deprecated option for HTMLPlugin as it has now become the default behaviour.
636
637 #if(!$self->{'no_image_links'}){
638 $$textref =~ s/(<(?:img|embed|table|tr|td)[^>]*?(?:src|background)\s*=\s*)((?:[\"][^\"]+[\"])|(?:[\'][^\']+[\'])|(?:[^\s\/>]+))([^>]*>)/
639 $self->replace_images ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
640 #}
641
642 # add text to document object
643 # turn \ into \\ so that the rest of greenstone doesn't think there
644 # is an escape code following. (Macro parsing loses them...)
645 $$textref =~ s/\\/\\\\/go;
646
647 $doc_obj->add_utf8_text($cursection, $$textref);
648}
649
650sub replace_images {
651 my $self = shift (@_);
652 my ($front, $link, $back, $base_dir,
653 $file, $doc_obj, $section) = @_;
654
655 # remove quotes from link at start and end if necessary
656 if ($link=~/^[\"\']/) {
657 $link=~s/^[\"\']//;
658 $link=~s/[\"\']$//;
659 $front.='"';
660 $back="\"$back";
661 }
662
663 $link =~ s/\n/ /g;
664
665 # Hack to overcome Windows wv 0.7.1 bug that causes embedded images to be broken
666 # If the Word file path has spaces in it, wv messes up and you end up with
667 # absolute paths for the images, and without the "file://" prefix
668 # So check for this special case and massage the data to be correct
669 if ($ENV{'GSDLOS'} =~ m/^windows/i && $self->{'plugin_type'} eq "WordPlug" && $link =~ m/^[A-Za-z]\:\\/) {
670 $link =~ s/^.*\\([^\\]+)$/$1/;
671 }
672
673 my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
674
675 my $img_file = $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section);
676
677# print STDERR "**** link = $link\n**** href = $href\n**** img_file = $img_file, rl = $rl\n";
678
679 my $anchor_name = $img_file;
680 #$anchor_name =~ s/^.*\///;
681 #$anchor_name = "<a name=\"$anchor_name\" ></a>";
682
683 my $image_link = $front . $img_file .$back;
684 return $image_link;
685
686 # The reasons for why the following two lines are no longer necessary can be
687 # found in subroutine process_section
688 #my $anchor_link = "<a href=\"$img_file\" >".$image_link."</a>";
689 #return $anchor_link;
690
691 #return $front . $img_file . $back . $anchor_name;
692}
693
694sub replace_href_links {
695 my $self = shift (@_);
696 my ($front, $link, $back, $base_dir, $file, $doc_obj, $section) = @_;
697
698 # remove quotes from link at start and end if necessary
699 if ($link=~/^[\"\']/) {
700 $link=~s/^[\"\']//;
701 $link=~s/[\"\']$//;
702 $front.='"';
703 $back="\"$back";
704 }
705
706 # attempt to sort out targets - frames are not handled
707 # well in this plugin and some cases will screw things
708 # up - e.g. the _parent target (so we'll just remove
709 # them all ;-)
710 $front =~ s/(target=\"?)_top(\"?)/$1_gsdltop_$2/is;
711 $back =~ s/(target=\"?)_top(\"?)/$1_gsdltop_$2/is;
712 $front =~ s/target=\"?_parent\"?//is;
713 $back =~ s/target=\"?_parent\"?//is;
714
715 return $front . $link . $back if $link =~ m/^\#/s;
716 $link =~ s/\n/ /g;
717
718 # Find file referred to by $link on file system
719 # This is more complicated than it sounds when char encodings
720 # is taken in to account
721 my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
722
723 # href may use '\'s where '/'s should be on Windows
724 $href =~ s/\\/\//g;
725 my ($filename) = $href =~ m/^(?:.*?):(?:\/\/)?(.*)/;
726
727
728 ##### leave all these links alone (they won't be picked up by intermediate
729 ##### pages). I think that's safest when dealing with frames, targets etc.
730 ##### (at least until I think of a better way to do it). Problems occur with
731 ##### mailto links from within small frames, the intermediate page is displayed
732 ##### within that frame and can't be seen. There is still potential for this to
733 ##### happen even with html pages - the solution seems to be to somehow tell
734 ##### the browser from the server side to display the page being sent (i.e.
735 ##### the intermediate page) in the top level window - I'm not sure if that's
736 ##### possible - the following line should probably be deleted if that can be done
737 return $front . $link . $back if $href =~ m/^(mailto|news|gopher|nntp|telnet|javascript):/is;
738
739
740 if (($rl == 0) || ($filename =~ m/$self->{'process_exp'}/) ||
741 ($href =~ m/\/$/) || ($href =~ m/^(mailto|news|gopher|nntp|telnet|javascript):/i)) {
742 &ghtml::urlsafe ($href);
743 return $front . "_httpextlink_&amp;rl=" . $rl . "&amp;href=" . $href . $hash_part . $back;
744 } else {
745 # link is to some other type of file (eg image) so we'll
746 # need to associate that file
747 return $front . $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section) . $back;
748 }
749}
750
751sub add_file {
752 my $self = shift (@_);
753 my ($href, $rl, $hash_part, $base_dir, $doc_obj, $section) = @_;
754 my ($newname);
755
756 my $filename = $href;
757 if ($base_dir eq "") {
758 # remove http:/ thereby leaving one slash at the start
759 $filename =~ s/^[^:]*:\///;
760 }
761 else {
762 # remove http://
763 $filename =~ s/^[^:]*:\/\///;
764 }
765
766 $filename = &util::filename_cat($base_dir, $filename);
767 if (($self->{'use_realistic_book'}) || ($self->{'old_style_HDL'})) {
768 # we are processing a tidytmp file - want paths to be in import
769 $filename =~ s/([\\\/])tidytmp([\\\/])/$1import$2/;
770 }
771 # Replace %XX's in URL with decoded value if required. Note that the filename may include the %XX in some
772 # situations. If the *original* file's name was in URL encoding, the following method will not decode it.
773 my $utf8_filename = $filename;
774 $filename = $self->opt_url_decode($utf8_filename);
775
776 # some special processing if the intended filename was converted to utf8, but
777 # the actual file still needs to be renamed
778 if (!-e $filename) {
779 # try the original filename stored in map
780 my $original_filename = $self->{'utf8_to_original_filename'}->{$filename};
781 if (defined $original_filename && -e $original_filename) {
782 $filename = $original_filename;
783 }
784 }
785
786 my ($ext) = $filename =~ m/(\.[^\.]*)$/;
787
788 if ($rl == 0) {
789 if ((!defined $ext) || ($ext !~ m/$self->{'assoc_files'}/)) {
790 return "_httpextlink_&amp;rl=0&amp;el=prompt&amp;href=" . $href . $hash_part;
791 }
792 else {
793 return "_httpextlink_&amp;rl=0&amp;el=direct&amp;href=" . $href . $hash_part;
794 }
795 }
796
797 if ((!defined $ext) || ($ext !~ m/$self->{'assoc_files'}/)) {
798 return "_httpextlink_&amp;rl=" . $rl . "&amp;href=" . $href . $hash_part;
799 }
800 # add the original image file as a source file
801 if (!$self->{'processing_tmp_files'} ) {
802 $doc_obj->associate_source_file($filename);
803 }
804 if ($self->{'rename_assoc_files'}) {
805 if (defined $self->{'aux_files'}->{$href}) {
806 $newname = $self->{'aux_files'}->{$href}->{'dir_num'} . "/" .
807 $self->{'aux_files'}->{$href}->{'file_num'} . $ext;
808 } else {
809 $newname = $self->{'dir_num'} . "/" . $self->{'file_num'} . $ext;
810 $self->{'aux_files'}->{$href} = {'dir_num' => $self->{'dir_num'}, 'file_num' => $self->{'file_num'}};
811 $self->inc_filecount ();
812 }
813 $doc_obj->associate_file($filename, $newname, undef, $section);
814 return "_httpdocimg_/$newname";
815 } else {
816 if(&unicode::is_url_encoded($utf8_filename)) {
817 # use the possibly-decoded filename instead to avoid double URL encoding
818 ($newname) = $filename =~ m/([^\/\\]*)$/;
819 } else {
820 ($newname) = $utf8_filename =~ m/([^\/\\]*)$/;
821 }
822
823 # Make sure this name uses only ASCII characters.
824 # We use either base64 or URL encoding, as these preserve original encoding
825 $newname = &util::rename_file($newname, $self->{'file_rename_method'});
826
827 $doc_obj->associate_file($filename, $newname, undef, $section);
828
829 # Since the generated image will be URL-encoded to avoid file-system/browser mess-ups
830 # of filenames, URL-encode the additional percent signs of the URL-encoded filename
831 my $newname_url = $newname;
832 $newname_url = &unicode::filename_to_url($newname_url);
833 return "_httpdocimg_/$newname_url";
834 }
835}
836
837
838sub format_link {
839 my $self = shift (@_);
840 my ($link, $base_dir, $file) = @_;
841
842 my ($before_hash, $hash_part) = $link =~ m/^([^\#]*)(\#?.*)$/;
843
844 $hash_part = "" if !defined $hash_part;
845 if (!defined $before_hash || $before_hash !~ m/[\w\.\/]/) {
846 my $outhandle = $self->{'outhandle'};
847 print $outhandle "HTMLPlugin: ERROR - badly formatted tag ignored ($link)\n"
848 if $self->{'verbosity'};
849 return ($link, "", 0);
850 }
851
852 if ($before_hash =~ s@^((?:http|https|ftp|file|mms)://)@@i) {
853 my $type = $1;
854
855 if ($link =~ m/^(http|ftp):/i) {
856 # Turn url (using /) into file name (possibly using \ on windows)
857 my @http_dir_split = split('/', $before_hash);
858 $before_hash = &util::filename_cat(@http_dir_split);
859 }
860
861 $before_hash = $self->eval_dir_dots($before_hash);
862
863 my $linkfilename = &util::filename_cat ($base_dir, $before_hash);
864
865 my $rl = 0;
866 $rl = 1 if (-e $linkfilename);
867
868 # make sure there's a slash on the end if it's a directory
869 if ($before_hash !~ m/\/$/) {
870 $before_hash .= "/" if (-d $linkfilename);
871 }
872 return ($type . $before_hash, $hash_part, $rl);
873
874 } elsif ($link !~ m/^(mailto|news|gopher|nntp|telnet|javascript):/i && $link !~ m/^\//) {
875
876 if ($before_hash =~ s@^/@@ || $before_hash =~ m/\\/) {
877
878 # the first directory will be the domain name if file_is_url
879 # to generate archives, otherwise we'll assume all files are
880 # from the same site and base_dir is the root
881
882 if ($self->{'file_is_url'}) {
883 my @dirs = split /[\/\\]/, $file;
884 my $domname = shift (@dirs);
885 $before_hash = &util::filename_cat($domname, $before_hash);
886 $before_hash =~ s@\\@/@g; # for windows
887 }
888 else
889 {
890 # see if link shares directory with source document
891 # => turn into relative link if this is so!
892
893 if ($ENV{'GSDLOS'} =~ m/^windows/i) {
894 # too difficult doing a pattern match with embedded '\'s...
895 my $win_before_hash=$before_hash;
896 $win_before_hash =~ s@(\\)+@/@g;
897 # $base_dir is already similarly "converted" on windows.
898 if ($win_before_hash =~ s@^$base_dir/@@o) {
899 # if this is true, we removed a prefix
900 $before_hash=$win_before_hash;
901 }
902 }
903 else {
904 # before_hash has lost leading slash by this point,
905 # -> add back in prior to substitution with $base_dir
906 $before_hash = "/$before_hash";
907
908 $before_hash = &util::filename_cat("",$before_hash);
909 $before_hash =~ s@^$base_dir/@@;
910 }
911 }
912 } else {
913 # Turn relative file path into full path
914 my $dirname = &File::Basename::dirname($file);
915 $before_hash = &util::filename_cat($dirname, $before_hash);
916 $before_hash = $self->eval_dir_dots($before_hash);
917 }
918
919 my $linkfilename = &util::filename_cat ($base_dir, $before_hash);
920 # make sure there's a slash on the end if it's a directory
921 if ($before_hash !~ m/\/$/) {
922 $before_hash .= "/" if (-d $linkfilename);
923 }
924 return ("http://" . $before_hash, $hash_part, 1);
925 } else {
926 # mailto, news, nntp, telnet, javascript or gopher link
927 return ($before_hash, "", 0);
928 }
929}
930
931sub extract_first_NNNN_characters {
932 my $self = shift (@_);
933 my ($textref, $doc_obj, $thissection) = @_;
934
935 foreach my $size (split /,/, $self->{'first'}) {
936 my $tmptext = $$textref;
937 # skip to the body
938 $tmptext =~ s/.*<body[^>]*>//i;
939 # remove javascript
940 $tmptext =~ s@<script.*?</script>@ @sig;
941 $tmptext =~ s/<[^>]*>/ /g;
942 $tmptext =~ s/&nbsp;/ /g;
943 $tmptext =~ s/^\s+//;
944 $tmptext =~ s/\s+$//;
945 $tmptext =~ s/\s+/ /gs;
946 $tmptext = &unicode::substr ($tmptext, 0, $size);
947 $tmptext =~ s/\s\S*$/&#8230;/; # adds an ellipse (...)
948 $doc_obj->add_utf8_metadata ($thissection, "First$size", $tmptext);
949 }
950}
951
952
953sub extract_metadata {
954 my $self = shift (@_);
955 my ($textref, $metadata, $doc_obj, $section) = @_;
956 my $outhandle = $self->{'outhandle'};
957 # if we don't want metadata, we may as well not be here ...
958 return if (!defined $self->{'metadata_fields'});
959
960 my $separator = $self->{'metadata_field_separator'};
961 if ($separator eq "") {
962 undef $separator;
963 }
964
965 # metadata fields to extract/save. 'key' is the (lowercase) name of the
966 # html meta, 'value' is the metadata name for greenstone to use
967 my %find_fields = ();
968
969 my %creator_fields = (); # short-cut for lookups
970
971
972 foreach my $field (split /,/, $self->{'metadata_fields'}) {
973 $field =~ s/^\s+//; # remove leading whitespace
974 $field =~ s/\s+$//; # remove trailing whitespace
975
976 # support tag<tagname>
977 if ($field =~ m/^(.*?)\s*<(.*?)>$/) {
978 # "$2" is the user's preferred gs metadata name
979 $find_fields{lc($1)}=$2; # lc = lowercase
980 } else { # no <tagname> for mapping
981 # "$field" is the user's preferred gs metadata name
982 $find_fields{lc($field)}=$field; # lc = lowercase
983 }
984 }
985
986 if (defined $self->{'hunt_creator_metadata'} &&
987 $self->{'hunt_creator_metadata'} == 1 ) {
988 my @extra_fields =
989 (
990 'author',
991 'author.email',
992 'creator',
993 'dc.creator',
994 'dc.creator.corporatename',
995 );
996
997 # add the creator_metadata fields to search for
998 foreach my $field (@extra_fields) {
999 $creator_fields{$field}=0; # add to lookup hash
1000 }
1001 }
1002
1003
1004 # find the header in the html file, which has the meta tags
1005 $$textref =~ m@<head>(.*?)</head>@si;
1006
1007 my $html_header=$1;
1008
1009 # go through every <meta... tag defined in the html and see if it is
1010 # one of the tags we want to match.
1011
1012 # special case for title - we want to remember if its been found
1013 my $found_title = 0;
1014 # this assumes that ">" won't appear. (I don't think it's allowed to...)
1015 $html_header =~ m/^/; # match the start of the string, for \G assertion
1016
1017 while ($html_header =~ m/\G.*?<meta(.*?)>/sig) {
1018 my $metatag=$1;
1019 my ($tag, $value);
1020
1021 # find the tag name
1022 $metatag =~ m/(?:name|http-equiv)\s*=\s*([\"\'])?(.*?)\1/is;
1023 $tag=$2;
1024 # in case they're not using " or ', but they should...
1025 if (! $tag) {
1026 $metatag =~ m/(?:name|http-equiv)\s*=\s*([^\s\>]+)/is;
1027 $tag=$1;
1028 }
1029
1030 if (!defined $tag) {
1031 print $outhandle "HTMLPlugin: can't find NAME in \"$metatag\"\n";
1032 next;
1033 }
1034
1035 # don't need to assign this field if it was passed in from a previous
1036 # (recursive) plugin
1037 if (defined $metadata->{$tag}) {next}
1038
1039 # find the tag content
1040 $metatag =~ m/content\s*=\s*([\"\'])?(.*?)\1/is;
1041 $value=$2;
1042
1043 if (! $value) {
1044 $metatag =~ m/(?:name|http-equiv)\s*=\s*([^\s\>]+)/is;
1045 $value=$1;
1046 }
1047 if (!defined $value) {
1048 print $outhandle "HTMLPlugin: can't find VALUE in \"$metatag\"\n";
1049 next;
1050 }
1051
1052 # clean up and add
1053 $value =~ s/\s+/ /gs;
1054 chomp($value); # remove trailing \n, if any
1055 if (exists $creator_fields{lc($tag)}) {
1056 # map this value onto greenstone's "Creator" metadata
1057 $tag='Creator';
1058 } elsif (!exists $find_fields{lc($tag)}) {
1059 next; # don't want this tag
1060 } else {
1061 # get the user's preferred capitalisation
1062 $tag = $find_fields{lc($tag)};
1063 }
1064 if (lc($tag) eq "title") {
1065 $found_title = 1;
1066 }
1067
1068 if ($self->{'verbosity'} > 2) {
1069 print $outhandle " extracted \"$tag\" metadata \"$value\"\n";
1070 }
1071
1072 if ($tag =~ /\./) {
1073 # there is a . so has a namespace, add ex.
1074 $tag = "ex.$tag";
1075 }
1076 if (defined $separator) {
1077 my @values = split($separator, $value);
1078 foreach my $v (@values) {
1079 $doc_obj->add_utf8_metadata($section, $tag, $v) if $v =~ /\S/;
1080 }
1081 }
1082 else {
1083 $doc_obj->add_utf8_metadata($section, $tag, $value);
1084 }
1085 }
1086
1087 # TITLE: extract the document title
1088 if (exists $find_fields{'title'} && !$found_title) {
1089 # we want a title, and didn't find one in the meta tags
1090 # see if there's a <title> tag
1091 my $title;
1092 my $from = ""; # for debugging output only
1093 if ($html_header =~ m/<title[^>]*>([^<]+)<\/title[^>]*>/is) {
1094 $title = $1;
1095 $from = "<title> tags";
1096 }
1097
1098 if (!defined $title) {
1099 $from = "first 100 chars";
1100 # if no title use first 100 or so characters
1101 $title = $$textref;
1102 $title =~ s/^\xFE\xFF//; # Remove unicode byte order mark
1103 $title =~ s/^.*?<body>//si;
1104 # ignore javascript!
1105 $title =~ s@<script.*?</script>@ @sig;
1106 $title =~ s/<\/([^>]+)><\1>//g; # (eg) </b><b> - no space
1107 $title =~ s/<[^>]*>/ /g; # remove all HTML tags
1108 $title = substr ($title, 0, 100);
1109 $title =~ s/\s\S*$/.../;
1110 }
1111 $title =~ s/<[^>]*>/ /g; # remove html tags
1112 $title =~ s/&nbsp;/ /g;
1113 $title =~ s/(?:&nbsp;|\xc2\xa0)/ /g; # utf-8 for nbsp...
1114 $title =~ s/\s+/ /gs; # collapse multiple spaces
1115 $title =~ s/^\s*//; # remove leading spaces
1116 $title =~ s/\s*$//; # remove trailing spaces
1117
1118 $title =~ s/^$self->{'title_sub'}// if ($self->{'title_sub'});
1119 $title =~ s/^\s+//s; # in case title_sub introduced any...
1120 $doc_obj->add_utf8_metadata ($section, 'Title', $title);
1121 print $outhandle " extracted Title metadata \"$title\" from $from\n"
1122 if ($self->{'verbosity'} > 2);
1123 }
1124
1125 # add FileFormat metadata
1126 $doc_obj->add_metadata($section,"FileFormat", "HTML");
1127
1128 # Special, for metadata names such as tagH1 - extracts
1129 # the text between the first <H1> and </H1> tags into "H1" metadata.
1130
1131 foreach my $field (keys %find_fields) {
1132 if ($field !~ m/^tag([a-z0-9]+)$/i) {next}
1133 my $tag = $1;
1134 if ($$textref =~ m@<$tag[^>]*>(.*?)</$tag[^>]*>@g) {
1135 my $content = $1;
1136 $content =~ s/&nbsp;/ /g;
1137 $content =~ s/<[^>]*>/ /g;
1138 $content =~ s/^\s+//;
1139 $content =~ s/\s+$//;
1140 $content =~ s/\s+/ /gs;
1141 if ($content) {
1142 $tag=$find_fields{"tag$tag"}; # get the user's capitalisation
1143 $tag =~ s/^tag//i;
1144 $doc_obj->add_utf8_metadata ($section, $tag, $content);
1145 print $outhandle " extracted \"$tag\" metadata \"$content\"\n"
1146 if ($self->{'verbosity'} > 2);
1147 }
1148 }
1149 }
1150}
1151
1152
1153# evaluate any "../" to next directory up
1154# evaluate any "./" as here
1155sub eval_dir_dots {
1156 my $self = shift (@_);
1157 my ($filename) = @_;
1158 my $dirsep_os = &util::get_os_dirsep();
1159 my @dirsep = split(/$dirsep_os/,$filename);
1160
1161 my @eval_dirs = ();
1162 foreach my $d (@dirsep) {
1163 if ($d eq "..") {
1164 pop(@eval_dirs);
1165
1166 } elsif ($d eq ".") {
1167 # do nothing!
1168
1169 } else {
1170 push(@eval_dirs,$d);
1171 }
1172 }
1173
1174 # Need to fiddle with number of elements in @eval_dirs if the
1175 # first one is the empty string. This is because of a
1176 # modification to util::filename_cat that supresses the addition
1177 # of a leading '/' character (or \ if windows) (intended to help
1178 # filename cat with relative paths) if the first entry in the
1179 # array is the empty string. Making the array start with *two*
1180 # empty strings is a way to defeat this "smart" option.
1181 #
1182 if (scalar(@eval_dirs) > 0) {
1183 if ($eval_dirs[0] eq ""){
1184 unshift(@eval_dirs,"");
1185 }
1186 }
1187
1188 my $evaluated_filename = (scalar @eval_dirs > 0) ? &util::filename_cat(@eval_dirs) : "";
1189 return $evaluated_filename;
1190}
1191
1192sub replace_usemap_links {
1193 my $self = shift (@_);
1194 my ($front, $link, $back) = @_;
1195
1196 # remove quotes from link at start and end if necessary
1197 if ($link=~/^[\"\']/) {
1198 $link=~s/^[\"\']//;
1199 $link=~s/[\"\']$//;
1200 $front.='"';
1201 $back="\"$back";
1202 }
1203
1204 $link =~ s/^\.\///;
1205 return $front . $link . $back;
1206}
1207
1208sub inc_filecount {
1209 my $self = shift (@_);
1210
1211 if ($self->{'file_num'} == 1000) {
1212 $self->{'dir_num'} ++;
1213 $self->{'file_num'} = 0;
1214 } else {
1215 $self->{'file_num'} ++;
1216 }
1217}
1218
1219
1220# Extend read_file so that strings like &eacute; are
1221# converted to UTF8 internally.
1222#
1223# We don't convert &lt; or &gt; or &amp; or &quot; in case
1224# they interfere with the GML files
1225
1226sub read_file {
1227 my $self = shift(@_);
1228 my ($filename, $encoding, $language, $textref) = @_;
1229
1230 $self->SUPER::read_file($filename, $encoding, $language, $textref);
1231
1232 # Convert entities to their UTF8 equivalents
1233 $$textref =~ s/&(lt|gt|amp|quot|nbsp);/&z$1;/go;
1234 $$textref =~ s/&([^;]+);/&ghtml::getcharequiv($1,1,1)/gseo;
1235 $$textref =~ s/&z(lt|gt|amp|quot|nbsp);/&$1;/go;
1236
1237}
1238
1239sub HB_read_html_file {
1240 my $self = shift (@_);
1241 my ($htmlfile, $text) = @_;
1242
1243 # load in the file
1244 if (!open (FILE, $htmlfile)) {
1245 print STDERR "ERROR - could not open $htmlfile\n";
1246 return;
1247 }
1248
1249 my $foundbody = 0;
1250 $self->HB_gettext (\$foundbody, $text, "FILE");
1251 close FILE;
1252
1253 # just in case there was no <body> tag
1254 if (!$foundbody) {
1255 $foundbody = 1;
1256 open (FILE, $htmlfile) || return;
1257 $self->HB_gettext (\$foundbody, $text, "FILE");
1258 close FILE;
1259 }
1260 # text is in utf8
1261}
1262
1263# converts the text to utf8, as ghtml does that for &eacute; etc.
1264sub HB_gettext {
1265 my $self = shift (@_);
1266 my ($foundbody, $text, $handle) = @_;
1267
1268 my $line = "";
1269 while (defined ($line = <$handle>)) {
1270 # look for body tag
1271 if (!$$foundbody) {
1272 if ($line =~ s/^.*<body[^>]*>//i) {
1273 $$foundbody = 1;
1274 } else {
1275 next;
1276 }
1277 }
1278
1279 # check for symbol fonts
1280 if ($line =~ m/<font [^>]*?face\s*=\s*\"?(\w+)\"?/i) {
1281 my $font = $1;
1282 print STDERR "HBPlug::HB_gettext - warning removed font $font\n"
1283 if ($font !~ m/^arial$/i);
1284 }
1285
1286 $$text .= $line;
1287 }
1288
1289 if ($self->{'input_encoding'} eq "iso_8859_1") {
1290 # convert to utf-8
1291 $$text=&unicode::unicode2utf8(&unicode::convert2unicode("iso_8859_1", $text));
1292 }
1293 # convert any alphanumeric character entities to their utf-8
1294 # equivalent for indexing purposes
1295 #&ghtml::convertcharentities ($$text);
1296
1297 $$text =~ s/\s+/ /g; # remove \n's
1298
1299 # At this point $$text is a binary byte string
1300 # => turn it into a Unicode aware string, so full
1301 # Unicode aware pattern matching can be used.
1302 # For instance: 's/\x{0101}//g' or '[[:upper:]]'
1303 #
1304
1305 $$text = decode("utf8",$$text);
1306}
1307
1308sub HB_clean_section {
1309 my $self = shift (@_);
1310 my ($section) = @_;
1311
1312 # remove tags without a starting tag from the section
1313 my ($tag, $tagstart);
1314 while ($section =~ m/<\/([^>]{1,10})>/) {
1315 $tag = $1;
1316 $tagstart = index($section, "<$tag");
1317 last if (($tagstart >= 0) && ($tagstart < index($section, "<\/$tag")));
1318 $section =~ s/<\/$tag>//;
1319 }
1320
1321 # remove extra paragraph tags
1322 while ($section =~ s/<p\b[^>]*>\s*<p\b/<p/ig) {}
1323
1324 # remove extra stuff at the end of the section
1325 while ($section =~ s/(<u>|<i>|<b>|<p\b[^>]*>|&nbsp;|\s)$//i) {}
1326
1327 # add a newline at the beginning of each paragraph
1328 $section =~ s/(.)\s*<p\b/$1\n\n<p/gi;
1329
1330 # add a newline every 80 characters at a word boundary
1331 # Note: this regular expression puts a line feed before
1332 # the last word in each section, even when it is not
1333 # needed.
1334 $section =~ s/(.{1,80})\s/$1\n/g;
1335
1336 # fix up the image links
1337 $section =~ s/<img[^>]*?src=\"?([^\">]+)\"?[^>]*>/
1338 <center><img src=\"$1\" \/><\/center><br\/>/ig;
1339 $section =~ s/&lt;&lt;I&gt;&gt;\s*([^\.]+\.(png|jpg|gif))/
1340 <center><img src=\"$1\" \/><\/center><br\/>/ig;
1341
1342 return $section;
1343}
1344
1345# Will convert the oldHDL format to the new HDL format (using the Section tag)
1346sub convert_to_newHDLformat
1347{
1348 my $self = shift (@_);
1349 my ($file,$cnfile) = @_;
1350 my $input_filename = $file;
1351 my $tmp_filename = $cnfile;
1352
1353 # write HTML tmp file with new HDL format
1354 open (PROD, ">$tmp_filename") || die("Error Writing to File: $tmp_filename $!");
1355
1356 # read in the file and do basic html cleaning (removing header etc)
1357 my $html = "";
1358 $self->HB_read_html_file ($input_filename, \$html);
1359
1360 # process the file one section at a time
1361 my $curtoclevel = 1;
1362 my $firstsection = 1;
1363 my $toclevel = 0;
1364 while (length ($html) > 0) {
1365 if ($html =~ s/^.*?(?:<p\b[^>]*>)?((<b>|<i>|<u>|\s)*)&lt;&lt;TOC(\d+)&gt;&gt;\s*(.*?)<p\b/<p/i) {
1366 $toclevel = $3;
1367 my $title = $4;
1368 my $sectiontext = "";
1369 if ($html =~ s/^(.*?)((?:<p\b[^>]*>)?((<b>|<i>|<u>|\s)*)&lt;&lt;TOC\d+&gt;&gt;)/$2/i) {
1370 $sectiontext = $1;
1371 } else {
1372 $sectiontext = $html;
1373 $html = "";
1374 }
1375
1376 # remove tags and extra spaces from the title
1377 $title =~ s/<\/?[^>]+>//g;
1378 $title =~ s/^\s+|\s+$//g;
1379
1380 # close any sections below the current level and
1381 # create a new section (special case for the firstsection)
1382 print PROD "<!--\n";
1383 while (($curtoclevel > $toclevel) ||
1384 (!$firstsection && $curtoclevel == $toclevel)) {
1385 $curtoclevel--;
1386 print PROD "</Section>\n";
1387 }
1388 if ($curtoclevel+1 < $toclevel) {
1389 print STDERR "WARNING - jump in toc levels in $input_filename " .
1390 "from $curtoclevel to $toclevel\n";
1391 }
1392 while ($curtoclevel < $toclevel) {
1393 $curtoclevel++;
1394 }
1395
1396 if ($curtoclevel == 1) {
1397 # add the header tag
1398 print PROD "-->\n";
1399 print PROD "<HTML>\n<HEAD>\n<TITLE>$title</TITLE>\n</HEAD>\n<BODY>\n";
1400 print PROD "<!--\n";
1401 }
1402
1403 print PROD "<Section>\n\t<Description>\n\t\t<Metadata name=\"Title\">$title</Metadata>\n\t</Description>\n";
1404
1405 print PROD "-->\n";
1406
1407 # clean up the section html
1408 $sectiontext = $self->HB_clean_section($sectiontext);
1409
1410 print PROD "$sectiontext\n";
1411
1412 } else {
1413 print STDERR "WARNING - leftover text\n" , $self->shorten($html),
1414 "\nin $input_filename\n";
1415 last;
1416 }
1417 $firstsection = 0;
1418 }
1419
1420 print PROD "<!--\n";
1421 while ($curtoclevel > 0) {
1422 $curtoclevel--;
1423 print PROD "</Section>\n";
1424 }
1425 print PROD "-->\n";
1426
1427 close (PROD) || die("Error Closing File: $tmp_filename $!");
1428
1429 return $tmp_filename;
1430}
1431
1432sub shorten {
1433 my $self = shift (@_);
1434 my ($text) = @_;
1435
1436 return "\"$text\"" if (length($text) < 100);
1437
1438 return "\"" . substr ($text, 0, 50) . "\" ... \"" .
1439 substr ($text, length($text)-50) . "\"";
1440}
1441
1442sub convert_tidy_or_oldHDL_file
1443{
1444 my $self = shift (@_);
1445 my ($file) = @_;
1446 my $input_filename = $file;
1447
1448 if (-d $input_filename)
1449 {
1450 return $input_filename;
1451 }
1452
1453 # get the input filename
1454 my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
1455 my $base_dirname = $dirname;
1456 $suffix = lc($suffix);
1457
1458 # derive tmp filename from input filename
1459 # Remove any white space from filename -- no risk of name collision, and
1460 # makes later conversion by utils simpler. Leave spaces in path...
1461 # tidy up the filename with space, dot, hyphen between
1462 $tailname =~ s/\s+//g;
1463 $tailname =~ s/\.+//g;
1464 $tailname =~ s/\-+//g;
1465 # convert to utf-8 otherwise we have problems with the doc.xml file
1466 # later on
1467 &unicode::ensure_utf8(\$tailname);
1468
1469 # softlink to collection tmp dir
1470 my $tmp_dirname = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "tidytmp");
1471 &util::mk_dir($tmp_dirname) if (!-e $tmp_dirname);
1472
1473 my $test_dirname = "";
1474 my $f_separator = &util::get_os_dirsep();
1475
1476 if ($dirname =~ m/import$f_separator/)
1477 {
1478 $test_dirname = $'; #'
1479
1480 #print STDERR "init $'\n";
1481
1482 while ($test_dirname =~ m/[$f_separator]/)
1483 {
1484 my $folderdirname = $`;
1485 $tmp_dirname = &util::filename_cat($tmp_dirname,$folderdirname);
1486 &util::mk_dir($tmp_dirname) if (!-e $tmp_dirname);
1487 $test_dirname = $'; #'
1488 }
1489 }
1490
1491 my $tmp_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix");
1492
1493 # tidy or convert the input file if it is a HTML-like file or it is accepted by the process_exp
1494 if (($suffix eq ".htm") || ($suffix eq ".html") || ($suffix eq ".shtml"))
1495 {
1496 #convert the input file to a new style HDL
1497 my $hdl_output_filename = $input_filename;
1498 if ($self->{'old_style_HDL'})
1499 {
1500 $hdl_output_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix");
1501 $hdl_output_filename = $self->convert_to_newHDLformat($input_filename,$hdl_output_filename);
1502 }
1503
1504 #just for checking copy all other file from the base dir to tmp dir if it is not exists
1505 opendir(DIR,$base_dirname) or die "Can't open base directory : $base_dirname!";
1506 my @files = grep {!/^\.+$/} readdir(DIR);
1507 close(DIR);
1508
1509 foreach my $file (@files)
1510 {
1511 my $src_file = &util::filename_cat($base_dirname,$file);
1512 my $dest_file = &util::filename_cat($tmp_dirname,$file);
1513 if ((!-e $dest_file) && (!-d $src_file))
1514 {
1515 # just copy the original file back to the tmp directory
1516 copy($src_file,$dest_file) or die "Can't copy file $src_file to $dest_file $!";
1517 }
1518 }
1519
1520 # tidy the input file
1521 my $tidy_output_filename = $hdl_output_filename;
1522 if ($self->{'use_realistic_book'})
1523 {
1524 $tidy_output_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix");
1525 $tidy_output_filename = $self->tmp_tidy_file($hdl_output_filename,$tidy_output_filename);
1526 }
1527 $tmp_filename = $tidy_output_filename;
1528 }
1529 else
1530 {
1531 if (!-e $tmp_filename)
1532 {
1533 # just copy the original file back to the tmp directory
1534 copy($input_filename,$tmp_filename) or die "Can't copy file $input_filename to $tmp_filename $!";
1535 }
1536 }
1537
1538 return $tmp_filename;
1539}
1540
1541
1542# Will make the html input file as a proper XML file with removed font tag and
1543# image size added to the img tag.
1544# The tidying process takes place in a collection specific 'tmp' directory so
1545# that we don't accidentally damage the input.
1546sub tmp_tidy_file
1547{
1548 my $self = shift (@_);
1549 my ($file,$cnfile) = @_;
1550 my $input_filename = $file;
1551 my $tmp_filename = $cnfile;
1552
1553 # get the input filename
1554 my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
1555
1556 require HTML::TokeParser::Simple;
1557
1558 # create HTML parser to decode the input file
1559 my $parser = HTML::TokeParser::Simple->new($input_filename);
1560
1561 # write HTML tmp file without the font tag and image size are added to the img tag
1562 open (PROD, ">$tmp_filename") || die("Error Writing to File: $tmp_filename $!");
1563 while (my $token = $parser->get_token())
1564 {
1565 # is it an img tag
1566 if ($token->is_start_tag('img'))
1567 {
1568 # get the attributes
1569 my $attr = $token->return_attr;
1570
1571 # get the full path to the image
1572 my $img_file = &util::filename_cat($dirname,$attr->{src});
1573
1574 # set the width and height attribute
1575 ($attr->{width}, $attr->{height}) = imgsize($img_file);
1576
1577 # recreate the tag
1578 print PROD "<img";
1579 print PROD map { qq { $_="$attr->{$_}"} } keys %$attr;
1580 print PROD ">";
1581 }
1582 # is it a font tag
1583 else
1584 {
1585 if (($token->is_start_tag('font')) || ($token->is_end_tag('font')))
1586 {
1587 # remove font tag
1588 print PROD "";
1589 }
1590 else
1591 {
1592 # print without changes
1593 print PROD $token->as_is;
1594 }
1595 }
1596 }
1597 close (PROD) || die("Error Closing File: $tmp_filename $!");
1598
1599 # run html-tidy on the tmp file to make it a proper XML file
1600
1601 my $outhandle = $self->{'outhandle'};
1602 print $outhandle "Converting HTML to be XML compliant:\n";
1603
1604 my $tidy_cmd = "tidy";
1605 $tidy_cmd .= " -q" if ($self->{'verbosity'} <= 2);
1606 $tidy_cmd .= " -raw -wrap 0 -asxml \"$tmp_filename\"";
1607 if ($self->{'verbosity'} <= 2) {
1608 if ($ENV{'GSDLOS'} =~ m/^windows/i) {
1609 $tidy_cmd .= " 2>nul";
1610 }
1611 else {
1612 $tidy_cmd .= " 2>/dev/null";
1613 }
1614 print $outhandle " => $tidy_cmd\n";
1615 }
1616
1617 my $tidyfile = `$tidy_cmd`;
1618
1619 # write result back to the tmp file
1620 open (PROD, ">$tmp_filename") || die("Error Writing to File: $tmp_filename $!");
1621 print PROD $tidyfile;
1622 close (PROD) || die("Error Closing File: $tmp_filename $!");
1623
1624 # return the output filename
1625 return $tmp_filename;
1626}
1627
1628sub associate_cover_image
1629{
1630 my $self = shift(@_);
1631 my ($doc_obj, $filename) = @_;
1632 if (($self->{'use_realistic_book'}) || ($self->{'old_style_HDL'}))
1633 {
1634 # we will have cover image in tidytmp, but want it from import
1635 $filename =~ s/([\\\/])tidytmp([\\\/])/$1import$2/;
1636 }
1637 $self->SUPER::associate_cover_image($doc_obj, $filename);
1638}
1639
1640
16411;
Note: See TracBrowser for help on using the repository browser.