source: main/trunk/greenstone2/perllib/plugins/HTMLPlugin.pm@23363

Last change on this file since 23363 was 23363, checked in by davidb, 13 years ago

Plugin code upgrade to support Greenstone working with filenames under Windows when they go beyond Latin-1 and start turning up in their DOS abbreviated form (e.g. Test~1.txt)

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 58.1 KB
1###########################################################################
2#
3# HTMLPlugin.pm -- basic html plugin
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 1999 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27#
28# Note that this plugin handles frames only in a very simple way,
29# i.e. each frame is treated as a separate document. This means
30# search results will contain links to individual frames rather
31# than linking to the top-level frameset.
32# There may also be some problems caused by the _parent target
33# (it's removed by this plugin).
34#
35
36package HTMLPlugin;
37
38use Encode;
39
40use ReadTextFile;
41use HBPlugin;
42use ghtml;
43use unicode;
44use util;
45use XMLParser;
46
47use Image::Size;
48use File::Copy;
49
50sub BEGIN {
51 @HTMLPlugin::ISA = ('ReadTextFile', 'HBPlugin');
52}
53
54use strict; # every perl program should have this!
55no strict 'refs'; # make an exception so we can use variables as filehandles
56
57my $arguments =
58 [ { 'name' => "process_exp",
59 'desc' => "{BasePlugin.process_exp}",
60 'type' => "regexp",
61 'deft' => &get_default_process_exp() },
62 { 'name' => "block_exp",
63 'desc' => "{BasePlugin.block_exp}",
64 'type' => 'regexp',
65 'deft' => &get_default_block_exp() },
66 { 'name' => "nolinks",
67 'desc' => "{HTMLPlugin.nolinks}",
68 'type' => "flag" },
69 { 'name' => "keep_head",
70 'desc' => "{HTMLPlugin.keep_head}",
71 'type' => "flag" },
72 { 'name' => "no_metadata",
73 'desc' => "{HTMLPlugin.no_metadata}",
74 'type' => "flag" },
75 { 'name' => "metadata_fields",
76 'desc' => "{HTMLPlugin.metadata_fields}",
77 'type' => "string",
78 'deft' => "Title" },
79 { 'name' => "metadata_field_separator",
80 'desc' => "{HTMLPlugin.metadata_field_separator}",
81 'type' => "string",
82 'deft' => "" },
83 { 'name' => "hunt_creator_metadata",
84 'desc' => "{HTMLPlugin.hunt_creator_metadata}",
85 'type' => "flag" },
86 { 'name' => "file_is_url",
87 'desc' => "{HTMLPlugin.file_is_url}",
88 'type' => "flag" },
89 { 'name' => "assoc_files",
90 'desc' => "{HTMLPlugin.assoc_files}",
91 'type' => "regexp",
92 'deft' => &get_default_block_exp() },
93 { 'name' => "rename_assoc_files",
94 'desc' => "{HTMLPlugin.rename_assoc_files}",
95 'type' => "flag" },
96 { 'name' => "title_sub",
97 'desc' => "{HTMLPlugin.title_sub}",
98 'type' => "string",
99 'deft' => "" },
100 { 'name' => "description_tags",
101 'desc' => "{HTMLPlugin.description_tags}",
102 'type' => "flag" },
103 # retain this for backward compatibility (w3mir option was replaced by
104 # file_is_url)
105 { 'name' => "w3mir",
106# 'desc' => "{HTMLPlugin.w3mir}",
107 'type' => "flag",
108 'hiddengli' => "yes"},
109 { 'name' => "no_strip_metadata_html",
110 'desc' => "{HTMLPlugin.no_strip_metadata_html}",
111 'type' => "string",
112 'deft' => "",
113 'reqd' => "no"},
114 { 'name' => "sectionalise_using_h_tags",
115 'desc' => "{HTMLPlugin.sectionalise_using_h_tags}",
116 'type' => "flag" },
117 { 'name' => "use_realistic_book",
118 'desc' => "{HTMLPlugin.tidy_html}",
119 'type' => "flag"},
120 { 'name' => "old_style_HDL",
121 'desc' => "{HTMLPlugin.old_style_HDL}",
122 'type' => "flag"},
123 {'name' => "processing_tmp_files",
124 'desc' => "{BasePlugin.processing_tmp_files}",
125 'type' => "flag",
126 'hiddengli' => "yes"}
127 ];
128
129my $options = { 'name' => "HTMLPlugin",
130 'desc' => "{HTMLPlugin.desc}",
131 'abstract' => "no",
132 'inherits' => "yes",
133 'args' => $arguments };
134
135
136sub new {
137 my ($class) = shift (@_);
138 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
139 push(@$pluginlist, $class);
140
141 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
142 push(@{$hashArgOptLists->{"OptList"}},$options);
143
144
145 my $self = new ReadTextFile($pluginlist,$inputargs,$hashArgOptLists);
146
147 if ($self->{'w3mir'}) {
148 $self->{'file_is_url'} = 1;
149 }
150 $self->{'aux_files'} = {};
151 $self->{'dir_num'} = 0;
152 $self->{'file_num'} = 0;
153
154 return bless $self, $class;
155}
156
157# may want to use (?i)\.(gif|jpe?g|jpe|png|css|js(?:@.*)?)$
158# if have eg <script language="javascript" src="img/lib.js@123">
159# blocking is now done by reading through the file and recording all the
160# images and other files
161sub get_default_block_exp {
162 my $self = shift (@_);
163
164 #return q^(?i)\.(gif|jpe?g|jpe|jpg|png|css)$^;
165 return "";
166}
167
168sub get_default_process_exp {
169 my $self = shift (@_);
170
171 # the last option is an attempt to encode the concept of an html query ...
172 return q^(?i)(\.html?|\.shtml|\.shm|\.asp|\.php\d?|\.cgi|.+\?.+=.*)$^;
173}
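
# Illustrative sketch (not part of the plugin): which hypothetical filenames
# the default process_exp returned above would accept.
#
#   my $re = &HTMLPlugin::get_default_process_exp();
#   # "index.html", "page.shtml" and "search.cgi?q=kiwi" all match $re
#   # (the last one via the trailing "query" alternative); "notes.txt" does not.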
174
175sub store_block_files
176{
177 my $self =shift (@_);
178 my ($filename_full_path, $block_hash) = @_;
179
180 my $html_fname = $filename_full_path;
181 my @file_blocks;
182
183 my ($language, $content_encoding) = $self->textcat_get_language_encoding ($filename_full_path);
184 $self->{'store_content_encoding'}->{$filename_full_path} = $content_encoding;
185
186 # read in the file as raw bytes (no decoding is applied at this stage)
187 my $raw_text = "";
188 $self->read_file_no_decoding($filename_full_path, \$raw_text);
189
190 my $textref = \$raw_text;
191 my $opencom = '(?:<!--|&lt;!(?:&mdash;|&#151;|--))';
192 my $closecom = '(?:-->|(?:&mdash;|&#151;|--)&gt;)';
193 $$textref =~ s/$opencom(.*?)$closecom//gs;
194
195 # Convert entities to their UTF8 equivalents
196 $$textref =~ s/&(lt|gt|amp|quot|nbsp);/&z$1;/go;
197 $$textref =~ s/&([^;]+);/&ghtml::getcharequiv($1,1,0)/gseo; # on this occasion, want it left as utf8
198 $$textref =~ s/&z(lt|gt|amp|quot|nbsp);/&$1;/go;
199
200 my $attval = "\\\"[^\\\"]+\\\"|[^\\s>]+";
201 my @img_matches = ($$textref =~ m/<img[^>]*?src\s*=\s*($attval)[^>]*>/igs);
202 my @usemap_matches = ($$textref =~ m/<img[^>]*?usemap\s*=\s*($attval)[^>]*>/igs);
203 my @link_matches = ($$textref =~ m/<link[^>]*?href\s*=\s*($attval)[^>]*>/igs);
204 my @embed_matches = ($$textref =~ m/<embed[^>]*?src\s*=\s*($attval)[^>]*>/igs);
205 my @tabbg_matches = ($$textref =~ m/<(?:body|table|tr|td)[^>]*?background\s*=\s*($attval)[^>]*>/igs);
206 my @script_matches = ($$textref =~ m/<script[^>]*?src\s*=\s*($attval)[^>]*>/igs);
207
208 if(!defined $self->{'utf8_to_original_filename'}) {
209 # maps from utf8 converted link name -> original filename referred to by (possibly URL-encoded) src url
210 $self->{'utf8_to_original_filename'} = {};
211 }
212
213 foreach my $link (@img_matches, @usemap_matches, @link_matches, @embed_matches, @tabbg_matches, @script_matches) {
214
215 # remove quotes from link at start and end if necessary
216 if ($link =~ m/^\"/) {
217 $link =~ s/^\"//;
218 $link =~ s/\"$//;
219 }
220
221 $link =~ s/\#.*$//s; # remove any anchor names, e.g. foo.html#name becomes foo.html
222 # some links may just be anchor names
223 next unless ($link =~ /\S+/);
224
225 if ($link !~ m@^/@ && $link !~ m/^([A-Z]:?)\\/) {
226 # Turn relative file path into full path
227 my $dirname = &File::Basename::dirname($filename_full_path);
228 $link = &util::filename_cat($dirname, $link);
229 }
230 $link = $self->eval_dir_dots($link);
231
232 # this is the actual filename on the filesystem (that the link refers to)
233 my $url_original_filename = $self->opt_url_decode($link);
234
235 # Convert the url_original_filename into its utf8 version. Store the utf8 link along with the url_original_filename
236 my $utf8_link = "";
237 $self->decode_text($link,$content_encoding,$language,\$utf8_link);
238
239 $self->{'utf8_to_original_filename'}->{$utf8_link} = $url_original_filename;
240# print STDERR "**** utf8_encoded_link to original src filename:\n\t$utf8_link\n\t".$self->{'utf8_to_original_filename'}->{$utf8_link}."\n";
241
242 if ($url_original_filename ne $utf8_link) {
243 my $outhandle = $self->{'outhandle'};
244
245 print $outhandle "URL Encoding $url_original_filename\n";
246 print $outhandle " ->$utf8_link\n";
247 }
248
249
250 my $unicode_url_original_filename = decode("utf8",$url_original_filename);
251
252## print STDERR "*****!!! Blocking url original filename = $unicode_url_original_filename\n";
253
254 # Allow for both the raw-byte (UTF-8) and Unicode-string versions of the filename
255 $block_hash->{'file_blocks'}->{$url_original_filename} = 1;
256 $block_hash->{'file_blocks'}->{$unicode_url_original_filename} = 1;
257 }
258}
259
260# Given a filename in any encoding, will URL decode it to get back the original filename
261# in the original encoding. Because this method is intended to work out the *original*
262# filename, it does not URL decode any filename if a file by the name of the *URL-encoded*
263# string already exists in the local folder.
264#
265# Is the following still true??
266# Return the original filename corresponding to the parameter URL-encoded filename, and
267# a decoded flag that is set to true iff URL-decoding had to be applied.
268sub opt_url_decode {
269 my $self = shift (@_);
270 my ($link) = @_;
271
272 # Replace %XX's in URL with decoded value if required.
273 # Note that the filename may include the %XX in some situations
274 if ($link =~ m/\%[A-F0-9]{2}/i) {
275 if (!-e $link) {
276 $link = &unicode::url_decode($link);
277 }
278 }
279
280 return $link;
281}
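
# Illustrative sketch (not part of the plugin): how opt_url_decode behaves
# for a hypothetical link.
#
#   my $link = "images/caf%C3%A9.png";
#   my $resolved = $self->opt_url_decode($link);
#   # If a file literally named "images/caf%C3%A9.png" exists on disk, the
#   # link is returned untouched; otherwise the %XX sequences are decoded,
#   # giving the raw-byte name "images/café.png" (UTF-8).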
282
283sub read_into_doc_obj
284{
285 my $self = shift (@_);
286 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
287
288 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
289
290 # Lookup content_encoding worked out in file_block pass for this file
291 # Store it under the local name 'content_encoding' so it's nice and
292 # easy to access
293 $self->{'content_encoding'} = $self->{'store_content_encoding'}->{$filename_full_path};
294
295 # get the input file
296 my $input_filename = $file;
297 my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
298 $suffix = lc($suffix);
299 my $tidy_filename;
300 if (($self->{'use_realistic_book'}) || ($self->{'old_style_HDL'}))
301 {
302 # because the document has to be sectionalized, set the description tags
303 $self->{'description_tags'} = 1;
304
305 # set the file to be tidied
306 $input_filename = &util::filename_cat($base_dir,$file) if $base_dir =~ m/\w/;
307
308 # get the tidied file
309 #my $tidy_filename = $self->tmp_tidy_file($input_filename);
310 $tidy_filename = $self->convert_tidy_or_oldHDL_file($input_filename);
311
312 # derive tmp filename from input filename
313 my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($tidy_filename, "\\.[^\\.]+\$");
314
315 # set the new input file and base_dir to be from the tidied file
316 $file = "$tailname$suffix";
317 $base_dir = $dirname;
318 }
319
320 # call the parent read_into_doc_obj
321 my ($process_status,$doc_obj) = $self->SUPER::read_into_doc_obj($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli);
322 if (($self->{'use_realistic_book'}) || ($self->{'old_style_HDL'}))
323 {
324 # now we need to reset the filenames in the doc obj so that the converted filenames are not used
325 my $collect_file = &util::filename_within_collection($filename_full_path);
326 $doc_obj->set_source_filename ($collect_file, $self->{'file_rename_method'});
327 ## set_source_filename does not set the doc_obj source_path which is used in archives dbs for incremental
328 # build. So set it manually.
329 $doc_obj->set_source_path($filename_full_path);
330 my $collect_conv_file = &util::filename_within_collection($tidy_filename);
331 $doc_obj->set_converted_filename($collect_conv_file);
332
333 my $plugin_filename_encoding = $self->{'filename_encoding'};
334 my $filename_encoding = $self->deduce_filename_encoding($file,$metadata,$plugin_filename_encoding);
335 $self->set_Source_metadata($doc_obj, $filename_full_path, $filename_encoding);
336 }
337
338 delete $self->{'store_content_encoding'}->{$filename_full_path};
339 $self->{'content_encoding'} = undef;
340
341 return ($process_status,$doc_obj);
342}
343
344# do plugin specific processing of doc_obj
345sub process {
346 my $self = shift (@_);
347 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
348 my $outhandle = $self->{'outhandle'};
349
350 if ($ENV{'GSDLOS'} =~ m/^windows/i) {
351 # this makes life so much easier... perl can cope with unix-style '/'s.
352 $base_dir =~ s@(\\)+@/@g;
353 $file =~ s@(\\)+@/@g;
354 }
355
356 # reset per-doc stuff...
357 $self->{'aux_files'} = {};
358 $self->{'dir_num'} = 0;
359 $self->{'file_num'} = 0;
360
361 # process an HTML file where sections are divided by headings tags (H1, H2 ...)
362 # you can also include metadata in the format (X can be any number)
363 # <hX>Title<!--gsdl-metadata
364 # <Metadata name="name1">value1</Metadata>
365 # ...
366 # <Metadata name="nameN">valueN</Metadata>
367 #--></hX>
368 if ($self->{'sectionalise_using_h_tags'}) {
369 # description_tags should always be activated because we convert headings to description tags
370 $self->{'description_tags'} = 1;
371
372 my $arrSections = [];
373 $$textref =~ s/<h([0-9]+)[^>]*>(.*?)<\/h[0-9]+>/$self->process_heading($1, $2, $arrSections, $file)/isge;
374
375 if (scalar(@$arrSections)) {
376 my $strMetadata = $self->update_section_data($arrSections, -1);
377 if (length($strMetadata)) {
378 $strMetadata = '<!--' . $strMetadata . "\n-->\n</body>";
379 $$textref =~ s/<\/body>/$strMetadata/ig;
380 }
381 }
382 }
383
384 my $cursection = $doc_obj->get_top_section();
385
386 $self->extract_metadata ($textref, $metadata, $doc_obj, $cursection)
387 unless $self->{'no_metadata'} || $self->{'description_tags'};
388
389 # Store URL for page as metadata - this can be used for an
390 # altavista style search interface. The URL won't be valid
391 # unless the file structure contains the domain name (i.e.
392 # like when w3mir is used to download a website).
393
394 # URL metadata (even if invalid) is used to support internal
395 # links, so even if 'file_is_url' is off, we still need to store the info
396
397 my ($tailname,$dirname) = &File::Basename::fileparse($file);
398
399# my $utf8_file = $self->filename_to_utf8_metadata($file);
400# $utf8_file =~ s/&\#095;/_/g;
401 my $utf8_file = &unicode::raw_filename_to_url_encoded($tailname);
402
403# if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
404# print STDERR "***!! file = $file\n";
405# print STDERR "***!! utf8_file = $utf8_file\n";
406# }
407
408
409 my $web_url = "http://";
410 if(defined $dirname) { # local directory
411 # Check for "ftp" in the domain name of the directory
412 # structure to determine if this URL should be a ftp:// URL
413 # This check is not infallible, but better than omitting the
414 # check, which would cause all files downloaded from ftp sites
415 # via mirroring with wget to have potentially erroneous http:// URLs
416 # assigned in their metadata
417 if ($dirname =~ /^[^\/]*ftp/i)
418 {
419 $web_url = "ftp://";
420 }
421 $dirname = $self->eval_dir_dots($dirname);
422 $dirname .= &util::get_dirsep() if $dirname ne ""; # if there's a directory, it should end in a "/"
423 $web_url = $web_url.$dirname.$utf8_file;
424 } else {
425 $web_url = $web_url.$utf8_file;
426 }
427 $web_url =~ s/\\/\//g;
428 $doc_obj->add_utf8_metadata($cursection, "URL", $web_url);
429
430 if ($self->{'file_is_url'}) {
431 $doc_obj->add_metadata($cursection, "weblink", "<a href=\"$web_url\">");
432 $doc_obj->add_metadata($cursection, "webicon", "_iconworld_");
433 $doc_obj->add_metadata($cursection, "/weblink", "</a>");
434 }
435
436 if ($self->{'description_tags'}) {
437 # remove the html header - note that doing this here means any
438 # sections defined within the header will be lost (so all <Section>
439 # tags must appear within the body of the HTML)
440 my ($head_keep) = ($$textref =~ m/^(.*?)<body[^>]*>/is);
441
442 $$textref =~ s/^.*?<body[^>]*>//is;
443 $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
444
445 my $opencom = '(?:<!--|&lt;!(?:&mdash;|&#151;|--))';
446 my $closecom = '(?:-->|(?:&mdash;|&#151;|--)&gt;)';
447
448 my $lt = '(?:<|&lt;)';
449 my $gt = '(?:>|&gt;)';
450 my $quot = '(?:"|&quot;|&rdquo;|&ldquo;)';
451
452 my $dont_strip = '';
453 if ($self->{'no_strip_metadata_html'}) {
454 ($dont_strip = $self->{'no_strip_metadata_html'}) =~ s{,}{|}g;
455 }
456
457 my $found_something = 0; my $top = 1;
458 while ($$textref =~ s/^(.*?)$opencom(.*?)$closecom//s) {
459 my $text = $1;
460 my $comment = $2;
461 if (defined $text) {
462 # text before a comment - note that getting to here
463 # doesn't necessarily mean there are Section tags in
464 # the document
465 $self->process_section(\$text, $base_dir, $file, $doc_obj, $cursection);
466 }
467 while ($comment =~ s/$lt(.*?)$gt//s) {
468 my $tag = $1;
469 if ($tag eq "Section") {
470 $found_something = 1;
471 $cursection = $doc_obj->insert_section($doc_obj->get_end_child($cursection)) unless $top;
472 $top = 0;
473 } elsif ($tag eq "/Section") {
474 $found_something = 1;
475 $cursection = $doc_obj->get_parent_section ($cursection);
476 } elsif ($tag =~ m/^Metadata name=$quot(.*?)$quot/s) {
477 my $metaname = $1;
478 my $accumulate = $tag =~ m/mode=${quot}accumulate${quot}/ ? 1 : 0;
479 $comment =~ s/^(.*?)$lt\/Metadata$gt//s;
480 my $metavalue = $1;
481 $metavalue =~ s/^\s+//;
482 $metavalue =~ s/\s+$//;
483 # assume that no metadata value intentionally includes
484 # carriage returns or HTML tags (if they're there they
485 # were probably introduced when converting to HTML from
486 # some other format).
487 # actually some people want to have html tags in their
488 # metadata.
489 $metavalue =~ s/[\cJ\cM]/ /sg;
490 $metavalue =~ s/<[^>]+>//sg
491 unless $dont_strip && ($dont_strip eq 'all' || $metaname =~ m/^($dont_strip)$/);
492 $metavalue =~ s/\s+/ /sg;
493 if ($metaname =~ /\./) { # has a namespace
494 $metaname = "ex.$metaname";
495 }
496 if ($accumulate) {
497 $doc_obj->add_utf8_metadata($cursection, $metaname, $metavalue);
498 } else {
499 $doc_obj->set_utf8_metadata_element($cursection, $metaname, $metavalue);
500 }
501 } elsif ($tag eq "Description" || $tag eq "/Description") {
502 # do nothing with containing Description tags
503 } else {
504 # simple HTML tag (probably created by the conversion
505 # to HTML from some other format) - we'll ignore it and
506 # hope for the best ;-)
507 }
508 }
509 }
510 if ($cursection ne "") {
511 print $outhandle "HTMLPlugin: WARNING: $file contains unmatched <Section></Section> tags\n";
512 }
513
514 $$textref =~ s/^.*?<body[^>]*>//is;
515 $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
516 if ($$textref =~ m/\S/) {
517 if (!$found_something) {
518 if ($self->{'verbosity'} > 2) {
519 print $outhandle "HTMLPlugin: WARNING: $file appears to contain no Section tags so\n";
520 print $outhandle " will be processed as a single section document\n";
521 }
522
523 # go ahead and process single-section document
524 $self->process_section($textref, $base_dir, $file, $doc_obj, $cursection);
525
526 # if document contains no Section tags we'll go ahead
527 # and extract metadata (this won't have been done
528 # above as the -description_tags option prevents it)
529 my $complete_text = $head_keep.$doc_obj->get_text($cursection);
530 $self->extract_metadata (\$complete_text, $metadata, $doc_obj, $cursection)
531 unless $self->{'no_metadata'};
532
533 } else {
534 print $outhandle "HTMLPlugin: WARNING: $file contains the following text outside\n";
535 print $outhandle " of the final closing </Section> tag. This text will\n";
536 print $outhandle " be ignored.";
537
538 my ($text);
539 if (length($$textref) > 30) {
540 $text = substr($$textref, 0, 30) . "...";
541 } else {
542 $text = $$textref;
543 }
544 $text =~ s/\n/ /isg;
545 print $outhandle " ($text)\n";
546 }
547 } elsif (!$found_something) {
548
549 if ($self->{'verbosity'} > 2) {
550 # may get to here if document contained no valid Section
551 # tags but did contain some comments. The text will have
552 # been processed already but we should print the warning
553 # as above and extract metadata
554 print $outhandle "HTMLPlugin: WARNING: $file appears to contain no Section tags and\n";
555 print $outhandle " is blank or empty. Metadata will be assigned if present.\n";
556 }
557
558 my $complete_text = $head_keep.$doc_obj->get_text($cursection);
559 $self->extract_metadata (\$complete_text, $metadata, $doc_obj, $cursection)
560 unless $self->{'no_metadata'};
561 }
562
563 } else {
564
565 # remove header and footer
566 if (!$self->{'keep_head'} || $self->{'description_tags'}) {
567 $$textref =~ s/^.*?<body[^>]*>//is;
568 $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
569 }
570
571 # single section document
572 $self->process_section($textref, $base_dir, $file, $doc_obj, $cursection);
573 }
574
575 return 1;
576}
577
578
579sub process_heading
580{
581 my ($self, $nHeadNo, $strHeadingText, $arrSections, $file) = @_;
582 $strHeadingText = '' if (!defined($strHeadingText));
583
584 my $strMetadata = $self->update_section_data($arrSections, int($nHeadNo));
585
586 my $strSecMetadata = '';
587 while ($strHeadingText =~ s/<!--gsdl-metadata(.*?)-->//is)
588 {
589 $strSecMetadata .= $1;
590 }
591
592 $strHeadingText =~ s/^\s+//g;
593 $strHeadingText =~ s/\s+$//g;
594 $strSecMetadata =~ s/^\s+//g;
595 $strSecMetadata =~ s/\s+$//g;
596
597 $strMetadata .= "\n<Section>\n\t<Description>\n\t\t<Metadata name=\"Title\">" . $strHeadingText . "</Metadata>\n";
598
599 if (length($strSecMetadata)) {
600 $strMetadata .= "\t\t" . $strSecMetadata . "\n";
601 }
602
603 $strMetadata .= "\t</Description>\n";
604
605 return "<!--" . $strMetadata . "-->";
606}
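
# Illustrative sketch (not part of the plugin): given the hypothetical heading
#
#   <h2>Chapter One<!--gsdl-metadata
#   <Metadata name="Author">A. Writer</Metadata>
#   --></h2>
#
# process_heading(2, ...) returns a comment block along the lines of
#
#   <!--
#   <Section>
#       <Description>
#           <Metadata name="Title">Chapter One</Metadata>
#           <Metadata name="Author">A. Writer</Metadata>
#       </Description>
#   -->
#
# with any </Section> tags that update_section_data needs to close appearing
# first, inside the same comment.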
607
608
609sub update_section_data
610{
611 my ($self, $arrSections, $nCurTocNo) = @_;
612 my ($strBuffer, $nLast, $nSections) = ('', 0, scalar(@$arrSections));
613
614 if ($nSections == 0) {
615 push @$arrSections, $nCurTocNo;
616 return $strBuffer;
617 }
618 $nLast = $arrSections->[$nSections - 1];
619 if ($nCurTocNo > $nLast) {
620 push @$arrSections, $nCurTocNo;
621 return $strBuffer;
622 }
623 for(my $i = $nSections - 1; $i >= 0; $i--) {
624 if ($nCurTocNo <= $arrSections->[$i]) {
625 $strBuffer .= "\n</Section>";
626 pop @$arrSections;
627 }
628 }
629 push @$arrSections, $nCurTocNo;
630 return $strBuffer;
631}
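
# Illustrative sketch (not part of the plugin): how update_section_data
# maintains the stack of open heading levels (hypothetical call sequence).
#
#   my $arrSections = [];
#   $self->update_section_data($arrSections, 1); # returns ""              stack: [1]
#   $self->update_section_data($arrSections, 2); # returns ""              stack: [1,2]
#   $self->update_section_data($arrSections, 2); # returns "\n</Section>"  stack: [1,2]
#   $self->update_section_data($arrSections, 1); # closes both open levels stack: [1]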
632
633
634# note that process_section may be called multiple times for a single
635# section (relying on the fact that add_utf8_text appends the text to any
636# that may exist already).
637sub process_section {
638 my $self = shift (@_);
639 my ($textref, $base_dir, $file, $doc_obj, $cursection) = @_;
640 # trap links
641 if (!$self->{'nolinks'}) {
642 # usemap="./#index" not handled correctly => change to "#index"
643## $$textref =~ s/(<img[^>]*?usemap\s*=\s*[\"\']?)([^\"\'>\s]+)([\"\']?[^>]*>)/
644
645 $$textref =~ s/(<img[^>]*?usemap\s*=\s*)((?:[\"][^\"]+[\"])|(?:[\'][^\']+[\'])|(?:[^\s\/>]+))([^>]*>)/
646 $self->replace_usemap_links($1, $2, $3)/isge;
647
648## $$textref =~ s/(<(?:a|area|frame|link|script)\s+[^>]*?\s*(?:href|src)\s*=\s*[\"\']?)([^\"\'>\s]+)([\"\']?[^>]*>)/
649
650 $$textref =~ s/(<(?:a|area|frame|link|script)\s+[^>]*?\s*(?:href|src)\s*=\s*)((?:[\"][^\"]+[\"])|(?:[\'][^\']+[\'])|(?:[^\s\/>]+))([^>]*>)/
651 $self->replace_href_links ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
652 }
653
654 # trap images
655
656 # Previously, by default, HTMLPlugin would embed <img> tags inside anchor tags,
657 # i.e. <a href="image"><img src="image"></a>, in order to overcome a problem that
658 # turned regular text following images into links. That is, by embedding <img>s
659 # inside <a href=""></a>, the text following images was no longer misbehaving.
660 # However, there would be many occasions whereby images were not meant to link
661 # to their source images but where the images would link to another web page.
662 # To allow this, the no_image_links option was introduced: it would prevent
663 # the behaviour of embedding images into links that referenced the source images.
664
665 # Somewhere along the line, the problem of normal text turning into links when
666 # such text followed images which were not embedded in <a href=""></a> ceased
667 # to occur. This is why the following lines have been commented out (as well as
668 # two lines in replace_images). They appear to no longer apply.
669
670 # If at any time, there is a need for having images embedded in <a> anchor tags,
671 # then it might be better to turn that into an HTMLPlugin option rather than make
672 # it the default behaviour. Also, eventually, no_image_links needs to become
673 # a deprecated option for HTMLPlugin as it has now become the default behaviour.
674
675 #if(!$self->{'no_image_links'}){
676 $$textref =~ s/(<(?:img|embed|table|tr|td)[^>]*?(?:src|background)\s*=\s*)((?:[\"][^\"]+[\"])|(?:[\'][^\']+[\'])|(?:[^\s\/>]+))([^>]*>)/
677 $self->replace_images ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
678 #}
679
680 # add text to document object
681 # turn \ into \\ so that the rest of greenstone doesn't think there
682 # is an escape code following. (Macro parsing loses them...)
683 $$textref =~ s/\\/\\\\/go;
684
685 $doc_obj->add_utf8_text($cursection, $$textref);
686}
687
688sub replace_images {
689 my $self = shift (@_);
690 my ($front, $link, $back, $base_dir,
691 $file, $doc_obj, $section) = @_;
692
693 # remove quotes from link at start and end if necessary
694 if ($link=~/^[\"\']/) {
695 $link=~s/^[\"\']//;
696 $link=~s/[\"\']$//;
697 $front.='"';
698 $back="\"$back";
699 }
700
701 $link =~ s/\n/ /g;
702
703 # Hack to overcome Windows wv 0.7.1 bug that causes embedded images to be broken
704 # If the Word file path has spaces in it, wv messes up and you end up with
705 # absolute paths for the images, and without the "file://" prefix
706 # So check for this special case and massage the data to be correct
707 if ($ENV{'GSDLOS'} =~ m/^windows/i && $self->{'plugin_type'} eq "WordPlug" && $link =~ m/^[A-Za-z]\:\\/) {
708 $link =~ s/^.*\\([^\\]+)$/$1/;
709 }
710
711 my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
712
713 my $img_file = $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section);
714
715# print STDERR "**** link = $link\n**** href = $href\n**** img_file = $img_file, rl = $rl\n";
716
717 my $anchor_name = $img_file;
718 #$anchor_name =~ s/^.*\///;
719 #$anchor_name = "<a name=\"$anchor_name\" ></a>";
720
721 my $image_link = $front . $img_file .$back;
722 return $image_link;
723
724 # The reasons for why the following two lines are no longer necessary can be
725 # found in subroutine process_section
726 #my $anchor_link = "<a href=\"$img_file\" >".$image_link."</a>";
727 #return $anchor_link;
728
729 #return $front . $img_file . $back . $anchor_name;
730}
731
732sub replace_href_links {
733 my $self = shift (@_);
734 my ($front, $link, $back, $base_dir, $file, $doc_obj, $section) = @_;
735
736 # remove quotes from link at start and end if necessary
737 if ($link=~/^[\"\']/) {
738 $link=~s/^[\"\']//;
739 $link=~s/[\"\']$//;
740 $front.='"';
741 $back="\"$back";
742 }
743
744 # attempt to sort out targets - frames are not handled
745 # well in this plugin and some cases will screw things
746 # up - e.g. the _parent target (so we'll just remove
747 # them all ;-)
748 $front =~ s/(target=\"?)_top(\"?)/$1_gsdltop_$2/is;
749 $back =~ s/(target=\"?)_top(\"?)/$1_gsdltop_$2/is;
750 $front =~ s/target=\"?_parent\"?//is;
751 $back =~ s/target=\"?_parent\"?//is;
752
753 return $front . $link . $back if $link =~ m/^\#/s;
754 $link =~ s/\n/ /g;
755
756 # Find file referred to by $link on file system
757 # This is more complicated than it sounds once character encodings
758 # are taken into account
759 my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
760
761 # href may use '\'s where '/'s should be on Windows
762 $href =~ s/\\/\//g;
763 my ($filename) = $href =~ m/^(?:.*?):(?:\/\/)?(.*)/;
764
765
766 ##### leave all these links alone (they won't be picked up by intermediate
767 ##### pages). I think that's safest when dealing with frames, targets etc.
768 ##### (at least until I think of a better way to do it). Problems occur with
769 ##### mailto links from within small frames: the intermediate page is displayed
770 ##### within that frame and can't be seen. There is still potential for this to
771 ##### happen even with html pages - the solution seems to be to somehow tell
772 ##### the browser from the server side to display the page being sent (i.e.
773 ##### the intermediate page) in the top level window - I'm not sure if that's
774 ##### possible - the following line should probably be deleted if that can be done
775 return $front . $link . $back if $href =~ m/^(mailto|news|gopher|nntp|telnet|javascript):/is;
776
777 if (($rl == 0) || ($filename =~ m/$self->{'process_exp'}/) ||
778 ($href =~ m/\/$/) || ($href =~ m/^(mailto|news|gopher|nntp|telnet|javascript):/i)) {
779
780 # If web page didn't give encoding, then default to utf8
781 my $content_encoding= $self->{'content_encoding'} || "utf8";
782
783 if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
784 print STDERR "*** Encoding with $content_encoding href: $href\n";
785 }
786
787 $href = encode($content_encoding,$href);
788
789 $href = &unicode::raw_filename_to_url_encoded($href);
790 $href = &unicode::filename_to_url($href);
791
792 &ghtml::urlsafe ($href);
793 if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
794 print STDERR "***!!! href=$href\n";
795 }
796
797
798 return $front . "_httpextlink_&amp;rl=" . $rl . "&amp;href=" . $href . $hash_part . $back;
799 } else {
800 # link is to some other type of file (e.g., an image) so we'll
801 # need to associate that file
802 return $front . $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section) . $back;
803 }
804}
805
806sub add_file {
807 my $self = shift (@_);
808 my ($href, $rl, $hash_part, $base_dir, $doc_obj, $section) = @_;
809 my ($newname);
810
811 my $filename = $href;
812 if ($base_dir eq "") {
813 # remove http:/ thereby leaving one slash at the start
814 $filename =~ s/^[^:]*:\///;
815 }
816 else {
817 # remove http://
818 $filename =~ s/^[^:]*:\/\///;
819 }
820
821 $filename = &util::filename_cat($base_dir, $filename);
822
823 if (($self->{'use_realistic_book'}) || ($self->{'old_style_HDL'})) {
824 # we are processing a tidytmp file - want paths to be in import
825 $filename =~ s/([\\\/])tidytmp([\\\/])/$1import$2/;
826 }
827
828 # Replace %XX's in URL with decoded value if required. Note that the
829 # filename may include the %XX in some situations. If the *original*
830 # file's name was in URL encoding, the following method will not decode
831 # it.
832 my $utf8_filename = $filename;
833 my $opt_decode_utf8_filename = $self->opt_url_decode($utf8_filename);
834
835 my $content_encoding= $self->{'content_encoding'} || "utf8";
836
837 # The filenames that come through the HTML file have been decoded
838 # into Unicode aware Perl strings. Need to convert them back
839 # to their initial raw-byte encoding to match the file that
840 # exists on the file system
841 $filename = encode($content_encoding, $opt_decode_utf8_filename);
842
843 # some special processing if the intended filename was converted to utf8, but
844 # the actual file still needs to be renamed
845 if (!&util::fd_exists($filename)) {
846 # try the original filename stored in map
847 if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
848 print STDERR "******!! orig filename did not exist: $filename\n";
849 }
850
851## print STDERR "**** trying to look up utf8_filename: $utf8_filename\n";
852
853 my $original_filename = $self->{'utf8_to_original_filename'}->{$utf8_filename};
854
855 if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
856 print STDERR "****** From lookup utf8_filename, now trying for: $original_filename\n";
857 }
858
859 if (defined $original_filename && -e $original_filename) {
860 if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
861 print STDERR "****** Found match!\n";
862 }
863 $filename = $original_filename;
864 }
865 }
866
867 my ($ext) = $filename =~ m/(\.[^\.]*)$/;
868
869 if ($rl == 0) {
870 if ((!defined $ext) || ($ext !~ m/$self->{'assoc_files'}/)) {
871 return "_httpextlink_&amp;rl=0&amp;el=prompt&amp;href=" . $href . $hash_part;
872 }
873 else {
874 return "_httpextlink_&amp;rl=0&amp;el=direct&amp;href=" . $href . $hash_part;
875 }
876 }
877
878 if ((!defined $ext) || ($ext !~ m/$self->{'assoc_files'}/)) {
879 return "_httpextlink_&amp;rl=" . $rl . "&amp;href=" . $href . $hash_part;
880 }
881 # add the original image file as a source file
882 if (!$self->{'processing_tmp_files'} ) {
883 $doc_obj->associate_source_file($filename);
884 }
885 if ($self->{'rename_assoc_files'}) {
886 if (defined $self->{'aux_files'}->{$href}) {
887 $newname = $self->{'aux_files'}->{$href}->{'dir_num'} . "/" .
888 $self->{'aux_files'}->{$href}->{'file_num'} . $ext;
889 } else {
890 $newname = $self->{'dir_num'} . "/" . $self->{'file_num'} . $ext;
891 $self->{'aux_files'}->{$href} = {'dir_num' => $self->{'dir_num'}, 'file_num' => $self->{'file_num'}};
892 $self->inc_filecount ();
893 }
894 $doc_obj->associate_file($filename, $newname, undef, $section);
895 return "_httpdocimg_/$newname";
896 } else {
897 if(&unicode::is_url_encoded($utf8_filename)) {
898 # use the possibly-decoded filename instead to avoid double URL encoding
899 ($newname) = $filename =~ m/([^\/\\]*)$/;
900 } else {
901 ($newname) = $utf8_filename =~ m/([^\/\\]*)$/;
902 }
903
904 # Make sure this name uses only ASCII characters.
905 # We use either base64 or URL encoding, as these preserve original encoding
906 $newname = &util::rename_file($newname, $self->{'file_rename_method'});
907
908### print STDERR "***** associating $filename (raw-byte/utf8)-> $newname\n";
909 $doc_obj->associate_file($filename, $newname, undef, $section);
910
911 # Since the generated image will be URL-encoded to avoid file-system/browser mess-ups
912 # of filenames, URL-encode the additional percent signs of the URL-encoded filename
913 my $newname_url = $newname;
914 $newname_url = &unicode::filename_to_url($newname_url);
915 return "_httpdocimg_/$newname_url";
916 }
917}
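
# Illustrative sketch (not part of the plugin): the two kinds of value
# add_file can return (hypothetical hrefs).
#
#   - an href whose extension is not covered by -assoc_files is rewritten
#     as an external-style link, e.g.
#         _httpextlink_&amp;rl=1&amp;href=report.doc
#   - an href whose extension is covered is associated with the document
#     and the link is rewritten to point at the associated copy, e.g.
#         _httpdocimg_/site%20map.jpg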
918
919
920sub format_link {
921 my $self = shift (@_);
922 my ($link, $base_dir, $file) = @_;
923
924 my ($before_hash, $hash_part) = $link =~ m/^([^\#]*)(\#?.*)$/;
925
926 $hash_part = "" if !defined $hash_part;
927 if (!defined $before_hash || $before_hash !~ m/[\w\.\/]/) {
928 my $outhandle = $self->{'outhandle'};
929 print $outhandle "HTMLPlugin: ERROR - badly formatted tag ignored ($link)\n"
930 if $self->{'verbosity'};
931 return ($link, "", 0);
932 }
933
934 if ($before_hash =~ s@^((?:http|https|ftp|file|mms)://)@@i) {
935 my $type = $1;
936
937 if ($link =~ m/^(http|ftp):/i) {
938 # Turn url (using /) into file name (possibly using \ on windows)
939 my @http_dir_split = split('/', $before_hash);
940 $before_hash = &util::filename_cat(@http_dir_split);
941 }
942
943 $before_hash = $self->eval_dir_dots($before_hash);
944
945 my $linkfilename = &util::filename_cat ($base_dir, $before_hash);
946
947 my $rl = 0;
948 $rl = 1 if (-e $linkfilename);
949
950 # make sure there's a slash on the end if it's a directory
951 if ($before_hash !~ m/\/$/) {
952 $before_hash .= "/" if (-d $linkfilename);
953 }
954 return ($type . $before_hash, $hash_part, $rl);
955
956 } elsif ($link !~ m/^(mailto|news|gopher|nntp|telnet|javascript):/i && $link !~ m/^\//) {
957
958 if ($before_hash =~ s@^/@@ || $before_hash =~ m/\\/) {
959
960 # the first directory will be the domain name if file_is_url
961 # was used to generate the archives, otherwise we'll assume all files
962 # are from the same site and base_dir is the root
963
964 if ($self->{'file_is_url'}) {
965 my @dirs = split /[\/\\]/, $file;
966 my $domname = shift (@dirs);
967 $before_hash = &util::filename_cat($domname, $before_hash);
968 $before_hash =~ s@\\@/@g; # for windows
969 }
970 else
971 {
972 # see if link shares directory with source document
973 # => turn into relative link if this is so!
974
975 if ($ENV{'GSDLOS'} =~ m/^windows/i) {
976 # too difficult doing a pattern match with embedded '\'s...
977 my $win_before_hash=$before_hash;
978 $win_before_hash =~ s@(\\)+@/@g;
979 # $base_dir is already similarly "converted" on windows.
980 if ($win_before_hash =~ s@^$base_dir/@@o) {
981 # if this is true, we removed a prefix
982 $before_hash=$win_before_hash;
983 }
984 }
985 else {
986 # before_hash has lost leading slash by this point,
987 # -> add back in prior to substitution with $base_dir
988 $before_hash = "/$before_hash";
989
990 $before_hash = &util::filename_cat("",$before_hash);
991 $before_hash =~ s@^$base_dir/@@;
992 }
993 }
994 } else {
995 # Turn relative file path into full path
996 my $dirname = &File::Basename::dirname($file);
997 $before_hash = &util::filename_cat($dirname, $before_hash);
998 $before_hash = $self->eval_dir_dots($before_hash);
999 }
1000
1001 my $linkfilename = &util::filename_cat ($base_dir, $before_hash);
1002 # make sure there's a slash on the end if it's a directory
1003 if ($before_hash !~ m/\/$/) {
1004 $before_hash .= "/" if (-d $linkfilename);
1005 }
1006 return ("http://" . $before_hash, $hash_part, 1);
1007 } else {
1008 # mailto, news, nntp, telnet, javascript or gopher link
1009 return ($before_hash, "", 0);
1010 }
1011}
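
# Illustrative sketch (not part of the plugin): typical (href, hash_part, rl)
# triples returned by format_link, using hypothetical paths.
#
#   # relative link that resolves against the directory of $file:
#   my ($href, $hash, $rl) = $self->format_link("pics/map.gif#area1", $base_dir, $file);
#   # -> ("http://<dir-of-$file>/pics/map.gif", "#area1", 1)
#
#   # absolute link to an external site that was not mirrored locally:
#   ($href, $hash, $rl) = $self->format_link("http://example.org/page.html", $base_dir, $file);
#   # -> ("http://example.org/page.html", "", 0)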
1012
1013sub extract_first_NNNN_characters {
1014 my $self = shift (@_);
1015 my ($textref, $doc_obj, $thissection) = @_;
1016
1017 foreach my $size (split /,/, $self->{'first'}) {
1018 my $tmptext = $$textref;
1019 # skip to the body
1020 $tmptext =~ s/.*<body[^>]*>//i;
1021 # remove javascript
1022 $tmptext =~ s@<script.*?</script>@ @sig;
1023 $tmptext =~ s/<[^>]*>/ /g;
1024 $tmptext =~ s/&nbsp;/ /g;
1025 $tmptext =~ s/^\s+//;
1026 $tmptext =~ s/\s+$//;
1027 $tmptext =~ s/\s+/ /gs;
1028 $tmptext = &unicode::substr ($tmptext, 0, $size);
1029 $tmptext =~ s/\s\S*$/&#8230;/; # adds an ellipsis (...)
1030 $doc_obj->add_utf8_metadata ($thissection, "First$size", $tmptext);
1031 }
1032}
1033
1034
1035sub extract_metadata {
1036 my $self = shift (@_);
1037 my ($textref, $metadata, $doc_obj, $section) = @_;
1038 my $outhandle = $self->{'outhandle'};
1039 # if we don't want metadata, we may as well not be here ...
1040 return if (!defined $self->{'metadata_fields'});
1041
1042 my $separator = $self->{'metadata_field_separator'};
1043 if ($separator eq "") {
1044 undef $separator;
1045 }
1046
1047 # metadata fields to extract/save. 'key' is the (lowercase) name of the
1048 # html meta, 'value' is the metadata name for greenstone to use
1049 my %find_fields = ();
1050
1051 my %creator_fields = (); # short-cut for lookups
1052
1053
1054 foreach my $field (split /,/, $self->{'metadata_fields'}) {
1055 $field =~ s/^\s+//; # remove leading whitespace
1056 $field =~ s/\s+$//; # remove trailing whitespace
1057
1058 # support tag<tagname>
1059 if ($field =~ m/^(.*?)\s*<(.*?)>$/) {
1060 # "$2" is the user's preferred gs metadata name
1061 $find_fields{lc($1)}=$2; # lc = lowercase
1062 } else { # no <tagname> for mapping
1063 # "$field" is the user's preferred gs metadata name
1064 $find_fields{lc($field)}=$field; # lc = lowercase
1065 }
1066 }
1067
1068 if (defined $self->{'hunt_creator_metadata'} &&
1069 $self->{'hunt_creator_metadata'} == 1 ) {
1070 my @extra_fields =
1071 (
1072 'author',
1073 'author.email',
1074 'creator',
1075 'dc.creator',
1076 'dc.creator.corporatename',
1077 );
1078
1079 # add the creator_metadata fields to search for
1080 foreach my $field (@extra_fields) {
1081 $creator_fields{$field}=0; # add to lookup hash
1082 }
1083 }
1084
1085
1086 # find the header in the html file, which has the meta tags
1087 $$textref =~ m@<head>(.*?)</head>@si;
1088
1089 my $html_header=$1;
1090
1091 # go through every <meta... tag defined in the html and see if it is
1092 # one of the tags we want to match.
1093
1094 # special case for title - we want to remember if it's been found
1095 my $found_title = 0;
1096 # this assumes that ">" won't appear. (I don't think it's allowed to...)
1097 $html_header =~ m/^/; # match the start of the string, for \G assertion
1098
1099 while ($html_header =~ m/\G.*?<meta(.*?)>/sig) {
1100 my $metatag=$1;
1101 my ($tag, $value);
1102
1103 # find the tag name
1104 $metatag =~ m/(?:name|http-equiv)\s*=\s*([\"\'])?(.*?)\1/is;
1105 $tag=$2;
1106 # in case they're not using " or ', but they should...
1107 if (! $tag) {
1108 $metatag =~ m/(?:name|http-equiv)\s*=\s*([^\s\>]+)/is;
1109 $tag=$1;
1110 }
1111
1112 if (!defined $tag) {
1113 print $outhandle "HTMLPlugin: can't find NAME in \"$metatag\"\n";
1114 next;
1115 }
1116
1117 # don't need to assign this field if it was passed in from a previous
1118 # (recursive) plugin
1119 if (defined $metadata->{$tag}) {next}
1120
1121 # find the tag content
1122 $metatag =~ m/content\s*=\s*([\"\'])?(.*?)\1/is;
1123 $value=$2;
1124
1125 if (! $value) {
1126 $metatag =~ m/(?:name|http-equiv)\s*=\s*([^\s\>]+)/is;
1127 $value=$1;
1128 }
1129 if (!defined $value) {
1130 print $outhandle "HTMLPlugin: can't find VALUE in \"$metatag\"\n";
1131 next;
1132 }
1133
1134 # clean up and add
1135 $value =~ s/\s+/ /gs;
1136 chomp($value); # remove trailing \n, if any
1137 if (exists $creator_fields{lc($tag)}) {
1138 # map this value onto greenstone's "Creator" metadata
1139 $tag='Creator';
1140 } elsif (!exists $find_fields{lc($tag)}) {
1141 next; # don't want this tag
1142 } else {
1143 # get the user's preferred capitalisation
1144 $tag = $find_fields{lc($tag)};
1145 }
1146 if (lc($tag) eq "title") {
1147 $found_title = 1;
1148 }
1149
1150 if ($self->{'verbosity'} > 2) {
1151 print $outhandle " extracted \"$tag\" metadata \"$value\"\n";
1152 }
1153
1154 if ($tag =~ /\./) {
1155 # there is a . so has a namespace, add ex.
1156 $tag = "ex.$tag";
1157 }
1158 if (defined $separator) {
1159 my @values = split($separator, $value);
1160 foreach my $v (@values) {
1161 $doc_obj->add_utf8_metadata($section, $tag, $v) if $v =~ /\S/;
1162 }
1163 }
1164 else {
1165 $doc_obj->add_utf8_metadata($section, $tag, $value);
1166 }
1167 }
1168
1169 # TITLE: extract the document title
1170 if (exists $find_fields{'title'} && !$found_title) {
1171 # we want a title, and didn't find one in the meta tags
1172 # see if there's a <title> tag
1173 my $title;
1174 my $from = ""; # for debugging output only
1175 if ($html_header =~ m/<title[^>]*>([^<]+)<\/title[^>]*>/is) {
1176 $title = $1;
1177 $from = "<title> tags";
1178 }
1179
1180 if (!defined $title) {
1181 $from = "first 100 chars";
1182 # if no title use first 100 or so characters
1183 $title = $$textref;
1184 $title =~ s/^\xFE\xFF//; # Remove unicode byte order mark
1185 $title =~ s/^.*?<body>//si;
1186 # ignore javascript!
1187 $title =~ s@<script.*?</script>@ @sig;
1188 $title =~ s/<\/([^>]+)><\1>//g; # (eg) </b><b> - no space
1189 $title =~ s/<[^>]*>/ /g; # remove all HTML tags
1190 $title = substr ($title, 0, 100);
1191 $title =~ s/\s\S*$/.../;
1192 }
1193 $title =~ s/<[^>]*>/ /g; # remove html tags
1194 $title =~ s/&nbsp;/ /g;
1195 $title =~ s/(?:&nbsp;|\xc2\xa0)/ /g; # utf-8 for nbsp...
1196 $title =~ s/\s+/ /gs; # collapse multiple spaces
1197 $title =~ s/^\s*//; # remove leading spaces
1198 $title =~ s/\s*$//; # remove trailing spaces
1199
1200 $title =~ s/^$self->{'title_sub'}// if ($self->{'title_sub'});
1201 $title =~ s/^\s+//s; # in case title_sub introduced any...
1202 $doc_obj->add_utf8_metadata ($section, "Title", $title);
1203 print $outhandle " extracted Title metadata \"$title\" from $from\n"
1204 if ($self->{'verbosity'} > 2);
1205 }
1206
1207 # add FileFormat metadata
1208 $doc_obj->add_metadata($section,"FileFormat", "HTML");
1209
1210 # Special, for metadata names such as tagH1 - extracts
1211 # the text between the first <H1> and </H1> tags into "H1" metadata.
1212
1213 foreach my $field (keys %find_fields) {
1214 if ($field !~ m/^tag([a-z0-9]+)$/i) {next}
1215 my $tag = $1;
1216 if ($$textref =~ m@<$tag[^>]*>(.*?)</$tag[^>]*>@g) {
1217 my $content = $1;
1218 $content =~ s/&nbsp;/ /g;
1219 $content =~ s/<[^>]*>/ /g;
1220 $content =~ s/^\s+//;
1221 $content =~ s/\s+$//;
1222 $content =~ s/\s+/ /gs;
1223 if ($content) {
1224 $tag=$find_fields{"tag$tag"}; # get the user's capitalisation
1225 $tag =~ s/^tag//i;
1226 $doc_obj->add_utf8_metadata ($section, $tag, $content);
1227 print $outhandle " extracted \"$tag\" metadata \"$content\"\n"
1228 if ($self->{'verbosity'} > 2);
1229 }
1230 }
1231 }
1232}
1233
1234
1235# evaluate any "../" to next directory up
1236# evaluate any "./" as here
1237sub eval_dir_dots {
1238 my $self = shift (@_);
1239 my ($filename) = @_;
1240 my $dirsep_os = &util::get_os_dirsep();
1241 my @dirsep = split(/$dirsep_os/,$filename);
1242
1243 my @eval_dirs = ();
1244 foreach my $d (@dirsep) {
1245 if ($d eq "..") {
1246 pop(@eval_dirs);
1247
1248 } elsif ($d eq ".") {
1249 # do nothing!
1250
1251 } else {
1252 push(@eval_dirs,$d);
1253 }
1254 }
1255
1256 # Need to fiddle with number of elements in @eval_dirs if the
1257 # first one is the empty string. This is because of a
1258 # modification to util::filename_cat that suppresses the addition
1259 # of a leading '/' character (or \ if windows) (intended to help
1260 # filename cat with relative paths) if the first entry in the
1261 # array is the empty string. Making the array start with *two*
1262 # empty strings is a way to defeat this "smart" option.
1263 #
1264 if (scalar(@eval_dirs) > 0) {
1265 if ($eval_dirs[0] eq ""){
1266 unshift(@eval_dirs,"");
1267 }
1268 }
1269
1270 my $evaluated_filename = (scalar @eval_dirs > 0) ? &util::filename_cat(@eval_dirs) : "";
1271 return $evaluated_filename;
1272}
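
# Illustrative sketch (not part of the plugin): what eval_dir_dots does with
# a couple of hypothetical paths (shown with '/' as the directory separator).
#
#   $self->eval_dir_dots("docs/./img/../pics/logo.gif"); # -> "docs/pics/logo.gif"
#   $self->eval_dir_dots("/var/www/../data/file.html");  # -> "/var/data/file.html"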
1273
1274sub replace_usemap_links {
1275 my $self = shift (@_);
1276 my ($front, $link, $back) = @_;
1277
1278 # remove quotes from link at start and end if necessary
1279 if ($link=~/^[\"\']/) {
1280 $link=~s/^[\"\']//;
1281 $link=~s/[\"\']$//;
1282 $front.='"';
1283 $back="\"$back";
1284 }
1285
1286 $link =~ s/^\.\///;
1287 return $front . $link . $back;
1288}
1289
1290sub inc_filecount {
1291 my $self = shift (@_);
1292
1293 if ($self->{'file_num'} == 1000) {
1294 $self->{'dir_num'} ++;
1295 $self->{'file_num'} = 0;
1296 } else {
1297 $self->{'file_num'} ++;
1298 }
1299}
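
# Illustrative sketch (not part of the plugin): with -rename_assoc_files set,
# add_file names associated files "<dir_num>/<file_num><ext>" and calls
# inc_filecount after each one, so a hypothetical run produces
#
#   0/0.gif, 0/1.jpg, ..., 0/1000.png, 1/0.css, 1/1.gif, ...
#
# i.e. dir_num rolls over once file_num reaches 1000.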
1300
1301
1302# Extend read_file so that strings like &eacute; are
1303# converted to UTF8 internally.
1304#
1305# We don't convert &lt; or &gt; or &amp; or &quot; in case
1306# they interfere with the GML files
1307
1308sub read_file {
1309 my $self = shift(@_);
1310 my ($filename, $encoding, $language, $textref) = @_;
1311
1312 $self->SUPER::read_file($filename, $encoding, $language, $textref);
1313
1314 # Convert entities to their Unicode code-point equivalents
1315 $$textref =~ s/&(lt|gt|amp|quot|nbsp);/&z$1;/go;
1316 $$textref =~ s/&([^;]+);/&ghtml::getcharequiv($1,1,1)/gseo;
1317 $$textref =~ s/&z(lt|gt|amp|quot|nbsp);/&$1;/go;
1318
1319}
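
# Illustrative sketch (not part of the plugin): effect of the substitutions
# in read_file on a hypothetical fragment.
#
#   "caf&eacute; &amp; bar &lt;b&gt;"   becomes   "café &amp; bar &lt;b&gt;"
#
# &eacute; is resolved to its Unicode character, while &lt;, &gt;, &amp;,
# &quot; and &nbsp; are protected by the &z... round-trip and left unchanged.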
1320
1321sub HB_read_html_file {
1322 my $self = shift (@_);
1323 my ($htmlfile, $text) = @_;
1324
1325 # load in the file
1326 if (!open (FILE, $htmlfile)) {
1327 print STDERR "ERROR - could not open $htmlfile\n";
1328 return;
1329 }
1330
1331 my $foundbody = 0;
1332 $self->HB_gettext (\$foundbody, $text, "FILE");
1333 close FILE;
1334
1335 # just in case there was no <body> tag
1336 if (!$foundbody) {
1337 $foundbody = 1;
1338 open (FILE, $htmlfile) || return;
1339 $self->HB_gettext (\$foundbody, $text, "FILE");
1340 close FILE;
1341 }
1342 # text is in utf8
1343}
1344
1345# converts the text to utf8, as ghtml does that for &eacute; etc.
1346sub HB_gettext {
1347 my $self = shift (@_);
1348 my ($foundbody, $text, $handle) = @_;
1349
1350 my $line = "";
1351 while (defined ($line = <$handle>)) {
1352 # look for body tag
1353 if (!$$foundbody) {
1354 if ($line =~ s/^.*<body[^>]*>//i) {
1355 $$foundbody = 1;
1356 } else {
1357 next;
1358 }
1359 }
1360
1361 # check for symbol fonts
1362 if ($line =~ m/<font [^>]*?face\s*=\s*\"?(\w+)\"?/i) {
1363 my $font = $1;
1364 print STDERR "HBPlug::HB_gettext - warning removed font $font\n"
1365 if ($font !~ m/^arial$/i);
1366 }
1367
1368 $$text .= $line;
1369 }
1370
1371 if ($self->{'input_encoding'} eq "iso_8859_1") {
1372 # convert to utf-8
1373 $$text=&unicode::unicode2utf8(&unicode::convert2unicode("iso_8859_1", $text));
1374 }
1375 # convert any alphanumeric character entities to their utf-8
1376 # equivalent for indexing purposes
1377 #&ghtml::convertcharentities ($$text);
1378
1379 $$text =~ s/\s+/ /g; # remove \n's
1380
1381 # At this point $$text is a binary byte string
1382 # => turn it into a Unicode aware string, so full
1383 # Unicode aware pattern matching can be used.
1384 # For instance: 's/\x{0101}//g' or '[[:upper:]]'
1385 #
1386
1387 $$text = decode("utf8",$$text);
1388}
1389
1390sub HB_clean_section {
1391 my $self = shift (@_);
1392 my ($section) = @_;
1393
1394 # remove tags without a starting tag from the section
1395 my ($tag, $tagstart);
1396 while ($section =~ m/<\/([^>]{1,10})>/) {
1397 $tag = $1;
1398 $tagstart = index($section, "<$tag");
1399 last if (($tagstart >= 0) && ($tagstart < index($section, "<\/$tag")));
1400 $section =~ s/<\/$tag>//;
1401 }
1402
1403 # remove extra paragraph tags
1404 while ($section =~ s/<p\b[^>]*>\s*<p\b/<p/ig) {}
1405
1406 # remove extra stuff at the end of the section
1407 while ($section =~ s/(<u>|<i>|<b>|<p\b[^>]*>|&nbsp;|\s)$//i) {}
1408
1409 # add a newline at the beginning of each paragraph
1410 $section =~ s/(.)\s*<p\b/$1\n\n<p/gi;
1411
1412 # add a newline every 80 characters at a word boundary
1413 # Note: this regular expression puts a line feed before
1414 # the last word in each section, even when it is not
1415 # needed.
1416 $section =~ s/(.{1,80})\s/$1\n/g;
1417
1418 # fix up the image links
1419 $section =~ s/<img[^>]*?src=\"?([^\">]+)\"?[^>]*>/
1420 <center><img src=\"$1\" \/><\/center><br\/>/ig;
1421 $section =~ s/&lt;&lt;I&gt;&gt;\s*([^\.]+\.(png|jpg|gif))/
1422 <center><img src=\"$1\" \/><\/center><br\/>/ig;
1423
1424 return $section;
1425}
1426
1427# Will convert the oldHDL format to the new HDL format (using the Section tag)
1428sub convert_to_newHDLformat
1429{
1430 my $self = shift (@_);
1431 my ($file,$cnfile) = @_;
1432 my $input_filename = $file;
1433 my $tmp_filename = $cnfile;
1434
1435 # write HTML tmp file with new HDL format
1436 open (PROD, ">$tmp_filename") || die("Error Writing to File: $tmp_filename $!");
1437
1438 # read in the file and do basic html cleaning (removing header etc)
1439 my $html = "";
1440 $self->HB_read_html_file ($input_filename, \$html);
1441
1442 # process the file one section at a time
1443 my $curtoclevel = 1;
1444 my $firstsection = 1;
1445 my $toclevel = 0;
1446 while (length ($html) > 0) {
1447 if ($html =~ s/^.*?(?:<p\b[^>]*>)?((<b>|<i>|<u>|\s)*)&lt;&lt;TOC(\d+)&gt;&gt;\s*(.*?)<p\b/<p/i) {
1448 $toclevel = $3;
1449 my $title = $4;
1450 my $sectiontext = "";
1451 if ($html =~ s/^(.*?)((?:<p\b[^>]*>)?((<b>|<i>|<u>|\s)*)&lt;&lt;TOC\d+&gt;&gt;)/$2/i) {
1452 $sectiontext = $1;
1453 } else {
1454 $sectiontext = $html;
1455 $html = "";
1456 }
1457
1458 # remove tags and extra spaces from the title
1459 $title =~ s/<\/?[^>]+>//g;
1460 $title =~ s/^\s+|\s+$//g;
1461
1462 # close any sections below the current level and
1463 # create a new section (special case for the firstsection)
1464 print PROD "<!--\n";
1465 while (($curtoclevel > $toclevel) ||
1466 (!$firstsection && $curtoclevel == $toclevel)) {
1467 $curtoclevel--;
1468 print PROD "</Section>\n";
1469 }
1470 if ($curtoclevel+1 < $toclevel) {
1471 print STDERR "WARNING - jump in toc levels in $input_filename " .
1472 "from $curtoclevel to $toclevel\n";
1473 }
1474 while ($curtoclevel < $toclevel) {
1475 $curtoclevel++;
1476 }
1477
1478 if ($curtoclevel == 1) {
1479 # add the header tag
1480 print PROD "-->\n";
1481 print PROD "<HTML>\n<HEAD>\n<TITLE>$title</TITLE>\n</HEAD>\n<BODY>\n";
1482 print PROD "<!--\n";
1483 }
1484
1485 print PROD "<Section>\n\t<Description>\n\t\t<Metadata name=\"Title\">$title</Metadata>\n\t</Description>\n";
1486
1487 print PROD "-->\n";
1488
1489 # clean up the section html
1490 $sectiontext = $self->HB_clean_section($sectiontext);
1491
1492 print PROD "$sectiontext\n";
1493
1494 } else {
1495 print STDERR "WARNING - leftover text\n" , $self->shorten($html),
1496 "\nin $input_filename\n";
1497 last;
1498 }
1499 $firstsection = 0;
1500 }
1501
1502 print PROD "<!--\n";
1503 while ($curtoclevel > 0) {
1504 $curtoclevel--;
1505 print PROD "</Section>\n";
1506 }
1507 print PROD "-->\n";
1508
1509 close (PROD) || die("Error Closing File: $tmp_filename $!");
1510
1511 return $tmp_filename;
1512}
1513
1514sub shorten {
1515 my $self = shift (@_);
1516 my ($text) = @_;
1517
1518 return "\"$text\"" if (length($text) < 100);
1519
1520 return "\"" . substr ($text, 0, 50) . "\" ... \"" .
1521 substr ($text, length($text)-50) . "\"";
1522}
1523
1524sub convert_tidy_or_oldHDL_file
1525{
1526 my $self = shift (@_);
1527 my ($file) = @_;
1528 my $input_filename = $file;
1529
1530 if (-d $input_filename)
1531 {
1532 return $input_filename;
1533 }
1534
1535 # get the input filename
1536 my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
1537 my $base_dirname = $dirname;
1538 $suffix = lc($suffix);
1539
1540 # derive tmp filename from input filename
1541 # Remove any white space from filename -- no risk of name collision, and
1542 # makes later conversion by utils simpler. Leave spaces in path...
1543 # i.e. strip out any spaces, dots and hyphens from the filename
1544 $tailname =~ s/\s+//g;
1545 $tailname =~ s/\.+//g;
1546 $tailname =~ s/\-+//g;
1547 # convert to utf-8 otherwise we have problems with the doc.xml file
1548 # later on
1549 &unicode::ensure_utf8(\$tailname);
1550
1551 # softlink to collection tmp dir
1552 my $tmp_dirname = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "tidytmp");
1553 &util::mk_dir($tmp_dirname) if (!-e $tmp_dirname);
1554
1555 my $test_dirname = "";
1556 my $f_separator = &util::get_os_dirsep();
1557
1558 if ($dirname =~ m/import$f_separator/)
1559 {
1560 $test_dirname = $'; #'
1561
1562 #print STDERR "init $'\n";
1563
1564 while ($test_dirname =~ m/[$f_separator]/)
1565 {
1566 my $folderdirname = $`;
1567 $tmp_dirname = &util::filename_cat($tmp_dirname,$folderdirname);
1568 &util::mk_dir($tmp_dirname) if (!-e $tmp_dirname);
1569 $test_dirname = $'; #'
1570 }
1571 }
1572
1573 my $tmp_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix");
1574
1575 # tidy or convert the input file if it is an HTML-like file or it is accepted by the process_exp
1576 if (($suffix eq ".htm") || ($suffix eq ".html") || ($suffix eq ".shtml"))
1577 {
1578 #convert the input file to a new style HDL
1579 my $hdl_output_filename = $input_filename;
1580 if ($self->{'old_style_HDL'})
1581 {
1582 $hdl_output_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix");
1583 $hdl_output_filename = $self->convert_to_newHDLformat($input_filename,$hdl_output_filename);
1584 }
1585
1586 # just to be safe, copy all other files from the base dir to the tmp dir if they do not already exist
1587 opendir(DIR,$base_dirname) or die "Can't open base directory : $base_dirname!";
1588 my @files = grep {!/^\.+$/} readdir(DIR);
1589 closedir(DIR);
1590
1591 foreach my $file (@files)
1592 {
1593 my $src_file = &util::filename_cat($base_dirname,$file);
1594 my $dest_file = &util::filename_cat($tmp_dirname,$file);
1595 if ((!-e $dest_file) && (!-d $src_file))
1596 {
1597 # just copy the original file back to the tmp directory
1598 copy($src_file,$dest_file) or die "Can't copy file $src_file to $dest_file $!";
1599 }
1600 }
1601
1602 # tidy the input file
1603 my $tidy_output_filename = $hdl_output_filename;
1604 if ($self->{'use_realistic_book'})
1605 {
1606 $tidy_output_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix");
1607 $tidy_output_filename = $self->tmp_tidy_file($hdl_output_filename,$tidy_output_filename);
1608 }
1609 $tmp_filename = $tidy_output_filename;
1610 }
1611 else
1612 {
1613 if (!-e $tmp_filename)
1614 {
1615 # just copy the original file back to the tmp directory
1616 copy($input_filename,$tmp_filename) or die "Can't copy file $input_filename to $tmp_filename $!";
1617 }
1618 }
1619
1620 return $tmp_filename;
1621}
1622
1623
1624# Will turn the html input file into a proper XML file, with font tags removed and
1625# image sizes added to the img tags.
1626# The tidying process takes place in a collection specific 'tmp' directory so
1627# that we don't accidentally damage the input.
1628sub tmp_tidy_file
1629{
1630 my $self = shift (@_);
1631 my ($file,$cnfile) = @_;
1632 my $input_filename = $file;
1633 my $tmp_filename = $cnfile;
1634
1635 # get the input filename
1636 my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
1637
1638 require HTML::TokeParser::Simple;
1639
1640 # create HTML parser to decode the input file
1641 my $parser = HTML::TokeParser::Simple->new($input_filename);
1642
1643 # write the HTML tmp file with font tags removed and image sizes added to the img tags
1644 open (PROD, ">$tmp_filename") || die("Error Writing to File: $tmp_filename $!");
1645 while (my $token = $parser->get_token())
1646 {
1647 # is it an img tag
1648 if ($token->is_start_tag('img'))
1649 {
1650 # get the attributes
1651 my $attr = $token->return_attr;
1652
1653 # get the full path to the image
1654 my $img_file = &util::filename_cat($dirname,$attr->{src});
1655
1656 # set the width and height attribute
1657 ($attr->{width}, $attr->{height}) = imgsize($img_file);
1658
1659 # recreate the tag
1660 print PROD "<img";
1661 print PROD map { qq { $_="$attr->{$_}"} } keys %$attr;
1662 print PROD ">";
1663 }
1664 # is it a font tag
1665 else
1666 {
1667 if (($token->is_start_tag('font')) || ($token->is_end_tag('font')))
1668 {
1669 # remove font tag
1670 print PROD "";
1671 }
1672 else
1673 {
1674 # print without changes
1675 print PROD $token->as_is;
1676 }
1677 }
1678 }
1679 close (PROD) || die("Error Closing File: $tmp_filename $!");
1680
1681 # run html-tidy on the tmp file to make it a proper XML file
1682
1683 my $outhandle = $self->{'outhandle'};
1684 print $outhandle "Converting HTML to be XML compliant:\n";
1685
1686 my $tidy_cmd = "tidy";
1687 $tidy_cmd .= " -q" if ($self->{'verbosity'} <= 2);
1688 $tidy_cmd .= " -raw -wrap 0 -asxml \"$tmp_filename\"";
1689 if ($self->{'verbosity'} <= 2) {
1690 if ($ENV{'GSDLOS'} =~ m/^windows/i) {
1691 $tidy_cmd .= " 2>nul";
1692 }
1693 else {
1694 $tidy_cmd .= " 2>/dev/null";
1695 }
1696 print $outhandle " => $tidy_cmd\n";
1697 }
1698
1699 my $tidyfile = `$tidy_cmd`;
1700
1701 # write result back to the tmp file
1702 open (PROD, ">$tmp_filename") || die("Error Writing to File: $tmp_filename $!");
1703 print PROD $tidyfile;
1704 close (PROD) || die("Error Closing File: $tmp_filename $!");
1705
1706 # return the output filename
1707 return $tmp_filename;
1708}
1709
1710sub associate_cover_image
1711{
1712 my $self = shift(@_);
1713 my ($doc_obj, $filename) = @_;
1714 if (($self->{'use_realistic_book'}) || ($self->{'old_style_HDL'}))
1715 {
1716 # we will have cover image in tidytmp, but want it from import
1717 $filename =~ s/([\\\/])tidytmp([\\\/])/$1import$2/;
1718 }
1719 $self->SUPER::associate_cover_image($doc_obj, $filename);
1720}
1721
1722
17231;