source: gsdl/trunk/perllib/plugins/HTMLPlugin.pm@ 17983

Last change on this file since 17983 was 17983, checked in by ak19, 15 years ago

Wvware inserts a comment at the end of the HTML it generates which contains urls that Greenstone replaces with macros. When the macros are expanded by GS, the wvWare comment breaks and the resulting HTML is ugly. This comment has been removed in order to prevent the page from breaking when the page is served by the Greenstone server.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 51.9 KB
Line 
1###########################################################################
2#
3# HTMLPlugin.pm -- basic html plugin
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 1999 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27#
28# Note that this plugin handles frames only in a very simple way
29# i.e. each frame is treated as a separate document. This means
30# search results will contain links to individual frames rather
31# than linking to the top level frameset.
32# There may also be some problems caused by the _parent target
33# (it's removed by this plugin)
34#
35
36package HTMLPlugin;
37
38use ReadTextFile;
39use HBPlugin;
40use ghtml;
41use unicode;
42use util;
43use XMLParser;
44
45use Image::Size;
46use File::Copy;
47
48sub BEGIN {
49 @HTMLPlugin::ISA = ('ReadTextFile', 'HBPlugin');
50}
51
52use strict; # every perl program should have this!
53no strict 'refs'; # make an exception so we can use variables as filehandles
54
# Option table for this plugin. Each entry describes one command-line
# option: its name, the resource key holding its description, its type
# and (where applicable) its default value.
my $arguments = [
    {   'name' => "process_exp",
        'desc' => "{BasePlugin.process_exp}",
        'type' => "regexp",
        'deft' => get_default_process_exp(),
    },
    {   'name' => "block_exp",
        'desc' => "{BasePlugin.block_exp}",
        'type' => "regexp",
        'deft' => get_default_block_exp(),
    },
    {   'name' => "nolinks",
        'desc' => "{HTMLPlugin.nolinks}",
        'type' => "flag",
    },
    {   'name' => "keep_head",
        'desc' => "{HTMLPlugin.keep_head}",
        'type' => "flag",
    },
    {   'name' => "no_metadata",
        'desc' => "{HTMLPlugin.no_metadata}",
        'type' => "flag",
    },
    {   'name' => "metadata_fields",
        'desc' => "{HTMLPlugin.metadata_fields}",
        'type' => "string",
        'deft' => "Title",
    },
    {   'name' => "hunt_creator_metadata",
        'desc' => "{HTMLPlugin.hunt_creator_metadata}",
        'type' => "flag",
    },
    {   'name' => "file_is_url",
        'desc' => "{HTMLPlugin.file_is_url}",
        'type' => "flag",
    },
    {   'name' => "assoc_files",
        'desc' => "{HTMLPlugin.assoc_files}",
        'type' => "regexp",
        'deft' => get_default_block_exp(),
    },
    {   'name' => "rename_assoc_files",
        'desc' => "{HTMLPlugin.rename_assoc_files}",
        'type' => "flag",
    },
    {   'name' => "title_sub",
        'desc' => "{HTMLPlugin.title_sub}",
        'type' => "string",
        'deft' => "",
    },
    {   'name' => "description_tags",
        'desc' => "{HTMLPlugin.description_tags}",
        'type' => "flag",
    },
    # w3mir is kept only for backward compatibility: it was replaced by
    # file_is_url (see new(), which maps one onto the other). It has no
    # description and is hidden from GLI.
    {   'name' => "w3mir",
        # 'desc' => "{HTMLPlugin.w3mir}",
        'type' => "flag",
        'hiddengli' => "yes",
    },
    {   'name' => "no_strip_metadata_html",
        'desc' => "{HTMLPlugin.no_strip_metadata_html}",
        'type' => "string",
        'deft' => "",
        'reqd' => "no",
    },
    {   'name' => "sectionalise_using_h_tags",
        'desc' => "{HTMLPlugin.sectionalise_using_h_tags}",
        'type' => "flag",
    },
    # NOTE(review): the description key below is "tidy_html" although the
    # option is called use_realistic_book - confirm this is intentional.
    {   'name' => "use_realistic_book",
        'desc' => "{HTMLPlugin.tidy_html}",
        'type' => "flag",
    },
    {   'name' => "old_style_HDL",
        'desc' => "{HTMLPlugin.old_style_HDL}",
        'type' => "flag",
    },
];

# Plugin description record handed to the argument parser by new().
my $options = {
    'name'     => "HTMLPlugin",
    'desc'     => "{HTMLPlugin.desc}",
    'abstract' => "no",
    'inherits' => "yes",
    'args'     => $arguments,
};
124
125
# Reads the body of an HTML file into the scalar referenced by $text.
#
#   $htmlfile - path of the HTML file to read
#   $text     - reference to the scalar that receives the text
#
# The file is first read looking for a <body> tag (everything up to and
# including the tag is dropped by HB_gettext). If no <body> tag is found
# the file is read a second time and all of it is kept.
# If the file cannot be opened an error is printed to STDERR and the sub
# returns without touching $text.
sub HB_read_html_file {
    my $self = shift (@_);
    my ($htmlfile, $text) = @_;

    # load in the file (lexical handle + three-argument open, so special
    # characters in the filename cannot be read as an open mode and no
    # package-global handle is shared with other code)
    my $fh;
    if (!open ($fh, '<', $htmlfile)) {
        print STDERR "ERROR - could not open $htmlfile\n";
        return;
    }

    my $foundbody = 0;
    $self->HB_gettext (\$foundbody, $text, $fh);
    close ($fh);

    # just in case there was no <body> tag
    if (!$foundbody) {
        $foundbody = 1;
        open ($fh, '<', $htmlfile) || return;
        $self->HB_gettext (\$foundbody, $text, $fh);
        close ($fh);
    }
    # text is in utf8
}
149
# Appends the lines read from $handle to the scalar referenced by $text.
#
#   $foundbody - reference to a flag; while it is false, lines are skipped
#                until one contains a <body ...> tag (that line is kept
#                from just after the tag and the flag is set)
#   $text      - reference to the scalar the text is appended to
#   $handle    - filehandle to read from
#
# A warning is printed for every <font face=...> whose face is not arial.
# When the plugin's input_encoding is iso_8859_1 the accumulated text is
# converted to utf8 (ghtml does the same for &eacute; etc.). Finally every
# run of whitespace, newlines included, is squeezed to a single space.
sub HB_gettext {
    my ($self, $foundbody, $text, $handle) = @_;

    while (defined (my $current = <$handle>)) {
        # still hunting for the body tag?
        unless ($$foundbody) {
            next unless $current =~ s/^.*<body[^>]*>//i;
            $$foundbody = 1;
        }

        # check for symbol fonts
        if ($current =~ m/<font [^>]*?face\s*=\s*\"?(\w+)\"?/i) {
            my $face = $1;
            if ($face !~ m/^arial$/i) {
                print STDERR "HBPlug::HB_gettext - warning removed font $face\n";
            }
        }

        $$text .= $current;
    }

    if ($self->{'input_encoding'} eq "iso_8859_1") {
        # convert to utf-8
        $$text = &unicode::unicode2utf8(&unicode::convert2unicode("iso_8859_1", $text));
    }
    # convert any alphanumeric character entities to their utf-8
    # equivalent for indexing purposes
    #&ghtml::convertcharentities ($$text);

    $$text =~ s/\s+/ /g; # remove \n's
}
186
# Cleans up the HTML of one section and returns the cleaned text.
#
#   $section - the section's HTML
#
# Steps: drop leading closing tags that have no opening tag before them,
# collapse runs of <p> tags, strip dangling markup/whitespace from the end,
# put a blank line before each paragraph, wrap lines at roughly 80
# characters, and rewrite image references as centred <img> tags.
sub HB_clean_section {
    my $self = shift (@_);
    my ($section) = @_;

    # remove tags without a starting tag from the section
    my ($tag, $tagstart);
    while ($section =~ m/<\/([^>]{1,10})>/) {
        $tag = $1;
        $tagstart = index($section, "<$tag");
        last if (($tagstart >= 0) && ($tagstart < index($section, "<\/$tag")));
        # \Q...\E: the tag name is matched literally. Without it a name
        # containing regex metacharacters (e.g. "x+") would never match
        # and this loop would spin forever, or fail to compile.
        $section =~ s/<\/\Q$tag\E>//;
    }

    # remove extra paragraph tags
    while ($section =~ s/<p\b[^>]*>\s*<p\b/<p/ig) {}

    # remove extra stuff at the end of the section
    while ($section =~ s/(<u>|<i>|<b>|<p\b[^>]*>|&nbsp;|\s)$//i) {}

    # add a newline at the beginning of each paragraph
    $section =~ s/(.)\s*<p\b/$1\n\n<p/gi;

    # add a newline every 80 characters at a word boundary
    # Note: this regular expression puts a line feed before
    # the last word in each section, even when it is not
    # needed.
    $section =~ s/(.{1,80})\s/$1\n/g;

    # fix up the image links (the replacement starts on a new line)
    $section =~ s/<img[^>]*?src=\"?([^\">]+)\"?[^>]*>/\n <center><img src=\"$1\" \/><\/center><br\/>/ig;
    $section =~ s/&lt;&lt;I&gt;&gt;\s*([^\.]+\.(png|jpg|gif))/\n <center><img src=\"$1\" \/><\/center><br\/>/ig;

    return $section;
}
223
# Converts a file in the old HDL format (sections marked in the text with
# &lt;&lt;TOCn&gt;&gt;) to the new HDL format, in which sections are
# described by <Section> tags inside HTML comments.
#
#   $file   - path of the old-style input file
#   $cnfile - path of the file to write the converted HTML to
#
# Returns $cnfile. Dies if the output file cannot be opened or closed.
sub convert_to_newHDLformat
{
    my $self = shift (@_);
    my ($file,$cnfile) = @_;
    my $input_filename = $file;
    my $tmp_filename = $cnfile;

    # read in the file and do basic html cleaning (removing header etc).
    # This is done before the output is opened so that the input is never
    # truncated, even if both names refer to the same file.
    my $html = "";
    $self->HB_read_html_file ($input_filename, \$html);

    # write HTML tmp file with new HDL format (lexical handle and
    # three-argument open: no shared global handle, no mode injection)
    open (my $prod, '>', $tmp_filename) || die("Error Writing to File: $tmp_filename $!");

    # process the file one section at a time
    my $curtoclevel = 1;
    my $firstsection = 1;
    my $toclevel = 0;
    while (length ($html) > 0) {
        if ($html =~ s/^.*?(?:<p\b[^>]*>)?((<b>|<i>|<u>|\s)*)&lt;&lt;TOC(\d+)&gt;&gt;\s*(.*?)<p\b/<p/i) {
            $toclevel = $3;
            my $title = $4;
            my $sectiontext = "";
            # the section runs up to the next TOC marker, or to the end
            if ($html =~ s/^(.*?)((?:<p\b[^>]*>)?((<b>|<i>|<u>|\s)*)&lt;&lt;TOC\d+&gt;&gt;)/$2/i) {
                $sectiontext = $1;
            } else {
                $sectiontext = $html;
                $html = "";
            }

            # remove tags and extra spaces from the title
            $title =~ s/<\/?[^>]+>//g;
            $title =~ s/^\s+|\s+$//g;

            # close any sections below the current level and
            # create a new section (special case for the firstsection)
            print $prod "<!--\n";
            while (($curtoclevel > $toclevel) ||
                   (!$firstsection && $curtoclevel == $toclevel)) {
                $curtoclevel--;
                print $prod "</Section>\n";
            }
            if ($curtoclevel+1 < $toclevel) {
                print STDERR "WARNING - jump in toc levels in $input_filename " .
                    "from $curtoclevel to $toclevel\n";
            }
            while ($curtoclevel < $toclevel) {
                $curtoclevel++;
            }

            if ($curtoclevel == 1) {
                # add the header tag
                print $prod "-->\n";
                print $prod "<HTML>\n<HEAD>\n<TITLE>$title</TITLE>\n</HEAD>\n<BODY>\n";
                print $prod "<!--\n";
            }

            print $prod "<Section>\n\t<Description>\n\t\t<Metadata name=\"Title\">$title</Metadata>\n\t</Description>\n";

            print $prod "-->\n";

            # clean up the section html
            $sectiontext = $self->HB_clean_section($sectiontext);

            print $prod "$sectiontext\n";

        } else {
            print STDERR "WARNING - leftover text\n" , $self->shorten($html),
                "\nin $input_filename\n";
            last;
        }
        $firstsection = 0;
    }

    # close every section that is still open
    print $prod "<!--\n";
    while ($curtoclevel > 0) {
        $curtoclevel--;
        print $prod "</Section>\n";
    }
    print $prod "-->\n";

    close ($prod) || die("Error Closing File: $tmp_filename $!");

    return $tmp_filename;
}
310
# Returns $text wrapped in double quotes for use in a message. Text of
# 100 characters or more is abbreviated to its first and last 50
# characters, each quoted and joined by " ... ".
sub shorten {
    my ($self, $text) = @_;

    my $len = length($text);
    if ($len >= 100) {
        my $head = substr ($text, 0, 50);
        my $tail = substr ($text, $len - 50);
        return "\"" . $head . "\" ... \"" . $tail . "\"";
    }

    return "\"$text\"";
}
320
# Produces a working copy of $file under the collection's "tidytmp"
# directory and returns the path of that copy.
#
#   $file - path of the input file (a directory is returned unchanged)
#
# For .htm/.html/.shtml files the copy is first converted from the old
# HDL format (when old_style_HDL is set) and then tidied (when
# use_realistic_book is set); the other plain files of the input directory
# are copied alongside it. Any other file is simply copied if no copy
# exists yet. Dies if a directory cannot be read or a file cannot be copied.
sub convert_tidy_or_oldHDL_file
{
    my $self = shift (@_);
    my ($file) = @_;
    my $input_filename = $file;

    if (-d $input_filename)
    {
        return $input_filename;
    }

    # get the input filename
    my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
    my $base_dirname = $dirname;
    $suffix = lc($suffix);

    # derive tmp filename from input filename
    # Remove any white space from filename -- no risk of name collision, and
    # makes later conversion by utils simpler. Leave spaces in path...
    # tidy up the filename with space, dot, hyphen between
    $tailname =~ s/\s+//g;
    $tailname =~ s/\.+//g;
    $tailname =~ s/\-+//g;
    # convert to utf-8 otherwise we have problems with the doc.xml file
    # later on
    &unicode::ensure_utf8(\$tailname);

    # softlink to collection tmp dir
    my $tmp_dirname = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "tidytmp");
    &util::mk_dir($tmp_dirname) if (!-e $tmp_dirname);

    my $test_dirname = "";
    my $f_separator = &util::get_os_dirsep();

    # mirror the sub-directories found below "import" inside tidytmp
    if ($dirname =~ m/import$f_separator/)
    {
        $test_dirname = $'; #'

        while ($test_dirname =~ m/[$f_separator]/)
        {
            # grab both halves of the match before calling anything else
            my ($folderdirname, $remainder) = ($`, $'); #'
            $tmp_dirname = &util::filename_cat($tmp_dirname,$folderdirname);
            &util::mk_dir($tmp_dirname) if (!-e $tmp_dirname);
            $test_dirname = $remainder;
        }
    }

    my $tmp_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix");

    # tidy or convert the input file if it is a HTML-like file or it is accepted by the process_exp
    if (($suffix eq ".htm") || ($suffix eq ".html") || ($suffix eq ".shtml"))
    {
        # convert the input file to a new style HDL
        my $hdl_output_filename = $input_filename;
        if ($self->{'old_style_HDL'})
        {
            $hdl_output_filename = $self->convert_to_newHDLformat($input_filename,$tmp_filename);
        }

        # copy every other file of the base dir to the tmp dir if it does
        # not exist there yet
        opendir(my $dir_handle, $base_dirname) or die "Can't open base directory : $base_dirname!";
        my @files = grep {!/^\.+$/} readdir($dir_handle);
        # a handle from opendir must be released with closedir; close()
        # does not work on directory handles
        closedir($dir_handle);

        foreach my $file (@files)
        {
            my $src_file = &util::filename_cat($base_dirname,$file);
            my $dest_file = &util::filename_cat($tmp_dirname,$file);
            if ((!-e $dest_file) && (!-d $src_file))
            {
                # just copy the original file back to the tmp directory
                copy($src_file,$dest_file) or die "Can't copy file $src_file to $dest_file $!";
            }
        }

        # tidy the input file
        my $tidy_output_filename = $hdl_output_filename;
        if ($self->{'use_realistic_book'})
        {
            $tidy_output_filename = $self->tmp_tidy_file($hdl_output_filename,$tmp_filename);
        }
        $tmp_filename = $tidy_output_filename;
    }
    else
    {
        if (!-e $tmp_filename)
        {
            # just copy the original file back to the tmp directory
            copy($input_filename,$tmp_filename) or die "Can't copy file $input_filename to $tmp_filename $!";
        }
    }

    return $tmp_filename;
}
419
420
# Will make the html input file as a proper XML file with removed font tag and
# image size added to the img tag.
# The tidying process takes place in a collection specific 'tmp' directory so
# that we don't accidentally damage the input.
#
#   $file   - path of the HTML file to read
#   $cnfile - path of the file to write the result to (may be the same
#             file as $file)
#
# Returns $cnfile. Dies if the input cannot be parsed or the output cannot
# be written. If the external "tidy" program produces no output, a warning
# is printed and the un-tidied (but font-stripped) file is left in place.
sub tmp_tidy_file
{
    my $self = shift (@_);
    my ($file,$cnfile) = @_;
    my $input_filename = $file;
    my $tmp_filename = $cnfile;

    # get the input filename
    my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");

    require HTML::TokeParser::Simple;

    # create HTML parser to decode the input file
    my $parser = HTML::TokeParser::Simple->new($input_filename)
        or die("Error Reading File: $input_filename $!");

    # Build the rewritten HTML in memory first. The parser reads its file
    # lazily, so opening the output for writing before parsing is finished
    # would truncate the input whenever both names refer to the same file.
    my $rewritten = "";
    while (my $token = $parser->get_token())
    {
        if ($token->is_start_tag('img'))
        {
            # get the attributes
            my $attr = $token->return_attr;

            # get the full path to the image
            my $img_file = &util::filename_cat($dirname,$attr->{src});

            # set the width and height attribute
            ($attr->{width}, $attr->{height}) = imgsize($img_file);

            # recreate the tag
            $rewritten .= "<img";
            $rewritten .= join("", map { qq{ $_="$attr->{$_}"} } keys %$attr);
            $rewritten .= ">";
        }
        elsif (($token->is_start_tag('font')) || ($token->is_end_tag('font')))
        {
            # remove font tag: emit nothing
        }
        else
        {
            # keep without changes
            $rewritten .= $token->as_is;
        }
    }

    # write HTML tmp file without the font tag and with image sizes added
    open (my $prod, '>', $tmp_filename) || die("Error Writing to File: $tmp_filename $!");
    print $prod $rewritten;
    close ($prod) || die("Error Closing File: $tmp_filename $!");

    # run html-tidy on the tmp file to make it a proper XML file
    # NOTE(review): the filename is interpolated into a shell command; a
    # name containing a double quote or shell metacharacters would break it.
    my $tidyfile = `tidy -utf8 -wrap 0 -asxml "$tmp_filename"`;

    # If tidy is missing or failed outright there is nothing to write back;
    # overwriting the file with nothing would lose the document.
    if (!defined $tidyfile || $tidyfile !~ m/\S/)
    {
        print STDERR "WARNING - tidy produced no output for $tmp_filename, file left untidied\n";
        return $tmp_filename;
    }

    # write result back to the tmp file
    open ($prod, '>', $tmp_filename) || die("Error Writing to File: $tmp_filename $!");
    print $prod $tidyfile;
    close ($prod) || die("Error Closing File: $tmp_filename $!");

    # return the output filename
    return $tmp_filename;
}
489
# Wraps the parent class's read_into_doc_obj. When use_realistic_book or
# old_style_HDL is set, the file is first converted/tidied into the
# collection's tmp area and the parent is pointed at that copy instead
# (description_tags is switched on because such documents are
# sectionalised). All arguments are passed through to the parent.
#
# Returns the ($process_status, $doc_obj) pair produced by the parent.
sub read_into_doc_obj
{
    my ($self, $pluginfo, $base_dir, $file, $block_hash, $metadata,
        $processor, $maxdocs, $total_count, $gli) = @_;

    # split up the name of the input file
    my $source_path = $file;
    my ($source_tail, $source_dir, $source_suffix)
        = &File::Basename::fileparse($source_path, "\\.[^\\.]+\$");
    $source_suffix = lc($source_suffix);

    my $needs_conversion = $self->{'use_realistic_book'} || $self->{'old_style_HDL'};
    if ($needs_conversion)
    {
        # because the document has to be sectionalized set the description tags
        $self->{'description_tags'} = 1;

        # work out the full path of the file to be tidied
        if ($base_dir =~ m/\w/)
        {
            $source_path = &util::filename_cat($base_dir,$file);
        }

        # get the tidied file
        my $converted_path = $self->convert_tidy_or_oldHDL_file($source_path);

        # from here on the tidied copy is the input file, and its
        # directory is the base directory
        my ($tailname, $dirname, $suffix)
            = &File::Basename::fileparse($converted_path, "\\.[^\\.]+\$");
        $file = "$tailname$suffix";
        $base_dir = $dirname;
    }

    # hand over to the parent read_into_doc_obj
    my ($process_status,$doc_obj) = $self->SUPER::read_into_doc_obj(
        $pluginfo, $base_dir, $file, $block_hash, $metadata,
        $processor, $maxdocs, $total_count, $gli);

    return ($process_status,$doc_obj);
}
525
# Constructor.
#
#   $pluginlist      - array ref of plugin class names; this class is appended
#   $inputargs       - passed through to the ReadTextFile constructor
#   $hashArgOptLists - hash ref whose "ArgList" and "OptList" arrays receive
#                      this plugin's argument and option descriptions
#
# Returns the new object blessed into $class.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
    push(@{$hashArgOptLists->{"OptList"}},$options);

    # Explicit class-method call: the indirect-object form "new Class(...)"
    # is ambiguous inside a package that defines its own new().
    my $self = ReadTextFile->new($pluginlist,$inputargs,$hashArgOptLists);

    # w3mir is the old name of the file_is_url option
    if ($self->{'w3mir'}) {
        $self->{'file_is_url'} = 1;
    }
    $self->{'aux_files'} = {};
    $self->{'dir_num'} = 0;
    $self->{'file_num'} = 0;

    return bless $self, $class;
}
546
# Default value of the block_exp (and assoc_files) option: the empty
# string.
# A candidate pattern, should one be wanted, is
#   (?i)\.(gif|jpe?g|jpe|png|css|js(?:@.*)?)$
# which also covers e.g. <script language="javascript" src="img/lib.js@123">
sub get_default_block_exp {
    my ($self) = @_;

    # previously: q^(?i)\.(gif|jpe?g|jpe|jpg|png|css)$^
    return "";
}
555
# Default value of the process_exp option: a case-insensitive pattern
# matching the usual HTML-like file extensions. Its last alternative is
# an attempt to encode the concept of an html query (name?key=value).
sub get_default_process_exp {
    my ($self) = @_;

    my $html_like = q{(?i)(\.html?|\.shtml|\.shm|\.asp|\.php\d?|\.cgi|.+\?.+=.*)$};
    return $html_like;
}
562
# Scans an HTML file for the files it refers to (images, image maps,
# stylesheet links, embeds, backgrounds and scripts) and records each one
# in $block_hash->{'file_blocks'}.
#
#   $filename_full_path - full path of the HTML file
#   $block_hash         - hash ref; its 'file_blocks' hash gets one key per
#                         referenced file
#
# It also fills $self->{'utf8_to_original_filename'}, which maps the
# decoded (utf8) form of each link to the filename it refers to.
sub store_block_files
{
    my ($self, $filename_full_path, $block_hash) = @_;

    my ($language, $encoding) = $self->textcat_get_language_encoding ($filename_full_path);

    # read in the file without decoding it
    my $raw_text = "";
    $self->read_file_no_decoding ($filename_full_path, \$raw_text);

    # throw away comments (plain or entity-encoded)
    my $opencom = '(?:<!--|&lt;!(?:&mdash;|&#151;|--))';
    my $closecom = '(?:-->|(?:&mdash;|&#151;|--)&gt;)';
    $raw_text =~ s/$opencom(.*?)$closecom//gs;

    # an attribute value: double-quoted, or a run of non-blank characters
    my $attval = '\"[^\"]+\"|[^\s>]+';

    # collect the attribute values of interest, one tag/attribute pair at a time
    my @wanted = (
        [ 'img',                   'src' ],
        [ 'img',                   'usemap' ],
        [ 'link',                  'href' ],
        [ 'embed',                 'src' ],
        [ '(?:body|table|tr|td)',  'background' ],
        [ 'script',                'src' ],
    );
    my @found_links;
    foreach my $pair (@wanted) {
        my ($tag_pat, $attr_pat) = @$pair;
        push @found_links,
            ($raw_text =~ m/<(?:$tag_pat)[^>]*?(?:$attr_pat)\s*=\s*($attval)[^>]*>/igs);
    }

    if (!defined $self->{'utf8_to_original_filename'}) {
        # maps from utf8 converted link name -> original filename referred to by (possibly URL-encoded) src url
        $self->{'utf8_to_original_filename'} = {};
    }

    foreach my $link (@found_links) {

        # remove quotes from link at start and end if necessary
        if ($link =~ /^\"/) {
            $link =~ s/^\"//;
            $link =~ s/\"$//;
        }

        # remove any anchor names, e.g. foo.html#name becomes foo.html
        $link =~ s/\#.*$//s;
        # some links may just be anchor names
        next unless ($link =~ /\S+/);

        my $is_absolute = ($link =~ m@^/@) || ($link =~ m/^([A-Z]:?)\\/);
        if (!$is_absolute) {
            # Turn relative file path into full path
            my $dirname = &File::Basename::dirname($filename_full_path);
            $link = &util::filename_cat($dirname, $link);
        }
        $link = $self->eval_dir_dots($link);

        # this is the actual filename on the filesystem (that the link refers to)
        my $url_original_filename = $self->opt_url_decode($link);

        # Convert the link into its utf8 version and store it along with
        # the url_original_filename
        my $utf8_link = "";
        $self->decode_text($link,$encoding,$language,\$utf8_link);

        $self->{'utf8_to_original_filename'}->{$utf8_link} = $url_original_filename;

        if ($url_original_filename ne $utf8_link) {
            my $outhandle = $self->{'outhandle'};

            print $outhandle "URL Encoding $url_original_filename\n";
            print $outhandle " ->$utf8_link\n";
        }

        $block_hash->{'file_blocks'}->{$url_original_filename} = 1;
    }
}
634
# Given a (possibly URL-encoded) filename, returns the name of the file it
# refers to. The name is URL-decoded only when it contains a %XX escape
# and no file with the still-encoded name exists, because a filename may
# itself legitimately contain %XX.
sub opt_url_decode {
    my ($self, $link) = @_;

    # nothing that looks like a %XX escape: leave the name alone
    return $link unless $link =~ m/\%[A-F0-9]{2}/i;

    # a file with the encoded name exists, so that is the original name
    return $link if -e $link;

    my $decoded = &unicode::url_decode($link);
    return $decoded;
}
655
656
# do plugin specific processing of doc_obj
#
#   $textref  - reference to the text of the HTML document (modified here)
#   $pluginfo - not used by this sub
#   $base_dir - base directory of the file
#   $file     - file name relative to $base_dir
#   $metadata - metadata passed on to extract_metadata
#   $doc_obj  - document object that receives text, sections and metadata
#   $gli      - not used by this sub
#
# The document is stored either as a single section, or - when
# description_tags is in effect - as the sections described by the
# <Section> tags found inside its HTML comments. Always returns 1.
sub process {
    my $self = shift (@_);
    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
    my $outhandle = $self->{'outhandle'};

    if ($ENV{'GSDLOS'} =~ m/^windows/i) {
        # this makes life so much easier... perl can cope with unix-style '/'s.
        $base_dir =~ s@(\\)+@/@g;
        $file =~ s@(\\)+@/@g;
    }

    # Any document converted by wvWare into HTML tends to have a certain comment inserted
    # by wvware at the end of the HTML it generated. This comment contains urls that
    # Greenstone replaces with macros, which then break the comment when the macros are
    # expanded. The result is that the HTML displayed in the Greenstone browser is ugly.
    # Therefore, we remove wvWare's insertion here.
    # The pattern may not run across a "-->", so that only the wvWare
    # comment itself is removed and not everything from the first comment
    # of the document up to it.
    my $within_comment = '(?:(?!-->).)*?';
    $$textref =~ s/<!--\n${within_comment}Document created with${within_comment}wvware${within_comment}-->//s;

    # reset per-doc stuff...
    $self->{'aux_files'} = {};
    $self->{'dir_num'} = 0;
    $self->{'file_num'} = 0;

    # process an HTML file where sections are divided by headings tags (H1, H2 ...)
    # you can also include metadata in the format (X can be any number)
    # <hX>Title<!--gsdl-metadata
    #   <Metadata name="name1">value1</Metadata>
    #   ...
    #   <Metadata name="nameN">valueN</Metadata>
    # --></hX>
    if ($self->{'sectionalise_using_h_tags'}) {
        # description_tags should always be activated because we convert headings to description tags
        $self->{'description_tags'} = 1;

        my $arrSections = [];
        $$textref =~ s/<h([0-9]+)[^>]*>(.*?)<\/h[0-9]+>/$self->process_heading($1, $2, $arrSections, $file)/isge;

        if (scalar(@$arrSections)) {
            # close the sections that are still open, just before </body>
            my $strMetadata = $self->update_section_data($arrSections, -1);
            if (length($strMetadata)) {
                $strMetadata = '<!--' . $strMetadata . "\n-->\n</body>";
                $$textref =~ s/<\/body>/$strMetadata/ig;
            }
        }
    }

    my $cursection = $doc_obj->get_top_section();

    $self->extract_metadata ($textref, $metadata, $doc_obj, $cursection)
        unless $self->{'no_metadata'} || $self->{'description_tags'};

    # Store URL for page as metadata - this can be used for an
    # altavista style search interface. The URL won't be valid
    # unless the file structure contains the domain name (i.e.
    # like when w3mir is used to download a website).

    # URL metadata (even invalid ones) are used to support internal
    # links, so even if 'file_is_url' is off, still need to store info

    my ($tailname,$dirname,$suffix) = &File::Basename::fileparse($file, "\\.[^\\.]+\$");
    my $utf8_file = $self->filename_to_utf8_metadata($file);
    my $web_url = "http://";
    if (defined $dirname) { # local directory
        $dirname = $self->eval_dir_dots($dirname);
        $dirname .= "/" if $dirname ne ""; # if there's a directory, it should end on "/"
        $web_url = $web_url.$dirname.$utf8_file;
    } else {
        $web_url = $web_url.$utf8_file;
    }

    $doc_obj->add_utf8_metadata($cursection, "URL", $web_url);

    if ($self->{'file_is_url'}) {
        $doc_obj->add_metadata($cursection, "weblink", "<a href=\"$web_url\">");
        $doc_obj->add_metadata($cursection, "webicon", "_iconworld_");
        $doc_obj->add_metadata($cursection, "/weblink", "</a>");
    }

    if ($self->{'description_tags'}) {
        # remove the html header - note that doing this here means any
        # sections defined within the header will be lost (so all <Section>
        # tags must appear within the body of the HTML)
        my ($head_keep) = ($$textref =~ m/^(.*?)<body[^>]*>/is);
        # no <body> tag means no header to keep
        $head_keep = "" unless defined $head_keep;

        $$textref =~ s/^.*?<body[^>]*>//is;
        $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;

        my $opencom = '(?:<!--|&lt;!(?:&mdash;|&#151;|--))';
        my $closecom = '(?:-->|(?:&mdash;|&#151;|--)&gt;)';

        my $lt = '(?:<|&lt;)';
        my $gt = '(?:>|&gt;)';
        my $quot = '(?:"|&quot;|&rdquo;|&ldquo;)';

        # comma separated list of metadata names -> regex alternation
        my $dont_strip = '';
        if ($self->{'no_strip_metadata_html'}) {
            ($dont_strip = $self->{'no_strip_metadata_html'}) =~ s{,}{|}g;
        }

        my $found_something = 0; my $top = 1;
        while ($$textref =~ s/^(.*?)$opencom(.*?)$closecom//s) {
            my $text = $1;
            my $comment = $2;
            if (defined $text) {
                # text before a comment - note that getting to here
                # doesn't necessarily mean there are Section tags in
                # the document
                $self->process_section(\$text, $base_dir, $file, $doc_obj, $cursection);
            }
            while ($comment =~ s/$lt(.*?)$gt//s) {
                my $tag = $1;
                if ($tag eq "Section") {
                    $found_something = 1;
                    $cursection = $doc_obj->insert_section($doc_obj->get_end_child($cursection)) unless $top;
                    $top = 0;
                } elsif ($tag eq "/Section") {
                    $found_something = 1;
                    $cursection = $doc_obj->get_parent_section ($cursection);
                } elsif ($tag =~ m/^Metadata name=$quot(.*?)$quot/s) {
                    my $metaname = $1;
                    my $accumulate = $tag =~ m/mode=${quot}accumulate${quot}/ ? 1 : 0;
                    # Only trust $1 when the closing tag was really found;
                    # otherwise it still holds an earlier capture. A
                    # Metadata element without closing tag gets an empty value.
                    my $metavalue = "";
                    if ($comment =~ s/^(.*?)$lt\/Metadata$gt//s) {
                        $metavalue = $1;
                    }
                    $metavalue =~ s/^\s+//;
                    $metavalue =~ s/\s+$//;
                    # assume that no metadata value intentionally includes
                    # carriage returns or HTML tags (if they're there they
                    # were probably introduced when converting to HTML from
                    # some other format).
                    # actually some people want to have html tags in their
                    # metadata.
                    $metavalue =~ s/[\cJ\cM]/ /sg;
                    $metavalue =~ s/<[^>]+>//sg
                        unless $dont_strip && ($dont_strip eq 'all' || $metaname =~ m/^($dont_strip)$/);
                    $metavalue =~ s/\s+/ /sg;
                    if ($accumulate) {
                        $doc_obj->add_utf8_metadata($cursection, $metaname, $metavalue);
                    } else {
                        $doc_obj->set_utf8_metadata_element($cursection, $metaname, $metavalue);
                    }
                } elsif ($tag eq "Description" || $tag eq "/Description") {
                    # do nothing with containing Description tags
                } else {
                    # simple HTML tag (probably created by the conversion
                    # to HTML from some other format) - we'll ignore it and
                    # hope for the best ;-)
                }
            }
        }
        if ($cursection ne "") {
            print $outhandle "HTMLPlugin: WARNING: $file contains unmatched <Section></Section> tags\n";
        }

        $$textref =~ s/^.*?<body[^>]*>//is;
        $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
        if ($$textref =~ m/\S/) {
            if (!$found_something) {
                if ($self->{'verbosity'} > 2) {
                    print $outhandle "HTMLPlugin: WARNING: $file appears to contain no Section tags so\n";
                    print $outhandle " will be processed as a single section document\n";
                }

                # go ahead and process single-section document
                $self->process_section($textref, $base_dir, $file, $doc_obj, $cursection);

                # if document contains no Section tags we'll go ahead
                # and extract metadata (this won't have been done
                # above as the -description_tags option prevents it)
                my $complete_text = $head_keep.$doc_obj->get_text($cursection);
                $self->extract_metadata (\$complete_text, $metadata, $doc_obj, $cursection)
                    unless $self->{'no_metadata'};

            } else {
                print $outhandle "HTMLPlugin: WARNING: $file contains the following text outside\n";
                print $outhandle " of the final closing </Section> tag. This text will\n";
                print $outhandle " be ignored.";

                my ($text);
                if (length($$textref) > 30) {
                    $text = substr($$textref, 0, 30) . "...";
                } else {
                    $text = $$textref;
                }
                $text =~ s/\n/ /isg;
                print $outhandle " ($text)\n";
            }
        } elsif (!$found_something) {

            if ($self->{'verbosity'} > 2) {
                # may get to here if document contained no valid Section
                # tags but did contain some comments. The text will have
                # been processed already but we should print the warning
                # as above and extract metadata
                print $outhandle "HTMLPlugin: WARNING: $file appears to contain no Section tags and\n";
                print $outhandle " is blank or empty. Metadata will be assigned if present.\n";
            }

            my $complete_text = $head_keep.$doc_obj->get_text($cursection);
            $self->extract_metadata (\$complete_text, $metadata, $doc_obj, $cursection)
                unless $self->{'no_metadata'};
        }

    } else {

        # remove header and footer
        if (!$self->{'keep_head'} || $self->{'description_tags'}) {
            $$textref =~ s/^.*?<body[^>]*>//is;
            $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
        }

        # single section document
        $self->process_section($textref, $base_dir, $file, $doc_obj, $cursection);
    }
    return 1;
}
873
874
# Turns one HTML heading into the comment that opens a new section.
#
#   $nHeadNo        - heading level (the X of <hX>)
#   $strHeadingText - text found between the heading tags; any
#                     <!--gsdl-metadata ... --> comments inside it are
#                     moved into the section description
#   $arrSections    - array ref holding the levels of the open sections
#   $file           - not used by this sub
#
# Returns an HTML comment that closes the sections ended by this heading
# (as reported by update_section_data) and opens a <Section> whose
# <Description> carries the heading text as its Title.
sub process_heading
{
    my ($self, $nHeadNo, $strHeadingText, $arrSections, $file) = @_;
    $strHeadingText = '' unless defined($strHeadingText);

    my $strClosing = $self->update_section_data($arrSections, int($nHeadNo));

    # pull every embedded metadata comment out of the heading text
    my @arrEmbedded;
    while ($strHeadingText =~ s/<!--gsdl-metadata(.*?)-->//is)
    {
        push @arrEmbedded, $1;
    }
    my $strSecMetadata = join('', @arrEmbedded);

    # trim both strings
    foreach my $strPart ($strHeadingText, $strSecMetadata)
    {
        $strPart =~ s/^\s+//g;
        $strPart =~ s/\s+$//g;
    }

    my @arrPieces = (
        $strClosing,
        "\n<Section>\n\t<Description>\n\t\t<Metadata name=\"Title\">",
        $strHeadingText,
        "</Metadata>\n",
    );
    if (length($strSecMetadata)) {
        push @arrPieces, "\t\t", $strSecMetadata, "\n";
    }
    push @arrPieces, "\t</Description>\n";

    return "<!--" . join('', @arrPieces) . "-->";
}
903
904
# Maintains the stack of open section levels and reports which sections
# a new heading closes.
#
#   $arrSections - array ref used as the stack of open heading levels
#                  (modified in place)
#   $nCurTocNo   - level of the heading being opened
#
# Returns one "\n</Section>" for every section that has to be closed
# before the new one starts (an empty string if there is none). The new
# level is always pushed onto the stack.
sub update_section_data
{
    my ($self, $arrSections, $nCurTocNo) = @_;

    # first heading, or a heading deeper than the last one: nothing to close
    my $bDeeper = !@$arrSections || $nCurTocNo > $arrSections->[-1];
    if ($bDeeper) {
        push @$arrSections, $nCurTocNo;
        return '';
    }

    # every open section at the same or a deeper level gets closed; one
    # entry is taken off the end of the stack for each of them
    my $nClosed = grep { $nCurTocNo <= $_ } @$arrSections;
    splice(@$arrSections, -$nClosed) if $nClosed;

    push @$arrSections, $nCurTocNo;
    return "\n</Section>" x $nClosed;
}
928
929
# Rewrite the links and image references inside a chunk of HTML and hand the
# result to the document object.
#
# This may be called more than once for the same section; that relies on
# add_utf8_text appending to any text the section already holds.
#
# The text behind $textref is modified in place.
sub process_section {
    my ($self, $textref, $base_dir, $file, $doc_obj, $cursection) = @_;

    # trap links (unless the plugin was told not to)
    unless ($self->{'nolinks'}) {
        # usemap="./#index" is not handled correctly => change to "#index"
        $$textref =~ s/(<img[^>]*?usemap\s*=\s*)((?:[\"][^\"]+[\"])|(?:[\'][^\']+[\'])|(?:[^\s\/>]+))([^>]*>)/
            $self->replace_usemap_links($1, $2, $3)/isge;

        # href/src attributes of a, area, frame, link and script elements
        $$textref =~ s/(<(?:a|area|frame|link|script)\s+[^>]*?\s*(?:href|src)\s*=\s*)((?:[\"][^\"]+[\"])|(?:[\'][^\']+[\'])|(?:[^\s\/>]+))([^>]*>)/
            $self->replace_href_links ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
    }

    # trap images
    #
    # History: HTMLPlugin used to wrap every <img> in an <a href="image">
    # anchor by default, to work around ordinary text after an image turning
    # into a link.  Because images often need to link to another page rather
    # than to their own source, the no_image_links option was introduced to
    # switch that wrapping off.  The original text-becomes-link problem no
    # longer occurs, so the wrapping (here and in replace_images) is disabled
    # and images are rewritten unconditionally.  If anchors around images are
    # ever wanted again it should be an explicit HTMLPlugin option, and
    # no_image_links should eventually be deprecated since its behaviour is
    # now the default.
    $$textref =~ s/(<(?:img|embed|table|tr|td)[^>]*?(?:src|background)\s*=\s*)((?:[\"][^\"]+[\"])|(?:[\'][^\']+[\'])|(?:[^\s\/>]+))([^>]*>)/
        $self->replace_images ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;

    # Double every backslash so the rest of greenstone doesn't take it for
    # the start of an escape code (macro parsing would otherwise lose them).
    $$textref =~ s/\\/\\\\/go;

    # add text to document object
    $doc_obj->add_utf8_text($cursection, $$textref);
}
983
# Rewrite one image-style attribute value (src/background).
#
#   $front / $back - the markup before and after the attribute value
#   $link          - the attribute value, possibly still quoted
#
# The link is normalised with format_link and passed to add_file; the string
# add_file returns takes the place of the original value.  Returns the
# reassembled tag text.
sub replace_images {
    my ($self, $front, $link, $back, $base_dir, $file, $doc_obj, $section) = @_;

    # A quoted value loses its own quotes; double quotes are put back onto
    # the surrounding markup instead.
    my $was_quoted = ($link=~/^[\"\']/);
    if ($was_quoted) {
        $link=~s/^[\"\']//;
        $link=~s/[\"\']$//;
        $front.='"';
        $back="\"$back";
    }

    $link =~ s/\n/ /g;

    # Hack to overcome a Windows wv 0.7.1 bug that breaks embedded images:
    # when the Word file path has spaces in it, wv emits absolute image paths
    # without the "file://" prefix.  In that case keep only the file name.
    my $from_windows_word = $ENV{'GSDLOS'} =~ m/^windows/i && $self->{'plugin_type'} eq "WordPlug";
    if ($from_windows_word && $link =~ m/^[A-Za-z]\:\\/) {
        $link =~ s/^.*\\([^\\]+)$/$1/;
    }

    my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
    my $img_file = $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section);

    # The image used to be wrapped in an <a href> anchor here; see the note
    # in process_section for why that is no longer done.
    return $front . $img_file .$back;
}
1027
# Rewrite one href/src attribute value of a link-like element.
#
#   $front / $back - the markup before and after the attribute value
#   $link          - the attribute value, possibly still quoted
#
# Returns the reassembled tag text.  Depending on the link this is the tag
# unchanged (apart from target clean-up), an "_httpextlink_" reference, or
# whatever add_file returns for a file that should be associated.
sub replace_href_links {
    my ($self, $front, $link, $back, $base_dir, $file, $doc_obj, $section) = @_;

    # A quoted value loses its own quotes; double quotes are put back onto
    # the surrounding markup instead.
    if ($link=~/^[\"\']/) {
        $link=~s/^[\"\']//;
        $link=~s/[\"\']$//;
        $front.='"';
        $back="\"$back";
    }

    # Attempt to sort out targets - frames are not handled well in this
    # plugin and some cases (e.g. the _parent target) break things: _top is
    # renamed and _parent is dropped, on both sides of the attribute value.
    foreach my $markup (\$front, \$back) {
        $$markup =~ s/(target=\"?)_top(\"?)/$1_gsdltop_$2/is;
        $$markup =~ s/target=\"?_parent\"?//is;
    }

    # links within the same page are left as they are
    return $front . $link . $back if $link =~ m/^\#/s;
    $link =~ s/\n/ /g;

    # Find the file referred to by $link on the file system (more involved
    # than it sounds once character encodings are taken into account).
    my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);

    # href may use '\'s where '/'s should be on Windows
    $href =~ s/\\/\//g;
    my ($filename) = $href =~ m/^(?:.*?):(?:\/\/)?(.*)/;

    # Leave mailto/news/... links alone, so they never go through an
    # intermediate page.  That is the safest choice with frames and targets:
    # e.g. for a mailto link inside a small frame the intermediate page would
    # be displayed inside that frame and could not be seen.  If the server
    # could force the intermediate page into the top-level window, this
    # early return could probably go.
    return $front . $link . $back if $href =~ m/^(mailto|news|gopher|nntp|telnet|javascript):/is;

    my $goes_via_extlink = ($rl == 0)
        || ($filename =~ m/$self->{'process_exp'}/)
        || ($href =~ m/\/$/)
        || ($href =~ m/^(mailto|news|gopher|nntp|telnet|javascript):/i);

    unless ($goes_via_extlink) {
        # link is to some other type of file (eg image) so we'll
        # need to associate that file
        return $front . $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section) . $back;
    }

    &ghtml::urlsafe ($href);
    return $front . "_httpextlink_&rl=" . $rl . "&href=" . $href . $hash_part . $back;
}
1084
# Decide how a linked file is referenced from the document and, when it is
# one of the associated file types, attach it to the document object.
#
#   $href      - link as produced by format_link (scheme included)
#   $rl        - 0 or non-zero, as produced by format_link
#   $hash_part - "#..." fragment to append to external references
#
# Returns the replacement string for the link: an "_httpextlink_..."
# reference for anything not associated, otherwise "_httpdocimg_/<name>".
sub add_file {
    my ($self, $href, $rl, $hash_part, $base_dir, $doc_obj, $section) = @_;

    my $filename = $href;
    if ($base_dir ne "") {
        # remove http://
        $filename =~ s/^[^:]*:\/\///;
    }
    else {
        # remove http:/ thereby leaving one slash at the start
        $filename =~ s/^[^:]*:\///;
    }

    # Replace %XX's in the URL with the decoded value if required.  The
    # undecoded form is kept as well, since the file name itself may contain
    # %XX in some situations.
    my $utf8_filename = &util::filename_cat($base_dir, $filename);
    $filename = $self->opt_url_decode($utf8_filename);

    # If the intended filename was converted to utf8 but the actual file
    # still has its old name, fall back to the name recorded in the map.
    unless (-e $filename) {
        my $original_filename = $self->{'utf8_to_original_filename'}->{$filename};
        $filename = $original_filename
            if (defined $original_filename && -e $original_filename);
    }

    my ($ext) = $filename =~ m/(\.[^\.]*)$/;
    my $is_assoc_type = (defined $ext) && ($ext =~ m/$self->{'assoc_files'}/);

    # Anything that is not an associated file type stays an external link.
    unless ($is_assoc_type) {
        return "_httpextlink_&rl=0&el=prompt&href=" . $href . $hash_part if ($rl == 0);
        return "_httpextlink_&rl=" . $rl . "&href=" . $href . $hash_part;
    }
    return "_httpextlink_&rl=0&el=direct&href=" . $href . $hash_part if ($rl == 0);

    my $newname;
    if ($self->{'rename_assoc_files'}) {
        # Files are renamed to <dir_num>/<file_num><ext>; a link seen before
        # reuses the numbers it was given the first time.
        my $slot = $self->{'aux_files'}->{$href};
        if (!defined $slot) {
            $slot = {'dir_num' => $self->{'dir_num'}, 'file_num' => $self->{'file_num'}};
            $self->{'aux_files'}->{$href} = $slot;
            $self->inc_filecount ();
        }
        $newname = $slot->{'dir_num'} . "/" . $slot->{'file_num'} . $ext;

        $doc_obj->associate_file($filename, $newname, undef, $section);
        return "_httpdocimg_/$newname";
    }

    # Keep the original base name.  When the link was URL-encoded, take the
    # name from the possibly-decoded filename to avoid double URL encoding.
    my $name_source = &unicode::is_url_encoded($utf8_filename) ? $filename : $utf8_filename;
    ($newname) = $name_source =~ m/([^\/\\]*)$/;

    # Make sure this name uses only ASCII characters
    # => use URL encoding, as it preserves original encoding
    $newname = &unicode::url_encode($newname);

    $doc_obj->associate_file($filename, $newname, undef, $section);

    # The associated file's name is itself URL-encoded, so the link to it
    # needs its percent signs encoded once more.
    my $newname_url = $newname;
    $newname_url = &unicode::filename_to_url($newname_url);
    return "_httpdocimg_/$newname_url";
}
1162
1163
# Normalise a link found in the HTML.
#
#   $link     - attribute value with quotes already removed
#   $base_dir - directory the import is rooted at
#   $file     - path of the document containing the link
#
# Returns a three element list ($href, $hash_part, $rl):
#   * badly formatted link:             ($link, "", 0)
#   * http://, ftp:// or file:// link:  (scheme . path, "#..." part, 1 if the
#                                        path exists under $base_dir else 0)
#   * other link not starting with "/" and not mailto/news/gopher/nntp/
#     telnet/javascript:                ("http://" . path, "#..." part, 1)
#   * anything else:                    (text before the "#", "", 0)
sub format_link {
    my $self = shift (@_);
    my ($link, $base_dir, $file) = @_;

    my ($before_hash, $hash_part) = $link =~ m/^([^\#]*)(\#?.*)$/;

    $hash_part = "" if !defined $hash_part;
    if (!defined $before_hash || $before_hash !~ m/[\w\.\/]/) {
        my $outhandle = $self->{'outhandle'};
        print $outhandle "HTMLPlugin: ERROR - badly formatted tag ignored ($link)\n"
            if $self->{'verbosity'};
        return ($link, "", 0);
    }

    # An unset GSDLOS is treated as "not windows" instead of warning.
    my $gsdlos = defined $ENV{'GSDLOS'} ? $ENV{'GSDLOS'} : "";

    if ($before_hash =~ s@^((?:http|ftp|file)://)@@i) {
        my $type = $1;

        if ($link =~ m/^(http|ftp):/i) {
            # Turn url (using /) into file name (possibly using \ on windows)
            my @http_dir_split = split('/', $before_hash);
            $before_hash = &util::filename_cat(@http_dir_split);
        }

        $before_hash = $self->eval_dir_dots($before_hash);

        my $linkfilename = &util::filename_cat ($base_dir, $before_hash);

        my $rl = 0;
        $rl = 1 if (-e $linkfilename);

        # make sure there's a slash on the end if it's a directory
        if ($before_hash !~ m/\/$/) {
            $before_hash .= "/" if (-d $linkfilename);
        }
        return ($type . $before_hash, $hash_part, $rl);

    } elsif ($link !~ m/^(mailto|news|gopher|nntp|telnet|javascript):/i && $link !~ m/^\//) {

        if ($before_hash =~ s@^/@@ || $before_hash =~ m/\\/) {

            # the first directory will be the domain name if file_is_url
            # to generate archives, otherwise we'll assume all files are
            # from the same site and base_dir is the root

            if ($self->{'file_is_url'}) {
                my @dirs = split /[\/\\]/, $file;
                my $domname = shift (@dirs);
                $before_hash = &util::filename_cat($domname, $before_hash);
                $before_hash =~ s@\\@/@g; # for windows
            }
            else
            {
                # see if link shares directory with source document
                # => turn into relative link if this is so!
                #
                # $base_dir is a directory name, not a pattern: it is quoted
                # with \Q..\E so that characters such as "+", "(" or "." in
                # the path are matched literally.  No /o either, since
                # $base_dir can differ from one call to the next.

                if ($gsdlos =~ m/^windows/i) {
                    # too difficult doing a pattern match with embedded '\'s...
                    my $win_before_hash=$before_hash;
                    $win_before_hash =~ s@(\\)+@/@g;
                    # $base_dir is already similarly "converted" on windows.
                    if ($win_before_hash =~ s@^\Q$base_dir\E/@@) {
                        # if this is true, we removed a prefix
                        $before_hash=$win_before_hash;
                    }
                }
                else {
                    # before_hash has lost leading slash by this point,
                    # -> add back in prior to substitution with $base_dir
                    $before_hash = "/$before_hash";

                    $before_hash = &util::filename_cat("",$before_hash);
                    $before_hash =~ s@^\Q$base_dir\E/@@;
                }
            }
        } else {
            # Turn relative file path into full path
            my $dirname = &File::Basename::dirname($file);
            $before_hash = &util::filename_cat($dirname, $before_hash);
            $before_hash = $self->eval_dir_dots($before_hash);
        }

        my $linkfilename = &util::filename_cat ($base_dir, $before_hash);
        # make sure there's a slash on the end if it's a directory
        if ($before_hash !~ m/\/$/) {
            $before_hash .= "/" if (-d $linkfilename);
        }
        return ("http://" . $before_hash, $hash_part, 1);
    } else {
        # mailto, news, nntp, telnet, javascript or gopher link
        # (links starting with "/" also end up here)
        return ($before_hash, "", 0);
    }
}
1256
# Add "First<size>" metadata holding the start of the document's plain text.
#
#   $self->{'first'} - comma separated list of sizes; one metadata value is
#                      added per size, named "First" followed by the size
#   $textref         - reference to the HTML text (not modified)
#
# The text is reduced to plain text once (markup up to and including the
# opening <body> tag removed, scripts and tags dropped, whitespace
# collapsed), then for each size it is cut with unicode::substr and
# everything from the last whitespace onwards is replaced by an ellipsis
# entity.
sub extract_first_NNNN_characters {
    my $self = shift (@_);
    my ($textref, $doc_obj, $thissection) = @_;

    return if (!defined $self->{'first'});

    # The clean-up does not depend on the requested size, so do it once.
    my $plaintext = $$textref;
    # skip to the body; /s is needed so that a head spread over several
    # lines is skipped as well
    $plaintext =~ s/\A.*?<body[^>]*>//is;
    # remove javascript
    $plaintext =~ s@<script.*?</script>@ @sig;
    $plaintext =~ s/<[^>]*>/ /g;
    $plaintext =~ s/&nbsp;/ /g;
    $plaintext =~ s/^\s+//;
    $plaintext =~ s/\s+$//;
    $plaintext =~ s/\s+/ /gs;

    foreach my $size (split /,/, $self->{'first'}) {
        my $tmptext = &unicode::substr ($plaintext, 0, $size);
        $tmptext =~ s/\s\S*$/&#8230;/; # adds an ellipsis (...)
        $doc_obj->add_utf8_metadata ($thissection, "First$size", $tmptext);
    }
}
1277
1278
# Extract metadata from an HTML document and add it to the document object.
#
#   $textref  - reference to the HTML text (not modified)
#   $metadata - hash ref of metadata already known for the document; a meta
#               tag whose name is defined in here is skipped
#   $doc_obj  - receives the values through add_utf8_metadata / add_metadata
#   $section  - section the metadata is attached to
#
# Driven by $self->{'metadata_fields'}, a comma separated list whose entries
# are either "name" or "name<gsname>" (store html meta "name" as "gsname").
# Sources, in order:
#   1. <meta name|http-equiv=... content=...> tags inside <head>
#   2. Title: the <title> element, else roughly the first 100 characters of
#      the body, when "title" is wanted and no meta tag supplied it
#   3. FileFormat, always "HTML"
#   4. entries named tagXX: the text of the first <XX>...</XX> element
#
# Returns nothing useful; returns immediately when no metadata_fields are set.
sub extract_metadata {
    my $self = shift (@_);
    my ($textref, $metadata, $doc_obj, $section) = @_;
    my $outhandle = $self->{'outhandle'};
    # if we don't want metadata, we may as well not be here ...
    return if (!defined $self->{'metadata_fields'});

    # metadata fields to extract/save. 'key' is the (lowercase) name of the
    # html meta, 'value' is the metadata name for greenstone to use
    my %find_fields = ();

    my %creator_fields = (); # short-cut for lookups

    foreach my $field (split /,/, $self->{'metadata_fields'}) {
        $field =~ s/^\s+//; # remove leading whitespace
        $field =~ s/\s+$//; # remove trailing whitespace

        # support tag<tagname>
        if ($field =~ m/^(.*?)<(.*?)>$/) {
            # "$2" is the user's preferred gs metadata name
            $find_fields{lc($1)}=$2; # lc = lowercase
        } else { # no <tagname> for mapping
            # "$field" is the user's preferred gs metadata name
            $find_fields{lc($field)}=$field; # lc = lowercase
        }
    }

    if (defined $self->{'hunt_creator_metadata'} &&
        $self->{'hunt_creator_metadata'} == 1 ) {
        my @extra_fields =
            (
             'author',
             'author.email',
             'creator',
             'dc.creator',
             'dc.creator.corporatename',
            );

        # add the creator_metadata fields to search for
        foreach my $field (@extra_fields) {
            $creator_fields{$field}=0; # add to lookup hash
        }
    }

    # find the header in the html file, which has the meta tags.  Only use
    # the capture when the match succeeded (otherwise $1 would be left over
    # from an earlier match); a <head> tag carrying attributes is accepted.
    my $html_header = '';
    if ($$textref =~ m@<head(?:\s[^>]*)?>(.*?)</head>@si) {
        $html_header = $1;
    }

    # go through every <meta... tag defined in the html and see if it is
    # one of the tags we want to match.

    # special case for title - we want to remember if its been found
    my $found_title = 0;
    # this assumes that ">" won't appear. (I don't think it's allowed to...)
    while ($html_header =~ m/<meta(.*?)>/sig) {
        my $metatag=$1;
        my ($tag, $value);

        # find the tag name: quoted form first, then the unquoted form in
        # case they're not using " or ' (but they should...)
        if ($metatag =~ m/(?:name|http-equiv)\s*=\s*([\"\'])?(.*?)\1/is) {
            $tag=$2;
        }
        elsif ($metatag =~ m/(?:name|http-equiv)\s*=\s*([^\s\>]+)/is) {
            $tag=$1;
        }

        if (!defined $tag || $tag eq '') {
            print $outhandle "HTMLPlugin: can't find NAME in \"$metatag\"\n";
            next;
        }

        # don't need to assign this field if it was passed in from a previous
        # (recursive) plugin
        if (defined $metadata->{$tag}) {next}

        # find the tag content, again quoted form first.  The unquoted
        # fallback looks at the content attribute (not at the name again).
        if ($metatag =~ m/content\s*=\s*([\"\'])?(.*?)\1/is) {
            $value=$2;
        }
        elsif ($metatag =~ m/content\s*=\s*([^\s\>]+)/is) {
            $value=$1;
        }

        if (!defined $value || $value eq '') {
            print $outhandle "HTMLPlugin: can't find VALUE in \"$metatag\"\n";
            next;
        }

        # clean up and add
        $value =~ s/\s+/ /gs;
        chomp($value); # remove trailing \n, if any
        if (exists $creator_fields{lc($tag)}) {
            # map this value onto greenstone's "Creator" metadata
            $tag='Creator';
        } elsif (!exists $find_fields{lc($tag)}) {
            next; # don't want this tag
        } else {
            # get the user's preferred capitalisation
            $tag = $find_fields{lc($tag)};
        }
        if (lc($tag) eq "title") {
            $found_title = 1;
        }
        print $outhandle " extracted \"$tag\" metadata \"$value\"\n"
            if ($self->{'verbosity'} > 2);
        if ($tag =~ m/date.*/i){
            $tag = lc($tag);
        }
        $doc_obj->add_utf8_metadata($section, $tag, $value);

    }

    # TITLE: extract the document title
    if (exists $find_fields{'title'} && !$found_title) {
        # we want a title, and didn't find one in the meta tags
        # see if there's a <title> tag
        my $title;
        my $from = ""; # for debugging output only
        if ($html_header =~ m/<title[^>]*>([^<]+)<\/title[^>]*>/is) {
            $title = $1;
            $from = "<title> tags";
        }

        if (!defined $title) {
            $from = "first 100 chars";
            # if no title use first 100 or so characters
            $title = $$textref;
            $title =~ s/^\xFE\xFF//; # Remove unicode byte order mark
            # skip to the body (the <body> tag may carry attributes)
            $title =~ s/^.*?<body[^>]*>//si;
            # ignore javascript!
            $title =~ s@<script.*?</script>@ @sig;
            $title =~ s/<\/([^>]+)><\1>//g; # (eg) </b><b> - no space
            $title =~ s/<[^>]*>/ /g; # remove all HTML tags
            $title = substr ($title, 0, 100);
            $title =~ s/\s\S*$/.../;
        }
        $title =~ s/<[^>]*>/ /g; # remove html tags
        $title =~ s/&nbsp;/ /g;
        $title =~ s/(?:&nbsp;|\xc2\xa0)/ /g; # utf-8 for nbsp...
        $title =~ s/\s+/ /gs; # collapse multiple spaces
        $title =~ s/^\s*//; # remove leading spaces
        $title =~ s/\s*$//; # remove trailing spaces

        $title =~ s/^$self->{'title_sub'}// if ($self->{'title_sub'});
        $title =~ s/^\s+//s; # in case title_sub introduced any...
        $doc_obj->add_utf8_metadata ($section, 'Title', $title);
        print $outhandle " extracted Title metadata \"$title\" from $from\n"
            if ($self->{'verbosity'} > 2);
    }

    # add FileFormat metadata
    $doc_obj->add_metadata($section,"FileFormat", "HTML");

    # Special, for metadata names such as tagH1 - extracts
    # the text between the first <H1> and </H1> tags into "H1" metadata.

    foreach my $field (keys %find_fields) {
        if ($field !~ m/^tag([a-z0-9]+)$/i) {next}
        my $tag = $1;
        # No /g here: in scalar context it would leave pos() set on the text,
        # so the next field would only be searched for after this match.
        # /i because the element name comes from a lowercased key, /s so an
        # element spanning several lines is found.
        if ($$textref =~ m@<$tag(?:\s[^>]*)?>(.*?)</$tag\s*>@is) {
            my $content = $1;
            $content =~ s/&nbsp;/ /g;
            $content =~ s/<[^>]*>/ /g;
            $content =~ s/^\s+//;
            $content =~ s/\s+$//;
            $content =~ s/\s+/ /gs;
            if ($content) {
                $tag=$find_fields{"tag$tag"}; # get the user's capitalisation
                $tag =~ s/^tag//i;
                $doc_obj->add_utf8_metadata ($section, $tag, $content);
                print $outhandle " extracted \"$tag\" metadata \"$content\"\n"
                    if ($self->{'verbosity'} > 2);
            }
        }
    }
}
1461
1462
# Resolve "." and ".." components of a path:
#   ".."  removes the component collected before it (if any)
#   "."   is dropped
# The path is split on the separator reported by util::get_os_dirsep and
# put back together with util::filename_cat.  An empty result is returned
# as the empty string.
sub eval_dir_dots {
    my ($self, $filename) = @_;

    my $dirsep_os = &util::get_os_dirsep();

    my @resolved = ();
    foreach my $component (split(/$dirsep_os/,$filename)) {
        next if ($component eq ".");
        if ($component eq "..") {
            pop(@resolved);
            next;
        }
        push(@resolved, $component);
    }

    return "" unless (scalar(@resolved) > 0);

    # util::filename_cat suppresses the leading '/' (or '\' on windows) when
    # the first entry of its argument list is the empty string - a feature
    # intended to help with relative paths.  Starting the list with *two*
    # empty strings is the way to defeat this "smart" option and keep the
    # leading separator of an absolute path.
    unshift(@resolved, "") if ($resolved[0] eq "");

    my $evaluated_filename = &util::filename_cat(@resolved);
    return $evaluated_filename;
}
1501
# Rewrite the value of an <img usemap=...> attribute: a leading "./" is
# removed, so that usemap="./#index" becomes usemap="#index".
#
#   $front / $back - the markup before and after the attribute value
#   $link          - the attribute value, possibly still quoted
#
# Returns the reassembled tag text.
sub replace_usemap_links {
    my ($self, $front, $link, $back) = @_;

    # A quoted value loses its own quotes; double quotes are put back onto
    # the surrounding markup instead.
    my $was_quoted = ($link=~/^[\"\']/);
    if ($was_quoted) {
        $link=~s/^[\"\']//;
        $link=~s/[\"\']$//;
        $front.='"';
        $back="\"$back";
    }

    $link =~ s/^\.\///;

    return join('', $front, $link, $back);
}
1517
# Advance the counters used to name renamed associated files: file_num is
# incremented, and once it has reached 1000 it restarts at 0 while dir_num
# moves on to the next directory.
sub inc_filecount {
    my ($self) = @_;

    my $directory_full = ($self->{'file_num'} == 1000);
    $self->{'dir_num'} ++ if $directory_full;

    return $directory_full ? ($self->{'file_num'} = 0) : $self->{'file_num'} ++;
}
1528
1529
# Extend the parent class's read_file: once the file has been read, every
# "&name;" entity in the text is replaced by whatever
# ghtml::getcharequiv(name, 1) returns (intended to turn strings like
# &eacute; into their UTF8 equivalents).
#
# &lt; &gt; &amp; &quot; and &nbsp; are deliberately left as entities, in
# case they interfere with the GML files.
sub read_file {
    my ($self, $filename, $encoding, $language, $textref) = @_;

    $self->SUPER::read_file($filename, $encoding, $language, $textref);

    # Convert entities to their UTF8 equivalents.  The protected entities
    # are renamed to &z...; first so the general substitution skips them,
    # and renamed back afterwards.
    $$textref =~ s/&(lt|gt|amp|quot|nbsp);/&z$1;/go;
    $$textref =~ s/&([^;]+);/&ghtml::getcharequiv($1,1)/gseo;
    $$textref =~ s/&z(lt|gt|amp|quot|nbsp);/&$1;/go;
}
1547
15481;
Note: See TracBrowser for help on using the repository browser.