source: gsdl/trunk/perllib/plugins/HTMLPlugin.pm@ 15872

Last change on this file since 15872 was 15872, checked in by kjdon, 16 years ago

plugin overhaul: plugins renamed to xxPlugin, and in some cases the names are made more sensible. They now use the new base plugins. Hopefully we have better code reuse. Some of the plugins still need work done as I didn't want to spend another month doing this before committing it. Also, I haven't really tested anything yet...

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 47.2 KB
Line 
1###########################################################################
2#
3# HTMLPlugin.pm -- basic html plugin
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 1999 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27#
28# Note that this plugin handles frames only in a very simple way
29# i.e. each frame is treated as a separate document. This means
30# search results will contain links to individual frames rather
31# than linking to the top level frameset.
32# There may also be some problems caused by the _parent target
33# (it's removed by this plugin)
34#
35
36package HTMLPlugin;
37
38use ReadTextFile;
39use HBPlugin;
40use ghtml;
41use unicode;
42use util;
43use XMLParser;
44
45use Image::Size;
46use File::Copy;
47
# Establish the inheritance chain at compile time: HTMLPlugin is-a
# ReadTextFile and is-a HBPlugin (in that method-resolution order).
BEGIN {
    @HTMLPlugin::ISA = qw(ReadTextFile HBPlugin);
}
51
52use strict; # every perl program should have this!
53no strict 'refs'; # make an exception so we can use variables as filehandles
54
# Option descriptors for this plugin. Each entry is a hash with a 'name',
# a 'desc' (a "{Plugin.key}" string), a 'type', and optionally a default
# ('deft') plus other flags. The list is appended to the caller's "ArgList"
# in new().
my $arguments = [
    {   name => "process_exp",
        desc => "{BasePlugin.process_exp}",
        type => "regexp",
        deft => get_default_process_exp(),
    },
    {   name => "block_exp",
        desc => "{BasePlugin.block_exp}",
        type => 'regexp',
        deft => get_default_block_exp(),
    },
    {   name => "nolinks",
        desc => "{HTMLPlugin.nolinks}",
        type => "flag",
    },
    {   name => "keep_head",
        desc => "{HTMLPlugin.keep_head}",
        type => "flag",
    },
    {   name => "no_metadata",
        desc => "{HTMLPlugin.no_metadata}",
        type => "flag",
    },
    {   name => "metadata_fields",
        desc => "{HTMLPlugin.metadata_fields}",
        type => "string",
        deft => "Title",
    },
    {   name => "hunt_creator_metadata",
        desc => "{HTMLPlugin.hunt_creator_metadata}",
        type => "flag",
    },
    {   name => "file_is_url",
        desc => "{HTMLPlugin.file_is_url}",
        type => "flag",
    },
    # assoc_files shares its default pattern with block_exp
    {   name => "assoc_files",
        desc => "{HTMLPlugin.assoc_files}",
        type => "regexp",
        deft => get_default_block_exp(),
    },
    {   name => "rename_assoc_files",
        desc => "{HTMLPlugin.rename_assoc_files}",
        type => "flag",
    },
    {   name => "title_sub",
        desc => "{HTMLPlugin.title_sub}",
        type => "string",
        deft => "",
    },
    {   name => "description_tags",
        desc => "{HTMLPlugin.description_tags}",
        type => "flag",
    },
    # retain this for backward compatibility (w3mir option was replaced by
    # file_is_url); it deliberately carries no 'desc' entry
    {   name      => "w3mir",
        type      => "flag",
        hiddengli => "yes",
    },
    {   name => "no_strip_metadata_html",
        desc => "{HTMLPlugin.no_strip_metadata_html}",
        type => "string",
        deft => "",
        reqd => "no",
    },
    {   name => "sectionalise_using_h_tags",
        desc => "{HTMLPlugin.sectionalise_using_h_tags}",
        type => "flag",
    },
    # NOTE(review): the description key is still "tidy_html" although the
    # option itself is named "use_realistic_book" -- confirm this is intended.
    {   name => "use_realistic_book",
        desc => "{HTMLPlugin.tidy_html}",
        type => "flag",
    },
    {   name => "old_style_HDL",
        desc => "{HTMLPlugin.old_style_HDL}",
        type => "flag",
    },
];

# Plugin-level descriptor; appended to the caller's "OptList" in new().
my $options = {
    name     => "HTMLPlugin",
    desc     => "{HTMLPlugin.desc}",
    abstract => "no",
    inherits => "yes",
    args     => $arguments,
};
124
125
# Read $htmlfile and append its body text to the scalar referenced by $text.
#
# The file is first scanned by HB_gettext for content following a <body>
# tag. If no <body> tag is found, the file is read a second time with the
# "found body" flag pre-set so that all of it is taken.
#
# Parameters:
#   $htmlfile - path of the HTML file to read
#   $text     - reference to the scalar that receives the text
# Returns nothing. If the file cannot be opened an error is printed to
# STDERR and $$text is left untouched.
sub HB_read_html_file {
    my $self = shift (@_);
    my ($htmlfile, $text) = @_;

    # load in the file (3-arg open so that odd characters in the filename
    # are never interpreted as an open mode)
    my $fh;
    if (!open ($fh, '<', $htmlfile)) {
        print STDERR "ERROR - could not open $htmlfile\n";
        return;
    }

    my $foundbody = 0;
    $self->HB_gettext (\$foundbody, $text, $fh);
    close $fh;

    # just in case there was no <body> tag
    if (!$foundbody) {
        $foundbody = 1;
        open ($fh, '<', $htmlfile) || return;
        $self->HB_gettext (\$foundbody, $text, $fh);
        close $fh;
    }
    # text is in utf8
}
149
150# converts the text to utf8, as ghtml does that for &eacute; etc.
# Append the lines readable from $handle to $$text.
#
#   $foundbody - reference to a flag; while it is false, lines are skipped
#                until one contains a <body ...> tag (everything up to and
#                including that tag is dropped and the flag is set)
#   $text      - reference to the accumulating text
#   $handle    - filehandle (or handle name) to read from
#
# Afterwards the whole of $$text is converted to utf-8 when the plugin's
# input_encoding is iso_8859_1, and every run of whitespace is collapsed
# to a single space.
sub HB_gettext {
    my ($self, $foundbody, $text, $handle) = @_;

    while (defined (my $line = <$handle>)) {
        # still hunting for the body tag?
        unless ($$foundbody) {
            next unless $line =~ s/^.*<body[^>]*>//i;
            $$foundbody = 1;
        }

        # check for symbol fonts (this only reports them; the line itself
        # is appended unchanged)
        if ($line =~ /<font [^>]*?face\s*=\s*\"?(\w+)\"?/i) {
            my $font = $1;
            if ($font !~ /^arial$/i) {
                print STDERR "HBPlug::HB_gettext - warning removed font $font\n";
            }
        }

        $$text .= $line;
    }

    # convert to utf-8
    $$text = &unicode::unicode2utf8(&unicode::convert2unicode("iso_8859_1", $text))
        if $self->{'input_encoding'} eq "iso_8859_1";

    # convert any alphanumeric character entities to their utf-8
    # equivalent for indexing purposes
    #&ghtml::convertcharentities ($$text);

    $$text =~ s/\s+/ /g; # remove \n's
}
186
# Tidy up the HTML of one section and return the cleaned string.
#
# Steps, in order:
#   1. strip leading closing tags that have no opening tag before them
#   2. collapse consecutive <p> tags
#   3. strip trailing <u>/<i>/<b>/<p>/&nbsp;/whitespace
#   4. put a blank line before each <p>
#   5. wrap lines at roughly 80 characters
#   6. rewrite <img> tags and "<<I>> file.png" markers as centred images
#
# Parameter: $section - the section HTML (not modified in place)
# Returns:   the cleaned HTML
sub HB_clean_section {
    my $self = shift (@_);
    my ($section) = @_;

    # remove tags without a starting tag from the section
    my ($tag, $tagstart);
    while ($section =~ /<\/([^>]{1,10})>/) {
        $tag = $1;
        $tagstart = index($section, "<$tag");
        last if (($tagstart >= 0) && ($tagstart < index($section, "<\/$tag")));
        # \Q..\E: the tag name is literal text. Without quoting, a name
        # containing regex metacharacters (e.g. "a+b") would never match
        # its own closing tag and this loop would spin forever.
        $section =~ s/<\/\Q$tag\E>//;
    }

    # remove extra paragraph tags
    while ($section =~ s/<p\b[^>]*>\s*<p\b/<p/ig) {}

    # remove extra stuff at the end of the section
    while ($section =~ s/(<u>|<i>|<b>|<p\b[^>]*>|&nbsp;|\s)$//i) {}

    # add a newline at the beginning of each paragraph
    $section =~ s/(.)\s*<p\b/$1\n\n<p/gi;

    # add a newline every 80 characters at a word boundary
    # Note: this regular expression puts a line feed before
    # the last word in each section, even when it is not
    # needed.
    $section =~ s/(.{1,80})\s/$1\n/g;

    # fix up the image links (the replacement starts with a newline and a
    # space, as in the original two-line substitution)
    $section =~ s{<img[^>]*?src=\"?([^\">]+)\"?[^>]*>}
                 {\n <center><img src="$1" /></center><br/>}ig;
    $section =~ s{&lt;&lt;I&gt;&gt;\s*([^\.]+\.(png|jpg|gif))}
                 {\n <center><img src="$1" /></center><br/>}ig;

    return $section;
}
223
224# Will convert the oldHDL format to the new HDL format (using the Section tag)
# Convert an old-style HDL file (sections introduced by "<<TOCn>>" markers)
# into the new format, where sections are described by <Section> /
# <Description> / <Metadata> tags placed inside HTML comments.
#
# Parameters:
#   $file   - path of the old-style input file
#   $cnfile - path of the file to write
# Returns $cnfile. Dies if the output file cannot be opened or closed.
sub convert_to_newHDLformat
{
    my $self = shift (@_);
    my ($file,$cnfile) = @_;
    my $input_filename = $file;
    my $tmp_filename = $cnfile;

    # write HTML tmp file with new HDL format. A lexical handle and 3-arg
    # open are used so that the filename is never parsed for a mode and the
    # handle cannot collide with other users of a global PROD handle.
    open (my $prod, '>', $tmp_filename) || die("Error Writing to File: $tmp_filename $!");

    # read in the file and do basic html cleaning (removing header etc)
    my $html = "";
    $self->HB_read_html_file ($input_filename, \$html);

    # process the file one section at a time
    my $curtoclevel = 1;
    my $firstsection = 1;
    my $toclevel = 0;
    while (length ($html) > 0) {
        # consume everything up to and including the next TOC marker and its
        # title ($3 = level, $4 = title)
        if ($html =~ s/^.*?(?:<p\b[^>]*>)?((<b>|<i>|<u>|\s)*)&lt;&lt;TOC(\d+)&gt;&gt;\s*(.*?)<p\b/<p/i) {
            $toclevel = $3;
            my $title = $4;
            my $sectiontext = "";
            # the section body runs up to the next TOC marker (which is put
            # back for the next iteration) or to the end of the text
            if ($html =~ s/^(.*?)((?:<p\b[^>]*>)?((<b>|<i>|<u>|\s)*)&lt;&lt;TOC\d+&gt;&gt;)/$2/i) {
                $sectiontext = $1;
            } else {
                $sectiontext = $html;
                $html = "";
            }

            # remove tags and extra spaces from the title
            $title =~ s/<\/?[^>]+>//g;
            $title =~ s/^\s+|\s+$//g;

            # close any sections below the current level and
            # create a new section (special case for the firstsection)
            print $prod "<!--\n";
            while (($curtoclevel > $toclevel) ||
                   (!$firstsection && $curtoclevel == $toclevel)) {
                $curtoclevel--;
                print $prod "</Section>\n";
            }
            if ($curtoclevel+1 < $toclevel) {
                print STDERR "WARNING - jump in toc levels in $input_filename " .
                    "from $curtoclevel to $toclevel\n";
            }
            # catch up to the new level (no intermediate sections are opened)
            while ($curtoclevel < $toclevel) {
                $curtoclevel++;
            }

            if ($curtoclevel == 1) {
                # add the header tag
                print $prod "-->\n";
                print $prod "<HTML>\n<HEAD>\n<TITLE>$title</TITLE>\n</HEAD>\n<BODY>\n";
                print $prod "<!--\n";
            }

            print $prod "<Section>\n\t<Description>\n\t\t<Metadata name=\"Title\">$title</Metadata>\n\t</Description>\n";

            print $prod "-->\n";

            # clean up the section html
            $sectiontext = $self->HB_clean_section($sectiontext);

            print $prod "$sectiontext\n";

        } else {
            print STDERR "WARNING - leftover text\n" , $self->shorten($html),
                "\nin $input_filename\n";
            last;
        }
        $firstsection = 0;
    }

    # close whatever sections are still open
    print $prod "<!--\n";
    while ($curtoclevel > 0) {
        $curtoclevel--;
        print $prod "</Section>\n";
    }
    print $prod "-->\n";

    close ($prod) || die("Error Closing File: $tmp_filename $!");

    return $tmp_filename;
}
310
# Return $text wrapped in double quotes for use in a message. Text of 100
# characters or more is abbreviated to its first and last 50 characters,
# each quoted and joined by " ... ".
sub shorten {
    my ($self, $text) = @_;

    my $len = length($text);
    if ($len >= 100) {
        my $head = substr ($text, 0, 50);
        my $tail = substr ($text, $len - 50);
        return '"' . $head . '" ... "' . $tail . '"';
    }

    return '"' . $text . '"';
}
320
# Produce a working copy of $file under <GSDLCOLLECTDIR>/tidytmp and return
# its path.
#
# * Directories are returned unchanged.
# * For .htm/.html/.shtml files: optionally convert from old-style HDL
#   (when $self->{'old_style_HDL'} is set), copy the sibling files of the
#   input into the tmp directory, and optionally tidy the result (when
#   $self->{'tidy_html'} is set).
# * Any other file is simply copied into the tmp directory if not already
#   there.
#
# NOTE(review): the option list in this file declares 'use_realistic_book'
# but no 'tidy_html' option -- confirm where $self->{'tidy_html'} is set.
#
# Dies if the base directory cannot be read or a copy fails.
sub convert_tidy_or_oldHDL_file
{
    my $self = shift (@_);
    my ($file) = @_;
    my $input_filename = $file;

    if (-d $input_filename)
    {
        return $input_filename;
    }

    # get the input filename
    my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
    my $base_dirname = $dirname;
    $suffix = lc($suffix);

    # derive tmp filename from input filename
    # Remove any white space from filename -- no risk of name collision, and
    # makes later conversion by utils simpler. Leave spaces in path...
    # tidy up the filename with space, dot, hyphen between
    $tailname =~ s/\s+//g;
    $tailname =~ s/\.+//g;
    $tailname =~ s/\-+//g;
    # convert to utf-8 otherwise we have problems with the doc.xml file
    # later on
    &unicode::ensure_utf8(\$tailname);

    # softlink to collection tmp dir
    my $tmp_dirname = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "tidytmp");
    &util::mk_dir($tmp_dirname) if (!-e $tmp_dirname);

    my $test_dirname = "";
    # presumably already escaped for use inside a regex -- it is
    # interpolated unquoted below, as in the original; verify against util
    my $f_separator = &util::get_os_dirsep();

    # mirror the directory structure found below "import" inside tidytmp
    if ($dirname =~ /import$f_separator/)
    {
        $test_dirname = $';

        while ($test_dirname =~ /[$f_separator]/)
        {
            # grab both match variables straight away, before any other
            # code gets a chance to run another regex
            my ($folderdirname, $remainder) = ($`, $');
            $tmp_dirname = &util::filename_cat($tmp_dirname,$folderdirname);
            &util::mk_dir($tmp_dirname) if (!-e $tmp_dirname);
            $test_dirname = $remainder;
        }
    }

    my $tmp_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix");

    # tidy or convert the input file if it is a HTML-like file or it is accepted by the process_exp
    if (($suffix eq ".htm") || ($suffix eq ".html") || ($suffix eq ".shtml"))
    {
        #convert the input file to a new style HDL
        my $hdl_output_filename = $input_filename;
        if ($self->{'old_style_HDL'})
        {
            $hdl_output_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix");
            $hdl_output_filename = $self->convert_to_newHDLformat($input_filename,$hdl_output_filename);
        }

        # copy every other file from the base dir into the tmp dir unless it
        # already exists there
        opendir(my $dh, $base_dirname) or die "Can't open base directory : $base_dirname!";
        my @files = grep {!/^\.+$/} readdir($dh);
        # a directory handle must be released with closedir, not close
        closedir($dh);

        foreach my $file (@files)
        {
            my $src_file = &util::filename_cat($base_dirname,$file);
            my $dest_file = &util::filename_cat($tmp_dirname,$file);
            if ((!-e $dest_file) && (!-d $src_file))
            {
                # just copy the original file back to the tmp directory
                copy($src_file,$dest_file) or die "Can't copy file $src_file to $dest_file $!";
            }
        }

        # tidy the input file
        my $tidy_output_filename = $hdl_output_filename;
        if ($self->{'tidy_html'})
        {
            $tidy_output_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix");
            $tidy_output_filename = $self->tmp_tidy_file($hdl_output_filename,$tidy_output_filename);
        }
        $tmp_filename = $tidy_output_filename;
    }
    else
    {
        if (!-e $tmp_filename)
        {
            # just copy the original file back to the tmp directory
            copy($input_filename,$tmp_filename) or die "Can't copy file $input_filename to $tmp_filename $!";
        }
    }

    return $tmp_filename;
}
419
420
421# Will make the html input file as a proper XML file with removed font tag and
422# image size added to the img tag.
423# The tidying process takes place in a collection specific 'tmp' directory so
424# that we don't accidentally damage the input.
# Write a cleaned copy of HTML file $file to $cnfile and run the external
# "tidy" program over it (-asxml).
#
# Cleaning: <font> start/end tags are dropped and <img> tags are rebuilt
# with width/height attributes taken from the image file (resolved relative
# to the directory of $file).
#
# Parameters:
#   $file   - input HTML file
#   $cnfile - output file; may be the same path as $file
# Returns $cnfile. Dies if the output file cannot be written.
sub tmp_tidy_file
{
    my $self = shift (@_);
    my ($file,$cnfile) = @_;
    my $input_filename = $file;
    my $tmp_filename = $cnfile;

    # get the input filename
    my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");

    require HTML::TokeParser::Simple;

    # create HTML parser to decode the input file
    my $parser = HTML::TokeParser::Simple->new($input_filename);

    # Build the cleaned HTML in memory first. The output may be the very
    # file the parser is reading, so it must not be truncated until every
    # token has been consumed.
    my $cleaned = "";
    while (my $token = $parser->get_token())
    {
        # is it an img tag
        if ($token->is_start_tag('img'))
        {
            # get the attributes
            my $attr = $token->return_attr;

            if (defined $attr->{src})
            {
                # get the full path to the image
                my $img_file = &util::filename_cat($dirname,$attr->{src});

                # set the width and height attribute, but only when the
                # size could actually be determined (otherwise keep
                # whatever the tag already had)
                my ($width, $height) = imgsize($img_file);
                if (defined $width && defined $height)
                {
                    ($attr->{width}, $attr->{height}) = ($width, $height);
                }
            }

            # recreate the tag
            $cleaned .= "<img";
            $cleaned .= join('', map { qq( $_="$attr->{$_}") } keys %$attr);
            $cleaned .= ">";
        }
        # font tags are removed by emitting nothing for them
        elsif (!($token->is_start_tag('font') || $token->is_end_tag('font')))
        {
            # print without changes
            $cleaned .= $token->as_is;
        }
    }

    # write HTML tmp file without the font tag and image size are added to the img tag
    open (my $prod, '>', $tmp_filename) || die("Error Writing to File: $tmp_filename $!");
    print $prod $cleaned;
    close ($prod) || die("Error Closing File: $tmp_filename $!");

    # run html-tidy on the tmp file to make it a proper XML file
    # (filename quoted so that paths containing spaces survive the shell)
    my $tidyfile = `tidy -utf8 -wrap 0 -asxml "$tmp_filename"`;

    if (defined $tidyfile && length($tidyfile))
    {
        # write result back to the tmp file
        open ($prod, '>', $tmp_filename) || die("Error Writing to File: $tmp_filename $!");
        print $prod $tidyfile;
        close ($prod) || die("Error Closing File: $tmp_filename $!");
    }
    else
    {
        # tidy is missing or produced nothing: keep the cleaned, untidied
        # file rather than overwriting it with an empty one
        print STDERR "HTMLPlugin: WARNING: tidy produced no output for $tmp_filename\n";
    }

    # return the output filename
    return $tmp_filename;
}
489
# Wrapper around the parent class's read_into_doc_obj.
#
# After the read_block check, and only when $self->{'tidy_html'} or
# $self->{'old_style_HDL'} is set, the file is first converted with
# convert_tidy_or_oldHDL_file and the converted copy (with its own
# directory as base_dir) is what gets handed to the parent. In that case
# description_tags is forced on.
#
# NOTE(review): the option list in this file declares 'use_realistic_book'
# but no 'tidy_html' option -- confirm where $self->{'tidy_html'} is set.
#
# Parameters are those of the parent method and are passed through.
# Returns $block_status alone when read_block yields undef or 0, otherwise
# the ($process_status, $doc_obj) pair from the parent.
sub read_into_doc_obj
{
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;

    # check the process_exp and block_exp thing
    my ($block_status) = $self->read_block(@_);
    return $block_status if ((!defined $block_status) || ($block_status==0));

    if (($self->{'tidy_html'}) || ($self->{'old_style_HDL'}))
    {
        # because the document has to be sectionalized set the description tags
        $self->{'description_tags'} = 1;

        # set the file to be tidied
        my $input_filename = $file;
        $input_filename = &util::filename_cat($base_dir,$file) if $base_dir =~ /\w/;

        # get the tidied file
        my $tidy_filename = $self->convert_tidy_or_oldHDL_file($input_filename);

        # derive tmp filename from input filename
        my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($tidy_filename, "\\.[^\\.]+\$");

        # set the new input file and base_dir to be from the tidied file
        $file = "$tailname$suffix";
        $base_dir = $dirname;
    }

    # call the parent read_into_doc_obj
    my ($process_status,$doc_obj) = $self->SUPER::read_into_doc_obj($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli);

    return ($process_status,$doc_obj);
}
529
# Constructor. Registers this class and its argument/option descriptors
# with the lists supplied by the caller, builds the object through
# ReadTextFile's constructor and re-blesses it into $class.
sub new {
    my ($class, $pluginlist, $inputargs, $hashArgOptLists) = @_;

    push(@$pluginlist, $class);
    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    my $self = ReadTextFile->new($pluginlist, $inputargs, $hashArgOptLists);

    # the old w3mir flag is kept only as an alias for file_is_url
    $self->{'file_is_url'} = 1 if $self->{'w3mir'};

    # per-document bookkeeping (reset again at the start of process())
    $self->{'aux_files'} = {};
    $self->{'dir_num'}   = 0;
    $self->{'file_num'}  = 0;

    return bless $self, $class;
}
550
551# may want to use (?i)\.(gif|jpe?g|jpe|png|css|js(?:@.*)?)$
552# if have eg <script language="javascript" src="img/lib.js@123">
# Default block_exp: a case-insensitive pattern matching filenames that end
# in an image or stylesheet extension. Also used as the default for the
# assoc_files option.
sub get_default_block_exp {
    my ($self) = @_;

    return '(?i)\.(gif|jpe?g|jpe|jpg|png|css)$';
}
558
# Default process_exp: a case-insensitive pattern for the file names this
# plugin handles by default.
sub get_default_process_exp {
    my ($self) = @_;

    # the last option is an attempt to encode the concept of an html query ...
    return '(?i)(\.html?|\.shtml|\.shm|\.asp|\.php\d?|\.cgi|.+\?.+=.*)$';
}
565
# Scan HTML file $filename for files it references (img src / usemap,
# link href, embed src, and table/tr/td background attributes) and record
# each resolved path as a key in $self->{'file_blocks'}.
sub store_block_files
{
    my ($self, $filename) = @_;

    my ($language, $encoding) = $self->textcat_get_language_encoding ($filename);

    # read in file ($text will be in utf8)
    my $text = "";
    $self->read_file ($filename, $encoding, $language, \$text);

    # drop comments (plain or entity-encoded) before looking for references
    my $opencom = '(?:<!--|&lt;!(?:&mdash;|&#151;|--))';
    my $closecom = '(?:-->|(?:&mdash;|&#151;|--)&gt;)';
    $text =~ s/$opencom(.*?)$closecom//gs;

    # an attribute value: double-quoted, or a run of non-space/non-'>' chars
    my $attval = "\\\"[^\\\"]+\\\"|[^\\s>]+";

    my @links;
    push @links, ($text =~ m/<img[^>]*?src\s*=\s*($attval)[^>]*>/igs);
    push @links, ($text =~ m/<img[^>]*?usemap\s*=\s*($attval)[^>]*>/igs);
    push @links, ($text =~ m/<link[^>]*?href\s*=\s*($attval)[^>]*>/igs);
    push @links, ($text =~ m/<embed[^>]*?src\s*=\s*($attval)[^>]*>/igs);
    push @links, ($text =~ m/<(?:table|tr|td)[^>]*?background\s*=\s*($attval)[^>]*>/igs);

    foreach my $link (@links) {

        # remove quotes from link at start and end if necessary
        if ($link =~ s/^\"//) {
            $link =~ s/\"$//;
        }

        $link =~ s/\#.*$//s; # remove any anchor names, e.g. foo.html#name becomes foo.html

        unless ($link =~ m@^/@ || $link =~ m/^([A-Z]:?)\\/) {
            # Turn relative file path into full path
            my $dirname = &File::Basename::dirname($filename);
            $link = &util::filename_cat($dirname, $link);
        }
        $link = $self->eval_dir_dots($link);

        $self->{'file_blocks'}->{$link} = 1;
    }
}
610
611
612# do plugin specific processing of doc_obj
# Turn the HTML in $$textref into the sections, text and metadata of
# $doc_obj.
#
# Parameters:
#   $textref  - reference to the document text (modified in place)
#   $pluginfo - not used in this routine
#   $base_dir - base directory of the file
#   $file     - file name, relative to $base_dir
#   $metadata - metadata passed on to extract_metadata
#   $doc_obj  - document object being built
#   $gli      - when true, a <Processing> line is written to STDERR
# Returns 1.
#
# Three modes, chosen by plugin options:
#   * sectionalise_using_h_tags: <hN> headings are first rewritten into
#     commented <Section> descriptions, then handled as description tags
#   * description_tags: <Section>/<Metadata> tags inside HTML comments
#     define the section structure
#   * otherwise the whole file becomes one section
sub process {
    my $self = shift (@_);
    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
    my $outhandle = $self->{'outhandle'};

    print STDERR "<Processing n='$file' p='HTMLPlugin'>\n" if ($gli);

    print $outhandle "HTMLPlugin: processing $file\n"
        if $self->{'verbosity'} > 1;

    if ($ENV{'GSDLOS'} =~ /^windows/i) {
        # this makes life so much easier... perl can cope with unix-style '/'s.
        $base_dir =~ s@(\\)+@/@g;
        $file =~ s@(\\)+@/@g;
    }

    # reset per-doc stuff...
    $self->{'aux_files'} = {};
    $self->{'dir_num'} = 0;
    $self->{'file_num'} = 0;

    # process an HTML file where sections are divided by headings tags (H1, H2 ...)
    # you can also include metadata in the format (X can be any number)
    # <hX>Title<!--gsdl-metadata
    #	<Metadata name="name1">value1</Metadata>
    #	...
    #	<Metadata name="nameN">valueN</Metadata>
    #--></hX>
    if ($self->{'sectionalise_using_h_tags'}) {
        # description_tags should always be activated because we convert headings to description tags
        $self->{'description_tags'} = 1;

        my $arrSections = [];
        $$textref =~ s/<h([0-9]+)[^>]*>(.*?)<\/h[0-9]+>/$self->process_heading($1, $2, $arrSections, $file)/isge;

        if (scalar(@$arrSections)) {
            # -1 is below every heading level, so this closes all open sections
            my $strMetadata = $self->update_section_data($arrSections, -1);
            if (length($strMetadata)) {
                $strMetadata = '<!--' . $strMetadata . "\n-->\n</body>";
                $$textref =~ s/<\/body>/$strMetadata/ig;
            }
        }
    }

    my $cursection = $doc_obj->get_top_section();

    $self->extract_metadata ($textref, $metadata, $doc_obj, $cursection)
        unless $self->{'no_metadata'} || $self->{'description_tags'};

    # Store URL for page as metadata - this can be used for an
    # altavista style search interface. The URL won't be valid
    # unless the file structure contains the domain name (i.e.
    # like when w3mir is used to download a website).

    # URL metadata (even invalid ones) are used to support internal
    # links, so even if 'file_is_url' is off, still need to store info

    my $utf8_file = $self->filename_to_utf8_metadata($file);
    my $web_url = "http://$utf8_file";
    $doc_obj->add_utf8_metadata($cursection, "URL", $web_url);

    if ($self->{'file_is_url'}) {
        $doc_obj->add_metadata($cursection, "weblink", "<a href=\"$web_url\">");
        $doc_obj->add_metadata($cursection, "webicon", "_iconworld_");
        $doc_obj->add_metadata($cursection, "/weblink", "</a>");
    }

    if ($self->{'description_tags'}) {
        # remove the html header - note that doing this here means any
        # sections defined within the header will be lost (so all <Section>
        # tags must appear within the body of the HTML)
        my ($head_keep) = ($$textref =~ m/^(.*?)<body[^>]*>/is);
        # no <body> tag at all: nothing was captured, so use an empty head
        # rather than concatenating undef further down
        $head_keep = '' unless defined $head_keep;

        $$textref =~ s/^.*?<body[^>]*>//is;
        $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;

        my $opencom = '(?:<!--|&lt;!(?:&mdash;|&#151;|--))';
        my $closecom = '(?:-->|(?:&mdash;|&#151;|--)&gt;)';

        my $lt = '(?:<|&lt;)';
        my $gt = '(?:>|&gt;)';
        my $quot = '(?:"|&quot;|&rdquo;|&ldquo;)';

        # comma-separated list of metadata names -> regex alternation
        my $dont_strip = '';
        if ($self->{'no_strip_metadata_html'}) {
            ($dont_strip = $self->{'no_strip_metadata_html'}) =~ s{,}{|}g;
        }

        my $found_something = 0; my $top = 1;
        # peel off "text, comment" pairs from the front of the document
        while ($$textref =~ s/^(.*?)$opencom(.*?)$closecom//s) {
            my $text = $1;
            my $comment = $2;
            if (defined $text) {
                # text before a comment - note that getting to here
                # doesn't necessarily mean there are Section tags in
                # the document
                $self->process_section(\$text, $base_dir, $file, $doc_obj, $cursection);
            }
            # consume the tags inside the comment one at a time
            while ($comment =~ s/$lt(.*?)$gt//s) {
                my $tag = $1;
                if ($tag eq "Section") {
                    $found_something = 1;
                    # the first <Section> maps onto the existing top section
                    $cursection = $doc_obj->insert_section($doc_obj->get_end_child($cursection)) unless $top;
                    $top = 0;
                } elsif ($tag eq "/Section") {
                    $found_something = 1;
                    $cursection = $doc_obj->get_parent_section ($cursection);
                } elsif ($tag =~ /^Metadata name=$quot(.*?)$quot/s) {
                    my $metaname = $1;
                    my $accumulate = $tag =~ /mode=${quot}accumulate${quot}/ ? 1 : 0;
                    # Only trust $1 if the closing tag was actually found;
                    # after a failed substitution $1 still holds an earlier
                    # capture (typically the metadata name).
                    unless ($comment =~ s/^(.*?)$lt\/Metadata$gt//s) {
                        print $outhandle "HTMLPlugin: WARNING: $file contains an unterminated Metadata tag ($metaname)\n";
                        next;
                    }
                    my $metavalue = $1;
                    $metavalue =~ s/^\s+//;
                    $metavalue =~ s/\s+$//;
                    # assume that no metadata value intentionally includes
                    # carriage returns or HTML tags (if they're there they
                    # were probably introduced when converting to HTML from
                    # some other format).
                    # actually some people want to have html tags in their
                    # metadata.
                    $metavalue =~ s/[\cJ\cM]/ /sg;
                    $metavalue =~ s/<[^>]+>//sg
                        unless $dont_strip && ($dont_strip eq 'all' || $metaname =~ /^($dont_strip)$/);
                    $metavalue =~ s/\s+/ /sg;
                    if ($accumulate) {
                        $doc_obj->add_utf8_metadata($cursection, $metaname, $metavalue);
                    } else {
                        $doc_obj->set_utf8_metadata_element($cursection, $metaname, $metavalue);
                    }
                } elsif ($tag eq "Description" || $tag eq "/Description") {
                    # do nothing with containing Description tags
                } else {
                    # simple HTML tag (probably created by the conversion
                    # to HTML from some other format) - we'll ignore it and
                    # hope for the best ;-)
                }
            }
        }
        if ($cursection ne "") {
            print $outhandle "HTMLPlugin: WARNING: $file contains unmatched <Section></Section> tags\n";
        }

        $$textref =~ s/^.*?<body[^>]*>//is;
        $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
        if ($$textref =~ /\S/) {
            if (!$found_something) {
                if ($self->{'verbosity'} > 2) {
                    print $outhandle "HTMLPlugin: WARNING: $file appears to contain no Section tags so\n";
                    print $outhandle " will be processed as a single section document\n";
                }

                # go ahead and process single-section document
                $self->process_section($textref, $base_dir, $file, $doc_obj, $cursection);

                # if document contains no Section tags we'll go ahead
                # and extract metadata (this won't have been done
                # above as the -description_tags option prevents it)
                my $complete_text = $head_keep.$doc_obj->get_text($cursection);
                $self->extract_metadata (\$complete_text, $metadata, $doc_obj, $cursection)
                    unless $self->{'no_metadata'};

            } else {
                print $outhandle "HTMLPlugin: WARNING: $file contains the following text outside\n";
                print $outhandle " of the final closing </Section> tag. This text will\n";
                print $outhandle " be ignored.";

                my ($text);
                if (length($$textref) > 30) {
                    $text = substr($$textref, 0, 30) . "...";
                } else {
                    $text = $$textref;
                }
                $text =~ s/\n/ /isg;
                print $outhandle " ($text)\n";
            }
        } elsif (!$found_something) {

            if ($self->{'verbosity'} > 2) {
                # may get to here if document contained no valid Section
                # tags but did contain some comments. The text will have
                # been processed already but we should print the warning
                # as above and extract metadata
                print $outhandle "HTMLPlugin: WARNING: $file appears to contain no Section tags and\n";
                print $outhandle " is blank or empty. Metadata will be assigned if present.\n";
            }

            my $complete_text = $head_keep.$doc_obj->get_text($cursection);
            $self->extract_metadata (\$complete_text, $metadata, $doc_obj, $cursection)
                unless $self->{'no_metadata'};
        }

    } else {

        # remove header and footer
        if (!$self->{'keep_head'} || $self->{'description_tags'}) {
            $$textref =~ s/^.*?<body[^>]*>//is;
            $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
        }

        # single section document
        $self->process_section($textref, $base_dir, $file, $doc_obj, $cursection);
    }
    return 1;
}
817
818
# Build the replacement for one <hN>...</hN> heading: an HTML comment that
# closes any sections ended by this heading (via update_section_data) and
# opens a new <Section> whose Title metadata is the heading text.
#
#   $nHeadNo        - heading level (the N of <hN>)
#   $strHeadingText - text between the heading tags; may carry
#                     <!--gsdl-metadata ... --> blocks, which are moved
#                     into the section's <Description>
#   $arrSections    - array ref holding the stack of open heading levels
#   $file           - not used here
# Returns the comment string. The <Section> it opens is left unclosed.
sub process_heading
{
    my ($self, $nHeadNo, $strHeadingText, $arrSections, $file) = @_;
    $strHeadingText = '' unless defined($strHeadingText);

    my $strMetadata = $self->update_section_data($arrSections, int($nHeadNo));

    # pull every embedded metadata block out of the heading text
    my @arrMetaChunks;
    while ($strHeadingText =~ s/<!--gsdl-metadata(.*?)-->//is)
    {
        push @arrMetaChunks, $1;
    }
    my $strSecMetadata = join('', @arrMetaChunks);

    # trim both strings
    foreach my $str ($strHeadingText, $strSecMetadata) {
        $str =~ s/^\s+//g;
        $str =~ s/\s+$//g;
    }

    my @arrParts = (
        $strMetadata,
        "\n<Section>\n\t<Description>\n\t\t<Metadata name=\"Title\">",
        $strHeadingText,
        "</Metadata>\n",
    );
    push @arrParts, "\t\t", $strSecMetadata, "\n" if length($strSecMetadata);
    push @arrParts, "\t</Description>\n";

    return "<!--" . join('', @arrParts) . "-->";
}
847
848
# Maintain the stack of open heading levels in @$arrSections.
#
# For a new heading of level $nCurTocNo, emit one "\n</Section>" for every
# stacked level that is greater than or equal to it (popping each), then
# push the new level. When the stack is empty, or the new level is deeper
# than the last one, nothing is closed.
#
# Returns the accumulated closing tags (possibly the empty string).
sub update_section_data
{
    my ($self, $arrSections, $nCurTocNo) = @_;
    my $strBuffer = '';

    if (@$arrSections && $nCurTocNo <= $arrSections->[-1]) {
        # walk the stack from the top down; the index list is fixed before
        # any element is popped
        foreach my $i (reverse 0 .. $#$arrSections) {
            next if $nCurTocNo > $arrSections->[$i];
            $strBuffer .= "\n</Section>";
            pop @$arrSections;
        }
    }

    push @$arrSections, $nCurTocNo;
    return $strBuffer;
}
872
873
874# note that process_section may be called multiple times for a single
875# section (relying on the fact that add_utf8_text appends the text to any
876# that may exist already).
# Rewrite the links and image references in $$textref (in place) and append
# the result to section $cursection of $doc_obj.
sub process_section {
    my ($self, $textref, $base_dir, $file, $doc_obj, $cursection) = @_;

    # trap links (skipped entirely when the nolinks option is set)
    unless ($self->{'nolinks'}) {
        # usemap="./#index" not handled correctly => change to "#index"
        $$textref =~ s/(<img[^>]*?usemap\s*=\s*[\"\']?)([^\"\'>\s]+)([\"\']?[^>]*>)/
            $self->replace_usemap_links($1, $2, $3)/isge;

        $$textref =~ s/(<(?:a|area|frame|link|script)\s+[^>]*?\s*(?:href|src)\s*=\s*[\"\']?)([^\"\'>\s]+)([\"\']?[^>]*>)/
            $self->replace_href_links ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
    }

    # trap images
    #
    # Historical note: HTMLPlugin used to wrap every <img> in an anchor
    # pointing at the image itself (<a href="image"><img src="image"></a>)
    # to stop the text following an image from turning into a link. The
    # no_image_links option existed to switch that wrapping off for pages
    # whose images are meant to link elsewhere. The underlying problem no
    # longer occurs, so the wrapping (here and in replace_images) has been
    # disabled and not wrapping is now the default.
    #
    # Should wrapping ever be needed again it would be better offered as an
    # explicit option, and no_image_links should eventually be deprecated.

    #if(!$self->{'no_image_links'}){
    $$textref =~ s/(<(?:img|embed|table|tr|td)[^>]*?(?:src|background)\s*=\s*)([\"][^\"]+[\"]|[\'][^\']+[\']|[^\s\/>]+)([^>]*>)/
        $self->replace_images ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
    #}

    # add text to document object
    # turn \ into \\ so that the rest of greenstone doesn't think there
    # is an escape code following. (Macro parsing loses them...)
    $$textref =~ s/\\/\\\\/g;

    $doc_obj->add_utf8_text($cursection, $$textref);
}
923
# Substitution callback used by process_section for src/background
# attributes.
#
#   $front, $back - the tag text before and after the attribute value
#   $link         - the attribute value, possibly still quoted
#   remaining arguments are handed on to format_link / add_file
#
# Returns the tag rebuilt around the value returned by add_file. A value
# that arrived in single or double quotes is re-emitted in double quotes.
sub replace_images {
    my ($self, $front, $link, $back, $base_dir,
        $file, $doc_obj, $section) = @_;

    # remove quotes from link at start and end if necessary
    if ($link =~ s/^[\"\']//) {
        $link =~ s/[\"\']$//;
        $front .= '"';
        $back = "\"$back";
    }

    $link =~ s/\n/ /g;

    # Hack to overcome Windows wv 0.7.1 bug that causes embedded images to be broken
    # If the Word file path has spaces in it, wv messes up and you end up with
    #   absolute paths for the images, and without the "file://" prefix
    # So check for this special case and massage the data to be correct
    my $is_windows = ($ENV{'GSDLOS'} =~ /^windows/i);
    if ($is_windows && $self->{'plugin_type'} eq "WordPlug" && $link =~ /^[A-Za-z]\:\\/) {
        # keep only the final path component
        $link =~ s/^.*\\([^\\]+)$/$1/;
    }

    my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);

    my $img_file = $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section);

    # The image tag is returned as-is. It used to be wrapped in an anchor
    # linking to the image; see the comments in process_section for why
    # that is no longer done.
    return $front . $img_file . $back;
}
965
# Rewrite the target of one href-style link.
#
#   $front, $back - the tag text before and after the link value
#   $link         - the link value itself
#   $base_dir, $file, $doc_obj, $section - passed through to format_link
#                   and add_file
#
# Returns the rebuilt tag text:
#   * in-page "#..." links and mailto/news/gopher/nntp/telnet/javascript
#     links keep their original value;
#   * links with rl 0, links whose file part matches process_exp and links
#     ending in "/" become "_httpextlink_&rl=..&href=.." links;
#   * anything else (e.g. an image) is handed to add_file.
sub replace_href_links {
    my ($self, $front, $link, $back, $base_dir, $file, $doc_obj, $section) = @_;

    # Frames are not handled well in this plugin, so targets are tamed:
    # _top is renamed to _gsdltop_ and _parent targets are removed.
    # ($piece aliases $front and $back, so both are edited in place.)
    for my $piece ($front, $back) {
        $piece =~ s/(target=\"?)_top(\"?)/$1_gsdltop_$2/is;
        $piece =~ s/target=\"?_parent\"?//is;
    }

    # links within the current page are left exactly as they are
    if ($link =~ /^\#/s) {
        return $front . $link . $back;
    }
    $link =~ s/\n/ /g;

    my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
    # href may use '\'s where '/'s should be on Windows
    $href =~ s/\\/\//g;

    # everything after the scheme (and optional "//") of the href
    my ($filename) = $href =~ /^(?:.*?):(?:\/\/)?(.*)/;

    # Leave all these links alone (they won't be picked up by intermediate
    # pages); that seems safest when dealing with frames, targets etc.
    if ($href =~ /^(mailto|news|gopher|nntp|telnet|javascript):/is) {
        return $front . $link . $back;
    }

    # The original condition repeated the scheme test above as a fourth
    # alternative; it could never be true at this point, so it is omitted.
    my $via_extlink = ($rl == 0)
        || ($filename =~ /$self->{'process_exp'}/)
        || ($href =~ /\/$/);

    unless ($via_extlink) {
        # link is to some other type of file (eg image) so we'll
        # need to associate that file
        return $front . $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section) . $back;
    }

    &ghtml::urlsafe ($href);
    return $front . "_httpextlink_&rl=" . $rl . "&href=" . $href . $hash_part . $back;
}
1011
# Decide how a linked file is referenced from the document, associating it
# with the document when appropriate.
#
#   $href      - link as returned by format_link (scheme included)
#   $rl        - 0 for an external link, non-zero otherwise
#   $hash_part - "#..." fragment to append to external links
#   $base_dir  - directory the file name is resolved against
#   $doc_obj   - document object; its associate_file method is called
#   $section   - section the file is associated with
#
# Returns either an "_httpextlink_..." string (file is not associated) or
# "_httpdocimg_/<name>" after the file has been associated.
sub add_file {
    my ($self, $href, $rl, $hash_part, $base_dir, $doc_obj, $section) = @_;

    # Turn the href into a file name below $base_dir.
    my $filename = $href;
    if ($base_dir ne "") {
        # remove http://
        $filename =~ s/^[^:]*:\/\///;
    }
    else {
        # remove http:/ thereby leaving one slash at the start
        $filename =~ s/^[^:]*:\///;
    }
    $filename = &util::filename_cat($base_dir, $filename);

    # Replace %20's in URL with a space if required. Note that the filename
    # may include the %20 in some situations, hence the existence test.
    if ($filename =~ /\%20/ && !-e $filename) {
        $filename =~ s/\%20/ /g;
    }

    # extension = last "." and whatever follows it
    my ($ext) = $filename =~ /(\.[^\.]*)$/;
    my $is_assoc = (defined $ext) && ($ext =~ /$self->{'assoc_files'}/);

    if ($rl == 0) {
        my $prefix = $is_assoc
            ? "_httpextlink_&rl=0&el=direct&href="
            : "_httpextlink_&rl=0&el=prompt&href=";
        return $prefix . $href . $hash_part;
    }

    if (!$is_assoc) {
        return "_httpextlink_&rl=" . $rl . "&href=" . $href . $hash_part;
    }

    # Work out the name the associated file gets.
    my $newname;
    if (!$self->{'rename_assoc_files'}) {
        # keep the file's own name (last path component)
        ($newname) = $filename =~ /([^\/\\]*)$/;
    }
    elsif (defined $self->{'aux_files'}->{$href}) {
        # this href was numbered before: reuse the same dir/file numbers
        my $known = $self->{'aux_files'}->{$href};
        $newname = $known->{'dir_num'} . "/" . $known->{'file_num'} . $ext;
    }
    else {
        # first time: take the current counters, remember them, advance
        $newname = $self->{'dir_num'} . "/" . $self->{'file_num'} . $ext;
        $self->{'aux_files'}->{$href} = {'dir_num' => $self->{'dir_num'}, 'file_num' => $self->{'file_num'}};
        $self->inc_filecount ();
    }

    $doc_obj->associate_file($filename, $newname, undef, $section);
    return "_httpdocimg_/$newname";
}
1068
1069
# Normalise a link found in the document.
#
#   $link     - the raw link text (may carry a "#fragment")
#   $base_dir - directory that local files are resolved against
#   $file     - name of the document the link was found in
#
# Returns a three element list ($href, $hash_part, $rl):
#   * badly formatted links:           ($link, "", 0)
#   * http://, ftp://, file:// links:  (scheme . path, fragment, $rl) where
#     $rl is 1 only if the path exists below $base_dir
#   * mailto/news/gopher/nntp/telnet/javascript links and links starting
#     with "/":                        (text before the fragment, "", 0)
#   * every other link:                ("http://" . path, fragment, 1)
sub format_link {
    my $self = shift (@_);
    my ($link, $base_dir, $file) = @_;

    # split the link into the part before the first '#' and the rest
    my ($before_hash, $hash_part) = $link =~ /^([^\#]*)(\#?.*)$/;

    $hash_part = "" if !defined $hash_part;
    if (!defined $before_hash || $before_hash !~ /[\w\.\/]/) {
        my $outhandle = $self->{'outhandle'};
        print $outhandle "HTMLPlugin: ERROR - badly formatted tag ignored ($link)\n"
            if $self->{'verbosity'};
        return ($link, "", 0);
    }

    # GSDLOS may be unset; treat that as "not windows" without a warning
    my $on_windows = (defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} =~ /^windows/i);

    if ($before_hash =~ s@^((?:http|ftp|file)://)@@i) {
        my $type = $1;

        if ($link =~ /^(http|ftp):/i) {
            # Turn url (using /) into file name (possibly using \ on windows)
            my @http_dir_split = split('/', $before_hash);
            $before_hash = &util::filename_cat(@http_dir_split);
        }

        $before_hash = $self->eval_dir_dots($before_hash);

        my $linkfilename = &util::filename_cat ($base_dir, $before_hash);

        my $rl = 0;
        $rl = 1 if (-e $linkfilename);

        # make sure there's a slash on the end if it's a directory
        if ($before_hash !~ /\/$/) {
            $before_hash .= "/" if (-d $linkfilename);
        }

        return ($type . $before_hash, $hash_part, $rl);

    } elsif ($link !~ /^(mailto|news|gopher|nntp|telnet|javascript):/i && $link !~ /^\//) {

        # NOTE(review): the guard above rejects links starting with '/', so
        # the leading-slash alternative below cannot match here; only links
        # containing a '\' take this branch.
        if ($before_hash =~ s@^/@@ || $before_hash =~ /\\/) {

            # the first directory will be the domain name if file_is_url
            # to generate archives, otherwise we'll assume all files are
            # from the same site and base_dir is the root

            if ($self->{'file_is_url'}) {
                my @dirs = split /[\/\\]/, $file;
                my $domname = shift (@dirs);
                $before_hash = &util::filename_cat($domname, $before_hash);
                $before_hash =~ s@\\@/@g; # for windows
            }
            else
            {
                # see if link shares directory with source document
                # => turn into relative link if this is so!
                #
                # $base_dir is matched literally (\Q..\E): a directory name
                # containing regex metacharacters ("c++", "(old)", ...) must
                # not be interpreted as a pattern. The old /o flag is gone
                # too, as it kept using the first $base_dir ever seen.

                if ($on_windows) {
                    # too difficult doing a pattern match with embedded '\'s...
                    my $win_before_hash = $before_hash;
                    $win_before_hash =~ s@(\\)+@/@g;
                    # $base_dir is already similarly "converted" on windows.
                    if ($win_before_hash =~ s{^\Q$base_dir\E/}{}) {
                        # if this is true, we removed a prefix
                        $before_hash = $win_before_hash;
                    }
                }
                else {
                    # before_hash has lost leading slash by this point,
                    # -> add back in prior to substitution with $base_dir
                    $before_hash = "/$before_hash";

                    $before_hash = &util::filename_cat("",$before_hash);
                    $before_hash =~ s{^\Q$base_dir\E/}{};
                }
            }
        } else {
            # Turn relative file path into full path
            my $dirname = &File::Basename::dirname($file);
            $before_hash = &util::filename_cat($dirname, $before_hash);
            $before_hash = $self->eval_dir_dots($before_hash);
        }

        my $linkfilename = &util::filename_cat ($base_dir, $before_hash);
        # make sure there's a slash on the end if it's a directory
        if ($before_hash !~ /\/$/) {
            $before_hash .= "/" if (-d $linkfilename);
        }
        return ("http://" . $before_hash, $hash_part, 1);
    } else {
        # mailto, news, nntp, telnet, javascript or gopher link
        # (or a link that starts with '/')
        return ($before_hash, "", 0);
    }
}
1162
# Add "First<N>" metadata holding the first N characters of the document's
# visible text, for every N in the comma separated list $self->{'first'}.
#
#   $textref     - reference to the HTML text (not modified)
#   $doc_obj     - document object; add_utf8_metadata is called on it
#   $thissection - section the metadata is attached to
#
# Fixes over the previous version:
#   * "skip to the body" now works when <head> spans several lines (the
#     pattern lacked /s, so head text leaked into the metadata);
#   * the ellipsis replaces the trailing partial word only when the text
#     really was cut short; short texts used to lose their last word.
sub extract_first_NNNN_characters {
    my $self = shift (@_);
    my ($textref, $doc_obj, $thissection) = @_;

    # The cleaned-up text does not depend on the size, so build it once.
    my $plaintext = $$textref;
    # skip to the body
    $plaintext =~ s/^.*?<body[^>]*>//is;
    # remove javascript
    $plaintext =~ s@<script.*?</script>@ @sig;
    $plaintext =~ s/<[^>]*>/ /g;
    $plaintext =~ s/&nbsp;/ /g;
    $plaintext =~ s/^\s+//;
    $plaintext =~ s/\s+$//;
    $plaintext =~ s/\s+/ /gs;

    foreach my $size (split /,/, $self->{'first'}) {
        # work on a copy in case unicode::substr touches its argument
        my $tmptext = $plaintext;
        $tmptext = &unicode::substr ($tmptext, 0, $size);
        if ($tmptext ne $plaintext) {
            # text was shortened: swap the cut-off word for an ellipsis (...)
            $tmptext =~ s/\s\S*$/&#8230;/;
        }
        $doc_obj->add_utf8_metadata ($thissection, "First$size", $tmptext);
    }
}
1183
1184
# Extract metadata from the HTML text and add it to the document object.
#
#   $textref  - reference to the HTML text
#   $metadata - hash ref of metadata passed in from a previous (recursive)
#               plugin; meta tags whose name is already defined in it are
#               skipped
#   $doc_obj  - document object (add_utf8_metadata / add_metadata are called)
#   $section  - section the metadata is attached to
#
# Does nothing unless $self->{'metadata_fields'} is defined. That option is
# a comma separated list of names; "name<gsname>" maps the html name onto
# a different greenstone name, and "tagXX" extracts the text of the first
# <XX>..</XX> element into "XX" metadata.
#
# Fixes over the previous version:
#   * the <head> match is checked (no stale $1) and <head> may carry
#     attributes;
#   * the fallback for unquoted content="..." values really looks at the
#     content attribute (it used to re-read the name attribute, so the
#     value came out as the tag name);
#   * missing name/content is detected from the match result rather than
#     from stale capture variables; the "can't find" messages are printed
#     at verbosity > 2 only, as they effectively never fired before;
#   * meta tags with an empty content value are skipped;
#   * the tagXX search is case-insensitive, crosses newlines, requires the
#     whole tag name, and no longer uses /g (which made each search resume
#     where the previous one stopped);
#   * the title fallback strips <body ...> with attributes and only adds
#     "..." when the text was actually cut.
sub extract_metadata {
    my $self = shift (@_);
    my ($textref, $metadata, $doc_obj, $section) = @_;
    my $outhandle = $self->{'outhandle'};
    # if we don't want metadata, we may as well not be here ...
    return if (!defined $self->{'metadata_fields'});

    # metadata fields to extract/save. 'key' is the (lowercase) name of the
    # html meta, 'value' is the metadata name for greenstone to use
    my %find_fields = ();

    my %creator_fields = (); # short-cut for lookups

    foreach my $field (split /,/, $self->{'metadata_fields'}) {
        $field =~ s/^\s+//; # remove leading whitespace
        $field =~ s/\s+$//; # remove trailing whitespace

        # support tag<tagname>
        if ($field =~ /^(.*?)<(.*?)>$/) {
            # "$2" is the user's preferred gs metadata name
            $find_fields{lc($1)} = $2; # lc = lowercase
        } else { # no <tagname> for mapping
            # "$field" is the user's preferred gs metadata name
            $find_fields{lc($field)} = $field; # lc = lowercase
        }
    }

    if (defined $self->{'hunt_creator_metadata'} &&
        $self->{'hunt_creator_metadata'} == 1 ) {
        my @extra_fields =
            (
             'author',
             'author.email',
             'creator',
             'dc.creator',
             'dc.creator.corporatename',
            );

        # add the creator_metadata fields to search for
        foreach my $field (@extra_fields) {
            $creator_fields{$field} = 0; # add to lookup hash
        }
    }

    # find the header in the html file, which has the meta tags
    my ($html_header) = $$textref =~ m@<head(?:\s[^>]*)?>(.*?)</head>@si;
    $html_header = "" if !defined $html_header;

    # go through every <meta... tag defined in the html and see if it is
    # one of the tags we want to match.

    # special case for title - we want to remember if its been found
    my $found_title = 0;

    # this assumes that ">" won't appear. (I don't think it's allowed to...)
    while ($html_header =~ m/<meta(.*?)>/sig) {
        my $metatag = $1;
        my ($tag, $value);

        # find the tag name
        if ($metatag =~ /(?:name|http-equiv)\s*=\s*([\"\'])(.*?)\1/is) {
            $tag = $2;
        }
        # in case they're not using " or ', but they should...
        if (!$tag) {
            undef $tag;
            if ($metatag =~ /(?:name|http-equiv)\s*=\s*([^\s\>]+)/is) {
                $tag = $1;
            }
        }

        if (!defined $tag) {
            print $outhandle "HTMLPlugin: can't find NAME in \"$metatag\"\n"
                if ($self->{'verbosity'} > 2);
            next;
        }

        # don't need to assign this field if it was passed in from a previous
        # (recursive) plugin
        if (defined $metadata->{$tag}) {next}

        # find the tag content
        if ($metatag =~ /content\s*=\s*([\"\'])(.*?)\1/is) {
            $value = $2;
        }
        elsif ($metatag =~ /content\s*=\s*([^\s\>]+)/is) {
            # unquoted value
            $value = $1;
        }
        if (!defined $value) {
            print $outhandle "HTMLPlugin: can't find VALUE in \"$metatag\"\n"
                if ($self->{'verbosity'} > 2);
            next;
        }

        # clean up and add
        $value =~ s/\s+/ /gs;
        next if ($value eq ""); # nothing worth storing

        if (exists $creator_fields{lc($tag)}) {
            # map this value onto greenstone's "Creator" metadata
            $tag = 'Creator';
        } elsif (!exists $find_fields{lc($tag)}) {
            next; # don't want this tag
        } else {
            # get the user's preferred capitalisation
            $tag = $find_fields{lc($tag)};
        }
        if (lc($tag) eq "title") {
            $found_title = 1;
        }
        print $outhandle " extracted \"$tag\" metadata \"$value\"\n"
            if ($self->{'verbosity'} > 2);
        if ($tag =~ /date.*/i){
            $tag = lc($tag);
        }
        $doc_obj->add_utf8_metadata($section, $tag, $value);
    }

    # TITLE: extract the document title
    if (exists $find_fields{'title'} && !$found_title) {
        # we want a title, and didn't find one in the meta tags
        # see if there's a <title> tag
        my $title;
        my $from = ""; # for debugging output only
        if ($html_header =~ /<title[^>]*>([^<]+)<\/title[^>]*>/is) {
            $title = $1;
            $from = "<title> tags";
        }

        if (!defined $title) {
            $from = "first 100 chars";
            # if no title use first 100 or so characters
            $title = $$textref;
            $title =~ s/^\xFE\xFF//; # Remove unicode byte order mark
            $title =~ s/^.*?<body[^>]*>//si;
            # ignore javascript!
            $title =~ s@<script.*?</script>@ @sig;
            $title =~ s/<\/([^>]+)><\1>//g; # (eg) </b><b> - no space
            $title =~ s/<[^>]*>/ /g; # remove all HTML tags
            if (length($title) > 100) {
                # cut, then swap the cut-off word for "..."
                $title = substr ($title, 0, 100);
                $title =~ s/\s\S*$/.../;
            }
        }
        $title =~ s/<[^>]*>/ /g; # remove html tags
        $title =~ s/(?:&nbsp;|\xc2\xa0)/ /g; # entity, or utf-8 for nbsp...
        $title =~ s/\s+/ /gs; # collapse multiple spaces
        $title =~ s/^\s*//; # remove leading spaces
        $title =~ s/\s*$//; # remove trailing spaces

        # title_sub is a user supplied pattern, used as a regex on purpose
        $title =~ s/^$self->{'title_sub'}// if ($self->{'title_sub'});
        $title =~ s/^\s+//s; # in case title_sub introduced any...
        $doc_obj->add_utf8_metadata ($section, 'Title', $title);
        print $outhandle " extracted Title metadata \"$title\" from $from\n"
            if ($self->{'verbosity'} > 2);
    }

    # add FileFormat metadata
    $doc_obj->add_metadata($section,"FileFormat", "HTML");

    # Special, for metadata names such as tagH1 - extracts
    # the text between the first <H1> and </H1> tags into "H1" metadata.

    foreach my $field (keys %find_fields) {
        if ($field !~ /^tag([a-z0-9]+)$/i) {next}
        my $tag = $1;
        # \b stops e.g. "b" from matching <body>; no /g, so every field is
        # searched for from the start of the text
        if ($$textref =~ m@<$tag\b[^>]*>(.*?)</$tag[^>]*>@si) {
            my $content = $1;
            $content =~ s/&nbsp;/ /g;
            $content =~ s/<[^>]*>/ /g;
            $content =~ s/^\s+//;
            $content =~ s/\s+$//;
            $content =~ s/\s+/ /gs;
            if ($content) {
                my $gs_name = $find_fields{"tag$tag"}; # get the user's capitalisation
                $gs_name =~ s/^tag//i;
                $doc_obj->add_utf8_metadata ($section, $gs_name, $content);
                print $outhandle " extracted \"$gs_name\" metadata \"$content\"\n"
                    if ($self->{'verbosity'} > 2);
            }
        }
    }
}
1367
1368
# Resolve "." and ".." components of a file name:
#   ".."  removes the component collected before it (if there is one)
#   "."   is dropped
# The name is split on whatever util::get_os_dirsep returns (used as a
# pattern) and put back together with util::filename_cat.
sub eval_dir_dots {
    my ($self, $filename) = @_;

    my $sep_pattern = &util::get_os_dirsep();

    my @resolved = ();
    for my $component (split(/$sep_pattern/, $filename)) {
        next if ($component eq ".");      # "here": nothing to do
        if ($component eq "..") {
            pop(@resolved);               # up one directory
            next;
        }
        push(@resolved, $component);
    }

    # Need to fiddle with number of elements in @resolved if the first
    # one is the empty string. This is because of a modification to
    # util::filename_cat that supresses the addition of a leading '/'
    # character (or \ if windows) (intended to help filename cat with
    # relative paths) if the first entry in the array is the empty
    # string. Making the array start with *two* empty strings is a way
    # to defeat this "smart" option.
    unshift(@resolved, "") if (@resolved && $resolved[0] eq "");

    return &util::filename_cat(@resolved);
}
1405
# Rebuild a usemap attribute, dropping a leading "./" from its value.
#   $front, $back - tag text before and after the value
#   $link         - the usemap value
sub replace_usemap_links {
    my ($self, $front, $link, $back) = @_;

    $link =~ s/^\.\///;

    return join('', $front, $link, $back);
}
1413
# Advance the counters used to name renamed associated files: file_num
# counts up, and once it has reached 1000 the next call moves on to the
# next dir_num and restarts file_num at 0.
sub inc_filecount {
    my ($self) = @_;

    unless ($self->{'file_num'} == 1000) {
        return $self->{'file_num'} ++;
    }

    $self->{'dir_num'} ++;
    return $self->{'file_num'} = 0;
}
1424
1425
# Extend the inherited read_file so that entities such as &eacute; in the
# text that was read are passed through ghtml::getcharequiv.
#
# &lt; &gt; &amp; &quot; and &nbsp; are left as entities (in case they
# interfere with the GML files): they are renamed to "&z...;" before the
# conversion pass and renamed back afterwards.
#
#   $filename, $encoding, $language - handed on to the parent read_file
#   $textref                        - reference to the text, edited in place
sub read_file {
    my ($self, $filename, $encoding, $language, $textref) = @_;

    $self->SUPER::read_file($filename, $encoding, $language, $textref);

    # 1. hide the entities that must survive
    $$textref =~ s/&(lt|gt|amp|quot|nbsp);/&z$1;/go;

    # 2. convert every remaining "&...;" sequence
    $$textref =~ s/&([^;]+);/&ghtml::getcharequiv($1,1)/gseo;

    # 3. bring the hidden entities back
    $$textref =~ s/&z(lt|gt|amp|quot|nbsp);/&$1;/go;
}
1443
14441;
Note: See TracBrowser for help on using the repository browser.