source: trunk/gsdl/perllib/plugins/HTMLPlug.pm@ 9056

Last change on this file since 9056 was 9056, checked in by kjdon, 19 years ago

added an option to not strip html tags from metadata in description tags. contributed by jens wille

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 29.6 KB
Line 
1###########################################################################
2#
3# HTMLPlug.pm -- basic html plugin
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 1999 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27#
28# Note that this plugin handles frames only in a very simple way
29# i.e. each frame is treated as a separate document. This means
30# search results will contain links to individual frames rather
31# than linking to the top level frameset.
32# There may also be some problems caused by the _parent target
33# (it's removed by this plugin)
34#
35
36package HTMLPlug;
37
38use BasPlug;
39use ghtml;
40use unicode;
41use util;
42use parsargv;
43use XMLParser;
44
45sub BEGIN {
46 @HTMLPlug::ISA = ('BasPlug');
47}
48
49use strict; # every perl program should have this!
50no strict 'refs'; # make an exception so we can use variables as filehandles
51
52my $arguments =
53 [ { 'name' => "process_exp",
54 'desc' => "{BasPlug.process_exp}",
55 'type' => "regexp",
56 'deft' => &get_default_process_exp() },
57 { 'name' => "block_exp",
58 'desc' => "{BasPlug.block_exp}",
59 'type' => 'regexp',
60 'deft' => &get_default_block_exp() },
61 { 'name' => "nolinks",
62 'desc' => "{HTMLPlug.nolinks}",
63 'type' => "flag" },
64 { 'name' => "keep_head",
65 'desc' => "{HTMLPlug.keep_head}",
66 'type' => "flag" },
67 { 'name' => "no_metadata",
68 'desc' => "{HTMLPlug.no_metadata}",
69 'type' => "flag" },
70 { 'name' => "metadata_fields",
71 'desc' => "{HTMLPlug.metadata_fields}",
72 'type' => "string",
73 'deft' => "Title" },
74 { 'name' => "hunt_creator_metadata",
75 'desc' => "{HTMLPlug.hunt_creator_metadata}",
76 'type' => "flag" },
77 { 'name' => "file_is_url",
78 'desc' => "{HTMLPlug.file_is_url}",
79 'type' => "flag" },
80 { 'name' => "assoc_files",
81 'desc' => "{HTMLPlug.assoc_files}",
82 'type' => "regexp",
83 'deft' => &get_default_block_exp() },
84 { 'name' => "rename_assoc_files",
85 'desc' => "{HTMLPlug.rename_assoc_files}",
86 'type' => "flag" },
87 { 'name' => "title_sub",
88 'desc' => "{HTMLPlug.title_sub}",
89 'type' => "string",
90 'deft' => "" },
91 { 'name' => "description_tags",
92 'desc' => "{HTMLPlug.description_tags}",
93 'type' => "flag" } ,
94 { 'name' => "no_strip_metadata_html",
95 'desc' => "{HTMLPlug.no_strip_metadata_html}",
96 'type' => "string",
97 'deft' => "",
98 'reqd' => "no"}
99 ];
100
101my $options = { 'name' => "HTMLPlug",
102 'desc' => "{HTMLPlug.desc}",
103 'abstract' => "no",
104 'inherits' => "yes",
105 'args' => $arguments };
106
107sub new {
108 my $class = shift (@_);
109 my $self = new BasPlug ($class, @_);
110 $self->{'plugin_type'} = "HTMLPlug";
111 # 14-05-02 To allow for proper inheritance of arguments - John Thompson
112 my $option_list = $self->{'option_list'};
113 push( @{$option_list}, $options );
114
115 if (!parsargv::parse(\@_,
116 q^nolinks^, \$self->{'nolinks'},
117 q^keep_head^, \$self->{'keep_head'},
118 q^no_metadata^, \$self->{'no_metadata'},
119 q^metadata_fields/.*/Title^, \$self->{'metadata_fields'},
120 q^hunt_creator_metadata^, \$self->{'hunt_creator_metadata'},
121 q^w3mir^, \$self->{'w3mir'},
122 q^file_is_url^, \$self->{'file_is_url'},
123 q^assoc_files/.*/(?i)\.(jpe?g|jpe|gif|png|css)$^, \$self->{'assoc_files'},
124 q^rename_assoc_files^, \$self->{'rename_assoc_files'},
125 q^title_sub/.*/^, \$self->{'title_sub'},
126 q^description_tags^, \$self->{'description_tags'},
127 q^no_strip_metadata_html/.*/^, \$self->{'no_strip_metadata_html'},
128 "allow_extra_options")) {
129
130 print STDERR "\nIncorrect options passed to HTMLPlug, check your collect.cfg configuration file\n";
131 $self->print_txt_usage(""); # Use default resource bundle
132 die "\n";
133 }
134
135 # retain this for backward compatibility (w3mir option was replaced by
136 # file_is_url)
137 if ($self->{'w3mir'}) {
138 $self->{'file_is_url'} = 1;
139 }
140
141 $self->{'aux_files'} = {};
142 $self->{'dir_num'} = 0;
143 $self->{'file_num'} = 0;
144
145 return bless $self, $class;
146}
147
148# may want to use (?i)\.(gif|jpe?g|jpe|png|css|js(?:@.*)?)$
149# if have eg <script language="javascript" src="img/lib.js@123">
150sub get_default_block_exp {
151 my $self = shift (@_);
152
153 return q^(?i)\.(gif|jpe?g|jpe|jpg|png|css)$^;
154}
155
156sub get_default_process_exp {
157 my $self = shift (@_);
158
159 # the last option is an attempt to encode the concept of an html query ...
160 return q^(?i)(\.html?|\.shtml|\.shm|\.asp|\.php\d?|\.cgi|.+\?.+=.*)$^;
161}
162
163
164sub metadata_read {
165 my $self = shift (@_);
166 my ($pluginfo, $base_dir, $file, $metadata, $extrametakeys, $extrametadata, $processor, $maxdocs, $gli) = @_;
167
168 my $outhandle = $self->{'outhandle'};
169
170 my $filename = $file;
171 $filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;
172
173 my ($dir) = $filename =~ /^(.*?)[^\/\\]*$/;
174
175 if ($filename !~ /$self->{'process_exp'}/ || !-f $filename) {
176 return undef; # can't recognise
177 }
178
179 if ($self->{'smart_block'}) {
180 # Do encoding stuff
181 my ($language, $encoding) = $self->textcat_get_language_encoding ($filename);
182
183 # read in file ($text will be in utf8)
184 my $text = "";
185 $self->read_file ($filename, $encoding, $language, \$text);
186 $self->store_block_files (\$text, $filename);
187 }
188 return 1;
189}
190
191sub store_block_files
192{
193 my $self =shift (@_);
194 my ($textref, $filename) = @_;
195
196 my $html_fname = $filename;
197 my @file_blocks;
198
199 my $opencom = '(?:<!--|&lt;!(?:&mdash;|&#151;|--))';
200 my $closecom = '(?:-->|(?:&mdash;|&#151;|--)&gt;)';
201 $$textref =~ s/$opencom(.*?)$closecom//gs;
202
203 my $attval = "\\\"[^\\\"]+\\\"|[^\\s>]+";
204 my @img_matches = ($$textref =~ m/<img[^>]*?src\s*=\s*($attval)[^>]*>/igs);
205 my @usemap_matches = ($$textref =~ m/<img[^>]*?usemap\s*=\s*($attval)[^>]*>/igs);
206 my @link_matches = ($$textref =~ m/<link[^>]*?href\s*=\s*($attval)[^>]*>/igs);
207
208
209 foreach my $link (@img_matches, @usemap_matches, @link_matches) {
210
211 # remove quotes from link at start and end if necessary
212 if ($link=~/^\"/) {
213 $link=~s/^\"//;
214 $link=~s/\"$//;
215 }
216
217 $link =~ s/\#.*$//s; # remove any anchor names, e.g. foo.html#name becomes foo.html
218
219 if ($link !~ s@^/@@ && $link !~ /^([A-Z]:?)\\/) {
220 # Turn relative file path into full path
221 my $dirname = &File::Basename::dirname($filename);
222 $link = &util::filename_cat($dirname, $link);
223 }
224 $link = $self->eval_dir_dots($link);
225 $self->{'file_blocks'}->{$link} = 1;
226 }
227}
228
229# do plugin specific processing of doc_obj
230sub process {
231 my $self = shift (@_);
232 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
233 my $outhandle = $self->{'outhandle'};
234
235 print STDERR "<Processing n='$file' p='HTMLPlug'>\n" if ($gli);
236
237 print $outhandle "HTMLPlug: processing $file\n"
238 if $self->{'verbosity'} > 1;
239
240 if ($ENV{'GSDLOS'} =~ /^windows/i) {
241 # this makes life so much easier... perl can cope with unix-style '/'s.
242 $base_dir =~ s@(\\)+@/@g;
243 $file =~ s@(\\)+@/@g;
244 }
245
246 # reset per-doc stuff...
247 $self->{'aux_files'} = {};
248 $self->{'dir_num'} = 0;
249 $self->{'file_num'} = 0;
250
251 my $cursection = $doc_obj->get_top_section();
252
253 $self->extract_metadata ($textref, $metadata, $doc_obj, $cursection)
254 unless $self->{'no_metadata'} || $self->{'description_tags'};
255
256 # Store URL for page as metadata - this can be used for an
257 # altavista style search interface. The URL won't be valid
258 # unless the file structure contains the domain name (i.e.
259 # like when w3mir is used to download a website).
260 my $web_url = "http://$file";
261 $doc_obj->add_metadata($cursection, "URL", $web_url);
262
263 if ($self->{'description_tags'}) {
264 # remove the html header - note that doing this here means any
265 # sections defined within the header will be lost (so all <Section>
266 # tags must appear within the body of the HTML)
267 my ($head_keep) = ($$textref =~ m/^(.*?)<body[^>]*>/is);
268
269 $$textref =~ s/^.*?<body[^>]*>//is;
270 $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
271
272 my $opencom = '(?:<!--|&lt;!(?:&mdash;|&#151;|--))';
273 my $closecom = '(?:-->|(?:&mdash;|&#151;|--)&gt;)';
274
275 my $lt = '(?:<|&lt;)';
276 my $gt = '(?:>|&gt;)';
277 my $quot = '(?:"|&quot;|&rdquo;|&ldquo;)';
278
279 my $found_something = 0; my $top = 1;
280 while ($$textref =~ s/^(.*?)$opencom(.*?)$closecom//s) {
281 my $text = $1;
282 my $comment = $2;
283 if (defined $text) {
284 # text before a comment - note that getting to here
285 # doesn't necessarily mean there are Section tags in
286 # the document
287 $self->process_section(\$text, $base_dir, $file, $doc_obj, $cursection);
288 }
289 while ($comment =~ s/$lt(.*?)$gt//s) {
290 my $tag = $1;
291 if ($tag eq "Section") {
292 $found_something = 1;
293 $cursection = $doc_obj->insert_section($doc_obj->get_end_child($cursection)) unless $top;
294 $top = 0;
295 } elsif ($tag eq "/Section") {
296 $found_something = 1;
297 $cursection = $doc_obj->get_parent_section ($cursection);
298 } elsif ($tag =~ /^Metadata name=$quot(.*?)$quot/s) {
299 my $metaname = $1;
300 my $accumulate = $tag =~ /mode=${quot}accumulate${quot}/ ? 1 : 0;
301 $comment =~ s/^(.*?)$lt\/Metadata$gt//s;
302 my $metavalue = $1;
303 $metavalue =~ s/^\s+//;
304 $metavalue =~ s/\s+$//;
305 # assume that no metadata value intentionally includes
306 # carriage returns or HTML tags (if they're there they
307 # were probably introduced when converting to HTML from
308 # some other format).
309 # actually some people want to have html tags in their
310 # metadata.
311 $metavalue =~ s/[\cJ\cM]/ /sg;
312 my $dont = '';
313 if ($self->{'no_strip_metadata_html'}) {
314 ($dont = $self->{'no_strip_metadata_html'}) =~ s{,}{|}g;
315 }
316 $metavalue =~ s/<[^>]+>//sg
317 unless $dont && ($dont eq 'all' || $metaname =~ /^($dont)$/);
318 if ($dont && ($dont eq 'all' || $metaname =~ /^($dont)$/)) {
319 }
320 $metavalue =~ s/\s+/ /sg;
321 if ($accumulate) {
322 $doc_obj->add_utf8_metadata($cursection, $metaname, $metavalue);
323 } else {
324 $doc_obj->set_utf8_metadata_element($cursection, $metaname, $metavalue);
325 }
326 } elsif ($tag eq "Description" || $tag eq "/Description") {
327 # do nothing with containing Description tags
328 } else {
329 # simple HTML tag (probably created by the conversion
330 # to HTML from some other format) - we'll ignore it and
331 # hope for the best ;-)
332 }
333 }
334 }
335 if ($cursection ne "") {
336 print $outhandle "HTMLPlug: WARNING: $file contains unmatched <Section></Section> tags\n";
337 }
338
339 $$textref =~ s/^.*?<body[^>]*>//is;
340 $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
341 if ($$textref =~ /\S/) {
342 if (!$found_something) {
343 if ($self->{'verbosity'} > 2) {
344 print $outhandle "HTMLPlug: WARNING: $file appears to contain no Section tags so\n";
345 print $outhandle " will be processed as a single section document\n";
346 }
347
348 # go ahead and process single-section document
349 $self->process_section($textref, $base_dir, $file, $doc_obj, $cursection);
350
351 # if document contains no Section tags we'll go ahead
352 # and extract metadata (this won't have been done
353 # above as the -description_tags option prevents it)
354 my $complete_text = $head_keep.$doc_obj->get_text($cursection);
355 $self->extract_metadata (\$complete_text, $metadata, $doc_obj, $cursection)
356 unless $self->{'no_metadata'};
357
358 } else {
359 print $outhandle "HTMLPlug: WARNING: $file contains the following text outside\n";
360 print $outhandle " of the final closing </Section> tag. This text will\n";
361 print $outhandle " be ignored.";
362
363 my ($text);
364 if (length($$textref) > 30) {
365 $text = substr($$textref, 0, 30) . "...";
366 } else {
367 $text = $$textref;
368 }
369 $text =~ s/\n/ /isg;
370 print $outhandle " ($text)\n";
371 }
372 } elsif (!$found_something) {
373
374 if ($self->{'verbosity'} > 2) {
375 # may get to here if document contained no valid Section
376 # tags but did contain some comments. The text will have
377 # been processed already but we should print the warning
378 # as above and extract metadata
379 print $outhandle "HTMLPlug: WARNING: $file appears to contain no Section tags and\n";
380 print $outhandle " is blank or empty. Metadata will be assigned if present.\n";
381 }
382
383 my $complete_text = $head_keep.$doc_obj->get_text($cursection);
384 $self->extract_metadata (\$complete_text, $metadata, $doc_obj, $cursection)
385 unless $self->{'no_metadata'};
386 }
387
388 } else {
389
390 # remove header and footer
391 if (!$self->{'keep_head'} || $self->{'description_tags'}) {
392 $$textref =~ s/^.*?<body[^>]*>//is;
393 $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
394 }
395
396 # single section document
397 $self->process_section($textref, $base_dir, $file, $doc_obj, $cursection);
398 }
399 return 1;
400}
401
402# note that process_section may be called multiple times for a single
403# section (relying on the fact that add_utf8_text appends the text to any
404# that may exist already).
405sub process_section {
406 my $self = shift (@_);
407 my ($textref, $base_dir, $file, $doc_obj, $cursection) = @_;
408 # trap links
409 if (!$self->{'nolinks'}) {
410
411 # usemap="./#index" not handled correctly => change to "#index"
412 $$textref =~ s/(<img[^>]*?usemap\s*=\s*\"?)([^\">\s]+)(\"?[^>]*>)/
413 $self->replace_usemap_links($1, $2, $3)/isge;
414
415 $$textref =~ s/(<(?:a|area|frame|link|script)\s+[^>]*?\s*(?:href|src)\s*=\s*\"?)([^\">\s]+)(\"?[^>]*>)/
416 $self->replace_href_links ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
417 }
418
419 # trap images
420
421 # allow spaces if inside quotes - jrm21
422 $$textref =~ s/(<img[^>]*?src\s*=\s*)(\"[^\"]+\"|[^\s>]+)([^>]*>)/
423 $self->replace_images ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
424
425 # add text to document object
426 # turn \ into \\ so that the rest of greenstone doesn't think there
427 # is an escape code following. (Macro parsing loses them...)
428 $$textref =~ s/\\/\\\\/go;
429 $doc_obj->add_utf8_text($cursection, $$textref);
430}
431
432sub replace_images {
433 my $self = shift (@_);
434 my ($front, $link, $back, $base_dir,
435 $file, $doc_obj, $section) = @_;
436
437 # remove quotes from link at start and end if necessary
438 if ($link=~/^\"/) {
439 $link=~s/^\"//;$link=~s/\"$//;
440 $front.='"';
441 $back="\"$back";
442 }
443
444 $link =~ s/\n/ /g;
445
446 # Hack to overcome Windows wv 0.7.1 bug that causes embedded images to be broken
447 # If the Word file path has spaces in it, wv messes up and you end up with
448 # absolute paths for the images, and without the "file://" prefix
449 # So check for this special case and massage the data to be correct
450 if ($ENV{'GSDLOS'} =~ /^windows/i && $self->{'plugin_type'} eq "WordPlug" && $link =~ /^[A-Za-z]\:\\/) {
451 $link =~ s/^.*\\([^\\]+)$/$1/;
452 }
453
454 my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
455
456 my $img_file = $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section);
457 my $anchor_name = $img_file;
458 $anchor_name =~ s/^.*\///;
459 $anchor_name = "<a name=\"$anchor_name\">";
460
461 return $front . $img_file . $back . $anchor_name;
462}
463
464sub replace_href_links {
465 my $self = shift (@_);
466 my ($front, $link, $back, $base_dir, $file, $doc_obj, $section) = @_;
467
468 # attempt to sort out targets - frames are not handled
469 # well in this plugin and some cases will screw things
470 # up - e.g. the _parent target (so we'll just remove
471 # them all ;-)
472 $front =~ s/(target=\"?)_top(\"?)/$1_gsdltop_$2/is;
473 $back =~ s/(target=\"?)_top(\"?)/$1_gsdltop_$2/is;
474 $front =~ s/target=\"?_parent\"?//is;
475 $back =~ s/target=\"?_parent\"?//is;
476
477 return $front . $link . $back if $link =~ /^\#/s;
478 $link =~ s/\n/ /g;
479
480 my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
481 # href may use '\'s where '/'s should be on Windows
482 $href =~ s/\\/\//g;
483
484 my ($filename) = $href =~ /^(?:.*?):(?:\/\/)?(.*)/;
485
486
487 ##### leave all these links alone (they won't be picked up by intermediate
488 ##### pages). I think that's safest when dealing with frames, targets etc.
489 ##### (at least until I think of a better way to do it). Problems occur with
490 ##### mailto links from within small frames, the intermediate page is displayed
491 ##### within that frame and can't be seen. There is still potential for this to
492 ##### happen even with html pages - the solution seems to be to somehow tell
493 ##### the browser from the server side to display the page being sent (i.e.
494 ##### the intermediate page) in the top level window - I'm not sure if that's
495 ##### possible - the following line should probably be deleted if that can be done
496 return $front . $link . $back if $href =~ /^(mailto|news|gopher|nntp|telnet|javascript):/is;
497
498
499 if (($rl == 0) || ($filename =~ /$self->{'process_exp'}/) ||
500 ($href =~ /\/$/) || ($href =~ /^(mailto|news|gopher|nntp|telnet|javascript):/i)) {
501 &ghtml::urlsafe ($href);
502 return $front . "_httpextlink_&rl=" . $rl . "&href=" . $href . $hash_part . $back;
503 } else {
504 # link is to some other type of file (eg image) so we'll
505 # need to associate that file
506 return $front . $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section) . $back;
507 }
508}
509
510sub add_file {
511 my $self = shift (@_);
512 my ($href, $rl, $hash_part, $base_dir, $doc_obj, $section) = @_;
513 my ($newname);
514
515 my $filename = $href;
516 $filename =~ s/^[^:]*:\/\///;
517 $filename = &util::filename_cat($base_dir, $filename);
518
519 # Replace %20's in URL with a space if required. Note that the filename
520 # may include the %20 in some situations
521 if ($filename =~ /\%20/) {
522 if (!-e $filename) {
523 $filename =~ s/\%20/ /g;
524 }
525 }
526
527 my ($ext) = $filename =~ /(\.[^\.]*)$/;
528
529 if ((!defined $ext) || ($ext !~ /$self->{'assoc_files'}/)) {
530 return "_httpextlink_&rl=" . $rl . "&href=" . $href . $hash_part;
531 }
532 if ($self->{'rename_assoc_files'}) {
533 if (defined $self->{'aux_files'}->{$href}) {
534 $newname = $self->{'aux_files'}->{$href}->{'dir_num'} . "/" .
535 $self->{'aux_files'}->{$href}->{'file_num'} . $ext;
536 } else {
537 $newname = $self->{'dir_num'} . "/" . $self->{'file_num'} . $ext;
538 $self->{'aux_files'}->{$href} = {'dir_num' => $self->{'dir_num'}, 'file_num' => $self->{'file_num'}};
539 $self->inc_filecount ();
540 }
541 $doc_obj->associate_file($filename, $newname, undef, $section);
542 return "_httpdocimg_/$newname";
543 } else {
544 ($newname) = $filename =~ /([^\/\\]*)$/;
545 $doc_obj->associate_file($filename, $newname, undef, $section);
546 return "_httpdocimg_/$newname";
547 }
548}
549
550
551sub format_link {
552 my $self = shift (@_);
553 my ($link, $base_dir, $file) = @_;
554
555 my ($before_hash, $hash_part) = $link =~ /^([^\#]*)(\#?.*)$/;
556
557 $hash_part = "" if !defined $hash_part;
558 if (!defined $before_hash || $before_hash !~ /[\w\.\/]/) {
559 my $outhandle = $self->{'outhandle'};
560 print $outhandle "HTMLPlug: ERROR - badly formatted tag ignored ($link)\n"
561 if $self->{'verbosity'};
562 return ($link, "", 0);
563 }
564
565 if ($before_hash =~ s@^((?:http|ftp|file)://)@@i) {
566 my $type = $1;
567
568 if ($link =~ /^(http|ftp):/i) {
569 # Turn url (using /) into file name (possibly using \ on windows)
570 my @http_dir_split = split('/', $before_hash);
571 $before_hash = &util::filename_cat(@http_dir_split);
572 }
573
574 $before_hash = $self->eval_dir_dots($before_hash);
575
576 my $linkfilename = &util::filename_cat ($base_dir, $before_hash);
577
578 my $rl = 0;
579 $rl = 1 if (-e $linkfilename);
580
581 # make sure there's a slash on the end if it's a directory
582 if ($before_hash !~ /\/$/) {
583 $before_hash .= "/" if (-d $linkfilename);
584 }
585
586 return ($type . $before_hash, $hash_part, $rl);
587
588 } elsif ($link !~ /^(mailto|news|gopher|nntp|telnet|javascript):/i) {
589 if ($before_hash =~ s@^/@@ || $before_hash =~ /\\/) {
590
591 # the first directory will be the domain name if file_is_url
592 # to generate archives, otherwise we'll assume all files are
593 # from the same site and base_dir is the root
594
595 if ($self->{'file_is_url'}) {
596 my @dirs = split /[\/\\]/, $file;
597 my $domname = shift (@dirs);
598 $before_hash = &util::filename_cat($domname, $before_hash);
599 $before_hash =~ s@\\@/@g; # for windows
600 }
601 else
602 {
603 # see if link shares directory with source document
604 # => turn into relative link if this is so!
605
606 if ($ENV{'GSDLOS'} =~ /^windows/i) {
607 # too difficult doing a pattern match with embedded '\'s...
608 my $win_before_hash=$before_hash;
609 $win_before_hash =~ s@(\\)+@/@g;
610 # $base_dir is already similarly "converted" on windows.
611 if ($win_before_hash =~ s@^$base_dir/@@o) {
612 # if this is true, we removed a prefix
613 $before_hash=$win_before_hash;
614 }
615 }
616 else {
617 $before_hash = &util::filename_cat("",$before_hash);
618 $before_hash =~ s@^$base_dir/@@;
619 }
620 }
621 } else {
622 # Turn relative file path into full path
623 my $dirname = &File::Basename::dirname($file);
624 $before_hash = &util::filename_cat($dirname, $before_hash);
625 $before_hash = $self->eval_dir_dots($before_hash);
626 }
627
628 my $linkfilename = &util::filename_cat ($base_dir, $before_hash);
629 # make sure there's a slash on the end if it's a directory
630 if ($before_hash !~ /\/$/) {
631 $before_hash .= "/" if (-d $linkfilename);
632 }
633 return ("http://" . $before_hash, $hash_part, 1);
634 } else {
635 # mailto, news, nntp, telnet, javascript or gopher link
636 return ($before_hash, "", 0);
637 }
638}
639
640sub extract_first_NNNN_characters {
641 my $self = shift (@_);
642 my ($textref, $doc_obj, $thissection) = @_;
643
644 foreach my $size (split /,/, $self->{'first'}) {
645 my $tmptext = $$textref;
646 # skip to the body
647 $tmptext =~ s/.*<body[^>]*>//i;
648 # remove javascript
649 $tmptext =~ s@<script.*?</script>@ @sig;
650 $tmptext =~ s/<[^>]*>/ /g;
651 $tmptext =~ s/&nbsp;/ /g;
652 $tmptext =~ s/^\s+//;
653 $tmptext =~ s/\s+$//;
654 $tmptext =~ s/\s+/ /gs;
655 # with perl 5.6 at least, substr might segment at a multi-byte char...
656 use utf8;
657 $tmptext = substr ($tmptext, 0, $size);
658 $tmptext =~ s/\s\S*$/&#8230;/; # adds an ellipse (...)
659 $doc_obj->add_utf8_metadata ($thissection, "First$size", $tmptext);
660 }
661}
662
663
664sub extract_metadata {
665 my $self = shift (@_);
666 my ($textref, $metadata, $doc_obj, $section) = @_;
667 my $outhandle = $self->{'outhandle'};
668 # if we don't want metadata, we may as well not be here ...
669 return if (!defined $self->{'metadata_fields'});
670
671 # metadata fields to extract/save. 'key' is the (lowercase) name of the
672 # html meta, 'value' is the metadata name for greenstone to use
673 my %find_fields = ();
674
675 my %creator_fields = (); # short-cut for lookups
676
677
678 foreach my $field (split /,/, $self->{'metadata_fields'}) {
679 # support tag<tagname>
680 if ($field =~ /^(.*?)<(.*?)>$/) {
681 # "$2" is the user's preferred gs metadata name
682 $find_fields{lc($1)}=$2; # lc = lowercase
683 } else { # no <tagname> for mapping
684 # "$field" is the user's preferred gs metadata name
685 $find_fields{lc($field)}=$field; # lc = lowercase
686 }
687 }
688
689 if (defined $self->{'hunt_creator_metadata'} &&
690 $self->{'hunt_creator_metadata'} == 1 ) {
691 my @extra_fields =
692 (
693 'author',
694 'author.email',
695 'creator',
696 'dc.creator',
697 'dc.creator.corporatename',
698 );
699
700 # add the creator_metadata fields to search for
701 foreach my $field (@extra_fields) {
702 $creator_fields{$field}=0; # add to lookup hash
703 }
704 }
705
706
707 # find the header in the html file, which has the meta tags
708 $$textref =~ m@<head>(.*?)</head>@si;
709
710 my $html_header=$1;
711 # go through every <meta... tag defined in the html and see if it is
712 # one of the tags we want to match.
713
714 # special case for title - we want to remember if its been found
715 my $found_title = 0;
716 # this assumes that ">" won't appear. (I don't think it's allowed to...)
717 $html_header =~ /^/; # match the start of the string, for \G assertion
718
719 while ($html_header =~ m/\G.*?<meta(.*?)>/sig) {
720 my $metatag=$1;
721 my ($tag, $value);
722
723 # find the tag name
724 $metatag =~ /(?:name|http-equiv)\s*=\s*([\"\'])?(.*?)\1/is;
725 $tag=$2;
726 # in case they're not using " or ', but they should...
727 if (! $tag) {
728 $metatag =~ /(?:name|http-equiv)\s*=\s*([^\s\>]+)/is;
729 $tag=$1;
730 }
731
732 if (!defined $tag) {
733 print $outhandle "HTMLPlug: can't find NAME in \"$metatag\"\n";
734 next;
735 }
736
737 # don't need to assign this field if it was passed in from a previous
738 # (recursive) plugin
739 if (defined $metadata->{$tag}) {next}
740
741 # find the tag content
742 $metatag =~ /content\s*=\s*([\"\'])?(.*?)\1/is;
743 $value=$2;
744 if (! $value) {
745 $metatag =~ /(?:name|http-equiv)\s*=\s*([^\s\>]+)/is;
746 $value=$1;
747 }
748 if (!defined $value) {
749 print $outhandle "HTMLPlug: can't find VALUE in \"$metatag\"\n";
750 next;
751 }
752
753 # clean up and add
754 $value =~ s/\s+/ /gs;
755 chomp($value); # remove trailing \n, if any
756 if (exists $creator_fields{lc($tag)}) {
757 # map this value onto greenstone's "Creator" metadata
758 $tag='Creator';
759 } elsif (!exists $find_fields{lc($tag)}) {
760 next; # don't want this tag
761 } else {
762 # get the user's preferred capitalisation
763 $tag = $find_fields{lc($tag)};
764 }
765 if (lc($tag) eq "title") {
766 $found_title = 1;
767 }
768 print $outhandle " extracted \"$tag\" metadata \"$value\"\n"
769 if ($self->{'verbosity'} > 2);
770 $doc_obj->add_utf8_metadata($section, $tag, $value);
771
772 }
773
774 # TITLE: extract the document title
775 if (exists $find_fields{'title'} && !$found_title) {
776 # we want a title, and didn't find one in the meta tags
777 # see if there's a <title> tag
778 my $title;
779 my $from = ""; # for debugging output only
780 if ($html_header =~ /<title[^>]*>([^<]+)<\/title[^>]*>/is) {
781 $title = $1;
782 $from = "<title> tags";
783 }
784
785 if (!defined $title) {
786 $from = "first 100 chars";
787 # if no title use first 100 or so characters
788 $title = $$textref;
789 $title =~ s/^\xFE\xFF//; # Remove unicode byte order mark
790 $title =~ s/^.*?<body>//si;
791 # ignore javascript!
792 $title =~ s@<script.*?</script>@ @sig;
793 $title =~ s/<\/([^>]+)><\1>//g; # (eg) </b><b> - no space
794 $title =~ s/<[^>]*>/ /g; # remove all HTML tags
795 $title = substr ($title, 0, 100);
796 $title =~ s/\s\S*$/.../;
797 }
798 $title =~ s/<[^>]*>/ /g; # remove html tags
799 $title =~ s/&nbsp;/ /g;
800 $title =~ s/(?:&nbsp;|\xc2\xa0)/ /g; # utf-8 for nbsp...
801 $title =~ s/\s+/ /gs; # collapse multiple spaces
802 $title =~ s/^\s*//; # remove leading spaces
803 $title =~ s/\s*$//; # remove trailing spaces
804
805 $title =~ s/^$self->{'title_sub'}// if ($self->{'title_sub'});
806 $title =~ s/^\s+//s; # in case title_sub introduced any...
807 $doc_obj->add_utf8_metadata ($section, 'Title', $title);
808 print $outhandle " extracted Title metadata \"$title\" from $from\n"
809 if ($self->{'verbosity'} > 2);
810 }
811
812 # add FileFormat metadata
813 $doc_obj->add_metadata($section,"FileFormat", "HTML");
814
815 # Special, for metadata names such as tagH1 - extracts
816 # the text between the first <H1> and </H1> tags into "H1" metadata.
817
818 foreach my $field (keys %find_fields) {
819 if ($field !~ /^tag([a-z0-9]+)$/i) {next}
820 my $tag = $1;
821 if ($$textref =~ m@<$tag[^>]*>(.*?)</$tag[^>]*>@g) {
822 my $content = $1;
823 $content =~ s/&nbsp;/ /g;
824 $content =~ s/<[^>]*>/ /g;
825 $content =~ s/^\s+//;
826 $content =~ s/\s+$//;
827 $content =~ s/\s+/ /gs;
828 if ($content) {
829 $tag=$find_fields{"tag$tag"}; # get the user's capitalisation
830 $tag =~ s/^tag//i;
831 $doc_obj->add_utf8_metadata ($section, $tag, $content);
832 print $outhandle " extracted \"$tag\" metadata \"$content\"\n"
833 if ($self->{'verbosity'} > 2);
834 }
835 }
836 }
837}
838
839
840# evaluate any "../" to next directory up
841# evaluate any "./" as here
842sub eval_dir_dots {
843 my $self = shift (@_);
844 my ($filename) = @_;
845 my $dirsep_os = &util::get_os_dirsep();
846 my @dirsep = split(/$dirsep_os/,$filename);
847
848 my @eval_dirs = ();
849 foreach my $d (@dirsep) {
850 if ($d eq "..") {
851 pop(@eval_dirs);
852
853 } elsif ($d eq ".") {
854 # do nothing!
855
856 } else {
857 push(@eval_dirs,$d);
858 }
859 }
860
861 # Need to fiddle with number of elements in @eval_dirs if the
862 # first one is the empty string. This is because of a
863 # modification to util::filename_cat that supresses the addition
864 # of a leading '/' character (or \ if windows) (intended to help
865 # filename cat with relative paths) if the first entry in the
866 # array is the empty string. Making the array start with *two*
867 # empty strings is a way to defeat this "smart" option.
868 #
869 if (scalar(@eval_dirs) > 0) {
870 if ($eval_dirs[0] eq ""){
871 unshift(@eval_dirs,"");
872 }
873 }
874 return &util::filename_cat(@eval_dirs);
875}
876
877sub replace_usemap_links {
878 my $self = shift (@_);
879 my ($front, $link, $back) = @_;
880
881 $link =~ s/^\.\///;
882 return $front . $link . $back;
883}
884
885sub inc_filecount {
886 my $self = shift (@_);
887
888 if ($self->{'file_num'} == 1000) {
889 $self->{'dir_num'} ++;
890 $self->{'file_num'} = 0;
891 } else {
892 $self->{'file_num'} ++;
893 }
894}
895
896
897# Extend the BasPlug read_file so that strings like &eacute; are
898# converted to UTF8 internally.
899#
900# We don't convert &lt; or &gt; or &amp; or &quot; in case
901# they interfere with the GML files
902
903sub read_file {
904 my ($self, $filename, $encoding, $language, $textref) = @_;
905
906 &BasPlug::read_file($self, $filename, $encoding, $language, $textref);
907
908 # Convert entities to their UTF8 equivalents
909 $$textref =~ s/&(lt|gt|amp|quot|nbsp);/&z$1;/go;
910 $$textref =~ s/&([^;]+);/&ghtml::getcharequiv($1,1)/gseo;
911 $$textref =~ s/&z(lt|gt|amp|quot|nbsp);/&$1;/go;
912}
913
9141;
Note: See TracBrowser for help on using the repository browser.