source: trunk/gsdl/perllib/plugins/HTMLPlug.pm@ 9056

Last change on this file since 9056 was 9056, checked in by kjdon, 19 years ago

added an option to not strip html tags from metadata in description tags. contributed by jens wille

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 29.6 KB
RevLine 
[585]1###########################################################################
2#
3# HTMLPlug.pm -- basic html plugin
[808]4#
[585]5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 1999 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
[808]27#
28# Note that this plugin handles frames only in a very simple way
29# i.e. each frame is treated as a separate document. This means
30# search results will contain links to individual frames rather
31# than linking to the top level frameset.
32# There may also be some problems caused by the _parent target
33# (it's removed by this plugin)
34#
[585]35
36package HTMLPlug;
37
[1435]38use BasPlug;
[1010]39use ghtml;
[1891]40use unicode;
[585]41use util;
[808]42use parsargv;
[8509]43use XMLParser;
[585]44
45sub BEGIN {
[8716]46 @HTMLPlug::ISA = ('BasPlug');
[585]47}
48
[7202]49use strict; # every perl program should have this!
50no strict 'refs'; # make an exception so we can use variables as filehandles
51
[4744]52my $arguments =
53 [ { 'name' => "process_exp",
[4873]54 'desc' => "{BasPlug.process_exp}",
[6408]55 'type' => "regexp",
[4744]56 'deft' => &get_default_process_exp() },
57 { 'name' => "block_exp",
[4873]58 'desc' => "{BasPlug.block_exp}",
[6408]59 'type' => 'regexp',
[4744]60 'deft' => &get_default_block_exp() },
61 { 'name' => "nolinks",
[4873]62 'desc' => "{HTMLPlug.nolinks}",
[4744]63 'type' => "flag" },
64 { 'name' => "keep_head",
[4873]65 'desc' => "{HTMLPlug.keep_head}",
[4744]66 'type' => "flag" },
67 { 'name' => "no_metadata",
[4873]68 'desc' => "{HTMLPlug.no_metadata}",
[4744]69 'type' => "flag" },
70 { 'name' => "metadata_fields",
[4873]71 'desc' => "{HTMLPlug.metadata_fields}",
[5096]72 'type' => "string",
[4744]73 'deft' => "Title" },
74 { 'name' => "hunt_creator_metadata",
[4873]75 'desc' => "{HTMLPlug.hunt_creator_metadata}",
[4744]76 'type' => "flag" },
77 { 'name' => "file_is_url",
[4873]78 'desc' => "{HTMLPlug.file_is_url}",
[4744]79 'type' => "flag" },
80 { 'name' => "assoc_files",
[4873]81 'desc' => "{HTMLPlug.assoc_files}",
[6408]82 'type' => "regexp",
83 'deft' => &get_default_block_exp() },
[4744]84 { 'name' => "rename_assoc_files",
[4873]85 'desc' => "{HTMLPlug.rename_assoc_files}",
[4744]86 'type' => "flag" },
87 { 'name' => "title_sub",
[4873]88 'desc' => "{HTMLPlug.title_sub}",
[4744]89 'type' => "string",
90 'deft' => "" },
91 { 'name' => "description_tags",
[4873]92 'desc' => "{HTMLPlug.description_tags}",
[9056]93 'type' => "flag" } ,
94 { 'name' => "no_strip_metadata_html",
95 'desc' => "{HTMLPlug.no_strip_metadata_html}",
96 'type' => "string",
97 'deft' => "",
98 'reqd' => "no"}
99 ];
[3540]100
101my $options = { 'name' => "HTMLPlug",
[5680]102 'desc' => "{HTMLPlug.desc}",
[6408]103 'abstract' => "no",
[3540]104 'inherits' => "yes",
105 'args' => $arguments };
106
[585]107sub new {
[808]108 my $class = shift (@_);
[1435]109 my $self = new BasPlug ($class, @_);
[5924]110 $self->{'plugin_type'} = "HTMLPlug";
[3540]111 # 14-05-02 To allow for proper inheritance of arguments - John Thompson
112 my $option_list = $self->{'option_list'};
113 push( @{$option_list}, $options );
114
[808]115 if (!parsargv::parse(\@_,
116 q^nolinks^, \$self->{'nolinks'},
117 q^keep_head^, \$self->{'keep_head'},
118 q^no_metadata^, \$self->{'no_metadata'},
119 q^metadata_fields/.*/Title^, \$self->{'metadata_fields'},
[1602]120 q^hunt_creator_metadata^, \$self->{'hunt_creator_metadata'},
[900]121 q^w3mir^, \$self->{'w3mir'},
[2342]122 q^file_is_url^, \$self->{'file_is_url'},
[3539]123 q^assoc_files/.*/(?i)\.(jpe?g|jpe|gif|png|css)$^, \$self->{'assoc_files'},
[1220]124 q^rename_assoc_files^, \$self->{'rename_assoc_files'},
[1410]125 q^title_sub/.*/^, \$self->{'title_sub'},
[2817]126 q^description_tags^, \$self->{'description_tags'},
[9056]127 q^no_strip_metadata_html/.*/^, \$self->{'no_strip_metadata_html'},
[1220]128 "allow_extra_options")) {
[1244]129
130 print STDERR "\nIncorrect options passed to HTMLPlug, check your collect.cfg configuration file\n";
[4873]131 $self->print_txt_usage(""); # Use default resource bundle
[808]132 die "\n";
133 }
[2342]134
135 # retain this for backward compatibility (w3mir option was replaced by
136 # file_is_url)
137 if ($self->{'w3mir'}) {
138 $self->{'file_is_url'} = 1;
139 }
[1243]140
[808]141 $self->{'aux_files'} = {};
142 $self->{'dir_num'} = 0;
143 $self->{'file_num'} = 0;
[8509]144
[585]145 return bless $self, $class;
146}
147
[8366]148# may want to use (?i)\.(gif|jpe?g|jpe|png|css|js(?:@.*)?)$
149# if have eg <script language="javascript" src="img/lib.js@123">
[1243]150sub get_default_block_exp {
[585]151 my $self = shift (@_);
[8914]152
[8509]153 return q^(?i)\.(gif|jpe?g|jpe|jpg|png|css)$^;
[585]154}
155
[1243]156sub get_default_process_exp {
[808]157 my $self = shift (@_);
[8914]158
[1403]159 # the last option is an attempt to encode the concept of an html query ...
[3135]160 return q^(?i)(\.html?|\.shtml|\.shm|\.asp|\.php\d?|\.cgi|.+\?.+=.*)$^;
[1243]161}
[721]162
[8914]163
[8509]164sub metadata_read {
165 my $self = shift (@_);
166 my ($pluginfo, $base_dir, $file, $metadata, $extrametakeys, $extrametadata, $processor, $maxdocs, $gli) = @_;
[1243]167
[8509]168 my $outhandle = $self->{'outhandle'};
169
170 my $filename = $file;
171 $filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;
172
173 my ($dir) = $filename =~ /^(.*?)[^\/\\]*$/;
174
175 if ($filename !~ /$self->{'process_exp'}/ || !-f $filename) {
176 return undef; # can't recognise
177 }
178
[8914]179 if ($self->{'smart_block'}) {
180 # Do encoding stuff
181 my ($language, $encoding) = $self->textcat_get_language_encoding ($filename);
[8509]182
[8914]183 # read in file ($text will be in utf8)
184 my $text = "";
185 $self->read_file ($filename, $encoding, $language, \$text);
186 $self->store_block_files (\$text, $filename);
187 }
[8509]188 return 1;
189}
190
191sub store_block_files
192{
193 my $self =shift (@_);
194 my ($textref, $filename) = @_;
195
196 my $html_fname = $filename;
197 my @file_blocks;
198
199 my $opencom = '(?:<!--|&lt;!(?:&mdash;|&#151;|--))';
200 my $closecom = '(?:-->|(?:&mdash;|&#151;|--)&gt;)';
201 $$textref =~ s/$opencom(.*?)$closecom//gs;
202
203 my $attval = "\\\"[^\\\"]+\\\"|[^\\s>]+";
204 my @img_matches = ($$textref =~ m/<img[^>]*?src\s*=\s*($attval)[^>]*>/igs);
205 my @usemap_matches = ($$textref =~ m/<img[^>]*?usemap\s*=\s*($attval)[^>]*>/igs);
206 my @link_matches = ($$textref =~ m/<link[^>]*?href\s*=\s*($attval)[^>]*>/igs);
207
208
209 foreach my $link (@img_matches, @usemap_matches, @link_matches) {
210
211 # remove quotes from link at start and end if necessary
212 if ($link=~/^\"/) {
213 $link=~s/^\"//;
214 $link=~s/\"$//;
215 }
216
217 $link =~ s/\#.*$//s; # remove any anchor names, e.g. foo.html#name becomes foo.html
218
219 if ($link !~ s@^/@@ && $link !~ /^([A-Z]:?)\\/) {
220 # Turn relative file path into full path
221 my $dirname = &File::Basename::dirname($filename);
222 $link = &util::filename_cat($dirname, $link);
223 }
224 $link = $self->eval_dir_dots($link);
225 $self->{'file_blocks'}->{$link} = 1;
226 }
227}
228
[1243]229# do plugin specific processing of doc_obj
230sub process {
231 my $self = shift (@_);
[6332]232 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
[1431]233 my $outhandle = $self->{'outhandle'};
[6332]234
235 print STDERR "<Processing n='$file' p='HTMLPlug'>\n" if ($gli);
236
[1431]237 print $outhandle "HTMLPlug: processing $file\n"
[808]238 if $self->{'verbosity'} > 1;
[721]239
[3019]240 if ($ENV{'GSDLOS'} =~ /^windows/i) {
241 # this makes life so much easier... perl can cope with unix-style '/'s.
242 $base_dir =~ s@(\\)+@/@g;
243 $file =~ s@(\\)+@/@g;
244 }
[3148]245
246 # reset per-doc stuff...
247 $self->{'aux_files'} = {};
248 $self->{'dir_num'} = 0;
249 $self->{'file_num'} = 0;
250
[808]251 my $cursection = $doc_obj->get_top_section();
[1220]252
[8509]253 $self->extract_metadata ($textref, $metadata, $doc_obj, $cursection)
[2817]254 unless $self->{'no_metadata'} || $self->{'description_tags'};
[721]255
[808]256 # Store URL for page as metadata - this can be used for an
257 # altavista style search interface. The URL won't be valid
258 # unless the file structure contains the domain name (i.e.
259 # like when w3mir is used to download a website).
260 my $web_url = "http://$file";
[4845]261 $doc_obj->add_metadata($cursection, "URL", $web_url);
[721]262
[2817]263 if ($self->{'description_tags'}) {
[2995]264 # remove the html header - note that doing this here means any
265 # sections defined within the header will be lost (so all <Section>
266 # tags must appear within the body of the HTML)
[8509]267 my ($head_keep) = ($$textref =~ m/^(.*?)<body[^>]*>/is);
268
[2995]269 $$textref =~ s/^.*?<body[^>]*>//is;
270 $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
271
[2819]272 my $opencom = '(?:<!--|&lt;!(?:&mdash;|&#151;|--))';
273 my $closecom = '(?:-->|(?:&mdash;|&#151;|--)&gt;)';
[8509]274
[2819]275 my $lt = '(?:<|&lt;)';
276 my $gt = '(?:>|&gt;)';
277 my $quot = '(?:"|&quot;|&rdquo;|&ldquo;)';
278
[2817]279 my $found_something = 0; my $top = 1;
[2819]280 while ($$textref =~ s/^(.*?)$opencom(.*?)$closecom//s) {
[2817]281 my $text = $1;
282 my $comment = $2;
283 if (defined $text) {
[3369]284 # text before a comment - note that getting to here
285 # doesn't necessarily mean there are Section tags in
286 # the document
[2817]287 $self->process_section(\$text, $base_dir, $file, $doc_obj, $cursection);
288 }
[2819]289 while ($comment =~ s/$lt(.*?)$gt//s) {
[2817]290 my $tag = $1;
291 if ($tag eq "Section") {
292 $found_something = 1;
293 $cursection = $doc_obj->insert_section($doc_obj->get_end_child($cursection)) unless $top;
294 $top = 0;
295 } elsif ($tag eq "/Section") {
296 $found_something = 1;
297 $cursection = $doc_obj->get_parent_section ($cursection);
[2819]298 } elsif ($tag =~ /^Metadata name=$quot(.*?)$quot/s) {
[2817]299 my $metaname = $1;
[9053]300 my $accumulate = $tag =~ /mode=${quot}accumulate${quot}/ ? 1 : 0;
[2819]301 $comment =~ s/^(.*?)$lt\/Metadata$gt//s;
[2817]302 my $metavalue = $1;
303 $metavalue =~ s/^\s+//;
304 $metavalue =~ s/\s+$//;
[2819]305 # assume that no metadata value intentionally includes
306 # carriage returns or HTML tags (if they're there they
307 # were probably introduced when converting to HTML from
308 # some other format).
[9056]309 # actually some people want to have html tags in their
310 # metadata.
[2819]311 $metavalue =~ s/[\cJ\cM]/ /sg;
[9056]312 my $dont = '';
313 if ($self->{'no_strip_metadata_html'}) {
314 ($dont = $self->{'no_strip_metadata_html'}) =~ s{,}{|}g;
315 }
316 $metavalue =~ s/<[^>]+>//sg
317 unless $dont && ($dont eq 'all' || $metaname =~ /^($dont)$/);
318 if ($dont && ($dont eq 'all' || $metaname =~ /^($dont)$/)) {
319 }
[2819]320 $metavalue =~ s/\s+/ /sg;
[9053]321 if ($accumulate) {
322 $doc_obj->add_utf8_metadata($cursection, $metaname, $metavalue);
323 } else {
324 $doc_obj->set_utf8_metadata_element($cursection, $metaname, $metavalue);
325 }
[2819]326 } elsif ($tag eq "Description" || $tag eq "/Description") {
327 # do nothing with containing Description tags
328 } else {
329 # simple HTML tag (probably created by the conversion
330 # to HTML from some other format) - we'll ignore it and
331 # hope for the best ;-)
[2817]332 }
333 }
334 }
335 if ($cursection ne "") {
336 print $outhandle "HTMLPlug: WARNING: $file contains unmatched <Section></Section> tags\n";
337 }
338
339 $$textref =~ s/^.*?<body[^>]*>//is;
340 $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
341 if ($$textref =~ /\S/) {
342 if (!$found_something) {
[8509]343 if ($self->{'verbosity'} > 2) {
344 print $outhandle "HTMLPlug: WARNING: $file appears to contain no Section tags so\n";
345 print $outhandle " will be processed as a single section document\n";
346 }
347
[3369]348 # go ahead and process single-section document
[3349]349 $self->process_section($textref, $base_dir, $file, $doc_obj, $cursection);
[3369]350
351 # if document contains no Section tags we'll go ahead
352 # and extract metadata (this won't have been done
353 # above as the -description_tags option prevents it)
[8509]354 my $complete_text = $head_keep.$doc_obj->get_text($cursection);
355 $self->extract_metadata (\$complete_text, $metadata, $doc_obj, $cursection)
[3369]356 unless $self->{'no_metadata'};
357
[2817]358 } else {
359 print $outhandle "HTMLPlug: WARNING: $file contains the following text outside\n";
360 print $outhandle " of the final closing </Section> tag. This text will\n";
361 print $outhandle " be ignored.";
[8509]362
[2819]363 my ($text);
[2817]364 if (length($$textref) > 30) {
365 $text = substr($$textref, 0, 30) . "...";
[2819]366 } else {
367 $text = $$textref;
[2817]368 }
369 $text =~ s/\n/ /isg;
370 print $outhandle " ($text)\n";
371 }
[3369]372 } elsif (!$found_something) {
373
[8509]374 if ($self->{'verbosity'} > 2) {
375 # may get to here if document contained no valid Section
376 # tags but did contain some comments. The text will have
377 # been processed already but we should print the warning
378 # as above and extract metadata
379 print $outhandle "HTMLPlug: WARNING: $file appears to contain no Section tags and\n";
380 print $outhandle " is blank or empty. Metadata will be assigned if present.\n";
381 }
[3369]382
[8509]383 my $complete_text = $head_keep.$doc_obj->get_text($cursection);
384 $self->extract_metadata (\$complete_text, $metadata, $doc_obj, $cursection)
[3369]385 unless $self->{'no_metadata'};
[2817]386 }
387
388 } else {
[2995]389
390 # remove header and footer
391 if (!$self->{'keep_head'} || $self->{'description_tags'}) {
392 $$textref =~ s/^.*?<body[^>]*>//is;
393 $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
394 }
395
[2817]396 # single section document
397 $self->process_section($textref, $base_dir, $file, $doc_obj, $cursection);
398 }
399 return 1;
400}
401
402# note that process_section may be called multiple times for a single
403# section (relying on the fact that add_utf8_text appends the text to any
404# that may exist already).
405sub process_section {
406 my $self = shift (@_);
407 my ($textref, $base_dir, $file, $doc_obj, $cursection) = @_;
[808]408 # trap links
409 if (!$self->{'nolinks'}) {
[721]410
[808]411 # usemap="./#index" not handled correctly => change to "#index"
[1243]412 $$textref =~ s/(<img[^>]*?usemap\s*=\s*\"?)([^\">\s]+)(\"?[^>]*>)/
[808]413 $self->replace_usemap_links($1, $2, $3)/isge;
[721]414
[8366]415 $$textref =~ s/(<(?:a|area|frame|link|script)\s+[^>]*?\s*(?:href|src)\s*=\s*\"?)([^\">\s]+)(\"?[^>]*>)/
[897]416 $self->replace_href_links ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
[721]417 }
418
[808]419 # trap images
[1929]420
[2695]421 # allow spaces if inside quotes - jrm21
422 $$textref =~ s/(<img[^>]*?src\s*=\s*)(\"[^\"]+\"|[^\s>]+)([^>]*>)/
[897]423 $self->replace_images ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
[1244]424
425 # add text to document object
[3019]426 # turn \ into \\ so that the rest of greenstone doesn't think there
427 # is an escape code following. (Macro parsing loses them...)
428 $$textref =~ s/\\/\\\\/go;
[1358]429 $doc_obj->add_utf8_text($cursection, $$textref);
[721]430}
431
[808]432sub replace_images {
433 my $self = shift (@_);
[897]434 my ($front, $link, $back, $base_dir,
435 $file, $doc_obj, $section) = @_;
[8509]436
[2695]437 # remove quotes from link at start and end if necessary
438 if ($link=~/^\"/) {
439 $link=~s/^\"//;$link=~s/\"$//;
440 $front.='"';
441 $back="\"$back";
442 }
443
[808]444 $link =~ s/\n/ /g;
[7949]445
446 # Hack to overcome Windows wv 0.7.1 bug that causes embedded images to be broken
447 # If the Word file path has spaces in it, wv messes up and you end up with
448 # absolute paths for the images, and without the "file://" prefix
449 # So check for this special case and massage the data to be correct
[7966]450 if ($ENV{'GSDLOS'} =~ /^windows/i && $self->{'plugin_type'} eq "WordPlug" && $link =~ /^[A-Za-z]\:\\/) {
451 $link =~ s/^.*\\([^\\]+)$/$1/;
[7949]452 }
453
[808]454 my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
[8509]455
[6812]456 my $img_file = $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section);
457 my $anchor_name = $img_file;
458 $anchor_name =~ s/^.*\///;
459 $anchor_name = "<a name=\"$anchor_name\">";
460
[7595]461 return $front . $img_file . $back . $anchor_name;
[721]462}
463
[808]464sub replace_href_links {
[585]465 my $self = shift (@_);
[897]466 my ($front, $link, $back, $base_dir, $file, $doc_obj, $section) = @_;
[585]467
[808]468 # attempt to sort out targets - frames are not handled
469 # well in this plugin and some cases will screw things
470 # up - e.g. the _parent target (so we'll just remove
471 # them all ;-)
472 $front =~ s/(target=\"?)_top(\"?)/$1_gsdltop_$2/is;
473 $back =~ s/(target=\"?)_top(\"?)/$1_gsdltop_$2/is;
474 $front =~ s/target=\"?_parent\"?//is;
475 $back =~ s/target=\"?_parent\"?//is;
[721]476
[808]477 return $front . $link . $back if $link =~ /^\#/s;
478 $link =~ s/\n/ /g;
[721]479
[808]480 my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
[1312]481 # href may use '\'s where '/'s should be on Windows
482 $href =~ s/\\/\//g;
[585]483
[850]484 my ($filename) = $href =~ /^(?:.*?):(?:\/\/)?(.*)/;
[8509]485
[8914]486
[897]487 ##### leave all these links alone (they won't be picked up by intermediate
488 ##### pages). I think that's safest when dealing with frames, targets etc.
489 ##### (at least until I think of a better way to do it). Problems occur with
490 ##### mailto links from within small frames, the intermediate page is displayed
491 ##### within that frame and can't be seen. There is still potential for this to
492 ##### happen even with html pages - the solution seems to be to somehow tell
493 ##### the browser from the server side to display the page being sent (i.e.
494 ##### the intermediate page) in the top level window - I'm not sure if that's
495 ##### possible - the following line should probably be deleted if that can be done
496 return $front . $link . $back if $href =~ /^(mailto|news|gopher|nntp|telnet|javascript):/is;
497
[1605]498
[850]499 if (($rl == 0) || ($filename =~ /$self->{'process_exp'}/) ||
[808]500 ($href =~ /\/$/) || ($href =~ /^(mailto|news|gopher|nntp|telnet|javascript):/i)) {
[1010]501 &ghtml::urlsafe ($href);
[897]502 return $front . "_httpextlink_&rl=" . $rl . "&href=" . $href . $hash_part . $back;
[808]503 } else {
[1686]504 # link is to some other type of file (eg image) so we'll
[808]505 # need to associate that file
[965]506 return $front . $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section) . $back;
[721]507 }
[808]508}
[721]509
[808]510sub add_file {
511 my $self = shift (@_);
[965]512 my ($href, $rl, $hash_part, $base_dir, $doc_obj, $section) = @_;
[808]513 my ($newname);
[585]514
[808]515 my $filename = $href;
516 $filename =~ s/^[^:]*:\/\///;
[1410]517 $filename = &util::filename_cat($base_dir, $filename);
[3708]518
519 # Replace %20's in URL with a space if required. Note that the filename
520 # may include the %20 in some situations
521 if ($filename =~ /\%20/) {
522 if (!-e $filename) {
523 $filename =~ s/\%20/ /g;
524 }
525 }
526
[808]527 my ($ext) = $filename =~ /(\.[^\.]*)$/;
[965]528
529 if ((!defined $ext) || ($ext !~ /$self->{'assoc_files'}/)) {
530 return "_httpextlink_&rl=" . $rl . "&href=" . $href . $hash_part;
531 }
[900]532 if ($self->{'rename_assoc_files'}) {
533 if (defined $self->{'aux_files'}->{$href}) {
534 $newname = $self->{'aux_files'}->{$href}->{'dir_num'} . "/" .
535 $self->{'aux_files'}->{$href}->{'file_num'} . $ext;
536 } else {
537 $newname = $self->{'dir_num'} . "/" . $self->{'file_num'} . $ext;
[965]538 $self->{'aux_files'}->{$href} = {'dir_num' => $self->{'dir_num'}, 'file_num' => $self->{'file_num'}};
[900]539 $self->inc_filecount ();
540 }
541 $doc_obj->associate_file($filename, $newname, undef, $section);
[3148]542 return "_httpdocimg_/$newname";
[585]543 } else {
[900]544 ($newname) = $filename =~ /([^\/\\]*)$/;
545 $doc_obj->associate_file($filename, $newname, undef, $section);
[1020]546 return "_httpdocimg_/$newname";
[585]547 }
[808]548}
[585]549
[721]550
[808]551sub format_link {
552 my $self = shift (@_);
553 my ($link, $base_dir, $file) = @_;
[585]554
[808]555 my ($before_hash, $hash_part) = $link =~ /^([^\#]*)(\#?.*)$/;
[8509]556
[808]557 $hash_part = "" if !defined $hash_part;
558 if (!defined $before_hash || $before_hash !~ /[\w\.\/]/) {
[1424]559 my $outhandle = $self->{'outhandle'};
560 print $outhandle "HTMLPlug: ERROR - badly formatted tag ignored ($link)\n"
[808]561 if $self->{'verbosity'};
562 return ($link, "", 0);
[732]563 }
[8509]564
[3019]565 if ($before_hash =~ s@^((?:http|ftp|file)://)@@i) {
[808]566 my $type = $1;
[1929]567
[808]568 if ($link =~ /^(http|ftp):/i) {
569 # Turn url (using /) into file name (possibly using \ on windows)
570 my @http_dir_split = split('/', $before_hash);
571 $before_hash = &util::filename_cat(@http_dir_split);
[585]572 }
573
[808]574 $before_hash = $self->eval_dir_dots($before_hash);
575
576 my $linkfilename = &util::filename_cat ($base_dir, $before_hash);
[8509]577
[808]578 my $rl = 0;
579 $rl = 1 if (-e $linkfilename);
[585]580
[808]581 # make sure there's a slash on the end if it's a directory
582 if ($before_hash !~ /\/$/) {
583 $before_hash .= "/" if (-d $linkfilename);
584 }
[585]585
[808]586 return ($type . $before_hash, $hash_part, $rl);
[721]587
[808]588 } elsif ($link !~ /^(mailto|news|gopher|nntp|telnet|javascript):/i) {
[3019]589 if ($before_hash =~ s@^/@@ || $before_hash =~ /\\/) {
[721]590
[2342]591 # the first directory will be the domain name if file_is_url
[808]592 # to generate archives, otherwise we'll assume all files are
593 # from the same site and base_dir is the root
[3019]594
[2342]595 if ($self->{'file_is_url'}) {
[808]596 my @dirs = split /[\/\\]/, $file;
597 my $domname = shift (@dirs);
598 $before_hash = &util::filename_cat($domname, $before_hash);
[3019]599 $before_hash =~ s@\\@/@g; # for windows
[808]600 }
[1410]601 else
602 {
603 # see if link shares directory with source document
604 # => turn into relative link if this is so!
[1929]605
606 if ($ENV{'GSDLOS'} =~ /^windows/i) {
[3019]607 # too difficult doing a pattern match with embedded '\'s...
608 my $win_before_hash=$before_hash;
609 $win_before_hash =~ s@(\\)+@/@g;
610 # $base_dir is already similarly "converted" on windows.
611 if ($win_before_hash =~ s@^$base_dir/@@o) {
612 # if this is true, we removed a prefix
613 $before_hash=$win_before_hash;
614 }
[1929]615 }
616 else {
[3019]617 $before_hash = &util::filename_cat("",$before_hash);
618 $before_hash =~ s@^$base_dir/@@;
[1929]619 }
[1410]620 }
[808]621 } else {
622 # Turn relative file path into full path
623 my $dirname = &File::Basename::dirname($file);
624 $before_hash = &util::filename_cat($dirname, $before_hash);
625 $before_hash = $self->eval_dir_dots($before_hash);
626 }
[721]627
[1410]628 my $linkfilename = &util::filename_cat ($base_dir, $before_hash);
[808]629 # make sure there's a slash on the end if it's a directory
630 if ($before_hash !~ /\/$/) {
631 $before_hash .= "/" if (-d $linkfilename);
632 }
633 return ("http://" . $before_hash, $hash_part, 1);
634 } else {
635 # mailto, news, nntp, telnet, javascript or gopher link
636 return ($before_hash, "", 0);
637 }
638}
[1605]639
[1602]640sub extract_first_NNNN_characters {
641 my $self = shift (@_);
642 my ($textref, $doc_obj, $thissection) = @_;
643
644 foreach my $size (split /,/, $self->{'first'}) {
645 my $tmptext = $$textref;
[4821]646 # skip to the body
[1602]647 $tmptext =~ s/.*<body[^>]*>//i;
[4821]648 # remove javascript
649 $tmptext =~ s@<script.*?</script>@ @sig;
[1602]650 $tmptext =~ s/<[^>]*>/ /g;
651 $tmptext =~ s/&nbsp;/ /g;
652 $tmptext =~ s/^\s+//;
653 $tmptext =~ s/\s+$//;
654 $tmptext =~ s/\s+/ /gs;
[8767]655 # with perl 5.6 at least, substr might segment at a multi-byte char...
656 use utf8;
[1602]657 $tmptext = substr ($tmptext, 0, $size);
[4821]658 $tmptext =~ s/\s\S*$/&#8230;/; # adds an ellipse (...)
[1602]659 $doc_obj->add_utf8_metadata ($thissection, "First$size", $tmptext);
660 }
661}
[721]662
[7202]663
[808]664sub extract_metadata {
665 my $self = shift (@_);
666 my ($textref, $metadata, $doc_obj, $section) = @_;
[1605]667 my $outhandle = $self->{'outhandle'};
[1602]668 # if we don't want metadata, we may as well not be here ...
[1400]669 return if (!defined $self->{'metadata_fields'});
670
[8843]671 # metadata fields to extract/save. 'key' is the (lowercase) name of the
672 # html meta, 'value' is the metadata name for greenstone to use
673 my %find_fields = ();
[7202]674
675 my %creator_fields = (); # short-cut for lookups
676
677
678 foreach my $field (split /,/, $self->{'metadata_fields'}) {
[8225]679 # support tag<tagname>
680 if ($field =~ /^(.*?)<(.*?)>$/) {
681 # "$2" is the user's preferred gs metadata name
682 $find_fields{lc($1)}=$2; # lc = lowercase
[8843]683 } else { # no <tagname> for mapping
684 # "$field" is the user's preferred gs metadata name
685 $find_fields{lc($field)}=$field; # lc = lowercase
[8225]686 }
[7202]687 }
688
689 if (defined $self->{'hunt_creator_metadata'} &&
690 $self->{'hunt_creator_metadata'} == 1 ) {
691 my @extra_fields =
692 (
693 'author',
694 'author.email',
695 'creator',
696 'dc.creator',
697 'dc.creator.corporatename',
698 );
699
700 # add the creator_metadata fields to search for
701 foreach my $field (@extra_fields) {
702 $creator_fields{$field}=0; # add to lookup hash
[1602]703 }
704 }
[721]705
[8509]706
[7202]707 # find the header in the html file, which has the meta tags
708 $$textref =~ m@<head>(.*?)</head>@si;
709
710 my $html_header=$1;
711 # go through every <meta... tag defined in the html and see if it is
712 # one of the tags we want to match.
713
[7235]714 # special case for title - we want to remember if its been found
715 my $found_title = 0;
[7202]716 # this assumes that ">" won't appear. (I don't think it's allowed to...)
717 $html_header =~ /^/; # match the start of the string, for \G assertion
[8509]718
[7202]719 while ($html_header =~ m/\G.*?<meta(.*?)>/sig) {
720 my $metatag=$1;
721 my ($tag, $value);
722
723 # find the tag name
724 $metatag =~ /(?:name|http-equiv)\s*=\s*([\"\'])?(.*?)\1/is;
725 $tag=$2;
726 # in case they're not using " or ', but they should...
727 if (! $tag) {
[8843]728 $metatag =~ /(?:name|http-equiv)\s*=\s*([^\s\>]+)/is;
[7202]729 $tag=$1;
[808]730 }
[1605]731
[7202]732 if (!defined $tag) {
733 print $outhandle "HTMLPlug: can't find NAME in \"$metatag\"\n";
[1230]734 next;
[1190]735 }
[1230]736
[7202]737 # don't need to assign this field if it was passed in from a previous
738 # (recursive) plugin
739 if (defined $metadata->{$tag}) {next}
740
741 # find the tag content
742 $metatag =~ /content\s*=\s*([\"\'])?(.*?)\1/is;
743 $value=$2;
744 if (! $value) {
[8843]745 $metatag =~ /(?:name|http-equiv)\s*=\s*([^\s\>]+)/is;
[7202]746 $value=$1;
747 }
748 if (!defined $value) {
749 print $outhandle "HTMLPlug: can't find VALUE in \"$metatag\"\n";
750 next;
751 }
752
753 # clean up and add
754 $value =~ s/\s+/ /gs;
[8794]755 chomp($value); # remove trailing \n, if any
[7202]756 if (exists $creator_fields{lc($tag)}) {
757 # map this value onto greenstone's "Creator" metadata
758 $tag='Creator';
759 } elsif (!exists $find_fields{lc($tag)}) {
760 next; # don't want this tag
761 } else {
762 # get the user's preferred capitalisation
763 $tag = $find_fields{lc($tag)};
764 }
[7235]765 if (lc($tag) eq "title") {
766 $found_title = 1;
767 }
[7202]768 print $outhandle " extracted \"$tag\" metadata \"$value\"\n"
769 if ($self->{'verbosity'} > 2);
770 $doc_obj->add_utf8_metadata($section, $tag, $value);
771
[808]772 }
[7202]773
774 # TITLE: extract the document title
[7235]775 if (exists $find_fields{'title'} && !$found_title) {
[7202]776 # we want a title, and didn't find one in the meta tags
777 # see if there's a <title> tag
778 my $title;
[8843]779 my $from = ""; # for debugging output only
[7235]780 if ($html_header =~ /<title[^>]*>([^<]+)<\/title[^>]*>/is) {
[7202]781 $title = $1;
[7235]782 $from = "<title> tags";
[7202]783 }
[8509]784
[7202]785 if (!defined $title) {
[7235]786 $from = "first 100 chars";
[7202]787 # if no title use first 100 or so characters
788 $title = $$textref;
[8071]789 $title =~ s/^\xFE\xFF//; # Remove unicode byte order mark
[7202]790 $title =~ s/^.*?<body>//si;
791 # ignore javascript!
792 $title =~ s@<script.*?</script>@ @sig;
793 $title =~ s/<\/([^>]+)><\1>//g; # (eg) </b><b> - no space
794 $title =~ s/<[^>]*>/ /g; # remove all HTML tags
795 $title = substr ($title, 0, 100);
796 $title =~ s/\s\S*$/.../;
797 }
798 $title =~ s/<[^>]*>/ /g; # remove html tags
799 $title =~ s/&nbsp;/ /g;
800 $title =~ s/(?:&nbsp;|\xc2\xa0)/ /g; # utf-8 for nbsp...
801 $title =~ s/\s+/ /gs; # collapse multiple spaces
802 $title =~ s/^\s*//; # remove leading spaces
803 $title =~ s/\s*$//; # remove trailing spaces
[8071]804
[7202]805 $title =~ s/^$self->{'title_sub'}// if ($self->{'title_sub'});
806 $title =~ s/^\s+//s; # in case title_sub introduced any...
807 $doc_obj->add_utf8_metadata ($section, 'Title', $title);
[7235]808 print $outhandle " extracted Title metadata \"$title\" from $from\n"
[7202]809 if ($self->{'verbosity'} > 2);
[7235]810 }
[8121]811
812 # add FileFormat metadata
813 $doc_obj->add_metadata($section,"FileFormat", "HTML");
[7202]814
815 # Special, for metadata names such as tagH1 - extracts
816 # the text between the first <H1> and </H1> tags into "H1" metadata.
817
818 foreach my $field (keys %find_fields) {
819 if ($field !~ /^tag([a-z0-9]+)$/i) {next}
820 my $tag = $1;
821 if ($$textref =~ m@<$tag[^>]*>(.*?)</$tag[^>]*>@g) {
822 my $content = $1;
823 $content =~ s/&nbsp;/ /g;
824 $content =~ s/<[^>]*>/ /g;
825 $content =~ s/^\s+//;
826 $content =~ s/\s+$//;
827 $content =~ s/\s+/ /gs;
828 if ($content) {
829 $tag=$find_fields{"tag$tag"}; # get the user's capitalisation
830 $tag =~ s/^tag//i;
831 $doc_obj->add_utf8_metadata ($section, $tag, $content);
832 print $outhandle " extracted \"$tag\" metadata \"$content\"\n"
833 if ($self->{'verbosity'} > 2);
834 }
835 }
836 }
[585]837}
838
[1190]839
[808]840# evaluate any "../" to next directory up
841# evaluate any "./" as here
842sub eval_dir_dots {
843 my $self = shift (@_);
844 my ($filename) = @_;
[721]845 my $dirsep_os = &util::get_os_dirsep();
846 my @dirsep = split(/$dirsep_os/,$filename);
847
848 my @eval_dirs = ();
[850]849 foreach my $d (@dirsep) {
[808]850 if ($d eq "..") {
[721]851 pop(@eval_dirs);
[8509]852
[808]853 } elsif ($d eq ".") {
[721]854 # do nothing!
[808]855
856 } else {
[721]857 push(@eval_dirs,$d);
858 }
[585]859 }
860
[8509]861 # Need to fiddle with number of elements in @eval_dirs if the
862 # first one is the empty string. This is because of a
863 # modification to util::filename_cat that supresses the addition
864 # of a leading '/' character (or \ if windows) (intended to help
865 # filename cat with relative paths) if the first entry in the
866 # array is the empty string. Making the array start with *two*
867 # empty strings is a way to defeat this "smart" option.
868 #
869 if (scalar(@eval_dirs) > 0) {
870 if ($eval_dirs[0] eq ""){
871 unshift(@eval_dirs,"");
872 }
873 }
[721]874 return &util::filename_cat(@eval_dirs);
875}
876
[808]877sub replace_usemap_links {
878 my $self = shift (@_);
[721]879 my ($front, $link, $back) = @_;
880
881 $link =~ s/^\.\///;
[808]882 return $front . $link . $back;
[721]883}
884
[808]885sub inc_filecount {
886 my $self = shift (@_);
[721]887
[808]888 if ($self->{'file_num'} == 1000) {
889 $self->{'dir_num'} ++;
890 $self->{'file_num'} = 0;
891 } else {
892 $self->{'file_num'} ++;
893 }
894}
[721]895
[1891]896
897# Extend the BasPlug read_file so that strings like &eacute; are
898# converted to UTF8 internally.
899#
900# We don't convert &lt; or &gt; or &amp; or &quot; in case
901# they interfere with the GML files
902
903sub read_file {
[2735]904 my ($self, $filename, $encoding, $language, $textref) = @_;
[1891]905
[2735]906 &BasPlug::read_file($self, $filename, $encoding, $language, $textref);
[2364]907
[3181]908 # Convert entities to their UTF8 equivalents
[3196]909 $$textref =~ s/&(lt|gt|amp|quot|nbsp);/&z$1;/go;
[3181]910 $$textref =~ s/&([^;]+);/&ghtml::getcharequiv($1,1)/gseo;
[3196]911 $$textref =~ s/&z(lt|gt|amp|quot|nbsp);/&$1;/go;
[1891]912}
913
[585]9141;
Note: See TracBrowser for help on using the repository browser.