source: trunk/gsdl/perllib/plugins/HTMLPlug.pm@ 11090

Last change on this file since 11090 was 10725, checked in by chi, 19 years ago

For some reasons, to change the date format to "yyymmdd" used "date" and can't recognise "Date". Therefore, adding some code to convert the metadata "Date" to "date" in the early stage.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 31.8 KB
Line 
1###########################################################################
2#
3# HTMLPlug.pm -- basic html plugin
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 1999 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27#
28# Note that this plugin handles frames only in a very simple way
29# i.e. each frame is treated as a separate document. This means
30# search results will contain links to individual frames rather
31# than linking to the top level frameset.
32# There may also be some problems caused by the _parent target
33# (it's removed by this plugin)
34#
35
36package HTMLPlug;
37
38use BasPlug;
39use ghtml;
40use unicode;
41use util;
42use XMLParser;
43
44sub BEGIN {
45 @HTMLPlug::ISA = ('BasPlug');
46}
47
48use strict; # every perl program should have this!
49no strict 'refs'; # make an exception so we can use variables as filehandles
50
51my $arguments =
52 [ { 'name' => "process_exp",
53 'desc' => "{BasPlug.process_exp}",
54 'type' => "regexp",
55 'deft' => &get_default_process_exp() },
56 { 'name' => "block_exp",
57 'desc' => "{BasPlug.block_exp}",
58 'type' => 'regexp',
59 'deft' => &get_default_block_exp() },
60 { 'name' => "nolinks",
61 'desc' => "{HTMLPlug.nolinks}",
62 'type' => "flag" },
63 { 'name' => "keep_head",
64 'desc' => "{HTMLPlug.keep_head}",
65 'type' => "flag" },
66 { 'name' => "no_metadata",
67 'desc' => "{HTMLPlug.no_metadata}",
68 'type' => "flag" },
69 { 'name' => "metadata_fields",
70 'desc' => "{HTMLPlug.metadata_fields}",
71 'type' => "string",
72 'deft' => "Title" },
73 { 'name' => "hunt_creator_metadata",
74 'desc' => "{HTMLPlug.hunt_creator_metadata}",
75 'type' => "flag" },
76 { 'name' => "file_is_url",
77 'desc' => "{HTMLPlug.file_is_url}",
78 'type' => "flag" },
79 { 'name' => "assoc_files",
80 'desc' => "{HTMLPlug.assoc_files}",
81 'type' => "regexp",
82 'deft' => &get_default_block_exp() },
83 { 'name' => "rename_assoc_files",
84 'desc' => "{HTMLPlug.rename_assoc_files}",
85 'type' => "flag" },
86 { 'name' => "title_sub",
87 'desc' => "{HTMLPlug.title_sub}",
88 'type' => "string",
89 'deft' => "" },
90 { 'name' => "description_tags",
91 'desc' => "{HTMLPlug.description_tags}",
92 'type' => "flag" },
93 # retain this for backward compatibility (w3mir option was replaced by
94 # file_is_url)
95 { 'name' => "w3mir",
96# 'desc' => "{HTMLPlug.w3mir}",
97 'type' => "flag",
98 'hiddengli' => "yes"},
99 { 'name' => "no_strip_metadata_html",
100 'desc' => "{HTMLPlug.no_strip_metadata_html}",
101 'type' => "string",
102 'deft' => "",
103 'reqd' => "no"},
104 { 'name' => "sectionalise_using_h_tags",
105 'desc' => "{HTMLPlug.sectionalise_using_h_tags}",
106 'type' => "flag" }
107 ];
108
109my $options = { 'name' => "HTMLPlug",
110 'desc' => "{HTMLPlug.desc}",
111 'abstract' => "no",
112 'inherits' => "yes",
113 'args' => $arguments };
114
115sub new {
116 my ($class) = shift (@_);
117 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
118 push(@$pluginlist, $class);
119
120 if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}
121 if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};
122
123
124 my $self = (defined $hashArgOptLists)? new BasPlug($pluginlist,$inputargs,$hashArgOptLists): new BasPlug($pluginlist,$inputargs);
125
126 if ($self->{'w3mir'}) {
127 $self->{'file_is_url'} = 1;
128 }
129 $self->{'aux_files'} = {};
130 $self->{'dir_num'} = 0;
131 $self->{'file_num'} = 0;
132
133 return bless $self, $class;
134}
135
136# may want to use (?i)\.(gif|jpe?g|jpe|png|css|js(?:@.*)?)$
137# if have eg <script language="javascript" src="img/lib.js@123">
138sub get_default_block_exp {
139 my $self = shift (@_);
140
141 return q^(?i)\.(gif|jpe?g|jpe|jpg|png|css)$^;
142}
143
144sub get_default_process_exp {
145 my $self = shift (@_);
146
147 # the last option is an attempt to encode the concept of an html query ...
148 return q^(?i)(\.html?|\.shtml|\.shm|\.asp|\.php\d?|\.cgi|.+\?.+=.*)$^;
149}
150
151sub store_block_files
152{
153 my $self =shift (@_);
154 my ($filename) = @_;
155 my $html_fname = $filename;
156 my @file_blocks;
157
158 my ($language, $encoding) = $self->textcat_get_language_encoding ($filename);
159
160 # read in file ($text will be in utf8)
161 my $text = "";
162 $self->read_file ($filename, $encoding, $language, \$text);
163 my $textref = \$text;
164 my $opencom = '(?:<!--|&lt;!(?:&mdash;|&#151;|--))';
165 my $closecom = '(?:-->|(?:&mdash;|&#151;|--)&gt;)';
166 $$textref =~ s/$opencom(.*?)$closecom//gs;
167
168 my $attval = "\\\"[^\\\"]+\\\"|[^\\s>]+";
169 my @img_matches = ($$textref =~ m/<img[^>]*?src\s*=\s*($attval)[^>]*>/igs);
170 my @usemap_matches = ($$textref =~ m/<img[^>]*?usemap\s*=\s*($attval)[^>]*>/igs);
171 my @link_matches = ($$textref =~ m/<link[^>]*?href\s*=\s*($attval)[^>]*>/igs);
172 my @embed_matches = ($$textref =~ m/<embed[^>]*?src\s*=\s*($attval)[^>]*>/igs);
173 my @tabbg_matches = ($$textref =~ m/<(?:table|tr|td)[^>]*?background\s*=\s*($attval)[^>]*>/igs);
174
175 foreach my $link (@img_matches, @usemap_matches, @link_matches, @embed_matches, @tabbg_matches) {
176
177 # remove quotes from link at start and end if necessary
178 if ($link=~/^\"/) {
179 $link=~s/^\"//;
180 $link=~s/\"$//;
181 }
182
183 $link =~ s/\#.*$//s; # remove any anchor names, e.g. foo.html#name becomes foo.html
184
185 if ($link !~ m@^/@ && $link !~ m/^([A-Z]:?)\\/) {
186 # Turn relative file path into full path
187 my $dirname = &File::Basename::dirname($filename);
188 $link = &util::filename_cat($dirname, $link);
189 }
190 $link = $self->eval_dir_dots($link);
191
192 $self->{'file_blocks'}->{$link} = 1;
193 }
194}
195
196
197# do plugin specific processing of doc_obj
198sub process {
199 my $self = shift (@_);
200 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
201 my $outhandle = $self->{'outhandle'};
202
203 print STDERR "<Processing n='$file' p='HTMLPlug'>\n" if ($gli);
204
205 print $outhandle "HTMLPlug: processing $file\n"
206 if $self->{'verbosity'} > 1;
207
208 if ($ENV{'GSDLOS'} =~ /^windows/i) {
209 # this makes life so much easier... perl can cope with unix-style '/'s.
210 $base_dir =~ s@(\\)+@/@g;
211 $file =~ s@(\\)+@/@g;
212 }
213
214 # reset per-doc stuff...
215 $self->{'aux_files'} = {};
216 $self->{'dir_num'} = 0;
217 $self->{'file_num'} = 0;
218
219 # process an HTML file where sections are divided by headings tags (H1, H2 ...)
220 # you can also include metadata in the format (X can be any number)
221 # <hX>Title<!--gsdl-metadata
222 # <Metadata name="name1">value1</Metadata>
223 # ...
224 # <Metadata name="nameN">valueN</Metadata>
225 #--></hX>
226 if ($self->{'sectionalise_using_h_tags'}) {
227 # description_tags should allways be activated because we convert headings to description tags
228 $self->{'description_tags'} = 1;
229
230 my $arrSections = [];
231 $$textref =~ s/<h([0-9]+)[^>]*>(.*?)<\/h[0-9]+>/$self->process_heading($1, $2, $arrSections, $file)/isge;
232
233 if (scalar(@$arrSections)) {
234 my $strMetadata = $self->update_section_data($arrSections, -1);
235 if (length($strMetadata)) {
236 $strMetadata = '<!--' . $strMetadata . "\n-->\n</body>";
237 $$textref =~ s/<\/body>/$strMetadata/ig;
238 }
239 }
240 }
241
242 my $cursection = $doc_obj->get_top_section();
243
244 $self->extract_metadata ($textref, $metadata, $doc_obj, $cursection)
245 unless $self->{'no_metadata'} || $self->{'description_tags'};
246
247 # Store URL for page as metadata - this can be used for an
248 # altavista style search interface. The URL won't be valid
249 # unless the file structure contains the domain name (i.e.
250 # like when w3mir is used to download a website).
251
252 # URL metadata (even invalid ones) are used to support internal
253 # links, so even if 'file_is_url' is off, still need to store info
254
255 my $web_url = "http://$file";
256 $doc_obj->add_metadata($cursection, "URL", $web_url);
257
258 if ($self->{'file_is_url'}) {
259 $doc_obj->add_metadata($cursection, "weblink", "<a href=\"$web_url\">");
260 $doc_obj->add_metadata($cursection, "webicon", "_iconworld_");
261 $doc_obj->add_metadata($cursection, "/weblink", "</a>");
262 }
263
264 if ($self->{'description_tags'}) {
265 # remove the html header - note that doing this here means any
266 # sections defined within the header will be lost (so all <Section>
267 # tags must appear within the body of the HTML)
268 my ($head_keep) = ($$textref =~ m/^(.*?)<body[^>]*>/is);
269
270 $$textref =~ s/^.*?<body[^>]*>//is;
271 $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
272
273 my $opencom = '(?:<!--|&lt;!(?:&mdash;|&#151;|--))';
274 my $closecom = '(?:-->|(?:&mdash;|&#151;|--)&gt;)';
275
276 my $lt = '(?:<|&lt;)';
277 my $gt = '(?:>|&gt;)';
278 my $quot = '(?:"|&quot;|&rdquo;|&ldquo;)';
279
280 my $dont_strip = '';
281 if ($self->{'no_strip_metadata_html'}) {
282 ($dont_strip = $self->{'no_strip_metadata_html'}) =~ s{,}{|}g;
283 }
284
285 my $found_something = 0; my $top = 1;
286 while ($$textref =~ s/^(.*?)$opencom(.*?)$closecom//s) {
287 my $text = $1;
288 my $comment = $2;
289 if (defined $text) {
290 # text before a comment - note that getting to here
291 # doesn't necessarily mean there are Section tags in
292 # the document
293 $self->process_section(\$text, $base_dir, $file, $doc_obj, $cursection);
294 }
295 while ($comment =~ s/$lt(.*?)$gt//s) {
296 my $tag = $1;
297 if ($tag eq "Section") {
298 $found_something = 1;
299 $cursection = $doc_obj->insert_section($doc_obj->get_end_child($cursection)) unless $top;
300 $top = 0;
301 } elsif ($tag eq "/Section") {
302 $found_something = 1;
303 $cursection = $doc_obj->get_parent_section ($cursection);
304 } elsif ($tag =~ /^Metadata name=$quot(.*?)$quot/s) {
305 my $metaname = $1;
306 my $accumulate = $tag =~ /mode=${quot}accumulate${quot}/ ? 1 : 0;
307 $comment =~ s/^(.*?)$lt\/Metadata$gt//s;
308 my $metavalue = $1;
309 $metavalue =~ s/^\s+//;
310 $metavalue =~ s/\s+$//;
311 # assume that no metadata value intentionally includes
312 # carriage returns or HTML tags (if they're there they
313 # were probably introduced when converting to HTML from
314 # some other format).
315 # actually some people want to have html tags in their
316 # metadata.
317 $metavalue =~ s/[\cJ\cM]/ /sg;
318 $metavalue =~ s/<[^>]+>//sg
319 unless $dont_strip && ($dont_strip eq 'all' || $metaname =~ /^($dont_strip)$/);
320 $metavalue =~ s/\s+/ /sg;
321 if ($accumulate) {
322 $doc_obj->add_utf8_metadata($cursection, $metaname, $metavalue);
323 } else {
324 $doc_obj->set_utf8_metadata_element($cursection, $metaname, $metavalue);
325 }
326 } elsif ($tag eq "Description" || $tag eq "/Description") {
327 # do nothing with containing Description tags
328 } else {
329 # simple HTML tag (probably created by the conversion
330 # to HTML from some other format) - we'll ignore it and
331 # hope for the best ;-)
332 }
333 }
334 }
335 if ($cursection ne "") {
336 print $outhandle "HTMLPlug: WARNING: $file contains unmatched <Section></Section> tags\n";
337 }
338
339 $$textref =~ s/^.*?<body[^>]*>//is;
340 $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
341 if ($$textref =~ /\S/) {
342 if (!$found_something) {
343 if ($self->{'verbosity'} > 2) {
344 print $outhandle "HTMLPlug: WARNING: $file appears to contain no Section tags so\n";
345 print $outhandle " will be processed as a single section document\n";
346 }
347
348 # go ahead and process single-section document
349 $self->process_section($textref, $base_dir, $file, $doc_obj, $cursection);
350
351 # if document contains no Section tags we'll go ahead
352 # and extract metadata (this won't have been done
353 # above as the -description_tags option prevents it)
354 my $complete_text = $head_keep.$doc_obj->get_text($cursection);
355 $self->extract_metadata (\$complete_text, $metadata, $doc_obj, $cursection)
356 unless $self->{'no_metadata'};
357
358 } else {
359 print $outhandle "HTMLPlug: WARNING: $file contains the following text outside\n";
360 print $outhandle " of the final closing </Section> tag. This text will\n";
361 print $outhandle " be ignored.";
362
363 my ($text);
364 if (length($$textref) > 30) {
365 $text = substr($$textref, 0, 30) . "...";
366 } else {
367 $text = $$textref;
368 }
369 $text =~ s/\n/ /isg;
370 print $outhandle " ($text)\n";
371 }
372 } elsif (!$found_something) {
373
374 if ($self->{'verbosity'} > 2) {
375 # may get to here if document contained no valid Section
376 # tags but did contain some comments. The text will have
377 # been processed already but we should print the warning
378 # as above and extract metadata
379 print $outhandle "HTMLPlug: WARNING: $file appears to contain no Section tags and\n";
380 print $outhandle " is blank or empty. Metadata will be assigned if present.\n";
381 }
382
383 my $complete_text = $head_keep.$doc_obj->get_text($cursection);
384 $self->extract_metadata (\$complete_text, $metadata, $doc_obj, $cursection)
385 unless $self->{'no_metadata'};
386 }
387
388 } else {
389
390 # remove header and footer
391 if (!$self->{'keep_head'} || $self->{'description_tags'}) {
392 $$textref =~ s/^.*?<body[^>]*>//is;
393 $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
394 }
395
396 # single section document
397 $self->process_section($textref, $base_dir, $file, $doc_obj, $cursection);
398 }
399 return 1;
400}
401
402
403sub process_heading
404{
405 my ($self, $nHeadNo, $strHeadingText, $arrSections, $file) = @_;
406 $strHeadingText = '' if (!defined($strHeadingText));
407
408 my $strMetadata = $self->update_section_data($arrSections, int($nHeadNo));
409
410 my $strSecMetadata = '';
411 while ($strHeadingText =~ s/<!--gsdl-metadata(.*?)-->//is)
412 {
413 $strSecMetadata .= $1;
414 }
415
416 $strHeadingText =~ s/^\s+//g;
417 $strHeadingText =~ s/\s+$//g;
418 $strSecMetadata =~ s/^\s+//g;
419 $strSecMetadata =~ s/\s+$//g;
420
421 $strMetadata .= "\n<Section>\n\t<Description>\n\t\t<Metadata name=\"Title\">" . $strHeadingText . "</Metadata>\n";
422
423 if (length($strSecMetadata)) {
424 $strMetadata .= "\t\t" . $strSecMetadata . "\n";
425 }
426
427 $strMetadata .= "\t</Description>\n";
428
429 return "<!--" . $strMetadata . "-->";
430}
431
432
433sub update_section_data
434{
435 my ($self, $arrSections, $nCurTocNo) = @_;
436 my ($strBuffer, $nLast, $nSections) = ('', 0, scalar(@$arrSections));
437
438 if ($nSections == 0) {
439 push @$arrSections, $nCurTocNo;
440 return $strBuffer;
441 }
442 $nLast = $arrSections->[$nSections - 1];
443 if ($nCurTocNo > $nLast) {
444 push @$arrSections, $nCurTocNo;
445 return $strBuffer;
446 }
447 for(my $i = $nSections - 1; $i >= 0; $i--) {
448 if ($nCurTocNo <= $arrSections->[$i]) {
449 $strBuffer .= "\n</Section>";
450 pop @$arrSections;
451 }
452 }
453 push @$arrSections, $nCurTocNo;
454 return $strBuffer;
455}
456
457
458# note that process_section may be called multiple times for a single
459# section (relying on the fact that add_utf8_text appends the text to any
460# that may exist already).
461sub process_section {
462 my $self = shift (@_);
463 my ($textref, $base_dir, $file, $doc_obj, $cursection) = @_;
464 # trap links
465 if (!$self->{'nolinks'}) {
466
467 # usemap="./#index" not handled correctly => change to "#index"
468 $$textref =~ s/(<img[^>]*?usemap\s*=\s*\"?)([^\">\s]+)(\"?[^>]*>)/
469 $self->replace_usemap_links($1, $2, $3)/isge;
470
471 $$textref =~ s/(<(?:a|area|frame|link|script)\s+[^>]*?\s*(?:href|src)\s*=\s*\"?)([^\">\s]+)(\"?[^>]*>)/
472 $self->replace_href_links ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
473 }
474
475 # trap images
476
477 # allow spaces if inside quotes - jrm21
478 $$textref =~ s/(<(?:img|embed|table|tr|td)[^>]*?(?:src|background)\s*=\s*)(\"[^\"]+\"|[^\s>]+)([^>]*>)/
479 $self->replace_images ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
480
481 # add text to document object
482 # turn \ into \\ so that the rest of greenstone doesn't think there
483 # is an escape code following. (Macro parsing loses them...)
484 $$textref =~ s/\\/\\\\/go;
485
486 $doc_obj->add_utf8_text($cursection, $$textref);
487}
488
489sub replace_images {
490 my $self = shift (@_);
491 my ($front, $link, $back, $base_dir,
492 $file, $doc_obj, $section) = @_;
493
494 # remove quotes from link at start and end if necessary
495 if ($link=~/^\"/) {
496 $link=~s/^\"//;$link=~s/\"$//;
497 $front.='"';
498 $back="\"$back";
499 }
500
501 $link =~ s/\n/ /g;
502
503 # Hack to overcome Windows wv 0.7.1 bug that causes embedded images to be broken
504 # If the Word file path has spaces in it, wv messes up and you end up with
505 # absolute paths for the images, and without the "file://" prefix
506 # So check for this special case and massage the data to be correct
507 if ($ENV{'GSDLOS'} =~ /^windows/i && $self->{'plugin_type'} eq "WordPlug" && $link =~ /^[A-Za-z]\:\\/) {
508 $link =~ s/^.*\\([^\\]+)$/$1/;
509 }
510
511 my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
512
513 my $img_file = $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section);
514
515 my $anchor_name = $img_file;
516 $anchor_name =~ s/^.*\///;
517 $anchor_name = "<a name=\"$anchor_name\">";
518
519 return $front . $img_file . $back . $anchor_name;
520}
521
522sub replace_href_links {
523 my $self = shift (@_);
524 my ($front, $link, $back, $base_dir, $file, $doc_obj, $section) = @_;
525
526 # attempt to sort out targets - frames are not handled
527 # well in this plugin and some cases will screw things
528 # up - e.g. the _parent target (so we'll just remove
529 # them all ;-)
530 $front =~ s/(target=\"?)_top(\"?)/$1_gsdltop_$2/is;
531 $back =~ s/(target=\"?)_top(\"?)/$1_gsdltop_$2/is;
532 $front =~ s/target=\"?_parent\"?//is;
533 $back =~ s/target=\"?_parent\"?//is;
534
535 return $front . $link . $back if $link =~ /^\#/s;
536 $link =~ s/\n/ /g;
537
538 my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
539 # href may use '\'s where '/'s should be on Windows
540 $href =~ s/\\/\//g;
541
542 my ($filename) = $href =~ /^(?:.*?):(?:\/\/)?(.*)/;
543
544
545 ##### leave all these links alone (they won't be picked up by intermediate
546 ##### pages). I think that's safest when dealing with frames, targets etc.
547 ##### (at least until I think of a better way to do it). Problems occur with
548 ##### mailto links from within small frames, the intermediate page is displayed
549 ##### within that frame and can't be seen. There is still potential for this to
550 ##### happen even with html pages - the solution seems to be to somehow tell
551 ##### the browser from the server side to display the page being sent (i.e.
552 ##### the intermediate page) in the top level window - I'm not sure if that's
553 ##### possible - the following line should probably be deleted if that can be done
554 return $front . $link . $back if $href =~ /^(mailto|news|gopher|nntp|telnet|javascript):/is;
555
556
557 if (($rl == 0) || ($filename =~ /$self->{'process_exp'}/) ||
558 ($href =~ /\/$/) || ($href =~ /^(mailto|news|gopher|nntp|telnet|javascript):/i)) {
559 &ghtml::urlsafe ($href);
560 return $front . "_httpextlink_&rl=" . $rl . "&href=" . $href . $hash_part . $back;
561 } else {
562 # link is to some other type of file (eg image) so we'll
563 # need to associate that file
564 return $front . $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section) . $back;
565 }
566}
567
568sub add_file {
569 my $self = shift (@_);
570 my ($href, $rl, $hash_part, $base_dir, $doc_obj, $section) = @_;
571 my ($newname);
572
573 my $filename = $href;
574 if ($base_dir eq "") {
575 # remove http:/ thereby leaving one slash at the start
576 $filename =~ s/^[^:]*:\///;
577 }
578 else {
579 # remove http://
580 $filename =~ s/^[^:]*:\/\///;
581 }
582
583 $filename = &util::filename_cat($base_dir, $filename);
584
585 # Replace %20's in URL with a space if required. Note that the filename
586 # may include the %20 in some situations
587 if ($filename =~ /\%20/) {
588 if (!-e $filename) {
589 $filename =~ s/\%20/ /g;
590 }
591 }
592
593 my ($ext) = $filename =~ /(\.[^\.]*)$/;
594
595 if ($rl == 0) {
596 if ((!defined $ext) || ($ext !~ /$self->{'assoc_files'}/)) {
597 return "_httpextlink_&rl=0&el=prompt&href=" . $href . $hash_part;
598 }
599 else {
600 return "_httpextlink_&rl=0&el=direct&href=" . $href . $hash_part;
601 }
602 }
603
604 if ((!defined $ext) || ($ext !~ /$self->{'assoc_files'}/)) {
605 return "_httpextlink_&rl=" . $rl . "&href=" . $href . $hash_part;
606 }
607 if ($self->{'rename_assoc_files'}) {
608 if (defined $self->{'aux_files'}->{$href}) {
609 $newname = $self->{'aux_files'}->{$href}->{'dir_num'} . "/" .
610 $self->{'aux_files'}->{$href}->{'file_num'} . $ext;
611 } else {
612 $newname = $self->{'dir_num'} . "/" . $self->{'file_num'} . $ext;
613 $self->{'aux_files'}->{$href} = {'dir_num' => $self->{'dir_num'}, 'file_num' => $self->{'file_num'}};
614 $self->inc_filecount ();
615 }
616 $doc_obj->associate_file($filename, $newname, undef, $section);
617 return "_httpdocimg_/$newname";
618 } else {
619 ($newname) = $filename =~ /([^\/\\]*)$/;
620 $doc_obj->associate_file($filename, $newname, undef, $section);
621 return "_httpdocimg_/$newname";
622 }
623}
624
625
626sub format_link {
627 my $self = shift (@_);
628 my ($link, $base_dir, $file) = @_;
629
630 my ($before_hash, $hash_part) = $link =~ /^([^\#]*)(\#?.*)$/;
631
632 $hash_part = "" if !defined $hash_part;
633 if (!defined $before_hash || $before_hash !~ /[\w\.\/]/) {
634 my $outhandle = $self->{'outhandle'};
635 print $outhandle "HTMLPlug: ERROR - badly formatted tag ignored ($link)\n"
636 if $self->{'verbosity'};
637 return ($link, "", 0);
638 }
639
640 if ($before_hash =~ s@^((?:http|ftp|file)://)@@i) {
641 my $type = $1;
642
643 if ($link =~ /^(http|ftp):/i) {
644 # Turn url (using /) into file name (possibly using \ on windows)
645 my @http_dir_split = split('/', $before_hash);
646 $before_hash = &util::filename_cat(@http_dir_split);
647 }
648
649 $before_hash = $self->eval_dir_dots($before_hash);
650
651 my $linkfilename = &util::filename_cat ($base_dir, $before_hash);
652
653 my $rl = 0;
654 $rl = 1 if (-e $linkfilename);
655
656 # make sure there's a slash on the end if it's a directory
657 if ($before_hash !~ /\/$/) {
658 $before_hash .= "/" if (-d $linkfilename);
659 }
660
661 return ($type . $before_hash, $hash_part, $rl);
662
663 } elsif ($link !~ /^(mailto|news|gopher|nntp|telnet|javascript):/i && $link !~ /^\//) {
664 if ($before_hash =~ s@^/@@ || $before_hash =~ /\\/) {
665
666 # the first directory will be the domain name if file_is_url
667 # to generate archives, otherwise we'll assume all files are
668 # from the same site and base_dir is the root
669
670 if ($self->{'file_is_url'}) {
671 my @dirs = split /[\/\\]/, $file;
672 my $domname = shift (@dirs);
673 $before_hash = &util::filename_cat($domname, $before_hash);
674 $before_hash =~ s@\\@/@g; # for windows
675 }
676 else
677 {
678 # see if link shares directory with source document
679 # => turn into relative link if this is so!
680
681 if ($ENV{'GSDLOS'} =~ /^windows/i) {
682 # too difficult doing a pattern match with embedded '\'s...
683 my $win_before_hash=$before_hash;
684 $win_before_hash =~ s@(\\)+@/@g;
685 # $base_dir is already similarly "converted" on windows.
686 if ($win_before_hash =~ s@^$base_dir/@@o) {
687 # if this is true, we removed a prefix
688 $before_hash=$win_before_hash;
689 }
690 }
691 else {
692 # before_hash has lost leading slash by this point,
693 # -> add back in prior to substitution with $base_dir
694 $before_hash = "/$before_hash";
695
696 $before_hash = &util::filename_cat("",$before_hash);
697 $before_hash =~ s@^$base_dir/@@;
698 }
699 }
700 } else {
701 # Turn relative file path into full path
702 my $dirname = &File::Basename::dirname($file);
703 $before_hash = &util::filename_cat($dirname, $before_hash);
704 $before_hash = $self->eval_dir_dots($before_hash);
705 }
706
707 my $linkfilename = &util::filename_cat ($base_dir, $before_hash);
708 # make sure there's a slash on the end if it's a directory
709 if ($before_hash !~ /\/$/) {
710 $before_hash .= "/" if (-d $linkfilename);
711 }
712 return ("http://" . $before_hash, $hash_part, 1);
713 } else {
714 # mailto, news, nntp, telnet, javascript or gopher link
715 return ($before_hash, "", 0);
716 }
717}
718
719sub extract_first_NNNN_characters {
720 my $self = shift (@_);
721 my ($textref, $doc_obj, $thissection) = @_;
722
723 foreach my $size (split /,/, $self->{'first'}) {
724 my $tmptext = $$textref;
725 # skip to the body
726 $tmptext =~ s/.*<body[^>]*>//i;
727 # remove javascript
728 $tmptext =~ s@<script.*?</script>@ @sig;
729 $tmptext =~ s/<[^>]*>/ /g;
730 $tmptext =~ s/&nbsp;/ /g;
731 $tmptext =~ s/^\s+//;
732 $tmptext =~ s/\s+$//;
733 $tmptext =~ s/\s+/ /gs;
734 $tmptext = &unicode::substr ($tmptext, 0, $size);
735 $tmptext =~ s/\s\S*$/&#8230;/; # adds an ellipse (...)
736 $doc_obj->add_utf8_metadata ($thissection, "First$size", $tmptext);
737 }
738}
739
740
741sub extract_metadata {
742 my $self = shift (@_);
743 my ($textref, $metadata, $doc_obj, $section) = @_;
744 my $outhandle = $self->{'outhandle'};
745 # if we don't want metadata, we may as well not be here ...
746 return if (!defined $self->{'metadata_fields'});
747
748 # metadata fields to extract/save. 'key' is the (lowercase) name of the
749 # html meta, 'value' is the metadata name for greenstone to use
750 my %find_fields = ();
751
752 my %creator_fields = (); # short-cut for lookups
753
754
755 foreach my $field (split /,/, $self->{'metadata_fields'}) {
756 # support tag<tagname>
757 if ($field =~ /^(.*?)<(.*?)>$/) {
758 # "$2" is the user's preferred gs metadata name
759 $find_fields{lc($1)}=$2; # lc = lowercase
760 } else { # no <tagname> for mapping
761 # "$field" is the user's preferred gs metadata name
762 $find_fields{lc($field)}=$field; # lc = lowercase
763 }
764 }
765
766 if (defined $self->{'hunt_creator_metadata'} &&
767 $self->{'hunt_creator_metadata'} == 1 ) {
768 my @extra_fields =
769 (
770 'author',
771 'author.email',
772 'creator',
773 'dc.creator',
774 'dc.creator.corporatename',
775 );
776
777 # add the creator_metadata fields to search for
778 foreach my $field (@extra_fields) {
779 $creator_fields{$field}=0; # add to lookup hash
780 }
781 }
782
783
784 # find the header in the html file, which has the meta tags
785 $$textref =~ m@<head>(.*?)</head>@si;
786
787 my $html_header=$1;
788
789 # go through every <meta... tag defined in the html and see if it is
790 # one of the tags we want to match.
791
792 # special case for title - we want to remember if its been found
793 my $found_title = 0;
794 # this assumes that ">" won't appear. (I don't think it's allowed to...)
795 $html_header =~ /^/; # match the start of the string, for \G assertion
796
797 while ($html_header =~ m/\G.*?<meta(.*?)>/sig) {
798 my $metatag=$1;
799 my ($tag, $value);
800
801 # find the tag name
802 $metatag =~ /(?:name|http-equiv)\s*=\s*([\"\'])?(.*?)\1/is;
803 $tag=$2;
804 # in case they're not using " or ', but they should...
805 if (! $tag) {
806 $metatag =~ /(?:name|http-equiv)\s*=\s*([^\s\>]+)/is;
807 $tag=$1;
808 }
809
810 if (!defined $tag) {
811 print $outhandle "HTMLPlug: can't find NAME in \"$metatag\"\n";
812 next;
813 }
814
815 # don't need to assign this field if it was passed in from a previous
816 # (recursive) plugin
817 if (defined $metadata->{$tag}) {next}
818
819 # find the tag content
820 $metatag =~ /content\s*=\s*([\"\'])?(.*?)\1/is;
821 $value=$2;
822
823 if (! $value) {
824 $metatag =~ /(?:name|http-equiv)\s*=\s*([^\s\>]+)/is;
825 $value=$1;
826 }
827 if (!defined $value) {
828 print $outhandle "HTMLPlug: can't find VALUE in \"$metatag\"\n";
829 next;
830 }
831
832 # clean up and add
833 $value =~ s/\s+/ /gs;
834 chomp($value); # remove trailing \n, if any
835 if (exists $creator_fields{lc($tag)}) {
836 # map this value onto greenstone's "Creator" metadata
837 $tag='Creator';
838 } elsif (!exists $find_fields{lc($tag)}) {
839 next; # don't want this tag
840 } else {
841 # get the user's preferred capitalisation
842 $tag = $find_fields{lc($tag)};
843 }
844 if (lc($tag) eq "title") {
845 $found_title = 1;
846 }
847 print $outhandle " extracted \"$tag\" metadata \"$value\"\n"
848 if ($self->{'verbosity'} > 2);
849 if ($tag =~ /date.*/i){
850 $tag = lc($tag);
851 }
852 $doc_obj->add_utf8_metadata($section, $tag, $value);
853
854 }
855
856 # TITLE: extract the document title
857 if (exists $find_fields{'title'} && !$found_title) {
858 # we want a title, and didn't find one in the meta tags
859 # see if there's a <title> tag
860 my $title;
861 my $from = ""; # for debugging output only
862 if ($html_header =~ /<title[^>]*>([^<]+)<\/title[^>]*>/is) {
863 $title = $1;
864 $from = "<title> tags";
865 }
866
867 if (!defined $title) {
868 $from = "first 100 chars";
869 # if no title use first 100 or so characters
870 $title = $$textref;
871 $title =~ s/^\xFE\xFF//; # Remove unicode byte order mark
872 $title =~ s/^.*?<body>//si;
873 # ignore javascript!
874 $title =~ s@<script.*?</script>@ @sig;
875 $title =~ s/<\/([^>]+)><\1>//g; # (eg) </b><b> - no space
876 $title =~ s/<[^>]*>/ /g; # remove all HTML tags
877 $title = substr ($title, 0, 100);
878 $title =~ s/\s\S*$/.../;
879 }
880 $title =~ s/<[^>]*>/ /g; # remove html tags
881 $title =~ s/&nbsp;/ /g;
882 $title =~ s/(?:&nbsp;|\xc2\xa0)/ /g; # utf-8 for nbsp...
883 $title =~ s/\s+/ /gs; # collapse multiple spaces
884 $title =~ s/^\s*//; # remove leading spaces
885 $title =~ s/\s*$//; # remove trailing spaces
886
887 $title =~ s/^$self->{'title_sub'}// if ($self->{'title_sub'});
888 $title =~ s/^\s+//s; # in case title_sub introduced any...
889 $doc_obj->add_utf8_metadata ($section, 'Title', $title);
890 print $outhandle " extracted Title metadata \"$title\" from $from\n"
891 if ($self->{'verbosity'} > 2);
892 }
893
894 # add FileFormat metadata
895 $doc_obj->add_metadata($section,"FileFormat", "HTML");
896
897 # Special, for metadata names such as tagH1 - extracts
898 # the text between the first <H1> and </H1> tags into "H1" metadata.
899
900 foreach my $field (keys %find_fields) {
901 if ($field !~ /^tag([a-z0-9]+)$/i) {next}
902 my $tag = $1;
903 if ($$textref =~ m@<$tag[^>]*>(.*?)</$tag[^>]*>@g) {
904 my $content = $1;
905 $content =~ s/&nbsp;/ /g;
906 $content =~ s/<[^>]*>/ /g;
907 $content =~ s/^\s+//;
908 $content =~ s/\s+$//;
909 $content =~ s/\s+/ /gs;
910 if ($content) {
911 $tag=$find_fields{"tag$tag"}; # get the user's capitalisation
912 $tag =~ s/^tag//i;
913 $doc_obj->add_utf8_metadata ($section, $tag, $content);
914 print $outhandle " extracted \"$tag\" metadata \"$content\"\n"
915 if ($self->{'verbosity'} > 2);
916 }
917 }
918 }
919}
920
921
922# evaluate any "../" to next directory up
923# evaluate any "./" as here
924sub eval_dir_dots {
925 my $self = shift (@_);
926 my ($filename) = @_;
927 my $dirsep_os = &util::get_os_dirsep();
928 my @dirsep = split(/$dirsep_os/,$filename);
929
930 my @eval_dirs = ();
931 foreach my $d (@dirsep) {
932 if ($d eq "..") {
933 pop(@eval_dirs);
934
935 } elsif ($d eq ".") {
936 # do nothing!
937
938 } else {
939 push(@eval_dirs,$d);
940 }
941 }
942
943 # Need to fiddle with number of elements in @eval_dirs if the
944 # first one is the empty string. This is because of a
945 # modification to util::filename_cat that supresses the addition
946 # of a leading '/' character (or \ if windows) (intended to help
947 # filename cat with relative paths) if the first entry in the
948 # array is the empty string. Making the array start with *two*
949 # empty strings is a way to defeat this "smart" option.
950 #
951 if (scalar(@eval_dirs) > 0) {
952 if ($eval_dirs[0] eq ""){
953 unshift(@eval_dirs,"");
954 }
955 }
956 return &util::filename_cat(@eval_dirs);
957}
958
959sub replace_usemap_links {
960 my $self = shift (@_);
961 my ($front, $link, $back) = @_;
962
963 $link =~ s/^\.\///;
964 return $front . $link . $back;
965}
966
967sub inc_filecount {
968 my $self = shift (@_);
969
970 if ($self->{'file_num'} == 1000) {
971 $self->{'dir_num'} ++;
972 $self->{'file_num'} = 0;
973 } else {
974 $self->{'file_num'} ++;
975 }
976}
977
978
979# Extend the BasPlug read_file so that strings like &eacute; are
980# converted to UTF8 internally.
981#
982# We don't convert &lt; or &gt; or &amp; or &quot; in case
983# they interfere with the GML files
984
985sub read_file {
986 my ($self, $filename, $encoding, $language, $textref) = @_;
987
988 &BasPlug::read_file($self, $filename, $encoding, $language, $textref);
989
990 # Convert entities to their UTF8 equivalents
991 $$textref =~ s/&(lt|gt|amp|quot|nbsp);/&z$1;/go;
992 $$textref =~ s/&([^;]+);/&ghtml::getcharequiv($1,1)/gseo;
993 $$textref =~ s/&z(lt|gt|amp|quot|nbsp);/&$1;/go;
994}
995
9961;
Note: See TracBrowser for help on using the repository browser.