source: trunk/gsdl/perllib/plugins/HTMLPlug.pm@ 6649

Last change on this file since 6649 was 6649, checked in by kjdon, 20 years ago

changed the regex for getting info out of meta tags so it now works if the content attribute comes before the name attribute

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 26.7 KB
Line 
1###########################################################################
2#
3# HTMLPlug.pm -- basic html plugin
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 1999 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27#
28# Note that this plugin handles frames only in a very simple way
29# i.e. each frame is treated as a separate document. This means
30# search results will contain links to individual frames rather
31# than linking to the top level frameset.
32# There may also be some problems caused by the _parent target
33# (it's removed by this plugin)
34# To use frames properly you'll need to use the WebPlug plugin.
35#
36
37
38package HTMLPlug;
39
40use BasPlug;
41use ghtml;
42use unicode;
43use util;
44use parsargv;
45
46sub BEGIN {
47 @ISA = ('BasPlug');
48}
49
50my $arguments =
51 [ { 'name' => "process_exp",
52 'desc' => "{BasPlug.process_exp}",
53 'type' => "regexp",
54 'deft' => &get_default_process_exp() },
55 { 'name' => "block_exp",
56 'desc' => "{BasPlug.block_exp}",
57 'type' => 'regexp',
58 'deft' => &get_default_block_exp() },
59 { 'name' => "nolinks",
60 'desc' => "{HTMLPlug.nolinks}",
61 'type' => "flag" },
62 { 'name' => "keep_head",
63 'desc' => "{HTMLPlug.keep_head}",
64 'type' => "flag" },
65 { 'name' => "no_metadata",
66 'desc' => "{HTMLPlug.no_metadata}",
67 'type' => "flag" },
68 { 'name' => "metadata_fields",
69 'desc' => "{HTMLPlug.metadata_fields}",
70 'type' => "string",
71 'deft' => "Title" },
72 { 'name' => "hunt_creator_metadata",
73 'desc' => "{HTMLPlug.hunt_creator_metadata}",
74 'type' => "flag" },
75 { 'name' => "file_is_url",
76 'desc' => "{HTMLPlug.file_is_url}",
77 'type' => "flag" },
78 { 'name' => "assoc_files",
79 'desc' => "{HTMLPlug.assoc_files}",
80 'type' => "regexp",
81 'deft' => &get_default_block_exp() },
82 { 'name' => "rename_assoc_files",
83 'desc' => "{HTMLPlug.rename_assoc_files}",
84 'type' => "flag" },
85 { 'name' => "title_sub",
86 'desc' => "{HTMLPlug.title_sub}",
87 'type' => "string",
88 'deft' => "" },
89 { 'name' => "description_tags",
90 'desc' => "{HTMLPlug.description_tags}",
91 'type' => "flag" } ];
92
93my $options = { 'name' => "HTMLPlug",
94 'desc' => "{HTMLPlug.desc}",
95 'abstract' => "no",
96 'inherits' => "yes",
97 'args' => $arguments };
98
99
100# sub print_usage {
101# print STDERR "\n usage: plugin HTMLPlug [options]\n\n";
102# print STDERR " options:\n";
103# print STDERR " -nolinks Don't make any attempt to trap links (setting this\n";
104# print STDERR " flag may improve speed of building/importing but\n";
105# print STDERR " any relative links within documents will be broken).\n";
106# print STDERR " -keep_head Don't remove headers from html files.\n";
107# print STDERR " -no_metadata Don't attempt to extract any metadata from files.\n";
108# print STDERR " -metadata_fields Comma separated list of metadata fields to attempt to
109# extract. Defaults to 'Title'.
110# Use 'tag<tagname>' to have the contents of the first
111# <tagname> pair put in a metadata element called
112# 'tagname'. Capitalise this as you want the metadata
113# capitalised in Greenstone, since the tag extraction
114# is case insensitive.\n";
115# print STDERR " -hunt_creator_metadata Find as much metadata as possible on authorship and
116# place it in the 'Creator' field. Requires the
117# -metadata_fields flag.\n";
118# print STDERR " -file_is_url Set if input filenames make up url of original source
119# documents e.g. if a web mirroring tool was used to
120# create the import directory structure\n";
121# print STDERR " -assoc_files Perl regular expression of file extensions to
122# associate with html documents.
123# Defaults to '(?i)\.(jpe?g|gif|png|css)\$'\n";
124# print STDERR " -rename_assoc_files Renames files associated with documents (e.g. images).
125# Also creates much shallower directory structure
126# (useful when creating collections to go on cd-rom).\n";
127# print STDERR " -title_sub Substitution expression to modify string stored as
128# Title. Used by, for example, PDFPlug to remove
129# \"Page 1\", etc from text used as the title.\n";
130# print STDERR " -description_tags Split document into sub-sections where <Section> tags
131# occur. Note that by setting this option you
132# implicitly set -no_metadata, as all metadata should
133# be included within the <Section> tags (this is only
134# true for documents that actually contain <Section> tags
135# however). Also, '-keep_head' will have no effect when
136# this option is set, regardless of whether a document
137# contains Section tags.\n";
138# }
139
140sub new {
141 my $class = shift (@_);
142 my $self = new BasPlug ($class, @_);
143 $self->{'plugin_type'} = "HTMLPlug";
144 # 14-05-02 To allow for proper inheritance of arguments - John Thompson
145 my $option_list = $self->{'option_list'};
146 push( @{$option_list}, $options );
147
148 if (!parsargv::parse(\@_,
149 q^nolinks^, \$self->{'nolinks'},
150 q^keep_head^, \$self->{'keep_head'},
151 q^no_metadata^, \$self->{'no_metadata'},
152 q^metadata_fields/.*/Title^, \$self->{'metadata_fields'},
153 q^hunt_creator_metadata^, \$self->{'hunt_creator_metadata'},
154 q^w3mir^, \$self->{'w3mir'},
155 q^file_is_url^, \$self->{'file_is_url'},
156 q^assoc_files/.*/(?i)\.(jpe?g|jpe|gif|png|css)$^, \$self->{'assoc_files'},
157 q^rename_assoc_files^, \$self->{'rename_assoc_files'},
158 q^title_sub/.*/^, \$self->{'title_sub'},
159 q^description_tags^, \$self->{'description_tags'},
160 "allow_extra_options")) {
161
162 print STDERR "\nIncorrect options passed to HTMLPlug, check your collect.cfg configuration file\n";
163 $self->print_txt_usage(""); # Use default resource bundle
164 die "\n";
165 }
166
167 # retain this for backward compatibility (w3mir option was replaced by
168 # file_is_url)
169 if ($self->{'w3mir'}) {
170 $self->{'file_is_url'} = 1;
171 }
172
173 $self->{'aux_files'} = {};
174 $self->{'dir_num'} = 0;
175 $self->{'file_num'} = 0;
176 return bless $self, $class;
177}
178
179
180sub get_default_block_exp {
181 my $self = shift (@_);
182
183 return q^(?i)\.(gif|jpe?g|jpe|png|css)$^;
184}
185
186sub get_default_process_exp {
187 my $self = shift (@_);
188
189 # the last option is an attempt to encode the concept of an html query ...
190 return q^(?i)(\.html?|\.shtml|\.shm|\.asp|\.php\d?|\.cgi|.+\?.+=.*)$^;
191}
192
193
194# do plugin specific processing of doc_obj
195sub process {
196 my $self = shift (@_);
197 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
198 my $outhandle = $self->{'outhandle'};
199
200 print STDERR "<Processing n='$file' p='HTMLPlug'>\n" if ($gli);
201
202 print $outhandle "HTMLPlug: processing $file\n"
203 if $self->{'verbosity'} > 1;
204
205 if ($ENV{'GSDLOS'} =~ /^windows/i) {
206 # this makes life so much easier... perl can cope with unix-style '/'s.
207 $base_dir =~ s@(\\)+@/@g;
208 $file =~ s@(\\)+@/@g;
209 }
210
211 # reset per-doc stuff...
212 $self->{'aux_files'} = {};
213 $self->{'dir_num'} = 0;
214 $self->{'file_num'} = 0;
215
216 my $cursection = $doc_obj->get_top_section();
217
218 $self->extract_metadata ($textref, $metadata, $doc_obj, $cursection)
219 unless $self->{'no_metadata'} || $self->{'description_tags'};
220
221 # Store URL for page as metadata - this can be used for an
222 # altavista style search interface. The URL won't be valid
223 # unless the file structure contains the domain name (i.e.
224 # like when w3mir is used to download a website).
225 my $web_url = "http://$file";
226 $doc_obj->add_metadata($cursection, "URL", $web_url);
227
228 if ($self->{'description_tags'}) {
229
230 # remove the html header - note that doing this here means any
231 # sections defined within the header will be lost (so all <Section>
232 # tags must appear within the body of the HTML)
233 $$textref =~ s/^.*?<body[^>]*>//is;
234 $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
235
236 my $opencom = '(?:<!--|&lt;!(?:&mdash;|&#151;|--))';
237 my $closecom = '(?:-->|(?:&mdash;|&#151;|--)&gt;)';
238 my $lt = '(?:<|&lt;)';
239 my $gt = '(?:>|&gt;)';
240 my $quot = '(?:"|&quot;|&rdquo;|&ldquo;)';
241
242 my $found_something = 0; my $top = 1;
243 while ($$textref =~ s/^(.*?)$opencom(.*?)$closecom//s) {
244 my $text = $1;
245 my $comment = $2;
246 if (defined $text) {
247 # text before a comment - note that getting to here
248 # doesn't necessarily mean there are Section tags in
249 # the document
250 $self->process_section(\$text, $base_dir, $file, $doc_obj, $cursection);
251 }
252 while ($comment =~ s/$lt(.*?)$gt//s) {
253
254 my $tag = $1;
255 if ($tag eq "Section") {
256 $found_something = 1;
257 $cursection = $doc_obj->insert_section($doc_obj->get_end_child($cursection)) unless $top;
258 $top = 0;
259 } elsif ($tag eq "/Section") {
260 $found_something = 1;
261 $cursection = $doc_obj->get_parent_section ($cursection);
262 } elsif ($tag =~ /^Metadata name=$quot(.*?)$quot/s) {
263 my $metaname = $1;
264 $comment =~ s/^(.*?)$lt\/Metadata$gt//s;
265 my $metavalue = $1;
266 $metavalue =~ s/^\s+//;
267 $metavalue =~ s/\s+$//;
268 # assume that no metadata value intentionally includes
269 # carriage returns or HTML tags (if they're there they
270 # were probably introduced when converting to HTML from
271 # some other format).
272 $metavalue =~ s/[\cJ\cM]/ /sg;
273 $metavalue =~ s/<[^>]+>//sg;
274 $metavalue =~ s/\s+/ /sg;
275 $doc_obj->set_utf8_metadata_element($cursection, $metaname, $metavalue);
276 } elsif ($tag eq "Description" || $tag eq "/Description") {
277 # do nothing with containing Description tags
278 } else {
279 # simple HTML tag (probably created by the conversion
280 # to HTML from some other format) - we'll ignore it and
281 # hope for the best ;-)
282 }
283 }
284 }
285 if ($cursection ne "") {
286 print $outhandle "HTMLPlug: WARNING: $file contains unmatched <Section></Section> tags\n";
287 }
288
289 $$textref =~ s/^.*?<body[^>]*>//is;
290 $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
291 if ($$textref =~ /\S/) {
292 if (!$found_something) {
293 print $outhandle "HTMLPlug: WARNING: $file appears to contain no Section tags so\n";
294 print $outhandle " will be processed as a single section document\n";
295
296 # go ahead and process single-section document
297 $self->process_section($textref, $base_dir, $file, $doc_obj, $cursection);
298
299 # if document contains no Section tags we'll go ahead
300 # and extract metadata (this won't have been done
301 # above as the -description_tags option prevents it)
302 $self->extract_metadata (\$doc_obj->get_text($cursection), $metadata, $doc_obj, $cursection)
303 unless $self->{'no_metadata'};
304
305 } else {
306 print $outhandle "HTMLPlug: WARNING: $file contains the following text outside\n";
307 print $outhandle " of the final closing </Section> tag. This text will\n";
308 print $outhandle " be ignored.";
309 my ($text);
310 if (length($$textref) > 30) {
311 $text = substr($$textref, 0, 30) . "...";
312 } else {
313 $text = $$textref;
314 }
315 $text =~ s/\n/ /isg;
316 print $outhandle " ($text)\n";
317 }
318 } elsif (!$found_something) {
319
320 # may get to here if document contained no valid Section
321 # tags but did contain some comments. The text will have
322 # been processed already but we should print the warning
323 # as above and extract metadata
324 print $outhandle "HTMLPlug: WARNING: $file appears to contain no Section tags so\n";
325 print $outhandle " will be processed as a single section document\n";
326
327 $self->extract_metadata (\$doc_obj->get_text($cursection), $metadata, $doc_obj, $cursection)
328 unless $self->{'no_metadata'};
329 }
330
331 } else {
332
333 # remove header and footer
334 if (!$self->{'keep_head'} || $self->{'description_tags'}) {
335 $$textref =~ s/^.*?<body[^>]*>//is;
336 $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
337 }
338
339 # single section document
340 $self->process_section($textref, $base_dir, $file, $doc_obj, $cursection);
341 }
342 return 1;
343}
344
345# note that process_section may be called multiple times for a single
346# section (relying on the fact that add_utf8_text appends the text to any
347# that may exist already).
348sub process_section {
349 my $self = shift (@_);
350 my ($textref, $base_dir, $file, $doc_obj, $cursection) = @_;
351 # trap links
352 if (!$self->{'nolinks'}) {
353
354 # usemap="./#index" not handled correctly => change to "#index"
355 $$textref =~ s/(<img[^>]*?usemap\s*=\s*\"?)([^\">\s]+)(\"?[^>]*>)/
356 $self->replace_usemap_links($1, $2, $3)/isge;
357
358 $$textref =~ s/(<(?:a|area|frame|link)\s+[^>]*?\s*(?:href|src)\s*=\s*\"?)([^\">\s]+)(\"?[^>]*>)/
359 $self->replace_href_links ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
360 }
361
362 # trap images
363
364 # allow spaces if inside quotes - jrm21
365 $$textref =~ s/(<img[^>]*?src\s*=\s*)(\"[^\"]+\"|[^\s>]+)([^>]*>)/
366 $self->replace_images ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
367
368 # add text to document object
369 # turn \ into \\ so that the rest of greenstone doesn't think there
370 # is an escape code following. (Macro parsing loses them...)
371 $$textref =~ s/\\/\\\\/go;
372 $doc_obj->add_utf8_text($cursection, $$textref);
373}
374
375sub replace_images {
376 my $self = shift (@_);
377 my ($front, $link, $back, $base_dir,
378 $file, $doc_obj, $section) = @_;
379 # remove quotes from link at start and end if necessary
380 if ($link=~/^\"/) {
381 $link=~s/^\"//;$link=~s/\"$//;
382 $front.='"';
383 $back="\"$back";
384 }
385
386 $link =~ s/\n/ /g;
387 my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
388 return $front . $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section) . $back;
389}
390
391sub replace_href_links {
392 my $self = shift (@_);
393 my ($front, $link, $back, $base_dir, $file, $doc_obj, $section) = @_;
394
395 # attempt to sort out targets - frames are not handled
396 # well in this plugin and some cases will screw things
397 # up - e.g. the _parent target (so we'll just remove
398 # them all ;-)
399 $front =~ s/(target=\"?)_top(\"?)/$1_gsdltop_$2/is;
400 $back =~ s/(target=\"?)_top(\"?)/$1_gsdltop_$2/is;
401 $front =~ s/target=\"?_parent\"?//is;
402 $back =~ s/target=\"?_parent\"?//is;
403
404 return $front . $link . $back if $link =~ /^\#/s;
405 $link =~ s/\n/ /g;
406
407 my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
408 # href may use '\'s where '/'s should be on Windows
409 $href =~ s/\\/\//g;
410
411 my ($filename) = $href =~ /^(?:.*?):(?:\/\/)?(.*)/;
412
413 ##### leave all these links alone (they won't be picked up by intermediate
414 ##### pages). I think that's safest when dealing with frames, targets etc.
415 ##### (at least until I think of a better way to do it). Problems occur with
416 ##### mailto links from within small frames, the intermediate page is displayed
417 ##### within that frame and can't be seen. There is still potential for this to
418 ##### happen even with html pages - the solution seems to be to somehow tell
419 ##### the browser from the server side to display the page being sent (i.e.
420 ##### the intermediate page) in the top level window - I'm not sure if that's
421 ##### possible - the following line should probably be deleted if that can be done
422 return $front . $link . $back if $href =~ /^(mailto|news|gopher|nntp|telnet|javascript):/is;
423
424
425 if (($rl == 0) || ($filename =~ /$self->{'process_exp'}/) ||
426 ($href =~ /\/$/) || ($href =~ /^(mailto|news|gopher|nntp|telnet|javascript):/i)) {
427 &ghtml::urlsafe ($href);
428 return $front . "_httpextlink_&rl=" . $rl . "&href=" . $href . $hash_part . $back;
429
430 } else {
431 # link is to some other type of file (eg image) so we'll
432 # need to associate that file
433 return $front . $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section) . $back;
434 }
435}
436
437sub add_file {
438 my $self = shift (@_);
439 my ($href, $rl, $hash_part, $base_dir, $doc_obj, $section) = @_;
440 my ($newname);
441
442 my $filename = $href;
443 $filename =~ s/^[^:]*:\/\///;
444 $filename = &util::filename_cat($base_dir, $filename);
445
446 # Replace %20's in URL with a space if required. Note that the filename
447 # may include the %20 in some situations
448 if ($filename =~ /\%20/) {
449 if (!-e $filename) {
450 $filename =~ s/\%20/ /g;
451 }
452 }
453
454 my ($ext) = $filename =~ /(\.[^\.]*)$/;
455
456 if ((!defined $ext) || ($ext !~ /$self->{'assoc_files'}/)) {
457 return "_httpextlink_&rl=" . $rl . "&href=" . $href . $hash_part;
458 }
459
460 if ($self->{'rename_assoc_files'}) {
461 if (defined $self->{'aux_files'}->{$href}) {
462 $newname = $self->{'aux_files'}->{$href}->{'dir_num'} . "/" .
463 $self->{'aux_files'}->{$href}->{'file_num'} . $ext;
464 } else {
465 $newname = $self->{'dir_num'} . "/" . $self->{'file_num'} . $ext;
466 $self->{'aux_files'}->{$href} = {'dir_num' => $self->{'dir_num'}, 'file_num' => $self->{'file_num'}};
467 $self->inc_filecount ();
468 }
469 $doc_obj->associate_file($filename, $newname, undef, $section);
470 return "_httpdocimg_/$newname";
471
472 } else {
473 ($newname) = $filename =~ /([^\/\\]*)$/;
474 $doc_obj->associate_file($filename, $newname, undef, $section);
475 return "_httpdocimg_/$newname";
476 }
477}
478
479
480sub format_link {
481 my $self = shift (@_);
482 my ($link, $base_dir, $file) = @_;
483
484 my ($before_hash, $hash_part) = $link =~ /^([^\#]*)(\#?.*)$/;
485
486 $hash_part = "" if !defined $hash_part;
487 if (!defined $before_hash || $before_hash !~ /[\w\.\/]/) {
488 my $outhandle = $self->{'outhandle'};
489 print $outhandle "HTMLPlug: ERROR - badly formatted tag ignored ($link)\n"
490 if $self->{'verbosity'};
491 return ($link, "", 0);
492 }
493
494 if ($before_hash =~ s@^((?:http|ftp|file)://)@@i) {
495 my $type = $1;
496
497 if ($link =~ /^(http|ftp):/i) {
498 # Turn url (using /) into file name (possibly using \ on windows)
499 my @http_dir_split = split('/', $before_hash);
500 $before_hash = &util::filename_cat(@http_dir_split);
501 }
502
503 $before_hash = $self->eval_dir_dots($before_hash);
504
505 my $linkfilename = &util::filename_cat ($base_dir, $before_hash);
506
507 my $rl = 0;
508 $rl = 1 if (-e $linkfilename);
509
510 # make sure there's a slash on the end if it's a directory
511 if ($before_hash !~ /\/$/) {
512 $before_hash .= "/" if (-d $linkfilename);
513 }
514
515 return ($type . $before_hash, $hash_part, $rl);
516
517 } elsif ($link !~ /^(mailto|news|gopher|nntp|telnet|javascript):/i) {
518 if ($before_hash =~ s@^/@@ || $before_hash =~ /\\/) {
519
520 # the first directory will be the domain name if file_is_url
521 # to generate archives, otherwise we'll assume all files are
522 # from the same site and base_dir is the root
523
524 if ($self->{'file_is_url'}) {
525 my @dirs = split /[\/\\]/, $file;
526 my $domname = shift (@dirs);
527 $before_hash = &util::filename_cat($domname, $before_hash);
528 $before_hash =~ s@\\@/@g; # for windows
529 }
530 else
531 {
532 # see if link shares directory with source document
533 # => turn into relative link if this is so!
534
535 if ($ENV{'GSDLOS'} =~ /^windows/i) {
536 # too difficult doing a pattern match with embedded '\'s...
537 my $win_before_hash=$before_hash;
538 $win_before_hash =~ s@(\\)+@/@g;
539 # $base_dir is already similarly "converted" on windows.
540 if ($win_before_hash =~ s@^$base_dir/@@o) {
541 # if this is true, we removed a prefix
542 $before_hash=$win_before_hash;
543 }
544
545 }
546 else {
547 $before_hash = &util::filename_cat("",$before_hash);
548 $before_hash =~ s@^$base_dir/@@;
549 }
550
551 }
552
553 } else {
554 # Turn relative file path into full path
555 my $dirname = &File::Basename::dirname($file);
556 $before_hash = &util::filename_cat($dirname, $before_hash);
557 $before_hash = $self->eval_dir_dots($before_hash);
558 }
559
560 my $linkfilename = &util::filename_cat ($base_dir, $before_hash);
561
562 # make sure there's a slash on the end if it's a directory
563 if ($before_hash !~ /\/$/) {
564 $before_hash .= "/" if (-d $linkfilename);
565 }
566
567 return ("http://" . $before_hash, $hash_part, 1);
568
569 } else {
570 # mailto, news, nntp, telnet, javascript or gopher link
571 return ($before_hash, "", 0);
572 }
573}
574
575sub extract_first_NNNN_characters {
576 my $self = shift (@_);
577 my ($textref, $doc_obj, $thissection) = @_;
578
579 foreach my $size (split /,/, $self->{'first'}) {
580 my $tmptext = $$textref;
581 # skip to the body
582 $tmptext =~ s/.*<body[^>]*>//i;
583 # remove javascript
584 $tmptext =~ s@<script.*?</script>@ @sig;
585 $tmptext =~ s/<[^>]*>/ /g;
586 $tmptext =~ s/&nbsp;/ /g;
587 $tmptext =~ s/^\s+//;
588 $tmptext =~ s/\s+$//;
589 $tmptext =~ s/\s+/ /gs;
590 $tmptext = substr ($tmptext, 0, $size);
591 $tmptext =~ s/\s\S*$/&#8230;/; # adds an ellipse (...)
592 $doc_obj->add_utf8_metadata ($thissection, "First$size", $tmptext);
593 }
594}
595
596sub extract_metadata {
597 my $self = shift (@_);
598 my ($textref, $metadata, $doc_obj, $section) = @_;
599 my $outhandle = $self->{'outhandle'};
600 # if we don't want metadata, we may as well not be here ...
601 return if (!defined $self->{'metadata_fields'});
602
603 # hunt for an author look in the metadata elements:
604 if (defined $self->{'hunt_creator_metadata'}) {
605 for my $name (split /,/, "AUTHOR,AUTHOR.EMAIL,CREATOR,DC.CREATOR,DC.CREATOR.CORPORATENAME") {
606 #if ($$textref =~ /<meta(\s*?)(?:name|http-equiv)\s*=\s*\"?$name\"?([^>]*)/is) {
607 if ($$textref =~ /<meta(.*?)(?:name|http-equiv)\s*=\s*\"?$name\"?([^>]*)/is) {
608 my $content = $1 . $2;
609 if ($content =~ /content\s*=\s*\"?(.*)\"?/is) {
610 if (defined $1) {
611 my $value = $1;
612 $value =~ s/\"$//;
613 $value =~ s/\s+/ /gs;
614 $doc_obj->add_utf8_metadata($section, "Creator", $value);
615 print $outhandle " extracted Creator metadata \"$value\"\n"
616 if ($self->{'verbosity'} > 2);
617 next;
618 }
619 }
620 }
621 }
622 }
623
624 foreach my $field (split /,/, $self->{'metadata_fields'}) {
625 my $found = 0;
626 # don't need to extract field if it was passed in from a previous
627 # (recursive) plugin
628 next if defined $metadata->{$field};
629
630 # see if there's a <meta> tag for this field
631 #while ($$textref =~ /<meta(\s*?)(?:name|http-equiv)\s*=\s*\"?$field\"?([^>]*)/isg) {
632 while ($$textref =~ /<meta(.*?)(?:name|http-equiv)\s*=\s*\"?$field\"?([^>]*)/isg) {
633 my $content = $1 . $2;
634 if ($content =~ /content\s*=\s*\"?(.*)\"?/is) {
635 if (defined $1) {
636 my $value = $1;
637 $value =~ s/\"$//;
638 $value =~ s/\s+/ /gs;
639 $value =~ s/\".*//gs;
640 $doc_obj->add_utf8_metadata($section, $field, $value);
641 print $outhandle " extracted \"$field\" metadata \"$value\"\n"
642 if ($self->{'verbosity'} > 2);
643 $found = 1;
644 }
645 }
646 }
647 next if $found;
648 # TITLE: extract the document title
649
650 if ($field =~ /^title$/i) {
651
652 # see if there's a <title> tag
653 if ($$textref =~ /<title[^>]*>([^<]*)<\/title[^>]*>/is) {
654 if (defined $1) {
655 my $title = $1;
656 # Arg. This allows only ascii value characters in titles
657 if ($title =~ /\w/) {
658 $title =~ s/<[^>]*>/ /g;
659 $title =~ s/&nbsp;/ /g;
660 $title =~ s/\s+/ /gs;
661 $title =~ s/^\s+//;
662 $title =~ s/\s+$//;
663 $doc_obj->add_utf8_metadata ($section, $field, $title);
664 print $outhandle " extracted \"$field\" metadata \"$title\"\n"
665 if ($self->{'verbosity'} > 2);
666 next;
667 }
668 }
669 }
670
671 # if no title use first 100 characters
672 my $tmptext = $$textref;
673 $tmptext =~ s/<\/([^>]+)><\1>//g; # (eg) </b><b> - no space
674 $tmptext =~ s/<[^>]*>/ /g;
675 $tmptext =~ s/(?:&nbsp;|\xc2\xa0)/ /g; # utf-8 for nbsp...
676 $tmptext =~ s/^\s+//s;
677 $tmptext =~ s/\s+$//;
678 $tmptext =~ s/\s+/ /gs;
679 $tmptext =~ s/^$self->{'title_sub'}// if ($self->{'title_sub'});
680 $tmptext =~ s/^\s+//s; # in case title_sub introduced any...
681 $tmptext = substr ($tmptext, 0, 100);
682 $tmptext =~ s/\s\S*$/.../;
683 $doc_obj->add_utf8_metadata ($section, $field, $tmptext);
684 print $outhandle " extracted \"$field\" metadata \"$tmptext\"\n"
685 if ($self->{'verbosity'} > 2);
686 next;
687 }
688
689 # tag: extract the text between the first <H1> and </H1> tags
690 if ($field =~ /^tag[a-z0-9]+$/i) {
691
692 my $tag = $field;
693 $tag =~ s/^tag//i;
694 my $tmptext = $$textref;
695 $tmptext =~ s/\s+/ /gs;
696 if ($tmptext =~ /<$tag[^>]*>/i) {
697 foreach my $word ($tmptext =~ m/<$tag[^>]*>(.*?)<\/$tag[^>]*>/g) {
698 $word =~ s/&nbsp;/ /g;
699 $word =~ s/<[^>]*>/ /g;
700 $word =~ s/^\s+//;
701 $word =~ s/\s+$//;
702 $word =~ s/\s+/ /gs;
703 if ($word ne "") {
704 $doc_obj->add_utf8_metadata ($section, $tag, $word);
705 print $outhandle " extracted \"$tag\" metadata \"$word\"\n"
706 if ($self->{'verbosity'} > 2);
707 }
708 }
709 }
710 next;
711 }
712 }
713}
714
715
716# evaluate any "../" to next directory up
717# evaluate any "./" as here
718sub eval_dir_dots {
719 my $self = shift (@_);
720 my ($filename) = @_;
721
722 my $dirsep_os = &util::get_os_dirsep();
723 my @dirsep = split(/$dirsep_os/,$filename);
724
725 my @eval_dirs = ();
726 foreach my $d (@dirsep) {
727 if ($d eq "..") {
728 pop(@eval_dirs);
729
730 } elsif ($d eq ".") {
731 # do nothing!
732
733 } else {
734 push(@eval_dirs,$d);
735 }
736 }
737
738 return &util::filename_cat(@eval_dirs);
739}
740
741sub replace_usemap_links {
742 my $self = shift (@_);
743 my ($front, $link, $back) = @_;
744
745 $link =~ s/^\.\///;
746 return $front . $link . $back;
747}
748
749sub inc_filecount {
750 my $self = shift (@_);
751
752 if ($self->{'file_num'} == 1000) {
753 $self->{'dir_num'} ++;
754 $self->{'file_num'} = 0;
755 } else {
756 $self->{'file_num'} ++;
757 }
758}
759
760
761# Extend the BasPlug read_file so that strings like &eacute; are
762# converted to UTF8 internally.
763#
764# We don't convert &lt; or &gt; or &amp; or &quot; in case
765# they interfere with the GML files
766
767sub read_file {
768 my ($self, $filename, $encoding, $language, $textref) = @_;
769
770 &BasPlug::read_file($self, $filename, $encoding, $language, $textref);
771
772 # Convert entities to their UTF8 equivalents
773 $$textref =~ s/&(lt|gt|amp|quot|nbsp);/&z$1;/go;
774 $$textref =~ s/&([^;]+);/&ghtml::getcharequiv($1,1)/gseo;
775 $$textref =~ s/&z(lt|gt|amp|quot|nbsp);/&$1;/go;
776}
777
7781;
Note: See TracBrowser for help on using the repository browser.