source: tags/gsdl-2_40-distribution/gsdl/perllib/plugins/HTMLPlug.pm@ 4846

Last change on this file since 4846 was 4846, checked in by (none), 21 years ago

This commit was manufactured by cvs2svn to create tag
'gsdl-2_40-distribution'.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 27.9 KB
Line 
1###########################################################################
2#
3# HTMLPlug.pm -- basic html plugin
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 1999 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27#
28# Note that this plugin handles frames only in a very simple way
29# i.e. each frame is treated as a separate document. This means
30# search results will contain links to individual frames rather
31# than linking to the top level frameset.
32# There may also be some problems caused by the _parent target
33# (it's removed by this plugin)
34# To use frames properly you'll need to use the WebPlug plugin.
35#
36
37
38package HTMLPlug;
39
40use BasPlug;
41use ghtml;
42use unicode;
43use util;
44use parsargv;
45
46sub BEGIN {
47 @ISA = ('BasPlug');
48}
49
50my $arguments =
51 [ { 'name' => "process_exp",
52 'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. For example, using '(?i).html?\$' matches all documents ending in .htm or .html (case-insensitive).",
53 'type' => "string",
54 'deft' => &get_default_process_exp() },
55 { 'name' => "block_exp",
56 'desc' => "Files matching this regular expression will be blocked from being passed to any later plugins in the list. This has no real effect other than to prevent lots of warning messages about input files you don't care about. Each plugin might have a default block_exp. e.g. by default HTMLPlug blocks any files with .gif, .jpg, .jpeg, .png or .css file extensions.",
57 'type' => 'string',
58 'deft' => &get_default_block_exp() },
59 { 'name' => "nolinks",
60 'desc' => "Don't make any attempt to trap links (setting this flag may improve speed of building/importing but any relative links within documents will be broken).",
61 'type' => "flag" },
62 { 'name' => "keep_head",
63 'desc' => "Don't remove headers from html files.",
64 'type' => "flag" },
65 { 'name' => "no_metadata",
66 'desc' => "Don't attempt to extract any metadata from files.",
67 'type' => "flag" },
68 { 'name' => "metadata_fields",
69 'desc' => "Comma separated list of metadata fields to attempt to extract. Use 'tag<tagname>' to have the contents of the first <tagname> pair put in a metadata element called 'tagname'. Capitalise this as you want the metadata capitalised in Greenstone, since the tag extraction is case insensitive.",
70 'type' => "metadata",
71 'deft' => "Title" },
72 { 'name' => "hunt_creator_metadata",
73 'desc' => "Find as much metadata as possible on authorship and place it in the 'Creator' field. Requires the -metadata_fields flag.",
74 'type' => "flag" },
75 { 'name' => "file_is_url",
76 'desc' => "Set if input filenames make up url of original source documents e.g. if a web mirroring tool was used to create the import directory structure.",
77 'type' => "flag" },
78 { 'name' => "assoc_files",
79 'desc' => "Perl regular expression of file extensions to associate with html documents.",
80 'type' => "string",
81 'deft' => q^(?i)\.(jpe?g|gif|png|css)$^ },
82 { 'name' => "rename_assoc_files",
83 'desc' => "Renames files associated with documents (e.g. images). Also creates much shallower directory structure (useful when creating collections to go on cd-rom).",
84 'type' => "flag" },
85 { 'name' => "title_sub",
86 'desc' => "Substitution expression to modify string stored as Title. Used by, for example, PDFPlug to remove \"Page 1\", etc from text used as the title.",
87 'type' => "string",
88 'deft' => "" },
89 { 'name' => "description_tags",
90 'desc' => "Split document into sub-sections where <Section> tags occur. Note that by setting this option you implicitly set -no_metadata, as all metadata should be included within the <Section> tags. Also, '-keep_head' will have no effect when this option is set.",
91 'type' => "flag" } ];
92
93my $options = { 'name' => "HTMLPlug",
94 'desc' => "This plugin processes HTML files",
95 'inherits' => "yes",
96 'args' => $arguments };
97
98
99# sub print_usage {
100# print STDERR "\n usage: plugin HTMLPlug [options]\n\n";
101# print STDERR " options:\n";
102# print STDERR " -nolinks Don't make any attempt to trap links (setting this\n";
103# print STDERR " flag may improve speed of building/importing but\n";
104# print STDERR " any relative links within documents will be broken).\n";
105# print STDERR " -keep_head Don't remove headers from html files.\n";
106# print STDERR " -no_metadata Don't attempt to extract any metadata from files.\n";
107# print STDERR " -metadata_fields Comma separated list of metadata fields to attempt to
108# extract. Defaults to 'Title'.
109# Use 'tag<tagname>' to have the contents of the first
110# <tagname> pair put in a metadata element called
111# 'tagname'. Capitalise this as you want the metadata
112# capitalised in Greenstone, since the tag extraction
113# is case insensitive.\n";
114# print STDERR " -hunt_creator_metadata Find as much metadata as possible on authorship and
115# place it in the 'Creator' field. Requires the
116# -metadata_fields flag.\n";
117# print STDERR " -file_is_url Set if input filenames make up url of original source
118# documents e.g. if a web mirroring tool was used to
119# create the import directory structure\n";
120# print STDERR " -assoc_files Perl regular expression of file extensions to
121# associate with html documents.
122# Defaults to '(?i)\.(jpe?g|gif|png|css)\$'\n";
123# print STDERR " -rename_assoc_files Renames files associated with documents (e.g. images).
124# Also creates much shallower directory structure
125# (useful when creating collections to go on cd-rom).\n";
126# print STDERR " -title_sub Substitution expression to modify string stored as
127# Title. Used by, for example, PDFPlug to remove
128# \"Page 1\", etc from text used as the title.\n";
129# print STDERR " -description_tags Split document into sub-sections where <Section> tags
130# occur. Note that by setting this option you
131# implicitly set -no_metadata, as all metadata should
132# be included within the <Section> tags (this is only
133# true for documents that actually contain <Section> tags
134# however). Also, '-keep_head' will have no effect when
135# this option is set, regardless of whether a document
136# contains Section tags.\n";
137# }
138
139sub new {
140 my $class = shift (@_);
141 my $self = new BasPlug ($class, @_);
142
143 # 14-05-02 To allow for proper inheritance of arguments - John Thompson
144 my $option_list = $self->{'option_list'};
145 push( @{$option_list}, $options );
146
147 if (!parsargv::parse(\@_,
148 q^nolinks^, \$self->{'nolinks'},
149 q^keep_head^, \$self->{'keep_head'},
150 q^no_metadata^, \$self->{'no_metadata'},
151 q^metadata_fields/.*/Title^, \$self->{'metadata_fields'},
152 q^hunt_creator_metadata^, \$self->{'hunt_creator_metadata'},
153 q^w3mir^, \$self->{'w3mir'},
154 q^file_is_url^, \$self->{'file_is_url'},
155 q^assoc_files/.*/(?i)\.(jpe?g|jpe|gif|png|css)$^, \$self->{'assoc_files'},
156 q^rename_assoc_files^, \$self->{'rename_assoc_files'},
157 q^title_sub/.*/^, \$self->{'title_sub'},
158 q^description_tags^, \$self->{'description_tags'},
159 "allow_extra_options")) {
160
161 print STDERR "\nIncorrect options passed to HTMLPlug, check your collect.cfg configuration file\n";
162 $self->print_txt_usage();
163 die "\n";
164 }
165
166 # retain this for backward compatibility (w3mir option was replaced by
167 # file_is_url)
168 if ($self->{'w3mir'}) {
169 $self->{'file_is_url'} = 1;
170 }
171
172 $self->{'aux_files'} = {};
173 $self->{'dir_num'} = 0;
174 $self->{'file_num'} = 0;
175 return bless $self, $class;
176}
177
178
179sub get_default_block_exp {
180 my $self = shift (@_);
181
182 return q^(?i)\.(gif|jpe?g|jpe|png|css)$^;
183}
184
185sub get_default_process_exp {
186 my $self = shift (@_);
187
188 # the last option is an attempt to encode the concept of an html query ...
189 return q^(?i)(\.html?|\.shtml|\.shm|\.asp|\.php\d?|\.cgi|.+\?.+=.*)$^;
190}
191
192
193# do plugin specific processing of doc_obj
194sub process {
195 my $self = shift (@_);
196 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
197 my $outhandle = $self->{'outhandle'};
198 print $outhandle "HTMLPlug: processing $file\n"
199 if $self->{'verbosity'} > 1;
200
201 if ($ENV{'GSDLOS'} =~ /^windows/i) {
202 # this makes life so much easier... perl can cope with unix-style '/'s.
203 $base_dir =~ s@(\\)+@/@g;
204 $file =~ s@(\\)+@/@g;
205 }
206
207 # reset per-doc stuff...
208 $self->{'aux_files'} = {};
209 $self->{'dir_num'} = 0;
210 $self->{'file_num'} = 0;
211
212 my $cursection = $doc_obj->get_top_section();
213
214 $self->extract_metadata ($textref, $metadata, $doc_obj, $cursection)
215 unless $self->{'no_metadata'} || $self->{'description_tags'};
216
217 # Store URL for page as metadata - this can be used for an
218 # altavista style search interface. The URL won't be valid
219 # unless the file structure contains the domain name (i.e.
220 # like when w3mir is used to download a website).
221 my $web_url = "http://$file";
222 $doc_obj->add_metadata($cursection, "URL", $web_url);
223
224 if ($self->{'description_tags'}) {
225
226 # remove the html header - note that doing this here means any
227 # sections defined within the header will be lost (so all <Section>
228 # tags must appear within the body of the HTML)
229 $$textref =~ s/^.*?<body[^>]*>//is;
230 $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
231
232 my $opencom = '(?:<!--|&lt;!(?:&mdash;|&#151;|--))';
233 my $closecom = '(?:-->|(?:&mdash;|&#151;|--)&gt;)';
234 my $lt = '(?:<|&lt;)';
235 my $gt = '(?:>|&gt;)';
236 my $quot = '(?:"|&quot;|&rdquo;|&ldquo;)';
237
238 my $found_something = 0; my $top = 1;
239 while ($$textref =~ s/^(.*?)$opencom(.*?)$closecom//s) {
240 my $text = $1;
241 my $comment = $2;
242 if (defined $text) {
243 # text before a comment - note that getting to here
244 # doesn't necessarily mean there are Section tags in
245 # the document
246 $self->process_section(\$text, $base_dir, $file, $doc_obj, $cursection);
247 }
248 while ($comment =~ s/$lt(.*?)$gt//s) {
249
250 my $tag = $1;
251 if ($tag eq "Section") {
252 $found_something = 1;
253 $cursection = $doc_obj->insert_section($doc_obj->get_end_child($cursection)) unless $top;
254 $top = 0;
255 } elsif ($tag eq "/Section") {
256 $found_something = 1;
257 $cursection = $doc_obj->get_parent_section ($cursection);
258 } elsif ($tag =~ /^Metadata name=$quot(.*?)$quot/s) {
259 my $metaname = $1;
260 $comment =~ s/^(.*?)$lt\/Metadata$gt//s;
261 my $metavalue = $1;
262 $metavalue =~ s/^\s+//;
263 $metavalue =~ s/\s+$//;
264 # assume that no metadata value intentionally includes
265 # carriage returns or HTML tags (if they're there they
266 # were probably introduced when converting to HTML from
267 # some other format).
268 $metavalue =~ s/[\cJ\cM]/ /sg;
269 $metavalue =~ s/<[^>]+>//sg;
270 $metavalue =~ s/\s+/ /sg;
271 $doc_obj->set_utf8_metadata_element($cursection, $metaname, $metavalue);
272 } elsif ($tag eq "Description" || $tag eq "/Description") {
273 # do nothing with containing Description tags
274 } else {
275 # simple HTML tag (probably created by the conversion
276 # to HTML from some other format) - we'll ignore it and
277 # hope for the best ;-)
278 }
279 }
280 }
281 if ($cursection ne "") {
282 print $outhandle "HTMLPlug: WARNING: $file contains unmatched <Section></Section> tags\n";
283 }
284
285 $$textref =~ s/^.*?<body[^>]*>//is;
286 $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
287 if ($$textref =~ /\S/) {
288 if (!$found_something) {
289 print $outhandle "HTMLPlug: WARNING: $file appears to contain no Section tags so\n";
290 print $outhandle " will be processed as a single section document\n";
291
292 # go ahead and process single-section document
293 $self->process_section($textref, $base_dir, $file, $doc_obj, $cursection);
294
295 # if document contains no Section tags we'll go ahead
296 # and extract metadata (this won't have been done
297 # above as the -description_tags option prevents it)
298 $self->extract_metadata (\$doc_obj->get_text($cursection), $metadata, $doc_obj, $cursection)
299 unless $self->{'no_metadata'};
300
301 } else {
302 print $outhandle "HTMLPlug: WARNING: $file contains the following text outside\n";
303 print $outhandle " of the final closing </Section> tag. This text will\n";
304 print $outhandle " be ignored.";
305 my ($text);
306 if (length($$textref) > 30) {
307 $text = substr($$textref, 0, 30) . "...";
308 } else {
309 $text = $$textref;
310 }
311 $text =~ s/\n/ /isg;
312 print $outhandle " ($text)\n";
313 }
314 } elsif (!$found_something) {
315
316 # may get to here if document contained no valid Section
317 # tags but did contain some comments. The text will have
318 # been processed already but we should print the warning
319 # as above and extract metadata
320 print $outhandle "HTMLPlug: WARNING: $file appears to contain no Section tags so\n";
321 print $outhandle " will be processed as a single section document\n";
322
323 $self->extract_metadata (\$doc_obj->get_text($cursection), $metadata, $doc_obj, $cursection)
324 unless $self->{'no_metadata'};
325 }
326
327 } else {
328
329 # remove header and footer
330 if (!$self->{'keep_head'} || $self->{'description_tags'}) {
331 $$textref =~ s/^.*?<body[^>]*>//is;
332 $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
333 }
334
335 # single section document
336 $self->process_section($textref, $base_dir, $file, $doc_obj, $cursection);
337 }
338 return 1;
339}
340
341# note that process_section may be called multiple times for a single
342# section (relying on the fact that add_utf8_text appends the text to any
343# that may exist already).
344sub process_section {
345 my $self = shift (@_);
346 my ($textref, $base_dir, $file, $doc_obj, $cursection) = @_;
347 # trap links
348 if (!$self->{'nolinks'}) {
349
350 # usemap="./#index" not handled correctly => change to "#index"
351 $$textref =~ s/(<img[^>]*?usemap\s*=\s*\"?)([^\">\s]+)(\"?[^>]*>)/
352 $self->replace_usemap_links($1, $2, $3)/isge;
353
354 $$textref =~ s/(<(?:a|area|frame|link)\s+[^>]*?\s*(?:href|src)\s*=\s*\"?)([^\">\s]+)(\"?[^>]*>)/
355 $self->replace_href_links ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
356 }
357
358 # trap images
359
360 # allow spaces if inside quotes - jrm21
361 $$textref =~ s/(<img[^>]*?src\s*=\s*)(\"[^\"]+\"|[^\s>]+)([^>]*>)/
362 $self->replace_images ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
363
364 # add text to document object
365 # turn \ into \\ so that the rest of greenstone doesn't think there
366 # is an escape code following. (Macro parsing loses them...)
367 $$textref =~ s/\\/\\\\/go;
368 $doc_obj->add_utf8_text($cursection, $$textref);
369}
370
371sub replace_images {
372 my $self = shift (@_);
373 my ($front, $link, $back, $base_dir,
374 $file, $doc_obj, $section) = @_;
375 # remove quotes from link at start and end if necessary
376 if ($link=~/^\"/) {
377 $link=~s/^\"//;$link=~s/\"$//;
378 $front.='"';
379 $back="\"$back";
380 }
381
382 $link =~ s/\n/ /g;
383 my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
384 return $front . $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section) . $back;
385}
386
387sub replace_href_links {
388 my $self = shift (@_);
389 my ($front, $link, $back, $base_dir, $file, $doc_obj, $section) = @_;
390
391 # attempt to sort out targets - frames are not handled
392 # well in this plugin and some cases will screw things
393 # up - e.g. the _parent target (so we'll just remove
394 # them all ;-)
395 $front =~ s/(target=\"?)_top(\"?)/$1_gsdltop_$2/is;
396 $back =~ s/(target=\"?)_top(\"?)/$1_gsdltop_$2/is;
397 $front =~ s/target=\"?_parent\"?//is;
398 $back =~ s/target=\"?_parent\"?//is;
399
400 return $front . $link . $back if $link =~ /^\#/s;
401 $link =~ s/\n/ /g;
402
403 my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
404 # href may use '\'s where '/'s should be on Windows
405 $href =~ s/\\/\//g;
406
407 my ($filename) = $href =~ /^(?:.*?):(?:\/\/)?(.*)/;
408
409 ##### leave all these links alone (they won't be picked up by intermediate
410 ##### pages). I think that's safest when dealing with frames, targets etc.
411 ##### (at least until I think of a better way to do it). Problems occur with
412 ##### mailto links from within small frames, the intermediate page is displayed
413 ##### within that frame and can't be seen. There is still potential for this to
414 ##### happen even with html pages - the solution seems to be to somehow tell
415 ##### the browser from the server side to display the page being sent (i.e.
416 ##### the intermediate page) in the top level window - I'm not sure if that's
417 ##### possible - the following line should probably be deleted if that can be done
418 return $front . $link . $back if $href =~ /^(mailto|news|gopher|nntp|telnet|javascript):/is;
419
420
421 if (($rl == 0) || ($filename =~ /$self->{'process_exp'}/) ||
422 ($href =~ /\/$/) || ($href =~ /^(mailto|news|gopher|nntp|telnet|javascript):/i)) {
423 &ghtml::urlsafe ($href);
424 return $front . "_httpextlink_&rl=" . $rl . "&href=" . $href . $hash_part . $back;
425
426 } else {
427 # link is to some other type of file (eg image) so we'll
428 # need to associate that file
429 return $front . $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section) . $back;
430 }
431}
432
433sub add_file {
434 my $self = shift (@_);
435 my ($href, $rl, $hash_part, $base_dir, $doc_obj, $section) = @_;
436 my ($newname);
437
438 my $filename = $href;
439 $filename =~ s/^[^:]*:\/\///;
440 $filename = &util::filename_cat($base_dir, $filename);
441
442 # Replace %20's in URL with a space if required. Note that the filename
443 # may include the %20 in some situations
444 if ($filename =~ /\%20/) {
445 if (!-e $filename) {
446 $filename =~ s/\%20/ /g;
447 }
448 }
449
450 my ($ext) = $filename =~ /(\.[^\.]*)$/;
451
452 if ((!defined $ext) || ($ext !~ /$self->{'assoc_files'}/)) {
453 return "_httpextlink_&rl=" . $rl . "&href=" . $href . $hash_part;
454 }
455
456 if ($self->{'rename_assoc_files'}) {
457 if (defined $self->{'aux_files'}->{$href}) {
458 $newname = $self->{'aux_files'}->{$href}->{'dir_num'} . "/" .
459 $self->{'aux_files'}->{$href}->{'file_num'} . $ext;
460 } else {
461 $newname = $self->{'dir_num'} . "/" . $self->{'file_num'} . $ext;
462 $self->{'aux_files'}->{$href} = {'dir_num' => $self->{'dir_num'}, 'file_num' => $self->{'file_num'}};
463 $self->inc_filecount ();
464 }
465 $doc_obj->associate_file($filename, $newname, undef, $section);
466 return "_httpdocimg_/$newname";
467
468 } else {
469 ($newname) = $filename =~ /([^\/\\]*)$/;
470 $doc_obj->associate_file($filename, $newname, undef, $section);
471 return "_httpdocimg_/$newname";
472 }
473}
474
475
476sub format_link {
477 my $self = shift (@_);
478 my ($link, $base_dir, $file) = @_;
479
480 my ($before_hash, $hash_part) = $link =~ /^([^\#]*)(\#?.*)$/;
481
482 $hash_part = "" if !defined $hash_part;
483 if (!defined $before_hash || $before_hash !~ /[\w\.\/]/) {
484 my $outhandle = $self->{'outhandle'};
485 print $outhandle "HTMLPlug: ERROR - badly formatted tag ignored ($link)\n"
486 if $self->{'verbosity'};
487 return ($link, "", 0);
488 }
489
490 if ($before_hash =~ s@^((?:http|ftp|file)://)@@i) {
491 my $type = $1;
492
493 if ($link =~ /^(http|ftp):/i) {
494 # Turn url (using /) into file name (possibly using \ on windows)
495 my @http_dir_split = split('/', $before_hash);
496 $before_hash = &util::filename_cat(@http_dir_split);
497 }
498
499 $before_hash = $self->eval_dir_dots($before_hash);
500
501 my $linkfilename = &util::filename_cat ($base_dir, $before_hash);
502
503 my $rl = 0;
504 $rl = 1 if (-e $linkfilename);
505
506 # make sure there's a slash on the end if it's a directory
507 if ($before_hash !~ /\/$/) {
508 $before_hash .= "/" if (-d $linkfilename);
509 }
510
511 return ($type . $before_hash, $hash_part, $rl);
512
513 } elsif ($link !~ /^(mailto|news|gopher|nntp|telnet|javascript):/i) {
514 if ($before_hash =~ s@^/@@ || $before_hash =~ /\\/) {
515
516 # the first directory will be the domain name if file_is_url
517 # to generate archives, otherwise we'll assume all files are
518 # from the same site and base_dir is the root
519
520 if ($self->{'file_is_url'}) {
521 my @dirs = split /[\/\\]/, $file;
522 my $domname = shift (@dirs);
523 $before_hash = &util::filename_cat($domname, $before_hash);
524 $before_hash =~ s@\\@/@g; # for windows
525 }
526 else
527 {
528 # see if link shares directory with source document
529 # => turn into relative link if this is so!
530
531 if ($ENV{'GSDLOS'} =~ /^windows/i) {
532 # too difficult doing a pattern match with embedded '\'s...
533 my $win_before_hash=$before_hash;
534 $win_before_hash =~ s@(\\)+@/@g;
535 # $base_dir is already similarly "converted" on windows.
536 if ($win_before_hash =~ s@^$base_dir/@@o) {
537 # if this is true, we removed a prefix
538 $before_hash=$win_before_hash;
539 }
540
541 }
542 else {
543 $before_hash = &util::filename_cat("",$before_hash);
544 $before_hash =~ s@^$base_dir/@@;
545 }
546
547 }
548
549 } else {
550 # Turn relative file path into full path
551 my $dirname = &File::Basename::dirname($file);
552 $before_hash = &util::filename_cat($dirname, $before_hash);
553 $before_hash = $self->eval_dir_dots($before_hash);
554 }
555
556 my $linkfilename = &util::filename_cat ($base_dir, $before_hash);
557
558 # make sure there's a slash on the end if it's a directory
559 if ($before_hash !~ /\/$/) {
560 $before_hash .= "/" if (-d $linkfilename);
561 }
562
563 return ("http://" . $before_hash, $hash_part, 1);
564
565 } else {
566 # mailto, news, nntp, telnet, javascript or gopher link
567 return ($before_hash, "", 0);
568 }
569}
570
571sub extract_first_NNNN_characters {
572 my $self = shift (@_);
573 my ($textref, $doc_obj, $thissection) = @_;
574
575 foreach my $size (split /,/, $self->{'first'}) {
576 my $tmptext = $$textref;
577 # skip to the body
578 $tmptext =~ s/.*<body[^>]*>//i;
579 # remove javascript
580 $tmptext =~ s@<script.*?</script>@ @sig;
581 $tmptext =~ s/<[^>]*>/ /g;
582 $tmptext =~ s/&nbsp;/ /g;
583 $tmptext =~ s/^\s+//;
584 $tmptext =~ s/\s+$//;
585 $tmptext =~ s/\s+/ /gs;
586 $tmptext = substr ($tmptext, 0, $size);
587 $tmptext =~ s/\s\S*$/&#8230;/; # adds an ellipse (...)
588 $doc_obj->add_utf8_metadata ($thissection, "First$size", $tmptext);
589 }
590}
591
592sub extract_metadata {
593 my $self = shift (@_);
594 my ($textref, $metadata, $doc_obj, $section) = @_;
595 my $outhandle = $self->{'outhandle'};
596
597 # if we don't want metadata, we may as well not be here ...
598 return if (!defined $self->{'metadata_fields'});
599
600 # hunt for an author look in the metadata elements:
601 if (defined $self->{'hunt_creator_metadata'}) {
602 for my $name (split /,/, "AUTHOR,AUTHOR.EMAIL,CREATOR,DC.CREATOR,DC.CREATOR.CORPORATENAME") {
603 if ($$textref =~ /<meta(\s*?)(?:name|http-equiv)\s*=\s*\"?$name\"?([^>]*)/is) {
604 my $content = $1 . $2;
605 if ($content =~ /content\s*=\s*\"?(.*)\"?/is) {
606 if (defined $1) {
607 my $value = $1;
608 $value =~ s/\"$//;
609 $value =~ s/\s+/ /gs;
610 $doc_obj->add_utf8_metadata($section, "Creator", $value);
611 print $outhandle " extracted Creator metadata \"$value\"\n"
612 if ($self->{'verbosity'} > 2);
613 next;
614 }
615 }
616 }
617 }
618 }
619
620 foreach my $field (split /,/, $self->{'metadata_fields'}) {
621
622 # don't need to extract field if it was passed in from a previous
623 # (recursive) plugin
624 next if defined $metadata->{$field};
625
626 # see if there's a <meta> tag for this field
627 if ($$textref =~ /<meta(\s*?)(?:name|http-equiv)\s*=\s*\"?$field\"?([^>]*)/is) {
628 my $content = $1 . $2;
629 if ($content =~ /content\s*=\s*\"?(.*)\"?/is) {
630 if (defined $1) {
631 my $value = $1;
632 $value =~ s/\"$//;
633 $value =~ s/\s+/ /gs;
634 $value =~ s/\".*//gs;
635 $doc_obj->add_utf8_metadata($section, $field, $value);
636 print $outhandle " extracted \"$field\" metadata \"$value\"\n"
637 if ($self->{'verbosity'} > 2);
638 next;
639 }
640 }
641 }
642
643 # TITLE: extract the document title
644
645 if ($field =~ /^title$/i) {
646
647 # see if there's a <title> tag
648 if ($$textref =~ /<title[^>]*>([^<]*)<\/title[^>]*>/is) {
649 if (defined $1) {
650 my $title = $1;
651 if ($title =~ /\w/) {
652 $title =~ s/<[^>]*>/ /g;
653 $title =~ s/&nbsp;/ /g;
654 $title =~ s/\s+/ /gs;
655 $title =~ s/^\s+//;
656 $title =~ s/\s+$//;
657 $doc_obj->add_utf8_metadata ($section, $field, $title);
658 print $outhandle " extracted \"$field\" metadata \"$title\"\n"
659 if ($self->{'verbosity'} > 2);
660 next;
661 }
662 }
663 }
664
665 # if no title use first 100 characters
666 my $tmptext = $$textref;
667 $tmptext =~ s/<\/([^>]+)><\1>//g; # (eg) </b><b> - no space
668 $tmptext =~ s/<[^>]*>/ /g;
669 $tmptext =~ s/(?:&nbsp;|\xc2\xa0)/ /g; # utf-8 for nbsp...
670 $tmptext =~ s/^\s+//s;
671 $tmptext =~ s/\s+$//;
672 $tmptext =~ s/\s+/ /gs;
673 $tmptext =~ s/^$self->{'title_sub'}// if ($self->{'title_sub'});
674 $tmptext =~ s/^\s+//s; # in case title_sub introduced any...
675 $tmptext = substr ($tmptext, 0, 100);
676 $tmptext =~ s/\s\S*$/.../;
677 $doc_obj->add_utf8_metadata ($section, $field, $tmptext);
678 print $outhandle " extracted \"$field\" metadata \"$tmptext\"\n"
679 if ($self->{'verbosity'} > 2);
680 next;
681 }
682
683
684 # tag: extract the text between the first <H1> and </H1> tags
685 if ($field =~ /^tag[a-z0-9]+$/i) {
686
687 my $tag = $field;
688 $tag =~ s/^tag//i;
689 my $tmptext = $$textref;
690 $tmptext =~ s/\s+/ /gs;
691 if ($tmptext =~ /<$tag[^>]*>/i) {
692 foreach my $word ($tmptext =~ m/<$tag[^>]*>(.*?)<\/$tag[^>]*>/g) {
693 $word =~ s/&nbsp;/ /g;
694 $word =~ s/<[^>]*>/ /g;
695 $word =~ s/^\s+//;
696 $word =~ s/\s+$//;
697 $word =~ s/\s+/ /gs;
698 if ($word ne "") {
699 $doc_obj->add_utf8_metadata ($section, $tag, $word);
700 print $outhandle " extracted \"$tag\" metadata \"$word\"\n"
701 if ($self->{'verbosity'} > 2);
702 }
703 }
704 }
705 next;
706 }
707 }
708}
709
710
711# evaluate any "../" to next directory up
712# evaluate any "./" as here
713sub eval_dir_dots {
714 my $self = shift (@_);
715 my ($filename) = @_;
716
717 my $dirsep_os = &util::get_os_dirsep();
718 my @dirsep = split(/$dirsep_os/,$filename);
719
720 my @eval_dirs = ();
721 foreach my $d (@dirsep) {
722 if ($d eq "..") {
723 pop(@eval_dirs);
724
725 } elsif ($d eq ".") {
726 # do nothing!
727
728 } else {
729 push(@eval_dirs,$d);
730 }
731 }
732
733 return &util::filename_cat(@eval_dirs);
734}
735
736sub replace_usemap_links {
737 my $self = shift (@_);
738 my ($front, $link, $back) = @_;
739
740 $link =~ s/^\.\///;
741 return $front . $link . $back;
742}
743
744sub inc_filecount {
745 my $self = shift (@_);
746
747 if ($self->{'file_num'} == 1000) {
748 $self->{'dir_num'} ++;
749 $self->{'file_num'} = 0;
750 } else {
751 $self->{'file_num'} ++;
752 }
753}
754
755
756# Extend the BasPlug read_file so that strings like &eacute; are
757# converted to UTF8 internally.
758#
759# We don't convert &lt; or &gt; or &amp; or &quot; in case
760# they interfere with the GML files
761
762sub read_file {
763 my ($self, $filename, $encoding, $language, $textref) = @_;
764
765 &BasPlug::read_file($self, $filename, $encoding, $language, $textref);
766
767 # Convert entities to their UTF8 equivalents
768 $$textref =~ s/&(lt|gt|amp|quot|nbsp);/&z$1;/go;
769 $$textref =~ s/&([^;]+);/&ghtml::getcharequiv($1,1)/gseo;
770 $$textref =~ s/&z(lt|gt|amp|quot|nbsp);/&$1;/go;
771}
772
7731;
Note: See TracBrowser for help on using the repository browser.