source: trunk/gsdl/perllib/plugins/HTMLPlug.pm@3540

Last change on this file since 3540 was 3540, checked in by kjdon, 21 years ago:

Added John T's changes into CVS - added info to enable retrieval of usage info in XML.

###########################################################################
#
# HTMLPlug.pm -- basic html plugin
#
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the
# University of Waikato, New Zealand.
#
# Copyright (C) 1999 New Zealand Digital Library Project
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################

#
# Note that this plugin handles frames only in a very simple way,
# i.e. each frame is treated as a separate document. This means
# search results will contain links to individual frames rather
# than linking to the top-level frameset.
# There may also be some problems caused by the _parent target
# (it's removed by this plugin).
# To use frames properly you'll need to use the WebPlug plugin.
#


package HTMLPlug;

use BasPlug;
use ghtml;
use unicode;
use util;
use parsargv;

sub BEGIN {
    @ISA = ('BasPlug');
}

my $arguments =
    [ { 'name' => "process_exp",
        'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. Each plugin has its own default process_exp, e.g. HTMLPlug defaults to '(?i)\.html?\$', i.e. all documents ending in .htm or .html (case-insensitive).",
        'type' => "string",
        'deft' => q^(?i)(\.html?|\.shtml|\.shm|\.asp|\.php|\.cgi|.+\?.+=.*)$^ },
      { 'name' => "block_exp",
        'desc' => "Files matching this regular expression will be blocked from being passed to any later plugins in the list. This has no real effect other than to prevent lots of warning messages about input files you don't care about. Each plugin might have a default block_exp, e.g. by default HTMLPlug blocks any files with .gif, .jpg, .jpeg, .png or .css file extensions.",
        'type' => "string",
        'deft' => q^(?i)\.(gif|jpe?g|png|css)$^ },
      { 'name' => "nolinks",
        'desc' => "Don't make any attempt to trap links (setting this flag may improve the speed of building/importing but any relative links within documents will be broken).",
        'type' => "flag" },
      { 'name' => "keep_head",
        'desc' => "Don't remove headers from html files.",
        'type' => "flag" },
      { 'name' => "no_metadata",
        'desc' => "Don't attempt to extract any metadata from files.",
        'type' => "flag" },
      { 'name' => "metadata_fields",
        'desc' => "Comma-separated list of metadata fields to attempt to extract. Defaults to 'Title'. Use 'tag<tagname>' to have the contents of the first <tagname> pair put in a metadata element called 'tagname'. Capitalise this as you want the metadata capitalised in Greenstone, since the tag extraction is case-insensitive.",
        'type' => "metadatum",
        'deft' => "" },
      { 'name' => "hunt_creator_metadata",
        'desc' => "Find as much metadata as possible on authorship and place it in the 'Creator' field. Requires the -metadata_fields flag.",
        'type' => "flag" },
      { 'name' => "file_is_url",
        'desc' => "Set if input filenames make up the URL of the original source documents, e.g. if a web mirroring tool was used to create the import directory structure.",
        'type' => "flag" },
      { 'name' => "assoc_files",
        'desc' => "Perl regular expression of file extensions to associate with html documents. Defaults to '(?i)\.(jpe?g|gif|png|css)\$'.",
        'type' => "string",
        'deft' => q^(?i)\.(jpe?g|gif|png|css)\$^ },
      { 'name' => "rename_assoc_files",
        'desc' => "Renames files associated with documents (e.g. images). Also creates a much shallower directory structure (useful when creating collections to go on CD-ROM).",
        'type' => "flag" },
      { 'name' => "title_sub",
        'desc' => "Substitution expression to modify the string stored as Title. Used by, for example, PDFPlug to remove \"Page 1\", etc. from text used as the title.",
        'type' => "string" },
      { 'name' => "description_tags",
        'desc' => "Split the document into sub-sections wherever <Section> tags occur. Note that by setting this option you implicitly set -no_metadata, as all metadata should be included within the <Section> tags. Also, '-keep_head' will have no effect when this option is set.",
        'type' => "flag" } ];

my $options = { 'name' => "HTMLPlug",
                'desc' => "This plugin processes HTML files.",
                'inherits' => "yes",
                'args' => $arguments };
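
# For reference, a typical way to enable this plugin is a "plugin" line in a
# collection's collect.cfg, e.g. (the option values below are purely
# illustrative, not defaults):
#
#   plugin HTMLPlug -description_tags -metadata_fields Title,Author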

sub print_usage {
    print STDERR "\n  usage: plugin HTMLPlug [options]\n\n";
    print STDERR "  options:\n";
    print STDERR "   -nolinks                Don't make any attempt to trap links (setting this\n";
    print STDERR "                           flag may improve speed of building/importing but\n";
    print STDERR "                           any relative links within documents will be broken).\n";
    print STDERR "   -keep_head              Don't remove headers from html files.\n";
    print STDERR "   -no_metadata            Don't attempt to extract any metadata from files.\n";
    print STDERR "   -metadata_fields        Comma separated list of metadata fields to attempt to
                           extract. Defaults to 'Title'.
                           Use 'tag<tagname>' to have the contents of the first
                           <tagname> pair put in a metadata element called
                           'tagname'. Capitalise this as you want the metadata
                           capitalised in Greenstone, since the tag extraction
                           is case insensitive.\n";
    print STDERR "   -hunt_creator_metadata  Find as much metadata as possible on authorship and
                           place it in the 'Creator' field. Requires the
                           -metadata_fields flag.\n";
    print STDERR "   -file_is_url            Set if input filenames make up url of original source
                           documents e.g. if a web mirroring tool was used to
                           create the import directory structure\n";
    print STDERR "   -assoc_files            Perl regular expression of file extensions to
                           associate with html documents.
                           Defaults to '(?i)\.(jpe?g|gif|png|css)\$'\n";
    print STDERR "   -rename_assoc_files     Renames files associated with documents (e.g. images).
                           Also creates much shallower directory structure
                           (useful when creating collections to go on cd-rom).\n";
    print STDERR "   -title_sub              Substitution expression to modify string stored as
                           Title. Used by, for example, PDFPlug to remove
                           \"Page 1\", etc from text used as the title.\n";
    print STDERR "   -description_tags       Split document into sub-sections where <Section> tags
                           occur. Note that by setting this option you
                           implicitly set -no_metadata, as all metadata should
                           be included within the <Section> tags (this is only
                           true for documents that actually contain <Section> tags
                           however). Also, '-keep_head' will have no effect when
                           this option is set, regardless of whether a document
                           contains Section tags.\n";
}

sub new {
    my $class = shift (@_);
    my $self = new BasPlug ($class, @_);

    # 14-05-02 To allow for proper inheritance of arguments - John Thompson
    my $option_list = $self->{'option_list'};
    push( @{$option_list}, $options );

    if (!parsargv::parse(\@_,
                         q^nolinks^, \$self->{'nolinks'},
                         q^keep_head^, \$self->{'keep_head'},
                         q^no_metadata^, \$self->{'no_metadata'},
                         q^metadata_fields/.*/Title^, \$self->{'metadata_fields'},
                         q^hunt_creator_metadata^, \$self->{'hunt_creator_metadata'},
                         q^w3mir^, \$self->{'w3mir'},
                         q^file_is_url^, \$self->{'file_is_url'},
                         q^assoc_files/.*/(?i)\.(jpe?g|jpe|gif|png|css)$^, \$self->{'assoc_files'},
                         q^rename_assoc_files^, \$self->{'rename_assoc_files'},
                         q^title_sub/.*/^, \$self->{'title_sub'},
                         q^description_tags^, \$self->{'description_tags'},
                         "allow_extra_options")) {

        print STDERR "\nIncorrect options passed to HTMLPlug, check your collect.cfg configuration file\n";
        &print_usage();
        die "\n";
    }

    # retain this for backward compatibility (w3mir option was replaced by
    # file_is_url)
    if ($self->{'w3mir'}) {
        $self->{'file_is_url'} = 1;
    }

    $self->{'aux_files'} = {};
    $self->{'dir_num'} = 0;
    $self->{'file_num'} = 0;
    return bless $self, $class;
}


sub get_default_block_exp {
    my $self = shift (@_);

    return q^(?i)\.(gif|jpe?g|jpe|png|css)$^;
}

sub get_default_process_exp {
    my $self = shift (@_);

    # the last option is an attempt to encode the concept of an html query ...
    return q^(?i)(\.html?|\.shtml|\.shm|\.asp|\.php\d?|\.cgi|.+\?.+=.*)$^;
}


# do plugin-specific processing of doc_obj
sub process {
    my $self = shift (@_);
    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
    my $outhandle = $self->{'outhandle'};
    print $outhandle "HTMLPlug: processing $file\n"
        if $self->{'verbosity'} > 1;

    if ($ENV{'GSDLOS'} =~ /^windows/i) {
        # this makes life so much easier... perl can cope with unix-style '/'s.
        $base_dir =~ s@(\\)+@/@g;
        $file =~ s@(\\)+@/@g;
    }

    # reset per-doc stuff...
    $self->{'aux_files'} = {};
    $self->{'dir_num'} = 0;
    $self->{'file_num'} = 0;

    my $cursection = $doc_obj->get_top_section();

    $self->extract_metadata ($textref, $metadata, $doc_obj, $cursection)
        unless $self->{'no_metadata'} || $self->{'description_tags'};

    # Store the URL for the page as metadata - this can be used for an
    # altavista-style search interface. The URL won't be valid
    # unless the file structure contains the domain name (i.e.
    # as when w3mir is used to download a website).
    my $web_url = "http://$file";
    $doc_obj->add_utf8_metadata($cursection, "URL", $web_url);

    if ($self->{'description_tags'}) {

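        # For reference, the markup this branch expects looks roughly like
        # the sketch below (inferred from the regular expressions that
        # follow, not copied from any particular collection). The tags sit
        # inside HTML comments so that browsers ignore them:
        #
        #   <!--
        #   <Section>
        #     <Description>
        #       <Metadata name="Title">Chapter 1</Metadata>
        #     </Description>
        #   -->
        #   ...section text, possibly containing further <Section> blocks...
        #   <!--
        #   </Section>
        #   -->
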
        # remove the html header - note that doing this here means any
        # sections defined within the header will be lost (so all <Section>
        # tags must appear within the body of the HTML)
        $$textref =~ s/^.*?<body[^>]*>//is;
        $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;

        my $opencom = '(?:<!--|&lt;!(?:&mdash;|&#151;|--))';
        my $closecom = '(?:-->|(?:&mdash;|&#151;|--)&gt;)';
        my $lt = '(?:<|&lt;)';
        my $gt = '(?:>|&gt;)';
        my $quot = '(?:"|&quot;|&rdquo;|&ldquo;)';

        my $found_something = 0; my $top = 1;
        while ($$textref =~ s/^(.*?)$opencom(.*?)$closecom//s) {
            my $text = $1;
            my $comment = $2;
            if (defined $text) {
                # text before a comment - note that getting to here
                # doesn't necessarily mean there are Section tags in
                # the document
                $self->process_section(\$text, $base_dir, $file, $doc_obj, $cursection);
            }
            while ($comment =~ s/$lt(.*?)$gt//s) {

                my $tag = $1;
                if ($tag eq "Section") {
                    $found_something = 1;
                    $cursection = $doc_obj->insert_section($doc_obj->get_end_child($cursection)) unless $top;
                    $top = 0;
                } elsif ($tag eq "/Section") {
                    $found_something = 1;
                    $cursection = $doc_obj->get_parent_section ($cursection);
                } elsif ($tag =~ /^Metadata name=$quot(.*?)$quot/s) {
                    my $metaname = $1;
                    $comment =~ s/^(.*?)$lt\/Metadata$gt//s;
                    my $metavalue = $1;
                    $metavalue =~ s/^\s+//;
                    $metavalue =~ s/\s+$//;
                    # assume that no metadata value intentionally includes
                    # carriage returns or HTML tags (if they're there they
                    # were probably introduced when converting to HTML from
                    # some other format).
                    $metavalue =~ s/[\cJ\cM]/ /sg;
                    $metavalue =~ s/<[^>]+>//sg;
                    $metavalue =~ s/\s+/ /sg;
                    $doc_obj->set_utf8_metadata_element($cursection, $metaname, $metavalue);
                } elsif ($tag eq "Description" || $tag eq "/Description") {
                    # do nothing with containing Description tags
                } else {
                    # simple HTML tag (probably created by the conversion
                    # to HTML from some other format) - we'll ignore it and
                    # hope for the best ;-)
                }
            }
        }
        if ($cursection ne "") {
            print $outhandle "HTMLPlug: WARNING: $file contains unmatched <Section></Section> tags\n";
        }

        $$textref =~ s/^.*?<body[^>]*>//is;
        $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
        if ($$textref =~ /\S/) {
            if (!$found_something) {
                print $outhandle "HTMLPlug: WARNING: $file appears to contain no Section tags so\n";
                print $outhandle "          will be processed as a single section document\n";

                # go ahead and process the single-section document
                $self->process_section($textref, $base_dir, $file, $doc_obj, $cursection);

                # if the document contains no Section tags we'll go ahead
                # and extract metadata (this won't have been done
                # above as the -description_tags option prevents it)
                $self->extract_metadata (\$doc_obj->get_text($cursection), $metadata, $doc_obj, $cursection)
                    unless $self->{'no_metadata'};

            } else {
                print $outhandle "HTMLPlug: WARNING: $file contains the following text outside\n";
                print $outhandle "          of the final closing </Section> tag. This text will\n";
                print $outhandle "          be ignored.";
                my ($text);
                if (length($$textref) > 30) {
                    $text = substr($$textref, 0, 30) . "...";
                } else {
                    $text = $$textref;
                }
                $text =~ s/\n/ /isg;
                print $outhandle " ($text)\n";
            }
        } elsif (!$found_something) {

            # we may get to here if the document contained no valid Section
            # tags but did contain some comments. The text will have
            # been processed already but we should print the warning
            # as above and extract metadata
            print $outhandle "HTMLPlug: WARNING: $file appears to contain no Section tags so\n";
            print $outhandle "          will be processed as a single section document\n";

            $self->extract_metadata (\$doc_obj->get_text($cursection), $metadata, $doc_obj, $cursection)
                unless $self->{'no_metadata'};
        }

    } else {

        # remove the header and footer
        if (!$self->{'keep_head'} || $self->{'description_tags'}) {
            $$textref =~ s/^.*?<body[^>]*>//is;
            $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
        }

        # single section document
        $self->process_section($textref, $base_dir, $file, $doc_obj, $cursection);
    }
    return 1;
}

# note that process_section may be called multiple times for a single
# section (relying on the fact that add_utf8_text appends the text to any
# that may exist already).
sub process_section {
    my $self = shift (@_);
    my ($textref, $base_dir, $file, $doc_obj, $cursection) = @_;
    # trap links
    if (!$self->{'nolinks'}) {

        # usemap="./#index" not handled correctly => change to "#index"
        $$textref =~ s/(<img[^>]*?usemap\s*=\s*\"?)([^\">\s]+)(\"?[^>]*>)/
            $self->replace_usemap_links($1, $2, $3)/isge;

        $$textref =~ s/(<(?:a|area|frame|link)\s+[^>]*?\s*(?:href|src)\s*=\s*\"?)([^\">\s]+)(\"?[^>]*>)/
            $self->replace_href_links ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
    }

    # trap images

    # allow spaces if inside quotes - jrm21
    $$textref =~ s/(<img[^>]*?src\s*=\s*)(\"[^\"]+\"|[^\s>]+)([^>]*>)/
        $self->replace_images ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;

    # add text to document object
    # turn \ into \\ so that the rest of greenstone doesn't think there
    # is an escape code following. (Macro parsing loses them...)
    $$textref =~ s/\\/\\\\/go;
    $doc_obj->add_utf8_text($cursection, $$textref);
}

sub replace_images {
    my $self = shift (@_);
    my ($front, $link, $back, $base_dir,
        $file, $doc_obj, $section) = @_;

    # remove quotes from link at start and end if necessary
    if ($link =~ /^\"/) {
        $link =~ s/^\"//;
        $link =~ s/\"$//;
        $front .= '"';
        $back = "\"$back";
    }

    $link =~ s/\n/ /g;
    my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
    return $front . $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section) . $back;
}

sub replace_href_links {
    my $self = shift (@_);
    my ($front, $link, $back, $base_dir, $file, $doc_obj, $section) = @_;

    # attempt to sort out targets - frames are not handled
    # well in this plugin and some cases will screw things
    # up - e.g. the _parent target (so we'll just remove
    # them all ;-)
    $front =~ s/(target=\"?)_top(\"?)/$1_gsdltop_$2/is;
    $back =~ s/(target=\"?)_top(\"?)/$1_gsdltop_$2/is;
    $front =~ s/target=\"?_parent\"?//is;
    $back =~ s/target=\"?_parent\"?//is;

    return $front . $link . $back if $link =~ /^\#/s;
    $link =~ s/\n/ /g;

    my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
    # href may use '\'s where '/'s should be on Windows
    $href =~ s/\\/\//g;

    my ($filename) = $href =~ /^(?:.*?):(?:\/\/)?(.*)/;

    ##### leave all these links alone (they won't be picked up by intermediate
    ##### pages). I think that's safest when dealing with frames, targets etc.
    ##### (at least until I think of a better way to do it). Problems occur with
    ##### mailto links from within small frames, the intermediate page is displayed
    ##### within that frame and can't be seen. There is still potential for this to
    ##### happen even with html pages - the solution seems to be to somehow tell
    ##### the browser from the server side to display the page being sent (i.e.
    ##### the intermediate page) in the top level window - I'm not sure if that's
    ##### possible - the following line should probably be deleted if that can be done
    return $front . $link . $back if $href =~ /^(mailto|news|gopher|nntp|telnet|javascript):/is;

    if (($rl == 0) || ($filename =~ /$self->{'process_exp'}/) ||
        ($href =~ /\/$/) || ($href =~ /^(mailto|news|gopher|nntp|telnet|javascript):/i)) {
        &ghtml::urlsafe ($href);
        return $front . "_httpextlink_&rl=" . $rl . "&href=" . $href . $hash_part . $back;

    } else {
        # the link is to some other type of file (e.g. an image) so we'll
        # need to associate that file
        return $front . $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section) . $back;
    }
}

sub add_file {
    my $self = shift (@_);
    my ($href, $rl, $hash_part, $base_dir, $doc_obj, $section) = @_;
    my ($newname);

    my $filename = $href;
    $filename =~ s/^[^:]*:\/\///;
    $filename = &util::filename_cat($base_dir, $filename);

    my ($ext) = $filename =~ /(\.[^\.]*)$/;

    if ((!defined $ext) || ($ext !~ /$self->{'assoc_files'}/)) {
        return "_httpextlink_&rl=" . $rl . "&href=" . $href . $hash_part;
    }

    if ($self->{'rename_assoc_files'}) {
        if (defined $self->{'aux_files'}->{$href}) {
            $newname = $self->{'aux_files'}->{$href}->{'dir_num'} . "/" .
                $self->{'aux_files'}->{$href}->{'file_num'} . $ext;
        } else {
            $newname = $self->{'dir_num'} . "/" . $self->{'file_num'} . $ext;
            $self->{'aux_files'}->{$href} = {'dir_num' => $self->{'dir_num'}, 'file_num' => $self->{'file_num'}};
            $self->inc_filecount ();
        }
        $doc_obj->associate_file($filename, $newname, undef, $section);
        return "_httpdocimg_/$newname";

    } else {
        ($newname) = $filename =~ /([^\/\\]*)$/;
        $doc_obj->associate_file($filename, $newname, undef, $section);
        return "_httpdocimg_/$newname";
    }
}

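# format_link returns ($href, $hash_part, $rl). Reading the code below (this
# summary is not from any official documentation): $rl ends up as 1 for links
# that look local to the mirrored site - relative links, or absolute
# http/ftp/file URLs whose target exists under $base_dir - and as 0 for
# mailto/news/etc. links or badly formatted ones.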
sub format_link {
    my $self = shift (@_);
    my ($link, $base_dir, $file) = @_;

    my ($before_hash, $hash_part) = $link =~ /^([^\#]*)(\#?.*)$/;

    $hash_part = "" if !defined $hash_part;
    if (!defined $before_hash || $before_hash !~ /[\w\.\/]/) {
        my $outhandle = $self->{'outhandle'};
        print $outhandle "HTMLPlug: ERROR - badly formatted tag ignored ($link)\n"
            if $self->{'verbosity'};
        return ($link, "", 0);
    }

    if ($before_hash =~ s@^((?:http|ftp|file)://)@@i) {
        my $type = $1;

        if ($link =~ /^(http|ftp):/i) {
            # Turn the url (using /) into a file name (possibly using \ on windows)
            my @http_dir_split = split('/', $before_hash);
            $before_hash = &util::filename_cat(@http_dir_split);
        }

        $before_hash = $self->eval_dir_dots($before_hash);

        my $linkfilename = &util::filename_cat ($base_dir, $before_hash);

        my $rl = 0;
        $rl = 1 if (-e $linkfilename);

        # make sure there's a slash on the end if it's a directory
        if ($before_hash !~ /\/$/) {
            $before_hash .= "/" if (-d $linkfilename);
        }

        return ($type . $before_hash, $hash_part, $rl);

    } elsif ($link !~ /^(mailto|news|gopher|nntp|telnet|javascript):/i) {
        if ($before_hash =~ s@^/@@ || $before_hash =~ /\\/) {

            # the first directory will be the domain name if file_is_url
            # was used to generate the archives, otherwise we'll assume all
            # files are from the same site and base_dir is the root

            if ($self->{'file_is_url'}) {
                my @dirs = split /[\/\\]/, $file;
                my $domname = shift (@dirs);
                $before_hash = &util::filename_cat($domname, $before_hash);
                $before_hash =~ s@\\@/@g; # for windows
            }
            else {
                # see if the link shares a directory with the source document
                # => turn it into a relative link if this is so!

                if ($ENV{'GSDLOS'} =~ /^windows/i) {
                    # too difficult doing a pattern match with embedded '\'s...
                    my $win_before_hash = $before_hash;
                    $win_before_hash =~ s@(\\)+@/@g;
                    # $base_dir is already similarly "converted" on windows.
                    if ($win_before_hash =~ s@^$base_dir/@@o) {
                        # if this is true, we removed a prefix
                        $before_hash = $win_before_hash;
                    }
                }
                else {
                    $before_hash = &util::filename_cat("", $before_hash);
                    $before_hash =~ s@^$base_dir/@@;
                }
            }

        } else {
            # Turn the relative file path into a full path
            my $dirname = &File::Basename::dirname($file);
            $before_hash = &util::filename_cat($dirname, $before_hash);
            $before_hash = $self->eval_dir_dots($before_hash);
        }

        my $linkfilename = &util::filename_cat ($base_dir, $before_hash);

        # make sure there's a slash on the end if it's a directory
        if ($before_hash !~ /\/$/) {
            $before_hash .= "/" if (-d $linkfilename);
        }

        return ("http://" . $before_hash, $hash_part, 1);

    } else {
        # mailto, news, nntp, telnet, javascript or gopher link
        return ($before_hash, "", 0);
    }
}

sub extract_first_NNNN_characters {
    my $self = shift (@_);
    my ($textref, $doc_obj, $thissection) = @_;

    foreach my $size (split /,/, $self->{'first'}) {
        my $tmptext = $$textref;
        $tmptext =~ s/.*<body[^>]*>//i;
        $tmptext =~ s/$self->{'title_sub'}// if ($self->{'title_sub'});
        $tmptext =~ s/<[^>]*>/ /g;
        $tmptext =~ s/&nbsp;/ /g;
        $tmptext =~ s/^\s+//;
        $tmptext =~ s/\s+$//;
        $tmptext =~ s/\s+/ /gs;
        $tmptext = substr ($tmptext, 0, $size);
        $tmptext =~ s/\s\S*$/&#8230;/;
        $doc_obj->add_utf8_metadata ($thissection, "First$size", $tmptext);
    }
}

sub extract_metadata {
    my $self = shift (@_);
    my ($textref, $metadata, $doc_obj, $section) = @_;
    my $outhandle = $self->{'outhandle'};

    # if we don't want metadata, we may as well not be here ...
    return if (!defined $self->{'metadata_fields'});

    # hunt for an author: look in the metadata elements
    if (defined $self->{'hunt_creator_metadata'}) {
        for my $name (split /,/, "AUTHOR,AUTHOR.EMAIL,CREATOR,DC.CREATOR,DC.CREATOR.CORPORATENAME") {
            if ($$textref =~ /<meta(\s*?)(?:name|http-equiv)\s*=\s*\"?$name\"?([^>]*)/is) {
                my $content = $1 . $2;
                if ($content =~ /content\s*=\s*\"?(.*)\"?/is) {
                    if (defined $1) {
                        my $value = $1;
                        $value =~ s/\"$//;
                        $value =~ s/\s+/ /gs;
                        $doc_obj->add_utf8_metadata($section, "Creator", $value);
                        print $outhandle " extracted Creator metadata \"$value\"\n"
                            if ($self->{'verbosity'} > 2);
                        next;
                    }
                }
            }
        }
    }

    foreach my $field (split /,/, $self->{'metadata_fields'}) {

        # don't need to extract the field if it was passed in from a previous
        # (recursive) plugin
        next if defined $metadata->{$field};

        # see if there's a <meta> tag for this field
        if ($$textref =~ /<meta(\s*?)(?:name|http-equiv)\s*=\s*\"?$field\"?([^>]*)/is) {
            my $content = $1 . $2;
            if ($content =~ /content\s*=\s*\"?(.*)\"?/is) {
                if (defined $1) {
                    my $value = $1;
                    $value =~ s/\"$//;
                    $value =~ s/\s+/ /gs;
                    $value =~ s/\".*//gs;
                    $doc_obj->add_utf8_metadata($section, $field, $value);
                    print $outhandle " extracted \"$field\" metadata \"$value\"\n"
                        if ($self->{'verbosity'} > 2);
                    next;
                }
            }
        }

        # TITLE: extract the document title

        if ($field =~ /^title$/i) {

            # see if there's a <title> tag
            if ($$textref =~ /<title[^>]*>([^<]*)<\/title[^>]*>/is) {
                if (defined $1) {
                    my $title = $1;
                    if ($title =~ /\w/) {
                        $title =~ s/<[^>]*>/ /g;
                        $title =~ s/&nbsp;/ /g;
                        $title =~ s/\s+/ /gs;
                        $title =~ s/^\s+//;
                        $title =~ s/\s+$//;
                        $doc_obj->add_utf8_metadata ($section, $field, $title);
                        print $outhandle " extracted \"$field\" metadata \"$title\"\n"
                            if ($self->{'verbosity'} > 2);
                        next;
                    }
                }
            }

            # if there's no title use the first 100 characters
            my $tmptext = $$textref;
            $tmptext =~ s/<\/([^>]+)><\1>//g; # (e.g.) </b><b> - no space
            $tmptext =~ s/<[^>]*>/ /g;
            $tmptext =~ s/(?:&nbsp;|\xc2\xa0)/ /g; # utf-8 for &nbsp;...
            $tmptext =~ s/^\s+//s;
            $tmptext =~ s/\s+$//;
            $tmptext =~ s/\s+/ /gs;
            $tmptext =~ s/^$self->{'title_sub'}// if ($self->{'title_sub'});
            $tmptext =~ s/^\s+//s; # in case title_sub introduced any...
            $tmptext = substr ($tmptext, 0, 100);
            $tmptext =~ s/\s\S*$/.../;
            $doc_obj->add_utf8_metadata ($section, $field, $tmptext);
            print $outhandle " extracted \"$field\" metadata \"$tmptext\"\n"
                if ($self->{'verbosity'} > 2);
            next;
        }

        # tag<tagname>: extract the text between each <tagname> and </tagname>
        # pair (e.g. between <H1> and </H1>)
        if ($field =~ /^tag[a-z0-9]+$/i) {

            my $tag = $field;
            $tag =~ s/^tag//i;
            my $tmptext = $$textref;
            $tmptext =~ s/\s+/ /gs;
            if ($tmptext =~ /<$tag[^>]*>/i) {
                foreach my $word ($tmptext =~ m/<$tag[^>]*>(.*?)<\/$tag[^>]*>/g) {
                    $word =~ s/&nbsp;/ /g;
                    $word =~ s/<[^>]*>/ /g;
                    $word =~ s/^\s+//;
                    $word =~ s/\s+$//;
                    $word =~ s/\s+/ /gs;
                    if ($word ne "") {
                        $doc_obj->add_utf8_metadata ($section, $tag, $word);
                        print $outhandle " extracted \"$tag\" metadata \"$word\"\n"
                            if ($self->{'verbosity'} > 2);
                    }
                }
            }
            next;
        }
    }
}


# evaluate any "../" to the next directory up
# evaluate any "./" as here
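#
# e.g. (assuming "/" as the directory separator):
#   "tmp/dir1/dir2/../../file.html"  =>  "tmp/file.html"
#   "./a/./b/file.html"              =>  "a/b/file.html"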
sub eval_dir_dots {
    my $self = shift (@_);
    my ($filename) = @_;

    my $dirsep_os = &util::get_os_dirsep();
    my @dirsep = split(/$dirsep_os/, $filename);

    my @eval_dirs = ();
    foreach my $d (@dirsep) {
        if ($d eq "..") {
            pop(@eval_dirs);

        } elsif ($d eq ".") {
            # do nothing!

        } else {
            push(@eval_dirs, $d);
        }
    }

    return &util::filename_cat(@eval_dirs);
}

sub replace_usemap_links {
    my $self = shift (@_);
    my ($front, $link, $back) = @_;

    $link =~ s/^\.\///;
    return $front . $link . $back;
}

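# Counter used to name associated files when -rename_assoc_files is set:
# each file becomes <dir_num>/<file_num><ext> (e.g. 0/0.gif, 0/1.jpg, ...),
# with dir_num bumped after roughly a thousand files so that no single
# directory grows too large.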
sub inc_filecount {
    my $self = shift (@_);

    if ($self->{'file_num'} == 1000) {
        $self->{'dir_num'}++;
        $self->{'file_num'} = 0;
    } else {
        $self->{'file_num'}++;
    }
}


# Extend the BasPlug read_file so that strings like &eacute; are
# converted to UTF8 internally.
#
# We don't convert &lt; or &gt; or &amp; or &quot; in case
# they interfere with the GML files
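#
# e.g. "caf&eacute;" ends up as "café" (the UTF-8 bytes 0xC3 0xA9) in the
# text, while "&lt;b&gt;" is left untouched.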

sub read_file {
    my ($self, $filename, $encoding, $language, $textref) = @_;

    &BasPlug::read_file($self, $filename, $encoding, $language, $textref);

    # Convert entities to their UTF8 equivalents
    $$textref =~ s/&(lt|gt|amp|quot|nbsp);/&z$1;/go;
    $$textref =~ s/&([^;]+);/&ghtml::getcharequiv($1,1)/gseo;
    $$textref =~ s/&z(lt|gt|amp|quot|nbsp);/&$1;/go;
}

1;