source: gsdl/trunk/perllib/plugins/W3ImagePlugin.pm@ 15918

Last change on this file since 15918 was 15872, checked in by kjdon, 16 years ago

plugin overhaul: plugins renamed to xxPlugin, and in some cases the names are made more sensible. They now use the new base plugins. Hopefully we have better code reuse. Some of the plugins still need work done as I didn't want to spend another month doing this before committing it. Alos, I haven't really tested anything yet...

  • Property svn:keywords set to Author Date Id Revision
File size: 37.0 KB
Line 
1###########################################################################
2#
3# W3ImagePlugin.pm -- Context-based image indexing plugin for HTML documents
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 2001 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27# DESCRIPTION:
28#
29# Extracts images and associated text and metadata from
30# web pages as individual documents for indexing. Thumbnails
31# are created from each image for browsing purposes.
32#
33# Options are available for configuring the aggressiveness of the
34# associated text extraction mechanisms. A higher level of
35# aggressiveness will extract more text and consequently
36# may mean lower accuracy (precision); however, it may also
37# retrieve more of the relevant images from the collection (recall).
38# Lower levels of aggressiveness maybe result in slightly faster
39# collection builds at the import stage.
40#
41# W3ImagePlugin is a subclass of HTMLPlug (i.e. it will index pages also
42# if required). It can be used in place of HTMLPlugin to index both
43# pages and their images.
44#
45# REQUIREMENTS:
46#
47# The ImageMagick image manipulation is used to create
48# thumbnails and extract some image metadata. (Available
49# from http://www.imagemagick.org/)
50#
51# Unix:
52# Many Linux distributions contain ImageMagick.
53#
54# Windows:
55# ImageMagick can be downloaded from the website above.
56# Make sure the system path includes the ImageMagick binaries
57# before using W3ImagePlugin.
58#
59# NOTE: NT/2000/XP contain a filesystem utility 'convert.exe'
60# with the same name as the image conversion utility. The
61# ImageMagick FAQ recommends renaming the filesystem
62# utility (e.g. to 'fsconvert.exe') to avoid this clash.
63#
64# USAGE:
65#
66# An image document consists of metadata elements:
67#
68# OriginalFilename, FilePath, Filename, FileExt, FileSize,
69# Width, Height, URL, PageURL, ThumbURL, CacheURL, CachePageURL
70# ImageText, PageTitle
71#
72# Most of these are only useful in format strings (e.g. ThumbURL,
73# Filename, URL, PageURL, CachePageURL).
74#
75# ImageText, as the name suggests contains the indexable text.
76# (unless using the -document_text plugin option)
77#
78# Since image documents are made up of metadata elements
79# alone, format strings are needed to display them properly.
80# NOTE: The receptionist will only display results (e.g. thumbnails)
81# in 4 columns if the format string begins with "<td><table>".
82#
83# The example below takes the user to the image within the
84# source HTML document rather than using a format string
85# on DocumentText to display the image document itself.
86#
87# Example collect.cfg:
88#
89# ...
90#
91# indexes document:ImageText document:text
92# defaultindex document:ImageText
93#
94# collectionmeta .document:ImageText "images"
95# collectionmeta .document:text "documents"
96#
97# ...
98#
99# plugin W3ImagePlugin -index_pages -aggressiveness 6
100#
101# ...
102#
103# format SearchVList '<td>{If}{[Title],[link][icon]&nbsp;[Title][[/link],
104# <table><tr><td align="center"><a href="[CachePageURL]">
105# <img src="[ThumbURL]"></a></td></tr><tr><td align="center">
106# <a href="[CachePageURL]"><font size="-1">[OriginalFilename]</font></a>
107# <br>[Width]x[Height]</td></tr></table>}</td>'
108#
109# ...
110#
111
112package W3ImagePlugin;
113
114use HTMLPlugin;
115use ghtml;
116use unicode;
117use util;
118use strict; # 'subs';
119no strict 'refs'; # allow filehandles to be variables and viceversa
120
121sub BEGIN {
122 @W3ImagePlugin::ISA = qw( HTMLPlugin );
123}
124
125my $aggressiveness_list =
126 [ { 'name' => "1",
127 'desc' => "{W3ImagePlugin.aggressiveness.1}" },
128 { 'name' => "2",
129 'desc' => "{W3ImagePlugin.aggressiveness.2}" },
130 { 'name' => "3",
131 'desc' => "{W3ImagePlugin.aggressiveness.3}" },
132 { 'name' => "4",
133 'desc' => "{W3ImagePlugin.aggressiveness.4}" },
134 { 'name' => "5",
135 'desc' => "{W3ImagePlugin.aggressiveness.5}" },
136 { 'name' => "6",
137 'desc' => "{W3ImagePlugin.aggressiveness.6}" },
138 { 'name' => "7",
139 'desc' => "{W3ImagePlugin.aggressiveness.7}" },
140 { 'name' => "8",
141 'desc' => "{W3ImagePlugin.aggressiveness.8}" },
142 { 'name' => "9",
143 'desc' => "{W3ImagePlugin.aggressiveness.9}" } ];
144
145my $arguments =
146 [ { 'name' => "aggressiveness",
147 'desc' => "{W3ImagePlugin.aggressiveness}",
148 'type' => "int",
149 'list' => $aggressiveness_list,
150 'deft' => "3",
151 'reqd' => "no" },
152 { 'name' => "index_pages",
153 'desc' => "{W3ImagePlugin.index_pages}",
154 'type' => "flag",
155 'reqd' => "no" },
156 { 'name' => "no_cache_images",
157 'desc' => "{W3ImagePlugin.no_cache_images}",
158 'type' => "flag",
159 'reqd' => "no" },
160 { 'name' => "min_size",
161 'desc' => "{W3ImagePlugin.min_size}",
162 'type' => "int",
163 'deft' => "2000",
164 'reqd' => "no" },
165 { 'name' => "min_width",
166 'desc' => "{W3ImagePlugin.min_width}",
167 'type' => "int",
168 'deft' => "50",
169 'reqd' => "no" },
170 { 'name' => "min_height",
171 'desc' => "{W3ImagePlugin.min_height}",
172 'type' => "int",
173 'deft' => "50",
174 'reqd' => "no" },
175 { 'name' => "thumb_size",
176 'desc' => "{W3ImagePlugin.thumb_size}",
177 'type' => "int",
178 'deft' => "100",
179 'reqd' => "no" },
180 { 'name' => "convert_params",
181 'desc' => "{W3ImagePlugin.convert_params}",
182 'type' => "string",
183 'deft' => "",
184 'reqd' => "no" },
185 { 'name' => "min_near_text",
186 'desc' => "{W3ImagePlugin.min_near_text}",
187 'type' => "int",
188 'deft' => "10",
189 'reqd' => "no" },
190 { 'name' => "max_near_text",
191 'desc' => "{W3ImagePlugin.max_near_text}",
192 'type' => "int",
193 'deft' => "400",
194 'reqd' => "no" },
195 { 'name' => "smallpage_threshold",
196 'desc' => "{W3ImagePlugin.smallpage_threshold}",
197 'type' => "int",
198 'deft' => "2048",
199 'reqd' => "no" },
200 { 'name' => "textrefs_threshold",
201 'desc' => "{W3ImagePlugin.textrefs_threshold}",
202 'type' => "int",
203 'deft' => "2",
204 'reqd' => "no" },
205 { 'name' => "caption_length",
206 'desc' => "{W3ImagePlugin.caption_length}",
207 'type' => "int",
208 'deft' => "80",
209 'reqd' => "no" },
210 { 'name' => "neartext_length",
211 'desc' => "{W3ImagePlugin.neartext_length}",
212 'type' => "int",
213 'deft' => "300",
214 'reqd' => "no" },
215 { 'name' => "document_text",
216 'desc' => "{W3ImagePlugin.document_text}",
217 'type' => "flag",
218 'reqd' => "no" } ];
219
220my $options = { 'name' => "W3ImagePlugin",
221 'desc' => "{W3ImagePlugin.desc}",
222 'abstract' => "no",
223 'inherits' => "yes",
224 'args' => $arguments };
225
226sub new {
227 my ($class) = shift (@_);
228 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
229 push(@$pluginlist, $class);
230
231 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
232 push(@{$hashArgOptLists->{"OptList"}},$options);
233
234 my $self = new HTMLPlugin($pluginlist, $inputargs, $hashArgOptLists);
235
236 # init class variables
237 $self->{'textref'} = undef; # init by read_file fn
238 $self->{'htdoc_obj'} = undef; # init by process fn
239 $self->{'htpath'} = undef; # init by process fn
240 $self->{'hturl'} = undef; # init by process fn
241 $self->{'plaintext'} = undef; # HTML stripped version - only init if needed by raw_neartext sub
242 $self->{'smallpage'} = 0; # set by process fn
243 $self->{'images_indexed'} = undef; # num of images indexed - if 1 or 2 then we know page is small
244 $self->{'initialised'} = undef; # flag (see set_extraction_options())
245
246 return bless $self, $class;
247}
248
249# if indexing pages, let HTMLPlugin do it's stuff
250# image extraction done through read()
251sub process {
252 my $self = shift(@_);
253 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
254 $self->{'imglist'} = ();
255 if ( $self->{'index_pages'} ) {
256 my $ok = $self->SUPER::process($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli);
257 if ( ! $ok ) { return $ok }
258 $self->{'htdoc_obj'} = $doc_obj;
259 }
260 # else use URL for referencing
261 #if ( $file =~ /(.*)[\/\\]/ ) { $self->{'htpath'} = $1; } else { $self->{'htpath'} = $file; }
262
263 $self->{'htpath'} = $base_dir if (-d $base_dir);
264 if ( $file =~ /(.*)[\/\\]/ ) { $self->{'htpath'} .= "/$1"; }
265 $self->{'htpath'} =~ s/\\/\//g; # replace \ with /
266
267 $self->{'hturl'} = "http://$file";
268 $self->{'hturl'} =~ s/\\/\//g; # for windows
269 ($self->{'filename'}) = $file =~ /.*[\/\\](.*)/;
270 ($self->{'base_path'}) = $file =~ /(.*)[\/\\]/i;
271 if ( ( -s "$base_dir/$file") <= $self->{'smallpage_threshold'} ) {
272 $self->{'smallpage'} = 1;
273 } else { $self->{'smallpage'} = 0; }
274
275 if ( defined($self->{'initialised'}) ) { return 1; }
276 else {
277 $self->{'initialised'} = $self->set_extraction_options($base_dir =~ /^(.*?)\/import/i);
278 return $self->{'initialised'};
279 }
280}
281
282# get complex configuration options from configuration files
283# -- $GSDLCOLLECTION/etc/W3ImagePlugin.cfg (tag sets for aggr 2+)
284# -- $GSDLHOME/etc/packages/phind/stopword/en/brown.sw (stopwords for aggr 5+)
285
286# If there's no W3ImagePlugin.cfg file we'll use the following default values
287my $defaultcfg = '
288<delimitertagset>
289 <setname>Caption</setname>
290 <taggroup>font</taggroup>
291 <taggroup>tt</taggroup>
292 <taggroup>small</taggroup>
293 <taggroup>b</taggroup>
294 <taggroup>i</taggroup>
295 <taggroup>u</taggroup>
296 <taggroup>em</taggroup>
297 <taggroup>td</taggroup>
298 <taggroup>li</taggroup>
299 <taggroup>a</taggroup>
300 <taggroup>p</taggroup>
301 <taggroup>tr</taggroup>
302 <taggroup>center</taggroup>
303 <taggroup>div</taggroup>
304 <taggroup>caption</taggroup>
305 <taggroup>br</taggroup>
306 <taggroup>ul</taggroup>
307 <taggroup>ol</taggroup>
308 <taggroup>table</taggroup>
309 <taggroup>hr</taggroup>
310</delimitertagset>
311
312<delimitertagset>
313 <setname>Neartext</setname>
314 <taggroup>tr|hr|table|h\d|img|body</taggroup>
315 <taggroup>td|tr|hr|table|h\d|img|body</taggroup>
316 <taggroup>p|br|td|tr|hr|table|h\d|img|body</taggroup>
317 <taggroup>font|p|i|b|em|img</taggroup>
318</delimitertagset>
319';
320
321sub set_extraction_options() {
322 my ($self, $collpath) = @_;
323 my ($filepath);
324
325 print {$self->{'outhandle'}} "W3ImagePlugin: Initialising\n"
326 if $self->{'verbosity'} > 1;
327 # etc/W3ImagePlugin.cfg (XML)
328 # tag sets for captions and neartext
329 if ( $self->{'aggressiveness'} > 1 && $self->{'aggressiveness'} != 9 ) {
330 $self->{'delims'} = [];
331 $self->{'cdelims'} = [];
332 my ($cfg, @tagsets, $tagset, $type, @delims);
333
334 $filepath = "$collpath/etc/W3ImagePlugin.cfg";
335 if ( open CFG, "<$filepath" ) {
336 while (<CFG>) { $cfg .= $_ }
337 close CFG;
338 } else {
339 $cfg = $defaultcfg;
340 }
341
342 (@tagsets) =
343 $cfg =~ /<delimitertagset>(.*?)<\/delimitertagset>/igs;
344 foreach $tagset ( @tagsets ) {
345 ($type) = $tagset =~ /<setname>(.*?)<\/setname>/i;
346 if ( lc($type) eq "caption" ) {
347 (@{$self->{'cdelims'}}) = $tagset =~ /<taggroup>(.*?)<\/taggroup>/igs;
348 }
349 elsif ( lc($type) eq "neartext" ) {
350 (@{$self->{'delims'}}) = $tagset =~ /<taggroup>(.*?)<\/taggroup>/igs;
351 }
352 }
353
354 # output a warning if there seem to be no delimiters
355 if ( scalar(@{$self->{'cdelims'}} == 0)) {
356 print {$self->{'outhandle'}} "W3ImagePlugin: Warning: no caption delimiters found in $filepath\n";
357 }
358 if ( scalar(@{$self->{'delims'}} == 0)) {
359 print {$self->{'outhandle'}} "W3ImagePlugin: Warning: no neartext delimiters found in $filepath\n";
360 }
361 }
362
363 # get stop words for textual reference extraction
364 # TODO: warnings scroll off. Would be best to output them again at end of import
365 if ( $self->{'aggressiveness'} >=5 && $self->{'aggressiveness'} != 9 ) {
366 $self->{'stopwords'} = ();
367 $filepath = &util::filename_cat($ENV{'GSDLHOME'}, "etc", "packages", "phind", "stopword", "en", "brown.sw");
368 if ( open STOPWORDS, "<$filepath" ) {
369 while ( <STOPWORDS> ) {
370 chomp;
371 $self->{'stopwords'}{$_} = 1;
372 }
373 close STOPWORDS;
374 } else {
375 print {$self->{'outhandle'}} "W3ImagePlugin: Warning: couldn't open stopwords file at $filepath ($!)\n";
376 }
377
378 }
379
380 if ( $self->{'neartext_length'} > $self->{'max_near_text'} ) {
381 $self->{'max_near_text'} = $self->{'neartext_length'} * 1.33;
382 print {$self->{'outhandle'}} "W3ImagePlugin: Warning: adjusted max_text to $self->{'max_near_text'}\n";
383 }
384 if ( $self->{'caption_length'} > $self->{'max_near_text'} ) {
385 $self->{'max_near_text'} = $self->{'caption_length'} * 1.33;
386 print {$self->{'outhandle'}} "W3ImagePlugin: Warning: adjusted max_text to $self->{'max_near_text'}\n";
387 }
388
389 return 1;
390}
391
392# return number of files processed, undef if can't recognise, -1 if
393# cant process
394# Note that $base_dir might be "" and that $file might
395# include directories
396sub read {
397 my ($self, $pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = (@_);
398 my ($doc_obj, $section, $filepath, $imgtag, $pos, $context, $numdocs, $tndir, $imgs);
399 # forward normal read (runs HTMLPlugin if index_pages T)
400 my $ok = $self->SUPER::read($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli);
401 if ( ! $ok ) { return $ok } # what is this returning??
402
403 my $outhandle = $self->{'outhandle'};
404 my $textref = $self->{'textref'};
405 my $htdoc_obj = $self->{'htdoc_obj'};
406 $numdocs = 0;
407 $base_dir =~ /(.*)\/.*/;
408 $tndir = "$1/archives/thumbnails"; # TODO: this path shouldn't be hardcoded?
409 &util::mk_all_dir($tndir) unless -e "$tndir";
410
411 $imgs = \%{$self->{'imglist'}};
412 my $nimgs = $self->get_img_list($textref);
413 $self->{'images_indexed'} = $nimgs;
414 if ( $nimgs > 0 ) {
415 my @fplist = (sort { $imgs->{$a}{'pos'} <=> $imgs->{$b}{'pos'} } keys %{$imgs});
416 my $i = 0;
417 foreach $filepath ( @fplist ) {
418 $pos = $imgs->{$filepath}{'pos'};
419 $context = substr ($$textref, $pos - 50, $pos + 50); # grab context (quicker)
420 ($imgtag) = ($context =~ /(<(?:img|a|body)\s[^>]*$filepath[^>]*>)/is );
421 if (! defined($imgtag)) { $imgtag = $filepath }
422 print $outhandle "W3ImagePlugin: extracting $filepath\n"
423 if ( $self->{'verbosity'} > 1 );
424 $doc_obj = new doc ("", "indexed_doc");
425 $section = $doc_obj->get_top_section();
426 my $prevpos = ( $i == 0 ? 0 : $imgs->{$fplist[$i - 1]}{'pos'});
427 my $nextpos = ( $i >= ($nimgs -1) ? -1 : $imgs->{$fplist[$i + 1]}{'pos'} );
428
429 $self->extract_image_info($imgtag, $filepath, $textref, $doc_obj, $section, $tndir, $prevpos, $nextpos);
430 $processor->process($doc_obj);
431 $numdocs++;
432 $i++;
433 }
434 return $numdocs;
435 } else {
436 print $outhandle "W3ImagePlugin: No images from $file indexed\n"
437 if ( $self->{'verbosity'} > 2 );
438 return 1;
439 }
440
441}
442
443# for every valid image tag
444# 1. extract related text and image metadata
445# 2. add this as document meta-data
446# 3. add assoc image(s) as files
447#
448sub extract_image_info {
449 my $self = shift (@_);
450 my ($tag, $id, $textref, $doc_obj, $section, $tndir, $prevpos, $nextpos) = (@_);
451 my ($filename, $orig_fp, $fn, $ext, $reltext, $relreltext, $crcid, $imgs,
452 $thumbfp, $pagetitle, $alttext, $filepath, $aggr);
453 $aggr = $self->{'aggressiveness'};
454 $imgs = \%{$self->{'imglist'}};
455 $filepath = $imgs->{$id}{'relpath'};
456 ($filename) = $filepath =~ /([^\/\\]+)$/s;
457 ($orig_fp) = "$self->{'base_path'}/$filepath";
458 $orig_fp =~ tr/+/ /;
459 $orig_fp =~ s/%([a-fA-F0-9][a-fA-F0-9])/pack("C", hex($1))/eg; # translate %2E to space, etc
460 $orig_fp =~ s/\\/\//g;
461 $filepath = "$self->{'htpath'}/$filepath";
462 my ($onlyfn) = $filename =~ /([^\\\/]*)$/;
463 ($fn, $ext) = $onlyfn =~ /(.*)\.(.*)/;
464 $fn = lc $fn; $ext = lc $ext;
465 ($reltext) = "<tr><td>GifComment</td><td>" . `identify $filepath -ping -format "%c"` . "</td></tr>\n"
466 if ($ext eq "gif");
467 $reltext .= "<tr><td>FilePath</td><td>$orig_fp</td></tr>\n";
468
469 if ($ENV{'GSDLOS'} =~ /^windows$/i) {
470 $crcid = "$fn.$ext." . $self->{'next_crcid'}++;
471 } else { ($crcid) = `cksum $filepath` =~ /^(\d+)/; }
472 $thumbfp = "$tndir/tn_$crcid.jpg";
473 `convert -flatten -filter Hanning $self->{'convert_params'} -geometry "$self->{'thumb_size'}x$self->{'thumb_size'}>" $filepath $thumbfp` unless -e $thumbfp;
474 if ( ! (-e $thumbfp) ) {
475 print STDERR "W3ImagePlugin: 'convert' failed. Check ImageMagicK binaries are installed and working correctly\n"; return 0;
476 }
477
478 # shove in full text (tag stripped or unstripped) if settings require it
479 if ( $aggr == 10) {
480 $reltext = "<tr><td>AllPage</td><td>" . $$textref . "</td><tr>\n"; # level 10 (all text, verbatim)
481 } else {
482 $pagetitle = $self->get_meta_value("title", $textref);
483 ($alttext) = $tag =~ /\salt\s*=\s*(?:\"|\')(.+?)(?:\"|\')/is;
484 if ( defined($alttext) && length($alttext) > 1) {
485 $reltext .= "<tr><td>ALTtext</td><td>$alttext</td></tr>\n"; }
486 $reltext .= "<tr><td>SplitCapitalisation</td><td>" .
487 $self->split_filepath($orig_fp) . "</td></tr>\n";
488
489 # get caption/tag based near text (if appropriate)
490 if ( $aggr > 1 ) {
491 if ( $aggr >= 2 ) {
492 $reltext .=
493 $self->extract_caption_text($tag, $textref, $prevpos, $imgs->{$id}{'pos'}, $nextpos);
494 $relreltext = $reltext;
495 }
496 # repeat the filepath, alt-text, caption, etc
497 if ( $aggr == 8 ) {
498 $reltext .= $relreltext;
499 }
500 if ( $aggr >= 3 ) {
501 $reltext .=
502 $self->extract_near_text($tag, $textref, $prevpos, $imgs->{$id}{'pos'}, $nextpos);
503 }
504
505 # get page metadata (if appropriate)
506 if ( $aggr >= 6 || ( $aggr >= 2 &&
507 ( $self->{'images_indexed'} < 2 ||
508 ($self->{'smallpage'} == 1 && $self->{'images_indexed'} < 6 )))) {
509 $reltext .= $self->get_page_metadata($textref);
510 }
511 # textual references
512 if ( $aggr == 5 || $aggr >= 7) {
513 if ( length($relreltext) > ($self->{'caption_length'} * 2) ) {
514 $reltext .= $self->get_textrefs($relreltext, $textref, $prevpos, $imgs->{$id}{'pos'}, $nextpos); }
515 else {
516 $reltext .= $self->get_textrefs($reltext, $textref, $prevpos, $imgs->{$id}{'pos'}, $nextpos);
517 }
518 }
519 } # aggr > 1
520 } # aggr != 10
521
522 $doc_obj->set_OID($crcid);
523 $doc_obj->associate_file($thumbfp, "$fn.thumb.jpg", undef, $section);
524 $doc_obj->add_metadata($section, "OriginalFilename", $filename);
525 $doc_obj->add_metadata($section, "FilePath", $orig_fp);
526 $doc_obj->add_metadata($section, "Filename", $fn);
527 $doc_obj->add_metadata($section, "FileExt", $ext);
528 $doc_obj->add_metadata($section, "FileSize", $imgs->{$id}{'filesize'});
529 $doc_obj->add_metadata($section, "Width", $imgs->{$id}{'width'});
530 $doc_obj->add_metadata($section, "Height", $imgs->{$id}{'height'});
531 $doc_obj->add_metadata($section, "URL", "http://$orig_fp");
532 $doc_obj->add_metadata($section, "PageURL", $self->{'hturl'});
533 $doc_obj->add_metadata($section, "PageTitle", $pagetitle);
534 $doc_obj->add_metadata($section, "ThumbURL",
535 "_httpprefix_/collect/[collection]/index/assoc/[archivedir]/$fn.thumb.jpg");
536 $doc_obj->add_metadata($section, "FileFormat", "W3Img");
537
538 if ( $self->{'document_text'} ) {
539 $doc_obj->add_utf8_text($section, "<table border=1>\n$reltext</table>");
540 } else {
541 $doc_obj->add_metadata($section, "ImageText", "<table border=1>\n$reltext</table>\n");
542 }
543
544 if ( $self->{'index_pages'} ) {
545 my ($cache_url) = "_httpdoc_&d=" . $self->{'htdoc_obj'}->get_OID();
546 if ( $imgs->{$id}{'anchored'} ) {
547 my $a_name = $id;
548 $a_name =~ s/[\/\\\:\&]/_/g;
549 $cache_url .= "#gsdl_$a_name" ;
550 }
551 $doc_obj->add_utf8_metadata($section, "CachePageURL", $cache_url);
552 }
553 if ( ! $self->{'no_cache_images'} ) {
554 $onlyfn = lc $onlyfn;
555 $doc_obj->associate_file($filepath, $onlyfn, undef, $section);
556 $doc_obj->add_utf8_metadata($section, "CacheURL",
557 "_httpprefix_/collect/[collection]/index/assoc/[archivedir]/$onlyfn");
558 }
559 return 1;
560}
561
562sub get_page_metadata {
563 my ($self, $textref) = (@_);
564 my (@rval);
565 $rval[0] = $self->get_meta_value("title", $textref);
566 $rval[1] = $self->get_meta_value("keywords", $textref);
567 $rval[2] = $self->get_meta_value("description", $textref);
568 $rval[3] = $self->{'filename'};
569
570 return wantarray ? @rval : "<tr><td>PageMeta</td><td>@rval</td></tr>\n" ;
571}
572
573# turns LargeCatFish into Large,Cat,Fish so MG sees the separate words
574sub split_filepath {
575 my ($self, $filepath) = (@_);
576 my (@words) = $filepath =~ /([A-Z][a-z]+)/g;
577 return join(',', @words);
578}
579
580# finds and extracts sentences
581# that seem to be on the same topic
582# as other related text (correlations)
583# and textual references (e.g. in figure 3 ...)
584sub get_textrefs {
585 my ($self, $reltext, $textref, $prevpos, $pos, $nextpos) = (@_);
586 my ($maxtext, $mintext, $startpos, $context_size, $context);
587
588 my (@relwords, @refwords, %sentences, @pagemeta);
589
590 # extract larger context
591 $maxtext = $self->{'max_near_text'};
592 $startpos = $pos - ($maxtext * 4);
593 $context_size = $maxtext*10;
594 if ($startpos < $prevpos ) { $startpos = $prevpos }
595 if ($nextpos != -1 && $context_size > ( $nextpos - $startpos )) { $context_size = ($nextpos - $startpos) }
596 $context = substr ( $$textref, $startpos, $context_size );
597 $context =~ s/<.*?>//gs;
598 $context =~ s/^.*>(.*)/$1/gs;
599 $context =~ s/(.*)<.*$/$1/gs;
600
601 # get page meta-data (if not already included)
602 if ( $self->{'aggressiveness'} == 5 && ! $self->{'smallpage'} ) {
603 @pagemeta = $self->get_page_metadata($textref);
604 foreach my $value ( @pagemeta ) {
605 $context .= "$value."; # make each into psuedo-sentence
606 }
607 }
608
609 # TODO: this list is not exhaustive
610 @refwords = ( '(?:is|are)? ?(?:show(?:s|n)|demonstrate(?:d|s)|explains|features) (?:in|by|below|above|here)',
611 '(?:see)? (?:figure|table)? (?:below|above)');
612
613 # extract general references
614 foreach my $rw ( @refwords ) {
615 while ( $context =~ /[\.\?\!\,](.*?$rw\W.*?[\.\?\!\,])/ig ) {
616 my $sentence = $1;
617 $sentence =~ s/\s+/ /g;
618 $sentences{$sentence}+=2;
619 }
620 }
621 # extract specific (figure, table) references by number
622 my ($fignum) = $context =~ /[\.\?\!].*?(?:figure|table)s?[\-\_\ \.](\d+\w*)\W.*?[\.\?\!]/ig;
623 if ( $fignum ) {
624 foreach my $rw ( @refwords ) {
625 while ( $context =~ /[\.\?\!](.*?(figure|table)[\-\_\ \.]$fignum\W.*?[\.\?\!])/ig ) {
626 my $sentence = $1;
627 $sentence =~ s/\s+/ /g;
628 $sentences{$sentence}+=4;
629 }
630 }
631 }
632
633 # sentences with occurances of important words
634 @relwords = $reltext =~ /([a-zA-Z]{4,})/g; # take out small words
635 foreach my $word ( @relwords ) {
636 if ( $self->{'stopwords'}{$word} ) { next } # skip stop words
637 while ( $context =~ /([^\.\?\!]*?$word\W.*?[\.\?\!])/ig ) {
638 my $sentence = $1;
639 $sentence =~ s/\s+/ /g;
640 $sentences{$sentence}++;
641 }
642 }
643 foreach my $sentence ( keys %sentences ) {
644 if ($sentences{$sentence} < $self->{'textrefs_threshold'}) {
645 delete $sentences{$sentence};
646 }
647 }
648 my ($rval) = join "<br>\n", (keys %sentences);
649 if ( $rval && length($rval) > 5 ) {
650 return ( "<tr><td>TextualReferences</td><td>" . $rval . "</td></tr>\n") }
651 else { return "" }
652}
653
654# handles caption extraction
655# calling the extractor with different
656# tags and choosing the best candidate caption
657sub extract_caption_text {
658 my ($self, $tag, $textref, $prevpos, $pos, $nextpos) = (@_);
659 my (@neartext, $len, $hdelim, $mintext, $goodlen,
660 $startpos, $context, $context_size);
661
662 $mintext = $self->{'min_near_text'};
663 $goodlen = $self->{'caption_length'};
664
665 # extract a context to extract near text from (faster)
666 $context_size = $self->{'max_near_text'}*3;
667 $startpos = $pos - ($context_size / 2);
668 if ($startpos < $prevpos ) { $startpos = $prevpos }
669 if ($nextpos != -1 && $context_size > ( $nextpos - $startpos ))
670 { $context_size = ($nextpos - $startpos) }
671
672 $context = substr ( $$textref, $startpos, $context_size );
673 $context =~ s/<!--.*?-->//gs;
674 $context =~ s/^.*-->(.*)/$1/gs;
675 $context =~ s/(.*)<!--.*$/$1/gs;
676
677 # try stepping through markup delimiter sets
678 # and selecting the best one
679 foreach $hdelim ( @{ $self->{'cdelims'} } ) {
680 @neartext = $self->extract_caption($tag, $hdelim, \$context);
681 $len = length(join("", @neartext));
682 last if ($len >= $mintext && $len <= $goodlen);
683 }
684 # reject if well over reasonable length
685 if ( $len > $goodlen ) {
686 @neartext = [];
687 }
688 $neartext[0] = " " if (! defined $neartext[0]);
689 $neartext[1] = " " if (! defined $neartext[1]);
690 return "<tr><td>Caption</td><td>" . (join ",", @neartext) . "</td></tr>\n"; # TODO: the | is for testing purposes
691} # end extract_caption_text
692
693# the previous section header often gives a bit
694# of context to the section that the image is
695# in (invariably the header is before/above the image)
696# so extract the text of the closest header above the image
697#
698# this fn just gets all the headers above the image, within the context window
699sub get_prev_header {
700 my ($self, $pos, $textref) = (@_);
701 my ($rhtext);
702 while ( $$textref =~ /<h\d>(.*?)<\/h\d>/sig ) {
703 # only headers before image
704 if ((pos $$textref) < $pos) {
705 $rhtext .= "$1, ";
706 }
707 }
708 if ( $rhtext ) { return "Header($rhtext)" }
709 else { return "" }
710}
711
712# not the most robust tag stripping
713# regexps (see perl.com FAQ) but good enough
714#
715# used by caption & tag-based near text algorithms
716sub strip_tags {
717 my ( $self, $value ) = @_;
718 if ( ! defined($value) ) { $value = "" } # handle nulls
719 else {
720 $value =~ s/<.*?>//gs; # strip all html tags
721 $value =~ s/\s+/\ /g; # remove extra whitespace
722 $value =~ s/\&\w+\;//g; # remove &nbsp; etc
723 }
724 return $value;
725}
726
727# uses the given tag(s) to identify
728# the caption near to the image
729# (below, above or both below and above)
730sub extract_caption {
731 my ($self, $tag, $bound_tag, $contextref) = (@_);
732 my (@nt, $n, $etag, $gotcap);
733 return ("", "") if ( ! ($$contextref =~ /\Q$tag/) );
734
735 $nt[0] = $`;
736 $nt[1] = $';
737 $gotcap = 0;
738
739 # look before the image for a boundary tag
740 ($etag, $nt[0]) = $nt[0] =~ /<($bound_tag)[\s]?.*?>(.*?)$/is;
741 # if bound_tag too far from the image, then prob not caption
742 # (note: have to allow for tags, so multiply by 3
743 if ( $etag && length($nt[0]) < ($self->{'caption_length'} * 3) ) {
744 if ( $nt[0] =~ /<\/$etag>/si ) {
745 # the whole caption is above the image: <tag>text</tag><img>
746 ($nt[0]) =~ /<(?:$etag)[\s]?.*?>(.*?)<\/$etag>/is;
747 $nt[0] = $self->strip_tags($nt[0]);
748 if ( length($nt[0]) > $self->{'min_near_text'} ) {
749 $gotcap = 1;
750 $nt[1] = "";
751 }
752
753 } elsif ( $nt[1] =~ /<\/$etag>/si) {
754 # the caption tag covers image: <tag>text?<img>text?</tag>
755 ($nt[1]) = $nt[1] =~ /(.*?)<\/$etag>/si;
756 $nt[0] = $self->strip_tags($nt[0] . $nt[1]);
757 if ( length($nt[0]) > $self->{'min_near_text'} ) {
758 $gotcap = 2;
759 $nt[1] = "";
760 }
761 }
762 }
763 # else try below the image
764 if ( ! $gotcap ) {
765 # the caption is after the image: <img><tag>text</tag>
766 ($etag, $nt[1]) = $nt[1] =~ /^.*?<($bound_tag)[\s]?.*?>(.*)/is;
767 if ( $etag && $nt[1] =~ /<\/$etag>/s) {
768 ($nt[1]) = $nt[1] =~ /(.*?)<\/$etag>/si;
769 $gotcap = 3;
770 $nt[0] = "";
771 $nt[1] = $self->strip_tags($nt[1]);
772 }
773 }
774 if ( ! $gotcap ) { $nt[0] = $nt[1] = "" }
775 else {
776 # strip part-tags
777 $nt[0] =~ s/^.*>//s;
778 $nt[1] =~ s/<.*$//s;
779 }
780 my ($type);
781 if ( $gotcap == 0 ) { return ("nocaption", "") }
782 elsif ( $gotcap == 1 ) { $type = "captionabove:" }
783 elsif ( $gotcap == 2 ) { $type = "captioncovering:" }
784 elsif ( $gotcap == 3 ) { $type = "captionbelow:" }
785 return ($type, $nt[0], $nt[1]);
786}
787
788# tag-based near text
789#
790# tries different tag sets
791# and chooses the best one
792sub extract_near_text {
793 my ($self, $tag, $textref, $prevpos, $pos, $nextpos) = (@_);
794 my (@neartext, $len, $hdelim, $maxtext, $mintext, $goodlen,
795 @bestlen, @best, $startpos, $context, $context_size,
796 $dist, $bdist, $best1, $i, $nt);
797 $bestlen[0] = $bestlen[1] = 0; $bestlen[2] = $bdist = 999999;
798 $best[0] = $best[1] = $best[2] = "";
799 $maxtext = $self->{'max_near_text'};
800 $mintext = $self->{'min_near_text'};
801 $goodlen = $self->{'neartext_length'};
802
803 # extract a context to extract near text from (faster)
804 $context_size = $maxtext*4;
805 $startpos = $pos - ($context_size / 2);
806 if ($startpos < $prevpos ) { $startpos = $prevpos }
807 if ($nextpos != -1 && $context_size > ( $nextpos - $startpos ))
808 { $context_size = ($nextpos - $startpos) }
809 $context = substr ( $$textref, $startpos, $context_size );
810 $context =~ s/<!--.*?-->//gs;
811 $context =~ s/^.*-->(.*)/$1/gs;
812 $context =~ s/(.*)<!--.*$/$1/gs;
813
814 # try stepping through markup delimiter sets
815 # and selecting the best one
816 foreach $hdelim ( @{ $self->{'delims'} } ) {
817 @neartext = $self->extract_tagged_neartext($tag, $hdelim, \$context);
818 $nt = join("", @neartext);
819 $len = length($nt);
820 # Priorities:
821 # 1. Greater than mintext
822 # 2. Less than maxtext
823 # 3. Closest to goodlen
824 if ( $len <= $goodlen && $len > $bestlen[0] ) {
825 $bestlen[0] = $len;
826 $best[0] = $hdelim;
827 } elsif ( $len >= $maxtext && $len < $bestlen[2] ) {
828 $bestlen[2] = $len;
829 $best[2] = $hdelim;
830 } elsif ( $len >= $bestlen[0] && $len <= $bestlen[2] ) {
831 $dist = abs($goodlen - $len);
832 if ( $dist < $bdist ) {
833 $bestlen[1] = $len;
834 $best[1] = $hdelim;
835 $bdist = $dist;
836 }
837 }
838 }
839 $best1 = 2;
840 foreach $i ( 0..2 ) {
841 if ( $bestlen[$i] == 999999 ) { $bestlen[$i] = 0 }
842 $dist = abs($goodlen - $bestlen[$i]);
843 if ( $bestlen[$i] > $mintext && $dist <= $bdist ) {
844 $best1 = $i;
845 $bdist = $dist;
846 }
847 }
848 @neartext = $self->extract_tagged_neartext($tag, $best[$best1], \$context);
849 if ( $bestlen[$best1] > $maxtext ) {
850 # truncate on word boundary if too much text
851 my $hmax = $maxtext / 2;
852 ($neartext[0]) = $neartext[0] =~ /([^\s]*.{1,$hmax})$/s;
853 ($neartext[1]) = $neartext[1] =~ /^(.{1,$hmax}[^\s]*)/s;
854 } elsif ( $bestlen[$best1] < $mintext ) {
855 # use plain text extraction if tags failed (e.g. usable tag outside context)
856 print {$self->{'outhandle'}} "W3ImagePlugin: Fallback to plain-text extraction for $tag\n"
857 if $self->{'verbosity'} > 2;
858 $neartext[0] = "<tr><td>RawNeartext</td><td>" . $self->extract_raw_neartext($tag, $textref) . "</td></tr>";
859 $neartext[1] = "";
860 }
861 # get previous header if available
862 $neartext[0] .= "<br>\n" .
863 $self->get_prev_header($pos, \$context) if ( $self->{'aggressiveness'} >= 4 );
864 $neartext[0] = " " if (! defined $neartext[0]);
865 $neartext[1] = " " if (! defined $neartext[1]);
866
867 return "<tr><td>NearText</td><td>" . (join "|", @neartext) . "</td></tr>\n"; # TODO: the | is for testing purposes
868} # end extract_near_text
869
870# actually captures tag-based
871# near-text given a tag set
872sub extract_tagged_neartext {
873 my ($self, $tag, $bound_tag, $textref) = (@_);
874 return "" if ( ! ($$textref =~ /\Q$tag/) );
875 my (@nt, $delim, $pre_tag, $n);
876 $nt[0] = $`;
877 $nt[1] = $';
878
879 # get text after previous image tag
880 $nt[0] =~ s/.*<($bound_tag)[^>]*>(.*)/$2/is; # get rid of preceding text
881 if (defined($1)) { $delim = $1 }
882 $pre_tag = $bound_tag;
883
884 if (defined($delim)) {
885 # we want to try and use the end tag of the previous delimiter
886 # (put it on the front of the list)
887 $pre_tag =~ s/(^|\|)($delim)($|\|)//i; # take it out
888 $pre_tag =~ s/\|\|/\|/i; # replace || with |
889 $pre_tag = $delim . "|" . $pre_tag; # put it on the front
890 }
891
892 # get text before next image tag
893 $nt[1] =~ s/<\/?(?:$pre_tag)[^>]*>.*//is; # get rid of stuff after first delimiter
894
895 # process related text
896 for $n (0..1) {
897 if ( defined($nt[$n]) ) {
898 $nt[$n] =~ s/<.*?>//gs; # strip all html tags
899 $nt[$n] =~ s/\s+/\ /gs; # remove extra whitespace
900 $nt[$n] =~ s/\&\w+\;//sg; # remove &nbsp; etc
901 # strip part-tags
902 if ( $n == 0 ) { $nt[0] =~ s/^.*>//s }
903 if ( $n == 1 ) { $nt[1] =~ s/<.*$//s }
904 } else { $nt[$n] = ""; } # handle nulls
905 }
906 return @nt;
907}
908
909# this function is fall-back
910# if tags aren't suitable.
911#
912# extracts a fixed length of characters
913# either side of image tag (on word boundary)
914sub extract_raw_neartext {
915 my ($self, $tag, $textref) = (@_);
916 my ($rawtext, $startpos, $fp);
917 my $imgs = \%{$self->{'imglist'}};
918 ($fp) = $tag =~ /([\w\\\/]+\.(?:gif|jpe?g|png))/is;
919 if (! $fp) { return " " };
920 # if the cached, plain-text version isn't there, then create it
921 $self->init_plaintext($textref) unless defined($self->{'plaintext'});
922
923 # take the closest maxtext/2 characters
924 # either side of the tag (by word boundary)
925 return "" if ( ! exists $imgs->{$fp}{'rawpos'} );
926 $startpos = $imgs->{$fp}{'rawpos'} - (($self->{'max_near_text'} / 2) + 20);
927 if ( $startpos < 0 ) { $startpos = 0 }
928 $rawtext = substr $self->{'plaintext'}, $startpos, $self->{'max_near_text'} + 20;
929 $rawtext =~ s/\s\s/ /g;
930
931 return $rawtext;
932}
933
934# init plaintext variable for HTML-stripped version
935# (for full text index/raw assoc text extraction)
936sub init_plaintext {
937 my ($self, $textref) = (@_);
938 my ($page, $fp);
939 my $imgs = \%{$self->{'imglist'}};
940 $page = $$textref; # make a copy of original
941
942 # strip tags around image filenames so they don't get zapped
943 $page =~ s/<\w+\s+.*?([\w\/\\]+\.(?:gif|jpe?g|png))[^>]*>/\"$1\"/gsi;
944 $page =~ s/<.*?>//gs;
945 $page =~ s/&nbsp;/ /gs;
946 $page =~ s/&amp;/&/gs; #TODO: more &zzz; replacements (except &lt;, $gt;)
947
948 # get positions and strip images
949 while ( $page =~ /([^\s\'\"]+\.(jpe?g|gif|png))/ig ) {
950 $fp = $1;
951 if ( $imgs->{$fp}{'exists'} ) {
952 $imgs->{$fp}{'rawpos'} = pos $page;
953 }
954 $page =~ s/\"$fp\"//gs;
955 }
956 $self->{'plaintext'} = $page;
957}
958
959# finds and filters images based on size
960# (dimensions, height, filesize) and existence
961#
962# looks for image filenames (.jpg, .gif, etc)
963# and checks for existence on disk
964# (hence supports most JavaScript images)
965sub get_img_list {
966 my $self = shift (@_);
967 my ($textref) = (@_);
968 my ($filepath, $relpath, $abspath, $pos, $num, $width, $height, $filesize);
969 my $imgs = \%{$self->{'imglist'}};
970 while ( $$textref =~ /([^\s\'\"]+\.(jpe?g|gif|png))/ig ) {
971 $filepath = $1;
972 $pos = pos $$textref;
973 next if ( $imgs->{$filepath}{'relpath'} );
974 $relpath = $filepath;
975 $relpath =~ s/^http\:\/\///; # remove http:// in case we have mirrored it
976 $relpath =~ s/\\/\//g; # replace \ with /
977 $relpath =~ s/^\.\///s; # make "./filepath" into "filepath"
978 $imgs->{$filepath}{'relpath'} = $relpath;
979 $abspath = "$self->{'htpath'}/$relpath";
980
981 if (! -e $abspath) { next }
982
983 # can't modify real filepath var because it
984 # then can't be located in the page for tag recognition later
985 ($width, $height) =
986 `identify $abspath -ping -format "%wx%h"` =~ /^(\d*)x(\d*)$/m;
987 if (! ($width && $height)) {
988 print STDERR "W3ImagePlugin: ($abspath) 'identify' failed. Check ImageMagicK binaries are installed and working correctly\n"; next;
989 }
990 $filesize = (-s $abspath);
991 if ( $filesize >= $self->{'min_size'}
992 && ( $width >= $self->{'min_width'} )
993 && ( $height >= $self->{'min_height'} ) ) {
994
995 $imgs->{$filepath}{'exists'} = 1;
996 $imgs->{$filepath}{'pos'} = $pos;
997 $imgs->{$filepath}{'width'} = $width;
998 $imgs->{$filepath}{'height'} = $height;
999 $imgs->{$filepath}{'filesize'} = $filesize;
1000 } else {
1001 print {$self->{'outhandle'}} "W3ImagePlugin: skipping $self->{'base_path'}/$relpath: $filesize, $width x $height\n"
1002 if $self->{'verbosity'} > 2;
1003 }
1004 }
1005 $num = 0;
1006 foreach my $i ( keys %{$imgs} ) {
1007 if ( $imgs->{$i}{'pos'} ) {
1008 $num++;
1009 } else { delete $imgs->{$i} }
1010 }
1011 return $num;
1012}
1013
1014# make the text available to the read function
1015# by making it an object variable
1016sub read_file {
1017 my ($self, $filename, $encoding, $language, $textref) = @_;
1018 $self->SUPER::read_file($filename, $encoding, $language, $textref);
1019
1020 # if HTMLplug has run through, then it will
1021 # have replaced references so we have to
1022 # make a copy of the text before processing
1023 if ( $self->{'index_pages'} ) {
1024 $self->{'text'} = $$textref;
1025 $self->{'textref'} = \($self->{'text'});
1026 } else {
1027 $self->{'textref'} = $textref;
1028 }
1029 $self->{'plaintext'} = undef;
1030}
1031
1032# HTMLPlugin only extracts meta-data if it is specified in plugin options
1033# hence a special function to do it here
1034sub get_meta_value {
1035 my ($self, $name, $textref) = @_;
1036 my ($value);
1037 $name = lc $name;
1038 if ($name eq "title") {
1039 ($value) = $$textref =~ /<title>(.*?)<\/title>/is
1040 } else {
1041 my $qm = "(?:\"|\')";
1042 ($value) = $$textref =~ /<meta name\s*=\s*$qm?$name$qm?\s+content\s*=\s*$qm?(.*?)$qm?\s*>/is
1043 }
1044 $value = "" unless $value;
1045 return $value;
1046}
1047
1048# make filename an anchor reference
1049# so we can go straight to the image
1050# within the cached version of the source page
1051# (augment's HTMLPlugin sub)
1052sub replace_images {
1053 my $self = shift (@_);
1054 my ($front, $link, $back, $base_dir,
1055 $file, $doc_obj, $section) = @_;
1056 $link =~ s/\"//g;
1057 my ($a_name) = $link;
1058 $a_name =~ s/[\/\\\:\&]/_/g;
1059 # keep a list so we don't repeat the same anchor
1060 if ( ! $self->{'imglist'}{$link}{'anchored'} ) {
1061 $front = "<a name=\"gsdl_$a_name\">$front";
1062 $back = "$back</a>";
1063 $self->{'imglist'}{$link}{'anchored'} = 1;
1064 }
1065 return $self->SUPER::replace_images($front, $link, $back, $base_dir,
1066 $file, $doc_obj, $section);
1067}
1068
10691;
Note: See TracBrowser for help on using the repository browser.