source: trunk/gsdl/perllib/plugins/W3ImgPlug.pm@ 10347

Last change on this file since 10347 was 10347, checked in by kjdon, 19 years ago

removed the unneeded 'use parsargv'

  • Property svn:keywords set to Author Date Id Revision
File size: 36.8 KB
Line 
1###########################################################################
2#
3# W3ImgPlug.pm -- Context-based image indexing plugin for HTML documents
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 2001 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27# DESCRIPTION:
28#
29# Extracts images and associated text and metadata from
30# web pages as individual documents for indexing. Thumbnails
31# are created from each image for browsing purposes.
32#
33# Options are available for configuring the aggressiveness of the
34# associated text extraction mechanisms. A higher level of
35# aggressiveness will extract more text and consequently
36# may mean lower accuracy (precision); however, it may also
37# retrieve more of the relevant images from the collection (recall).
38# Lower levels of aggressiveness maybe result in slightly faster
39# collection builds at the import stage.
40#
41# W3ImgPlug is a subclass of HTMLPlug (i.e. it will index pages also
42# if required). It can be used in place of HTMLPlug to index both
43# pages and their images.
44#
45# REQUIREMENTS:
46#
47# The ImageMagick image manipulation is used to create
48# thumbnails and extract some image metadata. (Available
49# from http://www.imagemagick.org/)
50#
51# Unix:
52# Many Linux distributions contain ImageMagick.
53#
54# Windows:
55# ImageMagick can be downloaded from the website above.
56# Make sure the system path includes the ImageMagick binaries
57# before using W3ImgPlug.
58#
59# NOTE: NT/2000/XP contain a filesystem utility 'convert.exe'
60# with the same name as the image conversion utility. The
61# ImageMagick FAQ recommends renaming the filesystem
62# utility (e.g. to 'fsconvert.exe') to avoid this clash.
63#
64# USAGE:
65#
66# An image document consists of metadata elements:
67#
68# OriginalFilename, FilePath, Filename, FileExt, FileSize,
69# Width, Height, URL, PageURL, ThumbURL, CacheURL, CachePageURL
70# ImageText, PageTitle
71#
72# Most of these are only useful in format strings (e.g. ThumbURL,
73# Filename, URL, PageURL, CachePageURL).
74#
75# ImageText, as the name suggests contains the indexable text.
76# (unless using the -document_text plugin option)
77#
78# Since image documents are made up of metadata elements
79# alone, format strings are needed to display them properly.
80# NOTE: The receptionist will only display results (e.g. thumbnails)
81# in 4 columns if the format string begins with "<td><table>".
82#
83# The example below takes the user to the image within the
84# source HTML document rather than using a format string
85# on DocumentText to display the image document itself.
86#
87# Example collect.cfg:
88#
89# ...
90#
91# indexes document:ImageText document:text
92# defaultindex document:ImageText
93#
94# collectionmeta .document:ImageText "images"
95# collectionmeta .document:text "documents"
96#
97# ...
98#
99# plugin W3ImgPlug -index_pages -aggressiveness 6
100#
101# ...
102#
103# format SearchVList '<td>{If}{[Title],[link][icon]&nbsp;[Title][[/link],
104# <table><tr><td align="center"><a href="[CachePageURL]">
105# <img src="[ThumbURL]"></a></td></tr><tr><td align="center">
106# <a href="[CachePageURL]"><font size="-1">[OriginalFilename]</font></a>
107# <br>[Width]x[Height]</td></tr></table>}</td>'
108#
109# ...
110#
111
112package W3ImgPlug;
113
114use HTMLPlug;
115use ghtml;
116use unicode;
117use util;
118use strict; # 'subs';
119no strict 'refs'; # allow filehandles to be variables and viceversa
120
121sub BEGIN {
122 @W3ImgPlug::ISA = qw( HTMLPlug );
123}
124
125my $aggressiveness_list =
126 [ { 'name' => "1",
127 'desc' => "{W3ImgPlug.aggressiveness.1}" },
128 { 'name' => "2",
129 'desc' => "{W3ImgPlug.aggressiveness.2}" },
130 { 'name' => "3",
131 'desc' => "{W3ImgPlug.aggressiveness.3}" },
132 { 'name' => "4",
133 'desc' => "{W3ImgPlug.aggressiveness.4}" },
134 { 'name' => "5",
135 'desc' => "{W3ImgPlug.aggressiveness.5}" },
136 { 'name' => "6",
137 'desc' => "{W3ImgPlug.aggressiveness.6}" },
138 { 'name' => "7",
139 'desc' => "{W3ImgPlug.aggressiveness.7}" },
140 { 'name' => "8",
141 'desc' => "{W3ImgPlug.aggressiveness.8}" },
142 { 'name' => "9",
143 'desc' => "{W3ImgPlug.aggressiveness.9}" } ];
144
145my $arguments =
146 [ { 'name' => "aggressiveness",
147 'desc' => "{W3ImgPlug.aggressiveness}",
148 'type' => "int",
149 'list' => $aggressiveness_list,
150 'deft' => "3",
151 'reqd' => "no" },
152 { 'name' => "index_pages",
153 'desc' => "{W3ImgPlug.index_pages}",
154 'type' => "flag",
155 'reqd' => "no" },
156 { 'name' => "no_cache_images",
157 'desc' => "{W3ImgPlug.no_cache_images}",
158 'type' => "flag",
159 'reqd' => "no" },
160 { 'name' => "min_size",
161 'desc' => "{W3ImgPlug.min_size}",
162 'type' => "int",
163 'deft' => "2000",
164 'reqd' => "no" },
165 { 'name' => "min_width",
166 'desc' => "{W3ImgPlug.min_width}",
167 'type' => "int",
168 'deft' => "50",
169 'reqd' => "no" },
170 { 'name' => "min_height",
171 'desc' => "{W3ImgPlug.min_height}",
172 'type' => "int",
173 'deft' => "50",
174 'reqd' => "no" },
175 { 'name' => "thumb_size",
176 'desc' => "{W3ImgPlug.thumb_size}",
177 'type' => "int",
178 'deft' => "100",
179 'reqd' => "no" },
180 { 'name' => "convert_params",
181 'desc' => "{W3ImgPlug.convert_params}",
182 'type' => "string",
183 'deft' => "",
184 'reqd' => "no" },
185 { 'name' => "min_near_text",
186 'desc' => "{W3ImgPlug.min_near_text}",
187 'type' => "int",
188 'deft' => "10",
189 'reqd' => "no" },
190 { 'name' => "max_near_text",
191 'desc' => "{W3ImgPlug.max_near_text}",
192 'type' => "int",
193 'deft' => "400",
194 'reqd' => "no" },
195 { 'name' => "smallpage_threshold",
196 'desc' => "{W3ImgPlug.smallpage_threshold}",
197 'type' => "int",
198 'deft' => "2048",
199 'reqd' => "no" },
200 { 'name' => "textrefs_threshold",
201 'desc' => "{W3ImgPlug.textrefs_threshold}",
202 'type' => "int",
203 'deft' => "2",
204 'reqd' => "no" },
205 { 'name' => "caption_length",
206 'desc' => "{W3ImgPlug.caption_length}",
207 'type' => "int",
208 'deft' => "80",
209 'reqd' => "no" },
210 { 'name' => "neartext_length",
211 'desc' => "{W3ImgPlug.neartext_length}",
212 'type' => "int",
213 'deft' => "300",
214 'reqd' => "no" },
215 { 'name' => "document_text",
216 'desc' => "{W3ImgPlug.document_text}",
217 'type' => "flag",
218 'reqd' => "no" } ];
219
220my $options = { 'name' => "W3ImgPlug",
221 'desc' => "{W3ImgPlug.desc}",
222 'abstract' => "no",
223 'inherits' => "yes",
224 'args' => $arguments };
225
226sub new {
227 my ($class) = shift (@_);
228 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
229 push(@$pluginlist, $class);
230
231 if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}
232 if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};
233
234 my $self = (defined $hashArgOptLists)? new HTMLPlug($pluginlist,$inputargs,$hashArgOptLists): new HTMLPlug($pluginlist,$inputargs);
235
236 # init class variables
237 $self->{'textref'} = undef; # init by read_file fn
238 $self->{'htdoc_obj'} = undef; # init by process fn
239 $self->{'htpath'} = undef; # init by process fn
240 $self->{'hturl'} = undef; # init by process fn
241 $self->{'plaintext'} = undef; # HTML stripped version - only init if needed by raw_neartext sub
242 $self->{'smallpage'} = 0; # set by process fn
243 $self->{'images_indexed'} = undef; # num of images indexed - if 1 or 2 then we know page is small
244 $self->{'initialised'} = undef; # flag (see set_extraction_options())
245
246 return bless $self, $class;
247}
248
249# if indexing pages, let HTMLPlug do it's stuff
250# image extraction done through read()
251sub process {
252 my ($self, $textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
253 $self->{'imglist'} = ();
254 if ( $self->{'index_pages'} ) {
255 my $ok = $self->SUPER::process($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj);
256 if ( ! $ok ) { return $ok }
257 $self->{'htdoc_obj'} = $doc_obj;
258 }
259 # else use URL for referencing
260 #if ( $file =~ /(.*)[\/\\]/ ) { $self->{'htpath'} = $1; } else { $self->{'htpath'} = $file; }
261
262 $self->{'htpath'} = $base_dir if (-d $base_dir);
263 if ( $file =~ /(.*)[\/\\]/ ) { $self->{'htpath'} .= "/$1"; }
264 $self->{'htpath'} =~ s/\\/\//g; # replace \ with /
265
266 $self->{'hturl'} = "http://$file";
267 $self->{'hturl'} =~ s/\\/\//g; # for windows
268 ($self->{'filename'}) = $file =~ /.*[\/\\](.*)/;
269 ($self->{'base_path'}) = $file =~ /(.*)[\/\\]/i;
270 if ( ( -s "$base_dir/$file") <= $self->{'smallpage_threshold'} ) {
271 $self->{'smallpage'} = 1;
272 } else { $self->{'smallpage'} = 0; }
273
274 if ( defined($self->{'initialised'}) ) { return 1; }
275 else {
276 $self->{'initialised'} = $self->set_extraction_options($base_dir =~ /^(.*?)\/import/i);
277 return $self->{'initialised'};
278 }
279}
280
281# get complex configuration options from configuration files
282# -- $GSDLCOLLECTION/etc/W3ImgPlug.cfg (tag sets for aggr 2+)
283# -- $GSDLHOME/etc/packages/phind/stopword/en/brown.sw (stopwords for aggr 5+)
284
285# If there's no W3ImgPlug.cfg file we'll use the following default values
286my $defaultcfg = '
287<delimitertagset>
288 <setname>Caption</setname>
289 <taggroup>font</taggroup>
290 <taggroup>tt</taggroup>
291 <taggroup>small</taggroup>
292 <taggroup>b</taggroup>
293 <taggroup>i</taggroup>
294 <taggroup>u</taggroup>
295 <taggroup>em</taggroup>
296 <taggroup>td</taggroup>
297 <taggroup>li</taggroup>
298 <taggroup>a</taggroup>
299 <taggroup>p</taggroup>
300 <taggroup>tr</taggroup>
301 <taggroup>center</taggroup>
302 <taggroup>div</taggroup>
303 <taggroup>caption</taggroup>
304 <taggroup>br</taggroup>
305 <taggroup>ul</taggroup>
306 <taggroup>ol</taggroup>
307 <taggroup>table</taggroup>
308 <taggroup>hr</taggroup>
309</delimitertagset>
310
311<delimitertagset>
312 <setname>Neartext</setname>
313 <taggroup>tr|hr|table|h\d|img|body</taggroup>
314 <taggroup>td|tr|hr|table|h\d|img|body</taggroup>
315 <taggroup>p|br|td|tr|hr|table|h\d|img|body</taggroup>
316 <taggroup>font|p|i|b|em|img</taggroup>
317</delimitertagset>
318';
319
320sub set_extraction_options() {
321 my ($self, $collpath) = @_;
322 my ($filepath);
323
324 print {$self->{'outhandle'}} "W3ImgPlug: Initialising\n"
325 if $self->{'verbosity'} > 1;
326 # etc/W3ImgPlug.cfg (XML)
327 # tag sets for captions and neartext
328 if ( $self->{'aggressiveness'} > 1 && $self->{'aggressiveness'} != 9 ) {
329 $self->{'delims'} = [];
330 $self->{'cdelims'} = [];
331 my ($cfg, @tagsets, $tagset, $type, @delims);
332
333 $filepath = "$collpath/etc/W3ImgPlug.cfg";
334 if ( open CFG, "<$filepath" ) {
335 while (<CFG>) { $cfg .= $_ }
336 close CFG;
337 } else {
338 $cfg = $defaultcfg;
339 }
340
341 (@tagsets) =
342 $cfg =~ /<delimitertagset>(.*?)<\/delimitertagset>/igs;
343 foreach $tagset ( @tagsets ) {
344 ($type) = $tagset =~ /<setname>(.*?)<\/setname>/i;
345 if ( lc($type) eq "caption" ) {
346 (@{$self->{'cdelims'}}) = $tagset =~ /<taggroup>(.*?)<\/taggroup>/igs;
347 }
348 elsif ( lc($type) eq "neartext" ) {
349 (@{$self->{'delims'}}) = $tagset =~ /<taggroup>(.*?)<\/taggroup>/igs;
350 }
351 }
352
353 # output a warning if there seem to be no delimiters
354 if ( scalar(@{$self->{'cdelims'}} == 0)) {
355 print {$self->{'outhandle'}} "W3ImgPlug: Warning: no caption delimiters found in $filepath\n";
356 }
357 if ( scalar(@{$self->{'delims'}} == 0)) {
358 print {$self->{'outhandle'}} "W3ImgPlug: Warning: no neartext delimiters found in $filepath\n";
359 }
360 }
361
362 # get stop words for textual reference extraction
363 # TODO: warnings scroll off. Would be best to output them again at end of import
364 if ( $self->{'aggressiveness'} >=5 && $self->{'aggressiveness'} != 9 ) {
365 $self->{'stopwords'} = ();
366 $filepath = &util::filename_cat($ENV{'GSDLHOME'}, "etc", "packages", "phind", "stopword", "en", "brown.sw");
367 if ( open STOPWORDS, "<$filepath" ) {
368 while ( <STOPWORDS> ) {
369 chomp;
370 $self->{'stopwords'}{$_} = 1;
371 }
372 close STOPWORDS;
373 } else {
374 print {$self->{'outhandle'}} "W3ImgPlug: Warning: couldn't open stopwords file at $filepath ($!)\n";
375 }
376
377 }
378
379 if ( $self->{'neartext_length'} > $self->{'max_near_text'} ) {
380 $self->{'max_near_text'} = $self->{'neartext_length'} * 1.33;
381 print {$self->{'outhandle'}} "W3ImgPlug: Warning: adjusted max_text to $self->{'max_near_text'}\n";
382 }
383 if ( $self->{'caption_length'} > $self->{'max_near_text'} ) {
384 $self->{'max_near_text'} = $self->{'caption_length'} * 1.33;
385 print {$self->{'outhandle'}} "W3ImgPlug: Warning: adjusted max_text to $self->{'max_near_text'}\n";
386 }
387
388 return 1;
389}
390
391# return number of files processed, undef if can't recognise, -1 if
392# cant process
393# Note that $base_dir might be "" and that $file might
394# include directories
395sub read {
396 my ($self, $pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = (@_);
397 my ($doc_obj, $section, $filepath, $imgtag, $pos, $context, $numdocs, $tndir, $imgs);
398 # forward normal read (runs HTMLPlug if index_pages T)
399 my $ok = $self->SUPER::read($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli);
400 if ( ! $ok ) { return $ok } # what is this returning??
401
402 my $outhandle = $self->{'outhandle'};
403 my $textref = $self->{'textref'};
404 my $htdoc_obj = $self->{'htdoc_obj'};
405 $numdocs = 0;
406 $base_dir =~ /(.*)\/.*/;
407 $tndir = "$1/archives/thumbnails"; # TODO: this path shouldn't be hardcoded?
408 &util::mk_all_dir($tndir) unless -e "$tndir";
409
410 $imgs = \%{$self->{'imglist'}};
411 my $nimgs = $self->get_img_list($textref);
412 $self->{'images_indexed'} = $nimgs;
413 if ( $nimgs > 0 ) {
414 my @fplist = (sort { $imgs->{$a}{'pos'} <=> $imgs->{$b}{'pos'} } keys %{$imgs});
415 my $i = 0;
416 foreach $filepath ( @fplist ) {
417 $pos = $imgs->{$filepath}{'pos'};
418 $context = substr ($$textref, $pos - 50, $pos + 50); # grab context (quicker)
419 ($imgtag) = ($context =~ /(<(?:img|a|body)\s[^>]*$filepath[^>]*>)/is );
420 if (! defined($imgtag)) { $imgtag = $filepath }
421 print $outhandle "W3ImgPlug: extracting $filepath\n"
422 if ( $self->{'verbosity'} > 1 );
423 $doc_obj = new doc ("", "indexed_doc");
424 $section = $doc_obj->get_top_section();
425 my $prevpos = ( $i == 0 ? 0 : $imgs->{$fplist[$i - 1]}{'pos'});
426 my $nextpos = ( $i >= ($nimgs -1) ? -1 : $imgs->{$fplist[$i + 1]}{'pos'} );
427
428 $self->extract_image_info($imgtag, $filepath, $textref, $doc_obj, $section, $tndir, $prevpos, $nextpos);
429 $processor->process($doc_obj);
430 $numdocs++;
431 $i++;
432 }
433 return $numdocs;
434 } else {
435 print $outhandle "W3ImgPlug: No images from $file indexed\n"
436 if ( $self->{'verbosity'} > 2 );
437 return 1;
438 }
439
440}
441
442# for every valid image tag
443# 1. extract related text and image metadata
444# 2. add this as document meta-data
445# 3. add assoc image(s) as files
446#
447sub extract_image_info {
448 my $self = shift (@_);
449 my ($tag, $id, $textref, $doc_obj, $section, $tndir, $prevpos, $nextpos) = (@_);
450 my ($filename, $orig_fp, $fn, $ext, $reltext, $relreltext, $crcid, $imgs,
451 $thumbfp, $pagetitle, $alttext, $filepath, $aggr);
452 $aggr = $self->{'aggressiveness'};
453 $imgs = \%{$self->{'imglist'}};
454 $filepath = $imgs->{$id}{'relpath'};
455 ($filename) = $filepath =~ /([^\/\\]+)$/s;
456 ($orig_fp) = "$self->{'base_path'}/$filepath";
457 $orig_fp =~ tr/+/ /;
458 $orig_fp =~ s/%([a-fA-F0-9][a-fA-F0-9])/pack("C", hex($1))/eg; # translate %2E to space, etc
459 $orig_fp =~ s/\\/\//g;
460 $filepath = "$self->{'htpath'}/$filepath";
461 my ($onlyfn) = $filename =~ /([^\\\/]*)$/;
462 ($fn, $ext) = $onlyfn =~ /(.*)\.(.*)/;
463 $fn = lc $fn; $ext = lc $ext;
464 ($reltext) = "<tr><td>GifComment</td><td>" . `identify $filepath -ping -format "%c"` . "</td></tr>\n"
465 if ($ext eq "gif");
466 $reltext .= "<tr><td>FilePath</td><td>$orig_fp</td></tr>\n";
467
468 if ($ENV{'GSDLOS'} =~ /^windows$/i) {
469 $crcid = "$fn.$ext." . $self->{'next_crcid'}++;
470 } else { ($crcid) = `cksum $filepath` =~ /^(\d+)/; }
471 $thumbfp = "$tndir/tn_$crcid.jpg";
472 `convert -flatten -filter Hanning $self->{'convert_params'} -geometry "$self->{'thumb_size'}x$self->{'thumb_size'}>" $filepath $thumbfp` unless -e $thumbfp;
473 if ( ! (-e $thumbfp) ) {
474 print STDERR "W3ImgPlug: 'convert' failed. Check ImageMagicK binaries are installed and working correctly\n"; return 0;
475 }
476
477 # shove in full text (tag stripped or unstripped) if settings require it
478 if ( $aggr == 10) {
479 $reltext = "<tr><td>AllPage</td><td>" . $$textref . "</td><tr>\n"; # level 10 (all text, verbatim)
480 } else {
481 $pagetitle = $self->get_meta_value("title", $textref);
482 ($alttext) = $tag =~ /\salt\s*=\s*(?:\"|\')(.+?)(?:\"|\')/is;
483 if ( defined($alttext) && length($alttext) > 1) {
484 $reltext .= "<tr><td>ALTtext</td><td>$alttext</td></tr>\n"; }
485 $reltext .= "<tr><td>SplitCapitalisation</td><td>" .
486 $self->split_filepath($orig_fp) . "</td></tr>\n";
487
488 # get caption/tag based near text (if appropriate)
489 if ( $aggr > 1 ) {
490 if ( $aggr >= 2 ) {
491 $reltext .=
492 $self->extract_caption_text($tag, $textref, $prevpos, $imgs->{$id}{'pos'}, $nextpos);
493 $relreltext = $reltext;
494 }
495 # repeat the filepath, alt-text, caption, etc
496 if ( $aggr == 8 ) {
497 $reltext .= $relreltext;
498 }
499 if ( $aggr >= 3 ) {
500 $reltext .=
501 $self->extract_near_text($tag, $textref, $prevpos, $imgs->{$id}{'pos'}, $nextpos);
502 }
503
504 # get page metadata (if appropriate)
505 if ( $aggr >= 6 || ( $aggr >= 2 &&
506 ( $self->{'images_indexed'} < 2 ||
507 ($self->{'smallpage'} == 1 && $self->{'images_indexed'} < 6 )))) {
508 $reltext .= $self->get_page_metadata($textref);
509 }
510 # textual references
511 if ( $aggr == 5 || $aggr >= 7) {
512 if ( length($relreltext) > ($self->{'caption_length'} * 2) ) {
513 $reltext .= $self->get_textrefs($relreltext, $textref, $prevpos, $imgs->{$id}{'pos'}, $nextpos); }
514 else {
515 $reltext .= $self->get_textrefs($reltext, $textref, $prevpos, $imgs->{$id}{'pos'}, $nextpos);
516 }
517 }
518 } # aggr > 1
519 } # aggr != 10
520
521 $doc_obj->set_OID($crcid);
522 $doc_obj->associate_file($thumbfp, "$fn.thumb.jpg", undef, $section);
523 $doc_obj->add_metadata($section, "OriginalFilename", $filename);
524 $doc_obj->add_metadata($section, "FilePath", $orig_fp);
525 $doc_obj->add_metadata($section, "Filename", $fn);
526 $doc_obj->add_metadata($section, "FileExt", $ext);
527 $doc_obj->add_metadata($section, "FileSize", $imgs->{$id}{'filesize'});
528 $doc_obj->add_metadata($section, "Width", $imgs->{$id}{'width'});
529 $doc_obj->add_metadata($section, "Height", $imgs->{$id}{'height'});
530 $doc_obj->add_metadata($section, "URL", "http://$orig_fp");
531 $doc_obj->add_metadata($section, "PageURL", $self->{'hturl'});
532 $doc_obj->add_metadata($section, "PageTitle", $pagetitle);
533 $doc_obj->add_metadata($section, "ThumbURL",
534 "_httpcollection_/index/assoc/[archivedir]/$fn.thumb.jpg");
535 $doc_obj->add_metadata($section, "FileFormat", "W3Img");
536
537 if ( $self->{'document_text'} ) {
538 $doc_obj->add_utf8_text($section, "<table border=1>\n$reltext</table>");
539 } else {
540 $doc_obj->add_metadata($section, "ImageText", "<table border=1>\n$reltext</table>\n");
541 }
542
543 if ( $self->{'index_pages'} ) {
544 my ($cache_url) = "_httpdoc_&d=" . $self->{'htdoc_obj'}->get_OID();
545 if ( $imgs->{$id}{'anchored'} ) {
546 my $a_name = $id;
547 $a_name =~ s/[\/\\\:\&]/_/g;
548 $cache_url .= "#gsdl_$a_name" ;
549 }
550 $doc_obj->add_utf8_metadata($section, "CachePageURL", $cache_url);
551 }
552 if ( ! $self->{'no_cache_images'} ) {
553 $onlyfn = lc $onlyfn;
554 $doc_obj->associate_file($filepath, $onlyfn, undef, $section);
555 $doc_obj->add_utf8_metadata($section, "CacheURL",
556 "_httpcollection_/index/assoc/[archivedir]/$onlyfn");
557 }
558 return 1;
559}
560
561sub get_page_metadata {
562 my ($self, $textref) = (@_);
563 my (@rval);
564 $rval[0] = $self->get_meta_value("title", $textref);
565 $rval[1] = $self->get_meta_value("keywords", $textref);
566 $rval[2] = $self->get_meta_value("description", $textref);
567 $rval[3] = $self->{'filename'};
568
569 return wantarray ? @rval : "<tr><td>PageMeta</td><td>@rval</td></tr>\n" ;
570}
571
572# turns LargeCatFish into Large,Cat,Fish so MG sees the separate words
573sub split_filepath {
574 my ($self, $filepath) = (@_);
575 my (@words) = $filepath =~ /([A-Z][a-z]+)/g;
576 return join(',', @words);
577}
578
579# finds and extracts sentences
580# that seem to be on the same topic
581# as other related text (correlations)
582# and textual references (e.g. in figure 3 ...)
583sub get_textrefs {
584 my ($self, $reltext, $textref, $prevpos, $pos, $nextpos) = (@_);
585 my ($maxtext, $mintext, $startpos, $context_size, $context);
586
587 my (@relwords, @refwords, %sentences, @pagemeta);
588
589 # extract larger context
590 $maxtext = $self->{'max_near_text'};
591 $startpos = $pos - ($maxtext * 4);
592 $context_size = $maxtext*10;
593 if ($startpos < $prevpos ) { $startpos = $prevpos }
594 if ($nextpos != -1 && $context_size > ( $nextpos - $startpos )) { $context_size = ($nextpos - $startpos) }
595 $context = substr ( $$textref, $startpos, $context_size );
596 $context =~ s/<.*?>//gs;
597 $context =~ s/^.*>(.*)/$1/gs;
598 $context =~ s/(.*)<.*$/$1/gs;
599
600 # get page meta-data (if not already included)
601 if ( $self->{'aggressiveness'} == 5 && ! $self->{'smallpage'} ) {
602 @pagemeta = $self->get_page_metadata($textref);
603 foreach my $value ( @pagemeta ) {
604 $context .= "$value."; # make each into psuedo-sentence
605 }
606 }
607
608 # TODO: this list is not exhaustive
609 @refwords = ( '(?:is|are)? ?(?:show(?:s|n)|demonstrate(?:d|s)|explains|features) (?:in|by|below|above|here)',
610 '(?:see)? (?:figure|table)? (?:below|above)');
611
612 # extract general references
613 foreach my $rw ( @refwords ) {
614 while ( $context =~ /[\.\?\!\,](.*?$rw\W.*?[\.\?\!\,])/ig ) {
615 my $sentence = $1;
616 $sentence =~ s/\s+/ /g;
617 $sentences{$sentence}+=2;
618 }
619 }
620 # extract specific (figure, table) references by number
621 my ($fignum) = $context =~ /[\.\?\!].*?(?:figure|table)s?[\-\_\ \.](\d+\w*)\W.*?[\.\?\!]/ig;
622 if ( $fignum ) {
623 foreach my $rw ( @refwords ) {
624 while ( $context =~ /[\.\?\!](.*?(figure|table)[\-\_\ \.]$fignum\W.*?[\.\?\!])/ig ) {
625 my $sentence = $1;
626 $sentence =~ s/\s+/ /g;
627 $sentences{$sentence}+=4;
628 }
629 }
630 }
631
632 # sentences with occurances of important words
633 @relwords = $reltext =~ /([a-zA-Z]{4,})/g; # take out small words
634 foreach my $word ( @relwords ) {
635 if ( $self->{'stopwords'}{$word} ) { next } # skip stop words
636 while ( $context =~ /([^\.\?\!]*?$word\W.*?[\.\?\!])/ig ) {
637 my $sentence = $1;
638 $sentence =~ s/\s+/ /g;
639 $sentences{$sentence}++;
640 }
641 }
642 foreach my $sentence ( keys %sentences ) {
643 if ($sentences{$sentence} < $self->{'textrefs_threshold'}) {
644 delete $sentences{$sentence};
645 }
646 }
647 my ($rval) = join "<br>\n", (keys %sentences);
648 if ( $rval && length($rval) > 5 ) {
649 return ( "<tr><td>TextualReferences</td><td>" . $rval . "</td></tr>\n") }
650 else { return "" }
651}
652
653# handles caption extraction
654# calling the extractor with different
655# tags and choosing the best candidate caption
656sub extract_caption_text {
657 my ($self, $tag, $textref, $prevpos, $pos, $nextpos) = (@_);
658 my (@neartext, $len, $hdelim, $mintext, $goodlen,
659 $startpos, $context, $context_size);
660
661 $mintext = $self->{'min_near_text'};
662 $goodlen = $self->{'caption_length'};
663
664 # extract a context to extract near text from (faster)
665 $context_size = $self->{'max_near_text'}*3;
666 $startpos = $pos - ($context_size / 2);
667 if ($startpos < $prevpos ) { $startpos = $prevpos }
668 if ($nextpos != -1 && $context_size > ( $nextpos - $startpos ))
669 { $context_size = ($nextpos - $startpos) }
670
671 $context = substr ( $$textref, $startpos, $context_size );
672 $context =~ s/<!--.*?-->//gs;
673 $context =~ s/^.*-->(.*)/$1/gs;
674 $context =~ s/(.*)<!--.*$/$1/gs;
675
676 # try stepping through markup delimiter sets
677 # and selecting the best one
678 foreach $hdelim ( @{ $self->{'cdelims'} } ) {
679 @neartext = $self->extract_caption($tag, $hdelim, \$context);
680 $len = length(join("", @neartext));
681 last if ($len >= $mintext && $len <= $goodlen);
682 }
683 # reject if well over reasonable length
684 if ( $len > $goodlen ) {
685 @neartext = [];
686 }
687 $neartext[0] = " " if (! defined $neartext[0]);
688 $neartext[1] = " " if (! defined $neartext[1]);
689 return "<tr><td>Caption</td><td>" . (join ",", @neartext) . "</td></tr>\n"; # TODO: the | is for testing purposes
690} # end extract_caption_text
691
692# the previous section header often gives a bit
693# of context to the section that the image is
694# in (invariably the header is before/above the image)
695# so extract the text of the closest header above the image
696#
697# this fn just gets all the headers above the image, within the context window
698sub get_prev_header {
699 my ($self, $pos, $textref) = (@_);
700 my ($rhtext);
701 while ( $$textref =~ /<h\d>(.*?)<\/h\d>/sig ) {
702 # only headers before image
703 if ((pos $$textref) < $pos) {
704 $rhtext .= "$1, ";
705 }
706 }
707 if ( $rhtext ) { return "Header($rhtext)" }
708 else { return "" }
709}
710
711# not the most robust tag stripping
712# regexps (see perl.com FAQ) but good enough
713#
714# used by caption & tag-based near text algorithms
715sub strip_tags {
716 my ( $self, $value ) = @_;
717 if ( ! defined($value) ) { $value = "" } # handle nulls
718 else {
719 $value =~ s/<.*?>//gs; # strip all html tags
720 $value =~ s/\s+/\ /g; # remove extra whitespace
721 $value =~ s/\&\w+\;//g; # remove &nbsp; etc
722 }
723 return $value;
724}
725
726# uses the given tag(s) to identify
727# the caption near to the image
728# (below, above or both below and above)
729sub extract_caption {
730 my ($self, $tag, $bound_tag, $contextref) = (@_);
731 my (@nt, $n, $etag, $gotcap);
732 return ("", "") if ( ! ($$contextref =~ /\Q$tag/) );
733
734 $nt[0] = $`;
735 $nt[1] = $';
736 $gotcap = 0;
737
738 # look before the image for a boundary tag
739 ($etag, $nt[0]) = $nt[0] =~ /<($bound_tag)[\s]?.*?>(.*?)$/is;
740 # if bound_tag too far from the image, then prob not caption
741 # (note: have to allow for tags, so multiply by 3
742 if ( $etag && length($nt[0]) < ($self->{'caption_length'} * 3) ) {
743 if ( $nt[0] =~ /<\/$etag>/si ) {
744 # the whole caption is above the image: <tag>text</tag><img>
745 ($nt[0]) =~ /<(?:$etag)[\s]?.*?>(.*?)<\/$etag>/is;
746 $nt[0] = $self->strip_tags($nt[0]);
747 if ( length($nt[0]) > $self->{'min_near_text'} ) {
748 $gotcap = 1;
749 $nt[1] = "";
750 }
751
752 } elsif ( $nt[1] =~ /<\/$etag>/si) {
753 # the caption tag covers image: <tag>text?<img>text?</tag>
754 ($nt[1]) = $nt[1] =~ /(.*?)<\/$etag>/si;
755 $nt[0] = $self->strip_tags($nt[0] . $nt[1]);
756 if ( length($nt[0]) > $self->{'min_near_text'} ) {
757 $gotcap = 2;
758 $nt[1] = "";
759 }
760 }
761 }
762 # else try below the image
763 if ( ! $gotcap ) {
764 # the caption is after the image: <img><tag>text</tag>
765 ($etag, $nt[1]) = $nt[1] =~ /^.*?<($bound_tag)[\s]?.*?>(.*)/is;
766 if ( $etag && $nt[1] =~ /<\/$etag>/s) {
767 ($nt[1]) = $nt[1] =~ /(.*?)<\/$etag>/si;
768 $gotcap = 3;
769 $nt[0] = "";
770 $nt[1] = $self->strip_tags($nt[1]);
771 }
772 }
773 if ( ! $gotcap ) { $nt[0] = $nt[1] = "" }
774 else {
775 # strip part-tags
776 $nt[0] =~ s/^.*>//s;
777 $nt[1] =~ s/<.*$//s;
778 }
779 my ($type);
780 if ( $gotcap == 0 ) { return ("nocaption", "") }
781 elsif ( $gotcap == 1 ) { $type = "captionabove:" }
782 elsif ( $gotcap == 2 ) { $type = "captioncovering:" }
783 elsif ( $gotcap == 3 ) { $type = "captionbelow:" }
784 return ($type, $nt[0], $nt[1]);
785}
786
787# tag-based near text
788#
789# tries different tag sets
790# and chooses the best one
791sub extract_near_text {
792 my ($self, $tag, $textref, $prevpos, $pos, $nextpos) = (@_);
793 my (@neartext, $len, $hdelim, $maxtext, $mintext, $goodlen,
794 @bestlen, @best, $startpos, $context, $context_size,
795 $dist, $bdist, $best1, $i, $nt);
796 $bestlen[0] = $bestlen[1] = 0; $bestlen[2] = $bdist = 999999;
797 $best[0] = $best[1] = $best[2] = "";
798 $maxtext = $self->{'max_near_text'};
799 $mintext = $self->{'min_near_text'};
800 $goodlen = $self->{'neartext_length'};
801
802 # extract a context to extract near text from (faster)
803 $context_size = $maxtext*4;
804 $startpos = $pos - ($context_size / 2);
805 if ($startpos < $prevpos ) { $startpos = $prevpos }
806 if ($nextpos != -1 && $context_size > ( $nextpos - $startpos ))
807 { $context_size = ($nextpos - $startpos) }
808 $context = substr ( $$textref, $startpos, $context_size );
809 $context =~ s/<!--.*?-->//gs;
810 $context =~ s/^.*-->(.*)/$1/gs;
811 $context =~ s/(.*)<!--.*$/$1/gs;
812
813 # try stepping through markup delimiter sets
814 # and selecting the best one
815 foreach $hdelim ( @{ $self->{'delims'} } ) {
816 @neartext = $self->extract_tagged_neartext($tag, $hdelim, \$context);
817 $nt = join("", @neartext);
818 $len = length($nt);
819 # Priorities:
820 # 1. Greater than mintext
821 # 2. Less than maxtext
822 # 3. Closest to goodlen
823 if ( $len <= $goodlen && $len > $bestlen[0] ) {
824 $bestlen[0] = $len;
825 $best[0] = $hdelim;
826 } elsif ( $len >= $maxtext && $len < $bestlen[2] ) {
827 $bestlen[2] = $len;
828 $best[2] = $hdelim;
829 } elsif ( $len >= $bestlen[0] && $len <= $bestlen[2] ) {
830 $dist = abs($goodlen - $len);
831 if ( $dist < $bdist ) {
832 $bestlen[1] = $len;
833 $best[1] = $hdelim;
834 $bdist = $dist;
835 }
836 }
837 }
838 $best1 = 2;
839 foreach $i ( 0..2 ) {
840 if ( $bestlen[$i] == 999999 ) { $bestlen[$i] = 0 }
841 $dist = abs($goodlen - $bestlen[$i]);
842 if ( $bestlen[$i] > $mintext && $dist <= $bdist ) {
843 $best1 = $i;
844 $bdist = $dist;
845 }
846 }
847 @neartext = $self->extract_tagged_neartext($tag, $best[$best1], \$context);
848 if ( $bestlen[$best1] > $maxtext ) {
849 # truncate on word boundary if too much text
850 my $hmax = $maxtext / 2;
851 ($neartext[0]) = $neartext[0] =~ /([^\s]*.{1,$hmax})$/s;
852 ($neartext[1]) = $neartext[1] =~ /^(.{1,$hmax}[^\s]*)/s;
853 } elsif ( $bestlen[$best1] < $mintext ) {
854 # use plain text extraction if tags failed (e.g. usable tag outside context)
855 print {$self->{'outhandle'}} "W3ImgPlug: Fallback to plain-text extraction for $tag\n"
856 if $self->{'verbosity'} > 2;
857 $neartext[0] = "<tr><td>RawNeartext</td><td>" . $self->extract_raw_neartext($tag, $textref) . "</td></tr>";
858 $neartext[1] = "";
859 }
860 # get previous header if available
861 $neartext[0] .= "<br>\n" .
862 $self->get_prev_header($pos, \$context) if ( $self->{'aggressiveness'} >= 4 );
863 $neartext[0] = " " if (! defined $neartext[0]);
864 $neartext[1] = " " if (! defined $neartext[1]);
865
866 return "<tr><td>NearText</td><td>" . (join "|", @neartext) . "</td></tr>\n"; # TODO: the | is for testing purposes
867} # end extract_near_text
868
869# actually captures tag-based
870# near-text given a tag set
871sub extract_tagged_neartext {
872 my ($self, $tag, $bound_tag, $textref) = (@_);
873 return "" if ( ! ($$textref =~ /\Q$tag/) );
874 my (@nt, $delim, $pre_tag, $n);
875 $nt[0] = $`;
876 $nt[1] = $';
877
878 # get text after previous image tag
879 $nt[0] =~ s/.*<($bound_tag)[^>]*>(.*)/$2/is; # get rid of preceding text
880 if (defined($1)) { $delim = $1 }
881 $pre_tag = $bound_tag;
882
883 if (defined($delim)) {
884 # we want to try and use the end tag of the previous delimiter
885 # (put it on the front of the list)
886 $pre_tag =~ s/(^|\|)($delim)($|\|)//i; # take it out
887 $pre_tag =~ s/\|\|/\|/i; # replace || with |
888 $pre_tag = $delim . "|" . $pre_tag; # put it on the front
889 }
890
891 # get text before next image tag
892 $nt[1] =~ s/<\/?(?:$pre_tag)[^>]*>.*//is; # get rid of stuff after first delimiter
893
894 # process related text
895 for $n (0..1) {
896 if ( defined($nt[$n]) ) {
897 $nt[$n] =~ s/<.*?>//gs; # strip all html tags
898 $nt[$n] =~ s/\s+/\ /gs; # remove extra whitespace
899 $nt[$n] =~ s/\&\w+\;//sg; # remove &nbsp; etc
900 # strip part-tags
901 if ( $n == 0 ) { $nt[0] =~ s/^.*>//s }
902 if ( $n == 1 ) { $nt[1] =~ s/<.*$//s }
903 } else { $nt[$n] = ""; } # handle nulls
904 }
905 return @nt;
906}
907
908# this function is fall-back
909# if tags aren't suitable.
910#
911# extracts a fixed length of characters
912# either side of image tag (on word boundary)
913sub extract_raw_neartext {
914 my ($self, $tag, $textref) = (@_);
915 my ($rawtext, $startpos, $fp);
916 my $imgs = \%{$self->{'imglist'}};
917 ($fp) = $tag =~ /([\w\\\/]+\.(?:gif|jpe?g|png))/is;
918 if (! $fp) { return " " };
919 # if the cached, plain-text version isn't there, then create it
920 $self->init_plaintext($textref) unless defined($self->{'plaintext'});
921
922 # take the closest maxtext/2 characters
923 # either side of the tag (by word boundary)
924 return "" if ( ! exists $imgs->{$fp}{'rawpos'} );
925 $startpos = $imgs->{$fp}{'rawpos'} - (($self->{'max_near_text'} / 2) + 20);
926 if ( $startpos < 0 ) { $startpos = 0 }
927 $rawtext = substr $self->{'plaintext'}, $startpos, $self->{'max_near_text'} + 20;
928 $rawtext =~ s/\s\s/ /g;
929
930 return $rawtext;
931}
932
933# init plaintext variable for HTML-stripped version
934# (for full text index/raw assoc text extraction)
935sub init_plaintext {
936 my ($self, $textref) = (@_);
937 my ($page, $fp);
938 my $imgs = \%{$self->{'imglist'}};
939 $page = $$textref; # make a copy of original
940
941 # strip tags around image filenames so they don't get zapped
942 $page =~ s/<\w+\s+.*?([\w\/\\]+\.(?:gif|jpe?g|png))[^>]*>/\"$1\"/gsi;
943 $page =~ s/<.*?>//gs;
944 $page =~ s/&nbsp;/ /gs;
945 $page =~ s/&amp;/&/gs; #TODO: more &zzz; replacements (except &lt;, $gt;)
946
947 # get positions and strip images
948 while ( $page =~ /([^\s\'\"]+\.(jpe?g|gif|png))/ig ) {
949 $fp = $1;
950 if ( $imgs->{$fp}{'exists'} ) {
951 $imgs->{$fp}{'rawpos'} = pos $page;
952 }
953 $page =~ s/\"$fp\"//gs;
954 }
955 $self->{'plaintext'} = $page;
956}
957
958# finds and filters images based on size
959# (dimensions, height, filesize) and existence
960#
961# looks for image filenames (.jpg, .gif, etc)
962# and checks for existence on disk
963# (hence supports most JavaScript images)
964sub get_img_list {
965 my $self = shift (@_);
966 my ($textref) = (@_);
967 my ($filepath, $relpath, $abspath, $pos, $num, $width, $height, $filesize);
968 my $imgs = \%{$self->{'imglist'}};
969 while ( $$textref =~ /([^\s\'\"]+\.(jpe?g|gif|png))/ig ) {
970 $filepath = $1;
971 $pos = pos $$textref;
972 next if ( $imgs->{$filepath}{'relpath'} );
973 $relpath = $filepath;
974 $relpath =~ s/^http\:\/\///; # remove http:// in case we have mirrored it
975 $relpath =~ s/\\/\//g; # replace \ with /
976 $relpath =~ s/^\.\///s; # make "./filepath" into "filepath"
977 $imgs->{$filepath}{'relpath'} = $relpath;
978 $abspath = "$self->{'htpath'}/$relpath";
979
980 if (! -e $abspath) { next }
981
982 # can't modify real filepath var because it
983 # then can't be located in the page for tag recognition later
984 ($width, $height) =
985 `identify $abspath -ping -format "%wx%h"` =~ /^(\d*)x(\d*)$/m;
986 if (! ($width && $height)) {
987 print STDERR "W3ImgPlug: ($abspath) 'identify' failed. Check ImageMagicK binaries are installed and working correctly\n"; next;
988 }
989 $filesize = (-s $abspath);
990 if ( $filesize >= $self->{'min_size'}
991 && ( $width >= $self->{'min_width'} )
992 && ( $height >= $self->{'min_height'} ) ) {
993
994 $imgs->{$filepath}{'exists'} = 1;
995 $imgs->{$filepath}{'pos'} = $pos;
996 $imgs->{$filepath}{'width'} = $width;
997 $imgs->{$filepath}{'height'} = $height;
998 $imgs->{$filepath}{'filesize'} = $filesize;
999 } else {
1000 print {$self->{'outhandle'}} "W3ImgPlug: skipping $self->{'base_path'}/$relpath: $filesize, $width x $height\n"
1001 if $self->{'verbosity'} > 2;
1002 }
1003 }
1004 $num = 0;
1005 foreach my $i ( keys %{$imgs} ) {
1006 if ( $imgs->{$i}{'pos'} ) {
1007 $num++;
1008 } else { delete $imgs->{$i} }
1009 }
1010 return $num;
1011}
1012
1013# make the text available to the read function
1014# by making it an object variable
1015sub read_file {
1016 my ($self, $filename, $encoding, $language, $textref) = @_;
1017 $self->SUPER::read_file($filename, $encoding, $language, $textref);
1018
1019 # if HTMLplug has run through, then it will
1020 # have replaced references so we have to
1021 # make a copy of the text before processing
1022 if ( $self->{'index_pages'} ) {
1023 $self->{'text'} = $$textref;
1024 $self->{'textref'} = \($self->{'text'});
1025 } else {
1026 $self->{'textref'} = $textref;
1027 }
1028 $self->{'plaintext'} = undef;
1029}
1030
1031# HTMLPlug only extracts meta-data if it is specified in plugin options
1032# hence a special function to do it here
1033sub get_meta_value {
1034 my ($self, $name, $textref) = @_;
1035 my ($value);
1036 $name = lc $name;
1037 if ($name eq "title") {
1038 ($value) = $$textref =~ /<title>(.*?)<\/title>/is
1039 } else {
1040 my $qm = "(?:\"|\')";
1041 ($value) = $$textref =~ /<meta name\s*=\s*$qm?$name$qm?\s+content\s*=\s*$qm?(.*?)$qm?\s*>/is
1042 }
1043 $value = "" unless $value;
1044 return $value;
1045}
1046
1047# make filename an anchor reference
1048# so we can go straight to the image
1049# within the cached version of the source page
1050# (augment's HTMLPlug sub)
1051sub replace_images {
1052 my $self = shift (@_);
1053 my ($front, $link, $back, $base_dir,
1054 $file, $doc_obj, $section) = @_;
1055 $link =~ s/\"//g;
1056 my ($a_name) = $link;
1057 $a_name =~ s/[\/\\\:\&]/_/g;
1058 # keep a list so we don't repeat the same anchor
1059 if ( ! $self->{'imglist'}{$link}{'anchored'} ) {
1060 $front = "<a name=\"gsdl_$a_name\">$front";
1061 $back = "$back</a>";
1062 $self->{'imglist'}{$link}{'anchored'} = 1;
1063 }
1064 return $self->SUPER::replace_images($front, $link, $back, $base_dir,
1065 $file, $doc_obj, $section);
1066}
1067
10681;
Note: See TracBrowser for help on using the repository browser.