source: trunk/gsdl/perllib/plugins/W3ImgPlug.pm@ 2996

Last change on this file since 2996 was 2996, checked in by sjboddie, 22 years ago

* empty log message *

  • Property svn:keywords set to Author Date Id Revision
File size: 37.9 KB
Line 
1###########################################################################
2#
3# W3ImgPlug.pm -- Context-based image indexing plugin for HTML documents
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 2001 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27# DESCRIPTION:
28#
29# Extracts images and associated text and metadata from
30# web pages as individual documents for indexing. Thumbnails
31# are created from each image for browsing purposes.
32#
33# Options are available for configuring the aggressiveness of the
34# associated text extraction mechanisms. A higher level of
35# aggressiveness will extract more text and consequently
36# may mean lower accuracy (precision); however, it may also
37# retrieve more of the relevant images from the collection (recall).
38# Lower levels of aggressiveness maybe result in slightly faster
39# collection builds at the import stage.
40#
41# W3ImgPlug is a subclass of HTMLPlug (i.e. it will index pages also
42# if required). It can be used in place of HTMLPlug to index both
43# pages and their images.
44#
45# REQUIREMENTS:
46#
47# The ImageMagick image manipulation is used to create
48# thumbnails and extract some image metadata. (Available
49# from http://www.imagemagick.org/)
50#
51# Unix:
52# Many Linux distributions contain ImageMagick.
53#
54# Windows:
55# ImageMagick can be downloaded from the website above.
56# Make sure the system path includes the ImageMagick binaries
57# before using W3ImgPlug.
58#
59# NOTE: NT/2000/XP contain a filesystem utility 'convert.exe'
60# with the same name as the image conversion utility. The
61# ImageMagick FAQ recommends renaming the filesystem
62# utility (e.g. to 'fsconvert.exe') to avoid this clash.
63#
64# USAGE:
65#
66# An image document consists of metadata elements:
67#
68# OriginalFilename, FilePath, Filename, FileExt, FileSize,
69# Width, Height, URL, PageURL, ThumbURL, CacheURL, CachePageURL
70# ImageText, PageTitle
71#
72# Most of these are only useful in format strings (e.g. ThumbURL,
73# Filename, URL, PageURL, CachePageURL).
74#
75# ImageText, as the name suggests contains the indexable text.
76# (unless using the -document_text plugin option)
77#
78# Since image documents are made up of metadata elements
79# alone, format strings are needed to display them properly.
80# NOTE: The receptionist will only display results (e.g. thumbnails)
81# in 4 columns if the format string begins with "<td><table>".
82#
83# The example below takes the user to the image within the
84# source HTML document rather than using a format string
85# on DocumentText to display the image document itself.
86#
87# Example collect.cfg:
88#
89# ...
90#
91# indexes document:ImageText document:text
92# defaultindex document:ImageText
93#
94# collectionmeta .document:ImageText "images"
95# collectionmeta .document:text "documents"
96#
97# ...
98#
99# plugin W3ImgPlug -index_pages -aggressiveness 6
100#
101# ...
102#
103# format SearchVList '<td>{If}{[Title],[link][icon]&nbsp;[Title][[/link],
104# <table><tr><td align="center"><a href="[CachePageURL]">
105# <img src="[ThumbURL]"></a></td></tr><tr><td align="center">
106# <a href="[CachePageURL]"><font size="-1">[OriginalFilename]</font></a>
107# <br>[Width]x[Height]</td></tr></table>}</td>'
108#
109# ...
110#
111
112package W3ImgPlug;
113
114use HTMLPlug;
115use ghtml;
116use unicode;
117use util;
118use parsargv;
119use strict 'subs';
120
121sub BEGIN {
122 @ISA = qw( HTMLPlug );
123}
124
125sub print_usage {
126 print STDERR "\nUsage: plugin W3ImgPlug [options]\n\n";
127 print STDERR " options:\n";
128 print STDERR " -aggressiveness Range of related text extraction techniques to use [4]\n";
129 print STDERR " 1: Filename, path, ALT text only\n";
130 print STDERR " 2: All of 1, plus caption where available\n";
131 print STDERR " 3: All of 2, plus near paragraphs where available\n";
132 print STDERR " 4: All of 3, plus previous headers (<h1>, <h2>...)\n";
133 print STDERR " where available\n";
134 print STDERR " 5: All of 4, plus textual references where available\n";
135 print STDERR " 6: All of 4, plus page metatags (title, keywords, etc)\n";
136 print STDERR " 7: All of 6, 5 and 4 combined\n";
137 print STDERR " 8: All of 7, plus repeat caption, filename, etc (raise \n";
138 print STDERR " ranking of more relevant results)\n";
139 print STDERR " 10: All of 1, plus full text of source page\n";
140 print STDERR "\n";
141 print STDERR " -no_cache_images Don't cache images (point to URL of original)\n";
142 print STDERR " -index_pages Index the pages along with the images.\n";
143 print STDERR " Otherwise reference the pages at the source URL\n";
144 print STDERR " -min_size Bytes. Skip images smaller than this [2000]\n";
145 print STDERR " -min_width Pixels. Skip images narrower than this [50 pixels]\n";
146 print STDERR " -min_height Pixels. Skip images shorter than this [50 pixels]\n";
147 print STDERR " -thumb_size Max thumbnail size. Both width and height [100 pixels]\n";
148 print STDERR " -convert_params Additional parameters for ImageMagicK convert on\n";
149 print STDERR " thumbnail creation. For example, '-raise' will give\n";
150 print STDERR " a three dimensional effect to thumbnail images.\n";
151 print STDERR " -document_text Add image text as document:text (otherwise IndexedText\n";
152 print STDERR " metadata field)\n";
153 print STDERR "\n";
154 print STDERR " Advanced Options (applicability depends on aggressiveness level)\n";
155 print STDERR " -smallpage_threshold Images on pages smaller than this (bytes) will have\n";
156 print STDERR " the page (title, keywords, etc) meta-data added [2048]\n";
157 print STDERR " -textrefs_threshold Threshold for textual references. Lower values mean\n";
158 print STDERR " the algorithm is less strict [2]\n";
159 print STDERR " -caption_length Maximum length of captions (in characters) [100]\n";
160 print STDERR " -neartext_length Target length of near text (in characters) [300]\n";
161 print STDERR " -max_near_text Maximum characters near images to extract [400]\n";
162 print STDERR " -min_near_text Minimum characters of near text or caption to extract [10]\n";
163 print STDERR "\n";
164 print STDERR " Tag set configuration file (XML format):\n";
165 print STDERR " <collectionpath>/etc/W3ImgPlug.cfg \n";
166 print STDERR "\n";
167 print STDERR "\n";
168 print STDERR "W3ImgPlug inherits all of HTMLPlug's functionality and options:\n";
169 HTMLPlug::print_usage();
170 print STDERR "\n";
171}
172
173sub new {
174 my $class = shift (@_);
175 my $self = new HTMLPlug ($class, @_);
176
177 if (!parsargv::parse(\@_,
178 q^aggressiveness/\d/3^, \$self->{'aggressiveness'},
179 q^index_pages^, \$self->{'index_pages'},
180 q^no_cache_images^, \$self->{'no_cache_images'},
181 q^min_size/\d*/2000^, \$self->{'min_img_filesize'},
182 q^min_width/\d*/50^, \$self->{'min_img_width'},
183 q^min_height/\d*/50^, \$self->{'min_img_height'},
184 q^thumb_size/\d*/100^, \$self->{'thumbnail_size'},
185 q^convert_params/.*/ ^, \$self->{'img_convert_param'},
186 q^max_near_text/\d*/400^, \$self->{'maxtext'},
187 q^min_near_text/\d*/10^, \$self->{'mintext'},
188 q^smallpage_threshold/\d*/2048^, \$self->{'smallpage_threshold'},
189 q^textrefs_threshold/\d*/2^, \$self->{'textref_threshold'},
190 q^caption_length/\d*/80^, \$self->{'caption_len'},
191 q^neartext_length/\d*/300^, \$self->{'neartext_len'},
192 q^document_text^, \$self->{'document_text'},
193 "allow_extra_options"
194 )) {
195
196 print STDERR "\nIncorrect options passed to W3ImgPlug, check your collect.cfg configuration file\n";
197 &print_usage();
198 die "\n";
199 }
200
201 # init class variables
202 $self->{'textref'} = undef; # init by read_file fn
203 $self->{'htdoc_obj'} = undef; # init by process fn
204 $self->{'htpath'} = undef; # init by process fn
205 $self->{'hturl'} = undef; # init by process fn
206 $self->{'plaintext'} = undef; # HTML stripped version - only init if needed by raw_neartext sub
207 $self->{'smallpage'} = 0; # set by process fn
208 $self->{'images_indexed'} = undef; # num of images indexed - if 1 or 2 then we know page is small
209 $self->{'initialised'} = undef; # flag (see set_extraction_options())
210
211 return bless $self, $class;
212}
213
214# if indexing pages, let HTMLPlug do it's stuff
215# image extraction done through read()
216sub process {
217 my ($self, $textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
218 $self->{'imglist'} = ();
219 if ( $self->{'index_pages'} ) {
220 my $ok = $self->SUPER::process($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj);
221 if ( ! $ok ) { return $ok }
222 $self->{'htdoc_obj'} = $doc_obj;
223 }
224 # else use URL for referencing
225 #if ( $file =~ /(.*)[\/\\]/ ) { $self->{'htpath'} = $1; } else { $self->{'htpath'} = $file; }
226
227 $self->{'htpath'} = $base_dir if (-d $base_dir);
228 if ( $file =~ /(.*)[\/\\]/ ) { $self->{'htpath'} .= "/$1"; }
229 $self->{'htpath'} =~ s/\\/\//g; # replace \ with /
230
231 $self->{'hturl'} = "http://$file";
232 $self->{'hturl'} =~ s/\\/\//g; # for windows
233 ($self->{'filename'}) = $file =~ /.*[\/\\](.*)/;
234 ($self->{'base_path'}) = $file =~ /(.*)[\/\\]/i;
235 if ( ( -s "$base_dir/$file") <= $self->{'smallpage_threshold'} ) {
236 $self->{'smallpage'} = 1;
237 } else { $self->{'smallpage'} = 0; }
238
239 if ( defined($self->{'initialised'}) ) { return 1; }
240 else {
241 $self->{'initialised'} = $self->set_extraction_options($base_dir =~ /^(.*?)\/import/i);
242 return $self->{'initialised'};
243 }
244}
245
246# get complex configuration options from configuration files
247# -- $GSDLCOLLECTION/etc/W3ImgPlug.cfg (tag sets for aggr 2+)
248# -- $GSDLHOME/etc/packages/phind/stopword/en/brown.sw (stopwords for aggr 5+)
249
250# If there's no W3ImgPlug.cfg file we'll use the following default values
251my $defaultcfg = '
252<delimitertagset>
253 <setname>Caption</setname>
254 <taggroup>font</taggroup>
255 <taggroup>tt</taggroup>
256 <taggroup>small</taggroup>
257 <taggroup>b</taggroup>
258 <taggroup>i</taggroup>
259 <taggroup>u</taggroup>
260 <taggroup>em</taggroup>
261 <taggroup>td</taggroup>
262 <taggroup>li</taggroup>
263 <taggroup>a</taggroup>
264 <taggroup>p</taggroup>
265 <taggroup>tr</taggroup>
266 <taggroup>center</taggroup>
267 <taggroup>div</taggroup>
268 <taggroup>caption</taggroup>
269 <taggroup>br</taggroup>
270 <taggroup>ul</taggroup>
271 <taggroup>ol</taggroup>
272 <taggroup>table</taggroup>
273 <taggroup>hr</taggroup>
274</delimitertagset>
275
276<delimitertagset>
277 <setname>Neartext</setname>
278 <taggroup>tr|hr|table|h\d|img|body</taggroup>
279 <taggroup>td|tr|hr|table|h\d|img|body</taggroup>
280 <taggroup>p|br|td|tr|hr|table|h\d|img|body</taggroup>
281 <taggroup>font|p|i|b|em|img</taggroup>
282</delimitertagset>
283';
284
285sub set_extraction_options() {
286 my ($self, $collpath) = @_;
287 my ($filepath);
288
289 print {$self->{'outhandle'}} "W3ImgPlug: Initialising\n"
290 if $self->{'verbosity'} > 1;
291 # etc/W3ImgPlug.cfg (XML)
292 # tag sets for captions and neartext
293 if ( $self->{'aggressiveness'} > 1 && $self->{'aggressiveness'} != 10 ) {
294 $self->{'delims'} = [];
295 $self->{'cdelims'} = [];
296 my ($cfg, @tagsets, $tagset, $type, @delims);
297
298 $filepath = "$collpath/etc/W3ImgPlug.cfg";
299 if ( open CFG, "<$filepath" ) {
300 while (<CFG>) { $cfg .= $_ }
301 close CFG;
302 } else {
303 $cfg = $defaultcfg;
304 }
305
306 (@tagsets) =
307 $cfg =~ /<delimitertagset>(.*?)<\/delimitertagset>/igs;
308 foreach $tagset ( @tagsets ) {
309 ($type) = $tagset =~ /<setname>(.*?)<\/setname>/i;
310 if ( lc($type) eq "caption" ) {
311 (@{$self->{'cdelims'}}) = $tagset =~ /<taggroup>(.*?)<\/taggroup>/igs;
312 }
313 elsif ( lc($type) eq "neartext" ) {
314 (@{$self->{'delims'}}) = $tagset =~ /<taggroup>(.*?)<\/taggroup>/igs;
315 }
316 }
317
318 # output a warning if there seem to be no delimiters
319 if ( scalar(@{$self->{'cdelims'}} == 0)) {
320 print {$self->{'outhandle'}} "W3ImgPlug: Warning: no caption delimiters found in $filepath\n";
321 }
322 if ( scalar(@{$self->{'delims'}} == 0)) {
323 print {$self->{'outhandle'}} "W3ImgPlug: Warning: no neartext delimiters found in $filepath\n";
324 }
325 }
326
327 # get stop words for textual reference extraction
328 # TODO: warnings scroll off. Would be best to output them again at end of import
329 if ( $self->{'aggressiveness'} >=5 && $self->{'aggressiveness'} != 10 ) {
330 $self->{'stopwords'} = ();
331 $filepath = &util::filename_cat($ENV{'GSDLHOME'}, "etc", "packages", "phind", "stopword", "en", "brown.sw");
332 if ( open STOPWORDS, "<$filepath" ) {
333 while ( <STOPWORDS> ) {
334 chomp;
335 $self->{'stopwords'}{$_} = 1;
336 }
337 close STOPWORDS;
338 } else {
339 print {$self->{'outhandle'}} "W3ImgPlug: Warning: couldn't open stopwords file at $filepath ($!)\n";
340 }
341
342 }
343
344 if ( $self->{'neartext_len'} > $self->{'maxtext'} ) {
345 $self->{'maxtext'} = $self->{'neartext_len'} * 1.33;
346 print {$self->{'outhandle'}} "W3ImgPlug: Warning: adjusted max_text to $self->{'maxtext'}\n";
347 }
348 if ( $self->{'caption_len'} > $self->{'maxtext'} ) {
349 $self->{'maxtext'} = $self->{'caption_len'} * 1.33;
350 print {$self->{'outhandle'}} "W3ImgPlug: Warning: adjusted max_text to $self->{'maxtext'}\n";
351 }
352
353 return 1;
354}
355
356# return number of files processed, undef if can't process
357# Note that $base_dir might be "" and that $file might
358# include directories
359sub read {
360 my ($self, $pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs) = (@_);
361 my ($doc_obj, $section, $filepath, $imgtag, $pos, $context, $numdocs, $tndir, $imgs);
362 # forward normal read (runs HTMLPlug if index_pages T)
363 my $ok = $self->SUPER::read($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs);
364 if ( ! $ok ) { return $ok }
365
366 my $outhandle = $self->{'outhandle'};
367 my $textref = $self->{'textref'};
368 my $htdoc_obj = $self->{'htdoc_obj'};
369 $numdocs = 0;
370 $base_dir =~ /(.*)\/.*/;
371 $tndir = "$1/archives/thumbnails"; # TODO: this path shouldn't be hardcoded?
372 &util::mk_all_dir($tndir) unless -e "$tndir";
373
374 $imgs = \%{$self->{'imglist'}};
375 my $nimgs = $self->get_img_list($textref);
376 $self->{'images_indexed'} = $nimgs;
377 if ( $nimgs > 0 ) {
378 my @fplist = (sort { $imgs->{$a}{'pos'} <=> $imgs->{$b}{'pos'} } keys %{$imgs});
379 my $i = 0;
380 foreach $filepath ( @fplist ) {
381 $pos = $imgs->{$filepath}{'pos'};
382 $context = substr ($$textref, $pos - 50, $pos + 50); # grab context (quicker)
383 ($imgtag) = ($context =~ /(<(?:img|a|body)\s[^>]*$filepath[^>]*>)/is );
384 if (! defined($imgtag)) { $imgtag = $filepath }
385 print $outhandle "W3ImgPlug: extracting $filepath\n"
386 if ( $self->{'verbosity'} > 1 );
387 $doc_obj = new doc ("", "indexed_doc");
388 $section = $doc_obj->get_top_section();
389 $prevpos = ( $i == 0 ? 0 : $imgs->{$fplist[$i - 1]}{'pos'});
390 $nextpos = ( $i >= ($nimgs -1) ? -1 : $imgs->{$fplist[$i + 1]}{'pos'} );
391
392 $self->extract_image_info($imgtag, $filepath, $textref, $doc_obj, $section, $tndir, $prevpos, $nextpos);
393 $processor->process($doc_obj);
394 $numdocs++;
395 $i++;
396 }
397 return $numdocs;
398 } else {
399 print $outhandle "W3ImgPlug: No images from $file indexed\n"
400 if ( $self->{'verbosity'} > 2 );
401 return 1;
402 }
403
404}
405
406# for every valid image tag
407# 1. extract related text and image metadata
408# 2. add this as document meta-data
409# 3. add assoc image(s) as files
410#
411sub extract_image_info {
412 my $self = shift (@_);
413 my ($tag, $id, $textref, $doc_obj, $section, $tndir, $prevpos, $nextpos) = (@_);
414 my ($filename, $orig_fp, $fn, $ext, $reltext, $relreltext, $crcid, $imgs,
415 $thumbfp, $pagetitle, $alttext, $filepath, $aggr);
416 $aggr = $self->{'aggressiveness'};
417 $imgs = \%{$self->{'imglist'}};
418 $filepath = $imgs->{$id}{'relpath'};
419 ($filename) = $filepath =~ /([^\/\\]+)$/s;
420 ($orig_fp) = "$self->{'base_path'}/$filepath";
421 $orig_fp =~ tr/+/ /;
422 $orig_fp =~ s/%([a-fA-F0-9][a-fA-F0-9])/pack("C", hex($1))/eg; # translate %2E to space, etc
423 $orig_fp =~ s/\\/\//g;
424 $filepath = "$self->{'htpath'}/$filepath";
425 ($onlyfn) = $filename =~ /([^\\\/]*)$/;
426 ($fn, $ext) = $onlyfn =~ /(.*)\.(.*)/;
427 $fn = lc $fn; $ext = lc $ext;
428 ($reltext) = "<tr><td>GifComment</td><td>" . `identify $filepath -ping -format "%c"` . "</td></tr>\n"
429 if ($ext eq "gif");
430 $reltext .= "<tr><td>FilePath</td><td>$orig_fp</td></tr>\n";
431
432 if ($ENV{'GSDLOS'} =~ /^windows$/i) {
433 $crcid = "$fn.$ext." . $self->{'next_crcid'}++;
434 } else { ($crcid) = `cksum $filepath` =~ /^(\d+)/; }
435 $thumbfp = "$tndir/tn_$crcid.jpg";
436 `convert -flatten -filter Hanning $self->{'img_convert_param'} -geometry "$self->{'thumbnail_size'}x$self->{'thumbnail_size'}>" $filepath $thumbfp` unless -e $thumbfp;
437 if ( ! (-e $thumbfp) ) {
438 print STDERR "W3ImgPlug: 'convert' failed. Check ImageMagicK binaries are installed and working correctly\n"; return 0;
439 }
440
441 # shove in full text (tag stripped or unstripped) if settings require it
442 if ( $aggr == 10) {
443 $reltext = "<tr><td>AllPage</td><td>" . $$textref . "</td><tr>\n"; # level 10 (all text, verbatim)
444 } else {
445 $pagetitle = $self->get_meta_value("title", $textref);
446 ($alttext) = $tag =~ /\salt\s*=\s*(?:\"|\')(.+?)(?:\"|\')/is;
447 if ( defined($alttext) && length($alttext) > 1) {
448 $reltext .= "<tr><td>ALTtext</td><td>$alttext</td></tr>\n"; }
449 $reltext .= "<tr><td>SplitCapitalisation</td><td>" .
450 $self->split_filepath($orig_fp) . "</td></tr>\n";
451
452 # get caption/tag based near text (if appropriate)
453 if ( $aggr > 1 ) {
454 if ( $aggr >= 2 ) {
455 $reltext .=
456 $self->extract_caption_text($tag, $textref, $prevpos, $imgs->{$id}{'pos'}, $nextpos);
457 $relreltext = $reltext;
458 }
459 # repeat the filepath, alt-text, caption, etc
460 if ( $aggr == 8 ) {
461 $reltext .= $relreltext;
462 }
463 if ( $aggr >= 3 ) {
464 $reltext .=
465 $self->extract_near_text($tag, $textref, $prevpos, $imgs->{$id}{'pos'}, $nextpos);
466 }
467
468 # get page metadata (if appropriate)
469 if ( $aggr >= 6 || ( $aggr >= 2 &&
470 ( $self->{'images_indexed'} < 2 ||
471 ($self->{'smallpage'} == 1 && $self->{'images_indexed'} < 6 )))) {
472 $reltext .= $self->get_page_metadata($textref);
473 }
474 # textual references
475 if ( $aggr == 5 || $aggr >= 7) {
476 if ( length($relreltext) > ($self->{'caption_len'} * 2) ) {
477 $reltext .= $self->get_textrefs($relreltext, $textref, $prevpos, $imgs->{$id}{'pos'}, $nextpos); }
478 else {
479 $reltext .= $self->get_textrefs($reltext, $textref, $prevpos, $imgs->{$id}{'pos'}, $nextpos);
480 }
481 }
482 } # aggr > 1
483 } # aggr != 10
484
485 $doc_obj->set_OID($crcid);
486 $doc_obj->associate_file($thumbfp, "$fn.thumb.jpg", undef, $section);
487 $doc_obj->add_metadata($section, "OriginalFilename", $filename);
488 $doc_obj->add_metadata($section, "FilePath", $orig_fp);
489 $doc_obj->add_metadata($section, "Filename", $fn);
490 $doc_obj->add_metadata($section, "FileExt", $ext);
491 $doc_obj->add_metadata($section, "FileSize", $imgs->{$id}{'filesize'});
492 $doc_obj->add_metadata($section, "Width", $imgs->{$id}{'width'});
493 $doc_obj->add_metadata($section, "Height", $imgs->{$id}{'height'});
494 $doc_obj->add_metadata($section, "URL", "http://$orig_fp");
495 $doc_obj->add_metadata($section, "PageURL", $self->{'hturl'});
496 $doc_obj->add_metadata($section, "PageTitle", $pagetitle);
497 $doc_obj->add_metadata($section, "ThumbURL",
498 "_httpcollection_/index/assoc/[archivedir]/$fn.thumb.jpg");
499
500 if ( $self->{'document_text'} ) {
501 $doc_obj->add_utf8_text($section, "<table border=1>\n$reltext</table>");
502 } else {
503 $doc_obj->add_metadata($section, "ImageText", "<table border=1>\n$reltext</table>\n");
504 }
505
506 if ( $self->{'index_pages'} ) {
507 my ($cache_url) = "_httpdoc_&d=" . $self->{'htdoc_obj'}->get_OID();
508 if ( $imgs->{$id}{'anchored'} ) {
509 my $a_name = $id;
510 $a_name =~ s/[\/\\\:\&]/_/g;
511 $cache_url .= "#gsdl_$a_name" ;
512 }
513 $doc_obj->add_utf8_metadata($section, "CachePageURL", $cache_url);
514 }
515 if ( ! $self->{'no_cache_images'} ) {
516 $onlyfn = lc $onlyfn;
517 $doc_obj->associate_file($filepath, $onlyfn, undef, $section);
518 $doc_obj->add_utf8_metadata($section, "CacheURL",
519 "_httpcollection_/index/assoc/[archivedir]/$onlyfn");
520 }
521 return 1;
522}
523
524sub get_page_metadata {
525 my ($self, $textref) = (@_);
526 my (@rval);
527 $rval[0] = $self->get_meta_value("title", $textref);
528 $rval[1] = $self->get_meta_value("keywords", $textref);
529 $rval[2] = $self->get_meta_value("description", $textref);
530 $rval[3] = $self->{'filename'};
531
532 return wantarray ? @rval : "<tr><td>PageMeta</td><td>@rval</td></tr>\n" ;
533}
534
535# turns LargeCatFish into Large,Cat,Fish so MG sees the separate words
536sub split_filepath {
537 my ($self, $filepath) = (@_);
538 my (@words) = $filepath =~ /([A-Z][a-z]+)/g;
539 return join(',', @words);
540}
541
542# finds and extracts sentences
543# that seem to be on the same topic
544# as other related text (correlations)
545# and textual references (e.g. in figure 3 ...)
546sub get_textrefs {
547 my ($self, $reltext, $textref, $prevpos, $pos, $nextpos) = (@_);
548 my ($maxtext, $mintext, $startpos, $context_size, $context);
549
550 my (@relwords, @refwords, %sentences, @pagemeta);
551
552 # extract larger context
553 $maxtext = $self->{'maxtext'};
554 $startpos = $pos - ($maxtext * 4);
555 $context_size = $maxtext*10;
556 if ($startpos < $prevpos ) { $startpos = $prevpos }
557 if ($nextpos != -1 && $context_size > ( $nextpos - $startpos )) { $context_size = ($nextpos - $startpos) }
558 $context = substr ( $$textref, $startpos, $context_size );
559 $context =~ s/<.*?>//gs;
560 $context =~ s/^.*>(.*)/$1/gs;
561 $context =~ s/(.*)<.*$/$1/gs;
562
563 # get page meta-data (if not already included)
564 if ( $self->{'aggressiveness'} == 5 && ! $self->{'smallpage'} ) {
565 @pagemeta = $self->get_page_metadata($textref);
566 foreach $value ( @pagemeta ) {
567 $context .= "$value."; # make each into psuedo-sentence
568 }
569 }
570
571 # TODO: this list is not exhaustive
572 @refwords = ( '(?:is|are)? ?(?:show(?:s|n)|demonstrate(?:d|s)|explains|features) (?:in|by|below|above|here)',
573 '(?:see)? (?:figure|table)? (?:below|above)');
574
575 # extract general references
576 foreach $rw ( @refwords ) {
577 while ( $context =~ /[\.\?\!\,](.*?$rw\W.*?[\.\?\!\,])/ig ) {
578 $sentence = $1;
579 $sentence =~ s/\s+/ /g;
580 $sentences{$sentence}+=2;
581 }
582 }
583 # extract specific (figure, table) references by number
584 my ($fignum) = $context =~ /[\.\?\!].*?(?:figure|table)s?[\-\_\ \.](\d+\w*)\W.*?[\.\?\!]/ig;
585 if ( $fignum ) {
586 foreach $rw ( @refwords ) {
587 while ( $context =~ /[\.\?\!](.*?(figure|table)[\-\_\ \.]$fignum\W.*?[\.\?\!])/ig ) {
588 $sentence = $1;
589 $sentence =~ s/\s+/ /g;
590 $sentences{$sentence}+=4;
591 }
592 }
593 }
594
595 # sentences with occurances of important words
596 @relwords = $reltext =~ /([a-zA-Z]{4,})/g; # take out small words
597 foreach $word ( @relwords ) {
598 if ( $self->{'stopwords'}{$word} ) { next } # skip stop words
599 while ( $context =~ /([^\.\?\!]*?$word\W.*?[\.\?\!])/ig ) {
600 $sentence = $1;
601 $sentence =~ s/\s+/ /g;
602 $sentences{$sentence}++;
603 }
604 }
605 foreach $sentence ( keys %sentences ) {
606 if ($sentences{$sentence} < $self->{'textref_threshold'}) {
607 delete $sentences{$sentence};
608 }
609 }
610 my ($rval) = join "<br>\n", (keys %sentences);
611 if ( $rval && length($rval) > 5 ) {
612 return ( "<tr><td>TextualReferences</td><td>" . $rval . "</td></tr>\n") }
613 else { return "" }
614}
615
616# handles caption extraction
617# calling the extractor with different
618# tags and choosing the best candidate caption
619sub extract_caption_text {
620 my ($self, $tag, $textref, $prevpos, $pos, $nextpos) = (@_);
621 my (@neartext, $len, $hdelim, $goodlen,
622 $startpos, $context, $context_size);
623
624 $mintext = $self->{'mintext'};
625 $goodlen = $self->{'caption_len'};
626
627 # extract a context to extract near text from (faster)
628 $context_size = $self->{'maxtext'}*3;
629 $startpos = $pos - ($context_size / 2);
630 if ($startpos < $prevpos ) { $startpos = $prevpos }
631 if ($nextpos != -1 && $context_size > ( $nextpos - $startpos ))
632 { $context_size = ($nextpos - $startpos) }
633
634 $context = substr ( $$textref, $startpos, $context_size );
635 $context =~ s/<!--.*?-->//gs;
636 $context =~ s/^.*-->(.*)/$1/gs;
637 $context =~ s/(.*)<!--.*$/$1/gs;
638
639 # try stepping through markup delimiter sets
640 # and selecting the best one
641 foreach $hdelim ( @{ $self->{'cdelims'} } ) {
642 @neartext = $self->extract_caption($tag, $hdelim, \$context);
643 $len = length(join("", @neartext));
644 last if ($len >= $mintext && $len <= $goodlen);
645 }
646 # reject if well over reasonable length
647 if ( $len > $goodlen ) {
648 @neartext = [];
649 }
650 $neartext[0] = " " if (! defined $neartext[0]);
651 $neartext[1] = " " if (! defined $neartext[1]);
652 return "<tr><td>Caption</td><td>" . (join ",", @neartext) . "</td></tr>\n"; # TODO: the | is for testing purposes
653} # end extract_caption_text
654
655# the previous section header often gives a bit
656# of context to the section that the image is
657# in (invariably the header is before/above the image)
658# so extract the text of the closest header above the image
659#
660# this fn just gets all the headers above the image, within the context window
661sub get_prev_header {
662 my ($self, $pos, $textref) = (@_);
663 my ($rhtext);
664 while ( $$textref =~ /<h\d>(.*?)<\/h\d>/sig ) {
665 # only headers before image
666 if ((pos $$textref) < $pos) {
667 $rhtext .= "$1, ";
668 }
669 }
670 if ( $rhtext ) { return "Header($rhtext)" }
671 else { return "" }
672}
673
674# not the most robust tag stripping
675# regexps (see perl.com FAQ) but good enough
676#
677# used by caption & tag-based near text algorithms
678sub strip_tags {
679 my ( $self, $value ) = @_;
680 if ( ! defined($value) ) { $value = "" } # handle nulls
681 else {
682 $value =~ s/<.*?>//gs; # strip all html tags
683 $value =~ s/\s+/\ /g; # remove extra whitespace
684 $value =~ s/\&\w+\;//g; # remove &nbsp; etc
685 }
686 return $value;
687}
688
689# uses the given tag(s) to identify
690# the caption near to the image
691# (below, above or both below and above)
692sub extract_caption {
693 my ($self, $tag, $bound_tag, $contextref) = (@_);
694 my (@nt, $n, $etag, $gotcap);
695 return ("", "") if ( ! ($$contextref =~ /\Q$tag/) );
696
697 $nt[0] = $`;
698 $nt[1] = $';
699 $gotcap = 0;
700
701 # look before the image for a boundary tag
702 ($etag, $nt[0]) = $nt[0] =~ /<($bound_tag)[\s]?.*?>(.*?)$/is;
703 # if bound_tag too far from the image, then prob not caption
704 # (note: have to allow for tags, so multiply by 3
705 if ( $etag && length($nt[0]) < ($self->{'caption_len'} * 3) ) {
706 if ( $nt[0] =~ /<\/$etag>/si ) {
707 # the whole caption is above the image: <tag>text</tag><img>
708 ($nt[0]) =~ /<(?:$etag)[\s]?.*?>(.*?)<\/$etag>/is;
709 $nt[0] = $self->strip_tags($nt[0]);
710 if ( length($nt[0]) > $self->{'mintext'} ) {
711 $gotcap = 1;
712 $nt[1] = "";
713 }
714
715 } elsif ( $nt[1] =~ /<\/$etag>/si) {
716 # the caption tag covers image: <tag>text?<img>text?</tag>
717 ($nt[1]) = $nt[1] =~ /(.*?)<\/$etag>/si;
718 $nt[0] = $self->strip_tags($nt[0] . $nt[1]);
719 if ( length($nt[0]) > $self->{'mintext'} ) {
720 $gotcap = 2;
721 $nt[1] = "";
722 }
723 }
724 }
725 # else try below the image
726 if ( ! $gotcap ) {
727 # the caption is after the image: <img><tag>text</tag>
728 ($etag, $nt[1]) = $nt[1] =~ /^.*?<($bound_tag)[\s]?.*?>(.*)/is;
729 if ( $etag && $nt[1] =~ /<\/$etag>/s) {
730 ($nt[1]) = $nt[1] =~ /(.*?)<\/$etag>/si;
731 $gotcap = 3;
732 $nt[0] = "";
733 $nt[1] = $self->strip_tags($nt[1]);
734 }
735 }
736 if ( ! $gotcap ) { $nt[0] = $nt[1] = "" }
737 else {
738 # strip part-tags
739 $nt[0] =~ s/^.*>//s;
740 $nt[1] =~ s/<.*$//s;
741 }
742 my ($type);
743 if ( $gotcap == 0 ) { return ("nocaption", "") }
744 elsif ( $gotcap == 1 ) { $type = "captionabove:" }
745 elsif ( $gotcap == 2 ) { $type = "captioncovering:" }
746 elsif ( $gotcap == 3 ) { $type = "captionbelow:" }
747 return ($type, $nt[0], $nt[1]);
748}
749
750# tag-based near text
751#
752# tries different tag sets
753# and chooses the best one
754sub extract_near_text {
755 my ($self, $tag, $textref, $prevpos, $pos, $nextpos) = (@_);
756 my (@neartext, $len, $hdelim, $maxtext, $mintext, $goodlen,
757 @bestlen, @best, $startpos, $context, $context_size,
758 $dist, $bdist, $best1, $i, $nt);
759 $bestlen[0] = $bestlen[1] = 0; $bestlen[2] = $bdist = 999999;
760 $best[0] = $best[1] = $best[2] = "";
761 $maxtext = $self->{'maxtext'};
762 $mintext = $self->{'mintext'};
763 $goodlen = $self->{'neartext_len'};
764
765 # extract a context to extract near text from (faster)
766 $context_size = $maxtext*4;
767 $startpos = $pos - ($context_size / 2);
768 if ($startpos < $prevpos ) { $startpos = $prevpos }
769 if ($nextpos != -1 && $context_size > ( $nextpos - $startpos ))
770 { $context_size = ($nextpos - $startpos) }
771 $context = substr ( $$textref, $startpos, $context_size );
772 $context =~ s/<!--.*?-->//gs;
773 $context =~ s/^.*-->(.*)/$1/gs;
774 $context =~ s/(.*)<!--.*$/$1/gs;
775
776 # try stepping through markup delimiter sets
777 # and selecting the best one
778 foreach $hdelim ( @{ $self->{'delims'} } ) {
779 @neartext = $self->extract_tagged_neartext($tag, $hdelim, \$context);
780 $nt = join("", @neartext);
781 $len = length($nt);
782 # Priorities:
783 # 1. Greater than mintext
784 # 2. Less than maxtext
785 # 3. Closest to goodlen
786 if ( $len <= $goodlen && $len > $bestlen[0] ) {
787 $bestlen[0] = $len;
788 $best[0] = $hdelim;
789 } elsif ( $len >= $maxtext && $len < $bestlen[2] ) {
790 $bestlen[2] = $len;
791 $best[2] = $hdelim;
792 } elsif ( $len >= $bestlen[0] && $len <= $bestlen[2] ) {
793 $dist = abs($goodlen - $len);
794 if ( $dist < $bdist ) {
795 $bestlen[1] = $len;
796 $best[1] = $hdelim;
797 $bdist = $dist;
798 }
799 }
800 }
801 $best1 = 2;
802 foreach $i ( 0..2 ) {
803 if ( $bestlen[$i] == 999999 ) { $bestlen[$i] = 0 }
804 $dist = abs($goodlen - $bestlen[$i]);
805 if ( $bestlen[$i] > $mintext && $dist <= $bdist ) {
806 $best1 = $i;
807 $bdist = $dist;
808 }
809 }
810 @neartext = $self->extract_tagged_neartext($tag, $best[$best1], \$context);
811 if ( $bestlen[$best1] > $maxtext ) {
812 # truncate on word boundary if too much text
813 my $hmax = $maxtext / 2;
814 ($neartext[0]) = $neartext[0] =~ /([^\s]*.{1,$hmax})$/s;
815 ($neartext[1]) = $neartext[1] =~ /^(.{1,$hmax}[^\s]*)/s;
816 } elsif ( $bestlen[$best1] < $mintext ) {
817 # use plain text extraction if tags failed (e.g. usable tag outside context)
818 print {$self->{'outhandle'}} "W3ImgPlug: Fallback to plain-text extraction for $tag\n"
819 if $self->{'verbosity'} > 2;
820 $neartext[0] = "<tr><td>RawNeartext</td><td>" . $self->extract_raw_neartext($tag, $textref) . "</td></tr>";
821 $neartext[1] = "";
822 }
823 # get previous header if available
824 $neartext[0] .= "<br>\n" .
825 $self->get_prev_header($pos, \$context) if ( $self->{'aggressiveness'} >= 4 );
826 $neartext[0] = " " if (! defined $neartext[0]);
827 $neartext[1] = " " if (! defined $neartext[1]);
828
829 return "<tr><td>NearText</td><td>" . (join "|", @neartext) . "</td></tr>\n"; # TODO: the | is for testing purposes
830} # end extract_near_text
831
832# actually captures tag-based
833# near-text given a tag set
834sub extract_tagged_neartext {
835 my ($self, $tag, $bound_tag, $textref) = (@_);
836 return "" if ( ! ($$textref =~ /\Q$tag/) );
837 my (@nt, $delim, $pre_tag, $n);
838 $nt[0] = $`;
839 $nt[1] = $';
840
841 # get text after previous image tag
842 $nt[0] =~ s/.*<($bound_tag)[^>]*>(.*)/$2/is; # get rid of preceding text
843 if (defined($1)) { $delim = $1 }
844 $pre_tag = $bound_tag;
845
846 if (defined($delim)) {
847 # we want to try and use the end tag of the previous delimiter
848 # (put it on the front of the list)
849 $pre_tag =~ s/(^|\|)($delim)($|\|)//i; # take it out
850 $pre_tag =~ s/\|\|/\|/i; # replace || with |
851 $pre_tag = $delim . "|" . $pre_tag; # put it on the front
852 }
853
854 # get text before next image tag
855 $nt[1] =~ s/<\/?(?:$pre_tag)[^>]*>.*//is; # get rid of stuff after first delimiter
856
857 # process related text
858 for $n (0..1) {
859 if ( defined($nt[$n]) ) {
860 $nt[$n] =~ s/<.*?>//gs; # strip all html tags
861 $nt[$n] =~ s/\s+/\ /gs; # remove extra whitespace
862 $nt[$n] =~ s/\&\w+\;//sg; # remove &nbsp; etc
863 # strip part-tags
864 if ( $n == 0 ) { $nt[0] =~ s/^.*>//s }
865 if ( $n == 1 ) { $nt[1] =~ s/<.*$//s }
866 } else { $nt[$n] = ""; } # handle nulls
867 }
868 return @nt;
869}
870
871# this function is fall-back
872# if tags aren't suitable.
873#
874# extracts a fixed length of characters
875# either side of image tag (on word boundary)
876sub extract_raw_neartext {
877 my ($self, $tag, $textref) = (@_);
878 my ($rawtext, $startpos, $fp);
879 my $imgs = \%{$self->{'imglist'}};
880 ($fp) = $tag =~ /([\w\\\/]+\.(?:gif|jpe?g|png))/is;
881 if (! $fp) { return " " };
882 # if the cached, plain-text version isn't there, then create it
883 $self->init_plaintext($textref) unless defined($self->{'plaintext'});
884
885 # take the closest maxtext/2 characters
886 # either side of the tag (by word boundary)
887 return "" if ( ! exists $imgs->{$fp}{'rawpos'} );
888 $startpos = $imgs->{$fp}{'rawpos'} - (($self->{'maxtext'} / 2) + 20);
889 if ( $startpos < 0 ) { $startpos = 0 }
890 $rawtext = substr $self->{'plaintext'}, $startpos, $self->{'maxtext'} + 20;
891 $rawtext =~ s/\s\s/ /g;
892
893 return $rawtext;
894}
895
896# init plaintext variable for HTML-stripped version
897# (for full text index/raw assoc text extraction)
898sub init_plaintext {
899 my ($self, $textref) = (@_);
900 my ($page, $fp);
901 my $imgs = \%{$self->{'imglist'}};
902 $page = $$textref; # make a copy of original
903
904 # strip tags around image filenames so they don't get zapped
905 $page =~ s/<\w+\s+.*?([\w\/\\]+\.(?:gif|jpe?g|png))[^>]*>/\"$1\"/gsi;
906 $page =~ s/<.*?>//gs;
907 $page =~ s/&nbsp;/ /gs;
908 $page =~ s/&amp;/&/gs; #TODO: more &zzz; replacements (except &lt;, $gt;)
909
910 # get positions and strip images
911 while ( $page =~ /([^\s\'\"]+\.(jpe?g|gif|png))/ig ) {
912 $fp = $1;
913 if ( $imgs->{$fp}{'exists'} ) {
914 $imgs->{$fp}{'rawpos'} = pos $page;
915 }
916 $page =~ s/\"$fp\"//gs;
917 }
918 $self->{'plaintext'} = $page;
919}
920
921# finds and filters images based on size
922# (dimensions, height, filesize) and existence
923#
924# looks for image filenames (.jpg, .gif, etc)
925# and checks for existence on disk
926# (hence supports most JavaScript images)
927sub get_img_list {
928 my $self = shift (@_);
929 my ($textref) = (@_);
930 my ($filepath, $relpath, $abspath, $pos, $num, $width, $height, $filesize);
931 my $imgs = \%{$self->{'imglist'}};
932 while ( $$textref =~ /([^\s\'\"]+\.(jpe?g|gif|png))/ig ) {
933 $filepath = $1;
934 $pos = pos $$textref;
935 next if ( $imgs->{$filepath}{'relpath'} );
936 $relpath = $filepath;
937 $relpath =~ s/^http\:\/\///; # remove http:// in case we have mirrored it
938 $relpath =~ s/\\/\//g; # replace \ with /
939 $relpath =~ s/^\.\///s; # make "./filepath" into "filepath"
940 $imgs->{$filepath}{'relpath'} = $relpath;
941 $abspath = "$self->{'htpath'}/$relpath";
942
943 if (! -e $abspath) { next }
944
945 # can't modify real filepath var because it
946 # then can't be located in the page for tag recognition later
947 ($width, $height) =
948 `identify $abspath -ping -format "%wx%h"` =~ /^(\d*)x(\d*)$/m;
949 if (! ($width && $height)) {
950 print STDERR "W3ImgPlug: ($abspath) 'identify' failed. Check ImageMagicK binaries are installed and working correctly\n"; next;
951 }
952 $filesize = (-s $abspath);
953 if ( $filesize >= $self->{'min_img_filesize'}
954 && ( $width >= $self->{'min_img_width'} )
955 && ( $height >= $self->{'min_img_height'} ) ) {
956
957 $imgs->{$filepath}{'exists'} = 1;
958 $imgs->{$filepath}{'pos'} = $pos;
959 $imgs->{$filepath}{'width'} = $width;
960 $imgs->{$filepath}{'height'} = $height;
961 $imgs->{$filepath}{'filesize'} = $filesize;
962 } else {
963 print {$self->{'outhandle'}} "W3ImgPlug: skipping $self->{'base_path'}/$relpath: $filesize, $width x $height\n"
964 if $self->{'verbosity'} > 2;
965 }
966 }
967 $num = 0;
968 foreach $i ( keys %{$imgs} ) {
969 if ( $imgs->{$i}{'pos'} ) {
970 $num++;
971 } else { delete $imgs->{$i} }
972 }
973 return $num;
974}
975
976# make the text available to the read function
977# by making it an object variable
978sub read_file {
979 my ($self, $filename, $encoding, $language, $textref) = @_;
980 $self->SUPER::read_file($filename, $encoding, $language, $textref);
981
982 # if HTMLplug has run through, then it will
983 # have replaced references so we have to
984 # make a copy of the text before processing
985 if ( $self->{'index_pages'} ) {
986 $self->{'text'} = $$textref;
987 $self->{'textref'} = \($self->{'text'});
988 } else {
989 $self->{'textref'} = $textref;
990 }
991 $self->{'plaintext'} = undef;
992}
993
994# HTMLPlug only extracts meta-data if it is specified in plugin options
995# hence a special function to do it here
996sub get_meta_value {
997 my ($self, $name, $textref) = @_;
998 my ($value);
999 $name = lc $name;
1000 if ($name eq "title") {
1001 ($value) = $$textref =~ /<title>(.*?)<\/title>/is
1002 } else {
1003 my $qm = "(?:\"|\')";
1004 ($value) = $$textref =~ /<meta name\s*=\s*$qm?$name$qm?\s+content\s*=\s*$qm?(.*?)$qm?\s*>/is
1005 }
1006 $value = "" unless $value;
1007 return $value;
1008}
1009
1010# make filename an anchor reference
1011# so we can go straight to the image
1012# within the cached version of the source page
1013# (augment's HTMLPlug sub)
1014sub replace_images {
1015 my $self = shift (@_);
1016 my ($front, $link, $back, $base_dir,
1017 $file, $doc_obj, $section) = @_;
1018 $link =~ s/\"//g;
1019 my ($a_name) = $link;
1020 $a_name =~ s/[\/\\\:\&]/_/g;
1021 # keep a list so we don't repeat the same anchor
1022 if ( ! $self->{'imglist'}{$link}{'anchored'} ) {
1023 $front = "<a name=\"gsdl_$a_name\">$front";
1024 $back = "$back</a>";
1025 $self->{'imglist'}{$link}{'anchored'} = 1;
1026 }
1027 return $self->SUPER::replace_images($front, $link, $back, $base_dir,
1028 $file, $doc_obj, $section);
1029}
1030
10311;
Note: See TracBrowser for help on using the repository browser.