source: trunk/gsdl/perllib/plugins/W3ImgPlug.pm@ 7243

Last change on this file since 7243 was 6408, checked in by jmt12, 20 years ago

Added two new attributes for script arguments. HiddenGLI controls whether the argument will be visible at all in GLI, while ModeGLI defines the lowest detail mode under which the argument will be visible (only really for import and buildcol). Also ensured that the scripts were reporting their correct default process expressions, and further refined argument types by adding the catagory regexp for any regular expression (which can then be hidden under lower detail modes in GLI)

  • Property svn:keywords set to Author Date Id Revision
File size: 41.0 KB
Line 
1###########################################################################
2#
3# W3ImgPlug.pm -- Context-based image indexing plugin for HTML documents
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 2001 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27# DESCRIPTION:
28#
29# Extracts images and associated text and metadata from
30# web pages as individual documents for indexing. Thumbnails
31# are created from each image for browsing purposes.
32#
33# Options are available for configuring the aggressiveness of the
34# associated text extraction mechanisms. A higher level of
35# aggressiveness will extract more text and consequently
36# may mean lower accuracy (precision); however, it may also
37# retrieve more of the relevant images from the collection (recall).
38# Lower levels of aggressiveness maybe result in slightly faster
39# collection builds at the import stage.
40#
41# W3ImgPlug is a subclass of HTMLPlug (i.e. it will index pages also
42# if required). It can be used in place of HTMLPlug to index both
43# pages and their images.
44#
45# REQUIREMENTS:
46#
47# The ImageMagick image manipulation is used to create
48# thumbnails and extract some image metadata. (Available
49# from http://www.imagemagick.org/)
50#
51# Unix:
52# Many Linux distributions contain ImageMagick.
53#
54# Windows:
55# ImageMagick can be downloaded from the website above.
56# Make sure the system path includes the ImageMagick binaries
57# before using W3ImgPlug.
58#
59# NOTE: NT/2000/XP contain a filesystem utility 'convert.exe'
60# with the same name as the image conversion utility. The
61# ImageMagick FAQ recommends renaming the filesystem
62# utility (e.g. to 'fsconvert.exe') to avoid this clash.
63#
64# USAGE:
65#
66# An image document consists of metadata elements:
67#
68# OriginalFilename, FilePath, Filename, FileExt, FileSize,
69# Width, Height, URL, PageURL, ThumbURL, CacheURL, CachePageURL
70# ImageText, PageTitle
71#
72# Most of these are only useful in format strings (e.g. ThumbURL,
73# Filename, URL, PageURL, CachePageURL).
74#
75# ImageText, as the name suggests contains the indexable text.
76# (unless using the -document_text plugin option)
77#
78# Since image documents are made up of metadata elements
79# alone, format strings are needed to display them properly.
80# NOTE: The receptionist will only display results (e.g. thumbnails)
81# in 4 columns if the format string begins with "<td><table>".
82#
83# The example below takes the user to the image within the
84# source HTML document rather than using a format string
85# on DocumentText to display the image document itself.
86#
87# Example collect.cfg:
88#
89# ...
90#
91# indexes document:ImageText document:text
92# defaultindex document:ImageText
93#
94# collectionmeta .document:ImageText "images"
95# collectionmeta .document:text "documents"
96#
97# ...
98#
99# plugin W3ImgPlug -index_pages -aggressiveness 6
100#
101# ...
102#
103# format SearchVList '<td>{If}{[Title],[link][icon]&nbsp;[Title][[/link],
104# <table><tr><td align="center"><a href="[CachePageURL]">
105# <img src="[ThumbURL]"></a></td></tr><tr><td align="center">
106# <a href="[CachePageURL]"><font size="-1">[OriginalFilename]</font></a>
107# <br>[Width]x[Height]</td></tr></table>}</td>'
108#
109# ...
110#
111
112package W3ImgPlug;
113
114use HTMLPlug;
115use ghtml;
116use unicode;
117use util;
118use parsargv;
119use strict 'subs';
120
121sub BEGIN {
122 @ISA = qw( HTMLPlug );
123}
124
125my $aggressiveness_list =
126 [ { 'name' => "1",
127 'desc' => "{W3ImgPlug.aggressiveness.1}" },
128 { 'name' => "2",
129 'desc' => "{W3ImgPlug.aggressiveness.2}" },
130 { 'name' => "3",
131 'desc' => "{W3ImgPlug.aggressiveness.3}" },
132 { 'name' => "4",
133 'desc' => "{W3ImgPlug.aggressiveness.4}" },
134 { 'name' => "5",
135 'desc' => "{W3ImgPlug.aggressiveness.5}" },
136 { 'name' => "6",
137 'desc' => "{W3ImgPlug.aggressiveness.6}" },
138 { 'name' => "7",
139 'desc' => "{W3ImgPlug.aggressiveness.7}" },
140 { 'name' => "8",
141 'desc' => "{W3ImgPlug.aggressiveness.8}" },
142 { 'name' => "9",
143 'desc' => "{W3ImgPlug.aggressiveness.9}" } ];
144
145my $arguments =
146 [ { 'name' => "aggressiveness",
147 'desc' => "{W3ImgPlug.aggressiveness}",
148 'type' => "int",
149 'list' => $aggressiveness_list,
150 'deft' => "3",
151 'reqd' => "no" },
152 { 'name' => "index_pages",
153 'desc' => "{W3ImgPlug.index_pages}",
154 'type' => "flag",
155 'reqd' => "no" },
156 { 'name' => "no_cache_images",
157 'desc' => "{W3ImgPlug.no_cache_images}",
158 'type' => "flag",
159 'reqd' => "no" },
160 { 'name' => "min_size",
161 'desc' => "{W3ImgPlug.min_size}",
162 'type' => "int",
163 'deft' => "2000",
164 'reqd' => "no" },
165 { 'name' => "min_width",
166 'desc' => "{W3ImgPlug.min_width}",
167 'type' => "int",
168 'deft' => "50",
169 'reqd' => "no" },
170 { 'name' => "min_height",
171 'desc' => "{W3ImgPlug.min_height}",
172 'type' => "int",
173 'deft' => "50",
174 'reqd' => "no" },
175 { 'name' => "thumb_size",
176 'desc' => "{W3ImgPlug.thumb_size}",
177 'type' => "int",
178 'deft' => "100",
179 'reqd' => "no" },
180 { 'name' => "convert_params",
181 'desc' => "{W3ImgPlug.convert_params}",
182 'type' => "string",
183 'deft' => "",
184 'reqd' => "no" },
185 { 'name' => "min_near_text",
186 'desc' => "{W3ImgPlug.min_near_text}",
187 'type' => "int",
188 'deft' => "10",
189 'reqd' => "no" },
190 { 'name' => "max_near_text",
191 'desc' => "{W3ImgPlug.max_near_text}",
192 'type' => "int",
193 'deft' => "400",
194 'reqd' => "no" },
195 { 'name' => "smallpage_threshold",
196 'desc' => "{W3ImgPlug.smallpage_threshold}",
197 'type' => "int",
198 'deft' => "2048",
199 'reqd' => "no" },
200 { 'name' => "textrefs_threshold",
201 'desc' => "{W3ImgPlug.textrefs_threshold}",
202 'type' => "int",
203 'deft' => "2",
204 'reqd' => "no" },
205 { 'name' => "caption_length",
206 'desc' => "{W3ImgPlug.caption_length}",
207 'type' => "int",
208 'deft' => "80",
209 'reqd' => "no" },
210 { 'name' => "neartext_length",
211 'desc' => "{W3ImgPlug.neartext_length}",
212 'type' => "int",
213 'deft' => "300",
214 'reqd' => "no" },
215 { 'name' => "document_text",
216 'desc' => "{W3ImgPlug.document_text}",
217 'type' => "flag",
218 'reqd' => "no" } ];
219
220my $options = { 'name' => "W3ImgPlug",
221 'desc' => "{W3ImgPlug.desc}",
222 'abstract' => "no",
223 'inherits' => "yes",
224 'args' => $arguments };
225
226
227# sub print_usage {
228# print STDERR "\nUsage: plugin W3ImgPlug [options]\n\n";
229# print STDERR " options:\n";
230# print STDERR " -aggressiveness Range of related text extraction techniques to use [4]\n";
231# print STDERR " 1: Filename, path, ALT text only\n";
232# print STDERR " 2: All of 1, plus caption where available\n";
233# print STDERR " 3: All of 2, plus near paragraphs where available\n";
234# print STDERR " 4: All of 3, plus previous headers (<h1>, <h2>...)\n";
235# print STDERR " where available\n";
236# print STDERR " 5: All of 4, plus textual references where available\n";
237# print STDERR " 6: All of 4, plus page metatags (title, keywords, etc)\n";
238# print STDERR " 7: All of 6, 5 and 4 combined\n";
239# print STDERR " 8: All of 7, plus repeat caption, filename, etc (raise \n";
240# print STDERR " ranking of more relevant results)\n";
241# print STDERR " 10: All of 1, plus full text of source page\n";
242# print STDERR "\n";
243# print STDERR " -no_cache_images Don't cache images (point to URL of original)\n";
244# print STDERR " -index_pages Index the pages along with the images.\n";
245# print STDERR " Otherwise reference the pages at the source URL\n";
246# print STDERR " -min_size Bytes. Skip images smaller than this [2000]\n";
247# print STDERR " -min_width Pixels. Skip images narrower than this [50 pixels]\n";
248# print STDERR " -min_height Pixels. Skip images shorter than this [50 pixels]\n";
249# print STDERR " -thumb_size Max thumbnail size. Both width and height [100 pixels]\n";
250# print STDERR " -convert_params Additional parameters for ImageMagicK convert on\n";
251# print STDERR " thumbnail creation. For example, '-raise' will give\n";
252# print STDERR " a three dimensional effect to thumbnail images.\n";
253# print STDERR " -document_text Add image text as document:text (otherwise IndexedText\n";
254# print STDERR " metadata field)\n";
255# print STDERR "\n";
256# print STDERR " Advanced Options (applicability depends on aggressiveness level)\n";
257# print STDERR " -smallpage_threshold Images on pages smaller than this (bytes) will have\n";
258# print STDERR " the page (title, keywords, etc) meta-data added [2048]\n";
259# print STDERR " -textrefs_threshold Threshold for textual references. Lower values mean\n";
260# print STDERR " the algorithm is less strict [2]\n";
261# print STDERR " -caption_length Maximum length of captions (in characters) [100]\n";
262# print STDERR " -neartext_length Target length of near text (in characters) [300]\n";
263# print STDERR " -max_near_text Maximum characters near images to extract [400]\n";
264# print STDERR " -min_near_text Minimum characters of near text or caption to extract [10]\n";
265# print STDERR "\n";
266# print STDERR " Tag set configuration file (XML format):\n";
267# print STDERR " <collectionpath>/etc/W3ImgPlug.cfg \n";
268# print STDERR "\n";
269# print STDERR "\n";
270# print STDERR "W3ImgPlug inherits all of HTMLPlug's functionality and options:\n";
271# HTMLPlug::print_usage();
272# print STDERR "\n";
273# }
274
275sub new {
276 my $class = shift (@_);
277 my $self = new HTMLPlug ($class, @_);
278 $self->{'plugin_type'} = "W3ImgPlug";
279 # 14-05-02 To allow for proper inheritance of arguments - John Thompson
280 my $option_list = $self->{'option_list'};
281 push( @{$option_list}, $options );
282
283 if (!parsargv::parse(\@_,
284 q^aggressiveness/\d/3^, \$self->{'aggressiveness'},
285 q^index_pages^, \$self->{'index_pages'},
286 q^no_cache_images^, \$self->{'no_cache_images'},
287 q^min_size/\d*/2000^, \$self->{'min_img_filesize'},
288 q^min_width/\d*/50^, \$self->{'min_img_width'},
289 q^min_height/\d*/50^, \$self->{'min_img_height'},
290 q^thumb_size/\d*/100^, \$self->{'thumbnail_size'},
291 q^convert_params/.*/ ^, \$self->{'img_convert_param'},
292 q^max_near_text/\d*/400^, \$self->{'maxtext'},
293 q^min_near_text/\d*/10^, \$self->{'mintext'},
294 q^smallpage_threshold/\d*/2048^, \$self->{'smallpage_threshold'},
295 q^textrefs_threshold/\d*/2^, \$self->{'textref_threshold'},
296 q^caption_length/\d*/80^, \$self->{'caption_len'},
297 q^neartext_length/\d*/300^, \$self->{'neartext_len'},
298 q^document_text^, \$self->{'document_text'},
299 "allow_extra_options"
300 )) {
301
302 print STDERR "\nIncorrect options passed to W3ImgPlug, check your collect.cfg configuration file\n";
303 $self->print_txt_usage(""); # Use default resource bundle
304 die "\n";
305 }
306
307 # init class variables
308 $self->{'textref'} = undef; # init by read_file fn
309 $self->{'htdoc_obj'} = undef; # init by process fn
310 $self->{'htpath'} = undef; # init by process fn
311 $self->{'hturl'} = undef; # init by process fn
312 $self->{'plaintext'} = undef; # HTML stripped version - only init if needed by raw_neartext sub
313 $self->{'smallpage'} = 0; # set by process fn
314 $self->{'images_indexed'} = undef; # num of images indexed - if 1 or 2 then we know page is small
315 $self->{'initialised'} = undef; # flag (see set_extraction_options())
316
317 return bless $self, $class;
318}
319
320# if indexing pages, let HTMLPlug do it's stuff
321# image extraction done through read()
322sub process {
323 my ($self, $textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
324 $self->{'imglist'} = ();
325 if ( $self->{'index_pages'} ) {
326 my $ok = $self->SUPER::process($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj);
327 if ( ! $ok ) { return $ok }
328 $self->{'htdoc_obj'} = $doc_obj;
329 }
330 # else use URL for referencing
331 #if ( $file =~ /(.*)[\/\\]/ ) { $self->{'htpath'} = $1; } else { $self->{'htpath'} = $file; }
332
333 $self->{'htpath'} = $base_dir if (-d $base_dir);
334 if ( $file =~ /(.*)[\/\\]/ ) { $self->{'htpath'} .= "/$1"; }
335 $self->{'htpath'} =~ s/\\/\//g; # replace \ with /
336
337 $self->{'hturl'} = "http://$file";
338 $self->{'hturl'} =~ s/\\/\//g; # for windows
339 ($self->{'filename'}) = $file =~ /.*[\/\\](.*)/;
340 ($self->{'base_path'}) = $file =~ /(.*)[\/\\]/i;
341 if ( ( -s "$base_dir/$file") <= $self->{'smallpage_threshold'} ) {
342 $self->{'smallpage'} = 1;
343 } else { $self->{'smallpage'} = 0; }
344
345 if ( defined($self->{'initialised'}) ) { return 1; }
346 else {
347 $self->{'initialised'} = $self->set_extraction_options($base_dir =~ /^(.*?)\/import/i);
348 return $self->{'initialised'};
349 }
350}
351
352# get complex configuration options from configuration files
353# -- $GSDLCOLLECTION/etc/W3ImgPlug.cfg (tag sets for aggr 2+)
354# -- $GSDLHOME/etc/packages/phind/stopword/en/brown.sw (stopwords for aggr 5+)
355
356# If there's no W3ImgPlug.cfg file we'll use the following default values
357my $defaultcfg = '
358<delimitertagset>
359 <setname>Caption</setname>
360 <taggroup>font</taggroup>
361 <taggroup>tt</taggroup>
362 <taggroup>small</taggroup>
363 <taggroup>b</taggroup>
364 <taggroup>i</taggroup>
365 <taggroup>u</taggroup>
366 <taggroup>em</taggroup>
367 <taggroup>td</taggroup>
368 <taggroup>li</taggroup>
369 <taggroup>a</taggroup>
370 <taggroup>p</taggroup>
371 <taggroup>tr</taggroup>
372 <taggroup>center</taggroup>
373 <taggroup>div</taggroup>
374 <taggroup>caption</taggroup>
375 <taggroup>br</taggroup>
376 <taggroup>ul</taggroup>
377 <taggroup>ol</taggroup>
378 <taggroup>table</taggroup>
379 <taggroup>hr</taggroup>
380</delimitertagset>
381
382<delimitertagset>
383 <setname>Neartext</setname>
384 <taggroup>tr|hr|table|h\d|img|body</taggroup>
385 <taggroup>td|tr|hr|table|h\d|img|body</taggroup>
386 <taggroup>p|br|td|tr|hr|table|h\d|img|body</taggroup>
387 <taggroup>font|p|i|b|em|img</taggroup>
388</delimitertagset>
389';
390
391sub set_extraction_options() {
392 my ($self, $collpath) = @_;
393 my ($filepath);
394
395 print {$self->{'outhandle'}} "W3ImgPlug: Initialising\n"
396 if $self->{'verbosity'} > 1;
397 # etc/W3ImgPlug.cfg (XML)
398 # tag sets for captions and neartext
399 if ( $self->{'aggressiveness'} > 1 && $self->{'aggressiveness'} != 9 ) {
400 $self->{'delims'} = [];
401 $self->{'cdelims'} = [];
402 my ($cfg, @tagsets, $tagset, $type, @delims);
403
404 $filepath = "$collpath/etc/W3ImgPlug.cfg";
405 if ( open CFG, "<$filepath" ) {
406 while (<CFG>) { $cfg .= $_ }
407 close CFG;
408 } else {
409 $cfg = $defaultcfg;
410 }
411
412 (@tagsets) =
413 $cfg =~ /<delimitertagset>(.*?)<\/delimitertagset>/igs;
414 foreach $tagset ( @tagsets ) {
415 ($type) = $tagset =~ /<setname>(.*?)<\/setname>/i;
416 if ( lc($type) eq "caption" ) {
417 (@{$self->{'cdelims'}}) = $tagset =~ /<taggroup>(.*?)<\/taggroup>/igs;
418 }
419 elsif ( lc($type) eq "neartext" ) {
420 (@{$self->{'delims'}}) = $tagset =~ /<taggroup>(.*?)<\/taggroup>/igs;
421 }
422 }
423
424 # output a warning if there seem to be no delimiters
425 if ( scalar(@{$self->{'cdelims'}} == 0)) {
426 print {$self->{'outhandle'}} "W3ImgPlug: Warning: no caption delimiters found in $filepath\n";
427 }
428 if ( scalar(@{$self->{'delims'}} == 0)) {
429 print {$self->{'outhandle'}} "W3ImgPlug: Warning: no neartext delimiters found in $filepath\n";
430 }
431 }
432
433 # get stop words for textual reference extraction
434 # TODO: warnings scroll off. Would be best to output them again at end of import
435 if ( $self->{'aggressiveness'} >=5 && $self->{'aggressiveness'} != 9 ) {
436 $self->{'stopwords'} = ();
437 $filepath = &util::filename_cat($ENV{'GSDLHOME'}, "etc", "packages", "phind", "stopword", "en", "brown.sw");
438 if ( open STOPWORDS, "<$filepath" ) {
439 while ( <STOPWORDS> ) {
440 chomp;
441 $self->{'stopwords'}{$_} = 1;
442 }
443 close STOPWORDS;
444 } else {
445 print {$self->{'outhandle'}} "W3ImgPlug: Warning: couldn't open stopwords file at $filepath ($!)\n";
446 }
447
448 }
449
450 if ( $self->{'neartext_len'} > $self->{'maxtext'} ) {
451 $self->{'maxtext'} = $self->{'neartext_len'} * 1.33;
452 print {$self->{'outhandle'}} "W3ImgPlug: Warning: adjusted max_text to $self->{'maxtext'}\n";
453 }
454 if ( $self->{'caption_len'} > $self->{'maxtext'} ) {
455 $self->{'maxtext'} = $self->{'caption_len'} * 1.33;
456 print {$self->{'outhandle'}} "W3ImgPlug: Warning: adjusted max_text to $self->{'maxtext'}\n";
457 }
458
459 return 1;
460}
461
462# return number of files processed, undef if can't process
463# Note that $base_dir might be "" and that $file might
464# include directories
465sub read {
466 my ($self, $pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs) = (@_);
467 my ($doc_obj, $section, $filepath, $imgtag, $pos, $context, $numdocs, $tndir, $imgs);
468 # forward normal read (runs HTMLPlug if index_pages T)
469 my $ok = $self->SUPER::read($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs);
470 if ( ! $ok ) { return $ok }
471
472 my $outhandle = $self->{'outhandle'};
473 my $textref = $self->{'textref'};
474 my $htdoc_obj = $self->{'htdoc_obj'};
475 $numdocs = 0;
476 $base_dir =~ /(.*)\/.*/;
477 $tndir = "$1/archives/thumbnails"; # TODO: this path shouldn't be hardcoded?
478 &util::mk_all_dir($tndir) unless -e "$tndir";
479
480 $imgs = \%{$self->{'imglist'}};
481 my $nimgs = $self->get_img_list($textref);
482 $self->{'images_indexed'} = $nimgs;
483 if ( $nimgs > 0 ) {
484 my @fplist = (sort { $imgs->{$a}{'pos'} <=> $imgs->{$b}{'pos'} } keys %{$imgs});
485 my $i = 0;
486 foreach $filepath ( @fplist ) {
487 $pos = $imgs->{$filepath}{'pos'};
488 $context = substr ($$textref, $pos - 50, $pos + 50); # grab context (quicker)
489 ($imgtag) = ($context =~ /(<(?:img|a|body)\s[^>]*$filepath[^>]*>)/is );
490 if (! defined($imgtag)) { $imgtag = $filepath }
491 print $outhandle "W3ImgPlug: extracting $filepath\n"
492 if ( $self->{'verbosity'} > 1 );
493 $doc_obj = new doc ("", "indexed_doc");
494 $section = $doc_obj->get_top_section();
495 $prevpos = ( $i == 0 ? 0 : $imgs->{$fplist[$i - 1]}{'pos'});
496 $nextpos = ( $i >= ($nimgs -1) ? -1 : $imgs->{$fplist[$i + 1]}{'pos'} );
497
498 $self->extract_image_info($imgtag, $filepath, $textref, $doc_obj, $section, $tndir, $prevpos, $nextpos);
499 $processor->process($doc_obj);
500 $numdocs++;
501 $i++;
502 }
503 return $numdocs;
504 } else {
505 print $outhandle "W3ImgPlug: No images from $file indexed\n"
506 if ( $self->{'verbosity'} > 2 );
507 return 1;
508 }
509
510}
511
512# for every valid image tag
513# 1. extract related text and image metadata
514# 2. add this as document meta-data
515# 3. add assoc image(s) as files
516#
517sub extract_image_info {
518 my $self = shift (@_);
519 my ($tag, $id, $textref, $doc_obj, $section, $tndir, $prevpos, $nextpos) = (@_);
520 my ($filename, $orig_fp, $fn, $ext, $reltext, $relreltext, $crcid, $imgs,
521 $thumbfp, $pagetitle, $alttext, $filepath, $aggr);
522 $aggr = $self->{'aggressiveness'};
523 $imgs = \%{$self->{'imglist'}};
524 $filepath = $imgs->{$id}{'relpath'};
525 ($filename) = $filepath =~ /([^\/\\]+)$/s;
526 ($orig_fp) = "$self->{'base_path'}/$filepath";
527 $orig_fp =~ tr/+/ /;
528 $orig_fp =~ s/%([a-fA-F0-9][a-fA-F0-9])/pack("C", hex($1))/eg; # translate %2E to space, etc
529 $orig_fp =~ s/\\/\//g;
530 $filepath = "$self->{'htpath'}/$filepath";
531 ($onlyfn) = $filename =~ /([^\\\/]*)$/;
532 ($fn, $ext) = $onlyfn =~ /(.*)\.(.*)/;
533 $fn = lc $fn; $ext = lc $ext;
534 ($reltext) = "<tr><td>GifComment</td><td>" . `identify $filepath -ping -format "%c"` . "</td></tr>\n"
535 if ($ext eq "gif");
536 $reltext .= "<tr><td>FilePath</td><td>$orig_fp</td></tr>\n";
537
538 if ($ENV{'GSDLOS'} =~ /^windows$/i) {
539 $crcid = "$fn.$ext." . $self->{'next_crcid'}++;
540 } else { ($crcid) = `cksum $filepath` =~ /^(\d+)/; }
541 $thumbfp = "$tndir/tn_$crcid.jpg";
542 `convert -flatten -filter Hanning $self->{'img_convert_param'} -geometry "$self->{'thumbnail_size'}x$self->{'thumbnail_size'}>" $filepath $thumbfp` unless -e $thumbfp;
543 if ( ! (-e $thumbfp) ) {
544 print STDERR "W3ImgPlug: 'convert' failed. Check ImageMagicK binaries are installed and working correctly\n"; return 0;
545 }
546
547 # shove in full text (tag stripped or unstripped) if settings require it
548 if ( $aggr == 10) {
549 $reltext = "<tr><td>AllPage</td><td>" . $$textref . "</td><tr>\n"; # level 10 (all text, verbatim)
550 } else {
551 $pagetitle = $self->get_meta_value("title", $textref);
552 ($alttext) = $tag =~ /\salt\s*=\s*(?:\"|\')(.+?)(?:\"|\')/is;
553 if ( defined($alttext) && length($alttext) > 1) {
554 $reltext .= "<tr><td>ALTtext</td><td>$alttext</td></tr>\n"; }
555 $reltext .= "<tr><td>SplitCapitalisation</td><td>" .
556 $self->split_filepath($orig_fp) . "</td></tr>\n";
557
558 # get caption/tag based near text (if appropriate)
559 if ( $aggr > 1 ) {
560 if ( $aggr >= 2 ) {
561 $reltext .=
562 $self->extract_caption_text($tag, $textref, $prevpos, $imgs->{$id}{'pos'}, $nextpos);
563 $relreltext = $reltext;
564 }
565 # repeat the filepath, alt-text, caption, etc
566 if ( $aggr == 8 ) {
567 $reltext .= $relreltext;
568 }
569 if ( $aggr >= 3 ) {
570 $reltext .=
571 $self->extract_near_text($tag, $textref, $prevpos, $imgs->{$id}{'pos'}, $nextpos);
572 }
573
574 # get page metadata (if appropriate)
575 if ( $aggr >= 6 || ( $aggr >= 2 &&
576 ( $self->{'images_indexed'} < 2 ||
577 ($self->{'smallpage'} == 1 && $self->{'images_indexed'} < 6 )))) {
578 $reltext .= $self->get_page_metadata($textref);
579 }
580 # textual references
581 if ( $aggr == 5 || $aggr >= 7) {
582 if ( length($relreltext) > ($self->{'caption_len'} * 2) ) {
583 $reltext .= $self->get_textrefs($relreltext, $textref, $prevpos, $imgs->{$id}{'pos'}, $nextpos); }
584 else {
585 $reltext .= $self->get_textrefs($reltext, $textref, $prevpos, $imgs->{$id}{'pos'}, $nextpos);
586 }
587 }
588 } # aggr > 1
589 } # aggr != 10
590
591 $doc_obj->set_OID($crcid);
592 $doc_obj->associate_file($thumbfp, "$fn.thumb.jpg", undef, $section);
593 $doc_obj->add_metadata($section, "OriginalFilename", $filename);
594 $doc_obj->add_metadata($section, "FilePath", $orig_fp);
595 $doc_obj->add_metadata($section, "Filename", $fn);
596 $doc_obj->add_metadata($section, "FileExt", $ext);
597 $doc_obj->add_metadata($section, "FileSize", $imgs->{$id}{'filesize'});
598 $doc_obj->add_metadata($section, "Width", $imgs->{$id}{'width'});
599 $doc_obj->add_metadata($section, "Height", $imgs->{$id}{'height'});
600 $doc_obj->add_metadata($section, "URL", "http://$orig_fp");
601 $doc_obj->add_metadata($section, "PageURL", $self->{'hturl'});
602 $doc_obj->add_metadata($section, "PageTitle", $pagetitle);
603 $doc_obj->add_metadata($section, "ThumbURL",
604 "_httpcollection_/index/assoc/[archivedir]/$fn.thumb.jpg");
605
606 if ( $self->{'document_text'} ) {
607 $doc_obj->add_utf8_text($section, "<table border=1>\n$reltext</table>");
608 } else {
609 $doc_obj->add_metadata($section, "ImageText", "<table border=1>\n$reltext</table>\n");
610 }
611
612 if ( $self->{'index_pages'} ) {
613 my ($cache_url) = "_httpdoc_&d=" . $self->{'htdoc_obj'}->get_OID();
614 if ( $imgs->{$id}{'anchored'} ) {
615 my $a_name = $id;
616 $a_name =~ s/[\/\\\:\&]/_/g;
617 $cache_url .= "#gsdl_$a_name" ;
618 }
619 $doc_obj->add_utf8_metadata($section, "CachePageURL", $cache_url);
620 }
621 if ( ! $self->{'no_cache_images'} ) {
622 $onlyfn = lc $onlyfn;
623 $doc_obj->associate_file($filepath, $onlyfn, undef, $section);
624 $doc_obj->add_utf8_metadata($section, "CacheURL",
625 "_httpcollection_/index/assoc/[archivedir]/$onlyfn");
626 }
627 return 1;
628}
629
630sub get_page_metadata {
631 my ($self, $textref) = (@_);
632 my (@rval);
633 $rval[0] = $self->get_meta_value("title", $textref);
634 $rval[1] = $self->get_meta_value("keywords", $textref);
635 $rval[2] = $self->get_meta_value("description", $textref);
636 $rval[3] = $self->{'filename'};
637
638 return wantarray ? @rval : "<tr><td>PageMeta</td><td>@rval</td></tr>\n" ;
639}
640
641# turns LargeCatFish into Large,Cat,Fish so MG sees the separate words
642sub split_filepath {
643 my ($self, $filepath) = (@_);
644 my (@words) = $filepath =~ /([A-Z][a-z]+)/g;
645 return join(',', @words);
646}
647
648# finds and extracts sentences
649# that seem to be on the same topic
650# as other related text (correlations)
651# and textual references (e.g. in figure 3 ...)
652sub get_textrefs {
653 my ($self, $reltext, $textref, $prevpos, $pos, $nextpos) = (@_);
654 my ($maxtext, $mintext, $startpos, $context_size, $context);
655
656 my (@relwords, @refwords, %sentences, @pagemeta);
657
658 # extract larger context
659 $maxtext = $self->{'maxtext'};
660 $startpos = $pos - ($maxtext * 4);
661 $context_size = $maxtext*10;
662 if ($startpos < $prevpos ) { $startpos = $prevpos }
663 if ($nextpos != -1 && $context_size > ( $nextpos - $startpos )) { $context_size = ($nextpos - $startpos) }
664 $context = substr ( $$textref, $startpos, $context_size );
665 $context =~ s/<.*?>//gs;
666 $context =~ s/^.*>(.*)/$1/gs;
667 $context =~ s/(.*)<.*$/$1/gs;
668
669 # get page meta-data (if not already included)
670 if ( $self->{'aggressiveness'} == 5 && ! $self->{'smallpage'} ) {
671 @pagemeta = $self->get_page_metadata($textref);
672 foreach $value ( @pagemeta ) {
673 $context .= "$value."; # make each into psuedo-sentence
674 }
675 }
676
677 # TODO: this list is not exhaustive
678 @refwords = ( '(?:is|are)? ?(?:show(?:s|n)|demonstrate(?:d|s)|explains|features) (?:in|by|below|above|here)',
679 '(?:see)? (?:figure|table)? (?:below|above)');
680
681 # extract general references
682 foreach $rw ( @refwords ) {
683 while ( $context =~ /[\.\?\!\,](.*?$rw\W.*?[\.\?\!\,])/ig ) {
684 $sentence = $1;
685 $sentence =~ s/\s+/ /g;
686 $sentences{$sentence}+=2;
687 }
688 }
689 # extract specific (figure, table) references by number
690 my ($fignum) = $context =~ /[\.\?\!].*?(?:figure|table)s?[\-\_\ \.](\d+\w*)\W.*?[\.\?\!]/ig;
691 if ( $fignum ) {
692 foreach $rw ( @refwords ) {
693 while ( $context =~ /[\.\?\!](.*?(figure|table)[\-\_\ \.]$fignum\W.*?[\.\?\!])/ig ) {
694 $sentence = $1;
695 $sentence =~ s/\s+/ /g;
696 $sentences{$sentence}+=4;
697 }
698 }
699 }
700
701 # sentences with occurances of important words
702 @relwords = $reltext =~ /([a-zA-Z]{4,})/g; # take out small words
703 foreach $word ( @relwords ) {
704 if ( $self->{'stopwords'}{$word} ) { next } # skip stop words
705 while ( $context =~ /([^\.\?\!]*?$word\W.*?[\.\?\!])/ig ) {
706 $sentence = $1;
707 $sentence =~ s/\s+/ /g;
708 $sentences{$sentence}++;
709 }
710 }
711 foreach $sentence ( keys %sentences ) {
712 if ($sentences{$sentence} < $self->{'textref_threshold'}) {
713 delete $sentences{$sentence};
714 }
715 }
716 my ($rval) = join "<br>\n", (keys %sentences);
717 if ( $rval && length($rval) > 5 ) {
718 return ( "<tr><td>TextualReferences</td><td>" . $rval . "</td></tr>\n") }
719 else { return "" }
720}
721
722# handles caption extraction
723# calling the extractor with different
724# tags and choosing the best candidate caption
725sub extract_caption_text {
726 my ($self, $tag, $textref, $prevpos, $pos, $nextpos) = (@_);
727 my (@neartext, $len, $hdelim, $goodlen,
728 $startpos, $context, $context_size);
729
730 $mintext = $self->{'mintext'};
731 $goodlen = $self->{'caption_len'};
732
733 # extract a context to extract near text from (faster)
734 $context_size = $self->{'maxtext'}*3;
735 $startpos = $pos - ($context_size / 2);
736 if ($startpos < $prevpos ) { $startpos = $prevpos }
737 if ($nextpos != -1 && $context_size > ( $nextpos - $startpos ))
738 { $context_size = ($nextpos - $startpos) }
739
740 $context = substr ( $$textref, $startpos, $context_size );
741 $context =~ s/<!--.*?-->//gs;
742 $context =~ s/^.*-->(.*)/$1/gs;
743 $context =~ s/(.*)<!--.*$/$1/gs;
744
745 # try stepping through markup delimiter sets
746 # and selecting the best one
747 foreach $hdelim ( @{ $self->{'cdelims'} } ) {
748 @neartext = $self->extract_caption($tag, $hdelim, \$context);
749 $len = length(join("", @neartext));
750 last if ($len >= $mintext && $len <= $goodlen);
751 }
752 # reject if well over reasonable length
753 if ( $len > $goodlen ) {
754 @neartext = [];
755 }
756 $neartext[0] = " " if (! defined $neartext[0]);
757 $neartext[1] = " " if (! defined $neartext[1]);
758 return "<tr><td>Caption</td><td>" . (join ",", @neartext) . "</td></tr>\n"; # TODO: the | is for testing purposes
759} # end extract_caption_text
760
761# the previous section header often gives a bit
762# of context to the section that the image is
763# in (invariably the header is before/above the image)
764# so extract the text of the closest header above the image
765#
766# this fn just gets all the headers above the image, within the context window
767sub get_prev_header {
768 my ($self, $pos, $textref) = (@_);
769 my ($rhtext);
770 while ( $$textref =~ /<h\d>(.*?)<\/h\d>/sig ) {
771 # only headers before image
772 if ((pos $$textref) < $pos) {
773 $rhtext .= "$1, ";
774 }
775 }
776 if ( $rhtext ) { return "Header($rhtext)" }
777 else { return "" }
778}
779
780# not the most robust tag stripping
781# regexps (see perl.com FAQ) but good enough
782#
783# used by caption & tag-based near text algorithms
784sub strip_tags {
785 my ( $self, $value ) = @_;
786 if ( ! defined($value) ) { $value = "" } # handle nulls
787 else {
788 $value =~ s/<.*?>//gs; # strip all html tags
789 $value =~ s/\s+/\ /g; # remove extra whitespace
790 $value =~ s/\&\w+\;//g; # remove &nbsp; etc
791 }
792 return $value;
793}
794
795# uses the given tag(s) to identify
796# the caption near to the image
797# (below, above or both below and above)
798sub extract_caption {
799 my ($self, $tag, $bound_tag, $contextref) = (@_);
800 my (@nt, $n, $etag, $gotcap);
801 return ("", "") if ( ! ($$contextref =~ /\Q$tag/) );
802
803 $nt[0] = $`;
804 $nt[1] = $';
805 $gotcap = 0;
806
807 # look before the image for a boundary tag
808 ($etag, $nt[0]) = $nt[0] =~ /<($bound_tag)[\s]?.*?>(.*?)$/is;
809 # if bound_tag too far from the image, then prob not caption
810 # (note: have to allow for tags, so multiply by 3
811 if ( $etag && length($nt[0]) < ($self->{'caption_len'} * 3) ) {
812 if ( $nt[0] =~ /<\/$etag>/si ) {
813 # the whole caption is above the image: <tag>text</tag><img>
814 ($nt[0]) =~ /<(?:$etag)[\s]?.*?>(.*?)<\/$etag>/is;
815 $nt[0] = $self->strip_tags($nt[0]);
816 if ( length($nt[0]) > $self->{'mintext'} ) {
817 $gotcap = 1;
818 $nt[1] = "";
819 }
820
821 } elsif ( $nt[1] =~ /<\/$etag>/si) {
822 # the caption tag covers image: <tag>text?<img>text?</tag>
823 ($nt[1]) = $nt[1] =~ /(.*?)<\/$etag>/si;
824 $nt[0] = $self->strip_tags($nt[0] . $nt[1]);
825 if ( length($nt[0]) > $self->{'mintext'} ) {
826 $gotcap = 2;
827 $nt[1] = "";
828 }
829 }
830 }
831 # else try below the image
832 if ( ! $gotcap ) {
833 # the caption is after the image: <img><tag>text</tag>
834 ($etag, $nt[1]) = $nt[1] =~ /^.*?<($bound_tag)[\s]?.*?>(.*)/is;
835 if ( $etag && $nt[1] =~ /<\/$etag>/s) {
836 ($nt[1]) = $nt[1] =~ /(.*?)<\/$etag>/si;
837 $gotcap = 3;
838 $nt[0] = "";
839 $nt[1] = $self->strip_tags($nt[1]);
840 }
841 }
842 if ( ! $gotcap ) { $nt[0] = $nt[1] = "" }
843 else {
844 # strip part-tags
845 $nt[0] =~ s/^.*>//s;
846 $nt[1] =~ s/<.*$//s;
847 }
848 my ($type);
849 if ( $gotcap == 0 ) { return ("nocaption", "") }
850 elsif ( $gotcap == 1 ) { $type = "captionabove:" }
851 elsif ( $gotcap == 2 ) { $type = "captioncovering:" }
852 elsif ( $gotcap == 3 ) { $type = "captionbelow:" }
853 return ($type, $nt[0], $nt[1]);
854}
855
856# tag-based near text
857#
858# tries different tag sets
859# and chooses the best one
860sub extract_near_text {
861 my ($self, $tag, $textref, $prevpos, $pos, $nextpos) = (@_);
862 my (@neartext, $len, $hdelim, $maxtext, $mintext, $goodlen,
863 @bestlen, @best, $startpos, $context, $context_size,
864 $dist, $bdist, $best1, $i, $nt);
865 $bestlen[0] = $bestlen[1] = 0; $bestlen[2] = $bdist = 999999;
866 $best[0] = $best[1] = $best[2] = "";
867 $maxtext = $self->{'maxtext'};
868 $mintext = $self->{'mintext'};
869 $goodlen = $self->{'neartext_len'};
870
871 # extract a context to extract near text from (faster)
872 $context_size = $maxtext*4;
873 $startpos = $pos - ($context_size / 2);
874 if ($startpos < $prevpos ) { $startpos = $prevpos }
875 if ($nextpos != -1 && $context_size > ( $nextpos - $startpos ))
876 { $context_size = ($nextpos - $startpos) }
877 $context = substr ( $$textref, $startpos, $context_size );
878 $context =~ s/<!--.*?-->//gs;
879 $context =~ s/^.*-->(.*)/$1/gs;
880 $context =~ s/(.*)<!--.*$/$1/gs;
881
882 # try stepping through markup delimiter sets
883 # and selecting the best one
884 foreach $hdelim ( @{ $self->{'delims'} } ) {
885 @neartext = $self->extract_tagged_neartext($tag, $hdelim, \$context);
886 $nt = join("", @neartext);
887 $len = length($nt);
888 # Priorities:
889 # 1. Greater than mintext
890 # 2. Less than maxtext
891 # 3. Closest to goodlen
892 if ( $len <= $goodlen && $len > $bestlen[0] ) {
893 $bestlen[0] = $len;
894 $best[0] = $hdelim;
895 } elsif ( $len >= $maxtext && $len < $bestlen[2] ) {
896 $bestlen[2] = $len;
897 $best[2] = $hdelim;
898 } elsif ( $len >= $bestlen[0] && $len <= $bestlen[2] ) {
899 $dist = abs($goodlen - $len);
900 if ( $dist < $bdist ) {
901 $bestlen[1] = $len;
902 $best[1] = $hdelim;
903 $bdist = $dist;
904 }
905 }
906 }
907 $best1 = 2;
908 foreach $i ( 0..2 ) {
909 if ( $bestlen[$i] == 999999 ) { $bestlen[$i] = 0 }
910 $dist = abs($goodlen - $bestlen[$i]);
911 if ( $bestlen[$i] > $mintext && $dist <= $bdist ) {
912 $best1 = $i;
913 $bdist = $dist;
914 }
915 }
916 @neartext = $self->extract_tagged_neartext($tag, $best[$best1], \$context);
917 if ( $bestlen[$best1] > $maxtext ) {
918 # truncate on word boundary if too much text
919 my $hmax = $maxtext / 2;
920 ($neartext[0]) = $neartext[0] =~ /([^\s]*.{1,$hmax})$/s;
921 ($neartext[1]) = $neartext[1] =~ /^(.{1,$hmax}[^\s]*)/s;
922 } elsif ( $bestlen[$best1] < $mintext ) {
923 # use plain text extraction if tags failed (e.g. usable tag outside context)
924 print {$self->{'outhandle'}} "W3ImgPlug: Fallback to plain-text extraction for $tag\n"
925 if $self->{'verbosity'} > 2;
926 $neartext[0] = "<tr><td>RawNeartext</td><td>" . $self->extract_raw_neartext($tag, $textref) . "</td></tr>";
927 $neartext[1] = "";
928 }
929 # get previous header if available
930 $neartext[0] .= "<br>\n" .
931 $self->get_prev_header($pos, \$context) if ( $self->{'aggressiveness'} >= 4 );
932 $neartext[0] = " " if (! defined $neartext[0]);
933 $neartext[1] = " " if (! defined $neartext[1]);
934
935 return "<tr><td>NearText</td><td>" . (join "|", @neartext) . "</td></tr>\n"; # TODO: the | is for testing purposes
936} # end extract_near_text
937
938# actually captures tag-based
939# near-text given a tag set
940sub extract_tagged_neartext {
941 my ($self, $tag, $bound_tag, $textref) = (@_);
942 return "" if ( ! ($$textref =~ /\Q$tag/) );
943 my (@nt, $delim, $pre_tag, $n);
944 $nt[0] = $`;
945 $nt[1] = $';
946
947 # get text after previous image tag
948 $nt[0] =~ s/.*<($bound_tag)[^>]*>(.*)/$2/is; # get rid of preceding text
949 if (defined($1)) { $delim = $1 }
950 $pre_tag = $bound_tag;
951
952 if (defined($delim)) {
953 # we want to try and use the end tag of the previous delimiter
954 # (put it on the front of the list)
955 $pre_tag =~ s/(^|\|)($delim)($|\|)//i; # take it out
956 $pre_tag =~ s/\|\|/\|/i; # replace || with |
957 $pre_tag = $delim . "|" . $pre_tag; # put it on the front
958 }
959
960 # get text before next image tag
961 $nt[1] =~ s/<\/?(?:$pre_tag)[^>]*>.*//is; # get rid of stuff after first delimiter
962
963 # process related text
964 for $n (0..1) {
965 if ( defined($nt[$n]) ) {
966 $nt[$n] =~ s/<.*?>//gs; # strip all html tags
967 $nt[$n] =~ s/\s+/\ /gs; # remove extra whitespace
968 $nt[$n] =~ s/\&\w+\;//sg; # remove &nbsp; etc
969 # strip part-tags
970 if ( $n == 0 ) { $nt[0] =~ s/^.*>//s }
971 if ( $n == 1 ) { $nt[1] =~ s/<.*$//s }
972 } else { $nt[$n] = ""; } # handle nulls
973 }
974 return @nt;
975}
976
977# this function is fall-back
978# if tags aren't suitable.
979#
980# extracts a fixed length of characters
981# either side of image tag (on word boundary)
982sub extract_raw_neartext {
983 my ($self, $tag, $textref) = (@_);
984 my ($rawtext, $startpos, $fp);
985 my $imgs = \%{$self->{'imglist'}};
986 ($fp) = $tag =~ /([\w\\\/]+\.(?:gif|jpe?g|png))/is;
987 if (! $fp) { return " " };
988 # if the cached, plain-text version isn't there, then create it
989 $self->init_plaintext($textref) unless defined($self->{'plaintext'});
990
991 # take the closest maxtext/2 characters
992 # either side of the tag (by word boundary)
993 return "" if ( ! exists $imgs->{$fp}{'rawpos'} );
994 $startpos = $imgs->{$fp}{'rawpos'} - (($self->{'maxtext'} / 2) + 20);
995 if ( $startpos < 0 ) { $startpos = 0 }
996 $rawtext = substr $self->{'plaintext'}, $startpos, $self->{'maxtext'} + 20;
997 $rawtext =~ s/\s\s/ /g;
998
999 return $rawtext;
1000}
1001
1002# init plaintext variable for HTML-stripped version
1003# (for full text index/raw assoc text extraction)
1004sub init_plaintext {
1005 my ($self, $textref) = (@_);
1006 my ($page, $fp);
1007 my $imgs = \%{$self->{'imglist'}};
1008 $page = $$textref; # make a copy of original
1009
1010 # strip tags around image filenames so they don't get zapped
1011 $page =~ s/<\w+\s+.*?([\w\/\\]+\.(?:gif|jpe?g|png))[^>]*>/\"$1\"/gsi;
1012 $page =~ s/<.*?>//gs;
1013 $page =~ s/&nbsp;/ /gs;
1014 $page =~ s/&amp;/&/gs; #TODO: more &zzz; replacements (except &lt;, $gt;)
1015
1016 # get positions and strip images
1017 while ( $page =~ /([^\s\'\"]+\.(jpe?g|gif|png))/ig ) {
1018 $fp = $1;
1019 if ( $imgs->{$fp}{'exists'} ) {
1020 $imgs->{$fp}{'rawpos'} = pos $page;
1021 }
1022 $page =~ s/\"$fp\"//gs;
1023 }
1024 $self->{'plaintext'} = $page;
1025}
1026
1027# finds and filters images based on size
1028# (dimensions, height, filesize) and existence
1029#
1030# looks for image filenames (.jpg, .gif, etc)
1031# and checks for existence on disk
1032# (hence supports most JavaScript images)
1033sub get_img_list {
1034 my $self = shift (@_);
1035 my ($textref) = (@_);
1036 my ($filepath, $relpath, $abspath, $pos, $num, $width, $height, $filesize);
1037 my $imgs = \%{$self->{'imglist'}};
1038 while ( $$textref =~ /([^\s\'\"]+\.(jpe?g|gif|png))/ig ) {
1039 $filepath = $1;
1040 $pos = pos $$textref;
1041 next if ( $imgs->{$filepath}{'relpath'} );
1042 $relpath = $filepath;
1043 $relpath =~ s/^http\:\/\///; # remove http:// in case we have mirrored it
1044 $relpath =~ s/\\/\//g; # replace \ with /
1045 $relpath =~ s/^\.\///s; # make "./filepath" into "filepath"
1046 $imgs->{$filepath}{'relpath'} = $relpath;
1047 $abspath = "$self->{'htpath'}/$relpath";
1048
1049 if (! -e $abspath) { next }
1050
1051 # can't modify real filepath var because it
1052 # then can't be located in the page for tag recognition later
1053 ($width, $height) =
1054 `identify $abspath -ping -format "%wx%h"` =~ /^(\d*)x(\d*)$/m;
1055 if (! ($width && $height)) {
1056 print STDERR "W3ImgPlug: ($abspath) 'identify' failed. Check ImageMagicK binaries are installed and working correctly\n"; next;
1057 }
1058 $filesize = (-s $abspath);
1059 if ( $filesize >= $self->{'min_img_filesize'}
1060 && ( $width >= $self->{'min_img_width'} )
1061 && ( $height >= $self->{'min_img_height'} ) ) {
1062
1063 $imgs->{$filepath}{'exists'} = 1;
1064 $imgs->{$filepath}{'pos'} = $pos;
1065 $imgs->{$filepath}{'width'} = $width;
1066 $imgs->{$filepath}{'height'} = $height;
1067 $imgs->{$filepath}{'filesize'} = $filesize;
1068 } else {
1069 print {$self->{'outhandle'}} "W3ImgPlug: skipping $self->{'base_path'}/$relpath: $filesize, $width x $height\n"
1070 if $self->{'verbosity'} > 2;
1071 }
1072 }
1073 $num = 0;
1074 foreach $i ( keys %{$imgs} ) {
1075 if ( $imgs->{$i}{'pos'} ) {
1076 $num++;
1077 } else { delete $imgs->{$i} }
1078 }
1079 return $num;
1080}
1081
1082# make the text available to the read function
1083# by making it an object variable
1084sub read_file {
1085 my ($self, $filename, $encoding, $language, $textref) = @_;
1086 $self->SUPER::read_file($filename, $encoding, $language, $textref);
1087
1088 # if HTMLplug has run through, then it will
1089 # have replaced references so we have to
1090 # make a copy of the text before processing
1091 if ( $self->{'index_pages'} ) {
1092 $self->{'text'} = $$textref;
1093 $self->{'textref'} = \($self->{'text'});
1094 } else {
1095 $self->{'textref'} = $textref;
1096 }
1097 $self->{'plaintext'} = undef;
1098}
1099
1100# HTMLPlug only extracts meta-data if it is specified in plugin options
1101# hence a special function to do it here
1102sub get_meta_value {
1103 my ($self, $name, $textref) = @_;
1104 my ($value);
1105 $name = lc $name;
1106 if ($name eq "title") {
1107 ($value) = $$textref =~ /<title>(.*?)<\/title>/is
1108 } else {
1109 my $qm = "(?:\"|\')";
1110 ($value) = $$textref =~ /<meta name\s*=\s*$qm?$name$qm?\s+content\s*=\s*$qm?(.*?)$qm?\s*>/is
1111 }
1112 $value = "" unless $value;
1113 return $value;
1114}
1115
1116# make filename an anchor reference
1117# so we can go straight to the image
1118# within the cached version of the source page
1119# (augment's HTMLPlug sub)
1120sub replace_images {
1121 my $self = shift (@_);
1122 my ($front, $link, $back, $base_dir,
1123 $file, $doc_obj, $section) = @_;
1124 $link =~ s/\"//g;
1125 my ($a_name) = $link;
1126 $a_name =~ s/[\/\\\:\&]/_/g;
1127 # keep a list so we don't repeat the same anchor
1128 if ( ! $self->{'imglist'}{$link}{'anchored'} ) {
1129 $front = "<a name=\"gsdl_$a_name\">$front";
1130 $back = "$back</a>";
1131 $self->{'imglist'}{$link}{'anchored'} = 1;
1132 }
1133 return $self->SUPER::replace_images($front, $link, $back, $base_dir,
1134 $file, $doc_obj, $section);
1135}
1136
11371;
Note: See TracBrowser for help on using the repository browser.