Context Navigation

source: trunk/gsdl/perllib/plugins/W3ImgPlug.pm@ 7243

Last change on this file since 7243 was 6408, checked in by jmt12, 20 years ago
Added two new attributes for script arguments. HiddenGLI controls whether the argument will be visible at all in GLI, while ModeGLI defines the lowest detail mode under which the argument will be visible (only really for import and buildcol). Also ensured that the scripts were reporting their correct default process expressions, and further refined argument types by adding the catagory regexp for any regular expression (which can then be hidden under lower detail modes in GLI)
Property svn:keywords set to `Author Date Id Revision`
File size: 41.0 KB

Line
1	###########################################################################
2	#
3	# W3ImgPlug.pm -- Context-based image indexing plugin for HTML documents
4	#
5	# A component of the Greenstone digital library software
6	# from the New Zealand Digital Library Project at the
7	# University of Waikato, New Zealand.
8	#
9	# Copyright (C) 2001 New Zealand Digital Library Project
10	#
11	# This program is free software; you can redistribute it and/or modify
12	# it under the terms of the GNU General Public License as published by
13	# the Free Software Foundation; either version 2 of the License, or
14	# (at your option) any later version.
15	#
16	# This program is distributed in the hope that it will be useful,
17	# but WITHOUT ANY WARRANTY; without even the implied warranty of
18	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	# GNU General Public License for more details.
20	#
21	# You should have received a copy of the GNU General Public License
22	# along with this program; if not, write to the Free Software
23	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24	#
25	###########################################################################
26
27	# DESCRIPTION:
28	#
29	# Extracts images and associated text and metadata from
30	# web pages as individual documents for indexing. Thumbnails
31	# are created from each image for browsing purposes.
32	#
33	# Options are available for configuring the aggressiveness of the
34	# associated text extraction mechanisms. A higher level of
35	# aggressiveness will extract more text and consequently
36	# may mean lower accuracy (precision); however, it may also
37	# retrieve more of the relevant images from the collection (recall).
38	# Lower levels of aggressiveness maybe result in slightly faster
39	# collection builds at the import stage.
40	#
41	# W3ImgPlug is a subclass of HTMLPlug (i.e. it will index pages also
42	# if required). It can be used in place of HTMLPlug to index both
43	# pages and their images.
44	#
45	# REQUIREMENTS:
46	#
47	# The ImageMagick image manipulation is used to create
48	# thumbnails and extract some image metadata. (Available
49	# from http://www.imagemagick.org/)
50	#
51	# Unix:
52	# Many Linux distributions contain ImageMagick.
53	#
54	# Windows:
55	# ImageMagick can be downloaded from the website above.
56	# Make sure the system path includes the ImageMagick binaries
57	# before using W3ImgPlug.
58	#
59	# NOTE: NT/2000/XP contain a filesystem utility 'convert.exe'
60	# with the same name as the image conversion utility. The
61	# ImageMagick FAQ recommends renaming the filesystem
62	# utility (e.g. to 'fsconvert.exe') to avoid this clash.
63	#
64	# USAGE:
65	#
66	# An image document consists of metadata elements:
67	#
68	# OriginalFilename, FilePath, Filename, FileExt, FileSize,
69	# Width, Height, URL, PageURL, ThumbURL, CacheURL, CachePageURL
70	# ImageText, PageTitle
71	#
72	# Most of these are only useful in format strings (e.g. ThumbURL,
73	# Filename, URL, PageURL, CachePageURL).
74	#
75	# ImageText, as the name suggests contains the indexable text.
76	# (unless using the -document_text plugin option)
77	#
78	# Since image documents are made up of metadata elements
79	# alone, format strings are needed to display them properly.
80	# NOTE: The receptionist will only display results (e.g. thumbnails)
81	# in 4 columns if the format string begins with "<td><table>".
82	#
83	# The example below takes the user to the image within the
84	# source HTML document rather than using a format string
85	# on DocumentText to display the image document itself.
86	#
87	# Example collect.cfg:
88	#
89	# ...
90	#
91	# indexes document:ImageText document:text
92	# defaultindex document:ImageText
93	#
94	# collectionmeta .document:ImageText "images"
95	# collectionmeta .document:text "documents"
96	#
97	# ...
98	#
99	# plugin W3ImgPlug -index_pages -aggressiveness 6
100	#
101	# ...
102	#
103	# format SearchVList '<td>{If}{[Title],[link][icon] [Title][[/link],
104	# <table><tr><td align="center"><a href="[CachePageURL]">
105	# <img src="[ThumbURL]"></a></td></tr><tr><td align="center">
106	# <a href="[CachePageURL]"><font size="-1">[OriginalFilename]</font></a>
107	# <br>[Width]x[Height]</td></tr></table>}</td>'
108	#
109	# ...
110	#
111
112	package W3ImgPlug;
113
114	use HTMLPlug;
115	use ghtml;
116	use unicode;
117	use util;
118	use parsargv;
119	use strict 'subs';
120
121	sub BEGIN {
122	@ISA = qw( HTMLPlug );
123	}
124
125	my $aggressiveness_list =
126	[ { 'name' => "1",
127	'desc' => "{W3ImgPlug.aggressiveness.1}" },
128	{ 'name' => "2",
129	'desc' => "{W3ImgPlug.aggressiveness.2}" },
130	{ 'name' => "3",
131	'desc' => "{W3ImgPlug.aggressiveness.3}" },
132	{ 'name' => "4",
133	'desc' => "{W3ImgPlug.aggressiveness.4}" },
134	{ 'name' => "5",
135	'desc' => "{W3ImgPlug.aggressiveness.5}" },
136	{ 'name' => "6",
137	'desc' => "{W3ImgPlug.aggressiveness.6}" },
138	{ 'name' => "7",
139	'desc' => "{W3ImgPlug.aggressiveness.7}" },
140	{ 'name' => "8",
141	'desc' => "{W3ImgPlug.aggressiveness.8}" },
142	{ 'name' => "9",
143	'desc' => "{W3ImgPlug.aggressiveness.9}" } ];
144
145	my $arguments =
146	[ { 'name' => "aggressiveness",
147	'desc' => "{W3ImgPlug.aggressiveness}",
148	'type' => "int",
149	'list' => $aggressiveness_list,
150	'deft' => "3",
151	'reqd' => "no" },
152	{ 'name' => "index_pages",
153	'desc' => "{W3ImgPlug.index_pages}",
154	'type' => "flag",
155	'reqd' => "no" },
156	{ 'name' => "no_cache_images",
157	'desc' => "{W3ImgPlug.no_cache_images}",
158	'type' => "flag",
159	'reqd' => "no" },
160	{ 'name' => "min_size",
161	'desc' => "{W3ImgPlug.min_size}",
162	'type' => "int",
163	'deft' => "2000",
164	'reqd' => "no" },
165	{ 'name' => "min_width",
166	'desc' => "{W3ImgPlug.min_width}",
167	'type' => "int",
168	'deft' => "50",
169	'reqd' => "no" },
170	{ 'name' => "min_height",
171	'desc' => "{W3ImgPlug.min_height}",
172	'type' => "int",
173	'deft' => "50",
174	'reqd' => "no" },
175	{ 'name' => "thumb_size",
176	'desc' => "{W3ImgPlug.thumb_size}",
177	'type' => "int",
178	'deft' => "100",
179	'reqd' => "no" },
180	{ 'name' => "convert_params",
181	'desc' => "{W3ImgPlug.convert_params}",
182	'type' => "string",
183	'deft' => "",
184	'reqd' => "no" },
185	{ 'name' => "min_near_text",
186	'desc' => "{W3ImgPlug.min_near_text}",
187	'type' => "int",
188	'deft' => "10",
189	'reqd' => "no" },
190	{ 'name' => "max_near_text",
191	'desc' => "{W3ImgPlug.max_near_text}",
192	'type' => "int",
193	'deft' => "400",
194	'reqd' => "no" },
195	{ 'name' => "smallpage_threshold",
196	'desc' => "{W3ImgPlug.smallpage_threshold}",
197	'type' => "int",
198	'deft' => "2048",
199	'reqd' => "no" },
200	{ 'name' => "textrefs_threshold",
201	'desc' => "{W3ImgPlug.textrefs_threshold}",
202	'type' => "int",
203	'deft' => "2",
204	'reqd' => "no" },
205	{ 'name' => "caption_length",
206	'desc' => "{W3ImgPlug.caption_length}",
207	'type' => "int",
208	'deft' => "80",
209	'reqd' => "no" },
210	{ 'name' => "neartext_length",
211	'desc' => "{W3ImgPlug.neartext_length}",
212	'type' => "int",
213	'deft' => "300",
214	'reqd' => "no" },
215	{ 'name' => "document_text",
216	'desc' => "{W3ImgPlug.document_text}",
217	'type' => "flag",
218	'reqd' => "no" } ];
219
220	my $options = { 'name' => "W3ImgPlug",
221	'desc' => "{W3ImgPlug.desc}",
222	'abstract' => "no",
223	'inherits' => "yes",
224	'args' => $arguments };
225
226
227	# sub print_usage {
228	# print STDERR "\nUsage: plugin W3ImgPlug [options]\n\n";
229	# print STDERR " options:\n";
230	# print STDERR " -aggressiveness Range of related text extraction techniques to use [4]\n";
231	# print STDERR " 1: Filename, path, ALT text only\n";
232	# print STDERR " 2: All of 1, plus caption where available\n";
233	# print STDERR " 3: All of 2, plus near paragraphs where available\n";
234	# print STDERR " 4: All of 3, plus previous headers (<h1>, <h2>...)\n";
235	# print STDERR " where available\n";
236	# print STDERR " 5: All of 4, plus textual references where available\n";
237	# print STDERR " 6: All of 4, plus page metatags (title, keywords, etc)\n";
238	# print STDERR " 7: All of 6, 5 and 4 combined\n";
239	# print STDERR " 8: All of 7, plus repeat caption, filename, etc (raise \n";
240	# print STDERR " ranking of more relevant results)\n";
241	# print STDERR " 10: All of 1, plus full text of source page\n";
242	# print STDERR "\n";
243	# print STDERR " -no_cache_images Don't cache images (point to URL of original)\n";
244	# print STDERR " -index_pages Index the pages along with the images.\n";
245	# print STDERR " Otherwise reference the pages at the source URL\n";
246	# print STDERR " -min_size Bytes. Skip images smaller than this [2000]\n";
247	# print STDERR " -min_width Pixels. Skip images narrower than this [50 pixels]\n";
248	# print STDERR " -min_height Pixels. Skip images shorter than this [50 pixels]\n";
249	# print STDERR " -thumb_size Max thumbnail size. Both width and height [100 pixels]\n";
250	# print STDERR " -convert_params Additional parameters for ImageMagicK convert on\n";
251	# print STDERR " thumbnail creation. For example, '-raise' will give\n";
252	# print STDERR " a three dimensional effect to thumbnail images.\n";
253	# print STDERR " -document_text Add image text as document:text (otherwise IndexedText\n";
254	# print STDERR " metadata field)\n";
255	# print STDERR "\n";
256	# print STDERR " Advanced Options (applicability depends on aggressiveness level)\n";
257	# print STDERR " -smallpage_threshold Images on pages smaller than this (bytes) will have\n";
258	# print STDERR " the page (title, keywords, etc) meta-data added [2048]\n";
259	# print STDERR " -textrefs_threshold Threshold for textual references. Lower values mean\n";
260	# print STDERR " the algorithm is less strict [2]\n";
261	# print STDERR " -caption_length Maximum length of captions (in characters) [100]\n";
262	# print STDERR " -neartext_length Target length of near text (in characters) [300]\n";
263	# print STDERR " -max_near_text Maximum characters near images to extract [400]\n";
264	# print STDERR " -min_near_text Minimum characters of near text or caption to extract [10]\n";
265	# print STDERR "\n";
266	# print STDERR " Tag set configuration file (XML format):\n";
267	# print STDERR " <collectionpath>/etc/W3ImgPlug.cfg \n";
268	# print STDERR "\n";
269	# print STDERR "\n";
270	# print STDERR "W3ImgPlug inherits all of HTMLPlug's functionality and options:\n";
271	# HTMLPlug::print_usage();
272	# print STDERR "\n";
273	# }
274
275	sub new {
276	my $class = shift (@_);
277	my $self = new HTMLPlug ($class, @_);
278	$self->{'plugin_type'} = "W3ImgPlug";
279	# 14-05-02 To allow for proper inheritance of arguments - John Thompson
280	my $option_list = $self->{'option_list'};
281	push( @{$option_list}, $options );
282
283	if (!parsargv::parse(\@_,
284	q^aggressiveness/\d/3^, \$self->{'aggressiveness'},
285	q^index_pages^, \$self->{'index_pages'},
286	q^no_cache_images^, \$self->{'no_cache_images'},
287	q^min_size/\d*/2000^, \$self->{'min_img_filesize'},
288	q^min_width/\d*/50^, \$self->{'min_img_width'},
289	q^min_height/\d*/50^, \$self->{'min_img_height'},
290	q^thumb_size/\d*/100^, \$self->{'thumbnail_size'},
291	q^convert_params/.*/ ^, \$self->{'img_convert_param'},
292	q^max_near_text/\d*/400^, \$self->{'maxtext'},
293	q^min_near_text/\d*/10^, \$self->{'mintext'},
294	q^smallpage_threshold/\d*/2048^, \$self->{'smallpage_threshold'},
295	q^textrefs_threshold/\d*/2^, \$self->{'textref_threshold'},
296	q^caption_length/\d*/80^, \$self->{'caption_len'},
297	q^neartext_length/\d*/300^, \$self->{'neartext_len'},
298	q^document_text^, \$self->{'document_text'},
299	"allow_extra_options"
300	)) {
301
302	print STDERR "\nIncorrect options passed to W3ImgPlug, check your collect.cfg configuration file\n";
303	$self->print_txt_usage(""); # Use default resource bundle
304	die "\n";
305	}
306
307	# init class variables
308	$self->{'textref'} = undef; # init by read_file fn
309	$self->{'htdoc_obj'} = undef; # init by process fn
310	$self->{'htpath'} = undef; # init by process fn
311	$self->{'hturl'} = undef; # init by process fn
312	$self->{'plaintext'} = undef; # HTML stripped version - only init if needed by raw_neartext sub
313	$self->{'smallpage'} = 0; # set by process fn
314	$self->{'images_indexed'} = undef; # num of images indexed - if 1 or 2 then we know page is small
315	$self->{'initialised'} = undef; # flag (see set_extraction_options())
316
317	return bless $self, $class;
318	}
319
320	# if indexing pages, let HTMLPlug do it's stuff
321	# image extraction done through read()
322	sub process {
323	my ($self, $textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
324	$self->{'imglist'} = ();
325	if ( $self->{'index_pages'} ) {
326	my $ok = $self->SUPER::process($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj);
327	if ( ! $ok ) { return $ok }
328	$self->{'htdoc_obj'} = $doc_obj;
329	}
330	# else use URL for referencing
331	#if ( $file =~ /(.*)[\/\\]/ ) { $self->{'htpath'} = $1; } else { $self->{'htpath'} = $file; }
332
333	$self->{'htpath'} = $base_dir if (-d $base_dir);
334	if ( $file =~ /(.*)[\/\\]/ ) { $self->{'htpath'} .= "/$1"; }
335	$self->{'htpath'} =~ s/\\/\//g; # replace \ with /
336
337	$self->{'hturl'} = "http://$file";
338	$self->{'hturl'} =~ s/\\/\//g; # for windows
339	($self->{'filename'}) = $file =~ /.[\/\\](.)/;
340	($self->{'base_path'}) = $file =~ /(.*)[\/\\]/i;
341	if ( ( -s "$base_dir/$file") <= $self->{'smallpage_threshold'} ) {
342	$self->{'smallpage'} = 1;
343	} else { $self->{'smallpage'} = 0; }
344
345	if ( defined($self->{'initialised'}) ) { return 1; }
346	else {
347	$self->{'initialised'} = $self->set_extraction_options($base_dir =~ /^(.*?)\/import/i);
348	return $self->{'initialised'};
349	}
350	}
351
352	# get complex configuration options from configuration files
353	# -- $GSDLCOLLECTION/etc/W3ImgPlug.cfg (tag sets for aggr 2+)
354	# -- $GSDLHOME/etc/packages/phind/stopword/en/brown.sw (stopwords for aggr 5+)
355
356	# If there's no W3ImgPlug.cfg file we'll use the following default values
357	my $defaultcfg = '
358	<delimitertagset>
359	<setname>Caption</setname>
360	<taggroup>font</taggroup>
361	<taggroup>tt</taggroup>
362	<taggroup>small</taggroup>
363	<taggroup>b</taggroup>
364	<taggroup>i</taggroup>
365	<taggroup>u</taggroup>
366	<taggroup>em</taggroup>
367	<taggroup>td</taggroup>
368	<taggroup>li</taggroup>
369	<taggroup>a</taggroup>
370	<taggroup>p</taggroup>
371	<taggroup>tr</taggroup>
372	<taggroup>center</taggroup>
373	<taggroup>div</taggroup>
374	<taggroup>caption</taggroup>
375	<taggroup>br</taggroup>
376	<taggroup>ul</taggroup>
377	<taggroup>ol</taggroup>
378	<taggroup>table</taggroup>
379	<taggroup>hr</taggroup>
380	</delimitertagset>
381
382	<delimitertagset>
383	<setname>Neartext</setname>
384	<taggroup>tr\|hr\|table\|h\d\|img\|body</taggroup>
385	<taggroup>td\|tr\|hr\|table\|h\d\|img\|body</taggroup>
386	<taggroup>p\|br\|td\|tr\|hr\|table\|h\d\|img\|body</taggroup>
387	<taggroup>font\|p\|i\|b\|em\|img</taggroup>
388	</delimitertagset>
389	';
390
391	sub set_extraction_options() {
392	my ($self, $collpath) = @_;
393	my ($filepath);
394
395	print {$self->{'outhandle'}} "W3ImgPlug: Initialising\n"
396	if $self->{'verbosity'} > 1;
397	# etc/W3ImgPlug.cfg (XML)
398	# tag sets for captions and neartext
399	if ( $self->{'aggressiveness'} > 1 && $self->{'aggressiveness'} != 9 ) {
400	$self->{'delims'} = [];
401	$self->{'cdelims'} = [];
402	my ($cfg, @tagsets, $tagset, $type, @delims);
403
404	$filepath = "$collpath/etc/W3ImgPlug.cfg";
405	if ( open CFG, "<$filepath" ) {
406	while (<CFG>) { $cfg .= $_ }
407	close CFG;
408	} else {
409	$cfg = $defaultcfg;
410	}
411
412	(@tagsets) =
413	$cfg =~ /<delimitertagset>(.*?)<\/delimitertagset>/igs;
414	foreach $tagset ( @tagsets ) {
415	($type) = $tagset =~ /<setname>(.*?)<\/setname>/i;
416	if ( lc($type) eq "caption" ) {
417	(@{$self->{'cdelims'}}) = $tagset =~ /<taggroup>(.*?)<\/taggroup>/igs;
418	}
419	elsif ( lc($type) eq "neartext" ) {
420	(@{$self->{'delims'}}) = $tagset =~ /<taggroup>(.*?)<\/taggroup>/igs;
421	}
422	}
423
424	# output a warning if there seem to be no delimiters
425	if ( scalar(@{$self->{'cdelims'}} == 0)) {
426	print {$self->{'outhandle'}} "W3ImgPlug: Warning: no caption delimiters found in $filepath\n";
427	}
428	if ( scalar(@{$self->{'delims'}} == 0)) {
429	print {$self->{'outhandle'}} "W3ImgPlug: Warning: no neartext delimiters found in $filepath\n";
430	}
431	}
432
433	# get stop words for textual reference extraction
434	# TODO: warnings scroll off. Would be best to output them again at end of import
435	if ( $self->{'aggressiveness'} >=5 && $self->{'aggressiveness'} != 9 ) {
436	$self->{'stopwords'} = ();
437	$filepath = &util::filename_cat($ENV{'GSDLHOME'}, "etc", "packages", "phind", "stopword", "en", "brown.sw");
438	if ( open STOPWORDS, "<$filepath" ) {
439	while ( <STOPWORDS> ) {
440	chomp;
441	$self->{'stopwords'}{$_} = 1;
442	}
443	close STOPWORDS;
444	} else {
445	print {$self->{'outhandle'}} "W3ImgPlug: Warning: couldn't open stopwords file at $filepath ($!)\n";
446	}
447
448	}
449
450	if ( $self->{'neartext_len'} > $self->{'maxtext'} ) {
451	$self->{'maxtext'} = $self->{'neartext_len'} * 1.33;
452	print {$self->{'outhandle'}} "W3ImgPlug: Warning: adjusted max_text to $self->{'maxtext'}\n";
453	}
454	if ( $self->{'caption_len'} > $self->{'maxtext'} ) {
455	$self->{'maxtext'} = $self->{'caption_len'} * 1.33;
456	print {$self->{'outhandle'}} "W3ImgPlug: Warning: adjusted max_text to $self->{'maxtext'}\n";
457	}
458
459	return 1;
460	}
461
462	# return number of files processed, undef if can't process
463	# Note that $base_dir might be "" and that $file might
464	# include directories
465	sub read {
466	my ($self, $pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs) = (@_);
467	my ($doc_obj, $section, $filepath, $imgtag, $pos, $context, $numdocs, $tndir, $imgs);
468	# forward normal read (runs HTMLPlug if index_pages T)
469	my $ok = $self->SUPER::read($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs);
470	if ( ! $ok ) { return $ok }
471
472	my $outhandle = $self->{'outhandle'};
473	my $textref = $self->{'textref'};
474	my $htdoc_obj = $self->{'htdoc_obj'};
475	$numdocs = 0;
476	$base_dir =~ /(.)\/./;
477	$tndir = "$1/archives/thumbnails"; # TODO: this path shouldn't be hardcoded?
478	&util::mk_all_dir($tndir) unless -e "$tndir";
479
480	$imgs = \%{$self->{'imglist'}};
481	my $nimgs = $self->get_img_list($textref);
482	$self->{'images_indexed'} = $nimgs;
483	if ( $nimgs > 0 ) {
484	my @fplist = (sort { $imgs->{$a}{'pos'} <=> $imgs->{$b}{'pos'} } keys %{$imgs});
485	my $i = 0;
486	foreach $filepath ( @fplist ) {
487	$pos = $imgs->{$filepath}{'pos'};
488	$context = substr ($$textref, $pos - 50, $pos + 50); # grab context (quicker)
489	($imgtag) = ($context =~ /(<(?:img\|a\|body)\s[^>]$filepath[^>]>)/is );
490	if (! defined($imgtag)) { $imgtag = $filepath }
491	print $outhandle "W3ImgPlug: extracting $filepath\n"
492	if ( $self->{'verbosity'} > 1 );
493	$doc_obj = new doc ("", "indexed_doc");
494	$section = $doc_obj->get_top_section();
495	$prevpos = ( $i == 0 ? 0 : $imgs->{$fplist[$i - 1]}{'pos'});
496	$nextpos = ( $i >= ($nimgs -1) ? -1 : $imgs->{$fplist[$i + 1]}{'pos'} );
497
498	$self->extract_image_info($imgtag, $filepath, $textref, $doc_obj, $section, $tndir, $prevpos, $nextpos);
499	$processor->process($doc_obj);
500	$numdocs++;
501	$i++;
502	}
503	return $numdocs;
504	} else {
505	print $outhandle "W3ImgPlug: No images from $file indexed\n"
506	if ( $self->{'verbosity'} > 2 );
507	return 1;
508	}
509
510	}
511
512	# for every valid image tag
513	# 1. extract related text and image metadata
514	# 2. add this as document meta-data
515	# 3. add assoc image(s) as files
516	#
517	sub extract_image_info {
518	my $self = shift (@_);
519	my ($tag, $id, $textref, $doc_obj, $section, $tndir, $prevpos, $nextpos) = (@_);
520	my ($filename, $orig_fp, $fn, $ext, $reltext, $relreltext, $crcid, $imgs,
521	$thumbfp, $pagetitle, $alttext, $filepath, $aggr);
522	$aggr = $self->{'aggressiveness'};
523	$imgs = \%{$self->{'imglist'}};
524	$filepath = $imgs->{$id}{'relpath'};
525	($filename) = $filepath =~ /([^\/\\]+)$/s;
526	($orig_fp) = "$self->{'base_path'}/$filepath";
527	$orig_fp =~ tr/+/ /;
528	$orig_fp =~ s/%([a-fA-F0-9][a-fA-F0-9])/pack("C", hex($1))/eg; # translate %2E to space, etc
529	$orig_fp =~ s/\\/\//g;
530	$filepath = "$self->{'htpath'}/$filepath";
531	($onlyfn) = $filename =~ /([^\\\/]*)$/;
532	($fn, $ext) = $onlyfn =~ /(.)\.(.)/;
533	$fn = lc $fn; $ext = lc $ext;
534	($reltext) = "<tr><td>GifComment</td><td>" . `identify $filepath -ping -format "%c"` . "</td></tr>\n"
535	if ($ext eq "gif");
536	$reltext .= "<tr><td>FilePath</td><td>$orig_fp</td></tr>\n";
537
538	if ($ENV{'GSDLOS'} =~ /^windows$/i) {
539	$crcid = "$fn.$ext." . $self->{'next_crcid'}++;
540	} else { ($crcid) = `cksum $filepath` =~ /^(\d+)/; }
541	$thumbfp = "$tndir/tn_$crcid.jpg";
542	`convert -flatten -filter Hanning $self->{'img_convert_param'} -geometry "$self->{'thumbnail_size'}x$self->{'thumbnail_size'}>" $filepath $thumbfp` unless -e $thumbfp;
543	if ( ! (-e $thumbfp) ) {
544	print STDERR "W3ImgPlug: 'convert' failed. Check ImageMagicK binaries are installed and working correctly\n"; return 0;
545	}
546
547	# shove in full text (tag stripped or unstripped) if settings require it
548	if ( $aggr == 10) {
549	$reltext = "<tr><td>AllPage</td><td>" . $$textref . "</td><tr>\n"; # level 10 (all text, verbatim)
550	} else {
551	$pagetitle = $self->get_meta_value("title", $textref);
552	($alttext) = $tag =~ /\salt\s=\s(?:\"\|\')(.+?)(?:\"\|\')/is;
553	if ( defined($alttext) && length($alttext) > 1) {
554	$reltext .= "<tr><td>ALTtext</td><td>$alttext</td></tr>\n"; }
555	$reltext .= "<tr><td>SplitCapitalisation</td><td>" .
556	$self->split_filepath($orig_fp) . "</td></tr>\n";
557
558	# get caption/tag based near text (if appropriate)
559	if ( $aggr > 1 ) {
560	if ( $aggr >= 2 ) {
561	$reltext .=
562	$self->extract_caption_text($tag, $textref, $prevpos, $imgs->{$id}{'pos'}, $nextpos);
563	$relreltext = $reltext;
564	}
565	# repeat the filepath, alt-text, caption, etc
566	if ( $aggr == 8 ) {
567	$reltext .= $relreltext;
568	}
569	if ( $aggr >= 3 ) {
570	$reltext .=
571	$self->extract_near_text($tag, $textref, $prevpos, $imgs->{$id}{'pos'}, $nextpos);
572	}
573
574	# get page metadata (if appropriate)
575	if ( $aggr >= 6 \|\| ( $aggr >= 2 &&
576	( $self->{'images_indexed'} < 2 \|\|
577	($self->{'smallpage'} == 1 && $self->{'images_indexed'} < 6 )))) {
578	$reltext .= $self->get_page_metadata($textref);
579	}
580	# textual references
581	if ( $aggr == 5 \|\| $aggr >= 7) {
582	if ( length($relreltext) > ($self->{'caption_len'} * 2) ) {
583	$reltext .= $self->get_textrefs($relreltext, $textref, $prevpos, $imgs->{$id}{'pos'}, $nextpos); }
584	else {
585	$reltext .= $self->get_textrefs($reltext, $textref, $prevpos, $imgs->{$id}{'pos'}, $nextpos);
586	}
587	}
588	} # aggr > 1
589	} # aggr != 10
590
591	$doc_obj->set_OID($crcid);
592	$doc_obj->associate_file($thumbfp, "$fn.thumb.jpg", undef, $section);
593	$doc_obj->add_metadata($section, "OriginalFilename", $filename);
594	$doc_obj->add_metadata($section, "FilePath", $orig_fp);
595	$doc_obj->add_metadata($section, "Filename", $fn);
596	$doc_obj->add_metadata($section, "FileExt", $ext);
597	$doc_obj->add_metadata($section, "FileSize", $imgs->{$id}{'filesize'});
598	$doc_obj->add_metadata($section, "Width", $imgs->{$id}{'width'});
599	$doc_obj->add_metadata($section, "Height", $imgs->{$id}{'height'});
600	$doc_obj->add_metadata($section, "URL", "http://$orig_fp");
601	$doc_obj->add_metadata($section, "PageURL", $self->{'hturl'});
602	$doc_obj->add_metadata($section, "PageTitle", $pagetitle);
603	$doc_obj->add_metadata($section, "ThumbURL",
604	"_httpcollection_/index/assoc/[archivedir]/$fn.thumb.jpg");
605
606	if ( $self->{'document_text'} ) {
607	$doc_obj->add_utf8_text($section, "<table border=1>\n$reltext</table>");
608	} else {
609	$doc_obj->add_metadata($section, "ImageText", "<table border=1>\n$reltext</table>\n");
610	}
611
612	if ( $self->{'index_pages'} ) {
613	my ($cache_url) = "_httpdoc_&d=" . $self->{'htdoc_obj'}->get_OID();
614	if ( $imgs->{$id}{'anchored'} ) {
615	my $a_name = $id;
616	$a_name =~ s/[\/\\\:\&]/_/g;
617	$cache_url .= "#gsdl_$a_name" ;
618	}
619	$doc_obj->add_utf8_metadata($section, "CachePageURL", $cache_url);
620	}
621	if ( ! $self->{'no_cache_images'} ) {
622	$onlyfn = lc $onlyfn;
623	$doc_obj->associate_file($filepath, $onlyfn, undef, $section);
624	$doc_obj->add_utf8_metadata($section, "CacheURL",
625	"_httpcollection_/index/assoc/[archivedir]/$onlyfn");
626	}
627	return 1;
628	}
629
630	sub get_page_metadata {
631	my ($self, $textref) = (@_);
632	my (@rval);
633	$rval[0] = $self->get_meta_value("title", $textref);
634	$rval[1] = $self->get_meta_value("keywords", $textref);
635	$rval[2] = $self->get_meta_value("description", $textref);
636	$rval[3] = $self->{'filename'};
637
638	return wantarray ? @rval : "<tr><td>PageMeta</td><td>@rval</td></tr>\n" ;
639	}
640
641	# turns LargeCatFish into Large,Cat,Fish so MG sees the separate words
642	sub split_filepath {
643	my ($self, $filepath) = (@_);
644	my (@words) = $filepath =~ /([A-Z][a-z]+)/g;
645	return join(',', @words);
646	}
647
648	# finds and extracts sentences
649	# that seem to be on the same topic
650	# as other related text (correlations)
651	# and textual references (e.g. in figure 3 ...)
652	sub get_textrefs {
653	my ($self, $reltext, $textref, $prevpos, $pos, $nextpos) = (@_);
654	my ($maxtext, $mintext, $startpos, $context_size, $context);
655
656	my (@relwords, @refwords, %sentences, @pagemeta);
657
658	# extract larger context
659	$maxtext = $self->{'maxtext'};
660	$startpos = $pos - ($maxtext * 4);
661	$context_size = $maxtext*10;
662	if ($startpos < $prevpos ) { $startpos = $prevpos }
663	if ($nextpos != -1 && $context_size > ( $nextpos - $startpos )) { $context_size = ($nextpos - $startpos) }
664	$context = substr ( $$textref, $startpos, $context_size );
665	$context =~ s/<.*?>//gs;
666	$context =~ s/^.>(.)/$1/gs;
667	$context =~ s/(.)<.$/$1/gs;
668
669	# get page meta-data (if not already included)
670	if ( $self->{'aggressiveness'} == 5 && ! $self->{'smallpage'} ) {
671	@pagemeta = $self->get_page_metadata($textref);
672	foreach $value ( @pagemeta ) {
673	$context .= "$value."; # make each into psuedo-sentence
674	}
675	}
676
677	# TODO: this list is not exhaustive
678	@refwords = ( '(?:is\|are)? ?(?:show(?:s\|n)\|demonstrate(?:d\|s)\|explains\|features) (?:in\|by\|below\|above\|here)',
679	'(?:see)? (?:figure\|table)? (?:below\|above)');
680
681	# extract general references
682	foreach $rw ( @refwords ) {
683	while ( $context =~ /[\.\?\!\,](.?$rw\W.?[\.\?\!\,])/ig ) {
684	$sentence = $1;
685	$sentence =~ s/\s+/ /g;
686	$sentences{$sentence}+=2;
687	}
688	}
689	# extract specific (figure, table) references by number
690	my ($fignum) = $context =~ /[\.\?\!].?(?:figure\|table)s?[\-\_\ \.](\d+\w)\W.*?[\.\?\!]/ig;
691	if ( $fignum ) {
692	foreach $rw ( @refwords ) {
693	while ( $context =~ /[\.\?\!](.?(figure\|table)[\-\_\ \.]$fignum\W.?[\.\?\!])/ig ) {
694	$sentence = $1;
695	$sentence =~ s/\s+/ /g;
696	$sentences{$sentence}+=4;
697	}
698	}
699	}
700
701	# sentences with occurances of important words
702	@relwords = $reltext =~ /([a-zA-Z]{4,})/g; # take out small words
703	foreach $word ( @relwords ) {
704	if ( $self->{'stopwords'}{$word} ) { next } # skip stop words
705	while ( $context =~ /([^\.\?\!]?$word\W.?[\.\?\!])/ig ) {
706	$sentence = $1;
707	$sentence =~ s/\s+/ /g;
708	$sentences{$sentence}++;
709	}
710	}
711	foreach $sentence ( keys %sentences ) {
712	if ($sentences{$sentence} < $self->{'textref_threshold'}) {
713	delete $sentences{$sentence};
714	}
715	}
716	my ($rval) = join "<br>\n", (keys %sentences);
717	if ( $rval && length($rval) > 5 ) {
718	return ( "<tr><td>TextualReferences</td><td>" . $rval . "</td></tr>\n") }
719	else { return "" }
720	}
721
722	# handles caption extraction
723	# calling the extractor with different
724	# tags and choosing the best candidate caption
725	sub extract_caption_text {
726	my ($self, $tag, $textref, $prevpos, $pos, $nextpos) = (@_);
727	my (@neartext, $len, $hdelim, $goodlen,
728	$startpos, $context, $context_size);
729
730	$mintext = $self->{'mintext'};
731	$goodlen = $self->{'caption_len'};
732
733	# extract a context to extract near text from (faster)
734	$context_size = $self->{'maxtext'}*3;
735	$startpos = $pos - ($context_size / 2);
736	if ($startpos < $prevpos ) { $startpos = $prevpos }
737	if ($nextpos != -1 && $context_size > ( $nextpos - $startpos ))
738	{ $context_size = ($nextpos - $startpos) }
739
740	$context = substr ( $$textref, $startpos, $context_size );
741	$context =~ s/<!--.*?-->//gs;
742	$context =~ s/^.-->(.)/$1/gs;
743	$context =~ s/(.)<!--.$/$1/gs;
744
745	# try stepping through markup delimiter sets
746	# and selecting the best one
747	foreach $hdelim ( @{ $self->{'cdelims'} } ) {
748	@neartext = $self->extract_caption($tag, $hdelim, \$context);
749	$len = length(join("", @neartext));
750	last if ($len >= $mintext && $len <= $goodlen);
751	}
752	# reject if well over reasonable length
753	if ( $len > $goodlen ) {
754	@neartext = [];
755	}
756	$neartext[0] = " " if (! defined $neartext[0]);
757	$neartext[1] = " " if (! defined $neartext[1]);
758	return "<tr><td>Caption</td><td>" . (join ",", @neartext) . "</td></tr>\n"; # TODO: the \| is for testing purposes
759	} # end extract_caption_text
760
761	# the previous section header often gives a bit
762	# of context to the section that the image is
763	# in (invariably the header is before/above the image)
764	# so extract the text of the closest header above the image
765	#
766	# this fn just gets all the headers above the image, within the context window
767	sub get_prev_header {
768	my ($self, $pos, $textref) = (@_);
769	my ($rhtext);
770	while ( $$textref =~ /<h\d>(.*?)<\/h\d>/sig ) {
771	# only headers before image
772	if ((pos $$textref) < $pos) {
773	$rhtext .= "$1, ";
774	}
775	}
776	if ( $rhtext ) { return "Header($rhtext)" }
777	else { return "" }
778	}
779
780	# not the most robust tag stripping
781	# regexps (see perl.com FAQ) but good enough
782	#
783	# used by caption & tag-based near text algorithms
784	sub strip_tags {
785	my ( $self, $value ) = @_;
786	if ( ! defined($value) ) { $value = "" } # handle nulls
787	else {
788	$value =~ s/<.*?>//gs; # strip all html tags
789	$value =~ s/\s+/\ /g; # remove extra whitespace
790	$value =~ s/\&\w+\;//g; # remove   etc
791	}
792	return $value;
793	}
794
795	# uses the given tag(s) to identify
796	# the caption near to the image
797	# (below, above or both below and above)
798	sub extract_caption {
799	my ($self, $tag, $bound_tag, $contextref) = (@_);
800	my (@nt, $n, $etag, $gotcap);
801	return ("", "") if ( ! ($$contextref =~ /\Q$tag/) );
802
803	$nt[0] = $`;
804	$nt[1] = $';
805	$gotcap = 0;
806
807	# look before the image for a boundary tag
808	($etag, $nt[0]) = $nt[0] =~ /<($bound_tag)[\s]?.?>(.?)$/is;
809	# if bound_tag too far from the image, then prob not caption
810	# (note: have to allow for tags, so multiply by 3
811	if ( $etag && length($nt[0]) < ($self->{'caption_len'} * 3) ) {
812	if ( $nt[0] =~ /<\/$etag>/si ) {
813	# the whole caption is above the image: <tag>text</tag><img>
814	($nt[0]) =~ /<(?:$etag)[\s]?.?>(.?)<\/$etag>/is;
815	$nt[0] = $self->strip_tags($nt[0]);
816	if ( length($nt[0]) > $self->{'mintext'} ) {
817	$gotcap = 1;
818	$nt[1] = "";
819	}
820
821	} elsif ( $nt[1] =~ /<\/$etag>/si) {
822	# the caption tag covers image: <tag>text?<img>text?</tag>
823	($nt[1]) = $nt[1] =~ /(.*?)<\/$etag>/si;
824	$nt[0] = $self->strip_tags($nt[0] . $nt[1]);
825	if ( length($nt[0]) > $self->{'mintext'} ) {
826	$gotcap = 2;
827	$nt[1] = "";
828	}
829	}
830	}
831	# else try below the image
832	if ( ! $gotcap ) {
833	# the caption is after the image: <img><tag>text</tag>
834	($etag, $nt[1]) = $nt[1] =~ /^.?<($bound_tag)[\s]?.?>(.*)/is;
835	if ( $etag && $nt[1] =~ /<\/$etag>/s) {
836	($nt[1]) = $nt[1] =~ /(.*?)<\/$etag>/si;
837	$gotcap = 3;
838	$nt[0] = "";
839	$nt[1] = $self->strip_tags($nt[1]);
840	}
841	}
842	if ( ! $gotcap ) { $nt[0] = $nt[1] = "" }
843	else {
844	# strip part-tags
845	$nt[0] =~ s/^.*>//s;
846	$nt[1] =~ s/<.*$//s;
847	}
848	my ($type);
849	if ( $gotcap == 0 ) { return ("nocaption", "") }
850	elsif ( $gotcap == 1 ) { $type = "captionabove:" }
851	elsif ( $gotcap == 2 ) { $type = "captioncovering:" }
852	elsif ( $gotcap == 3 ) { $type = "captionbelow:" }
853	return ($type, $nt[0], $nt[1]);
854	}
855
856	# tag-based near text
857	#
858	# tries different tag sets
859	# and chooses the best one
860	sub extract_near_text {
861	my ($self, $tag, $textref, $prevpos, $pos, $nextpos) = (@_);
862	my (@neartext, $len, $hdelim, $maxtext, $mintext, $goodlen,
863	@bestlen, @best, $startpos, $context, $context_size,
864	$dist, $bdist, $best1, $i, $nt);
865	$bestlen[0] = $bestlen[1] = 0; $bestlen[2] = $bdist = 999999;
866	$best[0] = $best[1] = $best[2] = "";
867	$maxtext = $self->{'maxtext'};
868	$mintext = $self->{'mintext'};
869	$goodlen = $self->{'neartext_len'};
870
871	# extract a context to extract near text from (faster)
872	$context_size = $maxtext*4;
873	$startpos = $pos - ($context_size / 2);
874	if ($startpos < $prevpos ) { $startpos = $prevpos }
875	if ($nextpos != -1 && $context_size > ( $nextpos - $startpos ))
876	{ $context_size = ($nextpos - $startpos) }
877	$context = substr ( $$textref, $startpos, $context_size );
878	$context =~ s/<!--.*?-->//gs;
879	$context =~ s/^.-->(.)/$1/gs;
880	$context =~ s/(.)<!--.$/$1/gs;
881
882	# try stepping through markup delimiter sets
883	# and selecting the best one
884	foreach $hdelim ( @{ $self->{'delims'} } ) {
885	@neartext = $self->extract_tagged_neartext($tag, $hdelim, \$context);
886	$nt = join("", @neartext);
887	$len = length($nt);
888	# Priorities:
889	# 1. Greater than mintext
890	# 2. Less than maxtext
891	# 3. Closest to goodlen
892	if ( $len <= $goodlen && $len > $bestlen[0] ) {
893	$bestlen[0] = $len;
894	$best[0] = $hdelim;
895	} elsif ( $len >= $maxtext && $len < $bestlen[2] ) {
896	$bestlen[2] = $len;
897	$best[2] = $hdelim;
898	} elsif ( $len >= $bestlen[0] && $len <= $bestlen[2] ) {
899	$dist = abs($goodlen - $len);
900	if ( $dist < $bdist ) {
901	$bestlen[1] = $len;
902	$best[1] = $hdelim;
903	$bdist = $dist;
904	}
905	}
906	}
907	$best1 = 2;
908	foreach $i ( 0..2 ) {
909	if ( $bestlen[$i] == 999999 ) { $bestlen[$i] = 0 }
910	$dist = abs($goodlen - $bestlen[$i]);
911	if ( $bestlen[$i] > $mintext && $dist <= $bdist ) {
912	$best1 = $i;
913	$bdist = $dist;
914	}
915	}
916	@neartext = $self->extract_tagged_neartext($tag, $best[$best1], \$context);
917	if ( $bestlen[$best1] > $maxtext ) {
918	# truncate on word boundary if too much text
919	my $hmax = $maxtext / 2;
920	($neartext[0]) = $neartext[0] =~ /([^\s]*.{1,$hmax})$/s;
921	($neartext[1]) = $neartext[1] =~ /^(.{1,$hmax}[^\s]*)/s;
922	} elsif ( $bestlen[$best1] < $mintext ) {
923	# use plain text extraction if tags failed (e.g. usable tag outside context)
924	print {$self->{'outhandle'}} "W3ImgPlug: Fallback to plain-text extraction for $tag\n"
925	if $self->{'verbosity'} > 2;
926	$neartext[0] = "<tr><td>RawNeartext</td><td>" . $self->extract_raw_neartext($tag, $textref) . "</td></tr>";
927	$neartext[1] = "";
928	}
929	# get previous header if available
930	$neartext[0] .= "<br>\n" .
931	$self->get_prev_header($pos, \$context) if ( $self->{'aggressiveness'} >= 4 );
932	$neartext[0] = " " if (! defined $neartext[0]);
933	$neartext[1] = " " if (! defined $neartext[1]);
934
935	return "<tr><td>NearText</td><td>" . (join "\|", @neartext) . "</td></tr>\n"; # TODO: the \| is for testing purposes
936	} # end extract_near_text
937
938	# actually captures tag-based
939	# near-text given a tag set
940	sub extract_tagged_neartext {
941	my ($self, $tag, $bound_tag, $textref) = (@_);
942	return "" if ( ! ($$textref =~ /\Q$tag/) );
943	my (@nt, $delim, $pre_tag, $n);
944	$nt[0] = $`;
945	$nt[1] = $';
946
947	# get text after previous image tag
948	$nt[0] =~ s/.<($bound_tag)[^>]>(.*)/$2/is; # get rid of preceding text
949	if (defined($1)) { $delim = $1 }
950	$pre_tag = $bound_tag;
951
952	if (defined($delim)) {
953	# we want to try and use the end tag of the previous delimiter
954	# (put it on the front of the list)
955	$pre_tag =~ s/(^\|\\|)($delim)($\|\\|)//i; # take it out
956	$pre_tag =~ s/\\|\\|/\\|/i; # replace \|\| with \|
957	$pre_tag = $delim . "\|" . $pre_tag; # put it on the front
958	}
959
960	# get text before next image tag
961	$nt[1] =~ s/<\/?(?:$pre_tag)[^>]>.//is; # get rid of stuff after first delimiter
962
963	# process related text
964	for $n (0..1) {
965	if ( defined($nt[$n]) ) {
966	$nt[$n] =~ s/<.*?>//gs; # strip all html tags
967	$nt[$n] =~ s/\s+/\ /gs; # remove extra whitespace
968	$nt[$n] =~ s/\&\w+\;//sg; # remove   etc
969	# strip part-tags
970	if ( $n == 0 ) { $nt[0] =~ s/^.*>//s }
971	if ( $n == 1 ) { $nt[1] =~ s/<.*$//s }
972	} else { $nt[$n] = ""; } # handle nulls
973	}
974	return @nt;
975	}
976
977	# this function is fall-back
978	# if tags aren't suitable.
979	#
980	# extracts a fixed length of characters
981	# either side of image tag (on word boundary)
982	sub extract_raw_neartext {
983	my ($self, $tag, $textref) = (@_);
984	my ($rawtext, $startpos, $fp);
985	my $imgs = \%{$self->{'imglist'}};
986	($fp) = $tag =~ /([\w\\\/]+\.(?:gif\|jpe?g\|png))/is;
987	if (! $fp) { return " " };
988	# if the cached, plain-text version isn't there, then create it
989	$self->init_plaintext($textref) unless defined($self->{'plaintext'});
990
991	# take the closest maxtext/2 characters
992	# either side of the tag (by word boundary)
993	return "" if ( ! exists $imgs->{$fp}{'rawpos'} );
994	$startpos = $imgs->{$fp}{'rawpos'} - (($self->{'maxtext'} / 2) + 20);
995	if ( $startpos < 0 ) { $startpos = 0 }
996	$rawtext = substr $self->{'plaintext'}, $startpos, $self->{'maxtext'} + 20;
997	$rawtext =~ s/\s\s/ /g;
998
999	return $rawtext;
1000	}
1001
1002	# init plaintext variable for HTML-stripped version
1003	# (for full text index/raw assoc text extraction)
1004	sub init_plaintext {
1005	my ($self, $textref) = (@_);
1006	my ($page, $fp);
1007	my $imgs = \%{$self->{'imglist'}};
1008	$page = $$textref; # make a copy of original
1009
1010	# strip tags around image filenames so they don't get zapped
1011	$page =~ s/<\w+\s+.?([\w\/\\]+\.(?:gif\|jpe?g\|png))[^>]>/\"$1\"/gsi;
1012	$page =~ s/<.*?>//gs;
1013	$page =~ s/ / /gs;
1014	$page =~ s/&/&/gs; #TODO: more &zzz; replacements (except <, $gt;)
1015
1016	# get positions and strip images
1017	while ( $page =~ /([^\s\'\"]+\.(jpe?g\|gif\|png))/ig ) {
1018	$fp = $1;
1019	if ( $imgs->{$fp}{'exists'} ) {
1020	$imgs->{$fp}{'rawpos'} = pos $page;
1021	}
1022	$page =~ s/\"$fp\"//gs;
1023	}
1024	$self->{'plaintext'} = $page;
1025	}
1026
1027	# finds and filters images based on size
1028	# (dimensions, height, filesize) and existence
1029	#
1030	# looks for image filenames (.jpg, .gif, etc)
1031	# and checks for existence on disk
1032	# (hence supports most JavaScript images)
1033	sub get_img_list {
1034	my $self = shift (@_);
1035	my ($textref) = (@_);
1036	my ($filepath, $relpath, $abspath, $pos, $num, $width, $height, $filesize);
1037	my $imgs = \%{$self->{'imglist'}};
1038	while ( $$textref =~ /([^\s\'\"]+\.(jpe?g\|gif\|png))/ig ) {
1039	$filepath = $1;
1040	$pos = pos $$textref;
1041	next if ( $imgs->{$filepath}{'relpath'} );
1042	$relpath = $filepath;
1043	$relpath =~ s/^http\:\/\///; # remove http:// in case we have mirrored it
1044	$relpath =~ s/\\/\//g; # replace \ with /
1045	$relpath =~ s/^\.\///s; # make "./filepath" into "filepath"
1046	$imgs->{$filepath}{'relpath'} = $relpath;
1047	$abspath = "$self->{'htpath'}/$relpath";
1048
1049	if (! -e $abspath) { next }
1050
1051	# can't modify real filepath var because it
1052	# then can't be located in the page for tag recognition later
1053	($width, $height) =
1054	`identify $abspath -ping -format "%wx%h"` =~ /^(\d)x(\d)$/m;
1055	if (! ($width && $height)) {
1056	print STDERR "W3ImgPlug: ($abspath) 'identify' failed. Check ImageMagicK binaries are installed and working correctly\n"; next;
1057	}
1058	$filesize = (-s $abspath);
1059	if ( $filesize >= $self->{'min_img_filesize'}
1060	&& ( $width >= $self->{'min_img_width'} )
1061	&& ( $height >= $self->{'min_img_height'} ) ) {
1062
1063	$imgs->{$filepath}{'exists'} = 1;
1064	$imgs->{$filepath}{'pos'} = $pos;
1065	$imgs->{$filepath}{'width'} = $width;
1066	$imgs->{$filepath}{'height'} = $height;
1067	$imgs->{$filepath}{'filesize'} = $filesize;
1068	} else {
1069	print {$self->{'outhandle'}} "W3ImgPlug: skipping $self->{'base_path'}/$relpath: $filesize, $width x $height\n"
1070	if $self->{'verbosity'} > 2;
1071	}
1072	}
1073	$num = 0;
1074	foreach $i ( keys %{$imgs} ) {
1075	if ( $imgs->{$i}{'pos'} ) {
1076	$num++;
1077	} else { delete $imgs->{$i} }
1078	}
1079	return $num;
1080	}
1081
1082	# make the text available to the read function
1083	# by making it an object variable
1084	sub read_file {
1085	my ($self, $filename, $encoding, $language, $textref) = @_;
1086	$self->SUPER::read_file($filename, $encoding, $language, $textref);
1087
1088	# if HTMLplug has run through, then it will
1089	# have replaced references so we have to
1090	# make a copy of the text before processing
1091	if ( $self->{'index_pages'} ) {
1092	$self->{'text'} = $$textref;
1093	$self->{'textref'} = \($self->{'text'});
1094	} else {
1095	$self->{'textref'} = $textref;
1096	}
1097	$self->{'plaintext'} = undef;
1098	}
1099
1100	# HTMLPlug only extracts meta-data if it is specified in plugin options
1101	# hence a special function to do it here
1102	sub get_meta_value {
1103	my ($self, $name, $textref) = @_;
1104	my ($value);
1105	$name = lc $name;
1106	if ($name eq "title") {
1107	($value) = $$textref =~ /<title>(.*?)<\/title>/is
1108	} else {
1109	my $qm = "(?:\"\|\')";
1110	($value) = $$textref =~ /<meta name\s=\s$qm?$name$qm?\s+content\s=\s$qm?(.?)$qm?\s>/is
1111	}
1112	$value = "" unless $value;
1113	return $value;
1114	}
1115
1116	# make filename an anchor reference
1117	# so we can go straight to the image
1118	# within the cached version of the source page
1119	# (augment's HTMLPlug sub)
1120	sub replace_images {
1121	my $self = shift (@_);
1122	my ($front, $link, $back, $base_dir,
1123	$file, $doc_obj, $section) = @_;
1124	$link =~ s/\"//g;
1125	my ($a_name) = $link;
1126	$a_name =~ s/[\/\\\:\&]/_/g;
1127	# keep a list so we don't repeat the same anchor
1128	if ( ! $self->{'imglist'}{$link}{'anchored'} ) {
1129	$front = "<a name=\"gsdl_$a_name\">$front";
1130	$back = "$back</a>";
1131	$self->{'imglist'}{$link}{'anchored'} = 1;
1132	}
1133	return $self->SUPER::replace_images($front, $link, $back, $base_dir,
1134	$file, $doc_obj, $section);
1135	}
1136
1137	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: