Context Navigation

source: trunk/gsdl/perllib/plugins/W3ImgPlug.pm@ 11008

Last change on this file since 11008 was 10347, checked in by kjdon, 19 years ago
removed the unneeded 'use parsargv'
Property svn:keywords set to `Author Date Id Revision`
File size: 36.8 KB

Rev	Line
[2899]	1	###########################################################################
	2	#
[2996]	3	# W3ImgPlug.pm -- Context-based image indexing plugin for HTML documents
[2899]	4	#
	5	# A component of the Greenstone digital library software
	6	# from the New Zealand Digital Library Project at the
	7	# University of Waikato, New Zealand.
	8	#
	9	# Copyright (C) 2001 New Zealand Digital Library Project
	10	#
	11	# This program is free software; you can redistribute it and/or modify
	12	# it under the terms of the GNU General Public License as published by
	13	# the Free Software Foundation; either version 2 of the License, or
	14	# (at your option) any later version.
	15	#
	16	# This program is distributed in the hope that it will be useful,
	17	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	18	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	19	# GNU General Public License for more details.
	20	#
	21	# You should have received a copy of the GNU General Public License
	22	# along with this program; if not, write to the Free Software
	23	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	24	#
	25	###########################################################################
	26
[2996]	27	# DESCRIPTION:
	28	#
	29	# Extracts images and associated text and metadata from
	30	# web pages as individual documents for indexing. Thumbnails
	31	# are created from each image for browsing purposes.
	32	#
	33	# Options are available for configuring the aggressiveness of the
	34	# associated text extraction mechanisms. A higher level of
	35	# aggressiveness will extract more text and consequently
	36	# may mean lower accuracy (precision); however, it may also
	37	# retrieve more of the relevant images from the collection (recall).
	38	# Lower levels of aggressiveness maybe result in slightly faster
	39	# collection builds at the import stage.
	40	#
	41	# W3ImgPlug is a subclass of HTMLPlug (i.e. it will index pages also
	42	# if required). It can be used in place of HTMLPlug to index both
	43	# pages and their images.
	44	#
	45	# REQUIREMENTS:
[2899]	46	#
[2996]	47	# The ImageMagick image manipulation is used to create
	48	# thumbnails and extract some image metadata. (Available
	49	# from http://www.imagemagick.org/)
	50	#
	51	# Unix:
	52	# Many Linux distributions contain ImageMagick.
	53	#
	54	# Windows:
	55	# ImageMagick can be downloaded from the website above.
	56	# Make sure the system path includes the ImageMagick binaries
	57	# before using W3ImgPlug.
	58	#
	59	# NOTE: NT/2000/XP contain a filesystem utility 'convert.exe'
	60	# with the same name as the image conversion utility. The
	61	# ImageMagick FAQ recommends renaming the filesystem
	62	# utility (e.g. to 'fsconvert.exe') to avoid this clash.
	63	#
	64	# USAGE:
	65	#
	66	# An image document consists of metadata elements:
	67	#
	68	# OriginalFilename, FilePath, Filename, FileExt, FileSize,
	69	# Width, Height, URL, PageURL, ThumbURL, CacheURL, CachePageURL
	70	# ImageText, PageTitle
	71	#
	72	# Most of these are only useful in format strings (e.g. ThumbURL,
	73	# Filename, URL, PageURL, CachePageURL).
	74	#
	75	# ImageText, as the name suggests contains the indexable text.
	76	# (unless using the -document_text plugin option)
	77	#
	78	# Since image documents are made up of metadata elements
	79	# alone, format strings are needed to display them properly.
	80	# NOTE: The receptionist will only display results (e.g. thumbnails)
	81	# in 4 columns if the format string begins with "<td><table>".
	82	#
	83	# The example below takes the user to the image within the
	84	# source HTML document rather than using a format string
	85	# on DocumentText to display the image document itself.
	86	#
	87	# Example collect.cfg:
	88	#
	89	# ...
	90	#
	91	# indexes document:ImageText document:text
	92	# defaultindex document:ImageText
	93	#
	94	# collectionmeta .document:ImageText "images"
	95	# collectionmeta .document:text "documents"
	96	#
	97	# ...
	98	#
	99	# plugin W3ImgPlug -index_pages -aggressiveness 6
	100	#
	101	# ...
	102	#
	103	# format SearchVList '<td>{If}{[Title],[link][icon] [Title][[/link],
	104	# <table><tr><td align="center"><a href="[CachePageURL]">
	105	# <img src="[ThumbURL]"></a></td></tr><tr><td align="center">
	106	# <a href="[CachePageURL]"><font size="-1">[OriginalFilename]</font></a>
	107	# <br>[Width]x[Height]</td></tr></table>}</td>'
	108	#
	109	# ...
	110	#
	111
[2899]	112	package W3ImgPlug;
	113
	114	use HTMLPlug;
	115	use ghtml;
	116	use unicode;
	117	use util;
[10254]	118	use strict; # 'subs';
	119	no strict 'refs'; # allow filehandles to be variables and viceversa
[2899]	120
	121	sub BEGIN {
[10254]	122	@W3ImgPlug::ISA = qw( HTMLPlug );
[2899]	123	}
	124
[4744]	125	my $aggressiveness_list =
	126	[ { 'name' => "1",
[4873]	127	'desc' => "{W3ImgPlug.aggressiveness.1}" },
[4744]	128	{ 'name' => "2",
[4873]	129	'desc' => "{W3ImgPlug.aggressiveness.2}" },
[4744]	130	{ 'name' => "3",
[4873]	131	'desc' => "{W3ImgPlug.aggressiveness.3}" },
[4744]	132	{ 'name' => "4",
[4873]	133	'desc' => "{W3ImgPlug.aggressiveness.4}" },
[4744]	134	{ 'name' => "5",
[4873]	135	'desc' => "{W3ImgPlug.aggressiveness.5}" },
[4744]	136	{ 'name' => "6",
[4873]	137	'desc' => "{W3ImgPlug.aggressiveness.6}" },
[4744]	138	{ 'name' => "7",
[4873]	139	'desc' => "{W3ImgPlug.aggressiveness.7}" },
[4744]	140	{ 'name' => "8",
[4873]	141	'desc' => "{W3ImgPlug.aggressiveness.8}" },
[4744]	142	{ 'name' => "9",
[4873]	143	'desc' => "{W3ImgPlug.aggressiveness.9}" } ];
[4744]	144
	145	my $arguments =
	146	[ { 'name' => "aggressiveness",
[4873]	147	'desc' => "{W3ImgPlug.aggressiveness}",
[4744]	148	'type' => "int",
	149	'list' => $aggressiveness_list,
	150	'deft' => "3",
	151	'reqd' => "no" },
	152	{ 'name' => "index_pages",
[4873]	153	'desc' => "{W3ImgPlug.index_pages}",
[4744]	154	'type' => "flag",
	155	'reqd' => "no" },
	156	{ 'name' => "no_cache_images",
[4873]	157	'desc' => "{W3ImgPlug.no_cache_images}",
[4744]	158	'type' => "flag",
	159	'reqd' => "no" },
	160	{ 'name' => "min_size",
[4873]	161	'desc' => "{W3ImgPlug.min_size}",
[4744]	162	'type' => "int",
	163	'deft' => "2000",
	164	'reqd' => "no" },
	165	{ 'name' => "min_width",
[4873]	166	'desc' => "{W3ImgPlug.min_width}",
[4744]	167	'type' => "int",
	168	'deft' => "50",
	169	'reqd' => "no" },
	170	{ 'name' => "min_height",
[4873]	171	'desc' => "{W3ImgPlug.min_height}",
[4744]	172	'type' => "int",
	173	'deft' => "50",
	174	'reqd' => "no" },
	175	{ 'name' => "thumb_size",
[4873]	176	'desc' => "{W3ImgPlug.thumb_size}",
[4744]	177	'type' => "int",
	178	'deft' => "100",
	179	'reqd' => "no" },
	180	{ 'name' => "convert_params",
[4873]	181	'desc' => "{W3ImgPlug.convert_params}",
[4744]	182	'type' => "string",
	183	'deft' => "",
	184	'reqd' => "no" },
	185	{ 'name' => "min_near_text",
[4873]	186	'desc' => "{W3ImgPlug.min_near_text}",
[4744]	187	'type' => "int",
	188	'deft' => "10",
	189	'reqd' => "no" },
	190	{ 'name' => "max_near_text",
[4873]	191	'desc' => "{W3ImgPlug.max_near_text}",
[4744]	192	'type' => "int",
	193	'deft' => "400",
	194	'reqd' => "no" },
	195	{ 'name' => "smallpage_threshold",
[4873]	196	'desc' => "{W3ImgPlug.smallpage_threshold}",
[4744]	197	'type' => "int",
	198	'deft' => "2048",
	199	'reqd' => "no" },
	200	{ 'name' => "textrefs_threshold",
[4873]	201	'desc' => "{W3ImgPlug.textrefs_threshold}",
[4744]	202	'type' => "int",
	203	'deft' => "2",
	204	'reqd' => "no" },
	205	{ 'name' => "caption_length",
[4873]	206	'desc' => "{W3ImgPlug.caption_length}",
[4744]	207	'type' => "int",
	208	'deft' => "80",
	209	'reqd' => "no" },
	210	{ 'name' => "neartext_length",
[4873]	211	'desc' => "{W3ImgPlug.neartext_length}",
[4744]	212	'type' => "int",
	213	'deft' => "300",
	214	'reqd' => "no" },
	215	{ 'name' => "document_text",
[4873]	216	'desc' => "{W3ImgPlug.document_text}",
[4744]	217	'type' => "flag",
[4873]	218	'reqd' => "no" } ];
[4744]	219
	220	my $options = { 'name' => "W3ImgPlug",
[5680]	221	'desc' => "{W3ImgPlug.desc}",
[6408]	222	'abstract' => "no",
[4744]	223	'inherits' => "yes",
	224	'args' => $arguments };
	225
[2899]	226	sub new {
[10218]	227	my ($class) = shift (@_);
	228	my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
	229	push(@$pluginlist, $class);
[4744]	230
[10218]	231	if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}
	232	if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};
[2899]	233
[10218]	234	my $self = (defined $hashArgOptLists)? new HTMLPlug($pluginlist,$inputargs,$hashArgOptLists): new HTMLPlug($pluginlist,$inputargs);
	235
[2899]	236	# init class variables
	237	$self->{'textref'} = undef; # init by read_file fn
	238	$self->{'htdoc_obj'} = undef; # init by process fn
	239	$self->{'htpath'} = undef; # init by process fn
	240	$self->{'hturl'} = undef; # init by process fn
	241	$self->{'plaintext'} = undef; # HTML stripped version - only init if needed by raw_neartext sub
	242	$self->{'smallpage'} = 0; # set by process fn
	243	$self->{'images_indexed'} = undef; # num of images indexed - if 1 or 2 then we know page is small
	244	$self->{'initialised'} = undef; # flag (see set_extraction_options())
	245
	246	return bless $self, $class;
	247	}
	248
	249	# if indexing pages, let HTMLPlug do it's stuff
	250	# image extraction done through read()
	251	sub process {
	252	my ($self, $textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
	253	$self->{'imglist'} = ();
	254	if ( $self->{'index_pages'} ) {
	255	my $ok = $self->SUPER::process($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj);
	256	if ( ! $ok ) { return $ok }
	257	$self->{'htdoc_obj'} = $doc_obj;
	258	}
	259	# else use URL for referencing
[2996]	260	#if ( $file =~ /(.*)[\/\\]/ ) { $self->{'htpath'} = $1; } else { $self->{'htpath'} = $file; }
[2899]	261
	262	$self->{'htpath'} = $base_dir if (-d $base_dir);
	263	if ( $file =~ /(.*)[\/\\]/ ) { $self->{'htpath'} .= "/$1"; }
	264	$self->{'htpath'} =~ s/\\/\//g; # replace \ with /
	265
	266	$self->{'hturl'} = "http://$file";
	267	$self->{'hturl'} =~ s/\\/\//g; # for windows
	268	($self->{'filename'}) = $file =~ /.[\/\\](.)/;
	269	($self->{'base_path'}) = $file =~ /(.*)[\/\\]/i;
	270	if ( ( -s "$base_dir/$file") <= $self->{'smallpage_threshold'} ) {
	271	$self->{'smallpage'} = 1;
	272	} else { $self->{'smallpage'} = 0; }
	273
	274	if ( defined($self->{'initialised'}) ) { return 1; }
	275	else {
	276	$self->{'initialised'} = $self->set_extraction_options($base_dir =~ /^(.*?)\/import/i);
	277	return $self->{'initialised'};
	278	}
	279	}
	280
	281	# get complex configuration options from configuration files
[2996]	282	# -- $GSDLCOLLECTION/etc/W3ImgPlug.cfg (tag sets for aggr 2+)
	283	# -- $GSDLHOME/etc/packages/phind/stopword/en/brown.sw (stopwords for aggr 5+)
[2899]	284
	285	# If there's no W3ImgPlug.cfg file we'll use the following default values
	286	my $defaultcfg = '
	287	<delimitertagset>
	288	<setname>Caption</setname>
	289	<taggroup>font</taggroup>
	290	<taggroup>tt</taggroup>
	291	<taggroup>small</taggroup>
	292	<taggroup>b</taggroup>
	293	<taggroup>i</taggroup>
	294	<taggroup>u</taggroup>
	295	<taggroup>em</taggroup>
	296	<taggroup>td</taggroup>
	297	<taggroup>li</taggroup>
	298	<taggroup>a</taggroup>
	299	<taggroup>p</taggroup>
	300	<taggroup>tr</taggroup>
	301	<taggroup>center</taggroup>
	302	<taggroup>div</taggroup>
	303	<taggroup>caption</taggroup>
	304	<taggroup>br</taggroup>
	305	<taggroup>ul</taggroup>
	306	<taggroup>ol</taggroup>
	307	<taggroup>table</taggroup>
	308	<taggroup>hr</taggroup>
	309	</delimitertagset>
	310
	311	<delimitertagset>
	312	<setname>Neartext</setname>
	313	<taggroup>tr\|hr\|table\|h\d\|img\|body</taggroup>
	314	<taggroup>td\|tr\|hr\|table\|h\d\|img\|body</taggroup>
	315	<taggroup>p\|br\|td\|tr\|hr\|table\|h\d\|img\|body</taggroup>
	316	<taggroup>font\|p\|i\|b\|em\|img</taggroup>
	317	</delimitertagset>
	318	';
	319
	320	sub set_extraction_options() {
	321	my ($self, $collpath) = @_;
	322	my ($filepath);
	323
	324	print {$self->{'outhandle'}} "W3ImgPlug: Initialising\n"
	325	if $self->{'verbosity'} > 1;
	326	# etc/W3ImgPlug.cfg (XML)
	327	# tag sets for captions and neartext
[4744]	328	if ( $self->{'aggressiveness'} > 1 && $self->{'aggressiveness'} != 9 ) {
[2899]	329	$self->{'delims'} = [];
	330	$self->{'cdelims'} = [];
	331	my ($cfg, @tagsets, $tagset, $type, @delims);
	332
	333	$filepath = "$collpath/etc/W3ImgPlug.cfg";
	334	if ( open CFG, "<$filepath" ) {
	335	while (<CFG>) { $cfg .= $_ }
	336	close CFG;
	337	} else {
	338	$cfg = $defaultcfg;
	339	}
	340
	341	(@tagsets) =
	342	$cfg =~ /<delimitertagset>(.*?)<\/delimitertagset>/igs;
	343	foreach $tagset ( @tagsets ) {
	344	($type) = $tagset =~ /<setname>(.*?)<\/setname>/i;
	345	if ( lc($type) eq "caption" ) {
	346	(@{$self->{'cdelims'}}) = $tagset =~ /<taggroup>(.*?)<\/taggroup>/igs;
	347	}
	348	elsif ( lc($type) eq "neartext" ) {
	349	(@{$self->{'delims'}}) = $tagset =~ /<taggroup>(.*?)<\/taggroup>/igs;
	350	}
	351	}
	352
	353	# output a warning if there seem to be no delimiters
	354	if ( scalar(@{$self->{'cdelims'}} == 0)) {
	355	print {$self->{'outhandle'}} "W3ImgPlug: Warning: no caption delimiters found in $filepath\n";
	356	}
	357	if ( scalar(@{$self->{'delims'}} == 0)) {
	358	print {$self->{'outhandle'}} "W3ImgPlug: Warning: no neartext delimiters found in $filepath\n";
	359	}
	360	}
	361
	362	# get stop words for textual reference extraction
	363	# TODO: warnings scroll off. Would be best to output them again at end of import
[4744]	364	if ( $self->{'aggressiveness'} >=5 && $self->{'aggressiveness'} != 9 ) {
[2899]	365	$self->{'stopwords'} = ();
	366	$filepath = &util::filename_cat($ENV{'GSDLHOME'}, "etc", "packages", "phind", "stopword", "en", "brown.sw");
	367	if ( open STOPWORDS, "<$filepath" ) {
	368	while ( <STOPWORDS> ) {
	369	chomp;
	370	$self->{'stopwords'}{$_} = 1;
	371	}
	372	close STOPWORDS;
	373	} else {
	374	print {$self->{'outhandle'}} "W3ImgPlug: Warning: couldn't open stopwords file at $filepath ($!)\n";
	375	}
	376
	377	}
	378
[10218]	379	if ( $self->{'neartext_length'} > $self->{'max_near_text'} ) {
	380	$self->{'max_near_text'} = $self->{'neartext_length'} * 1.33;
	381	print {$self->{'outhandle'}} "W3ImgPlug: Warning: adjusted max_text to $self->{'max_near_text'}\n";
[2899]	382	}
[10218]	383	if ( $self->{'caption_length'} > $self->{'max_near_text'} ) {
	384	$self->{'max_near_text'} = $self->{'caption_length'} * 1.33;
	385	print {$self->{'outhandle'}} "W3ImgPlug: Warning: adjusted max_text to $self->{'max_near_text'}\n";
[2899]	386	}
	387
	388	return 1;
	389	}
	390
[7362]	391	# return number of files processed, undef if can't recognise, -1 if
	392	# cant process
[2899]	393	# Note that $base_dir might be "" and that $file might
	394	# include directories
	395	sub read {
[9853]	396	my ($self, $pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = (@_);
[2899]	397	my ($doc_obj, $section, $filepath, $imgtag, $pos, $context, $numdocs, $tndir, $imgs);
	398	# forward normal read (runs HTMLPlug if index_pages T)
[9853]	399	my $ok = $self->SUPER::read($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli);
[7362]	400	if ( ! $ok ) { return $ok } # what is this returning??
[2899]	401
	402	my $outhandle = $self->{'outhandle'};
	403	my $textref = $self->{'textref'};
	404	my $htdoc_obj = $self->{'htdoc_obj'};
	405	$numdocs = 0;
	406	$base_dir =~ /(.)\/./;
	407	$tndir = "$1/archives/thumbnails"; # TODO: this path shouldn't be hardcoded?
	408	&util::mk_all_dir($tndir) unless -e "$tndir";
	409
	410	$imgs = \%{$self->{'imglist'}};
	411	my $nimgs = $self->get_img_list($textref);
	412	$self->{'images_indexed'} = $nimgs;
	413	if ( $nimgs > 0 ) {
	414	my @fplist = (sort { $imgs->{$a}{'pos'} <=> $imgs->{$b}{'pos'} } keys %{$imgs});
	415	my $i = 0;
	416	foreach $filepath ( @fplist ) {
	417	$pos = $imgs->{$filepath}{'pos'};
	418	$context = substr ($$textref, $pos - 50, $pos + 50); # grab context (quicker)
	419	($imgtag) = ($context =~ /(<(?:img\|a\|body)\s[^>]$filepath[^>]>)/is );
	420	if (! defined($imgtag)) { $imgtag = $filepath }
	421	print $outhandle "W3ImgPlug: extracting $filepath\n"
	422	if ( $self->{'verbosity'} > 1 );
	423	$doc_obj = new doc ("", "indexed_doc");
	424	$section = $doc_obj->get_top_section();
[10254]	425	my $prevpos = ( $i == 0 ? 0 : $imgs->{$fplist[$i - 1]}{'pos'});
	426	my $nextpos = ( $i >= ($nimgs -1) ? -1 : $imgs->{$fplist[$i + 1]}{'pos'} );
[2899]	427
	428	$self->extract_image_info($imgtag, $filepath, $textref, $doc_obj, $section, $tndir, $prevpos, $nextpos);
	429	$processor->process($doc_obj);
	430	$numdocs++;
	431	$i++;
	432	}
	433	return $numdocs;
	434	} else {
	435	print $outhandle "W3ImgPlug: No images from $file indexed\n"
	436	if ( $self->{'verbosity'} > 2 );
	437	return 1;
	438	}
	439
	440	}
	441
	442	# for every valid image tag
	443	# 1. extract related text and image metadata
	444	# 2. add this as document meta-data
	445	# 3. add assoc image(s) as files
	446	#
	447	sub extract_image_info {
	448	my $self = shift (@_);
	449	my ($tag, $id, $textref, $doc_obj, $section, $tndir, $prevpos, $nextpos) = (@_);
	450	my ($filename, $orig_fp, $fn, $ext, $reltext, $relreltext, $crcid, $imgs,
	451	$thumbfp, $pagetitle, $alttext, $filepath, $aggr);
	452	$aggr = $self->{'aggressiveness'};
	453	$imgs = \%{$self->{'imglist'}};
	454	$filepath = $imgs->{$id}{'relpath'};
	455	($filename) = $filepath =~ /([^\/\\]+)$/s;
	456	($orig_fp) = "$self->{'base_path'}/$filepath";
	457	$orig_fp =~ tr/+/ /;
	458	$orig_fp =~ s/%([a-fA-F0-9][a-fA-F0-9])/pack("C", hex($1))/eg; # translate %2E to space, etc
[2996]	459	$orig_fp =~ s/\\/\//g;
[2899]	460	$filepath = "$self->{'htpath'}/$filepath";
[10254]	461	my ($onlyfn) = $filename =~ /([^\\\/]*)$/;
[2899]	462	($fn, $ext) = $onlyfn =~ /(.)\.(.)/;
	463	$fn = lc $fn; $ext = lc $ext;
	464	($reltext) = "<tr><td>GifComment</td><td>" . `identify $filepath -ping -format "%c"` . "</td></tr>\n"
	465	if ($ext eq "gif");
	466	$reltext .= "<tr><td>FilePath</td><td>$orig_fp</td></tr>\n";
	467
	468	if ($ENV{'GSDLOS'} =~ /^windows$/i) {
	469	$crcid = "$fn.$ext." . $self->{'next_crcid'}++;
	470	} else { ($crcid) = `cksum $filepath` =~ /^(\d+)/; }
	471	$thumbfp = "$tndir/tn_$crcid.jpg";
[10218]	472	`convert -flatten -filter Hanning $self->{'convert_params'} -geometry "$self->{'thumb_size'}x$self->{'thumb_size'}>" $filepath $thumbfp` unless -e $thumbfp;
[2899]	473	if ( ! (-e $thumbfp) ) {
	474	print STDERR "W3ImgPlug: 'convert' failed. Check ImageMagicK binaries are installed and working correctly\n"; return 0;
	475	}
	476
	477	# shove in full text (tag stripped or unstripped) if settings require it
	478	if ( $aggr == 10) {
	479	$reltext = "<tr><td>AllPage</td><td>" . $$textref . "</td><tr>\n"; # level 10 (all text, verbatim)
	480	} else {
	481	$pagetitle = $self->get_meta_value("title", $textref);
	482	($alttext) = $tag =~ /\salt\s=\s(?:\"\|\')(.+?)(?:\"\|\')/is;
	483	if ( defined($alttext) && length($alttext) > 1) {
	484	$reltext .= "<tr><td>ALTtext</td><td>$alttext</td></tr>\n"; }
	485	$reltext .= "<tr><td>SplitCapitalisation</td><td>" .
	486	$self->split_filepath($orig_fp) . "</td></tr>\n";
	487
	488	# get caption/tag based near text (if appropriate)
	489	if ( $aggr > 1 ) {
	490	if ( $aggr >= 2 ) {
	491	$reltext .=
	492	$self->extract_caption_text($tag, $textref, $prevpos, $imgs->{$id}{'pos'}, $nextpos);
	493	$relreltext = $reltext;
	494	}
	495	# repeat the filepath, alt-text, caption, etc
	496	if ( $aggr == 8 ) {
	497	$reltext .= $relreltext;
	498	}
	499	if ( $aggr >= 3 ) {
	500	$reltext .=
	501	$self->extract_near_text($tag, $textref, $prevpos, $imgs->{$id}{'pos'}, $nextpos);
	502	}
	503
	504	# get page metadata (if appropriate)
	505	if ( $aggr >= 6 \|\| ( $aggr >= 2 &&
	506	( $self->{'images_indexed'} < 2 \|\|
	507	($self->{'smallpage'} == 1 && $self->{'images_indexed'} < 6 )))) {
	508	$reltext .= $self->get_page_metadata($textref);
	509	}
	510	# textual references
	511	if ( $aggr == 5 \|\| $aggr >= 7) {
[10218]	512	if ( length($relreltext) > ($self->{'caption_length'} * 2) ) {
[2899]	513	$reltext .= $self->get_textrefs($relreltext, $textref, $prevpos, $imgs->{$id}{'pos'}, $nextpos); }
	514	else {
	515	$reltext .= $self->get_textrefs($reltext, $textref, $prevpos, $imgs->{$id}{'pos'}, $nextpos);
	516	}
	517	}
	518	} # aggr > 1
	519	} # aggr != 10
	520
	521	$doc_obj->set_OID($crcid);
	522	$doc_obj->associate_file($thumbfp, "$fn.thumb.jpg", undef, $section);
	523	$doc_obj->add_metadata($section, "OriginalFilename", $filename);
	524	$doc_obj->add_metadata($section, "FilePath", $orig_fp);
	525	$doc_obj->add_metadata($section, "Filename", $fn);
	526	$doc_obj->add_metadata($section, "FileExt", $ext);
	527	$doc_obj->add_metadata($section, "FileSize", $imgs->{$id}{'filesize'});
	528	$doc_obj->add_metadata($section, "Width", $imgs->{$id}{'width'});
	529	$doc_obj->add_metadata($section, "Height", $imgs->{$id}{'height'});
	530	$doc_obj->add_metadata($section, "URL", "http://$orig_fp");
	531	$doc_obj->add_metadata($section, "PageURL", $self->{'hturl'});
	532	$doc_obj->add_metadata($section, "PageTitle", $pagetitle);
	533	$doc_obj->add_metadata($section, "ThumbURL",
	534	"_httpcollection_/index/assoc/[archivedir]/$fn.thumb.jpg");
[8121]	535	$doc_obj->add_metadata($section, "FileFormat", "W3Img");
[2899]	536
	537	if ( $self->{'document_text'} ) {
	538	$doc_obj->add_utf8_text($section, "<table border=1>\n$reltext</table>");
	539	} else {
	540	$doc_obj->add_metadata($section, "ImageText", "<table border=1>\n$reltext</table>\n");
	541	}
	542
	543	if ( $self->{'index_pages'} ) {
	544	my ($cache_url) = "_httpdoc_&d=" . $self->{'htdoc_obj'}->get_OID();
	545	if ( $imgs->{$id}{'anchored'} ) {
	546	my $a_name = $id;
	547	$a_name =~ s/[\/\\\:\&]/_/g;
	548	$cache_url .= "#gsdl_$a_name" ;
	549	}
	550	$doc_obj->add_utf8_metadata($section, "CachePageURL", $cache_url);
	551	}
	552	if ( ! $self->{'no_cache_images'} ) {
	553	$onlyfn = lc $onlyfn;
	554	$doc_obj->associate_file($filepath, $onlyfn, undef, $section);
	555	$doc_obj->add_utf8_metadata($section, "CacheURL",
	556	"_httpcollection_/index/assoc/[archivedir]/$onlyfn");
	557	}
	558	return 1;
	559	}
	560
	561	sub get_page_metadata {
	562	my ($self, $textref) = (@_);
	563	my (@rval);
	564	$rval[0] = $self->get_meta_value("title", $textref);
	565	$rval[1] = $self->get_meta_value("keywords", $textref);
	566	$rval[2] = $self->get_meta_value("description", $textref);
	567	$rval[3] = $self->{'filename'};
	568
	569	return wantarray ? @rval : "<tr><td>PageMeta</td><td>@rval</td></tr>\n" ;
	570	}
	571
	572	# turns LargeCatFish into Large,Cat,Fish so MG sees the separate words
	573	sub split_filepath {
	574	my ($self, $filepath) = (@_);
	575	my (@words) = $filepath =~ /([A-Z][a-z]+)/g;
	576	return join(',', @words);
	577	}
	578
	579	# finds and extracts sentences
	580	# that seem to be on the same topic
	581	# as other related text (correlations)
	582	# and textual references (e.g. in figure 3 ...)
	583	sub get_textrefs {
	584	my ($self, $reltext, $textref, $prevpos, $pos, $nextpos) = (@_);
	585	my ($maxtext, $mintext, $startpos, $context_size, $context);
	586
	587	my (@relwords, @refwords, %sentences, @pagemeta);
	588
	589	# extract larger context
[10218]	590	$maxtext = $self->{'max_near_text'};
[2899]	591	$startpos = $pos - ($maxtext * 4);
	592	$context_size = $maxtext*10;
	593	if ($startpos < $prevpos ) { $startpos = $prevpos }
	594	if ($nextpos != -1 && $context_size > ( $nextpos - $startpos )) { $context_size = ($nextpos - $startpos) }
	595	$context = substr ( $$textref, $startpos, $context_size );
	596	$context =~ s/<.*?>//gs;
	597	$context =~ s/^.>(.)/$1/gs;
	598	$context =~ s/(.)<.$/$1/gs;
	599
	600	# get page meta-data (if not already included)
	601	if ( $self->{'aggressiveness'} == 5 && ! $self->{'smallpage'} ) {
	602	@pagemeta = $self->get_page_metadata($textref);
[10254]	603	foreach my $value ( @pagemeta ) {
[2899]	604	$context .= "$value."; # make each into psuedo-sentence
	605	}
	606	}
	607
	608	# TODO: this list is not exhaustive
	609	@refwords = ( '(?:is\|are)? ?(?:show(?:s\|n)\|demonstrate(?:d\|s)\|explains\|features) (?:in\|by\|below\|above\|here)',
	610	'(?:see)? (?:figure\|table)? (?:below\|above)');
	611
	612	# extract general references
[10254]	613	foreach my $rw ( @refwords ) {
[2899]	614	while ( $context =~ /[\.\?\!\,](.?$rw\W.?[\.\?\!\,])/ig ) {
[10254]	615	my $sentence = $1;
[2899]	616	$sentence =~ s/\s+/ /g;
	617	$sentences{$sentence}+=2;
	618	}
	619	}
	620	# extract specific (figure, table) references by number
	621	my ($fignum) = $context =~ /[\.\?\!].?(?:figure\|table)s?[\-\_\ \.](\d+\w)\W.*?[\.\?\!]/ig;
	622	if ( $fignum ) {
[10254]	623	foreach my $rw ( @refwords ) {
[2899]	624	while ( $context =~ /[\.\?\!](.?(figure\|table)[\-\_\ \.]$fignum\W.?[\.\?\!])/ig ) {
[10254]	625	my $sentence = $1;
[2899]	626	$sentence =~ s/\s+/ /g;
	627	$sentences{$sentence}+=4;
	628	}
	629	}
	630	}
	631
	632	# sentences with occurances of important words
	633	@relwords = $reltext =~ /([a-zA-Z]{4,})/g; # take out small words
[10254]	634	foreach my $word ( @relwords ) {
[2899]	635	if ( $self->{'stopwords'}{$word} ) { next } # skip stop words
	636	while ( $context =~ /([^\.\?\!]?$word\W.?[\.\?\!])/ig ) {
[10254]	637	my $sentence = $1;
[2899]	638	$sentence =~ s/\s+/ /g;
	639	$sentences{$sentence}++;
	640	}
	641	}
[10254]	642	foreach my $sentence ( keys %sentences ) {
[10218]	643	if ($sentences{$sentence} < $self->{'textrefs_threshold'}) {
[2899]	644	delete $sentences{$sentence};
	645	}
	646	}
	647	my ($rval) = join "<br>\n", (keys %sentences);
	648	if ( $rval && length($rval) > 5 ) {
	649	return ( "<tr><td>TextualReferences</td><td>" . $rval . "</td></tr>\n") }
	650	else { return "" }
	651	}
	652
	653	# handles caption extraction
	654	# calling the extractor with different
	655	# tags and choosing the best candidate caption
	656	sub extract_caption_text {
	657	my ($self, $tag, $textref, $prevpos, $pos, $nextpos) = (@_);
[10254]	658	my (@neartext, $len, $hdelim, $mintext, $goodlen,
[2899]	659	$startpos, $context, $context_size);
	660
[10218]	661	$mintext = $self->{'min_near_text'};
	662	$goodlen = $self->{'caption_length'};
[2899]	663
	664	# extract a context to extract near text from (faster)
[10218]	665	$context_size = $self->{'max_near_text'}*3;
[2899]	666	$startpos = $pos - ($context_size / 2);
	667	if ($startpos < $prevpos ) { $startpos = $prevpos }
	668	if ($nextpos != -1 && $context_size > ( $nextpos - $startpos ))
	669	{ $context_size = ($nextpos - $startpos) }
	670
	671	$context = substr ( $$textref, $startpos, $context_size );
	672	$context =~ s/<!--.*?-->//gs;
	673	$context =~ s/^.-->(.)/$1/gs;
	674	$context =~ s/(.)<!--.$/$1/gs;
	675
	676	# try stepping through markup delimiter sets
	677	# and selecting the best one
	678	foreach $hdelim ( @{ $self->{'cdelims'} } ) {
	679	@neartext = $self->extract_caption($tag, $hdelim, \$context);
	680	$len = length(join("", @neartext));
	681	last if ($len >= $mintext && $len <= $goodlen);
	682	}
	683	# reject if well over reasonable length
	684	if ( $len > $goodlen ) {
	685	@neartext = [];
	686	}
	687	$neartext[0] = " " if (! defined $neartext[0]);
	688	$neartext[1] = " " if (! defined $neartext[1]);
	689	return "<tr><td>Caption</td><td>" . (join ",", @neartext) . "</td></tr>\n"; # TODO: the \| is for testing purposes
	690	} # end extract_caption_text
	691
	692	# the previous section header often gives a bit
	693	# of context to the section that the image is
	694	# in (invariably the header is before/above the image)
	695	# so extract the text of the closest header above the image
	696	#
	697	# this fn just gets all the headers above the image, within the context window
	698	sub get_prev_header {
	699	my ($self, $pos, $textref) = (@_);
	700	my ($rhtext);
	701	while ( $$textref =~ /<h\d>(.*?)<\/h\d>/sig ) {
	702	# only headers before image
	703	if ((pos $$textref) < $pos) {
	704	$rhtext .= "$1, ";
	705	}
	706	}
	707	if ( $rhtext ) { return "Header($rhtext)" }
	708	else { return "" }
	709	}
	710
	711	# not the most robust tag stripping
	712	# regexps (see perl.com FAQ) but good enough
	713	#
	714	# used by caption & tag-based near text algorithms
	715	sub strip_tags {
	716	my ( $self, $value ) = @_;
	717	if ( ! defined($value) ) { $value = "" } # handle nulls
	718	else {
	719	$value =~ s/<.*?>//gs; # strip all html tags
	720	$value =~ s/\s+/\ /g; # remove extra whitespace
	721	$value =~ s/\&\w+\;//g; # remove   etc
	722	}
	723	return $value;
	724	}
	725
	726	# uses the given tag(s) to identify
	727	# the caption near to the image
	728	# (below, above or both below and above)
	729	sub extract_caption {
	730	my ($self, $tag, $bound_tag, $contextref) = (@_);
	731	my (@nt, $n, $etag, $gotcap);
	732	return ("", "") if ( ! ($$contextref =~ /\Q$tag/) );
	733
	734	$nt[0] = $`;
	735	$nt[1] = $';
	736	$gotcap = 0;
	737
	738	# look before the image for a boundary tag
	739	($etag, $nt[0]) = $nt[0] =~ /<($bound_tag)[\s]?.?>(.?)$/is;
	740	# if bound_tag too far from the image, then prob not caption
	741	# (note: have to allow for tags, so multiply by 3
[10218]	742	if ( $etag && length($nt[0]) < ($self->{'caption_length'} * 3) ) {
[2899]	743	if ( $nt[0] =~ /<\/$etag>/si ) {
	744	# the whole caption is above the image: <tag>text</tag><img>
	745	($nt[0]) =~ /<(?:$etag)[\s]?.?>(.?)<\/$etag>/is;
	746	$nt[0] = $self->strip_tags($nt[0]);
[10218]	747	if ( length($nt[0]) > $self->{'min_near_text'} ) {
[2899]	748	$gotcap = 1;
	749	$nt[1] = "";
	750	}
	751
	752	} elsif ( $nt[1] =~ /<\/$etag>/si) {
	753	# the caption tag covers image: <tag>text?<img>text?</tag>
	754	($nt[1]) = $nt[1] =~ /(.*?)<\/$etag>/si;
	755	$nt[0] = $self->strip_tags($nt[0] . $nt[1]);
[10218]	756	if ( length($nt[0]) > $self->{'min_near_text'} ) {
[2899]	757	$gotcap = 2;
	758	$nt[1] = "";
	759	}
	760	}
	761	}
	762	# else try below the image
	763	if ( ! $gotcap ) {
	764	# the caption is after the image: <img><tag>text</tag>
	765	($etag, $nt[1]) = $nt[1] =~ /^.?<($bound_tag)[\s]?.?>(.*)/is;
	766	if ( $etag && $nt[1] =~ /<\/$etag>/s) {
	767	($nt[1]) = $nt[1] =~ /(.*?)<\/$etag>/si;
	768	$gotcap = 3;
	769	$nt[0] = "";
	770	$nt[1] = $self->strip_tags($nt[1]);
	771	}
	772	}
	773	if ( ! $gotcap ) { $nt[0] = $nt[1] = "" }
	774	else {
	775	# strip part-tags
	776	$nt[0] =~ s/^.*>//s;
	777	$nt[1] =~ s/<.*$//s;
	778	}
	779	my ($type);
	780	if ( $gotcap == 0 ) { return ("nocaption", "") }
	781	elsif ( $gotcap == 1 ) { $type = "captionabove:" }
	782	elsif ( $gotcap == 2 ) { $type = "captioncovering:" }
	783	elsif ( $gotcap == 3 ) { $type = "captionbelow:" }
	784	return ($type, $nt[0], $nt[1]);
	785	}
	786
	787	# tag-based near text
	788	#
	789	# tries different tag sets
	790	# and chooses the best one
	791	sub extract_near_text {
	792	my ($self, $tag, $textref, $prevpos, $pos, $nextpos) = (@_);
	793	my (@neartext, $len, $hdelim, $maxtext, $mintext, $goodlen,
	794	@bestlen, @best, $startpos, $context, $context_size,
	795	$dist, $bdist, $best1, $i, $nt);
	796	$bestlen[0] = $bestlen[1] = 0; $bestlen[2] = $bdist = 999999;
	797	$best[0] = $best[1] = $best[2] = "";
[10218]	798	$maxtext = $self->{'max_near_text'};
	799	$mintext = $self->{'min_near_text'};
	800	$goodlen = $self->{'neartext_length'};
[2899]	801
	802	# extract a context to extract near text from (faster)
	803	$context_size = $maxtext*4;
	804	$startpos = $pos - ($context_size / 2);
	805	if ($startpos < $prevpos ) { $startpos = $prevpos }
	806	if ($nextpos != -1 && $context_size > ( $nextpos - $startpos ))
	807	{ $context_size = ($nextpos - $startpos) }
	808	$context = substr ( $$textref, $startpos, $context_size );
	809	$context =~ s/<!--.*?-->//gs;
	810	$context =~ s/^.-->(.)/$1/gs;
	811	$context =~ s/(.)<!--.$/$1/gs;
	812
	813	# try stepping through markup delimiter sets
	814	# and selecting the best one
	815	foreach $hdelim ( @{ $self->{'delims'} } ) {
	816	@neartext = $self->extract_tagged_neartext($tag, $hdelim, \$context);
	817	$nt = join("", @neartext);
	818	$len = length($nt);
	819	# Priorities:
	820	# 1. Greater than mintext
	821	# 2. Less than maxtext
	822	# 3. Closest to goodlen
	823	if ( $len <= $goodlen && $len > $bestlen[0] ) {
	824	$bestlen[0] = $len;
	825	$best[0] = $hdelim;
	826	} elsif ( $len >= $maxtext && $len < $bestlen[2] ) {
	827	$bestlen[2] = $len;
	828	$best[2] = $hdelim;
	829	} elsif ( $len >= $bestlen[0] && $len <= $bestlen[2] ) {
	830	$dist = abs($goodlen - $len);
	831	if ( $dist < $bdist ) {
	832	$bestlen[1] = $len;
	833	$best[1] = $hdelim;
	834	$bdist = $dist;
	835	}
	836	}
	837	}
	838	$best1 = 2;
	839	foreach $i ( 0..2 ) {
	840	if ( $bestlen[$i] == 999999 ) { $bestlen[$i] = 0 }
	841	$dist = abs($goodlen - $bestlen[$i]);
	842	if ( $bestlen[$i] > $mintext && $dist <= $bdist ) {
	843	$best1 = $i;
	844	$bdist = $dist;
	845	}
	846	}
	847	@neartext = $self->extract_tagged_neartext($tag, $best[$best1], \$context);
	848	if ( $bestlen[$best1] > $maxtext ) {
	849	# truncate on word boundary if too much text
	850	my $hmax = $maxtext / 2;
	851	($neartext[0]) = $neartext[0] =~ /([^\s]*.{1,$hmax})$/s;
	852	($neartext[1]) = $neartext[1] =~ /^(.{1,$hmax}[^\s]*)/s;
	853	} elsif ( $bestlen[$best1] < $mintext ) {
	854	# use plain text extraction if tags failed (e.g. usable tag outside context)
	855	print {$self->{'outhandle'}} "W3ImgPlug: Fallback to plain-text extraction for $tag\n"
	856	if $self->{'verbosity'} > 2;
	857	$neartext[0] = "<tr><td>RawNeartext</td><td>" . $self->extract_raw_neartext($tag, $textref) . "</td></tr>";
	858	$neartext[1] = "";
	859	}
	860	# get previous header if available
	861	$neartext[0] .= "<br>\n" .
	862	$self->get_prev_header($pos, \$context) if ( $self->{'aggressiveness'} >= 4 );
	863	$neartext[0] = " " if (! defined $neartext[0]);
	864	$neartext[1] = " " if (! defined $neartext[1]);
	865
	866	return "<tr><td>NearText</td><td>" . (join "\|", @neartext) . "</td></tr>\n"; # TODO: the \| is for testing purposes
	867	} # end extract_near_text
	868
	869	# actually captures tag-based
	870	# near-text given a tag set
	871	sub extract_tagged_neartext {
	872	my ($self, $tag, $bound_tag, $textref) = (@_);
	873	return "" if ( ! ($$textref =~ /\Q$tag/) );
	874	my (@nt, $delim, $pre_tag, $n);
	875	$nt[0] = $`;
	876	$nt[1] = $';
	877
	878	# get text after previous image tag
	879	$nt[0] =~ s/.<($bound_tag)[^>]>(.*)/$2/is; # get rid of preceding text
	880	if (defined($1)) { $delim = $1 }
	881	$pre_tag = $bound_tag;
	882
	883	if (defined($delim)) {
	884	# we want to try and use the end tag of the previous delimiter
	885	# (put it on the front of the list)
	886	$pre_tag =~ s/(^\|\\|)($delim)($\|\\|)//i; # take it out
	887	$pre_tag =~ s/\\|\\|/\\|/i; # replace \|\| with \|
	888	$pre_tag = $delim . "\|" . $pre_tag; # put it on the front
	889	}
	890
	891	# get text before next image tag
	892	$nt[1] =~ s/<\/?(?:$pre_tag)[^>]>.//is; # get rid of stuff after first delimiter
	893
	894	# process related text
	895	for $n (0..1) {
	896	if ( defined($nt[$n]) ) {
	897	$nt[$n] =~ s/<.*?>//gs; # strip all html tags
	898	$nt[$n] =~ s/\s+/\ /gs; # remove extra whitespace
	899	$nt[$n] =~ s/\&\w+\;//sg; # remove   etc
	900	# strip part-tags
	901	if ( $n == 0 ) { $nt[0] =~ s/^.*>//s }
	902	if ( $n == 1 ) { $nt[1] =~ s/<.*$//s }
	903	} else { $nt[$n] = ""; } # handle nulls
	904	}
	905	return @nt;
	906	}
	907
	908	# this function is fall-back
	909	# if tags aren't suitable.
	910	#
	911	# extracts a fixed length of characters
	912	# either side of image tag (on word boundary)
	913	sub extract_raw_neartext {
	914	my ($self, $tag, $textref) = (@_);
	915	my ($rawtext, $startpos, $fp);
	916	my $imgs = \%{$self->{'imglist'}};
	917	($fp) = $tag =~ /([\w\\\/]+\.(?:gif\|jpe?g\|png))/is;
	918	if (! $fp) { return " " };
	919	# if the cached, plain-text version isn't there, then create it
	920	$self->init_plaintext($textref) unless defined($self->{'plaintext'});
	921
	922	# take the closest maxtext/2 characters
	923	# either side of the tag (by word boundary)
	924	return "" if ( ! exists $imgs->{$fp}{'rawpos'} );
[10218]	925	$startpos = $imgs->{$fp}{'rawpos'} - (($self->{'max_near_text'} / 2) + 20);
[2899]	926	if ( $startpos < 0 ) { $startpos = 0 }
[10218]	927	$rawtext = substr $self->{'plaintext'}, $startpos, $self->{'max_near_text'} + 20;
[2899]	928	$rawtext =~ s/\s\s/ /g;
	929
	930	return $rawtext;
	931	}
	932
	933	# init plaintext variable for HTML-stripped version
	934	# (for full text index/raw assoc text extraction)
	935	sub init_plaintext {
	936	my ($self, $textref) = (@_);
	937	my ($page, $fp);
	938	my $imgs = \%{$self->{'imglist'}};
	939	$page = $$textref; # make a copy of original
	940
	941	# strip tags around image filenames so they don't get zapped
	942	$page =~ s/<\w+\s+.?([\w\/\\]+\.(?:gif\|jpe?g\|png))[^>]>/\"$1\"/gsi;
	943	$page =~ s/<.*?>//gs;
	944	$page =~ s/ / /gs;
	945	$page =~ s/&/&/gs; #TODO: more &zzz; replacements (except <, $gt;)
	946
	947	# get positions and strip images
	948	while ( $page =~ /([^\s\'\"]+\.(jpe?g\|gif\|png))/ig ) {
	949	$fp = $1;
	950	if ( $imgs->{$fp}{'exists'} ) {
	951	$imgs->{$fp}{'rawpos'} = pos $page;
	952	}
	953	$page =~ s/\"$fp\"//gs;
	954	}
	955	$self->{'plaintext'} = $page;
	956	}
	957
	958	# finds and filters images based on size
	959	# (dimensions, height, filesize) and existence
	960	#
	961	# looks for image filenames (.jpg, .gif, etc)
	962	# and checks for existence on disk
	963	# (hence supports most JavaScript images)
	964	sub get_img_list {
	965	my $self = shift (@_);
	966	my ($textref) = (@_);
	967	my ($filepath, $relpath, $abspath, $pos, $num, $width, $height, $filesize);
	968	my $imgs = \%{$self->{'imglist'}};
	969	while ( $$textref =~ /([^\s\'\"]+\.(jpe?g\|gif\|png))/ig ) {
	970	$filepath = $1;
	971	$pos = pos $$textref;
	972	next if ( $imgs->{$filepath}{'relpath'} );
	973	$relpath = $filepath;
	974	$relpath =~ s/^http\:\/\///; # remove http:// in case we have mirrored it
	975	$relpath =~ s/\\/\//g; # replace \ with /
	976	$relpath =~ s/^\.\///s; # make "./filepath" into "filepath"
	977	$imgs->{$filepath}{'relpath'} = $relpath;
	978	$abspath = "$self->{'htpath'}/$relpath";
	979
	980	if (! -e $abspath) { next }
	981
	982	# can't modify real filepath var because it
	983	# then can't be located in the page for tag recognition later
	984	($width, $height) =
	985	`identify $abspath -ping -format "%wx%h"` =~ /^(\d)x(\d)$/m;
	986	if (! ($width && $height)) {
	987	print STDERR "W3ImgPlug: ($abspath) 'identify' failed. Check ImageMagicK binaries are installed and working correctly\n"; next;
	988	}
	989	$filesize = (-s $abspath);
[10218]	990	if ( $filesize >= $self->{'min_size'}
	991	&& ( $width >= $self->{'min_width'} )
	992	&& ( $height >= $self->{'min_height'} ) ) {
[2899]	993
	994	$imgs->{$filepath}{'exists'} = 1;
	995	$imgs->{$filepath}{'pos'} = $pos;
	996	$imgs->{$filepath}{'width'} = $width;
	997	$imgs->{$filepath}{'height'} = $height;
	998	$imgs->{$filepath}{'filesize'} = $filesize;
	999	} else {
	1000	print {$self->{'outhandle'}} "W3ImgPlug: skipping $self->{'base_path'}/$relpath: $filesize, $width x $height\n"
	1001	if $self->{'verbosity'} > 2;
	1002	}
	1003	}
	1004	$num = 0;
[10254]	1005	foreach my $i ( keys %{$imgs} ) {
[2899]	1006	if ( $imgs->{$i}{'pos'} ) {
	1007	$num++;
	1008	} else { delete $imgs->{$i} }
	1009	}
	1010	return $num;
	1011	}
	1012
	1013	# make the text available to the read function
	1014	# by making it an object variable
	1015	sub read_file {
	1016	my ($self, $filename, $encoding, $language, $textref) = @_;
	1017	$self->SUPER::read_file($filename, $encoding, $language, $textref);
	1018
	1019	# if HTMLplug has run through, then it will
	1020	# have replaced references so we have to
	1021	# make a copy of the text before processing
	1022	if ( $self->{'index_pages'} ) {
	1023	$self->{'text'} = $$textref;
	1024	$self->{'textref'} = \($self->{'text'});
	1025	} else {
	1026	$self->{'textref'} = $textref;
	1027	}
	1028	$self->{'plaintext'} = undef;
	1029	}
	1030
	1031	# HTMLPlug only extracts meta-data if it is specified in plugin options
	1032	# hence a special function to do it here
	1033	sub get_meta_value {
	1034	my ($self, $name, $textref) = @_;
	1035	my ($value);
	1036	$name = lc $name;
	1037	if ($name eq "title") {
	1038	($value) = $$textref =~ /<title>(.*?)<\/title>/is
	1039	} else {
	1040	my $qm = "(?:\"\|\')";
	1041	($value) = $$textref =~ /<meta name\s=\s$qm?$name$qm?\s+content\s=\s$qm?(.?)$qm?\s>/is
	1042	}
	1043	$value = "" unless $value;
	1044	return $value;
	1045	}
	1046
	1047	# make filename an anchor reference
	1048	# so we can go straight to the image
	1049	# within the cached version of the source page
	1050	# (augment's HTMLPlug sub)
	1051	sub replace_images {
	1052	my $self = shift (@_);
	1053	my ($front, $link, $back, $base_dir,
	1054	$file, $doc_obj, $section) = @_;
[2996]	1055	$link =~ s/\"//g;
[2899]	1056	my ($a_name) = $link;
	1057	$a_name =~ s/[\/\\\:\&]/_/g;
	1058	# keep a list so we don't repeat the same anchor
	1059	if ( ! $self->{'imglist'}{$link}{'anchored'} ) {
	1060	$front = "<a name=\"gsdl_$a_name\">$front";
	1061	$back = "$back</a>";
	1062	$self->{'imglist'}{$link}{'anchored'} = 1;
	1063	}
	1064	return $self->SUPER::replace_images($front, $link, $back, $base_dir,
	1065	$file, $doc_obj, $section);
	1066	}
	1067
	1068	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: