Context Navigation

← Previous Revision
Next Revision →
Blame
Revision Log

HTMLImagePlugin.pm

Last change on this file was 28563, checked in by kjdon, 10 years ago
changing some util:: methods to FileUtils:: methods
Property svn:keywords set to `Author Date Id Revision`
File size: 37.3 KB

Line
1	###########################################################################
2	#
3	# HTMLImagePlugin.pm -- Context-based image indexing plugin for HTML documents
4	#
5	# A component of the Greenstone digital library software
6	# from the New Zealand Digital Library Project at the
7	# University of Waikato, New Zealand.
8	#
9	# Copyright (C) 2001 New Zealand Digital Library Project
10	#
11	# This program is free software; you can redistribute it and/or modify
12	# it under the terms of the GNU General Public License as published by
13	# the Free Software Foundation; either version 2 of the License, or
14	# (at your option) any later version.
15	#
16	# This program is distributed in the hope that it will be useful,
17	# but WITHOUT ANY WARRANTY; without even the implied warranty of
18	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	# GNU General Public License for more details.
20	#
21	# You should have received a copy of the GNU General Public License
22	# along with this program; if not, write to the Free Software
23	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24	#
25	###########################################################################
26
27	# DESCRIPTION:
28	#
29	# Extracts images and associated text and metadata from
30	# web pages as individual documents for indexing. Thumbnails
31	# are created from each image for browsing purposes.
32	#
33	# Options are available for configuring the aggressiveness of the
34	# associated text extraction mechanisms. A higher level of
35	# aggressiveness will extract more text and consequently
36	# may mean lower accuracy (precision); however, it may also
37	# retrieve more of the relevant images from the collection (recall).
38	# Lower levels of aggressiveness maybe result in slightly faster
39	# collection builds at the import stage.
40	#
41	# HTMLImagePlugin is a subclass of HTMLPlug (i.e. it will index pages also
42	# if required). It can be used in place of HTMLPlugin to index both
43	# pages and their images.
44	#
45	# REQUIREMENTS:
46	#
47	# The ImageMagick image manipulation is used to create
48	# thumbnails and extract some image metadata. (Available
49	# from http://www.imagemagick.org/)
50	#
51	# Unix:
52	# Many Linux distributions contain ImageMagick.
53	#
54	# Windows:
55	# ImageMagick can be downloaded from the website above.
56	# Make sure the system path includes the ImageMagick binaries
57	# before using HTMLImagePlugin.
58	#
59	# NOTE: NT/2000/XP contain a filesystem utility 'convert.exe'
60	# with the same name as the image conversion utility. The
61	# ImageMagick FAQ recommends renaming the filesystem
62	# utility (e.g. to 'fsconvert.exe') to avoid this clash.
63	#
64	# USAGE:
65	#
66	# An image document consists of metadata elements:
67	#
68	# OriginalFilename, FilePath, Filename, FileExt, FileSize,
69	# Width, Height, URL, PageURL, ThumbURL, CacheURL, CachePageURL
70	# ImageText, PageTitle
71	#
72	# Most of these are only useful in format strings (e.g. ThumbURL,
73	# Filename, URL, PageURL, CachePageURL).
74	#
75	# ImageText, as the name suggests contains the indexable text.
76	# (unless using the -document_text plugin option)
77	#
78	# Since image documents are made up of metadata elements
79	# alone, format strings are needed to display them properly.
80	# NOTE: The receptionist will only display results (e.g. thumbnails)
81	# in 4 columns if the format string begins with "<td><table>".
82	#
83	# The example below takes the user to the image within the
84	# source HTML document rather than using a format string
85	# on DocumentText to display the image document itself.
86	#
87	# Example collect.cfg:
88	#
89	# ...
90	#
91	# indexes document:ImageText document:text
92	# defaultindex document:ImageText
93	#
94	# collectionmeta .document:ImageText "images"
95	# collectionmeta .document:text "documents"
96	#
97	# ...
98	#
99	# plugin HTMLImagePlugin -index_pages -aggressiveness 6
100	#
101	# ...
102	#
103	# format SearchVList '<td>{If}{[Title],[link][icon] [Title][[/link],
104	# <table><tr><td align="center"><a href="[CachePageURL]">
105	# <img src="[ThumbURL]"></a></td></tr><tr><td align="center">
106	# <a href="[CachePageURL]"><font size="-1">[OriginalFilename]</font></a>
107	# <br>[Width]x[Height]</td></tr></table>}</td>'
108	#
109	# ...
110	#
111
112	package HTMLImagePlugin;
113
114	use HTMLPlugin;
115	use ghtml;
116	use unicode;
117	use util;
118	use strict; # 'subs';
119	no strict 'refs'; # allow filehandles to be variables and viceversa
120
121	sub BEGIN {
122	@HTMLImagePlugin::ISA = qw( HTMLPlugin );
123	}
124
125	my $aggressiveness_list =
126	[ { 'name' => "1",
127	'desc' => "{HTMLImagePlugin.aggressiveness.1}" },
128	{ 'name' => "2",
129	'desc' => "{HTMLImagePlugin.aggressiveness.2}" },
130	{ 'name' => "3",
131	'desc' => "{HTMLImagePlugin.aggressiveness.3}" },
132	{ 'name' => "4",
133	'desc' => "{HTMLImagePlugin.aggressiveness.4}" },
134	{ 'name' => "5",
135	'desc' => "{HTMLImagePlugin.aggressiveness.5}" },
136	{ 'name' => "6",
137	'desc' => "{HTMLImagePlugin.aggressiveness.6}" },
138	{ 'name' => "7",
139	'desc' => "{HTMLImagePlugin.aggressiveness.7}" },
140	{ 'name' => "8",
141	'desc' => "{HTMLImagePlugin.aggressiveness.8}" },
142	{ 'name' => "9",
143	'desc' => "{HTMLImagePlugin.aggressiveness.9}" } ];
144
145	my $arguments =
146	[ { 'name' => "aggressiveness",
147	'desc' => "{HTMLImagePlugin.aggressiveness}",
148	'type' => "int",
149	'list' => $aggressiveness_list,
150	'deft' => "3",
151	'reqd' => "no" },
152	{ 'name' => "index_pages",
153	'desc' => "{HTMLImagePlugin.index_pages}",
154	'type' => "flag",
155	'reqd' => "no" },
156	{ 'name' => "no_cache_images",
157	'desc' => "{HTMLImagePlugin.no_cache_images}",
158	'type' => "flag",
159	'reqd' => "no" },
160	{ 'name' => "min_size",
161	'desc' => "{HTMLImagePlugin.min_size}",
162	'type' => "int",
163	'deft' => "2000",
164	'reqd' => "no" },
165	{ 'name' => "min_width",
166	'desc' => "{HTMLImagePlugin.min_width}",
167	'type' => "int",
168	'deft' => "50",
169	'reqd' => "no" },
170	{ 'name' => "min_height",
171	'desc' => "{HTMLImagePlugin.min_height}",
172	'type' => "int",
173	'deft' => "50",
174	'reqd' => "no" },
175	{ 'name' => "thumb_size",
176	'desc' => "{HTMLImagePlugin.thumb_size}",
177	'type' => "int",
178	'deft' => "100",
179	'reqd' => "no" },
180	{ 'name' => "convert_params",
181	'desc' => "{HTMLImagePlugin.convert_params}",
182	'type' => "string",
183	'deft' => "",
184	'reqd' => "no" },
185	{ 'name' => "min_near_text",
186	'desc' => "{HTMLImagePlugin.min_near_text}",
187	'type' => "int",
188	'deft' => "10",
189	'reqd' => "no" },
190	{ 'name' => "max_near_text",
191	'desc' => "{HTMLImagePlugin.max_near_text}",
192	'type' => "int",
193	'deft' => "400",
194	'reqd' => "no" },
195	{ 'name' => "smallpage_threshold",
196	'desc' => "{HTMLImagePlugin.smallpage_threshold}",
197	'type' => "int",
198	'deft' => "2048",
199	'reqd' => "no" },
200	{ 'name' => "textrefs_threshold",
201	'desc' => "{HTMLImagePlugin.textrefs_threshold}",
202	'type' => "int",
203	'deft' => "2",
204	'reqd' => "no" },
205	{ 'name' => "caption_length",
206	'desc' => "{HTMLImagePlugin.caption_length}",
207	'type' => "int",
208	'deft' => "80",
209	'reqd' => "no" },
210	{ 'name' => "neartext_length",
211	'desc' => "{HTMLImagePlugin.neartext_length}",
212	'type' => "int",
213	'deft' => "300",
214	'reqd' => "no" },
215	{ 'name' => "document_text",
216	'desc' => "{HTMLImagePlugin.document_text}",
217	'type' => "flag",
218	'reqd' => "no" } ];
219
220	my $options = { 'name' => "HTMLImagePlugin",
221	'desc' => "{HTMLImagePlugin.desc}",
222	'abstract' => "no",
223	'inherits' => "yes",
224	'args' => $arguments };
225
226	sub new {
227	my ($class) = shift (@_);
228	my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
229	push(@$pluginlist, $class);
230
231	push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
232	push(@{$hashArgOptLists->{"OptList"}},$options);
233
234	my $self = new HTMLPlugin($pluginlist, $inputargs, $hashArgOptLists);
235
236	# init class variables
237	$self->{'textref'} = undef; # init by read_file fn
238	$self->{'htdoc_obj'} = undef; # init by process fn
239	$self->{'htpath'} = undef; # init by process fn
240	$self->{'hturl'} = undef; # init by process fn
241	$self->{'plaintext'} = undef; # HTML stripped version - only init if needed by raw_neartext sub
242	$self->{'smallpage'} = 0; # set by process fn
243	$self->{'images_indexed'} = undef; # num of images indexed - if 1 or 2 then we know page is small
244	$self->{'initialised'} = undef; # flag (see set_extraction_options())
245
246	return bless $self, $class;
247	}
248
249	# if indexing pages, let HTMLPlugin do it's stuff
250	# image extraction done through read()
251	sub process {
252	my $self = shift(@_);
253	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
254	$self->{'imglist'} = ();
255	if ( $self->{'index_pages'} ) {
256	my $ok = $self->SUPER::process($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli);
257	if ( ! $ok ) { return $ok }
258	$self->{'htdoc_obj'} = $doc_obj;
259	}
260	# else use URL for referencing
261	#if ( $file =~ /(.*)[\/\\]/ ) { $self->{'htpath'} = $1; } else { $self->{'htpath'} = $file; }
262
263	$self->{'htpath'} = $base_dir if (-d $base_dir);
264	if ( $file =~ /(.*)[\/\\]/ ) { $self->{'htpath'} .= "/$1"; }
265	$self->{'htpath'} =~ s/\\/\//g; # replace \ with /
266
267	$self->{'hturl'} = "http://$file";
268	$self->{'hturl'} =~ s/\\/\//g; # for windows
269	($self->{'filename'}) = $file =~ /.[\/\\](.)/;
270	($self->{'base_path'}) = $file =~ /(.*)[\/\\]/i;
271	if ( ( -s "$base_dir/$file") <= $self->{'smallpage_threshold'} ) {
272	$self->{'smallpage'} = 1;
273	} else { $self->{'smallpage'} = 0; }
274
275	if ( defined($self->{'initialised'}) ) { return 1; }
276	else {
277	$self->{'initialised'} = $self->set_extraction_options($base_dir =~ /^(.*?)\/import/i);
278	return $self->{'initialised'};
279	}
280	}
281
282	# get complex configuration options from configuration files
283	# -- $GSDLCOLLECTION/etc/HTMLImagePlugin.cfg (tag sets for aggr 2+)
284	# -- $GSDLHOME/etc/packages/phind/stopword/en/brown.sw (stopwords for aggr 5+)
285
286	# If there's no HTMLImagePlugin.cfg file we'll use the following default values
287	my $defaultcfg = '
288	<delimitertagset>
289	<setname>Caption</setname>
290	<taggroup>font</taggroup>
291	<taggroup>tt</taggroup>
292	<taggroup>small</taggroup>
293	<taggroup>b</taggroup>
294	<taggroup>i</taggroup>
295	<taggroup>u</taggroup>
296	<taggroup>em</taggroup>
297	<taggroup>td</taggroup>
298	<taggroup>li</taggroup>
299	<taggroup>a</taggroup>
300	<taggroup>p</taggroup>
301	<taggroup>tr</taggroup>
302	<taggroup>center</taggroup>
303	<taggroup>div</taggroup>
304	<taggroup>caption</taggroup>
305	<taggroup>br</taggroup>
306	<taggroup>ul</taggroup>
307	<taggroup>ol</taggroup>
308	<taggroup>table</taggroup>
309	<taggroup>hr</taggroup>
310	</delimitertagset>
311
312	<delimitertagset>
313	<setname>Neartext</setname>
314	<taggroup>tr\|hr\|table\|h\d\|img\|body</taggroup>
315	<taggroup>td\|tr\|hr\|table\|h\d\|img\|body</taggroup>
316	<taggroup>p\|br\|td\|tr\|hr\|table\|h\d\|img\|body</taggroup>
317	<taggroup>font\|p\|i\|b\|em\|img</taggroup>
318	</delimitertagset>
319	';
320
321	sub set_extraction_options() {
322	my ($self, $collpath) = @_;
323	my ($filepath);
324
325	print {$self->{'outhandle'}} "HTMLImagePlugin: Initialising\n"
326	if $self->{'verbosity'} > 1;
327	# etc/HTMLImagePlugin.cfg (XML)
328	# tag sets for captions and neartext
329	if ( $self->{'aggressiveness'} > 1 && $self->{'aggressiveness'} != 9 ) {
330	$self->{'delims'} = [];
331	$self->{'cdelims'} = [];
332	my ($cfg, @tagsets, $tagset, $type, @delims);
333
334	$filepath = "$collpath/etc/HTMLImagePlugin.cfg";
335	if ( open CFG, "<$filepath" ) {
336	while (<CFG>) { $cfg .= $_ }
337	close CFG;
338	} else {
339	$cfg = $defaultcfg;
340	}
341
342	(@tagsets) =
343	$cfg =~ /<delimitertagset>(.*?)<\/delimitertagset>/igs;
344	foreach $tagset ( @tagsets ) {
345	($type) = $tagset =~ /<setname>(.*?)<\/setname>/i;
346	if ( lc($type) eq "caption" ) {
347	(@{$self->{'cdelims'}}) = $tagset =~ /<taggroup>(.*?)<\/taggroup>/igs;
348	}
349	elsif ( lc($type) eq "neartext" ) {
350	(@{$self->{'delims'}}) = $tagset =~ /<taggroup>(.*?)<\/taggroup>/igs;
351	}
352	}
353
354	# output a warning if there seem to be no delimiters
355	if ( scalar(@{$self->{'cdelims'}} == 0)) {
356	print {$self->{'outhandle'}} "HTMLImagePlugin: Warning: no caption delimiters found in $filepath\n";
357	}
358	if ( scalar(@{$self->{'delims'}} == 0)) {
359	print {$self->{'outhandle'}} "HTMLImagePlugin: Warning: no neartext delimiters found in $filepath\n";
360	}
361	}
362
363	# get stop words for textual reference extraction
364	# TODO: warnings scroll off. Would be best to output them again at end of import
365	if ( $self->{'aggressiveness'} >=5 && $self->{'aggressiveness'} != 9 ) {
366	$self->{'stopwords'} = ();
367	$filepath = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "etc", "packages", "phind", "stopword", "en", "brown.sw");
368	if ( open STOPWORDS, "<$filepath" ) {
369	while ( <STOPWORDS> ) {
370	chomp;
371	$self->{'stopwords'}{$_} = 1;
372	}
373	close STOPWORDS;
374	} else {
375	print {$self->{'outhandle'}} "HTMLImagePlugin: Warning: couldn't open stopwords file at $filepath ($!)\n";
376	}
377
378	}
379
380	if ( $self->{'neartext_length'} > $self->{'max_near_text'} ) {
381	$self->{'max_near_text'} = $self->{'neartext_length'} * 1.33;
382	print {$self->{'outhandle'}} "HTMLImagePlugin: Warning: adjusted max_text to $self->{'max_near_text'}\n";
383	}
384	if ( $self->{'caption_length'} > $self->{'max_near_text'} ) {
385	$self->{'max_near_text'} = $self->{'caption_length'} * 1.33;
386	print {$self->{'outhandle'}} "HTMLImagePlugin: Warning: adjusted max_text to $self->{'max_near_text'}\n";
387	}
388
389	return 1;
390	}
391
392	# return number of files processed, undef if can't recognise, -1 if
393	# cant process
394	# Note that $base_dir might be "" and that $file might
395	# include directories
396	sub read {
397	my ($self, $pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = (@_);
398	my ($doc_obj, $section, $filepath, $imgtag, $pos, $context, $numdocs, $tndir, $imgs);
399	# forward normal read (runs HTMLPlugin if index_pages T)
400	my $ok = $self->SUPER::read($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli);
401	if ( ! $ok ) { return $ok } # what is this returning??
402
403	my $outhandle = $self->{'outhandle'};
404	my $textref = $self->{'textref'};
405	my $htdoc_obj = $self->{'htdoc_obj'};
406	$numdocs = 0;
407	$base_dir =~ /(.)\/./;
408	$tndir = "$1/archives/thumbnails"; # TODO: this path shouldn't be hardcoded?
409	&FileUtils::makeAllDirectories($tndir) unless -e "$tndir";
410
411	$imgs = \%{$self->{'imglist'}};
412	my $nimgs = $self->get_img_list($textref);
413	$self->{'images_indexed'} = $nimgs;
414	if ( $nimgs > 0 ) {
415	my @fplist = (sort { $imgs->{$a}{'pos'} <=> $imgs->{$b}{'pos'} } keys %{$imgs});
416	my $i = 0;
417	foreach $filepath ( @fplist ) {
418	$pos = $imgs->{$filepath}{'pos'};
419	$context = substr ($$textref, $pos - 50, $pos + 50); # grab context (quicker)
420	($imgtag) = ($context =~ /(<(?:img\|a\|body)\s[^>]$filepath[^>]>)/is );
421	if (! defined($imgtag)) { $imgtag = $filepath }
422	print $outhandle "HTMLImagePlugin: extracting $filepath\n"
423	if ( $self->{'verbosity'} > 1 );
424	$doc_obj = new doc ("", "indexed_doc", $self->{'file_rename_method'});
425	$section = $doc_obj->get_top_section();
426	my $prevpos = ( $i == 0 ? 0 : $imgs->{$fplist[$i - 1]}{'pos'});
427	my $nextpos = ( $i >= ($nimgs -1) ? -1 : $imgs->{$fplist[$i + 1]}{'pos'} );
428
429	$self->extract_image_info($imgtag, $filepath, $textref, $doc_obj, $section, $tndir, $prevpos, $nextpos);
430	$processor->process($doc_obj);
431	$numdocs++;
432	$i++;
433	}
434	return $numdocs;
435	} else {
436	print $outhandle "HTMLImagePlugin: No images from $file indexed\n"
437	if ( $self->{'verbosity'} > 2 );
438	return 1;
439	}
440
441	}
442
443	# for every valid image tag
444	# 1. extract related text and image metadata
445	# 2. add this as document meta-data
446	# 3. add assoc image(s) as files
447	#
448	sub extract_image_info {
449	my $self = shift (@_);
450	my ($tag, $id, $textref, $doc_obj, $section, $tndir, $prevpos, $nextpos) = (@_);
451	my ($filename, $orig_fp, $fn, $ext, $reltext, $relreltext, $crcid, $imgs,
452	$thumbfp, $pagetitle, $alttext, $filepath, $aggr);
453
454	my $imagick_cmd = "\"".&util::get_perl_exec()."\" -S gs-magick.pl";
455
456	$aggr = $self->{'aggressiveness'};
457	$imgs = \%{$self->{'imglist'}};
458	$filepath = $imgs->{$id}{'relpath'};
459	($filename) = $filepath =~ /([^\/\\]+)$/s;
460	($orig_fp) = "$self->{'base_path'}/$filepath";
461	$orig_fp =~ tr/+/ /;
462	$orig_fp =~ s/%([a-fA-F0-9][a-fA-F0-9])/pack("C", hex($1))/eg; # translate %2E to space, etc
463	$orig_fp =~ s/\\/\//g;
464	$filepath = "$self->{'htpath'}/$filepath";
465	my ($onlyfn) = $filename =~ /([^\\\/]*)$/;
466	($fn, $ext) = $onlyfn =~ /(.)\.(.)/;
467	$fn = lc $fn; $ext = lc $ext;
468	($reltext) = "<tr><td>GifComment</td><td>" . `$imagick_cmd identify $filepath -ping -format "%c"` . "</td></tr>\n"
469	if ($ext eq "gif");
470	$reltext .= "<tr><td>FilePath</td><td>$orig_fp</td></tr>\n";
471
472	if ($ENV{'GSDLOS'} =~ /^windows$/i) {
473	$crcid = "$fn.$ext." . $self->{'next_crcid'}++;
474	} else {
475	($crcid) = `cksum $filepath` =~ /^(\d+)/;
476	}
477
478	$thumbfp = "$tndir/tn_$crcid.jpg";
479	`$imagick_cmd convert -flatten -filter Hanning $self->{'convert_params'} -geometry "$self->{'thumb_size'}x$self->{'thumb_size'}>" $filepath $thumbfp` unless -e $thumbfp;
480	if ( ! (-e $thumbfp) ) {
481	print STDERR "HTMLImagePlugin: 'convert' failed. Check ImageMagicK binaries are installed and working correctly\n"; return 0;
482	}
483
484	# shove in full text (tag stripped or unstripped) if settings require it
485	if ( $aggr == 10) {
486	$reltext = "<tr><td>AllPage</td><td>" . $$textref . "</td><tr>\n"; # level 10 (all text, verbatim)
487	} else {
488	$pagetitle = $self->get_meta_value("title", $textref);
489	($alttext) = $tag =~ /\salt\s=\s(?:\"\|\')(.+?)(?:\"\|\')/is;
490	if ( defined($alttext) && length($alttext) > 1) {
491	$reltext .= "<tr><td>ALTtext</td><td>$alttext</td></tr>\n"; }
492	$reltext .= "<tr><td>SplitCapitalisation</td><td>" .
493	$self->split_filepath($orig_fp) . "</td></tr>\n";
494
495	# get caption/tag based near text (if appropriate)
496	if ( $aggr > 1 ) {
497	if ( $aggr >= 2 ) {
498	$reltext .=
499	$self->extract_caption_text($tag, $textref, $prevpos, $imgs->{$id}{'pos'}, $nextpos);
500	$relreltext = $reltext;
501	}
502	# repeat the filepath, alt-text, caption, etc
503	if ( $aggr == 8 ) {
504	$reltext .= $relreltext;
505	}
506	if ( $aggr >= 3 ) {
507	$reltext .=
508	$self->extract_near_text($tag, $textref, $prevpos, $imgs->{$id}{'pos'}, $nextpos);
509	}
510
511	# get page metadata (if appropriate)
512	if ( $aggr >= 6 \|\| ( $aggr >= 2 &&
513	( $self->{'images_indexed'} < 2 \|\|
514	($self->{'smallpage'} == 1 && $self->{'images_indexed'} < 6 )))) {
515	$reltext .= $self->get_page_metadata($textref);
516	}
517	# textual references
518	if ( $aggr == 5 \|\| $aggr >= 7) {
519	if ( length($relreltext) > ($self->{'caption_length'} * 2) ) {
520	$reltext .= $self->get_textrefs($relreltext, $textref, $prevpos, $imgs->{$id}{'pos'}, $nextpos); }
521	else {
522	$reltext .= $self->get_textrefs($reltext, $textref, $prevpos, $imgs->{$id}{'pos'}, $nextpos);
523	}
524	}
525	} # aggr > 1
526	} # aggr != 10
527
528	$doc_obj->set_OID($crcid);
529	$doc_obj->associate_file($thumbfp, "$fn.thumb.jpg", undef, $section);
530	$doc_obj->add_metadata($section, "OriginalFilename", $filename);
531	$doc_obj->add_metadata($section, "FilePath", $orig_fp);
532	$doc_obj->add_metadata($section, "Filename", $fn);
533	$doc_obj->add_metadata($section, "FileExt", $ext);
534	$doc_obj->add_metadata($section, "FileSize", $imgs->{$id}{'filesize'});
535	$doc_obj->add_metadata($section, "Width", $imgs->{$id}{'width'});
536	$doc_obj->add_metadata($section, "Height", $imgs->{$id}{'height'});
537	$doc_obj->add_metadata($section, "URL", "http://$orig_fp");
538	$doc_obj->add_metadata($section, "PageURL", $self->{'hturl'});
539	$doc_obj->add_metadata($section, "PageTitle", $pagetitle);
540	$doc_obj->add_metadata($section, "ThumbURL",
541	"_httpprefix_/collect/[collection]/index/assoc/[assocfilepath]/$fn.thumb.jpg");
542	$doc_obj->add_metadata($section, "FileFormat", "W3Img");
543
544	if ( $self->{'document_text'} ) {
545	$doc_obj->add_utf8_text($section, "<table border=1>\n$reltext</table>");
546	} else {
547	$doc_obj->add_metadata($section, "ImageText", "<table border=1>\n$reltext</table>\n");
548	}
549
550	if ( $self->{'index_pages'} ) {
551	my ($cache_url) = "_httpdoc_&d=" . $self->{'htdoc_obj'}->get_OID();
552	if ( $imgs->{$id}{'anchored'} ) {
553	my $a_name = $id;
554	$a_name =~ s/[\/\\\:\&]/_/g;
555	$cache_url .= "#gsdl_$a_name" ;
556	}
557	$doc_obj->add_utf8_metadata($section, "CachePageURL", $cache_url);
558	}
559	if ( ! $self->{'no_cache_images'} ) {
560	$onlyfn = lc $onlyfn;
561	$doc_obj->associate_file($filepath, $onlyfn, undef, $section);
562	$doc_obj->add_utf8_metadata($section, "CacheURL",
563	"_httpprefix_/collect/[collection]/index/assoc/[assocfilepath]/$onlyfn");
564	}
565	return 1;
566	}
567
568	sub get_page_metadata {
569	my ($self, $textref) = (@_);
570	my (@rval);
571	$rval[0] = $self->get_meta_value("title", $textref);
572	$rval[1] = $self->get_meta_value("keywords", $textref);
573	$rval[2] = $self->get_meta_value("description", $textref);
574	$rval[3] = $self->{'filename'};
575
576	return wantarray ? @rval : "<tr><td>PageMeta</td><td>@rval</td></tr>\n" ;
577	}
578
579	# turns LargeCatFish into Large,Cat,Fish so MG sees the separate words
580	sub split_filepath {
581	my ($self, $filepath) = (@_);
582	my (@words) = $filepath =~ /([A-Z][a-z]+)/g;
583	return join(',', @words);
584	}
585
586	# finds and extracts sentences
587	# that seem to be on the same topic
588	# as other related text (correlations)
589	# and textual references (e.g. in figure 3 ...)
590	sub get_textrefs {
591	my ($self, $reltext, $textref, $prevpos, $pos, $nextpos) = (@_);
592	my ($maxtext, $mintext, $startpos, $context_size, $context);
593
594	my (@relwords, @refwords, %sentences, @pagemeta);
595
596	# extract larger context
597	$maxtext = $self->{'max_near_text'};
598	$startpos = $pos - ($maxtext * 4);
599	$context_size = $maxtext*10;
600	if ($startpos < $prevpos ) { $startpos = $prevpos }
601	if ($nextpos != -1 && $context_size > ( $nextpos - $startpos )) { $context_size = ($nextpos - $startpos) }
602	$context = substr ( $$textref, $startpos, $context_size );
603	$context =~ s/<.*?>//gs;
604	$context =~ s/^.>(.)/$1/gs;
605	$context =~ s/(.)<.$/$1/gs;
606
607	# get page meta-data (if not already included)
608	if ( $self->{'aggressiveness'} == 5 && ! $self->{'smallpage'} ) {
609	@pagemeta = $self->get_page_metadata($textref);
610	foreach my $value ( @pagemeta ) {
611	$context .= "$value."; # make each into psuedo-sentence
612	}
613	}
614
615	# TODO: this list is not exhaustive
616	@refwords = ( '(?:is\|are)? ?(?:show(?:s\|n)\|demonstrate(?:d\|s)\|explains\|features) (?:in\|by\|below\|above\|here)',
617	'(?:see)? (?:figure\|table)? (?:below\|above)');
618
619	# extract general references
620	foreach my $rw ( @refwords ) {
621	while ( $context =~ /[\.\?\!\,](.?$rw\W.?[\.\?\!\,])/ig ) {
622	my $sentence = $1;
623	$sentence =~ s/\s+/ /g;
624	$sentences{$sentence}+=2;
625	}
626	}
627	# extract specific (figure, table) references by number
628	my ($fignum) = $context =~ /[\.\?\!].?(?:figure\|table)s?[\-\_\ \.](\d+\w)\W.*?[\.\?\!]/ig;
629	if ( $fignum ) {
630	foreach my $rw ( @refwords ) {
631	while ( $context =~ /[\.\?\!](.?(figure\|table)[\-\_\ \.]$fignum\W.?[\.\?\!])/ig ) {
632	my $sentence = $1;
633	$sentence =~ s/\s+/ /g;
634	$sentences{$sentence}+=4;
635	}
636	}
637	}
638
639	# sentences with occurances of important words
640	@relwords = $reltext =~ /([a-zA-Z]{4,})/g; # take out small words
641	foreach my $word ( @relwords ) {
642	if ( $self->{'stopwords'}{$word} ) { next } # skip stop words
643	while ( $context =~ /([^\.\?\!]?$word\W.?[\.\?\!])/ig ) {
644	my $sentence = $1;
645	$sentence =~ s/\s+/ /g;
646	$sentences{$sentence}++;
647	}
648	}
649	foreach my $sentence ( keys %sentences ) {
650	if ($sentences{$sentence} < $self->{'textrefs_threshold'}) {
651	delete $sentences{$sentence};
652	}
653	}
654	my ($rval) = join "<br>\n", (keys %sentences);
655	if ( $rval && length($rval) > 5 ) {
656	return ( "<tr><td>TextualReferences</td><td>" . $rval . "</td></tr>\n") }
657	else { return "" }
658	}
659
660	# handles caption extraction
661	# calling the extractor with different
662	# tags and choosing the best candidate caption
663	sub extract_caption_text {
664	my ($self, $tag, $textref, $prevpos, $pos, $nextpos) = (@_);
665	my (@neartext, $len, $hdelim, $mintext, $goodlen,
666	$startpos, $context, $context_size);
667
668	$mintext = $self->{'min_near_text'};
669	$goodlen = $self->{'caption_length'};
670
671	# extract a context to extract near text from (faster)
672	$context_size = $self->{'max_near_text'}*3;
673	$startpos = $pos - ($context_size / 2);
674	if ($startpos < $prevpos ) { $startpos = $prevpos }
675	if ($nextpos != -1 && $context_size > ( $nextpos - $startpos ))
676	{ $context_size = ($nextpos - $startpos) }
677
678	$context = substr ( $$textref, $startpos, $context_size );
679	$context =~ s/<!--.*?-->//gs;
680	$context =~ s/^.-->(.)/$1/gs;
681	$context =~ s/(.)<!--.$/$1/gs;
682
683	# try stepping through markup delimiter sets
684	# and selecting the best one
685	foreach $hdelim ( @{ $self->{'cdelims'} } ) {
686	@neartext = $self->extract_caption($tag, $hdelim, \$context);
687	$len = length(join("", @neartext));
688	last if ($len >= $mintext && $len <= $goodlen);
689	}
690	# reject if well over reasonable length
691	if ( $len > $goodlen ) {
692	@neartext = [];
693	}
694	$neartext[0] = " " if (! defined $neartext[0]);
695	$neartext[1] = " " if (! defined $neartext[1]);
696	return "<tr><td>Caption</td><td>" . (join ",", @neartext) . "</td></tr>\n"; # TODO: the \| is for testing purposes
697	} # end extract_caption_text
698
699	# the previous section header often gives a bit
700	# of context to the section that the image is
701	# in (invariably the header is before/above the image)
702	# so extract the text of the closest header above the image
703	#
704	# this fn just gets all the headers above the image, within the context window
705	sub get_prev_header {
706	my ($self, $pos, $textref) = (@_);
707	my ($rhtext);
708	while ( $$textref =~ /<h\d>(.*?)<\/h\d>/sig ) {
709	# only headers before image
710	if ((pos $$textref) < $pos) {
711	$rhtext .= "$1, ";
712	}
713	}
714	if ( $rhtext ) { return "Header($rhtext)" }
715	else { return "" }
716	}
717
718	# not the most robust tag stripping
719	# regexps (see perl.com FAQ) but good enough
720	#
721	# used by caption & tag-based near text algorithms
722	sub strip_tags {
723	my ( $self, $value ) = @_;
724	if ( ! defined($value) ) { $value = "" } # handle nulls
725	else {
726	$value =~ s/<.*?>//gs; # strip all html tags
727	$value =~ s/\s+/\ /g; # remove extra whitespace
728	$value =~ s/\&\w+\;//g; # remove   etc
729	}
730	return $value;
731	}
732
733	# uses the given tag(s) to identify
734	# the caption near to the image
735	# (below, above or both below and above)
736	sub extract_caption {
737	my ($self, $tag, $bound_tag, $contextref) = (@_);
738	my (@nt, $n, $etag, $gotcap);
739	return ("", "") if ( ! ($$contextref =~ /\Q$tag/) );
740
741	$nt[0] = $`;
742	$nt[1] = $';
743	$gotcap = 0;
744
745	# look before the image for a boundary tag
746	($etag, $nt[0]) = $nt[0] =~ /<($bound_tag)[\s]?.?>(.?)$/is;
747	# if bound_tag too far from the image, then prob not caption
748	# (note: have to allow for tags, so multiply by 3
749	if ( $etag && length($nt[0]) < ($self->{'caption_length'} * 3) ) {
750	if ( $nt[0] =~ /<\/$etag>/si ) {
751	# the whole caption is above the image: <tag>text</tag><img>
752	($nt[0]) =~ /<(?:$etag)[\s]?.?>(.?)<\/$etag>/is;
753	$nt[0] = $self->strip_tags($nt[0]);
754	if ( length($nt[0]) > $self->{'min_near_text'} ) {
755	$gotcap = 1;
756	$nt[1] = "";
757	}
758
759	} elsif ( $nt[1] =~ /<\/$etag>/si) {
760	# the caption tag covers image: <tag>text?<img>text?</tag>
761	($nt[1]) = $nt[1] =~ /(.*?)<\/$etag>/si;
762	$nt[0] = $self->strip_tags($nt[0] . $nt[1]);
763	if ( length($nt[0]) > $self->{'min_near_text'} ) {
764	$gotcap = 2;
765	$nt[1] = "";
766	}
767	}
768	}
769	# else try below the image
770	if ( ! $gotcap ) {
771	# the caption is after the image: <img><tag>text</tag>
772	($etag, $nt[1]) = $nt[1] =~ /^.?<($bound_tag)[\s]?.?>(.*)/is;
773	if ( $etag && $nt[1] =~ /<\/$etag>/s) {
774	($nt[1]) = $nt[1] =~ /(.*?)<\/$etag>/si;
775	$gotcap = 3;
776	$nt[0] = "";
777	$nt[1] = $self->strip_tags($nt[1]);
778	}
779	}
780	if ( ! $gotcap ) { $nt[0] = $nt[1] = "" }
781	else {
782	# strip part-tags
783	$nt[0] =~ s/^.*>//s;
784	$nt[1] =~ s/<.*$//s;
785	}
786	my ($type);
787	if ( $gotcap == 0 ) { return ("nocaption", "") }
788	elsif ( $gotcap == 1 ) { $type = "captionabove:" }
789	elsif ( $gotcap == 2 ) { $type = "captioncovering:" }
790	elsif ( $gotcap == 3 ) { $type = "captionbelow:" }
791	return ($type, $nt[0], $nt[1]);
792	}
793
794	# tag-based near text
795	#
796	# tries different tag sets
797	# and chooses the best one
798	sub extract_near_text {
799	my ($self, $tag, $textref, $prevpos, $pos, $nextpos) = (@_);
800	my (@neartext, $len, $hdelim, $maxtext, $mintext, $goodlen,
801	@bestlen, @best, $startpos, $context, $context_size,
802	$dist, $bdist, $best1, $i, $nt);
803	$bestlen[0] = $bestlen[1] = 0; $bestlen[2] = $bdist = 999999;
804	$best[0] = $best[1] = $best[2] = "";
805	$maxtext = $self->{'max_near_text'};
806	$mintext = $self->{'min_near_text'};
807	$goodlen = $self->{'neartext_length'};
808
809	# extract a context to extract near text from (faster)
810	$context_size = $maxtext*4;
811	$startpos = $pos - ($context_size / 2);
812	if ($startpos < $prevpos ) { $startpos = $prevpos }
813	if ($nextpos != -1 && $context_size > ( $nextpos - $startpos ))
814	{ $context_size = ($nextpos - $startpos) }
815	$context = substr ( $$textref, $startpos, $context_size );
816	$context =~ s/<!--.*?-->//gs;
817	$context =~ s/^.-->(.)/$1/gs;
818	$context =~ s/(.)<!--.$/$1/gs;
819
820	# try stepping through markup delimiter sets
821	# and selecting the best one
822	foreach $hdelim ( @{ $self->{'delims'} } ) {
823	@neartext = $self->extract_tagged_neartext($tag, $hdelim, \$context);
824	$nt = join("", @neartext);
825	$len = length($nt);
826	# Priorities:
827	# 1. Greater than mintext
828	# 2. Less than maxtext
829	# 3. Closest to goodlen
830	if ( $len <= $goodlen && $len > $bestlen[0] ) {
831	$bestlen[0] = $len;
832	$best[0] = $hdelim;
833	} elsif ( $len >= $maxtext && $len < $bestlen[2] ) {
834	$bestlen[2] = $len;
835	$best[2] = $hdelim;
836	} elsif ( $len >= $bestlen[0] && $len <= $bestlen[2] ) {
837	$dist = abs($goodlen - $len);
838	if ( $dist < $bdist ) {
839	$bestlen[1] = $len;
840	$best[1] = $hdelim;
841	$bdist = $dist;
842	}
843	}
844	}
845	$best1 = 2;
846	foreach $i ( 0..2 ) {
847	if ( $bestlen[$i] == 999999 ) { $bestlen[$i] = 0 }
848	$dist = abs($goodlen - $bestlen[$i]);
849	if ( $bestlen[$i] > $mintext && $dist <= $bdist ) {
850	$best1 = $i;
851	$bdist = $dist;
852	}
853	}
854	@neartext = $self->extract_tagged_neartext($tag, $best[$best1], \$context);
855	if ( $bestlen[$best1] > $maxtext ) {
856	# truncate on word boundary if too much text
857	my $hmax = $maxtext / 2;
858	($neartext[0]) = $neartext[0] =~ /([^\s]*.{1,$hmax})$/s;
859	($neartext[1]) = $neartext[1] =~ /^(.{1,$hmax}[^\s]*)/s;
860	} elsif ( $bestlen[$best1] < $mintext ) {
861	# use plain text extraction if tags failed (e.g. usable tag outside context)
862	print {$self->{'outhandle'}} "HTMLImagePlugin: Fallback to plain-text extraction for $tag\n"
863	if $self->{'verbosity'} > 2;
864	$neartext[0] = "<tr><td>RawNeartext</td><td>" . $self->extract_raw_neartext($tag, $textref) . "</td></tr>";
865	$neartext[1] = "";
866	}
867	# get previous header if available
868	$neartext[0] .= "<br>\n" .
869	$self->get_prev_header($pos, \$context) if ( $self->{'aggressiveness'} >= 4 );
870	$neartext[0] = " " if (! defined $neartext[0]);
871	$neartext[1] = " " if (! defined $neartext[1]);
872
873	return "<tr><td>NearText</td><td>" . (join "\|", @neartext) . "</td></tr>\n"; # TODO: the \| is for testing purposes
874	} # end extract_near_text
875
876	# actually captures tag-based
877	# near-text given a tag set
878	sub extract_tagged_neartext {
879	my ($self, $tag, $bound_tag, $textref) = (@_);
880	return "" if ( ! ($$textref =~ /\Q$tag/) );
881	my (@nt, $delim, $pre_tag, $n);
882	$nt[0] = $`;
883	$nt[1] = $';
884
885	# get text after previous image tag
886	$nt[0] =~ s/.<($bound_tag)[^>]>(.*)/$2/is; # get rid of preceding text
887	if (defined($1)) { $delim = $1 }
888	$pre_tag = $bound_tag;
889
890	if (defined($delim)) {
891	# we want to try and use the end tag of the previous delimiter
892	# (put it on the front of the list)
893	$pre_tag =~ s/(^\|\\|)($delim)($\|\\|)//i; # take it out
894	$pre_tag =~ s/\\|\\|/\\|/i; # replace \|\| with \|
895	$pre_tag = $delim . "\|" . $pre_tag; # put it on the front
896	}
897
898	# get text before next image tag
899	$nt[1] =~ s/<\/?(?:$pre_tag)[^>]>.//is; # get rid of stuff after first delimiter
900
901	# process related text
902	for $n (0..1) {
903	if ( defined($nt[$n]) ) {
904	$nt[$n] =~ s/<.*?>//gs; # strip all html tags
905	$nt[$n] =~ s/\s+/\ /gs; # remove extra whitespace
906	$nt[$n] =~ s/\&\w+\;//sg; # remove   etc
907	# strip part-tags
908	if ( $n == 0 ) { $nt[0] =~ s/^.*>//s }
909	if ( $n == 1 ) { $nt[1] =~ s/<.*$//s }
910	} else { $nt[$n] = ""; } # handle nulls
911	}
912	return @nt;
913	}
914
915	# this function is fall-back
916	# if tags aren't suitable.
917	#
918	# extracts a fixed length of characters
919	# either side of image tag (on word boundary)
920	sub extract_raw_neartext {
921	my ($self, $tag, $textref) = (@_);
922	my ($rawtext, $startpos, $fp);
923	my $imgs = \%{$self->{'imglist'}};
924	($fp) = $tag =~ /([\w\\\/]+\.(?:gif\|jpe?g\|png))/is;
925	if (! $fp) { return " " };
926	# if the cached, plain-text version isn't there, then create it
927	$self->init_plaintext($textref) unless defined($self->{'plaintext'});
928
929	# take the closest maxtext/2 characters
930	# either side of the tag (by word boundary)
931	return "" if ( ! exists $imgs->{$fp}{'rawpos'} );
932	$startpos = $imgs->{$fp}{'rawpos'} - (($self->{'max_near_text'} / 2) + 20);
933	if ( $startpos < 0 ) { $startpos = 0 }
934	$rawtext = substr $self->{'plaintext'}, $startpos, $self->{'max_near_text'} + 20;
935	$rawtext =~ s/\s\s/ /g;
936
937	return $rawtext;
938	}
939
940	# init plaintext variable for HTML-stripped version
941	# (for full text index/raw assoc text extraction)
942	sub init_plaintext {
943	my ($self, $textref) = (@_);
944	my ($page, $fp);
945	my $imgs = \%{$self->{'imglist'}};
946	$page = $$textref; # make a copy of original
947
948	# strip tags around image filenames so they don't get zapped
949	$page =~ s/<\w+\s+.?([\w\/\\]+\.(?:gif\|jpe?g\|png))[^>]>/\"$1\"/gsi;
950	$page =~ s/<.*?>//gs;
951	$page =~ s/ / /gs;
952	$page =~ s/&/&/gs; #TODO: more &zzz; replacements (except <, $gt;)
953
954	# get positions and strip images
955	while ( $page =~ /([^\s\'\"]+\.(jpe?g\|gif\|png))/ig ) {
956	$fp = $1;
957	if ( $imgs->{$fp}{'exists'} ) {
958	$imgs->{$fp}{'rawpos'} = pos $page;
959	}
960	$page =~ s/\"$fp\"//gs;
961	}
962	$self->{'plaintext'} = $page;
963	}
964
965	# finds and filters images based on size
966	# (dimensions, height, filesize) and existence
967	#
968	# looks for image filenames (.jpg, .gif, etc)
969	# and checks for existence on disk
970	# (hence supports most JavaScript images)
971	sub get_img_list {
972	my $self = shift (@_);
973	my ($textref) = (@_);
974	my ($filepath, $relpath, $abspath, $pos, $num, $width, $height, $filesize);
975	my $imgs = \%{$self->{'imglist'}};
976	while ( $$textref =~ /([^\s\'\"]+\.(jpe?g\|gif\|png))/ig ) {
977	$filepath = $1;
978	$pos = pos $$textref;
979	next if ( $imgs->{$filepath}{'relpath'} );
980	$relpath = $filepath;
981	$relpath =~ s/^http\:\/\///; # remove http:// in case we have mirrored it
982	$relpath =~ s/\\/\//g; # replace \ with /
983	$relpath =~ s/^\.\///s; # make "./filepath" into "filepath"
984	$imgs->{$filepath}{'relpath'} = $relpath;
985	$abspath = "$self->{'htpath'}/$relpath";
986
987	if (! -e $abspath) { next }
988
989	# can't modify real filepath var because it
990	# then can't be located in the page for tag recognition later
991	my $imagick_cmd = "\"".&util::get_perl_exec()."\" -S gs-magick.pl";
992	($width, $height) =
993	`$imagick_cmd identify $abspath -ping -format "%wx%h"` =~ /^(\d)x(\d)$/m;
994	if (! ($width && $height)) {
995	print STDERR "HTMLImagePlugin: ($abspath) 'identify' failed. Check ImageMagicK binaries are installed and working correctly\n"; next;
996	}
997	$filesize = (-s $abspath);
998	if ( $filesize >= $self->{'min_size'}
999	&& ( $width >= $self->{'min_width'} )
1000	&& ( $height >= $self->{'min_height'} ) ) {
1001
1002	$imgs->{$filepath}{'exists'} = 1;
1003	$imgs->{$filepath}{'pos'} = $pos;
1004	$imgs->{$filepath}{'width'} = $width;
1005	$imgs->{$filepath}{'height'} = $height;
1006	$imgs->{$filepath}{'filesize'} = $filesize;
1007	} else {
1008	print {$self->{'outhandle'}} "HTMLImagePlugin: skipping $self->{'base_path'}/$relpath: $filesize, $width x $height\n"
1009	if $self->{'verbosity'} > 2;
1010	}
1011	}
1012	$num = 0;
1013	foreach my $i ( keys %{$imgs} ) {
1014	if ( $imgs->{$i}{'pos'} ) {
1015	$num++;
1016	} else { delete $imgs->{$i} }
1017	}
1018	return $num;
1019	}
1020
1021	# make the text available to the read function
1022	# by making it an object variable
1023	sub read_file {
1024	my ($self, $filename, $encoding, $language, $textref) = @_;
1025	$self->SUPER::read_file($filename, $encoding, $language, $textref);
1026
1027	# if HTMLplug has run through, then it will
1028	# have replaced references so we have to
1029	# make a copy of the text before processing
1030	if ( $self->{'index_pages'} ) {
1031	$self->{'text'} = $$textref;
1032	$self->{'textref'} = \($self->{'text'});
1033	} else {
1034	$self->{'textref'} = $textref;
1035	}
1036	$self->{'plaintext'} = undef;
1037	}
1038
1039	# HTMLPlugin only extracts meta-data if it is specified in plugin options
1040	# hence a special function to do it here
1041	sub get_meta_value {
1042	my ($self, $name, $textref) = @_;
1043	my ($value);
1044	$name = lc $name;
1045	if ($name eq "title") {
1046	($value) = $$textref =~ /<title>(.*?)<\/title>/is
1047	} else {
1048	my $qm = "(?:\"\|\')";
1049	($value) = $$textref =~ /<meta name\s=\s$qm?$name$qm?\s+content\s=\s$qm?(.?)$qm?\s>/is
1050	}
1051	$value = "" unless $value;
1052	return $value;
1053	}
1054
1055	# make filename an anchor reference
1056	# so we can go straight to the image
1057	# within the cached version of the source page
1058	# (augment's HTMLPlugin sub)
1059	sub replace_images {
1060	my $self = shift (@_);
1061	my ($front, $link, $back, $base_dir,
1062	$file, $doc_obj, $section) = @_;
1063	$link =~ s/\"//g;
1064	my ($a_name) = $link;
1065	$a_name =~ s/[\/\\\:\&]/_/g;
1066	# keep a list so we don't repeat the same anchor
1067	if ( ! $self->{'imglist'}{$link}{'anchored'} ) {
1068	$front = "<a name=\"gsdl_$a_name\">$front";
1069	$back = "$back</a>";
1070	$self->{'imglist'}{$link}{'anchored'} = 1;
1071	}
1072	return $self->SUPER::replace_images($front, $link, $back, $base_dir,
1073	$file, $doc_obj, $section);
1074	}
1075
1076	1;

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: main/trunk/greenstone2/perllib/plugins/HTMLImagePlugin.pm

Download in other formats: