Context Navigation

source: gsdl/trunk/perllib/plugins/W3ImagePlugin.pm@ 15918

Last change on this file since 15918 was 15872, checked in by kjdon, 16 years ago
plugin overhaul: plugins renamed to xxPlugin, and in some cases the names are made more sensible. They now use the new base plugins. Hopefully we have better code reuse. Some of the plugins still need work done as I didn't want to spend another month doing this before committing it. Alos, I haven't really tested anything yet...
Property svn:keywords set to `Author Date Id Revision`
File size: 37.0 KB

Line
1	###########################################################################
2	#
3	# W3ImagePlugin.pm -- Context-based image indexing plugin for HTML documents
4	#
5	# A component of the Greenstone digital library software
6	# from the New Zealand Digital Library Project at the
7	# University of Waikato, New Zealand.
8	#
9	# Copyright (C) 2001 New Zealand Digital Library Project
10	#
11	# This program is free software; you can redistribute it and/or modify
12	# it under the terms of the GNU General Public License as published by
13	# the Free Software Foundation; either version 2 of the License, or
14	# (at your option) any later version.
15	#
16	# This program is distributed in the hope that it will be useful,
17	# but WITHOUT ANY WARRANTY; without even the implied warranty of
18	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	# GNU General Public License for more details.
20	#
21	# You should have received a copy of the GNU General Public License
22	# along with this program; if not, write to the Free Software
23	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24	#
25	###########################################################################
26
27	# DESCRIPTION:
28	#
29	# Extracts images and associated text and metadata from
30	# web pages as individual documents for indexing. Thumbnails
31	# are created from each image for browsing purposes.
32	#
33	# Options are available for configuring the aggressiveness of the
34	# associated text extraction mechanisms. A higher level of
35	# aggressiveness will extract more text and consequently
36	# may mean lower accuracy (precision); however, it may also
37	# retrieve more of the relevant images from the collection (recall).
38	# Lower levels of aggressiveness maybe result in slightly faster
39	# collection builds at the import stage.
40	#
41	# W3ImagePlugin is a subclass of HTMLPlug (i.e. it will index pages also
42	# if required). It can be used in place of HTMLPlugin to index both
43	# pages and their images.
44	#
45	# REQUIREMENTS:
46	#
47	# The ImageMagick image manipulation is used to create
48	# thumbnails and extract some image metadata. (Available
49	# from http://www.imagemagick.org/)
50	#
51	# Unix:
52	# Many Linux distributions contain ImageMagick.
53	#
54	# Windows:
55	# ImageMagick can be downloaded from the website above.
56	# Make sure the system path includes the ImageMagick binaries
57	# before using W3ImagePlugin.
58	#
59	# NOTE: NT/2000/XP contain a filesystem utility 'convert.exe'
60	# with the same name as the image conversion utility. The
61	# ImageMagick FAQ recommends renaming the filesystem
62	# utility (e.g. to 'fsconvert.exe') to avoid this clash.
63	#
64	# USAGE:
65	#
66	# An image document consists of metadata elements:
67	#
68	# OriginalFilename, FilePath, Filename, FileExt, FileSize,
69	# Width, Height, URL, PageURL, ThumbURL, CacheURL, CachePageURL
70	# ImageText, PageTitle
71	#
72	# Most of these are only useful in format strings (e.g. ThumbURL,
73	# Filename, URL, PageURL, CachePageURL).
74	#
75	# ImageText, as the name suggests contains the indexable text.
76	# (unless using the -document_text plugin option)
77	#
78	# Since image documents are made up of metadata elements
79	# alone, format strings are needed to display them properly.
80	# NOTE: The receptionist will only display results (e.g. thumbnails)
81	# in 4 columns if the format string begins with "<td><table>".
82	#
83	# The example below takes the user to the image within the
84	# source HTML document rather than using a format string
85	# on DocumentText to display the image document itself.
86	#
87	# Example collect.cfg:
88	#
89	# ...
90	#
91	# indexes document:ImageText document:text
92	# defaultindex document:ImageText
93	#
94	# collectionmeta .document:ImageText "images"
95	# collectionmeta .document:text "documents"
96	#
97	# ...
98	#
99	# plugin W3ImagePlugin -index_pages -aggressiveness 6
100	#
101	# ...
102	#
103	# format SearchVList '<td>{If}{[Title],[link][icon] [Title][[/link],
104	# <table><tr><td align="center"><a href="[CachePageURL]">
105	# <img src="[ThumbURL]"></a></td></tr><tr><td align="center">
106	# <a href="[CachePageURL]"><font size="-1">[OriginalFilename]</font></a>
107	# <br>[Width]x[Height]</td></tr></table>}</td>'
108	#
109	# ...
110	#
111
112	package W3ImagePlugin;
113
114	use HTMLPlugin;
115	use ghtml;
116	use unicode;
117	use util;
118	use strict; # 'subs';
119	no strict 'refs'; # allow filehandles to be variables and viceversa
120
121	sub BEGIN {
122	@W3ImagePlugin::ISA = qw( HTMLPlugin );
123	}
124
125	my $aggressiveness_list =
126	[ { 'name' => "1",
127	'desc' => "{W3ImagePlugin.aggressiveness.1}" },
128	{ 'name' => "2",
129	'desc' => "{W3ImagePlugin.aggressiveness.2}" },
130	{ 'name' => "3",
131	'desc' => "{W3ImagePlugin.aggressiveness.3}" },
132	{ 'name' => "4",
133	'desc' => "{W3ImagePlugin.aggressiveness.4}" },
134	{ 'name' => "5",
135	'desc' => "{W3ImagePlugin.aggressiveness.5}" },
136	{ 'name' => "6",
137	'desc' => "{W3ImagePlugin.aggressiveness.6}" },
138	{ 'name' => "7",
139	'desc' => "{W3ImagePlugin.aggressiveness.7}" },
140	{ 'name' => "8",
141	'desc' => "{W3ImagePlugin.aggressiveness.8}" },
142	{ 'name' => "9",
143	'desc' => "{W3ImagePlugin.aggressiveness.9}" } ];
144
145	my $arguments =
146	[ { 'name' => "aggressiveness",
147	'desc' => "{W3ImagePlugin.aggressiveness}",
148	'type' => "int",
149	'list' => $aggressiveness_list,
150	'deft' => "3",
151	'reqd' => "no" },
152	{ 'name' => "index_pages",
153	'desc' => "{W3ImagePlugin.index_pages}",
154	'type' => "flag",
155	'reqd' => "no" },
156	{ 'name' => "no_cache_images",
157	'desc' => "{W3ImagePlugin.no_cache_images}",
158	'type' => "flag",
159	'reqd' => "no" },
160	{ 'name' => "min_size",
161	'desc' => "{W3ImagePlugin.min_size}",
162	'type' => "int",
163	'deft' => "2000",
164	'reqd' => "no" },
165	{ 'name' => "min_width",
166	'desc' => "{W3ImagePlugin.min_width}",
167	'type' => "int",
168	'deft' => "50",
169	'reqd' => "no" },
170	{ 'name' => "min_height",
171	'desc' => "{W3ImagePlugin.min_height}",
172	'type' => "int",
173	'deft' => "50",
174	'reqd' => "no" },
175	{ 'name' => "thumb_size",
176	'desc' => "{W3ImagePlugin.thumb_size}",
177	'type' => "int",
178	'deft' => "100",
179	'reqd' => "no" },
180	{ 'name' => "convert_params",
181	'desc' => "{W3ImagePlugin.convert_params}",
182	'type' => "string",
183	'deft' => "",
184	'reqd' => "no" },
185	{ 'name' => "min_near_text",
186	'desc' => "{W3ImagePlugin.min_near_text}",
187	'type' => "int",
188	'deft' => "10",
189	'reqd' => "no" },
190	{ 'name' => "max_near_text",
191	'desc' => "{W3ImagePlugin.max_near_text}",
192	'type' => "int",
193	'deft' => "400",
194	'reqd' => "no" },
195	{ 'name' => "smallpage_threshold",
196	'desc' => "{W3ImagePlugin.smallpage_threshold}",
197	'type' => "int",
198	'deft' => "2048",
199	'reqd' => "no" },
200	{ 'name' => "textrefs_threshold",
201	'desc' => "{W3ImagePlugin.textrefs_threshold}",
202	'type' => "int",
203	'deft' => "2",
204	'reqd' => "no" },
205	{ 'name' => "caption_length",
206	'desc' => "{W3ImagePlugin.caption_length}",
207	'type' => "int",
208	'deft' => "80",
209	'reqd' => "no" },
210	{ 'name' => "neartext_length",
211	'desc' => "{W3ImagePlugin.neartext_length}",
212	'type' => "int",
213	'deft' => "300",
214	'reqd' => "no" },
215	{ 'name' => "document_text",
216	'desc' => "{W3ImagePlugin.document_text}",
217	'type' => "flag",
218	'reqd' => "no" } ];
219
220	my $options = { 'name' => "W3ImagePlugin",
221	'desc' => "{W3ImagePlugin.desc}",
222	'abstract' => "no",
223	'inherits' => "yes",
224	'args' => $arguments };
225
226	sub new {
227	my ($class) = shift (@_);
228	my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
229	push(@$pluginlist, $class);
230
231	push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
232	push(@{$hashArgOptLists->{"OptList"}},$options);
233
234	my $self = new HTMLPlugin($pluginlist, $inputargs, $hashArgOptLists);
235
236	# init class variables
237	$self->{'textref'} = undef; # init by read_file fn
238	$self->{'htdoc_obj'} = undef; # init by process fn
239	$self->{'htpath'} = undef; # init by process fn
240	$self->{'hturl'} = undef; # init by process fn
241	$self->{'plaintext'} = undef; # HTML stripped version - only init if needed by raw_neartext sub
242	$self->{'smallpage'} = 0; # set by process fn
243	$self->{'images_indexed'} = undef; # num of images indexed - if 1 or 2 then we know page is small
244	$self->{'initialised'} = undef; # flag (see set_extraction_options())
245
246	return bless $self, $class;
247	}
248
249	# if indexing pages, let HTMLPlugin do it's stuff
250	# image extraction done through read()
251	sub process {
252	my $self = shift(@_);
253	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
254	$self->{'imglist'} = ();
255	if ( $self->{'index_pages'} ) {
256	my $ok = $self->SUPER::process($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli);
257	if ( ! $ok ) { return $ok }
258	$self->{'htdoc_obj'} = $doc_obj;
259	}
260	# else use URL for referencing
261	#if ( $file =~ /(.*)[\/\\]/ ) { $self->{'htpath'} = $1; } else { $self->{'htpath'} = $file; }
262
263	$self->{'htpath'} = $base_dir if (-d $base_dir);
264	if ( $file =~ /(.*)[\/\\]/ ) { $self->{'htpath'} .= "/$1"; }
265	$self->{'htpath'} =~ s/\\/\//g; # replace \ with /
266
267	$self->{'hturl'} = "http://$file";
268	$self->{'hturl'} =~ s/\\/\//g; # for windows
269	($self->{'filename'}) = $file =~ /.[\/\\](.)/;
270	($self->{'base_path'}) = $file =~ /(.*)[\/\\]/i;
271	if ( ( -s "$base_dir/$file") <= $self->{'smallpage_threshold'} ) {
272	$self->{'smallpage'} = 1;
273	} else { $self->{'smallpage'} = 0; }
274
275	if ( defined($self->{'initialised'}) ) { return 1; }
276	else {
277	$self->{'initialised'} = $self->set_extraction_options($base_dir =~ /^(.*?)\/import/i);
278	return $self->{'initialised'};
279	}
280	}
281
282	# get complex configuration options from configuration files
283	# -- $GSDLCOLLECTION/etc/W3ImagePlugin.cfg (tag sets for aggr 2+)
284	# -- $GSDLHOME/etc/packages/phind/stopword/en/brown.sw (stopwords for aggr 5+)
285
286	# If there's no W3ImagePlugin.cfg file we'll use the following default values
287	my $defaultcfg = '
288	<delimitertagset>
289	<setname>Caption</setname>
290	<taggroup>font</taggroup>
291	<taggroup>tt</taggroup>
292	<taggroup>small</taggroup>
293	<taggroup>b</taggroup>
294	<taggroup>i</taggroup>
295	<taggroup>u</taggroup>
296	<taggroup>em</taggroup>
297	<taggroup>td</taggroup>
298	<taggroup>li</taggroup>
299	<taggroup>a</taggroup>
300	<taggroup>p</taggroup>
301	<taggroup>tr</taggroup>
302	<taggroup>center</taggroup>
303	<taggroup>div</taggroup>
304	<taggroup>caption</taggroup>
305	<taggroup>br</taggroup>
306	<taggroup>ul</taggroup>
307	<taggroup>ol</taggroup>
308	<taggroup>table</taggroup>
309	<taggroup>hr</taggroup>
310	</delimitertagset>
311
312	<delimitertagset>
313	<setname>Neartext</setname>
314	<taggroup>tr\|hr\|table\|h\d\|img\|body</taggroup>
315	<taggroup>td\|tr\|hr\|table\|h\d\|img\|body</taggroup>
316	<taggroup>p\|br\|td\|tr\|hr\|table\|h\d\|img\|body</taggroup>
317	<taggroup>font\|p\|i\|b\|em\|img</taggroup>
318	</delimitertagset>
319	';
320
321	sub set_extraction_options() {
322	my ($self, $collpath) = @_;
323	my ($filepath);
324
325	print {$self->{'outhandle'}} "W3ImagePlugin: Initialising\n"
326	if $self->{'verbosity'} > 1;
327	# etc/W3ImagePlugin.cfg (XML)
328	# tag sets for captions and neartext
329	if ( $self->{'aggressiveness'} > 1 && $self->{'aggressiveness'} != 9 ) {
330	$self->{'delims'} = [];
331	$self->{'cdelims'} = [];
332	my ($cfg, @tagsets, $tagset, $type, @delims);
333
334	$filepath = "$collpath/etc/W3ImagePlugin.cfg";
335	if ( open CFG, "<$filepath" ) {
336	while (<CFG>) { $cfg .= $_ }
337	close CFG;
338	} else {
339	$cfg = $defaultcfg;
340	}
341
342	(@tagsets) =
343	$cfg =~ /<delimitertagset>(.*?)<\/delimitertagset>/igs;
344	foreach $tagset ( @tagsets ) {
345	($type) = $tagset =~ /<setname>(.*?)<\/setname>/i;
346	if ( lc($type) eq "caption" ) {
347	(@{$self->{'cdelims'}}) = $tagset =~ /<taggroup>(.*?)<\/taggroup>/igs;
348	}
349	elsif ( lc($type) eq "neartext" ) {
350	(@{$self->{'delims'}}) = $tagset =~ /<taggroup>(.*?)<\/taggroup>/igs;
351	}
352	}
353
354	# output a warning if there seem to be no delimiters
355	if ( scalar(@{$self->{'cdelims'}} == 0)) {
356	print {$self->{'outhandle'}} "W3ImagePlugin: Warning: no caption delimiters found in $filepath\n";
357	}
358	if ( scalar(@{$self->{'delims'}} == 0)) {
359	print {$self->{'outhandle'}} "W3ImagePlugin: Warning: no neartext delimiters found in $filepath\n";
360	}
361	}
362
363	# get stop words for textual reference extraction
364	# TODO: warnings scroll off. Would be best to output them again at end of import
365	if ( $self->{'aggressiveness'} >=5 && $self->{'aggressiveness'} != 9 ) {
366	$self->{'stopwords'} = ();
367	$filepath = &util::filename_cat($ENV{'GSDLHOME'}, "etc", "packages", "phind", "stopword", "en", "brown.sw");
368	if ( open STOPWORDS, "<$filepath" ) {
369	while ( <STOPWORDS> ) {
370	chomp;
371	$self->{'stopwords'}{$_} = 1;
372	}
373	close STOPWORDS;
374	} else {
375	print {$self->{'outhandle'}} "W3ImagePlugin: Warning: couldn't open stopwords file at $filepath ($!)\n";
376	}
377
378	}
379
380	if ( $self->{'neartext_length'} > $self->{'max_near_text'} ) {
381	$self->{'max_near_text'} = $self->{'neartext_length'} * 1.33;
382	print {$self->{'outhandle'}} "W3ImagePlugin: Warning: adjusted max_text to $self->{'max_near_text'}\n";
383	}
384	if ( $self->{'caption_length'} > $self->{'max_near_text'} ) {
385	$self->{'max_near_text'} = $self->{'caption_length'} * 1.33;
386	print {$self->{'outhandle'}} "W3ImagePlugin: Warning: adjusted max_text to $self->{'max_near_text'}\n";
387	}
388
389	return 1;
390	}
391
392	# return number of files processed, undef if can't recognise, -1 if
393	# cant process
394	# Note that $base_dir might be "" and that $file might
395	# include directories
396	sub read {
397	my ($self, $pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = (@_);
398	my ($doc_obj, $section, $filepath, $imgtag, $pos, $context, $numdocs, $tndir, $imgs);
399	# forward normal read (runs HTMLPlugin if index_pages T)
400	my $ok = $self->SUPER::read($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli);
401	if ( ! $ok ) { return $ok } # what is this returning??
402
403	my $outhandle = $self->{'outhandle'};
404	my $textref = $self->{'textref'};
405	my $htdoc_obj = $self->{'htdoc_obj'};
406	$numdocs = 0;
407	$base_dir =~ /(.)\/./;
408	$tndir = "$1/archives/thumbnails"; # TODO: this path shouldn't be hardcoded?
409	&util::mk_all_dir($tndir) unless -e "$tndir";
410
411	$imgs = \%{$self->{'imglist'}};
412	my $nimgs = $self->get_img_list($textref);
413	$self->{'images_indexed'} = $nimgs;
414	if ( $nimgs > 0 ) {
415	my @fplist = (sort { $imgs->{$a}{'pos'} <=> $imgs->{$b}{'pos'} } keys %{$imgs});
416	my $i = 0;
417	foreach $filepath ( @fplist ) {
418	$pos = $imgs->{$filepath}{'pos'};
419	$context = substr ($$textref, $pos - 50, $pos + 50); # grab context (quicker)
420	($imgtag) = ($context =~ /(<(?:img\|a\|body)\s[^>]$filepath[^>]>)/is );
421	if (! defined($imgtag)) { $imgtag = $filepath }
422	print $outhandle "W3ImagePlugin: extracting $filepath\n"
423	if ( $self->{'verbosity'} > 1 );
424	$doc_obj = new doc ("", "indexed_doc");
425	$section = $doc_obj->get_top_section();
426	my $prevpos = ( $i == 0 ? 0 : $imgs->{$fplist[$i - 1]}{'pos'});
427	my $nextpos = ( $i >= ($nimgs -1) ? -1 : $imgs->{$fplist[$i + 1]}{'pos'} );
428
429	$self->extract_image_info($imgtag, $filepath, $textref, $doc_obj, $section, $tndir, $prevpos, $nextpos);
430	$processor->process($doc_obj);
431	$numdocs++;
432	$i++;
433	}
434	return $numdocs;
435	} else {
436	print $outhandle "W3ImagePlugin: No images from $file indexed\n"
437	if ( $self->{'verbosity'} > 2 );
438	return 1;
439	}
440
441	}
442
443	# for every valid image tag
444	# 1. extract related text and image metadata
445	# 2. add this as document meta-data
446	# 3. add assoc image(s) as files
447	#
448	sub extract_image_info {
449	my $self = shift (@_);
450	my ($tag, $id, $textref, $doc_obj, $section, $tndir, $prevpos, $nextpos) = (@_);
451	my ($filename, $orig_fp, $fn, $ext, $reltext, $relreltext, $crcid, $imgs,
452	$thumbfp, $pagetitle, $alttext, $filepath, $aggr);
453	$aggr = $self->{'aggressiveness'};
454	$imgs = \%{$self->{'imglist'}};
455	$filepath = $imgs->{$id}{'relpath'};
456	($filename) = $filepath =~ /([^\/\\]+)$/s;
457	($orig_fp) = "$self->{'base_path'}/$filepath";
458	$orig_fp =~ tr/+/ /;
459	$orig_fp =~ s/%([a-fA-F0-9][a-fA-F0-9])/pack("C", hex($1))/eg; # translate %2E to space, etc
460	$orig_fp =~ s/\\/\//g;
461	$filepath = "$self->{'htpath'}/$filepath";
462	my ($onlyfn) = $filename =~ /([^\\\/]*)$/;
463	($fn, $ext) = $onlyfn =~ /(.)\.(.)/;
464	$fn = lc $fn; $ext = lc $ext;
465	($reltext) = "<tr><td>GifComment</td><td>" . `identify $filepath -ping -format "%c"` . "</td></tr>\n"
466	if ($ext eq "gif");
467	$reltext .= "<tr><td>FilePath</td><td>$orig_fp</td></tr>\n";
468
469	if ($ENV{'GSDLOS'} =~ /^windows$/i) {
470	$crcid = "$fn.$ext." . $self->{'next_crcid'}++;
471	} else { ($crcid) = `cksum $filepath` =~ /^(\d+)/; }
472	$thumbfp = "$tndir/tn_$crcid.jpg";
473	`convert -flatten -filter Hanning $self->{'convert_params'} -geometry "$self->{'thumb_size'}x$self->{'thumb_size'}>" $filepath $thumbfp` unless -e $thumbfp;
474	if ( ! (-e $thumbfp) ) {
475	print STDERR "W3ImagePlugin: 'convert' failed. Check ImageMagicK binaries are installed and working correctly\n"; return 0;
476	}
477
478	# shove in full text (tag stripped or unstripped) if settings require it
479	if ( $aggr == 10) {
480	$reltext = "<tr><td>AllPage</td><td>" . $$textref . "</td><tr>\n"; # level 10 (all text, verbatim)
481	} else {
482	$pagetitle = $self->get_meta_value("title", $textref);
483	($alttext) = $tag =~ /\salt\s=\s(?:\"\|\')(.+?)(?:\"\|\')/is;
484	if ( defined($alttext) && length($alttext) > 1) {
485	$reltext .= "<tr><td>ALTtext</td><td>$alttext</td></tr>\n"; }
486	$reltext .= "<tr><td>SplitCapitalisation</td><td>" .
487	$self->split_filepath($orig_fp) . "</td></tr>\n";
488
489	# get caption/tag based near text (if appropriate)
490	if ( $aggr > 1 ) {
491	if ( $aggr >= 2 ) {
492	$reltext .=
493	$self->extract_caption_text($tag, $textref, $prevpos, $imgs->{$id}{'pos'}, $nextpos);
494	$relreltext = $reltext;
495	}
496	# repeat the filepath, alt-text, caption, etc
497	if ( $aggr == 8 ) {
498	$reltext .= $relreltext;
499	}
500	if ( $aggr >= 3 ) {
501	$reltext .=
502	$self->extract_near_text($tag, $textref, $prevpos, $imgs->{$id}{'pos'}, $nextpos);
503	}
504
505	# get page metadata (if appropriate)
506	if ( $aggr >= 6 \|\| ( $aggr >= 2 &&
507	( $self->{'images_indexed'} < 2 \|\|
508	($self->{'smallpage'} == 1 && $self->{'images_indexed'} < 6 )))) {
509	$reltext .= $self->get_page_metadata($textref);
510	}
511	# textual references
512	if ( $aggr == 5 \|\| $aggr >= 7) {
513	if ( length($relreltext) > ($self->{'caption_length'} * 2) ) {
514	$reltext .= $self->get_textrefs($relreltext, $textref, $prevpos, $imgs->{$id}{'pos'}, $nextpos); }
515	else {
516	$reltext .= $self->get_textrefs($reltext, $textref, $prevpos, $imgs->{$id}{'pos'}, $nextpos);
517	}
518	}
519	} # aggr > 1
520	} # aggr != 10
521
522	$doc_obj->set_OID($crcid);
523	$doc_obj->associate_file($thumbfp, "$fn.thumb.jpg", undef, $section);
524	$doc_obj->add_metadata($section, "OriginalFilename", $filename);
525	$doc_obj->add_metadata($section, "FilePath", $orig_fp);
526	$doc_obj->add_metadata($section, "Filename", $fn);
527	$doc_obj->add_metadata($section, "FileExt", $ext);
528	$doc_obj->add_metadata($section, "FileSize", $imgs->{$id}{'filesize'});
529	$doc_obj->add_metadata($section, "Width", $imgs->{$id}{'width'});
530	$doc_obj->add_metadata($section, "Height", $imgs->{$id}{'height'});
531	$doc_obj->add_metadata($section, "URL", "http://$orig_fp");
532	$doc_obj->add_metadata($section, "PageURL", $self->{'hturl'});
533	$doc_obj->add_metadata($section, "PageTitle", $pagetitle);
534	$doc_obj->add_metadata($section, "ThumbURL",
535	"_httpprefix_/collect/[collection]/index/assoc/[archivedir]/$fn.thumb.jpg");
536	$doc_obj->add_metadata($section, "FileFormat", "W3Img");
537
538	if ( $self->{'document_text'} ) {
539	$doc_obj->add_utf8_text($section, "<table border=1>\n$reltext</table>");
540	} else {
541	$doc_obj->add_metadata($section, "ImageText", "<table border=1>\n$reltext</table>\n");
542	}
543
544	if ( $self->{'index_pages'} ) {
545	my ($cache_url) = "_httpdoc_&d=" . $self->{'htdoc_obj'}->get_OID();
546	if ( $imgs->{$id}{'anchored'} ) {
547	my $a_name = $id;
548	$a_name =~ s/[\/\\\:\&]/_/g;
549	$cache_url .= "#gsdl_$a_name" ;
550	}
551	$doc_obj->add_utf8_metadata($section, "CachePageURL", $cache_url);
552	}
553	if ( ! $self->{'no_cache_images'} ) {
554	$onlyfn = lc $onlyfn;
555	$doc_obj->associate_file($filepath, $onlyfn, undef, $section);
556	$doc_obj->add_utf8_metadata($section, "CacheURL",
557	"_httpprefix_/collect/[collection]/index/assoc/[archivedir]/$onlyfn");
558	}
559	return 1;
560	}
561
562	sub get_page_metadata {
563	my ($self, $textref) = (@_);
564	my (@rval);
565	$rval[0] = $self->get_meta_value("title", $textref);
566	$rval[1] = $self->get_meta_value("keywords", $textref);
567	$rval[2] = $self->get_meta_value("description", $textref);
568	$rval[3] = $self->{'filename'};
569
570	return wantarray ? @rval : "<tr><td>PageMeta</td><td>@rval</td></tr>\n" ;
571	}
572
573	# turns LargeCatFish into Large,Cat,Fish so MG sees the separate words
574	sub split_filepath {
575	my ($self, $filepath) = (@_);
576	my (@words) = $filepath =~ /([A-Z][a-z]+)/g;
577	return join(',', @words);
578	}
579
580	# finds and extracts sentences
581	# that seem to be on the same topic
582	# as other related text (correlations)
583	# and textual references (e.g. in figure 3 ...)
584	sub get_textrefs {
585	my ($self, $reltext, $textref, $prevpos, $pos, $nextpos) = (@_);
586	my ($maxtext, $mintext, $startpos, $context_size, $context);
587
588	my (@relwords, @refwords, %sentences, @pagemeta);
589
590	# extract larger context
591	$maxtext = $self->{'max_near_text'};
592	$startpos = $pos - ($maxtext * 4);
593	$context_size = $maxtext*10;
594	if ($startpos < $prevpos ) { $startpos = $prevpos }
595	if ($nextpos != -1 && $context_size > ( $nextpos - $startpos )) { $context_size = ($nextpos - $startpos) }
596	$context = substr ( $$textref, $startpos, $context_size );
597	$context =~ s/<.*?>//gs;
598	$context =~ s/^.>(.)/$1/gs;
599	$context =~ s/(.)<.$/$1/gs;
600
601	# get page meta-data (if not already included)
602	if ( $self->{'aggressiveness'} == 5 && ! $self->{'smallpage'} ) {
603	@pagemeta = $self->get_page_metadata($textref);
604	foreach my $value ( @pagemeta ) {
605	$context .= "$value."; # make each into psuedo-sentence
606	}
607	}
608
609	# TODO: this list is not exhaustive
610	@refwords = ( '(?:is\|are)? ?(?:show(?:s\|n)\|demonstrate(?:d\|s)\|explains\|features) (?:in\|by\|below\|above\|here)',
611	'(?:see)? (?:figure\|table)? (?:below\|above)');
612
613	# extract general references
614	foreach my $rw ( @refwords ) {
615	while ( $context =~ /[\.\?\!\,](.?$rw\W.?[\.\?\!\,])/ig ) {
616	my $sentence = $1;
617	$sentence =~ s/\s+/ /g;
618	$sentences{$sentence}+=2;
619	}
620	}
621	# extract specific (figure, table) references by number
622	my ($fignum) = $context =~ /[\.\?\!].?(?:figure\|table)s?[\-\_\ \.](\d+\w)\W.*?[\.\?\!]/ig;
623	if ( $fignum ) {
624	foreach my $rw ( @refwords ) {
625	while ( $context =~ /[\.\?\!](.?(figure\|table)[\-\_\ \.]$fignum\W.?[\.\?\!])/ig ) {
626	my $sentence = $1;
627	$sentence =~ s/\s+/ /g;
628	$sentences{$sentence}+=4;
629	}
630	}
631	}
632
633	# sentences with occurances of important words
634	@relwords = $reltext =~ /([a-zA-Z]{4,})/g; # take out small words
635	foreach my $word ( @relwords ) {
636	if ( $self->{'stopwords'}{$word} ) { next } # skip stop words
637	while ( $context =~ /([^\.\?\!]?$word\W.?[\.\?\!])/ig ) {
638	my $sentence = $1;
639	$sentence =~ s/\s+/ /g;
640	$sentences{$sentence}++;
641	}
642	}
643	foreach my $sentence ( keys %sentences ) {
644	if ($sentences{$sentence} < $self->{'textrefs_threshold'}) {
645	delete $sentences{$sentence};
646	}
647	}
648	my ($rval) = join "<br>\n", (keys %sentences);
649	if ( $rval && length($rval) > 5 ) {
650	return ( "<tr><td>TextualReferences</td><td>" . $rval . "</td></tr>\n") }
651	else { return "" }
652	}
653
654	# handles caption extraction
655	# calling the extractor with different
656	# tags and choosing the best candidate caption
657	sub extract_caption_text {
658	my ($self, $tag, $textref, $prevpos, $pos, $nextpos) = (@_);
659	my (@neartext, $len, $hdelim, $mintext, $goodlen,
660	$startpos, $context, $context_size);
661
662	$mintext = $self->{'min_near_text'};
663	$goodlen = $self->{'caption_length'};
664
665	# extract a context to extract near text from (faster)
666	$context_size = $self->{'max_near_text'}*3;
667	$startpos = $pos - ($context_size / 2);
668	if ($startpos < $prevpos ) { $startpos = $prevpos }
669	if ($nextpos != -1 && $context_size > ( $nextpos - $startpos ))
670	{ $context_size = ($nextpos - $startpos) }
671
672	$context = substr ( $$textref, $startpos, $context_size );
673	$context =~ s/<!--.*?-->//gs;
674	$context =~ s/^.-->(.)/$1/gs;
675	$context =~ s/(.)<!--.$/$1/gs;
676
677	# try stepping through markup delimiter sets
678	# and selecting the best one
679	foreach $hdelim ( @{ $self->{'cdelims'} } ) {
680	@neartext = $self->extract_caption($tag, $hdelim, \$context);
681	$len = length(join("", @neartext));
682	last if ($len >= $mintext && $len <= $goodlen);
683	}
684	# reject if well over reasonable length
685	if ( $len > $goodlen ) {
686	@neartext = [];
687	}
688	$neartext[0] = " " if (! defined $neartext[0]);
689	$neartext[1] = " " if (! defined $neartext[1]);
690	return "<tr><td>Caption</td><td>" . (join ",", @neartext) . "</td></tr>\n"; # TODO: the \| is for testing purposes
691	} # end extract_caption_text
692
693	# the previous section header often gives a bit
694	# of context to the section that the image is
695	# in (invariably the header is before/above the image)
696	# so extract the text of the closest header above the image
697	#
698	# this fn just gets all the headers above the image, within the context window
699	sub get_prev_header {
700	my ($self, $pos, $textref) = (@_);
701	my ($rhtext);
702	while ( $$textref =~ /<h\d>(.*?)<\/h\d>/sig ) {
703	# only headers before image
704	if ((pos $$textref) < $pos) {
705	$rhtext .= "$1, ";
706	}
707	}
708	if ( $rhtext ) { return "Header($rhtext)" }
709	else { return "" }
710	}
711
712	# not the most robust tag stripping
713	# regexps (see perl.com FAQ) but good enough
714	#
715	# used by caption & tag-based near text algorithms
716	sub strip_tags {
717	my ( $self, $value ) = @_;
718	if ( ! defined($value) ) { $value = "" } # handle nulls
719	else {
720	$value =~ s/<.*?>//gs; # strip all html tags
721	$value =~ s/\s+/\ /g; # remove extra whitespace
722	$value =~ s/\&\w+\;//g; # remove   etc
723	}
724	return $value;
725	}
726
727	# uses the given tag(s) to identify
728	# the caption near to the image
729	# (below, above or both below and above)
730	sub extract_caption {
731	my ($self, $tag, $bound_tag, $contextref) = (@_);
732	my (@nt, $n, $etag, $gotcap);
733	return ("", "") if ( ! ($$contextref =~ /\Q$tag/) );
734
735	$nt[0] = $`;
736	$nt[1] = $';
737	$gotcap = 0;
738
739	# look before the image for a boundary tag
740	($etag, $nt[0]) = $nt[0] =~ /<($bound_tag)[\s]?.?>(.?)$/is;
741	# if bound_tag too far from the image, then prob not caption
742	# (note: have to allow for tags, so multiply by 3
743	if ( $etag && length($nt[0]) < ($self->{'caption_length'} * 3) ) {
744	if ( $nt[0] =~ /<\/$etag>/si ) {
745	# the whole caption is above the image: <tag>text</tag><img>
746	($nt[0]) =~ /<(?:$etag)[\s]?.?>(.?)<\/$etag>/is;
747	$nt[0] = $self->strip_tags($nt[0]);
748	if ( length($nt[0]) > $self->{'min_near_text'} ) {
749	$gotcap = 1;
750	$nt[1] = "";
751	}
752
753	} elsif ( $nt[1] =~ /<\/$etag>/si) {
754	# the caption tag covers image: <tag>text?<img>text?</tag>
755	($nt[1]) = $nt[1] =~ /(.*?)<\/$etag>/si;
756	$nt[0] = $self->strip_tags($nt[0] . $nt[1]);
757	if ( length($nt[0]) > $self->{'min_near_text'} ) {
758	$gotcap = 2;
759	$nt[1] = "";
760	}
761	}
762	}
763	# else try below the image
764	if ( ! $gotcap ) {
765	# the caption is after the image: <img><tag>text</tag>
766	($etag, $nt[1]) = $nt[1] =~ /^.?<($bound_tag)[\s]?.?>(.*)/is;
767	if ( $etag && $nt[1] =~ /<\/$etag>/s) {
768	($nt[1]) = $nt[1] =~ /(.*?)<\/$etag>/si;
769	$gotcap = 3;
770	$nt[0] = "";
771	$nt[1] = $self->strip_tags($nt[1]);
772	}
773	}
774	if ( ! $gotcap ) { $nt[0] = $nt[1] = "" }
775	else {
776	# strip part-tags
777	$nt[0] =~ s/^.*>//s;
778	$nt[1] =~ s/<.*$//s;
779	}
780	my ($type);
781	if ( $gotcap == 0 ) { return ("nocaption", "") }
782	elsif ( $gotcap == 1 ) { $type = "captionabove:" }
783	elsif ( $gotcap == 2 ) { $type = "captioncovering:" }
784	elsif ( $gotcap == 3 ) { $type = "captionbelow:" }
785	return ($type, $nt[0], $nt[1]);
786	}
787
788	# tag-based near text
789	#
790	# tries different tag sets
791	# and chooses the best one
792	sub extract_near_text {
793	my ($self, $tag, $textref, $prevpos, $pos, $nextpos) = (@_);
794	my (@neartext, $len, $hdelim, $maxtext, $mintext, $goodlen,
795	@bestlen, @best, $startpos, $context, $context_size,
796	$dist, $bdist, $best1, $i, $nt);
797	$bestlen[0] = $bestlen[1] = 0; $bestlen[2] = $bdist = 999999;
798	$best[0] = $best[1] = $best[2] = "";
799	$maxtext = $self->{'max_near_text'};
800	$mintext = $self->{'min_near_text'};
801	$goodlen = $self->{'neartext_length'};
802
803	# extract a context to extract near text from (faster)
804	$context_size = $maxtext*4;
805	$startpos = $pos - ($context_size / 2);
806	if ($startpos < $prevpos ) { $startpos = $prevpos }
807	if ($nextpos != -1 && $context_size > ( $nextpos - $startpos ))
808	{ $context_size = ($nextpos - $startpos) }
809	$context = substr ( $$textref, $startpos, $context_size );
810	$context =~ s/<!--.*?-->//gs;
811	$context =~ s/^.-->(.)/$1/gs;
812	$context =~ s/(.)<!--.$/$1/gs;
813
814	# try stepping through markup delimiter sets
815	# and selecting the best one
816	foreach $hdelim ( @{ $self->{'delims'} } ) {
817	@neartext = $self->extract_tagged_neartext($tag, $hdelim, \$context);
818	$nt = join("", @neartext);
819	$len = length($nt);
820	# Priorities:
821	# 1. Greater than mintext
822	# 2. Less than maxtext
823	# 3. Closest to goodlen
824	if ( $len <= $goodlen && $len > $bestlen[0] ) {
825	$bestlen[0] = $len;
826	$best[0] = $hdelim;
827	} elsif ( $len >= $maxtext && $len < $bestlen[2] ) {
828	$bestlen[2] = $len;
829	$best[2] = $hdelim;
830	} elsif ( $len >= $bestlen[0] && $len <= $bestlen[2] ) {
831	$dist = abs($goodlen - $len);
832	if ( $dist < $bdist ) {
833	$bestlen[1] = $len;
834	$best[1] = $hdelim;
835	$bdist = $dist;
836	}
837	}
838	}
839	$best1 = 2;
840	foreach $i ( 0..2 ) {
841	if ( $bestlen[$i] == 999999 ) { $bestlen[$i] = 0 }
842	$dist = abs($goodlen - $bestlen[$i]);
843	if ( $bestlen[$i] > $mintext && $dist <= $bdist ) {
844	$best1 = $i;
845	$bdist = $dist;
846	}
847	}
848	@neartext = $self->extract_tagged_neartext($tag, $best[$best1], \$context);
849	if ( $bestlen[$best1] > $maxtext ) {
850	# truncate on word boundary if too much text
851	my $hmax = $maxtext / 2;
852	($neartext[0]) = $neartext[0] =~ /([^\s]*.{1,$hmax})$/s;
853	($neartext[1]) = $neartext[1] =~ /^(.{1,$hmax}[^\s]*)/s;
854	} elsif ( $bestlen[$best1] < $mintext ) {
855	# use plain text extraction if tags failed (e.g. usable tag outside context)
856	print {$self->{'outhandle'}} "W3ImagePlugin: Fallback to plain-text extraction for $tag\n"
857	if $self->{'verbosity'} > 2;
858	$neartext[0] = "<tr><td>RawNeartext</td><td>" . $self->extract_raw_neartext($tag, $textref) . "</td></tr>";
859	$neartext[1] = "";
860	}
861	# get previous header if available
862	$neartext[0] .= "<br>\n" .
863	$self->get_prev_header($pos, \$context) if ( $self->{'aggressiveness'} >= 4 );
864	$neartext[0] = " " if (! defined $neartext[0]);
865	$neartext[1] = " " if (! defined $neartext[1]);
866
867	return "<tr><td>NearText</td><td>" . (join "\|", @neartext) . "</td></tr>\n"; # TODO: the \| is for testing purposes
868	} # end extract_near_text
869
870	# actually captures tag-based
871	# near-text given a tag set
872	sub extract_tagged_neartext {
873	my ($self, $tag, $bound_tag, $textref) = (@_);
874	return "" if ( ! ($$textref =~ /\Q$tag/) );
875	my (@nt, $delim, $pre_tag, $n);
876	$nt[0] = $`;
877	$nt[1] = $';
878
879	# get text after previous image tag
880	$nt[0] =~ s/.<($bound_tag)[^>]>(.*)/$2/is; # get rid of preceding text
881	if (defined($1)) { $delim = $1 }
882	$pre_tag = $bound_tag;
883
884	if (defined($delim)) {
885	# we want to try and use the end tag of the previous delimiter
886	# (put it on the front of the list)
887	$pre_tag =~ s/(^\|\\|)($delim)($\|\\|)//i; # take it out
888	$pre_tag =~ s/\\|\\|/\\|/i; # replace \|\| with \|
889	$pre_tag = $delim . "\|" . $pre_tag; # put it on the front
890	}
891
892	# get text before next image tag
893	$nt[1] =~ s/<\/?(?:$pre_tag)[^>]>.//is; # get rid of stuff after first delimiter
894
895	# process related text
896	for $n (0..1) {
897	if ( defined($nt[$n]) ) {
898	$nt[$n] =~ s/<.*?>//gs; # strip all html tags
899	$nt[$n] =~ s/\s+/\ /gs; # remove extra whitespace
900	$nt[$n] =~ s/\&\w+\;//sg; # remove   etc
901	# strip part-tags
902	if ( $n == 0 ) { $nt[0] =~ s/^.*>//s }
903	if ( $n == 1 ) { $nt[1] =~ s/<.*$//s }
904	} else { $nt[$n] = ""; } # handle nulls
905	}
906	return @nt;
907	}
908
909	# this function is fall-back
910	# if tags aren't suitable.
911	#
912	# extracts a fixed length of characters
913	# either side of image tag (on word boundary)
914	sub extract_raw_neartext {
915	my ($self, $tag, $textref) = (@_);
916	my ($rawtext, $startpos, $fp);
917	my $imgs = \%{$self->{'imglist'}};
918	($fp) = $tag =~ /([\w\\\/]+\.(?:gif\|jpe?g\|png))/is;
919	if (! $fp) { return " " };
920	# if the cached, plain-text version isn't there, then create it
921	$self->init_plaintext($textref) unless defined($self->{'plaintext'});
922
923	# take the closest maxtext/2 characters
924	# either side of the tag (by word boundary)
925	return "" if ( ! exists $imgs->{$fp}{'rawpos'} );
926	$startpos = $imgs->{$fp}{'rawpos'} - (($self->{'max_near_text'} / 2) + 20);
927	if ( $startpos < 0 ) { $startpos = 0 }
928	$rawtext = substr $self->{'plaintext'}, $startpos, $self->{'max_near_text'} + 20;
929	$rawtext =~ s/\s\s/ /g;
930
931	return $rawtext;
932	}
933
934	# init plaintext variable for HTML-stripped version
935	# (for full text index/raw assoc text extraction)
936	sub init_plaintext {
937	my ($self, $textref) = (@_);
938	my ($page, $fp);
939	my $imgs = \%{$self->{'imglist'}};
940	$page = $$textref; # make a copy of original
941
942	# strip tags around image filenames so they don't get zapped
943	$page =~ s/<\w+\s+.?([\w\/\\]+\.(?:gif\|jpe?g\|png))[^>]>/\"$1\"/gsi;
944	$page =~ s/<.*?>//gs;
945	$page =~ s/ / /gs;
946	$page =~ s/&/&/gs; #TODO: more &zzz; replacements (except <, $gt;)
947
948	# get positions and strip images
949	while ( $page =~ /([^\s\'\"]+\.(jpe?g\|gif\|png))/ig ) {
950	$fp = $1;
951	if ( $imgs->{$fp}{'exists'} ) {
952	$imgs->{$fp}{'rawpos'} = pos $page;
953	}
954	$page =~ s/\"$fp\"//gs;
955	}
956	$self->{'plaintext'} = $page;
957	}
958
959	# finds and filters images based on size
960	# (dimensions, height, filesize) and existence
961	#
962	# looks for image filenames (.jpg, .gif, etc)
963	# and checks for existence on disk
964	# (hence supports most JavaScript images)
965	sub get_img_list {
966	my $self = shift (@_);
967	my ($textref) = (@_);
968	my ($filepath, $relpath, $abspath, $pos, $num, $width, $height, $filesize);
969	my $imgs = \%{$self->{'imglist'}};
970	while ( $$textref =~ /([^\s\'\"]+\.(jpe?g\|gif\|png))/ig ) {
971	$filepath = $1;
972	$pos = pos $$textref;
973	next if ( $imgs->{$filepath}{'relpath'} );
974	$relpath = $filepath;
975	$relpath =~ s/^http\:\/\///; # remove http:// in case we have mirrored it
976	$relpath =~ s/\\/\//g; # replace \ with /
977	$relpath =~ s/^\.\///s; # make "./filepath" into "filepath"
978	$imgs->{$filepath}{'relpath'} = $relpath;
979	$abspath = "$self->{'htpath'}/$relpath";
980
981	if (! -e $abspath) { next }
982
983	# can't modify real filepath var because it
984	# then can't be located in the page for tag recognition later
985	($width, $height) =
986	`identify $abspath -ping -format "%wx%h"` =~ /^(\d)x(\d)$/m;
987	if (! ($width && $height)) {
988	print STDERR "W3ImagePlugin: ($abspath) 'identify' failed. Check ImageMagicK binaries are installed and working correctly\n"; next;
989	}
990	$filesize = (-s $abspath);
991	if ( $filesize >= $self->{'min_size'}
992	&& ( $width >= $self->{'min_width'} )
993	&& ( $height >= $self->{'min_height'} ) ) {
994
995	$imgs->{$filepath}{'exists'} = 1;
996	$imgs->{$filepath}{'pos'} = $pos;
997	$imgs->{$filepath}{'width'} = $width;
998	$imgs->{$filepath}{'height'} = $height;
999	$imgs->{$filepath}{'filesize'} = $filesize;
1000	} else {
1001	print {$self->{'outhandle'}} "W3ImagePlugin: skipping $self->{'base_path'}/$relpath: $filesize, $width x $height\n"
1002	if $self->{'verbosity'} > 2;
1003	}
1004	}
1005	$num = 0;
1006	foreach my $i ( keys %{$imgs} ) {
1007	if ( $imgs->{$i}{'pos'} ) {
1008	$num++;
1009	} else { delete $imgs->{$i} }
1010	}
1011	return $num;
1012	}
1013
1014	# make the text available to the read function
1015	# by making it an object variable
1016	sub read_file {
1017	my ($self, $filename, $encoding, $language, $textref) = @_;
1018	$self->SUPER::read_file($filename, $encoding, $language, $textref);
1019
1020	# if HTMLplug has run through, then it will
1021	# have replaced references so we have to
1022	# make a copy of the text before processing
1023	if ( $self->{'index_pages'} ) {
1024	$self->{'text'} = $$textref;
1025	$self->{'textref'} = \($self->{'text'});
1026	} else {
1027	$self->{'textref'} = $textref;
1028	}
1029	$self->{'plaintext'} = undef;
1030	}
1031
1032	# HTMLPlugin only extracts meta-data if it is specified in plugin options
1033	# hence a special function to do it here
1034	sub get_meta_value {
1035	my ($self, $name, $textref) = @_;
1036	my ($value);
1037	$name = lc $name;
1038	if ($name eq "title") {
1039	($value) = $$textref =~ /<title>(.*?)<\/title>/is
1040	} else {
1041	my $qm = "(?:\"\|\')";
1042	($value) = $$textref =~ /<meta name\s=\s$qm?$name$qm?\s+content\s=\s$qm?(.?)$qm?\s>/is
1043	}
1044	$value = "" unless $value;
1045	return $value;
1046	}
1047
1048	# make filename an anchor reference
1049	# so we can go straight to the image
1050	# within the cached version of the source page
1051	# (augment's HTMLPlugin sub)
1052	sub replace_images {
1053	my $self = shift (@_);
1054	my ($front, $link, $back, $base_dir,
1055	$file, $doc_obj, $section) = @_;
1056	$link =~ s/\"//g;
1057	my ($a_name) = $link;
1058	$a_name =~ s/[\/\\\:\&]/_/g;
1059	# keep a list so we don't repeat the same anchor
1060	if ( ! $self->{'imglist'}{$link}{'anchored'} ) {
1061	$front = "<a name=\"gsdl_$a_name\">$front";
1062	$back = "$back</a>";
1063	$self->{'imglist'}{$link}{'anchored'} = 1;
1064	}
1065	return $self->SUPER::replace_images($front, $link, $back, $base_dir,
1066	$file, $doc_obj, $section);
1067	}
1068
1069	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: