Context Navigation

source: trunk/gsdl/perllib/plugins/W3ImgPlug.pm@ 2996

Last change on this file since 2996 was 2996, checked in by sjboddie, 22 years ago
* empty log message *
Property svn:keywords set to `Author Date Id Revision`
File size: 37.9 KB

Line
1	###########################################################################
2	#
3	# W3ImgPlug.pm -- Context-based image indexing plugin for HTML documents
4	#
5	# A component of the Greenstone digital library software
6	# from the New Zealand Digital Library Project at the
7	# University of Waikato, New Zealand.
8	#
9	# Copyright (C) 2001 New Zealand Digital Library Project
10	#
11	# This program is free software; you can redistribute it and/or modify
12	# it under the terms of the GNU General Public License as published by
13	# the Free Software Foundation; either version 2 of the License, or
14	# (at your option) any later version.
15	#
16	# This program is distributed in the hope that it will be useful,
17	# but WITHOUT ANY WARRANTY; without even the implied warranty of
18	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	# GNU General Public License for more details.
20	#
21	# You should have received a copy of the GNU General Public License
22	# along with this program; if not, write to the Free Software
23	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24	#
25	###########################################################################
26
27	# DESCRIPTION:
28	#
29	# Extracts images and associated text and metadata from
30	# web pages as individual documents for indexing. Thumbnails
31	# are created from each image for browsing purposes.
32	#
33	# Options are available for configuring the aggressiveness of the
34	# associated text extraction mechanisms. A higher level of
35	# aggressiveness will extract more text and consequently
36	# may mean lower accuracy (precision); however, it may also
37	# retrieve more of the relevant images from the collection (recall).
38	# Lower levels of aggressiveness maybe result in slightly faster
39	# collection builds at the import stage.
40	#
41	# W3ImgPlug is a subclass of HTMLPlug (i.e. it will index pages also
42	# if required). It can be used in place of HTMLPlug to index both
43	# pages and their images.
44	#
45	# REQUIREMENTS:
46	#
47	# The ImageMagick image manipulation is used to create
48	# thumbnails and extract some image metadata. (Available
49	# from http://www.imagemagick.org/)
50	#
51	# Unix:
52	# Many Linux distributions contain ImageMagick.
53	#
54	# Windows:
55	# ImageMagick can be downloaded from the website above.
56	# Make sure the system path includes the ImageMagick binaries
57	# before using W3ImgPlug.
58	#
59	# NOTE: NT/2000/XP contain a filesystem utility 'convert.exe'
60	# with the same name as the image conversion utility. The
61	# ImageMagick FAQ recommends renaming the filesystem
62	# utility (e.g. to 'fsconvert.exe') to avoid this clash.
63	#
64	# USAGE:
65	#
66	# An image document consists of metadata elements:
67	#
68	# OriginalFilename, FilePath, Filename, FileExt, FileSize,
69	# Width, Height, URL, PageURL, ThumbURL, CacheURL, CachePageURL
70	# ImageText, PageTitle
71	#
72	# Most of these are only useful in format strings (e.g. ThumbURL,
73	# Filename, URL, PageURL, CachePageURL).
74	#
75	# ImageText, as the name suggests contains the indexable text.
76	# (unless using the -document_text plugin option)
77	#
78	# Since image documents are made up of metadata elements
79	# alone, format strings are needed to display them properly.
80	# NOTE: The receptionist will only display results (e.g. thumbnails)
81	# in 4 columns if the format string begins with "<td><table>".
82	#
83	# The example below takes the user to the image within the
84	# source HTML document rather than using a format string
85	# on DocumentText to display the image document itself.
86	#
87	# Example collect.cfg:
88	#
89	# ...
90	#
91	# indexes document:ImageText document:text
92	# defaultindex document:ImageText
93	#
94	# collectionmeta .document:ImageText "images"
95	# collectionmeta .document:text "documents"
96	#
97	# ...
98	#
99	# plugin W3ImgPlug -index_pages -aggressiveness 6
100	#
101	# ...
102	#
103	# format SearchVList '<td>{If}{[Title],[link][icon] [Title][[/link],
104	# <table><tr><td align="center"><a href="[CachePageURL]">
105	# <img src="[ThumbURL]"></a></td></tr><tr><td align="center">
106	# <a href="[CachePageURL]"><font size="-1">[OriginalFilename]</font></a>
107	# <br>[Width]x[Height]</td></tr></table>}</td>'
108	#
109	# ...
110	#
111
112	package W3ImgPlug;
113
114	use HTMLPlug;
115	use ghtml;
116	use unicode;
117	use util;
118	use parsargv;
119	use strict 'subs';
120
121	sub BEGIN {
122	@ISA = qw( HTMLPlug );
123	}
124
125	sub print_usage {
126	print STDERR "\nUsage: plugin W3ImgPlug [options]\n\n";
127	print STDERR " options:\n";
128	print STDERR " -aggressiveness Range of related text extraction techniques to use [4]\n";
129	print STDERR " 1: Filename, path, ALT text only\n";
130	print STDERR " 2: All of 1, plus caption where available\n";
131	print STDERR " 3: All of 2, plus near paragraphs where available\n";
132	print STDERR " 4: All of 3, plus previous headers (<h1>, <h2>...)\n";
133	print STDERR " where available\n";
134	print STDERR " 5: All of 4, plus textual references where available\n";
135	print STDERR " 6: All of 4, plus page metatags (title, keywords, etc)\n";
136	print STDERR " 7: All of 6, 5 and 4 combined\n";
137	print STDERR " 8: All of 7, plus repeat caption, filename, etc (raise \n";
138	print STDERR " ranking of more relevant results)\n";
139	print STDERR " 10: All of 1, plus full text of source page\n";
140	print STDERR "\n";
141	print STDERR " -no_cache_images Don't cache images (point to URL of original)\n";
142	print STDERR " -index_pages Index the pages along with the images.\n";
143	print STDERR " Otherwise reference the pages at the source URL\n";
144	print STDERR " -min_size Bytes. Skip images smaller than this [2000]\n";
145	print STDERR " -min_width Pixels. Skip images narrower than this [50 pixels]\n";
146	print STDERR " -min_height Pixels. Skip images shorter than this [50 pixels]\n";
147	print STDERR " -thumb_size Max thumbnail size. Both width and height [100 pixels]\n";
148	print STDERR " -convert_params Additional parameters for ImageMagicK convert on\n";
149	print STDERR " thumbnail creation. For example, '-raise' will give\n";
150	print STDERR " a three dimensional effect to thumbnail images.\n";
151	print STDERR " -document_text Add image text as document:text (otherwise IndexedText\n";
152	print STDERR " metadata field)\n";
153	print STDERR "\n";
154	print STDERR " Advanced Options (applicability depends on aggressiveness level)\n";
155	print STDERR " -smallpage_threshold Images on pages smaller than this (bytes) will have\n";
156	print STDERR " the page (title, keywords, etc) meta-data added [2048]\n";
157	print STDERR " -textrefs_threshold Threshold for textual references. Lower values mean\n";
158	print STDERR " the algorithm is less strict [2]\n";
159	print STDERR " -caption_length Maximum length of captions (in characters) [100]\n";
160	print STDERR " -neartext_length Target length of near text (in characters) [300]\n";
161	print STDERR " -max_near_text Maximum characters near images to extract [400]\n";
162	print STDERR " -min_near_text Minimum characters of near text or caption to extract [10]\n";
163	print STDERR "\n";
164	print STDERR " Tag set configuration file (XML format):\n";
165	print STDERR " <collectionpath>/etc/W3ImgPlug.cfg \n";
166	print STDERR "\n";
167	print STDERR "\n";
168	print STDERR "W3ImgPlug inherits all of HTMLPlug's functionality and options:\n";
169	HTMLPlug::print_usage();
170	print STDERR "\n";
171	}
172
173	sub new {
174	my $class = shift (@_);
175	my $self = new HTMLPlug ($class, @_);
176
177	if (!parsargv::parse(\@_,
178	q^aggressiveness/\d/3^, \$self->{'aggressiveness'},
179	q^index_pages^, \$self->{'index_pages'},
180	q^no_cache_images^, \$self->{'no_cache_images'},
181	q^min_size/\d*/2000^, \$self->{'min_img_filesize'},
182	q^min_width/\d*/50^, \$self->{'min_img_width'},
183	q^min_height/\d*/50^, \$self->{'min_img_height'},
184	q^thumb_size/\d*/100^, \$self->{'thumbnail_size'},
185	q^convert_params/.*/ ^, \$self->{'img_convert_param'},
186	q^max_near_text/\d*/400^, \$self->{'maxtext'},
187	q^min_near_text/\d*/10^, \$self->{'mintext'},
188	q^smallpage_threshold/\d*/2048^, \$self->{'smallpage_threshold'},
189	q^textrefs_threshold/\d*/2^, \$self->{'textref_threshold'},
190	q^caption_length/\d*/80^, \$self->{'caption_len'},
191	q^neartext_length/\d*/300^, \$self->{'neartext_len'},
192	q^document_text^, \$self->{'document_text'},
193	"allow_extra_options"
194	)) {
195
196	print STDERR "\nIncorrect options passed to W3ImgPlug, check your collect.cfg configuration file\n";
197	&print_usage();
198	die "\n";
199	}
200
201	# init class variables
202	$self->{'textref'} = undef; # init by read_file fn
203	$self->{'htdoc_obj'} = undef; # init by process fn
204	$self->{'htpath'} = undef; # init by process fn
205	$self->{'hturl'} = undef; # init by process fn
206	$self->{'plaintext'} = undef; # HTML stripped version - only init if needed by raw_neartext sub
207	$self->{'smallpage'} = 0; # set by process fn
208	$self->{'images_indexed'} = undef; # num of images indexed - if 1 or 2 then we know page is small
209	$self->{'initialised'} = undef; # flag (see set_extraction_options())
210
211	return bless $self, $class;
212	}
213
214	# if indexing pages, let HTMLPlug do it's stuff
215	# image extraction done through read()
216	sub process {
217	my ($self, $textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
218	$self->{'imglist'} = ();
219	if ( $self->{'index_pages'} ) {
220	my $ok = $self->SUPER::process($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj);
221	if ( ! $ok ) { return $ok }
222	$self->{'htdoc_obj'} = $doc_obj;
223	}
224	# else use URL for referencing
225	#if ( $file =~ /(.*)[\/\\]/ ) { $self->{'htpath'} = $1; } else { $self->{'htpath'} = $file; }
226
227	$self->{'htpath'} = $base_dir if (-d $base_dir);
228	if ( $file =~ /(.*)[\/\\]/ ) { $self->{'htpath'} .= "/$1"; }
229	$self->{'htpath'} =~ s/\\/\//g; # replace \ with /
230
231	$self->{'hturl'} = "http://$file";
232	$self->{'hturl'} =~ s/\\/\//g; # for windows
233	($self->{'filename'}) = $file =~ /.[\/\\](.)/;
234	($self->{'base_path'}) = $file =~ /(.*)[\/\\]/i;
235	if ( ( -s "$base_dir/$file") <= $self->{'smallpage_threshold'} ) {
236	$self->{'smallpage'} = 1;
237	} else { $self->{'smallpage'} = 0; }
238
239	if ( defined($self->{'initialised'}) ) { return 1; }
240	else {
241	$self->{'initialised'} = $self->set_extraction_options($base_dir =~ /^(.*?)\/import/i);
242	return $self->{'initialised'};
243	}
244	}
245
246	# get complex configuration options from configuration files
247	# -- $GSDLCOLLECTION/etc/W3ImgPlug.cfg (tag sets for aggr 2+)
248	# -- $GSDLHOME/etc/packages/phind/stopword/en/brown.sw (stopwords for aggr 5+)
249
250	# If there's no W3ImgPlug.cfg file we'll use the following default values
251	my $defaultcfg = '
252	<delimitertagset>
253	<setname>Caption</setname>
254	<taggroup>font</taggroup>
255	<taggroup>tt</taggroup>
256	<taggroup>small</taggroup>
257	<taggroup>b</taggroup>
258	<taggroup>i</taggroup>
259	<taggroup>u</taggroup>
260	<taggroup>em</taggroup>
261	<taggroup>td</taggroup>
262	<taggroup>li</taggroup>
263	<taggroup>a</taggroup>
264	<taggroup>p</taggroup>
265	<taggroup>tr</taggroup>
266	<taggroup>center</taggroup>
267	<taggroup>div</taggroup>
268	<taggroup>caption</taggroup>
269	<taggroup>br</taggroup>
270	<taggroup>ul</taggroup>
271	<taggroup>ol</taggroup>
272	<taggroup>table</taggroup>
273	<taggroup>hr</taggroup>
274	</delimitertagset>
275
276	<delimitertagset>
277	<setname>Neartext</setname>
278	<taggroup>tr\|hr\|table\|h\d\|img\|body</taggroup>
279	<taggroup>td\|tr\|hr\|table\|h\d\|img\|body</taggroup>
280	<taggroup>p\|br\|td\|tr\|hr\|table\|h\d\|img\|body</taggroup>
281	<taggroup>font\|p\|i\|b\|em\|img</taggroup>
282	</delimitertagset>
283	';
284
285	sub set_extraction_options() {
286	my ($self, $collpath) = @_;
287	my ($filepath);
288
289	print {$self->{'outhandle'}} "W3ImgPlug: Initialising\n"
290	if $self->{'verbosity'} > 1;
291	# etc/W3ImgPlug.cfg (XML)
292	# tag sets for captions and neartext
293	if ( $self->{'aggressiveness'} > 1 && $self->{'aggressiveness'} != 10 ) {
294	$self->{'delims'} = [];
295	$self->{'cdelims'} = [];
296	my ($cfg, @tagsets, $tagset, $type, @delims);
297
298	$filepath = "$collpath/etc/W3ImgPlug.cfg";
299	if ( open CFG, "<$filepath" ) {
300	while (<CFG>) { $cfg .= $_ }
301	close CFG;
302	} else {
303	$cfg = $defaultcfg;
304	}
305
306	(@tagsets) =
307	$cfg =~ /<delimitertagset>(.*?)<\/delimitertagset>/igs;
308	foreach $tagset ( @tagsets ) {
309	($type) = $tagset =~ /<setname>(.*?)<\/setname>/i;
310	if ( lc($type) eq "caption" ) {
311	(@{$self->{'cdelims'}}) = $tagset =~ /<taggroup>(.*?)<\/taggroup>/igs;
312	}
313	elsif ( lc($type) eq "neartext" ) {
314	(@{$self->{'delims'}}) = $tagset =~ /<taggroup>(.*?)<\/taggroup>/igs;
315	}
316	}
317
318	# output a warning if there seem to be no delimiters
319	if ( scalar(@{$self->{'cdelims'}} == 0)) {
320	print {$self->{'outhandle'}} "W3ImgPlug: Warning: no caption delimiters found in $filepath\n";
321	}
322	if ( scalar(@{$self->{'delims'}} == 0)) {
323	print {$self->{'outhandle'}} "W3ImgPlug: Warning: no neartext delimiters found in $filepath\n";
324	}
325	}
326
327	# get stop words for textual reference extraction
328	# TODO: warnings scroll off. Would be best to output them again at end of import
329	if ( $self->{'aggressiveness'} >=5 && $self->{'aggressiveness'} != 10 ) {
330	$self->{'stopwords'} = ();
331	$filepath = &util::filename_cat($ENV{'GSDLHOME'}, "etc", "packages", "phind", "stopword", "en", "brown.sw");
332	if ( open STOPWORDS, "<$filepath" ) {
333	while ( <STOPWORDS> ) {
334	chomp;
335	$self->{'stopwords'}{$_} = 1;
336	}
337	close STOPWORDS;
338	} else {
339	print {$self->{'outhandle'}} "W3ImgPlug: Warning: couldn't open stopwords file at $filepath ($!)\n";
340	}
341
342	}
343
344	if ( $self->{'neartext_len'} > $self->{'maxtext'} ) {
345	$self->{'maxtext'} = $self->{'neartext_len'} * 1.33;
346	print {$self->{'outhandle'}} "W3ImgPlug: Warning: adjusted max_text to $self->{'maxtext'}\n";
347	}
348	if ( $self->{'caption_len'} > $self->{'maxtext'} ) {
349	$self->{'maxtext'} = $self->{'caption_len'} * 1.33;
350	print {$self->{'outhandle'}} "W3ImgPlug: Warning: adjusted max_text to $self->{'maxtext'}\n";
351	}
352
353	return 1;
354	}
355
356	# return number of files processed, undef if can't process
357	# Note that $base_dir might be "" and that $file might
358	# include directories
359	sub read {
360	my ($self, $pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs) = (@_);
361	my ($doc_obj, $section, $filepath, $imgtag, $pos, $context, $numdocs, $tndir, $imgs);
362	# forward normal read (runs HTMLPlug if index_pages T)
363	my $ok = $self->SUPER::read($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs);
364	if ( ! $ok ) { return $ok }
365
366	my $outhandle = $self->{'outhandle'};
367	my $textref = $self->{'textref'};
368	my $htdoc_obj = $self->{'htdoc_obj'};
369	$numdocs = 0;
370	$base_dir =~ /(.)\/./;
371	$tndir = "$1/archives/thumbnails"; # TODO: this path shouldn't be hardcoded?
372	&util::mk_all_dir($tndir) unless -e "$tndir";
373
374	$imgs = \%{$self->{'imglist'}};
375	my $nimgs = $self->get_img_list($textref);
376	$self->{'images_indexed'} = $nimgs;
377	if ( $nimgs > 0 ) {
378	my @fplist = (sort { $imgs->{$a}{'pos'} <=> $imgs->{$b}{'pos'} } keys %{$imgs});
379	my $i = 0;
380	foreach $filepath ( @fplist ) {
381	$pos = $imgs->{$filepath}{'pos'};
382	$context = substr ($$textref, $pos - 50, $pos + 50); # grab context (quicker)
383	($imgtag) = ($context =~ /(<(?:img\|a\|body)\s[^>]$filepath[^>]>)/is );
384	if (! defined($imgtag)) { $imgtag = $filepath }
385	print $outhandle "W3ImgPlug: extracting $filepath\n"
386	if ( $self->{'verbosity'} > 1 );
387	$doc_obj = new doc ("", "indexed_doc");
388	$section = $doc_obj->get_top_section();
389	$prevpos = ( $i == 0 ? 0 : $imgs->{$fplist[$i - 1]}{'pos'});
390	$nextpos = ( $i >= ($nimgs -1) ? -1 : $imgs->{$fplist[$i + 1]}{'pos'} );
391
392	$self->extract_image_info($imgtag, $filepath, $textref, $doc_obj, $section, $tndir, $prevpos, $nextpos);
393	$processor->process($doc_obj);
394	$numdocs++;
395	$i++;
396	}
397	return $numdocs;
398	} else {
399	print $outhandle "W3ImgPlug: No images from $file indexed\n"
400	if ( $self->{'verbosity'} > 2 );
401	return 1;
402	}
403
404	}
405
406	# for every valid image tag
407	# 1. extract related text and image metadata
408	# 2. add this as document meta-data
409	# 3. add assoc image(s) as files
410	#
411	sub extract_image_info {
412	my $self = shift (@_);
413	my ($tag, $id, $textref, $doc_obj, $section, $tndir, $prevpos, $nextpos) = (@_);
414	my ($filename, $orig_fp, $fn, $ext, $reltext, $relreltext, $crcid, $imgs,
415	$thumbfp, $pagetitle, $alttext, $filepath, $aggr);
416	$aggr = $self->{'aggressiveness'};
417	$imgs = \%{$self->{'imglist'}};
418	$filepath = $imgs->{$id}{'relpath'};
419	($filename) = $filepath =~ /([^\/\\]+)$/s;
420	($orig_fp) = "$self->{'base_path'}/$filepath";
421	$orig_fp =~ tr/+/ /;
422	$orig_fp =~ s/%([a-fA-F0-9][a-fA-F0-9])/pack("C", hex($1))/eg; # translate %2E to space, etc
423	$orig_fp =~ s/\\/\//g;
424	$filepath = "$self->{'htpath'}/$filepath";
425	($onlyfn) = $filename =~ /([^\\\/]*)$/;
426	($fn, $ext) = $onlyfn =~ /(.)\.(.)/;
427	$fn = lc $fn; $ext = lc $ext;
428	($reltext) = "<tr><td>GifComment</td><td>" . `identify $filepath -ping -format "%c"` . "</td></tr>\n"
429	if ($ext eq "gif");
430	$reltext .= "<tr><td>FilePath</td><td>$orig_fp</td></tr>\n";
431
432	if ($ENV{'GSDLOS'} =~ /^windows$/i) {
433	$crcid = "$fn.$ext." . $self->{'next_crcid'}++;
434	} else { ($crcid) = `cksum $filepath` =~ /^(\d+)/; }
435	$thumbfp = "$tndir/tn_$crcid.jpg";
436	`convert -flatten -filter Hanning $self->{'img_convert_param'} -geometry "$self->{'thumbnail_size'}x$self->{'thumbnail_size'}>" $filepath $thumbfp` unless -e $thumbfp;
437	if ( ! (-e $thumbfp) ) {
438	print STDERR "W3ImgPlug: 'convert' failed. Check ImageMagicK binaries are installed and working correctly\n"; return 0;
439	}
440
441	# shove in full text (tag stripped or unstripped) if settings require it
442	if ( $aggr == 10) {
443	$reltext = "<tr><td>AllPage</td><td>" . $$textref . "</td><tr>\n"; # level 10 (all text, verbatim)
444	} else {
445	$pagetitle = $self->get_meta_value("title", $textref);
446	($alttext) = $tag =~ /\salt\s=\s(?:\"\|\')(.+?)(?:\"\|\')/is;
447	if ( defined($alttext) && length($alttext) > 1) {
448	$reltext .= "<tr><td>ALTtext</td><td>$alttext</td></tr>\n"; }
449	$reltext .= "<tr><td>SplitCapitalisation</td><td>" .
450	$self->split_filepath($orig_fp) . "</td></tr>\n";
451
452	# get caption/tag based near text (if appropriate)
453	if ( $aggr > 1 ) {
454	if ( $aggr >= 2 ) {
455	$reltext .=
456	$self->extract_caption_text($tag, $textref, $prevpos, $imgs->{$id}{'pos'}, $nextpos);
457	$relreltext = $reltext;
458	}
459	# repeat the filepath, alt-text, caption, etc
460	if ( $aggr == 8 ) {
461	$reltext .= $relreltext;
462	}
463	if ( $aggr >= 3 ) {
464	$reltext .=
465	$self->extract_near_text($tag, $textref, $prevpos, $imgs->{$id}{'pos'}, $nextpos);
466	}
467
468	# get page metadata (if appropriate)
469	if ( $aggr >= 6 \|\| ( $aggr >= 2 &&
470	( $self->{'images_indexed'} < 2 \|\|
471	($self->{'smallpage'} == 1 && $self->{'images_indexed'} < 6 )))) {
472	$reltext .= $self->get_page_metadata($textref);
473	}
474	# textual references
475	if ( $aggr == 5 \|\| $aggr >= 7) {
476	if ( length($relreltext) > ($self->{'caption_len'} * 2) ) {
477	$reltext .= $self->get_textrefs($relreltext, $textref, $prevpos, $imgs->{$id}{'pos'}, $nextpos); }
478	else {
479	$reltext .= $self->get_textrefs($reltext, $textref, $prevpos, $imgs->{$id}{'pos'}, $nextpos);
480	}
481	}
482	} # aggr > 1
483	} # aggr != 10
484
485	$doc_obj->set_OID($crcid);
486	$doc_obj->associate_file($thumbfp, "$fn.thumb.jpg", undef, $section);
487	$doc_obj->add_metadata($section, "OriginalFilename", $filename);
488	$doc_obj->add_metadata($section, "FilePath", $orig_fp);
489	$doc_obj->add_metadata($section, "Filename", $fn);
490	$doc_obj->add_metadata($section, "FileExt", $ext);
491	$doc_obj->add_metadata($section, "FileSize", $imgs->{$id}{'filesize'});
492	$doc_obj->add_metadata($section, "Width", $imgs->{$id}{'width'});
493	$doc_obj->add_metadata($section, "Height", $imgs->{$id}{'height'});
494	$doc_obj->add_metadata($section, "URL", "http://$orig_fp");
495	$doc_obj->add_metadata($section, "PageURL", $self->{'hturl'});
496	$doc_obj->add_metadata($section, "PageTitle", $pagetitle);
497	$doc_obj->add_metadata($section, "ThumbURL",
498	"_httpcollection_/index/assoc/[archivedir]/$fn.thumb.jpg");
499
500	if ( $self->{'document_text'} ) {
501	$doc_obj->add_utf8_text($section, "<table border=1>\n$reltext</table>");
502	} else {
503	$doc_obj->add_metadata($section, "ImageText", "<table border=1>\n$reltext</table>\n");
504	}
505
506	if ( $self->{'index_pages'} ) {
507	my ($cache_url) = "_httpdoc_&d=" . $self->{'htdoc_obj'}->get_OID();
508	if ( $imgs->{$id}{'anchored'} ) {
509	my $a_name = $id;
510	$a_name =~ s/[\/\\\:\&]/_/g;
511	$cache_url .= "#gsdl_$a_name" ;
512	}
513	$doc_obj->add_utf8_metadata($section, "CachePageURL", $cache_url);
514	}
515	if ( ! $self->{'no_cache_images'} ) {
516	$onlyfn = lc $onlyfn;
517	$doc_obj->associate_file($filepath, $onlyfn, undef, $section);
518	$doc_obj->add_utf8_metadata($section, "CacheURL",
519	"_httpcollection_/index/assoc/[archivedir]/$onlyfn");
520	}
521	return 1;
522	}
523
524	sub get_page_metadata {
525	my ($self, $textref) = (@_);
526	my (@rval);
527	$rval[0] = $self->get_meta_value("title", $textref);
528	$rval[1] = $self->get_meta_value("keywords", $textref);
529	$rval[2] = $self->get_meta_value("description", $textref);
530	$rval[3] = $self->{'filename'};
531
532	return wantarray ? @rval : "<tr><td>PageMeta</td><td>@rval</td></tr>\n" ;
533	}
534
535	# turns LargeCatFish into Large,Cat,Fish so MG sees the separate words
536	sub split_filepath {
537	my ($self, $filepath) = (@_);
538	my (@words) = $filepath =~ /([A-Z][a-z]+)/g;
539	return join(',', @words);
540	}
541
542	# finds and extracts sentences
543	# that seem to be on the same topic
544	# as other related text (correlations)
545	# and textual references (e.g. in figure 3 ...)
546	sub get_textrefs {
547	my ($self, $reltext, $textref, $prevpos, $pos, $nextpos) = (@_);
548	my ($maxtext, $mintext, $startpos, $context_size, $context);
549
550	my (@relwords, @refwords, %sentences, @pagemeta);
551
552	# extract larger context
553	$maxtext = $self->{'maxtext'};
554	$startpos = $pos - ($maxtext * 4);
555	$context_size = $maxtext*10;
556	if ($startpos < $prevpos ) { $startpos = $prevpos }
557	if ($nextpos != -1 && $context_size > ( $nextpos - $startpos )) { $context_size = ($nextpos - $startpos) }
558	$context = substr ( $$textref, $startpos, $context_size );
559	$context =~ s/<.*?>//gs;
560	$context =~ s/^.>(.)/$1/gs;
561	$context =~ s/(.)<.$/$1/gs;
562
563	# get page meta-data (if not already included)
564	if ( $self->{'aggressiveness'} == 5 && ! $self->{'smallpage'} ) {
565	@pagemeta = $self->get_page_metadata($textref);
566	foreach $value ( @pagemeta ) {
567	$context .= "$value."; # make each into psuedo-sentence
568	}
569	}
570
571	# TODO: this list is not exhaustive
572	@refwords = ( '(?:is\|are)? ?(?:show(?:s\|n)\|demonstrate(?:d\|s)\|explains\|features) (?:in\|by\|below\|above\|here)',
573	'(?:see)? (?:figure\|table)? (?:below\|above)');
574
575	# extract general references
576	foreach $rw ( @refwords ) {
577	while ( $context =~ /[\.\?\!\,](.?$rw\W.?[\.\?\!\,])/ig ) {
578	$sentence = $1;
579	$sentence =~ s/\s+/ /g;
580	$sentences{$sentence}+=2;
581	}
582	}
583	# extract specific (figure, table) references by number
584	my ($fignum) = $context =~ /[\.\?\!].?(?:figure\|table)s?[\-\_\ \.](\d+\w)\W.*?[\.\?\!]/ig;
585	if ( $fignum ) {
586	foreach $rw ( @refwords ) {
587	while ( $context =~ /[\.\?\!](.?(figure\|table)[\-\_\ \.]$fignum\W.?[\.\?\!])/ig ) {
588	$sentence = $1;
589	$sentence =~ s/\s+/ /g;
590	$sentences{$sentence}+=4;
591	}
592	}
593	}
594
595	# sentences with occurances of important words
596	@relwords = $reltext =~ /([a-zA-Z]{4,})/g; # take out small words
597	foreach $word ( @relwords ) {
598	if ( $self->{'stopwords'}{$word} ) { next } # skip stop words
599	while ( $context =~ /([^\.\?\!]?$word\W.?[\.\?\!])/ig ) {
600	$sentence = $1;
601	$sentence =~ s/\s+/ /g;
602	$sentences{$sentence}++;
603	}
604	}
605	foreach $sentence ( keys %sentences ) {
606	if ($sentences{$sentence} < $self->{'textref_threshold'}) {
607	delete $sentences{$sentence};
608	}
609	}
610	my ($rval) = join "<br>\n", (keys %sentences);
611	if ( $rval && length($rval) > 5 ) {
612	return ( "<tr><td>TextualReferences</td><td>" . $rval . "</td></tr>\n") }
613	else { return "" }
614	}
615
616	# handles caption extraction
617	# calling the extractor with different
618	# tags and choosing the best candidate caption
619	sub extract_caption_text {
620	my ($self, $tag, $textref, $prevpos, $pos, $nextpos) = (@_);
621	my (@neartext, $len, $hdelim, $goodlen,
622	$startpos, $context, $context_size);
623
624	$mintext = $self->{'mintext'};
625	$goodlen = $self->{'caption_len'};
626
627	# extract a context to extract near text from (faster)
628	$context_size = $self->{'maxtext'}*3;
629	$startpos = $pos - ($context_size / 2);
630	if ($startpos < $prevpos ) { $startpos = $prevpos }
631	if ($nextpos != -1 && $context_size > ( $nextpos - $startpos ))
632	{ $context_size = ($nextpos - $startpos) }
633
634	$context = substr ( $$textref, $startpos, $context_size );
635	$context =~ s/<!--.*?-->//gs;
636	$context =~ s/^.-->(.)/$1/gs;
637	$context =~ s/(.)<!--.$/$1/gs;
638
639	# try stepping through markup delimiter sets
640	# and selecting the best one
641	foreach $hdelim ( @{ $self->{'cdelims'} } ) {
642	@neartext = $self->extract_caption($tag, $hdelim, \$context);
643	$len = length(join("", @neartext));
644	last if ($len >= $mintext && $len <= $goodlen);
645	}
646	# reject if well over reasonable length
647	if ( $len > $goodlen ) {
648	@neartext = [];
649	}
650	$neartext[0] = " " if (! defined $neartext[0]);
651	$neartext[1] = " " if (! defined $neartext[1]);
652	return "<tr><td>Caption</td><td>" . (join ",", @neartext) . "</td></tr>\n"; # TODO: the \| is for testing purposes
653	} # end extract_caption_text
654
655	# the previous section header often gives a bit
656	# of context to the section that the image is
657	# in (invariably the header is before/above the image)
658	# so extract the text of the closest header above the image
659	#
660	# this fn just gets all the headers above the image, within the context window
661	sub get_prev_header {
662	my ($self, $pos, $textref) = (@_);
663	my ($rhtext);
664	while ( $$textref =~ /<h\d>(.*?)<\/h\d>/sig ) {
665	# only headers before image
666	if ((pos $$textref) < $pos) {
667	$rhtext .= "$1, ";
668	}
669	}
670	if ( $rhtext ) { return "Header($rhtext)" }
671	else { return "" }
672	}
673
674	# not the most robust tag stripping
675	# regexps (see perl.com FAQ) but good enough
676	#
677	# used by caption & tag-based near text algorithms
678	sub strip_tags {
679	my ( $self, $value ) = @_;
680	if ( ! defined($value) ) { $value = "" } # handle nulls
681	else {
682	$value =~ s/<.*?>//gs; # strip all html tags
683	$value =~ s/\s+/\ /g; # remove extra whitespace
684	$value =~ s/\&\w+\;//g; # remove   etc
685	}
686	return $value;
687	}
688
689	# uses the given tag(s) to identify
690	# the caption near to the image
691	# (below, above or both below and above)
692	sub extract_caption {
693	my ($self, $tag, $bound_tag, $contextref) = (@_);
694	my (@nt, $n, $etag, $gotcap);
695	return ("", "") if ( ! ($$contextref =~ /\Q$tag/) );
696
697	$nt[0] = $`;
698	$nt[1] = $';
699	$gotcap = 0;
700
701	# look before the image for a boundary tag
702	($etag, $nt[0]) = $nt[0] =~ /<($bound_tag)[\s]?.?>(.?)$/is;
703	# if bound_tag too far from the image, then prob not caption
704	# (note: have to allow for tags, so multiply by 3
705	if ( $etag && length($nt[0]) < ($self->{'caption_len'} * 3) ) {
706	if ( $nt[0] =~ /<\/$etag>/si ) {
707	# the whole caption is above the image: <tag>text</tag><img>
708	($nt[0]) =~ /<(?:$etag)[\s]?.?>(.?)<\/$etag>/is;
709	$nt[0] = $self->strip_tags($nt[0]);
710	if ( length($nt[0]) > $self->{'mintext'} ) {
711	$gotcap = 1;
712	$nt[1] = "";
713	}
714
715	} elsif ( $nt[1] =~ /<\/$etag>/si) {
716	# the caption tag covers image: <tag>text?<img>text?</tag>
717	($nt[1]) = $nt[1] =~ /(.*?)<\/$etag>/si;
718	$nt[0] = $self->strip_tags($nt[0] . $nt[1]);
719	if ( length($nt[0]) > $self->{'mintext'} ) {
720	$gotcap = 2;
721	$nt[1] = "";
722	}
723	}
724	}
725	# else try below the image
726	if ( ! $gotcap ) {
727	# the caption is after the image: <img><tag>text</tag>
728	($etag, $nt[1]) = $nt[1] =~ /^.?<($bound_tag)[\s]?.?>(.*)/is;
729	if ( $etag && $nt[1] =~ /<\/$etag>/s) {
730	($nt[1]) = $nt[1] =~ /(.*?)<\/$etag>/si;
731	$gotcap = 3;
732	$nt[0] = "";
733	$nt[1] = $self->strip_tags($nt[1]);
734	}
735	}
736	if ( ! $gotcap ) { $nt[0] = $nt[1] = "" }
737	else {
738	# strip part-tags
739	$nt[0] =~ s/^.*>//s;
740	$nt[1] =~ s/<.*$//s;
741	}
742	my ($type);
743	if ( $gotcap == 0 ) { return ("nocaption", "") }
744	elsif ( $gotcap == 1 ) { $type = "captionabove:" }
745	elsif ( $gotcap == 2 ) { $type = "captioncovering:" }
746	elsif ( $gotcap == 3 ) { $type = "captionbelow:" }
747	return ($type, $nt[0], $nt[1]);
748	}
749
750	# tag-based near text
751	#
752	# tries different tag sets
753	# and chooses the best one
754	sub extract_near_text {
755	my ($self, $tag, $textref, $prevpos, $pos, $nextpos) = (@_);
756	my (@neartext, $len, $hdelim, $maxtext, $mintext, $goodlen,
757	@bestlen, @best, $startpos, $context, $context_size,
758	$dist, $bdist, $best1, $i, $nt);
759	$bestlen[0] = $bestlen[1] = 0; $bestlen[2] = $bdist = 999999;
760	$best[0] = $best[1] = $best[2] = "";
761	$maxtext = $self->{'maxtext'};
762	$mintext = $self->{'mintext'};
763	$goodlen = $self->{'neartext_len'};
764
765	# extract a context to extract near text from (faster)
766	$context_size = $maxtext*4;
767	$startpos = $pos - ($context_size / 2);
768	if ($startpos < $prevpos ) { $startpos = $prevpos }
769	if ($nextpos != -1 && $context_size > ( $nextpos - $startpos ))
770	{ $context_size = ($nextpos - $startpos) }
771	$context = substr ( $$textref, $startpos, $context_size );
772	$context =~ s/<!--.*?-->//gs;
773	$context =~ s/^.-->(.)/$1/gs;
774	$context =~ s/(.)<!--.$/$1/gs;
775
776	# try stepping through markup delimiter sets
777	# and selecting the best one
778	foreach $hdelim ( @{ $self->{'delims'} } ) {
779	@neartext = $self->extract_tagged_neartext($tag, $hdelim, \$context);
780	$nt = join("", @neartext);
781	$len = length($nt);
782	# Priorities:
783	# 1. Greater than mintext
784	# 2. Less than maxtext
785	# 3. Closest to goodlen
786	if ( $len <= $goodlen && $len > $bestlen[0] ) {
787	$bestlen[0] = $len;
788	$best[0] = $hdelim;
789	} elsif ( $len >= $maxtext && $len < $bestlen[2] ) {
790	$bestlen[2] = $len;
791	$best[2] = $hdelim;
792	} elsif ( $len >= $bestlen[0] && $len <= $bestlen[2] ) {
793	$dist = abs($goodlen - $len);
794	if ( $dist < $bdist ) {
795	$bestlen[1] = $len;
796	$best[1] = $hdelim;
797	$bdist = $dist;
798	}
799	}
800	}
801	$best1 = 2;
802	foreach $i ( 0..2 ) {
803	if ( $bestlen[$i] == 999999 ) { $bestlen[$i] = 0 }
804	$dist = abs($goodlen - $bestlen[$i]);
805	if ( $bestlen[$i] > $mintext && $dist <= $bdist ) {
806	$best1 = $i;
807	$bdist = $dist;
808	}
809	}
810	@neartext = $self->extract_tagged_neartext($tag, $best[$best1], \$context);
811	if ( $bestlen[$best1] > $maxtext ) {
812	# truncate on word boundary if too much text
813	my $hmax = $maxtext / 2;
814	($neartext[0]) = $neartext[0] =~ /([^\s]*.{1,$hmax})$/s;
815	($neartext[1]) = $neartext[1] =~ /^(.{1,$hmax}[^\s]*)/s;
816	} elsif ( $bestlen[$best1] < $mintext ) {
817	# use plain text extraction if tags failed (e.g. usable tag outside context)
818	print {$self->{'outhandle'}} "W3ImgPlug: Fallback to plain-text extraction for $tag\n"
819	if $self->{'verbosity'} > 2;
820	$neartext[0] = "<tr><td>RawNeartext</td><td>" . $self->extract_raw_neartext($tag, $textref) . "</td></tr>";
821	$neartext[1] = "";
822	}
823	# get previous header if available
824	$neartext[0] .= "<br>\n" .
825	$self->get_prev_header($pos, \$context) if ( $self->{'aggressiveness'} >= 4 );
826	$neartext[0] = " " if (! defined $neartext[0]);
827	$neartext[1] = " " if (! defined $neartext[1]);
828
829	return "<tr><td>NearText</td><td>" . (join "\|", @neartext) . "</td></tr>\n"; # TODO: the \| is for testing purposes
830	} # end extract_near_text
831
832	# actually captures tag-based
833	# near-text given a tag set
834	sub extract_tagged_neartext {
835	my ($self, $tag, $bound_tag, $textref) = (@_);
836	return "" if ( ! ($$textref =~ /\Q$tag/) );
837	my (@nt, $delim, $pre_tag, $n);
838	$nt[0] = $`;
839	$nt[1] = $';
840
841	# get text after previous image tag
842	$nt[0] =~ s/.<($bound_tag)[^>]>(.*)/$2/is; # get rid of preceding text
843	if (defined($1)) { $delim = $1 }
844	$pre_tag = $bound_tag;
845
846	if (defined($delim)) {
847	# we want to try and use the end tag of the previous delimiter
848	# (put it on the front of the list)
849	$pre_tag =~ s/(^\|\\|)($delim)($\|\\|)//i; # take it out
850	$pre_tag =~ s/\\|\\|/\\|/i; # replace \|\| with \|
851	$pre_tag = $delim . "\|" . $pre_tag; # put it on the front
852	}
853
854	# get text before next image tag
855	$nt[1] =~ s/<\/?(?:$pre_tag)[^>]>.//is; # get rid of stuff after first delimiter
856
857	# process related text
858	for $n (0..1) {
859	if ( defined($nt[$n]) ) {
860	$nt[$n] =~ s/<.*?>//gs; # strip all html tags
861	$nt[$n] =~ s/\s+/\ /gs; # remove extra whitespace
862	$nt[$n] =~ s/\&\w+\;//sg; # remove   etc
863	# strip part-tags
864	if ( $n == 0 ) { $nt[0] =~ s/^.*>//s }
865	if ( $n == 1 ) { $nt[1] =~ s/<.*$//s }
866	} else { $nt[$n] = ""; } # handle nulls
867	}
868	return @nt;
869	}
870
871	# this function is fall-back
872	# if tags aren't suitable.
873	#
874	# extracts a fixed length of characters
875	# either side of image tag (on word boundary)
876	sub extract_raw_neartext {
877	my ($self, $tag, $textref) = (@_);
878	my ($rawtext, $startpos, $fp);
879	my $imgs = \%{$self->{'imglist'}};
880	($fp) = $tag =~ /([\w\\\/]+\.(?:gif\|jpe?g\|png))/is;
881	if (! $fp) { return " " };
882	# if the cached, plain-text version isn't there, then create it
883	$self->init_plaintext($textref) unless defined($self->{'plaintext'});
884
885	# take the closest maxtext/2 characters
886	# either side of the tag (by word boundary)
887	return "" if ( ! exists $imgs->{$fp}{'rawpos'} );
888	$startpos = $imgs->{$fp}{'rawpos'} - (($self->{'maxtext'} / 2) + 20);
889	if ( $startpos < 0 ) { $startpos = 0 }
890	$rawtext = substr $self->{'plaintext'}, $startpos, $self->{'maxtext'} + 20;
891	$rawtext =~ s/\s\s/ /g;
892
893	return $rawtext;
894	}
895
896	# init plaintext variable for HTML-stripped version
897	# (for full text index/raw assoc text extraction)
898	sub init_plaintext {
899	my ($self, $textref) = (@_);
900	my ($page, $fp);
901	my $imgs = \%{$self->{'imglist'}};
902	$page = $$textref; # make a copy of original
903
904	# strip tags around image filenames so they don't get zapped
905	$page =~ s/<\w+\s+.?([\w\/\\]+\.(?:gif\|jpe?g\|png))[^>]>/\"$1\"/gsi;
906	$page =~ s/<.*?>//gs;
907	$page =~ s/ / /gs;
908	$page =~ s/&/&/gs; #TODO: more &zzz; replacements (except <, $gt;)
909
910	# get positions and strip images
911	while ( $page =~ /([^\s\'\"]+\.(jpe?g\|gif\|png))/ig ) {
912	$fp = $1;
913	if ( $imgs->{$fp}{'exists'} ) {
914	$imgs->{$fp}{'rawpos'} = pos $page;
915	}
916	$page =~ s/\"$fp\"//gs;
917	}
918	$self->{'plaintext'} = $page;
919	}
920
921	# finds and filters images based on size
922	# (dimensions, height, filesize) and existence
923	#
924	# looks for image filenames (.jpg, .gif, etc)
925	# and checks for existence on disk
926	# (hence supports most JavaScript images)
927	sub get_img_list {
928	my $self = shift (@_);
929	my ($textref) = (@_);
930	my ($filepath, $relpath, $abspath, $pos, $num, $width, $height, $filesize);
931	my $imgs = \%{$self->{'imglist'}};
932	while ( $$textref =~ /([^\s\'\"]+\.(jpe?g\|gif\|png))/ig ) {
933	$filepath = $1;
934	$pos = pos $$textref;
935	next if ( $imgs->{$filepath}{'relpath'} );
936	$relpath = $filepath;
937	$relpath =~ s/^http\:\/\///; # remove http:// in case we have mirrored it
938	$relpath =~ s/\\/\//g; # replace \ with /
939	$relpath =~ s/^\.\///s; # make "./filepath" into "filepath"
940	$imgs->{$filepath}{'relpath'} = $relpath;
941	$abspath = "$self->{'htpath'}/$relpath";
942
943	if (! -e $abspath) { next }
944
945	# can't modify real filepath var because it
946	# then can't be located in the page for tag recognition later
947	($width, $height) =
948	`identify $abspath -ping -format "%wx%h"` =~ /^(\d)x(\d)$/m;
949	if (! ($width && $height)) {
950	print STDERR "W3ImgPlug: ($abspath) 'identify' failed. Check ImageMagicK binaries are installed and working correctly\n"; next;
951	}
952	$filesize = (-s $abspath);
953	if ( $filesize >= $self->{'min_img_filesize'}
954	&& ( $width >= $self->{'min_img_width'} )
955	&& ( $height >= $self->{'min_img_height'} ) ) {
956
957	$imgs->{$filepath}{'exists'} = 1;
958	$imgs->{$filepath}{'pos'} = $pos;
959	$imgs->{$filepath}{'width'} = $width;
960	$imgs->{$filepath}{'height'} = $height;
961	$imgs->{$filepath}{'filesize'} = $filesize;
962	} else {
963	print {$self->{'outhandle'}} "W3ImgPlug: skipping $self->{'base_path'}/$relpath: $filesize, $width x $height\n"
964	if $self->{'verbosity'} > 2;
965	}
966	}
967	$num = 0;
968	foreach $i ( keys %{$imgs} ) {
969	if ( $imgs->{$i}{'pos'} ) {
970	$num++;
971	} else { delete $imgs->{$i} }
972	}
973	return $num;
974	}
975
976	# make the text available to the read function
977	# by making it an object variable
978	sub read_file {
979	my ($self, $filename, $encoding, $language, $textref) = @_;
980	$self->SUPER::read_file($filename, $encoding, $language, $textref);
981
982	# if HTMLplug has run through, then it will
983	# have replaced references so we have to
984	# make a copy of the text before processing
985	if ( $self->{'index_pages'} ) {
986	$self->{'text'} = $$textref;
987	$self->{'textref'} = \($self->{'text'});
988	} else {
989	$self->{'textref'} = $textref;
990	}
991	$self->{'plaintext'} = undef;
992	}
993
994	# HTMLPlug only extracts meta-data if it is specified in plugin options
995	# hence a special function to do it here
996	sub get_meta_value {
997	my ($self, $name, $textref) = @_;
998	my ($value);
999	$name = lc $name;
1000	if ($name eq "title") {
1001	($value) = $$textref =~ /<title>(.*?)<\/title>/is
1002	} else {
1003	my $qm = "(?:\"\|\')";
1004	($value) = $$textref =~ /<meta name\s=\s$qm?$name$qm?\s+content\s=\s$qm?(.?)$qm?\s>/is
1005	}
1006	$value = "" unless $value;
1007	return $value;
1008	}
1009
1010	# make filename an anchor reference
1011	# so we can go straight to the image
1012	# within the cached version of the source page
1013	# (augment's HTMLPlug sub)
1014	sub replace_images {
1015	my $self = shift (@_);
1016	my ($front, $link, $back, $base_dir,
1017	$file, $doc_obj, $section) = @_;
1018	$link =~ s/\"//g;
1019	my ($a_name) = $link;
1020	$a_name =~ s/[\/\\\:\&]/_/g;
1021	# keep a list so we don't repeat the same anchor
1022	if ( ! $self->{'imglist'}{$link}{'anchored'} ) {
1023	$front = "<a name=\"gsdl_$a_name\">$front";
1024	$back = "$back</a>";
1025	$self->{'imglist'}{$link}{'anchored'} = 1;
1026	}
1027	return $self->SUPER::replace_images($front, $link, $back, $base_dir,
1028	$file, $doc_obj, $section);
1029	}
1030
1031	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: