1 | ###########################################################################
|
---|
2 | #
|
---|
3 | # W3ImgPlug.pm -- Context-based image indexing plugin for HTML documents
|
---|
4 | #
|
---|
5 | # A component of the Greenstone digital library software
|
---|
6 | # from the New Zealand Digital Library Project at the
|
---|
7 | # University of Waikato, New Zealand.
|
---|
8 | #
|
---|
9 | # Copyright (C) 2001 New Zealand Digital Library Project
|
---|
10 | #
|
---|
11 | # This program is free software; you can redistribute it and/or modify
|
---|
12 | # it under the terms of the GNU General Public License as published by
|
---|
13 | # the Free Software Foundation; either version 2 of the License, or
|
---|
14 | # (at your option) any later version.
|
---|
15 | #
|
---|
16 | # This program is distributed in the hope that it will be useful,
|
---|
17 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
|
---|
18 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
---|
19 | # GNU General Public License for more details.
|
---|
20 | #
|
---|
21 | # You should have received a copy of the GNU General Public License
|
---|
22 | # along with this program; if not, write to the Free Software
|
---|
23 | # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
---|
24 | #
|
---|
25 | ###########################################################################
|
---|
26 |
|
---|
27 | # DESCRIPTION:
|
---|
28 | #
|
---|
29 | # Extracts images and associated text and metadata from
|
---|
30 | # web pages as individual documents for indexing. Thumbnails
|
---|
31 | # are created from each image for browsing purposes.
|
---|
32 | #
|
---|
33 | # Options are available for configuring the aggressiveness of the
|
---|
34 | # associated text extraction mechanisms. A higher level of
|
---|
35 | # aggressiveness will extract more text and consequently
|
---|
36 | # may mean lower accuracy (precision); however, it may also
|
---|
37 | # retrieve more of the relevant images from the collection (recall).
|
---|
38 | # Lower levels of aggressiveness maybe result in slightly faster
|
---|
39 | # collection builds at the import stage.
|
---|
40 | #
|
---|
41 | # W3ImgPlug is a subclass of HTMLPlug (i.e. it will index pages also
|
---|
42 | # if required). It can be used in place of HTMLPlug to index both
|
---|
43 | # pages and their images.
|
---|
44 | #
|
---|
45 | # REQUIREMENTS:
|
---|
46 | #
|
---|
47 | # The ImageMagick image manipulation is used to create
|
---|
48 | # thumbnails and extract some image metadata. (Available
|
---|
49 | # from http://www.imagemagick.org/)
|
---|
50 | #
|
---|
51 | # Unix:
|
---|
52 | # Many Linux distributions contain ImageMagick.
|
---|
53 | #
|
---|
54 | # Windows:
|
---|
55 | # ImageMagick can be downloaded from the website above.
|
---|
56 | # Make sure the system path includes the ImageMagick binaries
|
---|
57 | # before using W3ImgPlug.
|
---|
58 | #
|
---|
59 | # NOTE: NT/2000/XP contain a filesystem utility 'convert.exe'
|
---|
60 | # with the same name as the image conversion utility. The
|
---|
61 | # ImageMagick FAQ recommends renaming the filesystem
|
---|
62 | # utility (e.g. to 'fsconvert.exe') to avoid this clash.
|
---|
63 | #
|
---|
64 | # USAGE:
|
---|
65 | #
|
---|
66 | # An image document consists of metadata elements:
|
---|
67 | #
|
---|
68 | # OriginalFilename, FilePath, Filename, FileExt, FileSize,
|
---|
69 | # Width, Height, URL, PageURL, ThumbURL, CacheURL, CachePageURL
|
---|
70 | # ImageText, PageTitle
|
---|
71 | #
|
---|
72 | # Most of these are only useful in format strings (e.g. ThumbURL,
|
---|
73 | # Filename, URL, PageURL, CachePageURL).
|
---|
74 | #
|
---|
75 | # ImageText, as the name suggests contains the indexable text.
|
---|
76 | # (unless using the -document_text plugin option)
|
---|
77 | #
|
---|
78 | # Since image documents are made up of metadata elements
|
---|
79 | # alone, format strings are needed to display them properly.
|
---|
80 | # NOTE: The receptionist will only display results (e.g. thumbnails)
|
---|
81 | # in 4 columns if the format string begins with "<td><table>".
|
---|
82 | #
|
---|
83 | # The example below takes the user to the image within the
|
---|
84 | # source HTML document rather than using a format string
|
---|
85 | # on DocumentText to display the image document itself.
|
---|
86 | #
|
---|
87 | # Example collect.cfg:
|
---|
88 | #
|
---|
89 | # ...
|
---|
90 | #
|
---|
91 | # indexes document:ImageText document:text
|
---|
92 | # defaultindex document:ImageText
|
---|
93 | #
|
---|
94 | # collectionmeta .document:ImageText "images"
|
---|
95 | # collectionmeta .document:text "documents"
|
---|
96 | #
|
---|
97 | # ...
|
---|
98 | #
|
---|
99 | # plugin W3ImgPlug -index_pages -aggressiveness 6
|
---|
100 | #
|
---|
101 | # ...
|
---|
102 | #
|
---|
103 | # format SearchVList '<td>{If}{[Title],[link][icon] [Title][[/link],
|
---|
104 | # <table><tr><td align="center"><a href="[CachePageURL]">
|
---|
105 | # <img src="[ThumbURL]"></a></td></tr><tr><td align="center">
|
---|
106 | # <a href="[CachePageURL]"><font size="-1">[OriginalFilename]</font></a>
|
---|
107 | # <br>[Width]x[Height]</td></tr></table>}</td>'
|
---|
108 | #
|
---|
109 | # ...
|
---|
110 | #
|
---|
111 |
|
---|
112 | package W3ImgPlug;
|
---|
113 |
|
---|
114 | use HTMLPlug;
|
---|
115 | use ghtml;
|
---|
116 | use unicode;
|
---|
117 | use util;
|
---|
118 | use parsargv;
|
---|
119 | use strict 'subs';
|
---|
120 |
|
---|
121 | sub BEGIN {
|
---|
122 | @ISA = qw( HTMLPlug );
|
---|
123 | }
|
---|
124 |
|
---|
125 | my $aggressiveness_list =
|
---|
126 | [ { 'name' => "1",
|
---|
127 | 'desc' => "{W3ImgPlug.aggressiveness.1}" },
|
---|
128 | { 'name' => "2",
|
---|
129 | 'desc' => "{W3ImgPlug.aggressiveness.2}" },
|
---|
130 | { 'name' => "3",
|
---|
131 | 'desc' => "{W3ImgPlug.aggressiveness.3}" },
|
---|
132 | { 'name' => "4",
|
---|
133 | 'desc' => "{W3ImgPlug.aggressiveness.4}" },
|
---|
134 | { 'name' => "5",
|
---|
135 | 'desc' => "{W3ImgPlug.aggressiveness.5}" },
|
---|
136 | { 'name' => "6",
|
---|
137 | 'desc' => "{W3ImgPlug.aggressiveness.6}" },
|
---|
138 | { 'name' => "7",
|
---|
139 | 'desc' => "{W3ImgPlug.aggressiveness.7}" },
|
---|
140 | { 'name' => "8",
|
---|
141 | 'desc' => "{W3ImgPlug.aggressiveness.8}" },
|
---|
142 | { 'name' => "9",
|
---|
143 | 'desc' => "{W3ImgPlug.aggressiveness.9}" } ];
|
---|
144 |
|
---|
145 | my $arguments =
|
---|
146 | [ { 'name' => "aggressiveness",
|
---|
147 | 'desc' => "{W3ImgPlug.aggressiveness}",
|
---|
148 | 'type' => "int",
|
---|
149 | 'list' => $aggressiveness_list,
|
---|
150 | 'deft' => "3",
|
---|
151 | 'reqd' => "no" },
|
---|
152 | { 'name' => "index_pages",
|
---|
153 | 'desc' => "{W3ImgPlug.index_pages}",
|
---|
154 | 'type' => "flag",
|
---|
155 | 'reqd' => "no" },
|
---|
156 | { 'name' => "no_cache_images",
|
---|
157 | 'desc' => "{W3ImgPlug.no_cache_images}",
|
---|
158 | 'type' => "flag",
|
---|
159 | 'reqd' => "no" },
|
---|
160 | { 'name' => "min_size",
|
---|
161 | 'desc' => "{W3ImgPlug.min_size}",
|
---|
162 | 'type' => "int",
|
---|
163 | 'deft' => "2000",
|
---|
164 | 'reqd' => "no" },
|
---|
165 | { 'name' => "min_width",
|
---|
166 | 'desc' => "{W3ImgPlug.min_width}",
|
---|
167 | 'type' => "int",
|
---|
168 | 'deft' => "50",
|
---|
169 | 'reqd' => "no" },
|
---|
170 | { 'name' => "min_height",
|
---|
171 | 'desc' => "{W3ImgPlug.min_height}",
|
---|
172 | 'type' => "int",
|
---|
173 | 'deft' => "50",
|
---|
174 | 'reqd' => "no" },
|
---|
175 | { 'name' => "thumb_size",
|
---|
176 | 'desc' => "{W3ImgPlug.thumb_size}",
|
---|
177 | 'type' => "int",
|
---|
178 | 'deft' => "100",
|
---|
179 | 'reqd' => "no" },
|
---|
180 | { 'name' => "convert_params",
|
---|
181 | 'desc' => "{W3ImgPlug.convert_params}",
|
---|
182 | 'type' => "string",
|
---|
183 | 'deft' => "",
|
---|
184 | 'reqd' => "no" },
|
---|
185 | { 'name' => "min_near_text",
|
---|
186 | 'desc' => "{W3ImgPlug.min_near_text}",
|
---|
187 | 'type' => "int",
|
---|
188 | 'deft' => "10",
|
---|
189 | 'reqd' => "no" },
|
---|
190 | { 'name' => "max_near_text",
|
---|
191 | 'desc' => "{W3ImgPlug.max_near_text}",
|
---|
192 | 'type' => "int",
|
---|
193 | 'deft' => "400",
|
---|
194 | 'reqd' => "no" },
|
---|
195 | { 'name' => "smallpage_threshold",
|
---|
196 | 'desc' => "{W3ImgPlug.smallpage_threshold}",
|
---|
197 | 'type' => "int",
|
---|
198 | 'deft' => "2048",
|
---|
199 | 'reqd' => "no" },
|
---|
200 | { 'name' => "textrefs_threshold",
|
---|
201 | 'desc' => "{W3ImgPlug.textrefs_threshold}",
|
---|
202 | 'type' => "int",
|
---|
203 | 'deft' => "2",
|
---|
204 | 'reqd' => "no" },
|
---|
205 | { 'name' => "caption_length",
|
---|
206 | 'desc' => "{W3ImgPlug.caption_length}",
|
---|
207 | 'type' => "int",
|
---|
208 | 'deft' => "80",
|
---|
209 | 'reqd' => "no" },
|
---|
210 | { 'name' => "neartext_length",
|
---|
211 | 'desc' => "{W3ImgPlug.neartext_length}",
|
---|
212 | 'type' => "int",
|
---|
213 | 'deft' => "300",
|
---|
214 | 'reqd' => "no" },
|
---|
215 | { 'name' => "document_text",
|
---|
216 | 'desc' => "{W3ImgPlug.document_text}",
|
---|
217 | 'type' => "flag",
|
---|
218 | 'reqd' => "no" } ];
|
---|
219 |
|
---|
220 | my $options = { 'name' => "W3ImgPlug",
|
---|
221 | 'desc' => "{W3ImgPlug.desc}",
|
---|
222 | 'abstract' => "no",
|
---|
223 | 'inherits' => "yes",
|
---|
224 | 'args' => $arguments };
|
---|
225 |
|
---|
226 |
|
---|
227 | # sub print_usage {
|
---|
228 | # print STDERR "\nUsage: plugin W3ImgPlug [options]\n\n";
|
---|
229 | # print STDERR " options:\n";
|
---|
230 | # print STDERR " -aggressiveness Range of related text extraction techniques to use [4]\n";
|
---|
231 | # print STDERR " 1: Filename, path, ALT text only\n";
|
---|
232 | # print STDERR " 2: All of 1, plus caption where available\n";
|
---|
233 | # print STDERR " 3: All of 2, plus near paragraphs where available\n";
|
---|
234 | # print STDERR " 4: All of 3, plus previous headers (<h1>, <h2>...)\n";
|
---|
235 | # print STDERR " where available\n";
|
---|
236 | # print STDERR " 5: All of 4, plus textual references where available\n";
|
---|
237 | # print STDERR " 6: All of 4, plus page metatags (title, keywords, etc)\n";
|
---|
238 | # print STDERR " 7: All of 6, 5 and 4 combined\n";
|
---|
239 | # print STDERR " 8: All of 7, plus repeat caption, filename, etc (raise \n";
|
---|
240 | # print STDERR " ranking of more relevant results)\n";
|
---|
241 | # print STDERR " 10: All of 1, plus full text of source page\n";
|
---|
242 | # print STDERR "\n";
|
---|
243 | # print STDERR " -no_cache_images Don't cache images (point to URL of original)\n";
|
---|
244 | # print STDERR " -index_pages Index the pages along with the images.\n";
|
---|
245 | # print STDERR " Otherwise reference the pages at the source URL\n";
|
---|
246 | # print STDERR " -min_size Bytes. Skip images smaller than this [2000]\n";
|
---|
247 | # print STDERR " -min_width Pixels. Skip images narrower than this [50 pixels]\n";
|
---|
248 | # print STDERR " -min_height Pixels. Skip images shorter than this [50 pixels]\n";
|
---|
249 | # print STDERR " -thumb_size Max thumbnail size. Both width and height [100 pixels]\n";
|
---|
250 | # print STDERR " -convert_params Additional parameters for ImageMagicK convert on\n";
|
---|
251 | # print STDERR " thumbnail creation. For example, '-raise' will give\n";
|
---|
252 | # print STDERR " a three dimensional effect to thumbnail images.\n";
|
---|
253 | # print STDERR " -document_text Add image text as document:text (otherwise IndexedText\n";
|
---|
254 | # print STDERR " metadata field)\n";
|
---|
255 | # print STDERR "\n";
|
---|
256 | # print STDERR " Advanced Options (applicability depends on aggressiveness level)\n";
|
---|
257 | # print STDERR " -smallpage_threshold Images on pages smaller than this (bytes) will have\n";
|
---|
258 | # print STDERR " the page (title, keywords, etc) meta-data added [2048]\n";
|
---|
259 | # print STDERR " -textrefs_threshold Threshold for textual references. Lower values mean\n";
|
---|
260 | # print STDERR " the algorithm is less strict [2]\n";
|
---|
261 | # print STDERR " -caption_length Maximum length of captions (in characters) [100]\n";
|
---|
262 | # print STDERR " -neartext_length Target length of near text (in characters) [300]\n";
|
---|
263 | # print STDERR " -max_near_text Maximum characters near images to extract [400]\n";
|
---|
264 | # print STDERR " -min_near_text Minimum characters of near text or caption to extract [10]\n";
|
---|
265 | # print STDERR "\n";
|
---|
266 | # print STDERR " Tag set configuration file (XML format):\n";
|
---|
267 | # print STDERR " <collectionpath>/etc/W3ImgPlug.cfg \n";
|
---|
268 | # print STDERR "\n";
|
---|
269 | # print STDERR "\n";
|
---|
270 | # print STDERR "W3ImgPlug inherits all of HTMLPlug's functionality and options:\n";
|
---|
271 | # HTMLPlug::print_usage();
|
---|
272 | # print STDERR "\n";
|
---|
273 | # }
|
---|
274 |
|
---|
275 | sub new {
|
---|
276 | my $class = shift (@_);
|
---|
277 | my $self = new HTMLPlug ($class, @_);
|
---|
278 | $self->{'plugin_type'} = "W3ImgPlug";
|
---|
279 | # 14-05-02 To allow for proper inheritance of arguments - John Thompson
|
---|
280 | my $option_list = $self->{'option_list'};
|
---|
281 | push( @{$option_list}, $options );
|
---|
282 |
|
---|
283 | if (!parsargv::parse(\@_,
|
---|
284 | q^aggressiveness/\d/3^, \$self->{'aggressiveness'},
|
---|
285 | q^index_pages^, \$self->{'index_pages'},
|
---|
286 | q^no_cache_images^, \$self->{'no_cache_images'},
|
---|
287 | q^min_size/\d*/2000^, \$self->{'min_img_filesize'},
|
---|
288 | q^min_width/\d*/50^, \$self->{'min_img_width'},
|
---|
289 | q^min_height/\d*/50^, \$self->{'min_img_height'},
|
---|
290 | q^thumb_size/\d*/100^, \$self->{'thumbnail_size'},
|
---|
291 | q^convert_params/.*/ ^, \$self->{'img_convert_param'},
|
---|
292 | q^max_near_text/\d*/400^, \$self->{'maxtext'},
|
---|
293 | q^min_near_text/\d*/10^, \$self->{'mintext'},
|
---|
294 | q^smallpage_threshold/\d*/2048^, \$self->{'smallpage_threshold'},
|
---|
295 | q^textrefs_threshold/\d*/2^, \$self->{'textref_threshold'},
|
---|
296 | q^caption_length/\d*/80^, \$self->{'caption_len'},
|
---|
297 | q^neartext_length/\d*/300^, \$self->{'neartext_len'},
|
---|
298 | q^document_text^, \$self->{'document_text'},
|
---|
299 | "allow_extra_options"
|
---|
300 | )) {
|
---|
301 |
|
---|
302 | print STDERR "\nIncorrect options passed to W3ImgPlug, check your collect.cfg configuration file\n";
|
---|
303 | $self->print_txt_usage(""); # Use default resource bundle
|
---|
304 | die "\n";
|
---|
305 | }
|
---|
306 |
|
---|
307 | # init class variables
|
---|
308 | $self->{'textref'} = undef; # init by read_file fn
|
---|
309 | $self->{'htdoc_obj'} = undef; # init by process fn
|
---|
310 | $self->{'htpath'} = undef; # init by process fn
|
---|
311 | $self->{'hturl'} = undef; # init by process fn
|
---|
312 | $self->{'plaintext'} = undef; # HTML stripped version - only init if needed by raw_neartext sub
|
---|
313 | $self->{'smallpage'} = 0; # set by process fn
|
---|
314 | $self->{'images_indexed'} = undef; # num of images indexed - if 1 or 2 then we know page is small
|
---|
315 | $self->{'initialised'} = undef; # flag (see set_extraction_options())
|
---|
316 |
|
---|
317 | return bless $self, $class;
|
---|
318 | }
|
---|
319 |
|
---|
320 | # if indexing pages, let HTMLPlug do it's stuff
|
---|
321 | # image extraction done through read()
|
---|
322 | sub process {
|
---|
323 | my ($self, $textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
|
---|
324 | $self->{'imglist'} = ();
|
---|
325 | if ( $self->{'index_pages'} ) {
|
---|
326 | my $ok = $self->SUPER::process($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj);
|
---|
327 | if ( ! $ok ) { return $ok }
|
---|
328 | $self->{'htdoc_obj'} = $doc_obj;
|
---|
329 | }
|
---|
330 | # else use URL for referencing
|
---|
331 | #if ( $file =~ /(.*)[\/\\]/ ) { $self->{'htpath'} = $1; } else { $self->{'htpath'} = $file; }
|
---|
332 |
|
---|
333 | $self->{'htpath'} = $base_dir if (-d $base_dir);
|
---|
334 | if ( $file =~ /(.*)[\/\\]/ ) { $self->{'htpath'} .= "/$1"; }
|
---|
335 | $self->{'htpath'} =~ s/\\/\//g; # replace \ with /
|
---|
336 |
|
---|
337 | $self->{'hturl'} = "http://$file";
|
---|
338 | $self->{'hturl'} =~ s/\\/\//g; # for windows
|
---|
339 | ($self->{'filename'}) = $file =~ /.*[\/\\](.*)/;
|
---|
340 | ($self->{'base_path'}) = $file =~ /(.*)[\/\\]/i;
|
---|
341 | if ( ( -s "$base_dir/$file") <= $self->{'smallpage_threshold'} ) {
|
---|
342 | $self->{'smallpage'} = 1;
|
---|
343 | } else { $self->{'smallpage'} = 0; }
|
---|
344 |
|
---|
345 | if ( defined($self->{'initialised'}) ) { return 1; }
|
---|
346 | else {
|
---|
347 | $self->{'initialised'} = $self->set_extraction_options($base_dir =~ /^(.*?)\/import/i);
|
---|
348 | return $self->{'initialised'};
|
---|
349 | }
|
---|
350 | }
|
---|
351 |
|
---|
352 | # get complex configuration options from configuration files
|
---|
353 | # -- $GSDLCOLLECTION/etc/W3ImgPlug.cfg (tag sets for aggr 2+)
|
---|
354 | # -- $GSDLHOME/etc/packages/phind/stopword/en/brown.sw (stopwords for aggr 5+)
|
---|
355 |
|
---|
356 | # If there's no W3ImgPlug.cfg file we'll use the following default values
|
---|
357 | my $defaultcfg = '
|
---|
358 | <delimitertagset>
|
---|
359 | <setname>Caption</setname>
|
---|
360 | <taggroup>font</taggroup>
|
---|
361 | <taggroup>tt</taggroup>
|
---|
362 | <taggroup>small</taggroup>
|
---|
363 | <taggroup>b</taggroup>
|
---|
364 | <taggroup>i</taggroup>
|
---|
365 | <taggroup>u</taggroup>
|
---|
366 | <taggroup>em</taggroup>
|
---|
367 | <taggroup>td</taggroup>
|
---|
368 | <taggroup>li</taggroup>
|
---|
369 | <taggroup>a</taggroup>
|
---|
370 | <taggroup>p</taggroup>
|
---|
371 | <taggroup>tr</taggroup>
|
---|
372 | <taggroup>center</taggroup>
|
---|
373 | <taggroup>div</taggroup>
|
---|
374 | <taggroup>caption</taggroup>
|
---|
375 | <taggroup>br</taggroup>
|
---|
376 | <taggroup>ul</taggroup>
|
---|
377 | <taggroup>ol</taggroup>
|
---|
378 | <taggroup>table</taggroup>
|
---|
379 | <taggroup>hr</taggroup>
|
---|
380 | </delimitertagset>
|
---|
381 |
|
---|
382 | <delimitertagset>
|
---|
383 | <setname>Neartext</setname>
|
---|
384 | <taggroup>tr|hr|table|h\d|img|body</taggroup>
|
---|
385 | <taggroup>td|tr|hr|table|h\d|img|body</taggroup>
|
---|
386 | <taggroup>p|br|td|tr|hr|table|h\d|img|body</taggroup>
|
---|
387 | <taggroup>font|p|i|b|em|img</taggroup>
|
---|
388 | </delimitertagset>
|
---|
389 | ';
|
---|
390 |
|
---|
391 | sub set_extraction_options() {
|
---|
392 | my ($self, $collpath) = @_;
|
---|
393 | my ($filepath);
|
---|
394 |
|
---|
395 | print {$self->{'outhandle'}} "W3ImgPlug: Initialising\n"
|
---|
396 | if $self->{'verbosity'} > 1;
|
---|
397 | # etc/W3ImgPlug.cfg (XML)
|
---|
398 | # tag sets for captions and neartext
|
---|
399 | if ( $self->{'aggressiveness'} > 1 && $self->{'aggressiveness'} != 9 ) {
|
---|
400 | $self->{'delims'} = [];
|
---|
401 | $self->{'cdelims'} = [];
|
---|
402 | my ($cfg, @tagsets, $tagset, $type, @delims);
|
---|
403 |
|
---|
404 | $filepath = "$collpath/etc/W3ImgPlug.cfg";
|
---|
405 | if ( open CFG, "<$filepath" ) {
|
---|
406 | while (<CFG>) { $cfg .= $_ }
|
---|
407 | close CFG;
|
---|
408 | } else {
|
---|
409 | $cfg = $defaultcfg;
|
---|
410 | }
|
---|
411 |
|
---|
412 | (@tagsets) =
|
---|
413 | $cfg =~ /<delimitertagset>(.*?)<\/delimitertagset>/igs;
|
---|
414 | foreach $tagset ( @tagsets ) {
|
---|
415 | ($type) = $tagset =~ /<setname>(.*?)<\/setname>/i;
|
---|
416 | if ( lc($type) eq "caption" ) {
|
---|
417 | (@{$self->{'cdelims'}}) = $tagset =~ /<taggroup>(.*?)<\/taggroup>/igs;
|
---|
418 | }
|
---|
419 | elsif ( lc($type) eq "neartext" ) {
|
---|
420 | (@{$self->{'delims'}}) = $tagset =~ /<taggroup>(.*?)<\/taggroup>/igs;
|
---|
421 | }
|
---|
422 | }
|
---|
423 |
|
---|
424 | # output a warning if there seem to be no delimiters
|
---|
425 | if ( scalar(@{$self->{'cdelims'}} == 0)) {
|
---|
426 | print {$self->{'outhandle'}} "W3ImgPlug: Warning: no caption delimiters found in $filepath\n";
|
---|
427 | }
|
---|
428 | if ( scalar(@{$self->{'delims'}} == 0)) {
|
---|
429 | print {$self->{'outhandle'}} "W3ImgPlug: Warning: no neartext delimiters found in $filepath\n";
|
---|
430 | }
|
---|
431 | }
|
---|
432 |
|
---|
433 | # get stop words for textual reference extraction
|
---|
434 | # TODO: warnings scroll off. Would be best to output them again at end of import
|
---|
435 | if ( $self->{'aggressiveness'} >=5 && $self->{'aggressiveness'} != 9 ) {
|
---|
436 | $self->{'stopwords'} = ();
|
---|
437 | $filepath = &util::filename_cat($ENV{'GSDLHOME'}, "etc", "packages", "phind", "stopword", "en", "brown.sw");
|
---|
438 | if ( open STOPWORDS, "<$filepath" ) {
|
---|
439 | while ( <STOPWORDS> ) {
|
---|
440 | chomp;
|
---|
441 | $self->{'stopwords'}{$_} = 1;
|
---|
442 | }
|
---|
443 | close STOPWORDS;
|
---|
444 | } else {
|
---|
445 | print {$self->{'outhandle'}} "W3ImgPlug: Warning: couldn't open stopwords file at $filepath ($!)\n";
|
---|
446 | }
|
---|
447 |
|
---|
448 | }
|
---|
449 |
|
---|
450 | if ( $self->{'neartext_len'} > $self->{'maxtext'} ) {
|
---|
451 | $self->{'maxtext'} = $self->{'neartext_len'} * 1.33;
|
---|
452 | print {$self->{'outhandle'}} "W3ImgPlug: Warning: adjusted max_text to $self->{'maxtext'}\n";
|
---|
453 | }
|
---|
454 | if ( $self->{'caption_len'} > $self->{'maxtext'} ) {
|
---|
455 | $self->{'maxtext'} = $self->{'caption_len'} * 1.33;
|
---|
456 | print {$self->{'outhandle'}} "W3ImgPlug: Warning: adjusted max_text to $self->{'maxtext'}\n";
|
---|
457 | }
|
---|
458 |
|
---|
459 | return 1;
|
---|
460 | }
|
---|
461 |
|
---|
462 | # return number of files processed, undef if can't process
|
---|
463 | # Note that $base_dir might be "" and that $file might
|
---|
464 | # include directories
|
---|
465 | sub read {
|
---|
466 | my ($self, $pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs) = (@_);
|
---|
467 | my ($doc_obj, $section, $filepath, $imgtag, $pos, $context, $numdocs, $tndir, $imgs);
|
---|
468 | # forward normal read (runs HTMLPlug if index_pages T)
|
---|
469 | my $ok = $self->SUPER::read($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs);
|
---|
470 | if ( ! $ok ) { return $ok }
|
---|
471 |
|
---|
472 | my $outhandle = $self->{'outhandle'};
|
---|
473 | my $textref = $self->{'textref'};
|
---|
474 | my $htdoc_obj = $self->{'htdoc_obj'};
|
---|
475 | $numdocs = 0;
|
---|
476 | $base_dir =~ /(.*)\/.*/;
|
---|
477 | $tndir = "$1/archives/thumbnails"; # TODO: this path shouldn't be hardcoded?
|
---|
478 | &util::mk_all_dir($tndir) unless -e "$tndir";
|
---|
479 |
|
---|
480 | $imgs = \%{$self->{'imglist'}};
|
---|
481 | my $nimgs = $self->get_img_list($textref);
|
---|
482 | $self->{'images_indexed'} = $nimgs;
|
---|
483 | if ( $nimgs > 0 ) {
|
---|
484 | my @fplist = (sort { $imgs->{$a}{'pos'} <=> $imgs->{$b}{'pos'} } keys %{$imgs});
|
---|
485 | my $i = 0;
|
---|
486 | foreach $filepath ( @fplist ) {
|
---|
487 | $pos = $imgs->{$filepath}{'pos'};
|
---|
488 | $context = substr ($$textref, $pos - 50, $pos + 50); # grab context (quicker)
|
---|
489 | ($imgtag) = ($context =~ /(<(?:img|a|body)\s[^>]*$filepath[^>]*>)/is );
|
---|
490 | if (! defined($imgtag)) { $imgtag = $filepath }
|
---|
491 | print $outhandle "W3ImgPlug: extracting $filepath\n"
|
---|
492 | if ( $self->{'verbosity'} > 1 );
|
---|
493 | $doc_obj = new doc ("", "indexed_doc");
|
---|
494 | $section = $doc_obj->get_top_section();
|
---|
495 | $prevpos = ( $i == 0 ? 0 : $imgs->{$fplist[$i - 1]}{'pos'});
|
---|
496 | $nextpos = ( $i >= ($nimgs -1) ? -1 : $imgs->{$fplist[$i + 1]}{'pos'} );
|
---|
497 |
|
---|
498 | $self->extract_image_info($imgtag, $filepath, $textref, $doc_obj, $section, $tndir, $prevpos, $nextpos);
|
---|
499 | $processor->process($doc_obj);
|
---|
500 | $numdocs++;
|
---|
501 | $i++;
|
---|
502 | }
|
---|
503 | return $numdocs;
|
---|
504 | } else {
|
---|
505 | print $outhandle "W3ImgPlug: No images from $file indexed\n"
|
---|
506 | if ( $self->{'verbosity'} > 2 );
|
---|
507 | return 1;
|
---|
508 | }
|
---|
509 |
|
---|
510 | }
|
---|
511 |
|
---|
512 | # for every valid image tag
|
---|
513 | # 1. extract related text and image metadata
|
---|
514 | # 2. add this as document meta-data
|
---|
515 | # 3. add assoc image(s) as files
|
---|
516 | #
|
---|
517 | sub extract_image_info {
|
---|
518 | my $self = shift (@_);
|
---|
519 | my ($tag, $id, $textref, $doc_obj, $section, $tndir, $prevpos, $nextpos) = (@_);
|
---|
520 | my ($filename, $orig_fp, $fn, $ext, $reltext, $relreltext, $crcid, $imgs,
|
---|
521 | $thumbfp, $pagetitle, $alttext, $filepath, $aggr);
|
---|
522 | $aggr = $self->{'aggressiveness'};
|
---|
523 | $imgs = \%{$self->{'imglist'}};
|
---|
524 | $filepath = $imgs->{$id}{'relpath'};
|
---|
525 | ($filename) = $filepath =~ /([^\/\\]+)$/s;
|
---|
526 | ($orig_fp) = "$self->{'base_path'}/$filepath";
|
---|
527 | $orig_fp =~ tr/+/ /;
|
---|
528 | $orig_fp =~ s/%([a-fA-F0-9][a-fA-F0-9])/pack("C", hex($1))/eg; # translate %2E to space, etc
|
---|
529 | $orig_fp =~ s/\\/\//g;
|
---|
530 | $filepath = "$self->{'htpath'}/$filepath";
|
---|
531 | ($onlyfn) = $filename =~ /([^\\\/]*)$/;
|
---|
532 | ($fn, $ext) = $onlyfn =~ /(.*)\.(.*)/;
|
---|
533 | $fn = lc $fn; $ext = lc $ext;
|
---|
534 | ($reltext) = "<tr><td>GifComment</td><td>" . `identify $filepath -ping -format "%c"` . "</td></tr>\n"
|
---|
535 | if ($ext eq "gif");
|
---|
536 | $reltext .= "<tr><td>FilePath</td><td>$orig_fp</td></tr>\n";
|
---|
537 |
|
---|
538 | if ($ENV{'GSDLOS'} =~ /^windows$/i) {
|
---|
539 | $crcid = "$fn.$ext." . $self->{'next_crcid'}++;
|
---|
540 | } else { ($crcid) = `cksum $filepath` =~ /^(\d+)/; }
|
---|
541 | $thumbfp = "$tndir/tn_$crcid.jpg";
|
---|
542 | `convert -flatten -filter Hanning $self->{'img_convert_param'} -geometry "$self->{'thumbnail_size'}x$self->{'thumbnail_size'}>" $filepath $thumbfp` unless -e $thumbfp;
|
---|
543 | if ( ! (-e $thumbfp) ) {
|
---|
544 | print STDERR "W3ImgPlug: 'convert' failed. Check ImageMagicK binaries are installed and working correctly\n"; return 0;
|
---|
545 | }
|
---|
546 |
|
---|
547 | # shove in full text (tag stripped or unstripped) if settings require it
|
---|
548 | if ( $aggr == 10) {
|
---|
549 | $reltext = "<tr><td>AllPage</td><td>" . $$textref . "</td><tr>\n"; # level 10 (all text, verbatim)
|
---|
550 | } else {
|
---|
551 | $pagetitle = $self->get_meta_value("title", $textref);
|
---|
552 | ($alttext) = $tag =~ /\salt\s*=\s*(?:\"|\')(.+?)(?:\"|\')/is;
|
---|
553 | if ( defined($alttext) && length($alttext) > 1) {
|
---|
554 | $reltext .= "<tr><td>ALTtext</td><td>$alttext</td></tr>\n"; }
|
---|
555 | $reltext .= "<tr><td>SplitCapitalisation</td><td>" .
|
---|
556 | $self->split_filepath($orig_fp) . "</td></tr>\n";
|
---|
557 |
|
---|
558 | # get caption/tag based near text (if appropriate)
|
---|
559 | if ( $aggr > 1 ) {
|
---|
560 | if ( $aggr >= 2 ) {
|
---|
561 | $reltext .=
|
---|
562 | $self->extract_caption_text($tag, $textref, $prevpos, $imgs->{$id}{'pos'}, $nextpos);
|
---|
563 | $relreltext = $reltext;
|
---|
564 | }
|
---|
565 | # repeat the filepath, alt-text, caption, etc
|
---|
566 | if ( $aggr == 8 ) {
|
---|
567 | $reltext .= $relreltext;
|
---|
568 | }
|
---|
569 | if ( $aggr >= 3 ) {
|
---|
570 | $reltext .=
|
---|
571 | $self->extract_near_text($tag, $textref, $prevpos, $imgs->{$id}{'pos'}, $nextpos);
|
---|
572 | }
|
---|
573 |
|
---|
574 | # get page metadata (if appropriate)
|
---|
575 | if ( $aggr >= 6 || ( $aggr >= 2 &&
|
---|
576 | ( $self->{'images_indexed'} < 2 ||
|
---|
577 | ($self->{'smallpage'} == 1 && $self->{'images_indexed'} < 6 )))) {
|
---|
578 | $reltext .= $self->get_page_metadata($textref);
|
---|
579 | }
|
---|
580 | # textual references
|
---|
581 | if ( $aggr == 5 || $aggr >= 7) {
|
---|
582 | if ( length($relreltext) > ($self->{'caption_len'} * 2) ) {
|
---|
583 | $reltext .= $self->get_textrefs($relreltext, $textref, $prevpos, $imgs->{$id}{'pos'}, $nextpos); }
|
---|
584 | else {
|
---|
585 | $reltext .= $self->get_textrefs($reltext, $textref, $prevpos, $imgs->{$id}{'pos'}, $nextpos);
|
---|
586 | }
|
---|
587 | }
|
---|
588 | } # aggr > 1
|
---|
589 | } # aggr != 10
|
---|
590 |
|
---|
591 | $doc_obj->set_OID($crcid);
|
---|
592 | $doc_obj->associate_file($thumbfp, "$fn.thumb.jpg", undef, $section);
|
---|
593 | $doc_obj->add_metadata($section, "OriginalFilename", $filename);
|
---|
594 | $doc_obj->add_metadata($section, "FilePath", $orig_fp);
|
---|
595 | $doc_obj->add_metadata($section, "Filename", $fn);
|
---|
596 | $doc_obj->add_metadata($section, "FileExt", $ext);
|
---|
597 | $doc_obj->add_metadata($section, "FileSize", $imgs->{$id}{'filesize'});
|
---|
598 | $doc_obj->add_metadata($section, "Width", $imgs->{$id}{'width'});
|
---|
599 | $doc_obj->add_metadata($section, "Height", $imgs->{$id}{'height'});
|
---|
600 | $doc_obj->add_metadata($section, "URL", "http://$orig_fp");
|
---|
601 | $doc_obj->add_metadata($section, "PageURL", $self->{'hturl'});
|
---|
602 | $doc_obj->add_metadata($section, "PageTitle", $pagetitle);
|
---|
603 | $doc_obj->add_metadata($section, "ThumbURL",
|
---|
604 | "_httpcollection_/index/assoc/[archivedir]/$fn.thumb.jpg");
|
---|
605 |
|
---|
606 | if ( $self->{'document_text'} ) {
|
---|
607 | $doc_obj->add_utf8_text($section, "<table border=1>\n$reltext</table>");
|
---|
608 | } else {
|
---|
609 | $doc_obj->add_metadata($section, "ImageText", "<table border=1>\n$reltext</table>\n");
|
---|
610 | }
|
---|
611 |
|
---|
612 | if ( $self->{'index_pages'} ) {
|
---|
613 | my ($cache_url) = "_httpdoc_&d=" . $self->{'htdoc_obj'}->get_OID();
|
---|
614 | if ( $imgs->{$id}{'anchored'} ) {
|
---|
615 | my $a_name = $id;
|
---|
616 | $a_name =~ s/[\/\\\:\&]/_/g;
|
---|
617 | $cache_url .= "#gsdl_$a_name" ;
|
---|
618 | }
|
---|
619 | $doc_obj->add_utf8_metadata($section, "CachePageURL", $cache_url);
|
---|
620 | }
|
---|
621 | if ( ! $self->{'no_cache_images'} ) {
|
---|
622 | $onlyfn = lc $onlyfn;
|
---|
623 | $doc_obj->associate_file($filepath, $onlyfn, undef, $section);
|
---|
624 | $doc_obj->add_utf8_metadata($section, "CacheURL",
|
---|
625 | "_httpcollection_/index/assoc/[archivedir]/$onlyfn");
|
---|
626 | }
|
---|
627 | return 1;
|
---|
628 | }
|
---|
629 |
|
---|
630 | sub get_page_metadata {
|
---|
631 | my ($self, $textref) = (@_);
|
---|
632 | my (@rval);
|
---|
633 | $rval[0] = $self->get_meta_value("title", $textref);
|
---|
634 | $rval[1] = $self->get_meta_value("keywords", $textref);
|
---|
635 | $rval[2] = $self->get_meta_value("description", $textref);
|
---|
636 | $rval[3] = $self->{'filename'};
|
---|
637 |
|
---|
638 | return wantarray ? @rval : "<tr><td>PageMeta</td><td>@rval</td></tr>\n" ;
|
---|
639 | }
|
---|
640 |
|
---|
641 | # turns LargeCatFish into Large,Cat,Fish so MG sees the separate words
|
---|
642 | sub split_filepath {
|
---|
643 | my ($self, $filepath) = (@_);
|
---|
644 | my (@words) = $filepath =~ /([A-Z][a-z]+)/g;
|
---|
645 | return join(',', @words);
|
---|
646 | }
|
---|
647 |
|
---|
648 | # finds and extracts sentences
|
---|
649 | # that seem to be on the same topic
|
---|
650 | # as other related text (correlations)
|
---|
651 | # and textual references (e.g. in figure 3 ...)
|
---|
652 | sub get_textrefs {
|
---|
653 | my ($self, $reltext, $textref, $prevpos, $pos, $nextpos) = (@_);
|
---|
654 | my ($maxtext, $mintext, $startpos, $context_size, $context);
|
---|
655 |
|
---|
656 | my (@relwords, @refwords, %sentences, @pagemeta);
|
---|
657 |
|
---|
658 | # extract larger context
|
---|
659 | $maxtext = $self->{'maxtext'};
|
---|
660 | $startpos = $pos - ($maxtext * 4);
|
---|
661 | $context_size = $maxtext*10;
|
---|
662 | if ($startpos < $prevpos ) { $startpos = $prevpos }
|
---|
663 | if ($nextpos != -1 && $context_size > ( $nextpos - $startpos )) { $context_size = ($nextpos - $startpos) }
|
---|
664 | $context = substr ( $$textref, $startpos, $context_size );
|
---|
665 | $context =~ s/<.*?>//gs;
|
---|
666 | $context =~ s/^.*>(.*)/$1/gs;
|
---|
667 | $context =~ s/(.*)<.*$/$1/gs;
|
---|
668 |
|
---|
669 | # get page meta-data (if not already included)
|
---|
670 | if ( $self->{'aggressiveness'} == 5 && ! $self->{'smallpage'} ) {
|
---|
671 | @pagemeta = $self->get_page_metadata($textref);
|
---|
672 | foreach $value ( @pagemeta ) {
|
---|
673 | $context .= "$value."; # make each into psuedo-sentence
|
---|
674 | }
|
---|
675 | }
|
---|
676 |
|
---|
677 | # TODO: this list is not exhaustive
|
---|
678 | @refwords = ( '(?:is|are)? ?(?:show(?:s|n)|demonstrate(?:d|s)|explains|features) (?:in|by|below|above|here)',
|
---|
679 | '(?:see)? (?:figure|table)? (?:below|above)');
|
---|
680 |
|
---|
681 | # extract general references
|
---|
682 | foreach $rw ( @refwords ) {
|
---|
683 | while ( $context =~ /[\.\?\!\,](.*?$rw\W.*?[\.\?\!\,])/ig ) {
|
---|
684 | $sentence = $1;
|
---|
685 | $sentence =~ s/\s+/ /g;
|
---|
686 | $sentences{$sentence}+=2;
|
---|
687 | }
|
---|
688 | }
|
---|
689 | # extract specific (figure, table) references by number
|
---|
690 | my ($fignum) = $context =~ /[\.\?\!].*?(?:figure|table)s?[\-\_\ \.](\d+\w*)\W.*?[\.\?\!]/ig;
|
---|
691 | if ( $fignum ) {
|
---|
692 | foreach $rw ( @refwords ) {
|
---|
693 | while ( $context =~ /[\.\?\!](.*?(figure|table)[\-\_\ \.]$fignum\W.*?[\.\?\!])/ig ) {
|
---|
694 | $sentence = $1;
|
---|
695 | $sentence =~ s/\s+/ /g;
|
---|
696 | $sentences{$sentence}+=4;
|
---|
697 | }
|
---|
698 | }
|
---|
699 | }
|
---|
700 |
|
---|
701 | # sentences with occurances of important words
|
---|
702 | @relwords = $reltext =~ /([a-zA-Z]{4,})/g; # take out small words
|
---|
703 | foreach $word ( @relwords ) {
|
---|
704 | if ( $self->{'stopwords'}{$word} ) { next } # skip stop words
|
---|
705 | while ( $context =~ /([^\.\?\!]*?$word\W.*?[\.\?\!])/ig ) {
|
---|
706 | $sentence = $1;
|
---|
707 | $sentence =~ s/\s+/ /g;
|
---|
708 | $sentences{$sentence}++;
|
---|
709 | }
|
---|
710 | }
|
---|
711 | foreach $sentence ( keys %sentences ) {
|
---|
712 | if ($sentences{$sentence} < $self->{'textref_threshold'}) {
|
---|
713 | delete $sentences{$sentence};
|
---|
714 | }
|
---|
715 | }
|
---|
716 | my ($rval) = join "<br>\n", (keys %sentences);
|
---|
717 | if ( $rval && length($rval) > 5 ) {
|
---|
718 | return ( "<tr><td>TextualReferences</td><td>" . $rval . "</td></tr>\n") }
|
---|
719 | else { return "" }
|
---|
720 | }
|
---|
721 |
|
---|
722 | # handles caption extraction
|
---|
723 | # calling the extractor with different
|
---|
724 | # tags and choosing the best candidate caption
|
---|
725 | sub extract_caption_text {
|
---|
726 | my ($self, $tag, $textref, $prevpos, $pos, $nextpos) = (@_);
|
---|
727 | my (@neartext, $len, $hdelim, $goodlen,
|
---|
728 | $startpos, $context, $context_size);
|
---|
729 |
|
---|
730 | $mintext = $self->{'mintext'};
|
---|
731 | $goodlen = $self->{'caption_len'};
|
---|
732 |
|
---|
733 | # extract a context to extract near text from (faster)
|
---|
734 | $context_size = $self->{'maxtext'}*3;
|
---|
735 | $startpos = $pos - ($context_size / 2);
|
---|
736 | if ($startpos < $prevpos ) { $startpos = $prevpos }
|
---|
737 | if ($nextpos != -1 && $context_size > ( $nextpos - $startpos ))
|
---|
738 | { $context_size = ($nextpos - $startpos) }
|
---|
739 |
|
---|
740 | $context = substr ( $$textref, $startpos, $context_size );
|
---|
741 | $context =~ s/<!--.*?-->//gs;
|
---|
742 | $context =~ s/^.*-->(.*)/$1/gs;
|
---|
743 | $context =~ s/(.*)<!--.*$/$1/gs;
|
---|
744 |
|
---|
745 | # try stepping through markup delimiter sets
|
---|
746 | # and selecting the best one
|
---|
747 | foreach $hdelim ( @{ $self->{'cdelims'} } ) {
|
---|
748 | @neartext = $self->extract_caption($tag, $hdelim, \$context);
|
---|
749 | $len = length(join("", @neartext));
|
---|
750 | last if ($len >= $mintext && $len <= $goodlen);
|
---|
751 | }
|
---|
752 | # reject if well over reasonable length
|
---|
753 | if ( $len > $goodlen ) {
|
---|
754 | @neartext = [];
|
---|
755 | }
|
---|
756 | $neartext[0] = " " if (! defined $neartext[0]);
|
---|
757 | $neartext[1] = " " if (! defined $neartext[1]);
|
---|
758 | return "<tr><td>Caption</td><td>" . (join ",", @neartext) . "</td></tr>\n"; # TODO: the | is for testing purposes
|
---|
759 | } # end extract_caption_text
|
---|
760 |
|
---|
761 | # the previous section header often gives a bit
|
---|
762 | # of context to the section that the image is
|
---|
763 | # in (invariably the header is before/above the image)
|
---|
764 | # so extract the text of the closest header above the image
|
---|
765 | #
|
---|
766 | # this fn just gets all the headers above the image, within the context window
|
---|
767 | sub get_prev_header {
|
---|
768 | my ($self, $pos, $textref) = (@_);
|
---|
769 | my ($rhtext);
|
---|
770 | while ( $$textref =~ /<h\d>(.*?)<\/h\d>/sig ) {
|
---|
771 | # only headers before image
|
---|
772 | if ((pos $$textref) < $pos) {
|
---|
773 | $rhtext .= "$1, ";
|
---|
774 | }
|
---|
775 | }
|
---|
776 | if ( $rhtext ) { return "Header($rhtext)" }
|
---|
777 | else { return "" }
|
---|
778 | }
|
---|
779 |
|
---|
780 | # not the most robust tag stripping
|
---|
781 | # regexps (see perl.com FAQ) but good enough
|
---|
782 | #
|
---|
783 | # used by caption & tag-based near text algorithms
|
---|
784 | sub strip_tags {
|
---|
785 | my ( $self, $value ) = @_;
|
---|
786 | if ( ! defined($value) ) { $value = "" } # handle nulls
|
---|
787 | else {
|
---|
788 | $value =~ s/<.*?>//gs; # strip all html tags
|
---|
789 | $value =~ s/\s+/\ /g; # remove extra whitespace
|
---|
790 | $value =~ s/\&\w+\;//g; # remove etc
|
---|
791 | }
|
---|
792 | return $value;
|
---|
793 | }
|
---|
794 |
|
---|
795 | # uses the given tag(s) to identify
|
---|
796 | # the caption near to the image
|
---|
797 | # (below, above or both below and above)
|
---|
798 | sub extract_caption {
|
---|
799 | my ($self, $tag, $bound_tag, $contextref) = (@_);
|
---|
800 | my (@nt, $n, $etag, $gotcap);
|
---|
801 | return ("", "") if ( ! ($$contextref =~ /\Q$tag/) );
|
---|
802 |
|
---|
803 | $nt[0] = $`;
|
---|
804 | $nt[1] = $';
|
---|
805 | $gotcap = 0;
|
---|
806 |
|
---|
807 | # look before the image for a boundary tag
|
---|
808 | ($etag, $nt[0]) = $nt[0] =~ /<($bound_tag)[\s]?.*?>(.*?)$/is;
|
---|
809 | # if bound_tag too far from the image, then prob not caption
|
---|
810 | # (note: have to allow for tags, so multiply by 3
|
---|
811 | if ( $etag && length($nt[0]) < ($self->{'caption_len'} * 3) ) {
|
---|
812 | if ( $nt[0] =~ /<\/$etag>/si ) {
|
---|
813 | # the whole caption is above the image: <tag>text</tag><img>
|
---|
814 | ($nt[0]) =~ /<(?:$etag)[\s]?.*?>(.*?)<\/$etag>/is;
|
---|
815 | $nt[0] = $self->strip_tags($nt[0]);
|
---|
816 | if ( length($nt[0]) > $self->{'mintext'} ) {
|
---|
817 | $gotcap = 1;
|
---|
818 | $nt[1] = "";
|
---|
819 | }
|
---|
820 |
|
---|
821 | } elsif ( $nt[1] =~ /<\/$etag>/si) {
|
---|
822 | # the caption tag covers image: <tag>text?<img>text?</tag>
|
---|
823 | ($nt[1]) = $nt[1] =~ /(.*?)<\/$etag>/si;
|
---|
824 | $nt[0] = $self->strip_tags($nt[0] . $nt[1]);
|
---|
825 | if ( length($nt[0]) > $self->{'mintext'} ) {
|
---|
826 | $gotcap = 2;
|
---|
827 | $nt[1] = "";
|
---|
828 | }
|
---|
829 | }
|
---|
830 | }
|
---|
831 | # else try below the image
|
---|
832 | if ( ! $gotcap ) {
|
---|
833 | # the caption is after the image: <img><tag>text</tag>
|
---|
834 | ($etag, $nt[1]) = $nt[1] =~ /^.*?<($bound_tag)[\s]?.*?>(.*)/is;
|
---|
835 | if ( $etag && $nt[1] =~ /<\/$etag>/s) {
|
---|
836 | ($nt[1]) = $nt[1] =~ /(.*?)<\/$etag>/si;
|
---|
837 | $gotcap = 3;
|
---|
838 | $nt[0] = "";
|
---|
839 | $nt[1] = $self->strip_tags($nt[1]);
|
---|
840 | }
|
---|
841 | }
|
---|
842 | if ( ! $gotcap ) { $nt[0] = $nt[1] = "" }
|
---|
843 | else {
|
---|
844 | # strip part-tags
|
---|
845 | $nt[0] =~ s/^.*>//s;
|
---|
846 | $nt[1] =~ s/<.*$//s;
|
---|
847 | }
|
---|
848 | my ($type);
|
---|
849 | if ( $gotcap == 0 ) { return ("nocaption", "") }
|
---|
850 | elsif ( $gotcap == 1 ) { $type = "captionabove:" }
|
---|
851 | elsif ( $gotcap == 2 ) { $type = "captioncovering:" }
|
---|
852 | elsif ( $gotcap == 3 ) { $type = "captionbelow:" }
|
---|
853 | return ($type, $nt[0], $nt[1]);
|
---|
854 | }
|
---|
855 |
|
---|
856 | # tag-based near text
|
---|
857 | #
|
---|
858 | # tries different tag sets
|
---|
859 | # and chooses the best one
|
---|
860 | sub extract_near_text {
|
---|
861 | my ($self, $tag, $textref, $prevpos, $pos, $nextpos) = (@_);
|
---|
862 | my (@neartext, $len, $hdelim, $maxtext, $mintext, $goodlen,
|
---|
863 | @bestlen, @best, $startpos, $context, $context_size,
|
---|
864 | $dist, $bdist, $best1, $i, $nt);
|
---|
865 | $bestlen[0] = $bestlen[1] = 0; $bestlen[2] = $bdist = 999999;
|
---|
866 | $best[0] = $best[1] = $best[2] = "";
|
---|
867 | $maxtext = $self->{'maxtext'};
|
---|
868 | $mintext = $self->{'mintext'};
|
---|
869 | $goodlen = $self->{'neartext_len'};
|
---|
870 |
|
---|
871 | # extract a context to extract near text from (faster)
|
---|
872 | $context_size = $maxtext*4;
|
---|
873 | $startpos = $pos - ($context_size / 2);
|
---|
874 | if ($startpos < $prevpos ) { $startpos = $prevpos }
|
---|
875 | if ($nextpos != -1 && $context_size > ( $nextpos - $startpos ))
|
---|
876 | { $context_size = ($nextpos - $startpos) }
|
---|
877 | $context = substr ( $$textref, $startpos, $context_size );
|
---|
878 | $context =~ s/<!--.*?-->//gs;
|
---|
879 | $context =~ s/^.*-->(.*)/$1/gs;
|
---|
880 | $context =~ s/(.*)<!--.*$/$1/gs;
|
---|
881 |
|
---|
882 | # try stepping through markup delimiter sets
|
---|
883 | # and selecting the best one
|
---|
884 | foreach $hdelim ( @{ $self->{'delims'} } ) {
|
---|
885 | @neartext = $self->extract_tagged_neartext($tag, $hdelim, \$context);
|
---|
886 | $nt = join("", @neartext);
|
---|
887 | $len = length($nt);
|
---|
888 | # Priorities:
|
---|
889 | # 1. Greater than mintext
|
---|
890 | # 2. Less than maxtext
|
---|
891 | # 3. Closest to goodlen
|
---|
892 | if ( $len <= $goodlen && $len > $bestlen[0] ) {
|
---|
893 | $bestlen[0] = $len;
|
---|
894 | $best[0] = $hdelim;
|
---|
895 | } elsif ( $len >= $maxtext && $len < $bestlen[2] ) {
|
---|
896 | $bestlen[2] = $len;
|
---|
897 | $best[2] = $hdelim;
|
---|
898 | } elsif ( $len >= $bestlen[0] && $len <= $bestlen[2] ) {
|
---|
899 | $dist = abs($goodlen - $len);
|
---|
900 | if ( $dist < $bdist ) {
|
---|
901 | $bestlen[1] = $len;
|
---|
902 | $best[1] = $hdelim;
|
---|
903 | $bdist = $dist;
|
---|
904 | }
|
---|
905 | }
|
---|
906 | }
|
---|
907 | $best1 = 2;
|
---|
908 | foreach $i ( 0..2 ) {
|
---|
909 | if ( $bestlen[$i] == 999999 ) { $bestlen[$i] = 0 }
|
---|
910 | $dist = abs($goodlen - $bestlen[$i]);
|
---|
911 | if ( $bestlen[$i] > $mintext && $dist <= $bdist ) {
|
---|
912 | $best1 = $i;
|
---|
913 | $bdist = $dist;
|
---|
914 | }
|
---|
915 | }
|
---|
916 | @neartext = $self->extract_tagged_neartext($tag, $best[$best1], \$context);
|
---|
917 | if ( $bestlen[$best1] > $maxtext ) {
|
---|
918 | # truncate on word boundary if too much text
|
---|
919 | my $hmax = $maxtext / 2;
|
---|
920 | ($neartext[0]) = $neartext[0] =~ /([^\s]*.{1,$hmax})$/s;
|
---|
921 | ($neartext[1]) = $neartext[1] =~ /^(.{1,$hmax}[^\s]*)/s;
|
---|
922 | } elsif ( $bestlen[$best1] < $mintext ) {
|
---|
923 | # use plain text extraction if tags failed (e.g. usable tag outside context)
|
---|
924 | print {$self->{'outhandle'}} "W3ImgPlug: Fallback to plain-text extraction for $tag\n"
|
---|
925 | if $self->{'verbosity'} > 2;
|
---|
926 | $neartext[0] = "<tr><td>RawNeartext</td><td>" . $self->extract_raw_neartext($tag, $textref) . "</td></tr>";
|
---|
927 | $neartext[1] = "";
|
---|
928 | }
|
---|
929 | # get previous header if available
|
---|
930 | $neartext[0] .= "<br>\n" .
|
---|
931 | $self->get_prev_header($pos, \$context) if ( $self->{'aggressiveness'} >= 4 );
|
---|
932 | $neartext[0] = " " if (! defined $neartext[0]);
|
---|
933 | $neartext[1] = " " if (! defined $neartext[1]);
|
---|
934 |
|
---|
935 | return "<tr><td>NearText</td><td>" . (join "|", @neartext) . "</td></tr>\n"; # TODO: the | is for testing purposes
|
---|
936 | } # end extract_near_text
|
---|
937 |
|
---|
938 | # actually captures tag-based
|
---|
939 | # near-text given a tag set
|
---|
940 | sub extract_tagged_neartext {
|
---|
941 | my ($self, $tag, $bound_tag, $textref) = (@_);
|
---|
942 | return "" if ( ! ($$textref =~ /\Q$tag/) );
|
---|
943 | my (@nt, $delim, $pre_tag, $n);
|
---|
944 | $nt[0] = $`;
|
---|
945 | $nt[1] = $';
|
---|
946 |
|
---|
947 | # get text after previous image tag
|
---|
948 | $nt[0] =~ s/.*<($bound_tag)[^>]*>(.*)/$2/is; # get rid of preceding text
|
---|
949 | if (defined($1)) { $delim = $1 }
|
---|
950 | $pre_tag = $bound_tag;
|
---|
951 |
|
---|
952 | if (defined($delim)) {
|
---|
953 | # we want to try and use the end tag of the previous delimiter
|
---|
954 | # (put it on the front of the list)
|
---|
955 | $pre_tag =~ s/(^|\|)($delim)($|\|)//i; # take it out
|
---|
956 | $pre_tag =~ s/\|\|/\|/i; # replace || with |
|
---|
957 | $pre_tag = $delim . "|" . $pre_tag; # put it on the front
|
---|
958 | }
|
---|
959 |
|
---|
960 | # get text before next image tag
|
---|
961 | $nt[1] =~ s/<\/?(?:$pre_tag)[^>]*>.*//is; # get rid of stuff after first delimiter
|
---|
962 |
|
---|
963 | # process related text
|
---|
964 | for $n (0..1) {
|
---|
965 | if ( defined($nt[$n]) ) {
|
---|
966 | $nt[$n] =~ s/<.*?>//gs; # strip all html tags
|
---|
967 | $nt[$n] =~ s/\s+/\ /gs; # remove extra whitespace
|
---|
968 | $nt[$n] =~ s/\&\w+\;//sg; # remove etc
|
---|
969 | # strip part-tags
|
---|
970 | if ( $n == 0 ) { $nt[0] =~ s/^.*>//s }
|
---|
971 | if ( $n == 1 ) { $nt[1] =~ s/<.*$//s }
|
---|
972 | } else { $nt[$n] = ""; } # handle nulls
|
---|
973 | }
|
---|
974 | return @nt;
|
---|
975 | }
|
---|
976 |
|
---|
977 | # this function is fall-back
|
---|
978 | # if tags aren't suitable.
|
---|
979 | #
|
---|
980 | # extracts a fixed length of characters
|
---|
981 | # either side of image tag (on word boundary)
|
---|
982 | sub extract_raw_neartext {
|
---|
983 | my ($self, $tag, $textref) = (@_);
|
---|
984 | my ($rawtext, $startpos, $fp);
|
---|
985 | my $imgs = \%{$self->{'imglist'}};
|
---|
986 | ($fp) = $tag =~ /([\w\\\/]+\.(?:gif|jpe?g|png))/is;
|
---|
987 | if (! $fp) { return " " };
|
---|
988 | # if the cached, plain-text version isn't there, then create it
|
---|
989 | $self->init_plaintext($textref) unless defined($self->{'plaintext'});
|
---|
990 |
|
---|
991 | # take the closest maxtext/2 characters
|
---|
992 | # either side of the tag (by word boundary)
|
---|
993 | return "" if ( ! exists $imgs->{$fp}{'rawpos'} );
|
---|
994 | $startpos = $imgs->{$fp}{'rawpos'} - (($self->{'maxtext'} / 2) + 20);
|
---|
995 | if ( $startpos < 0 ) { $startpos = 0 }
|
---|
996 | $rawtext = substr $self->{'plaintext'}, $startpos, $self->{'maxtext'} + 20;
|
---|
997 | $rawtext =~ s/\s\s/ /g;
|
---|
998 |
|
---|
999 | return $rawtext;
|
---|
1000 | }
|
---|
1001 |
|
---|
1002 | # init plaintext variable for HTML-stripped version
|
---|
1003 | # (for full text index/raw assoc text extraction)
|
---|
1004 | sub init_plaintext {
|
---|
1005 | my ($self, $textref) = (@_);
|
---|
1006 | my ($page, $fp);
|
---|
1007 | my $imgs = \%{$self->{'imglist'}};
|
---|
1008 | $page = $$textref; # make a copy of original
|
---|
1009 |
|
---|
1010 | # strip tags around image filenames so they don't get zapped
|
---|
1011 | $page =~ s/<\w+\s+.*?([\w\/\\]+\.(?:gif|jpe?g|png))[^>]*>/\"$1\"/gsi;
|
---|
1012 | $page =~ s/<.*?>//gs;
|
---|
1013 | $page =~ s/ / /gs;
|
---|
1014 | $page =~ s/&/&/gs; #TODO: more &zzz; replacements (except <, $gt;)
|
---|
1015 |
|
---|
1016 | # get positions and strip images
|
---|
1017 | while ( $page =~ /([^\s\'\"]+\.(jpe?g|gif|png))/ig ) {
|
---|
1018 | $fp = $1;
|
---|
1019 | if ( $imgs->{$fp}{'exists'} ) {
|
---|
1020 | $imgs->{$fp}{'rawpos'} = pos $page;
|
---|
1021 | }
|
---|
1022 | $page =~ s/\"$fp\"//gs;
|
---|
1023 | }
|
---|
1024 | $self->{'plaintext'} = $page;
|
---|
1025 | }
|
---|
1026 |
|
---|
1027 | # finds and filters images based on size
|
---|
1028 | # (dimensions, height, filesize) and existence
|
---|
1029 | #
|
---|
1030 | # looks for image filenames (.jpg, .gif, etc)
|
---|
1031 | # and checks for existence on disk
|
---|
1032 | # (hence supports most JavaScript images)
|
---|
1033 | sub get_img_list {
|
---|
1034 | my $self = shift (@_);
|
---|
1035 | my ($textref) = (@_);
|
---|
1036 | my ($filepath, $relpath, $abspath, $pos, $num, $width, $height, $filesize);
|
---|
1037 | my $imgs = \%{$self->{'imglist'}};
|
---|
1038 | while ( $$textref =~ /([^\s\'\"]+\.(jpe?g|gif|png))/ig ) {
|
---|
1039 | $filepath = $1;
|
---|
1040 | $pos = pos $$textref;
|
---|
1041 | next if ( $imgs->{$filepath}{'relpath'} );
|
---|
1042 | $relpath = $filepath;
|
---|
1043 | $relpath =~ s/^http\:\/\///; # remove http:// in case we have mirrored it
|
---|
1044 | $relpath =~ s/\\/\//g; # replace \ with /
|
---|
1045 | $relpath =~ s/^\.\///s; # make "./filepath" into "filepath"
|
---|
1046 | $imgs->{$filepath}{'relpath'} = $relpath;
|
---|
1047 | $abspath = "$self->{'htpath'}/$relpath";
|
---|
1048 |
|
---|
1049 | if (! -e $abspath) { next }
|
---|
1050 |
|
---|
1051 | # can't modify real filepath var because it
|
---|
1052 | # then can't be located in the page for tag recognition later
|
---|
1053 | ($width, $height) =
|
---|
1054 | `identify $abspath -ping -format "%wx%h"` =~ /^(\d*)x(\d*)$/m;
|
---|
1055 | if (! ($width && $height)) {
|
---|
1056 | print STDERR "W3ImgPlug: ($abspath) 'identify' failed. Check ImageMagicK binaries are installed and working correctly\n"; next;
|
---|
1057 | }
|
---|
1058 | $filesize = (-s $abspath);
|
---|
1059 | if ( $filesize >= $self->{'min_img_filesize'}
|
---|
1060 | && ( $width >= $self->{'min_img_width'} )
|
---|
1061 | && ( $height >= $self->{'min_img_height'} ) ) {
|
---|
1062 |
|
---|
1063 | $imgs->{$filepath}{'exists'} = 1;
|
---|
1064 | $imgs->{$filepath}{'pos'} = $pos;
|
---|
1065 | $imgs->{$filepath}{'width'} = $width;
|
---|
1066 | $imgs->{$filepath}{'height'} = $height;
|
---|
1067 | $imgs->{$filepath}{'filesize'} = $filesize;
|
---|
1068 | } else {
|
---|
1069 | print {$self->{'outhandle'}} "W3ImgPlug: skipping $self->{'base_path'}/$relpath: $filesize, $width x $height\n"
|
---|
1070 | if $self->{'verbosity'} > 2;
|
---|
1071 | }
|
---|
1072 | }
|
---|
1073 | $num = 0;
|
---|
1074 | foreach $i ( keys %{$imgs} ) {
|
---|
1075 | if ( $imgs->{$i}{'pos'} ) {
|
---|
1076 | $num++;
|
---|
1077 | } else { delete $imgs->{$i} }
|
---|
1078 | }
|
---|
1079 | return $num;
|
---|
1080 | }
|
---|
1081 |
|
---|
1082 | # make the text available to the read function
|
---|
1083 | # by making it an object variable
|
---|
1084 | sub read_file {
|
---|
1085 | my ($self, $filename, $encoding, $language, $textref) = @_;
|
---|
1086 | $self->SUPER::read_file($filename, $encoding, $language, $textref);
|
---|
1087 |
|
---|
1088 | # if HTMLplug has run through, then it will
|
---|
1089 | # have replaced references so we have to
|
---|
1090 | # make a copy of the text before processing
|
---|
1091 | if ( $self->{'index_pages'} ) {
|
---|
1092 | $self->{'text'} = $$textref;
|
---|
1093 | $self->{'textref'} = \($self->{'text'});
|
---|
1094 | } else {
|
---|
1095 | $self->{'textref'} = $textref;
|
---|
1096 | }
|
---|
1097 | $self->{'plaintext'} = undef;
|
---|
1098 | }
|
---|
1099 |
|
---|
1100 | # HTMLPlug only extracts meta-data if it is specified in plugin options
|
---|
1101 | # hence a special function to do it here
|
---|
1102 | sub get_meta_value {
|
---|
1103 | my ($self, $name, $textref) = @_;
|
---|
1104 | my ($value);
|
---|
1105 | $name = lc $name;
|
---|
1106 | if ($name eq "title") {
|
---|
1107 | ($value) = $$textref =~ /<title>(.*?)<\/title>/is
|
---|
1108 | } else {
|
---|
1109 | my $qm = "(?:\"|\')";
|
---|
1110 | ($value) = $$textref =~ /<meta name\s*=\s*$qm?$name$qm?\s+content\s*=\s*$qm?(.*?)$qm?\s*>/is
|
---|
1111 | }
|
---|
1112 | $value = "" unless $value;
|
---|
1113 | return $value;
|
---|
1114 | }
|
---|
1115 |
|
---|
1116 | # make filename an anchor reference
|
---|
1117 | # so we can go straight to the image
|
---|
1118 | # within the cached version of the source page
|
---|
1119 | # (augment's HTMLPlug sub)
|
---|
1120 | sub replace_images {
|
---|
1121 | my $self = shift (@_);
|
---|
1122 | my ($front, $link, $back, $base_dir,
|
---|
1123 | $file, $doc_obj, $section) = @_;
|
---|
1124 | $link =~ s/\"//g;
|
---|
1125 | my ($a_name) = $link;
|
---|
1126 | $a_name =~ s/[\/\\\:\&]/_/g;
|
---|
1127 | # keep a list so we don't repeat the same anchor
|
---|
1128 | if ( ! $self->{'imglist'}{$link}{'anchored'} ) {
|
---|
1129 | $front = "<a name=\"gsdl_$a_name\">$front";
|
---|
1130 | $back = "$back</a>";
|
---|
1131 | $self->{'imglist'}{$link}{'anchored'} = 1;
|
---|
1132 | }
|
---|
1133 | return $self->SUPER::replace_images($front, $link, $back, $base_dir,
|
---|
1134 | $file, $doc_obj, $section);
|
---|
1135 | }
|
---|
1136 |
|
---|
1137 | 1;
|
---|