[2899] | 1 | ###########################################################################
|
---|
| 2 | #
|
---|
[2996] | 3 | # W3ImgPlug.pm -- Context-based image indexing plugin for HTML documents
|
---|
[2899] | 4 | #
|
---|
| 5 | # A component of the Greenstone digital library software
|
---|
| 6 | # from the New Zealand Digital Library Project at the
|
---|
| 7 | # University of Waikato, New Zealand.
|
---|
| 8 | #
|
---|
| 9 | # Copyright (C) 2001 New Zealand Digital Library Project
|
---|
| 10 | #
|
---|
| 11 | # This program is free software; you can redistribute it and/or modify
|
---|
| 12 | # it under the terms of the GNU General Public License as published by
|
---|
| 13 | # the Free Software Foundation; either version 2 of the License, or
|
---|
| 14 | # (at your option) any later version.
|
---|
| 15 | #
|
---|
| 16 | # This program is distributed in the hope that it will be useful,
|
---|
| 17 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
|
---|
| 18 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
---|
| 19 | # GNU General Public License for more details.
|
---|
| 20 | #
|
---|
| 21 | # You should have received a copy of the GNU General Public License
|
---|
| 22 | # along with this program; if not, write to the Free Software
|
---|
| 23 | # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
---|
| 24 | #
|
---|
| 25 | ###########################################################################
|
---|
| 26 |
|
---|
[2996] | 27 | # DESCRIPTION:
|
---|
| 28 | #
|
---|
| 29 | # Extracts images and associated text and metadata from
|
---|
| 30 | # web pages as individual documents for indexing. Thumbnails
|
---|
| 31 | # are created from each image for browsing purposes.
|
---|
| 32 | #
|
---|
| 33 | # Options are available for configuring the aggressiveness of the
|
---|
| 34 | # associated text extraction mechanisms. A higher level of
|
---|
| 35 | # aggressiveness will extract more text and consequently
|
---|
| 36 | # may mean lower accuracy (precision); however, it may also
|
---|
| 37 | # retrieve more of the relevant images from the collection (recall).
|
---|
| 38 | # Lower levels of aggressiveness maybe result in slightly faster
|
---|
| 39 | # collection builds at the import stage.
|
---|
| 40 | #
|
---|
| 41 | # W3ImgPlug is a subclass of HTMLPlug (i.e. it will index pages also
|
---|
| 42 | # if required). It can be used in place of HTMLPlug to index both
|
---|
| 43 | # pages and their images.
|
---|
| 44 | #
|
---|
| 45 | # REQUIREMENTS:
|
---|
[2899] | 46 | #
|
---|
[2996] | 47 | # The ImageMagick image manipulation is used to create
|
---|
| 48 | # thumbnails and extract some image metadata. (Available
|
---|
| 49 | # from http://www.imagemagick.org/)
|
---|
| 50 | #
|
---|
| 51 | # Unix:
|
---|
| 52 | # Many Linux distributions contain ImageMagick.
|
---|
| 53 | #
|
---|
| 54 | # Windows:
|
---|
| 55 | # ImageMagick can be downloaded from the website above.
|
---|
| 56 | # Make sure the system path includes the ImageMagick binaries
|
---|
| 57 | # before using W3ImgPlug.
|
---|
| 58 | #
|
---|
| 59 | # NOTE: NT/2000/XP contain a filesystem utility 'convert.exe'
|
---|
| 60 | # with the same name as the image conversion utility. The
|
---|
| 61 | # ImageMagick FAQ recommends renaming the filesystem
|
---|
| 62 | # utility (e.g. to 'fsconvert.exe') to avoid this clash.
|
---|
| 63 | #
|
---|
| 64 | # USAGE:
|
---|
| 65 | #
|
---|
| 66 | # An image document consists of metadata elements:
|
---|
| 67 | #
|
---|
| 68 | # OriginalFilename, FilePath, Filename, FileExt, FileSize,
|
---|
| 69 | # Width, Height, URL, PageURL, ThumbURL, CacheURL, CachePageURL
|
---|
| 70 | # ImageText, PageTitle
|
---|
| 71 | #
|
---|
| 72 | # Most of these are only useful in format strings (e.g. ThumbURL,
|
---|
| 73 | # Filename, URL, PageURL, CachePageURL).
|
---|
| 74 | #
|
---|
| 75 | # ImageText, as the name suggests contains the indexable text.
|
---|
| 76 | # (unless using the -document_text plugin option)
|
---|
| 77 | #
|
---|
| 78 | # Since image documents are made up of metadata elements
|
---|
| 79 | # alone, format strings are needed to display them properly.
|
---|
| 80 | # NOTE: The receptionist will only display results (e.g. thumbnails)
|
---|
| 81 | # in 4 columns if the format string begins with "<td><table>".
|
---|
| 82 | #
|
---|
| 83 | # The example below takes the user to the image within the
|
---|
| 84 | # source HTML document rather than using a format string
|
---|
| 85 | # on DocumentText to display the image document itself.
|
---|
| 86 | #
|
---|
| 87 | # Example collect.cfg:
|
---|
| 88 | #
|
---|
| 89 | # ...
|
---|
| 90 | #
|
---|
| 91 | # indexes document:ImageText document:text
|
---|
| 92 | # defaultindex document:ImageText
|
---|
| 93 | #
|
---|
| 94 | # collectionmeta .document:ImageText "images"
|
---|
| 95 | # collectionmeta .document:text "documents"
|
---|
| 96 | #
|
---|
| 97 | # ...
|
---|
| 98 | #
|
---|
| 99 | # plugin W3ImgPlug -index_pages -aggressiveness 6
|
---|
| 100 | #
|
---|
| 101 | # ...
|
---|
| 102 | #
|
---|
| 103 | # format SearchVList '<td>{If}{[Title],[link][icon] [Title][[/link],
|
---|
| 104 | # <table><tr><td align="center"><a href="[CachePageURL]">
|
---|
| 105 | # <img src="[ThumbURL]"></a></td></tr><tr><td align="center">
|
---|
| 106 | # <a href="[CachePageURL]"><font size="-1">[OriginalFilename]</font></a>
|
---|
| 107 | # <br>[Width]x[Height]</td></tr></table>}</td>'
|
---|
| 108 | #
|
---|
| 109 | # ...
|
---|
| 110 | #
|
---|
| 111 |
|
---|
[2899] | 112 | package W3ImgPlug;
|
---|
| 113 |
|
---|
| 114 | use HTMLPlug;
|
---|
| 115 | use ghtml;
|
---|
| 116 | use unicode;
|
---|
| 117 | use util;
|
---|
[10254] | 118 | use strict; # 'subs';
|
---|
| 119 | no strict 'refs'; # allow filehandles to be variables and viceversa
|
---|
[2899] | 120 |
|
---|
| 121 | sub BEGIN {
|
---|
[10254] | 122 | @W3ImgPlug::ISA = qw( HTMLPlug );
|
---|
[2899] | 123 | }
|
---|
| 124 |
|
---|
[4744] | 125 | my $aggressiveness_list =
|
---|
| 126 | [ { 'name' => "1",
|
---|
[4873] | 127 | 'desc' => "{W3ImgPlug.aggressiveness.1}" },
|
---|
[4744] | 128 | { 'name' => "2",
|
---|
[4873] | 129 | 'desc' => "{W3ImgPlug.aggressiveness.2}" },
|
---|
[4744] | 130 | { 'name' => "3",
|
---|
[4873] | 131 | 'desc' => "{W3ImgPlug.aggressiveness.3}" },
|
---|
[4744] | 132 | { 'name' => "4",
|
---|
[4873] | 133 | 'desc' => "{W3ImgPlug.aggressiveness.4}" },
|
---|
[4744] | 134 | { 'name' => "5",
|
---|
[4873] | 135 | 'desc' => "{W3ImgPlug.aggressiveness.5}" },
|
---|
[4744] | 136 | { 'name' => "6",
|
---|
[4873] | 137 | 'desc' => "{W3ImgPlug.aggressiveness.6}" },
|
---|
[4744] | 138 | { 'name' => "7",
|
---|
[4873] | 139 | 'desc' => "{W3ImgPlug.aggressiveness.7}" },
|
---|
[4744] | 140 | { 'name' => "8",
|
---|
[4873] | 141 | 'desc' => "{W3ImgPlug.aggressiveness.8}" },
|
---|
[4744] | 142 | { 'name' => "9",
|
---|
[4873] | 143 | 'desc' => "{W3ImgPlug.aggressiveness.9}" } ];
|
---|
[4744] | 144 |
|
---|
| 145 | my $arguments =
|
---|
| 146 | [ { 'name' => "aggressiveness",
|
---|
[4873] | 147 | 'desc' => "{W3ImgPlug.aggressiveness}",
|
---|
[4744] | 148 | 'type' => "int",
|
---|
| 149 | 'list' => $aggressiveness_list,
|
---|
| 150 | 'deft' => "3",
|
---|
| 151 | 'reqd' => "no" },
|
---|
| 152 | { 'name' => "index_pages",
|
---|
[4873] | 153 | 'desc' => "{W3ImgPlug.index_pages}",
|
---|
[4744] | 154 | 'type' => "flag",
|
---|
| 155 | 'reqd' => "no" },
|
---|
| 156 | { 'name' => "no_cache_images",
|
---|
[4873] | 157 | 'desc' => "{W3ImgPlug.no_cache_images}",
|
---|
[4744] | 158 | 'type' => "flag",
|
---|
| 159 | 'reqd' => "no" },
|
---|
| 160 | { 'name' => "min_size",
|
---|
[4873] | 161 | 'desc' => "{W3ImgPlug.min_size}",
|
---|
[4744] | 162 | 'type' => "int",
|
---|
| 163 | 'deft' => "2000",
|
---|
| 164 | 'reqd' => "no" },
|
---|
| 165 | { 'name' => "min_width",
|
---|
[4873] | 166 | 'desc' => "{W3ImgPlug.min_width}",
|
---|
[4744] | 167 | 'type' => "int",
|
---|
| 168 | 'deft' => "50",
|
---|
| 169 | 'reqd' => "no" },
|
---|
| 170 | { 'name' => "min_height",
|
---|
[4873] | 171 | 'desc' => "{W3ImgPlug.min_height}",
|
---|
[4744] | 172 | 'type' => "int",
|
---|
| 173 | 'deft' => "50",
|
---|
| 174 | 'reqd' => "no" },
|
---|
| 175 | { 'name' => "thumb_size",
|
---|
[4873] | 176 | 'desc' => "{W3ImgPlug.thumb_size}",
|
---|
[4744] | 177 | 'type' => "int",
|
---|
| 178 | 'deft' => "100",
|
---|
| 179 | 'reqd' => "no" },
|
---|
| 180 | { 'name' => "convert_params",
|
---|
[4873] | 181 | 'desc' => "{W3ImgPlug.convert_params}",
|
---|
[4744] | 182 | 'type' => "string",
|
---|
| 183 | 'deft' => "",
|
---|
| 184 | 'reqd' => "no" },
|
---|
| 185 | { 'name' => "min_near_text",
|
---|
[4873] | 186 | 'desc' => "{W3ImgPlug.min_near_text}",
|
---|
[4744] | 187 | 'type' => "int",
|
---|
| 188 | 'deft' => "10",
|
---|
| 189 | 'reqd' => "no" },
|
---|
| 190 | { 'name' => "max_near_text",
|
---|
[4873] | 191 | 'desc' => "{W3ImgPlug.max_near_text}",
|
---|
[4744] | 192 | 'type' => "int",
|
---|
| 193 | 'deft' => "400",
|
---|
| 194 | 'reqd' => "no" },
|
---|
| 195 | { 'name' => "smallpage_threshold",
|
---|
[4873] | 196 | 'desc' => "{W3ImgPlug.smallpage_threshold}",
|
---|
[4744] | 197 | 'type' => "int",
|
---|
| 198 | 'deft' => "2048",
|
---|
| 199 | 'reqd' => "no" },
|
---|
| 200 | { 'name' => "textrefs_threshold",
|
---|
[4873] | 201 | 'desc' => "{W3ImgPlug.textrefs_threshold}",
|
---|
[4744] | 202 | 'type' => "int",
|
---|
| 203 | 'deft' => "2",
|
---|
| 204 | 'reqd' => "no" },
|
---|
| 205 | { 'name' => "caption_length",
|
---|
[4873] | 206 | 'desc' => "{W3ImgPlug.caption_length}",
|
---|
[4744] | 207 | 'type' => "int",
|
---|
| 208 | 'deft' => "80",
|
---|
| 209 | 'reqd' => "no" },
|
---|
| 210 | { 'name' => "neartext_length",
|
---|
[4873] | 211 | 'desc' => "{W3ImgPlug.neartext_length}",
|
---|
[4744] | 212 | 'type' => "int",
|
---|
| 213 | 'deft' => "300",
|
---|
| 214 | 'reqd' => "no" },
|
---|
| 215 | { 'name' => "document_text",
|
---|
[4873] | 216 | 'desc' => "{W3ImgPlug.document_text}",
|
---|
[4744] | 217 | 'type' => "flag",
|
---|
[4873] | 218 | 'reqd' => "no" } ];
|
---|
[4744] | 219 |
|
---|
| 220 | my $options = { 'name' => "W3ImgPlug",
|
---|
[5680] | 221 | 'desc' => "{W3ImgPlug.desc}",
|
---|
[6408] | 222 | 'abstract' => "no",
|
---|
[4744] | 223 | 'inherits' => "yes",
|
---|
| 224 | 'args' => $arguments };
|
---|
| 225 |
|
---|
[2899] | 226 | sub new {
|
---|
[10218] | 227 | my ($class) = shift (@_);
|
---|
| 228 | my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
|
---|
| 229 | push(@$pluginlist, $class);
|
---|
[4744] | 230 |
|
---|
[10218] | 231 | if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}
|
---|
| 232 | if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};
|
---|
[2899] | 233 |
|
---|
[10218] | 234 | my $self = (defined $hashArgOptLists)? new HTMLPlug($pluginlist,$inputargs,$hashArgOptLists): new HTMLPlug($pluginlist,$inputargs);
|
---|
| 235 |
|
---|
[2899] | 236 | # init class variables
|
---|
| 237 | $self->{'textref'} = undef; # init by read_file fn
|
---|
| 238 | $self->{'htdoc_obj'} = undef; # init by process fn
|
---|
| 239 | $self->{'htpath'} = undef; # init by process fn
|
---|
| 240 | $self->{'hturl'} = undef; # init by process fn
|
---|
| 241 | $self->{'plaintext'} = undef; # HTML stripped version - only init if needed by raw_neartext sub
|
---|
| 242 | $self->{'smallpage'} = 0; # set by process fn
|
---|
| 243 | $self->{'images_indexed'} = undef; # num of images indexed - if 1 or 2 then we know page is small
|
---|
| 244 | $self->{'initialised'} = undef; # flag (see set_extraction_options())
|
---|
| 245 |
|
---|
| 246 | return bless $self, $class;
|
---|
| 247 | }
|
---|
| 248 |
|
---|
| 249 | # if indexing pages, let HTMLPlug do it's stuff
|
---|
| 250 | # image extraction done through read()
|
---|
| 251 | sub process {
|
---|
| 252 | my ($self, $textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
|
---|
| 253 | $self->{'imglist'} = ();
|
---|
| 254 | if ( $self->{'index_pages'} ) {
|
---|
| 255 | my $ok = $self->SUPER::process($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj);
|
---|
| 256 | if ( ! $ok ) { return $ok }
|
---|
| 257 | $self->{'htdoc_obj'} = $doc_obj;
|
---|
| 258 | }
|
---|
| 259 | # else use URL for referencing
|
---|
[2996] | 260 | #if ( $file =~ /(.*)[\/\\]/ ) { $self->{'htpath'} = $1; } else { $self->{'htpath'} = $file; }
|
---|
[2899] | 261 |
|
---|
| 262 | $self->{'htpath'} = $base_dir if (-d $base_dir);
|
---|
| 263 | if ( $file =~ /(.*)[\/\\]/ ) { $self->{'htpath'} .= "/$1"; }
|
---|
| 264 | $self->{'htpath'} =~ s/\\/\//g; # replace \ with /
|
---|
| 265 |
|
---|
| 266 | $self->{'hturl'} = "http://$file";
|
---|
| 267 | $self->{'hturl'} =~ s/\\/\//g; # for windows
|
---|
| 268 | ($self->{'filename'}) = $file =~ /.*[\/\\](.*)/;
|
---|
| 269 | ($self->{'base_path'}) = $file =~ /(.*)[\/\\]/i;
|
---|
| 270 | if ( ( -s "$base_dir/$file") <= $self->{'smallpage_threshold'} ) {
|
---|
| 271 | $self->{'smallpage'} = 1;
|
---|
| 272 | } else { $self->{'smallpage'} = 0; }
|
---|
| 273 |
|
---|
| 274 | if ( defined($self->{'initialised'}) ) { return 1; }
|
---|
| 275 | else {
|
---|
| 276 | $self->{'initialised'} = $self->set_extraction_options($base_dir =~ /^(.*?)\/import/i);
|
---|
| 277 | return $self->{'initialised'};
|
---|
| 278 | }
|
---|
| 279 | }
|
---|
| 280 |
|
---|
| 281 | # get complex configuration options from configuration files
|
---|
[2996] | 282 | # -- $GSDLCOLLECTION/etc/W3ImgPlug.cfg (tag sets for aggr 2+)
|
---|
| 283 | # -- $GSDLHOME/etc/packages/phind/stopword/en/brown.sw (stopwords for aggr 5+)
|
---|
[2899] | 284 |
|
---|
| 285 | # If there's no W3ImgPlug.cfg file we'll use the following default values
|
---|
| 286 | my $defaultcfg = '
|
---|
| 287 | <delimitertagset>
|
---|
| 288 | <setname>Caption</setname>
|
---|
| 289 | <taggroup>font</taggroup>
|
---|
| 290 | <taggroup>tt</taggroup>
|
---|
| 291 | <taggroup>small</taggroup>
|
---|
| 292 | <taggroup>b</taggroup>
|
---|
| 293 | <taggroup>i</taggroup>
|
---|
| 294 | <taggroup>u</taggroup>
|
---|
| 295 | <taggroup>em</taggroup>
|
---|
| 296 | <taggroup>td</taggroup>
|
---|
| 297 | <taggroup>li</taggroup>
|
---|
| 298 | <taggroup>a</taggroup>
|
---|
| 299 | <taggroup>p</taggroup>
|
---|
| 300 | <taggroup>tr</taggroup>
|
---|
| 301 | <taggroup>center</taggroup>
|
---|
| 302 | <taggroup>div</taggroup>
|
---|
| 303 | <taggroup>caption</taggroup>
|
---|
| 304 | <taggroup>br</taggroup>
|
---|
| 305 | <taggroup>ul</taggroup>
|
---|
| 306 | <taggroup>ol</taggroup>
|
---|
| 307 | <taggroup>table</taggroup>
|
---|
| 308 | <taggroup>hr</taggroup>
|
---|
| 309 | </delimitertagset>
|
---|
| 310 |
|
---|
| 311 | <delimitertagset>
|
---|
| 312 | <setname>Neartext</setname>
|
---|
| 313 | <taggroup>tr|hr|table|h\d|img|body</taggroup>
|
---|
| 314 | <taggroup>td|tr|hr|table|h\d|img|body</taggroup>
|
---|
| 315 | <taggroup>p|br|td|tr|hr|table|h\d|img|body</taggroup>
|
---|
| 316 | <taggroup>font|p|i|b|em|img</taggroup>
|
---|
| 317 | </delimitertagset>
|
---|
| 318 | ';
|
---|
| 319 |
|
---|
| 320 | sub set_extraction_options() {
|
---|
| 321 | my ($self, $collpath) = @_;
|
---|
| 322 | my ($filepath);
|
---|
| 323 |
|
---|
| 324 | print {$self->{'outhandle'}} "W3ImgPlug: Initialising\n"
|
---|
| 325 | if $self->{'verbosity'} > 1;
|
---|
| 326 | # etc/W3ImgPlug.cfg (XML)
|
---|
| 327 | # tag sets for captions and neartext
|
---|
[4744] | 328 | if ( $self->{'aggressiveness'} > 1 && $self->{'aggressiveness'} != 9 ) {
|
---|
[2899] | 329 | $self->{'delims'} = [];
|
---|
| 330 | $self->{'cdelims'} = [];
|
---|
| 331 | my ($cfg, @tagsets, $tagset, $type, @delims);
|
---|
| 332 |
|
---|
| 333 | $filepath = "$collpath/etc/W3ImgPlug.cfg";
|
---|
| 334 | if ( open CFG, "<$filepath" ) {
|
---|
| 335 | while (<CFG>) { $cfg .= $_ }
|
---|
| 336 | close CFG;
|
---|
| 337 | } else {
|
---|
| 338 | $cfg = $defaultcfg;
|
---|
| 339 | }
|
---|
| 340 |
|
---|
| 341 | (@tagsets) =
|
---|
| 342 | $cfg =~ /<delimitertagset>(.*?)<\/delimitertagset>/igs;
|
---|
| 343 | foreach $tagset ( @tagsets ) {
|
---|
| 344 | ($type) = $tagset =~ /<setname>(.*?)<\/setname>/i;
|
---|
| 345 | if ( lc($type) eq "caption" ) {
|
---|
| 346 | (@{$self->{'cdelims'}}) = $tagset =~ /<taggroup>(.*?)<\/taggroup>/igs;
|
---|
| 347 | }
|
---|
| 348 | elsif ( lc($type) eq "neartext" ) {
|
---|
| 349 | (@{$self->{'delims'}}) = $tagset =~ /<taggroup>(.*?)<\/taggroup>/igs;
|
---|
| 350 | }
|
---|
| 351 | }
|
---|
| 352 |
|
---|
| 353 | # output a warning if there seem to be no delimiters
|
---|
| 354 | if ( scalar(@{$self->{'cdelims'}} == 0)) {
|
---|
| 355 | print {$self->{'outhandle'}} "W3ImgPlug: Warning: no caption delimiters found in $filepath\n";
|
---|
| 356 | }
|
---|
| 357 | if ( scalar(@{$self->{'delims'}} == 0)) {
|
---|
| 358 | print {$self->{'outhandle'}} "W3ImgPlug: Warning: no neartext delimiters found in $filepath\n";
|
---|
| 359 | }
|
---|
| 360 | }
|
---|
| 361 |
|
---|
| 362 | # get stop words for textual reference extraction
|
---|
| 363 | # TODO: warnings scroll off. Would be best to output them again at end of import
|
---|
[4744] | 364 | if ( $self->{'aggressiveness'} >=5 && $self->{'aggressiveness'} != 9 ) {
|
---|
[2899] | 365 | $self->{'stopwords'} = ();
|
---|
| 366 | $filepath = &util::filename_cat($ENV{'GSDLHOME'}, "etc", "packages", "phind", "stopword", "en", "brown.sw");
|
---|
| 367 | if ( open STOPWORDS, "<$filepath" ) {
|
---|
| 368 | while ( <STOPWORDS> ) {
|
---|
| 369 | chomp;
|
---|
| 370 | $self->{'stopwords'}{$_} = 1;
|
---|
| 371 | }
|
---|
| 372 | close STOPWORDS;
|
---|
| 373 | } else {
|
---|
| 374 | print {$self->{'outhandle'}} "W3ImgPlug: Warning: couldn't open stopwords file at $filepath ($!)\n";
|
---|
| 375 | }
|
---|
| 376 |
|
---|
| 377 | }
|
---|
| 378 |
|
---|
[10218] | 379 | if ( $self->{'neartext_length'} > $self->{'max_near_text'} ) {
|
---|
| 380 | $self->{'max_near_text'} = $self->{'neartext_length'} * 1.33;
|
---|
| 381 | print {$self->{'outhandle'}} "W3ImgPlug: Warning: adjusted max_text to $self->{'max_near_text'}\n";
|
---|
[2899] | 382 | }
|
---|
[10218] | 383 | if ( $self->{'caption_length'} > $self->{'max_near_text'} ) {
|
---|
| 384 | $self->{'max_near_text'} = $self->{'caption_length'} * 1.33;
|
---|
| 385 | print {$self->{'outhandle'}} "W3ImgPlug: Warning: adjusted max_text to $self->{'max_near_text'}\n";
|
---|
[2899] | 386 | }
|
---|
| 387 |
|
---|
| 388 | return 1;
|
---|
| 389 | }
|
---|
| 390 |
|
---|
[7362] | 391 | # return number of files processed, undef if can't recognise, -1 if
|
---|
| 392 | # cant process
|
---|
[2899] | 393 | # Note that $base_dir might be "" and that $file might
|
---|
| 394 | # include directories
|
---|
| 395 | sub read {
|
---|
[9853] | 396 | my ($self, $pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = (@_);
|
---|
[2899] | 397 | my ($doc_obj, $section, $filepath, $imgtag, $pos, $context, $numdocs, $tndir, $imgs);
|
---|
| 398 | # forward normal read (runs HTMLPlug if index_pages T)
|
---|
[9853] | 399 | my $ok = $self->SUPER::read($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli);
|
---|
[7362] | 400 | if ( ! $ok ) { return $ok } # what is this returning??
|
---|
[2899] | 401 |
|
---|
| 402 | my $outhandle = $self->{'outhandle'};
|
---|
| 403 | my $textref = $self->{'textref'};
|
---|
| 404 | my $htdoc_obj = $self->{'htdoc_obj'};
|
---|
| 405 | $numdocs = 0;
|
---|
| 406 | $base_dir =~ /(.*)\/.*/;
|
---|
| 407 | $tndir = "$1/archives/thumbnails"; # TODO: this path shouldn't be hardcoded?
|
---|
| 408 | &util::mk_all_dir($tndir) unless -e "$tndir";
|
---|
| 409 |
|
---|
| 410 | $imgs = \%{$self->{'imglist'}};
|
---|
| 411 | my $nimgs = $self->get_img_list($textref);
|
---|
| 412 | $self->{'images_indexed'} = $nimgs;
|
---|
| 413 | if ( $nimgs > 0 ) {
|
---|
| 414 | my @fplist = (sort { $imgs->{$a}{'pos'} <=> $imgs->{$b}{'pos'} } keys %{$imgs});
|
---|
| 415 | my $i = 0;
|
---|
| 416 | foreach $filepath ( @fplist ) {
|
---|
| 417 | $pos = $imgs->{$filepath}{'pos'};
|
---|
| 418 | $context = substr ($$textref, $pos - 50, $pos + 50); # grab context (quicker)
|
---|
| 419 | ($imgtag) = ($context =~ /(<(?:img|a|body)\s[^>]*$filepath[^>]*>)/is );
|
---|
| 420 | if (! defined($imgtag)) { $imgtag = $filepath }
|
---|
| 421 | print $outhandle "W3ImgPlug: extracting $filepath\n"
|
---|
| 422 | if ( $self->{'verbosity'} > 1 );
|
---|
| 423 | $doc_obj = new doc ("", "indexed_doc");
|
---|
| 424 | $section = $doc_obj->get_top_section();
|
---|
[10254] | 425 | my $prevpos = ( $i == 0 ? 0 : $imgs->{$fplist[$i - 1]}{'pos'});
|
---|
| 426 | my $nextpos = ( $i >= ($nimgs -1) ? -1 : $imgs->{$fplist[$i + 1]}{'pos'} );
|
---|
[2899] | 427 |
|
---|
| 428 | $self->extract_image_info($imgtag, $filepath, $textref, $doc_obj, $section, $tndir, $prevpos, $nextpos);
|
---|
| 429 | $processor->process($doc_obj);
|
---|
| 430 | $numdocs++;
|
---|
| 431 | $i++;
|
---|
| 432 | }
|
---|
| 433 | return $numdocs;
|
---|
| 434 | } else {
|
---|
| 435 | print $outhandle "W3ImgPlug: No images from $file indexed\n"
|
---|
| 436 | if ( $self->{'verbosity'} > 2 );
|
---|
| 437 | return 1;
|
---|
| 438 | }
|
---|
| 439 |
|
---|
| 440 | }
|
---|
| 441 |
|
---|
| 442 | # for every valid image tag
|
---|
| 443 | # 1. extract related text and image metadata
|
---|
| 444 | # 2. add this as document meta-data
|
---|
| 445 | # 3. add assoc image(s) as files
|
---|
| 446 | #
|
---|
| 447 | sub extract_image_info {
|
---|
| 448 | my $self = shift (@_);
|
---|
| 449 | my ($tag, $id, $textref, $doc_obj, $section, $tndir, $prevpos, $nextpos) = (@_);
|
---|
| 450 | my ($filename, $orig_fp, $fn, $ext, $reltext, $relreltext, $crcid, $imgs,
|
---|
| 451 | $thumbfp, $pagetitle, $alttext, $filepath, $aggr);
|
---|
| 452 | $aggr = $self->{'aggressiveness'};
|
---|
| 453 | $imgs = \%{$self->{'imglist'}};
|
---|
| 454 | $filepath = $imgs->{$id}{'relpath'};
|
---|
| 455 | ($filename) = $filepath =~ /([^\/\\]+)$/s;
|
---|
| 456 | ($orig_fp) = "$self->{'base_path'}/$filepath";
|
---|
| 457 | $orig_fp =~ tr/+/ /;
|
---|
| 458 | $orig_fp =~ s/%([a-fA-F0-9][a-fA-F0-9])/pack("C", hex($1))/eg; # translate %2E to space, etc
|
---|
[2996] | 459 | $orig_fp =~ s/\\/\//g;
|
---|
[2899] | 460 | $filepath = "$self->{'htpath'}/$filepath";
|
---|
[10254] | 461 | my ($onlyfn) = $filename =~ /([^\\\/]*)$/;
|
---|
[2899] | 462 | ($fn, $ext) = $onlyfn =~ /(.*)\.(.*)/;
|
---|
| 463 | $fn = lc $fn; $ext = lc $ext;
|
---|
| 464 | ($reltext) = "<tr><td>GifComment</td><td>" . `identify $filepath -ping -format "%c"` . "</td></tr>\n"
|
---|
| 465 | if ($ext eq "gif");
|
---|
| 466 | $reltext .= "<tr><td>FilePath</td><td>$orig_fp</td></tr>\n";
|
---|
| 467 |
|
---|
| 468 | if ($ENV{'GSDLOS'} =~ /^windows$/i) {
|
---|
| 469 | $crcid = "$fn.$ext." . $self->{'next_crcid'}++;
|
---|
| 470 | } else { ($crcid) = `cksum $filepath` =~ /^(\d+)/; }
|
---|
| 471 | $thumbfp = "$tndir/tn_$crcid.jpg";
|
---|
[10218] | 472 | `convert -flatten -filter Hanning $self->{'convert_params'} -geometry "$self->{'thumb_size'}x$self->{'thumb_size'}>" $filepath $thumbfp` unless -e $thumbfp;
|
---|
[2899] | 473 | if ( ! (-e $thumbfp) ) {
|
---|
| 474 | print STDERR "W3ImgPlug: 'convert' failed. Check ImageMagicK binaries are installed and working correctly\n"; return 0;
|
---|
| 475 | }
|
---|
| 476 |
|
---|
| 477 | # shove in full text (tag stripped or unstripped) if settings require it
|
---|
| 478 | if ( $aggr == 10) {
|
---|
| 479 | $reltext = "<tr><td>AllPage</td><td>" . $$textref . "</td><tr>\n"; # level 10 (all text, verbatim)
|
---|
| 480 | } else {
|
---|
| 481 | $pagetitle = $self->get_meta_value("title", $textref);
|
---|
| 482 | ($alttext) = $tag =~ /\salt\s*=\s*(?:\"|\')(.+?)(?:\"|\')/is;
|
---|
| 483 | if ( defined($alttext) && length($alttext) > 1) {
|
---|
| 484 | $reltext .= "<tr><td>ALTtext</td><td>$alttext</td></tr>\n"; }
|
---|
| 485 | $reltext .= "<tr><td>SplitCapitalisation</td><td>" .
|
---|
| 486 | $self->split_filepath($orig_fp) . "</td></tr>\n";
|
---|
| 487 |
|
---|
| 488 | # get caption/tag based near text (if appropriate)
|
---|
| 489 | if ( $aggr > 1 ) {
|
---|
| 490 | if ( $aggr >= 2 ) {
|
---|
| 491 | $reltext .=
|
---|
| 492 | $self->extract_caption_text($tag, $textref, $prevpos, $imgs->{$id}{'pos'}, $nextpos);
|
---|
| 493 | $relreltext = $reltext;
|
---|
| 494 | }
|
---|
| 495 | # repeat the filepath, alt-text, caption, etc
|
---|
| 496 | if ( $aggr == 8 ) {
|
---|
| 497 | $reltext .= $relreltext;
|
---|
| 498 | }
|
---|
| 499 | if ( $aggr >= 3 ) {
|
---|
| 500 | $reltext .=
|
---|
| 501 | $self->extract_near_text($tag, $textref, $prevpos, $imgs->{$id}{'pos'}, $nextpos);
|
---|
| 502 | }
|
---|
| 503 |
|
---|
| 504 | # get page metadata (if appropriate)
|
---|
| 505 | if ( $aggr >= 6 || ( $aggr >= 2 &&
|
---|
| 506 | ( $self->{'images_indexed'} < 2 ||
|
---|
| 507 | ($self->{'smallpage'} == 1 && $self->{'images_indexed'} < 6 )))) {
|
---|
| 508 | $reltext .= $self->get_page_metadata($textref);
|
---|
| 509 | }
|
---|
| 510 | # textual references
|
---|
| 511 | if ( $aggr == 5 || $aggr >= 7) {
|
---|
[10218] | 512 | if ( length($relreltext) > ($self->{'caption_length'} * 2) ) {
|
---|
[2899] | 513 | $reltext .= $self->get_textrefs($relreltext, $textref, $prevpos, $imgs->{$id}{'pos'}, $nextpos); }
|
---|
| 514 | else {
|
---|
| 515 | $reltext .= $self->get_textrefs($reltext, $textref, $prevpos, $imgs->{$id}{'pos'}, $nextpos);
|
---|
| 516 | }
|
---|
| 517 | }
|
---|
| 518 | } # aggr > 1
|
---|
| 519 | } # aggr != 10
|
---|
| 520 |
|
---|
| 521 | $doc_obj->set_OID($crcid);
|
---|
| 522 | $doc_obj->associate_file($thumbfp, "$fn.thumb.jpg", undef, $section);
|
---|
| 523 | $doc_obj->add_metadata($section, "OriginalFilename", $filename);
|
---|
| 524 | $doc_obj->add_metadata($section, "FilePath", $orig_fp);
|
---|
| 525 | $doc_obj->add_metadata($section, "Filename", $fn);
|
---|
| 526 | $doc_obj->add_metadata($section, "FileExt", $ext);
|
---|
| 527 | $doc_obj->add_metadata($section, "FileSize", $imgs->{$id}{'filesize'});
|
---|
| 528 | $doc_obj->add_metadata($section, "Width", $imgs->{$id}{'width'});
|
---|
| 529 | $doc_obj->add_metadata($section, "Height", $imgs->{$id}{'height'});
|
---|
| 530 | $doc_obj->add_metadata($section, "URL", "http://$orig_fp");
|
---|
| 531 | $doc_obj->add_metadata($section, "PageURL", $self->{'hturl'});
|
---|
| 532 | $doc_obj->add_metadata($section, "PageTitle", $pagetitle);
|
---|
| 533 | $doc_obj->add_metadata($section, "ThumbURL",
|
---|
| 534 | "_httpcollection_/index/assoc/[archivedir]/$fn.thumb.jpg");
|
---|
[8121] | 535 | $doc_obj->add_metadata($section, "FileFormat", "W3Img");
|
---|
[2899] | 536 |
|
---|
| 537 | if ( $self->{'document_text'} ) {
|
---|
| 538 | $doc_obj->add_utf8_text($section, "<table border=1>\n$reltext</table>");
|
---|
| 539 | } else {
|
---|
| 540 | $doc_obj->add_metadata($section, "ImageText", "<table border=1>\n$reltext</table>\n");
|
---|
| 541 | }
|
---|
| 542 |
|
---|
| 543 | if ( $self->{'index_pages'} ) {
|
---|
| 544 | my ($cache_url) = "_httpdoc_&d=" . $self->{'htdoc_obj'}->get_OID();
|
---|
| 545 | if ( $imgs->{$id}{'anchored'} ) {
|
---|
| 546 | my $a_name = $id;
|
---|
| 547 | $a_name =~ s/[\/\\\:\&]/_/g;
|
---|
| 548 | $cache_url .= "#gsdl_$a_name" ;
|
---|
| 549 | }
|
---|
| 550 | $doc_obj->add_utf8_metadata($section, "CachePageURL", $cache_url);
|
---|
| 551 | }
|
---|
| 552 | if ( ! $self->{'no_cache_images'} ) {
|
---|
| 553 | $onlyfn = lc $onlyfn;
|
---|
| 554 | $doc_obj->associate_file($filepath, $onlyfn, undef, $section);
|
---|
| 555 | $doc_obj->add_utf8_metadata($section, "CacheURL",
|
---|
| 556 | "_httpcollection_/index/assoc/[archivedir]/$onlyfn");
|
---|
| 557 | }
|
---|
| 558 | return 1;
|
---|
| 559 | }
|
---|
| 560 |
|
---|
| 561 | sub get_page_metadata {
|
---|
| 562 | my ($self, $textref) = (@_);
|
---|
| 563 | my (@rval);
|
---|
| 564 | $rval[0] = $self->get_meta_value("title", $textref);
|
---|
| 565 | $rval[1] = $self->get_meta_value("keywords", $textref);
|
---|
| 566 | $rval[2] = $self->get_meta_value("description", $textref);
|
---|
| 567 | $rval[3] = $self->{'filename'};
|
---|
| 568 |
|
---|
| 569 | return wantarray ? @rval : "<tr><td>PageMeta</td><td>@rval</td></tr>\n" ;
|
---|
| 570 | }
|
---|
| 571 |
|
---|
| 572 | # turns LargeCatFish into Large,Cat,Fish so MG sees the separate words
|
---|
| 573 | sub split_filepath {
|
---|
| 574 | my ($self, $filepath) = (@_);
|
---|
| 575 | my (@words) = $filepath =~ /([A-Z][a-z]+)/g;
|
---|
| 576 | return join(',', @words);
|
---|
| 577 | }
|
---|
| 578 |
|
---|
| 579 | # finds and extracts sentences
|
---|
| 580 | # that seem to be on the same topic
|
---|
| 581 | # as other related text (correlations)
|
---|
| 582 | # and textual references (e.g. in figure 3 ...)
|
---|
| 583 | sub get_textrefs {
|
---|
| 584 | my ($self, $reltext, $textref, $prevpos, $pos, $nextpos) = (@_);
|
---|
| 585 | my ($maxtext, $mintext, $startpos, $context_size, $context);
|
---|
| 586 |
|
---|
| 587 | my (@relwords, @refwords, %sentences, @pagemeta);
|
---|
| 588 |
|
---|
| 589 | # extract larger context
|
---|
[10218] | 590 | $maxtext = $self->{'max_near_text'};
|
---|
[2899] | 591 | $startpos = $pos - ($maxtext * 4);
|
---|
| 592 | $context_size = $maxtext*10;
|
---|
| 593 | if ($startpos < $prevpos ) { $startpos = $prevpos }
|
---|
| 594 | if ($nextpos != -1 && $context_size > ( $nextpos - $startpos )) { $context_size = ($nextpos - $startpos) }
|
---|
| 595 | $context = substr ( $$textref, $startpos, $context_size );
|
---|
| 596 | $context =~ s/<.*?>//gs;
|
---|
| 597 | $context =~ s/^.*>(.*)/$1/gs;
|
---|
| 598 | $context =~ s/(.*)<.*$/$1/gs;
|
---|
| 599 |
|
---|
| 600 | # get page meta-data (if not already included)
|
---|
| 601 | if ( $self->{'aggressiveness'} == 5 && ! $self->{'smallpage'} ) {
|
---|
| 602 | @pagemeta = $self->get_page_metadata($textref);
|
---|
[10254] | 603 | foreach my $value ( @pagemeta ) {
|
---|
[2899] | 604 | $context .= "$value."; # make each into psuedo-sentence
|
---|
| 605 | }
|
---|
| 606 | }
|
---|
| 607 |
|
---|
| 608 | # TODO: this list is not exhaustive
|
---|
| 609 | @refwords = ( '(?:is|are)? ?(?:show(?:s|n)|demonstrate(?:d|s)|explains|features) (?:in|by|below|above|here)',
|
---|
| 610 | '(?:see)? (?:figure|table)? (?:below|above)');
|
---|
| 611 |
|
---|
| 612 | # extract general references
|
---|
[10254] | 613 | foreach my $rw ( @refwords ) {
|
---|
[2899] | 614 | while ( $context =~ /[\.\?\!\,](.*?$rw\W.*?[\.\?\!\,])/ig ) {
|
---|
[10254] | 615 | my $sentence = $1;
|
---|
[2899] | 616 | $sentence =~ s/\s+/ /g;
|
---|
| 617 | $sentences{$sentence}+=2;
|
---|
| 618 | }
|
---|
| 619 | }
|
---|
| 620 | # extract specific (figure, table) references by number
|
---|
| 621 | my ($fignum) = $context =~ /[\.\?\!].*?(?:figure|table)s?[\-\_\ \.](\d+\w*)\W.*?[\.\?\!]/ig;
|
---|
| 622 | if ( $fignum ) {
|
---|
[10254] | 623 | foreach my $rw ( @refwords ) {
|
---|
[2899] | 624 | while ( $context =~ /[\.\?\!](.*?(figure|table)[\-\_\ \.]$fignum\W.*?[\.\?\!])/ig ) {
|
---|
[10254] | 625 | my $sentence = $1;
|
---|
[2899] | 626 | $sentence =~ s/\s+/ /g;
|
---|
| 627 | $sentences{$sentence}+=4;
|
---|
| 628 | }
|
---|
| 629 | }
|
---|
| 630 | }
|
---|
| 631 |
|
---|
| 632 | # sentences with occurances of important words
|
---|
| 633 | @relwords = $reltext =~ /([a-zA-Z]{4,})/g; # take out small words
|
---|
[10254] | 634 | foreach my $word ( @relwords ) {
|
---|
[2899] | 635 | if ( $self->{'stopwords'}{$word} ) { next } # skip stop words
|
---|
| 636 | while ( $context =~ /([^\.\?\!]*?$word\W.*?[\.\?\!])/ig ) {
|
---|
[10254] | 637 | my $sentence = $1;
|
---|
[2899] | 638 | $sentence =~ s/\s+/ /g;
|
---|
| 639 | $sentences{$sentence}++;
|
---|
| 640 | }
|
---|
| 641 | }
|
---|
[10254] | 642 | foreach my $sentence ( keys %sentences ) {
|
---|
[10218] | 643 | if ($sentences{$sentence} < $self->{'textrefs_threshold'}) {
|
---|
[2899] | 644 | delete $sentences{$sentence};
|
---|
| 645 | }
|
---|
| 646 | }
|
---|
| 647 | my ($rval) = join "<br>\n", (keys %sentences);
|
---|
| 648 | if ( $rval && length($rval) > 5 ) {
|
---|
| 649 | return ( "<tr><td>TextualReferences</td><td>" . $rval . "</td></tr>\n") }
|
---|
| 650 | else { return "" }
|
---|
| 651 | }
|
---|
| 652 |
|
---|
| 653 | # handles caption extraction
|
---|
| 654 | # calling the extractor with different
|
---|
| 655 | # tags and choosing the best candidate caption
|
---|
| 656 | sub extract_caption_text {
|
---|
| 657 | my ($self, $tag, $textref, $prevpos, $pos, $nextpos) = (@_);
|
---|
[10254] | 658 | my (@neartext, $len, $hdelim, $mintext, $goodlen,
|
---|
[2899] | 659 | $startpos, $context, $context_size);
|
---|
| 660 |
|
---|
[10218] | 661 | $mintext = $self->{'min_near_text'};
|
---|
| 662 | $goodlen = $self->{'caption_length'};
|
---|
[2899] | 663 |
|
---|
| 664 | # extract a context to extract near text from (faster)
|
---|
[10218] | 665 | $context_size = $self->{'max_near_text'}*3;
|
---|
[2899] | 666 | $startpos = $pos - ($context_size / 2);
|
---|
| 667 | if ($startpos < $prevpos ) { $startpos = $prevpos }
|
---|
| 668 | if ($nextpos != -1 && $context_size > ( $nextpos - $startpos ))
|
---|
| 669 | { $context_size = ($nextpos - $startpos) }
|
---|
| 670 |
|
---|
| 671 | $context = substr ( $$textref, $startpos, $context_size );
|
---|
| 672 | $context =~ s/<!--.*?-->//gs;
|
---|
| 673 | $context =~ s/^.*-->(.*)/$1/gs;
|
---|
| 674 | $context =~ s/(.*)<!--.*$/$1/gs;
|
---|
| 675 |
|
---|
| 676 | # try stepping through markup delimiter sets
|
---|
| 677 | # and selecting the best one
|
---|
| 678 | foreach $hdelim ( @{ $self->{'cdelims'} } ) {
|
---|
| 679 | @neartext = $self->extract_caption($tag, $hdelim, \$context);
|
---|
| 680 | $len = length(join("", @neartext));
|
---|
| 681 | last if ($len >= $mintext && $len <= $goodlen);
|
---|
| 682 | }
|
---|
| 683 | # reject if well over reasonable length
|
---|
| 684 | if ( $len > $goodlen ) {
|
---|
| 685 | @neartext = [];
|
---|
| 686 | }
|
---|
| 687 | $neartext[0] = " " if (! defined $neartext[0]);
|
---|
| 688 | $neartext[1] = " " if (! defined $neartext[1]);
|
---|
| 689 | return "<tr><td>Caption</td><td>" . (join ",", @neartext) . "</td></tr>\n"; # TODO: the | is for testing purposes
|
---|
| 690 | } # end extract_caption_text
|
---|
| 691 |
|
---|
| 692 | # the previous section header often gives a bit
|
---|
| 693 | # of context to the section that the image is
|
---|
| 694 | # in (invariably the header is before/above the image)
|
---|
| 695 | # so extract the text of the closest header above the image
|
---|
| 696 | #
|
---|
| 697 | # this fn just gets all the headers above the image, within the context window
|
---|
| 698 | sub get_prev_header {
|
---|
| 699 | my ($self, $pos, $textref) = (@_);
|
---|
| 700 | my ($rhtext);
|
---|
| 701 | while ( $$textref =~ /<h\d>(.*?)<\/h\d>/sig ) {
|
---|
| 702 | # only headers before image
|
---|
| 703 | if ((pos $$textref) < $pos) {
|
---|
| 704 | $rhtext .= "$1, ";
|
---|
| 705 | }
|
---|
| 706 | }
|
---|
| 707 | if ( $rhtext ) { return "Header($rhtext)" }
|
---|
| 708 | else { return "" }
|
---|
| 709 | }
|
---|
| 710 |
|
---|
| 711 | # not the most robust tag stripping
|
---|
| 712 | # regexps (see perl.com FAQ) but good enough
|
---|
| 713 | #
|
---|
| 714 | # used by caption & tag-based near text algorithms
|
---|
| 715 | sub strip_tags {
|
---|
| 716 | my ( $self, $value ) = @_;
|
---|
| 717 | if ( ! defined($value) ) { $value = "" } # handle nulls
|
---|
| 718 | else {
|
---|
| 719 | $value =~ s/<.*?>//gs; # strip all html tags
|
---|
| 720 | $value =~ s/\s+/\ /g; # remove extra whitespace
|
---|
| 721 | $value =~ s/\&\w+\;//g; # remove etc
|
---|
| 722 | }
|
---|
| 723 | return $value;
|
---|
| 724 | }
|
---|
| 725 |
|
---|
| 726 | # uses the given tag(s) to identify
|
---|
| 727 | # the caption near to the image
|
---|
| 728 | # (below, above or both below and above)
|
---|
| 729 | sub extract_caption {
|
---|
| 730 | my ($self, $tag, $bound_tag, $contextref) = (@_);
|
---|
| 731 | my (@nt, $n, $etag, $gotcap);
|
---|
| 732 | return ("", "") if ( ! ($$contextref =~ /\Q$tag/) );
|
---|
| 733 |
|
---|
| 734 | $nt[0] = $`;
|
---|
| 735 | $nt[1] = $';
|
---|
| 736 | $gotcap = 0;
|
---|
| 737 |
|
---|
| 738 | # look before the image for a boundary tag
|
---|
| 739 | ($etag, $nt[0]) = $nt[0] =~ /<($bound_tag)[\s]?.*?>(.*?)$/is;
|
---|
| 740 | # if bound_tag too far from the image, then prob not caption
|
---|
| 741 | # (note: have to allow for tags, so multiply by 3
|
---|
[10218] | 742 | if ( $etag && length($nt[0]) < ($self->{'caption_length'} * 3) ) {
|
---|
[2899] | 743 | if ( $nt[0] =~ /<\/$etag>/si ) {
|
---|
| 744 | # the whole caption is above the image: <tag>text</tag><img>
|
---|
| 745 | ($nt[0]) =~ /<(?:$etag)[\s]?.*?>(.*?)<\/$etag>/is;
|
---|
| 746 | $nt[0] = $self->strip_tags($nt[0]);
|
---|
[10218] | 747 | if ( length($nt[0]) > $self->{'min_near_text'} ) {
|
---|
[2899] | 748 | $gotcap = 1;
|
---|
| 749 | $nt[1] = "";
|
---|
| 750 | }
|
---|
| 751 |
|
---|
| 752 | } elsif ( $nt[1] =~ /<\/$etag>/si) {
|
---|
| 753 | # the caption tag covers image: <tag>text?<img>text?</tag>
|
---|
| 754 | ($nt[1]) = $nt[1] =~ /(.*?)<\/$etag>/si;
|
---|
| 755 | $nt[0] = $self->strip_tags($nt[0] . $nt[1]);
|
---|
[10218] | 756 | if ( length($nt[0]) > $self->{'min_near_text'} ) {
|
---|
[2899] | 757 | $gotcap = 2;
|
---|
| 758 | $nt[1] = "";
|
---|
| 759 | }
|
---|
| 760 | }
|
---|
| 761 | }
|
---|
| 762 | # else try below the image
|
---|
| 763 | if ( ! $gotcap ) {
|
---|
| 764 | # the caption is after the image: <img><tag>text</tag>
|
---|
| 765 | ($etag, $nt[1]) = $nt[1] =~ /^.*?<($bound_tag)[\s]?.*?>(.*)/is;
|
---|
| 766 | if ( $etag && $nt[1] =~ /<\/$etag>/s) {
|
---|
| 767 | ($nt[1]) = $nt[1] =~ /(.*?)<\/$etag>/si;
|
---|
| 768 | $gotcap = 3;
|
---|
| 769 | $nt[0] = "";
|
---|
| 770 | $nt[1] = $self->strip_tags($nt[1]);
|
---|
| 771 | }
|
---|
| 772 | }
|
---|
| 773 | if ( ! $gotcap ) { $nt[0] = $nt[1] = "" }
|
---|
| 774 | else {
|
---|
| 775 | # strip part-tags
|
---|
| 776 | $nt[0] =~ s/^.*>//s;
|
---|
| 777 | $nt[1] =~ s/<.*$//s;
|
---|
| 778 | }
|
---|
| 779 | my ($type);
|
---|
| 780 | if ( $gotcap == 0 ) { return ("nocaption", "") }
|
---|
| 781 | elsif ( $gotcap == 1 ) { $type = "captionabove:" }
|
---|
| 782 | elsif ( $gotcap == 2 ) { $type = "captioncovering:" }
|
---|
| 783 | elsif ( $gotcap == 3 ) { $type = "captionbelow:" }
|
---|
| 784 | return ($type, $nt[0], $nt[1]);
|
---|
| 785 | }
|
---|
| 786 |
|
---|
| 787 | # tag-based near text
|
---|
| 788 | #
|
---|
| 789 | # tries different tag sets
|
---|
| 790 | # and chooses the best one
|
---|
| 791 | sub extract_near_text {
|
---|
| 792 | my ($self, $tag, $textref, $prevpos, $pos, $nextpos) = (@_);
|
---|
| 793 | my (@neartext, $len, $hdelim, $maxtext, $mintext, $goodlen,
|
---|
| 794 | @bestlen, @best, $startpos, $context, $context_size,
|
---|
| 795 | $dist, $bdist, $best1, $i, $nt);
|
---|
| 796 | $bestlen[0] = $bestlen[1] = 0; $bestlen[2] = $bdist = 999999;
|
---|
| 797 | $best[0] = $best[1] = $best[2] = "";
|
---|
[10218] | 798 | $maxtext = $self->{'max_near_text'};
|
---|
| 799 | $mintext = $self->{'min_near_text'};
|
---|
| 800 | $goodlen = $self->{'neartext_length'};
|
---|
[2899] | 801 |
|
---|
| 802 | # extract a context to extract near text from (faster)
|
---|
| 803 | $context_size = $maxtext*4;
|
---|
| 804 | $startpos = $pos - ($context_size / 2);
|
---|
| 805 | if ($startpos < $prevpos ) { $startpos = $prevpos }
|
---|
| 806 | if ($nextpos != -1 && $context_size > ( $nextpos - $startpos ))
|
---|
| 807 | { $context_size = ($nextpos - $startpos) }
|
---|
| 808 | $context = substr ( $$textref, $startpos, $context_size );
|
---|
| 809 | $context =~ s/<!--.*?-->//gs;
|
---|
| 810 | $context =~ s/^.*-->(.*)/$1/gs;
|
---|
| 811 | $context =~ s/(.*)<!--.*$/$1/gs;
|
---|
| 812 |
|
---|
| 813 | # try stepping through markup delimiter sets
|
---|
| 814 | # and selecting the best one
|
---|
| 815 | foreach $hdelim ( @{ $self->{'delims'} } ) {
|
---|
| 816 | @neartext = $self->extract_tagged_neartext($tag, $hdelim, \$context);
|
---|
| 817 | $nt = join("", @neartext);
|
---|
| 818 | $len = length($nt);
|
---|
| 819 | # Priorities:
|
---|
| 820 | # 1. Greater than mintext
|
---|
| 821 | # 2. Less than maxtext
|
---|
| 822 | # 3. Closest to goodlen
|
---|
| 823 | if ( $len <= $goodlen && $len > $bestlen[0] ) {
|
---|
| 824 | $bestlen[0] = $len;
|
---|
| 825 | $best[0] = $hdelim;
|
---|
| 826 | } elsif ( $len >= $maxtext && $len < $bestlen[2] ) {
|
---|
| 827 | $bestlen[2] = $len;
|
---|
| 828 | $best[2] = $hdelim;
|
---|
| 829 | } elsif ( $len >= $bestlen[0] && $len <= $bestlen[2] ) {
|
---|
| 830 | $dist = abs($goodlen - $len);
|
---|
| 831 | if ( $dist < $bdist ) {
|
---|
| 832 | $bestlen[1] = $len;
|
---|
| 833 | $best[1] = $hdelim;
|
---|
| 834 | $bdist = $dist;
|
---|
| 835 | }
|
---|
| 836 | }
|
---|
| 837 | }
|
---|
| 838 | $best1 = 2;
|
---|
| 839 | foreach $i ( 0..2 ) {
|
---|
| 840 | if ( $bestlen[$i] == 999999 ) { $bestlen[$i] = 0 }
|
---|
| 841 | $dist = abs($goodlen - $bestlen[$i]);
|
---|
| 842 | if ( $bestlen[$i] > $mintext && $dist <= $bdist ) {
|
---|
| 843 | $best1 = $i;
|
---|
| 844 | $bdist = $dist;
|
---|
| 845 | }
|
---|
| 846 | }
|
---|
| 847 | @neartext = $self->extract_tagged_neartext($tag, $best[$best1], \$context);
|
---|
| 848 | if ( $bestlen[$best1] > $maxtext ) {
|
---|
| 849 | # truncate on word boundary if too much text
|
---|
| 850 | my $hmax = $maxtext / 2;
|
---|
| 851 | ($neartext[0]) = $neartext[0] =~ /([^\s]*.{1,$hmax})$/s;
|
---|
| 852 | ($neartext[1]) = $neartext[1] =~ /^(.{1,$hmax}[^\s]*)/s;
|
---|
| 853 | } elsif ( $bestlen[$best1] < $mintext ) {
|
---|
| 854 | # use plain text extraction if tags failed (e.g. usable tag outside context)
|
---|
| 855 | print {$self->{'outhandle'}} "W3ImgPlug: Fallback to plain-text extraction for $tag\n"
|
---|
| 856 | if $self->{'verbosity'} > 2;
|
---|
| 857 | $neartext[0] = "<tr><td>RawNeartext</td><td>" . $self->extract_raw_neartext($tag, $textref) . "</td></tr>";
|
---|
| 858 | $neartext[1] = "";
|
---|
| 859 | }
|
---|
| 860 | # get previous header if available
|
---|
| 861 | $neartext[0] .= "<br>\n" .
|
---|
| 862 | $self->get_prev_header($pos, \$context) if ( $self->{'aggressiveness'} >= 4 );
|
---|
| 863 | $neartext[0] = " " if (! defined $neartext[0]);
|
---|
| 864 | $neartext[1] = " " if (! defined $neartext[1]);
|
---|
| 865 |
|
---|
| 866 | return "<tr><td>NearText</td><td>" . (join "|", @neartext) . "</td></tr>\n"; # TODO: the | is for testing purposes
|
---|
| 867 | } # end extract_near_text
|
---|
| 868 |
|
---|
| 869 | # actually captures tag-based
|
---|
| 870 | # near-text given a tag set
|
---|
| 871 | sub extract_tagged_neartext {
|
---|
| 872 | my ($self, $tag, $bound_tag, $textref) = (@_);
|
---|
| 873 | return "" if ( ! ($$textref =~ /\Q$tag/) );
|
---|
| 874 | my (@nt, $delim, $pre_tag, $n);
|
---|
| 875 | $nt[0] = $`;
|
---|
| 876 | $nt[1] = $';
|
---|
| 877 |
|
---|
| 878 | # get text after previous image tag
|
---|
| 879 | $nt[0] =~ s/.*<($bound_tag)[^>]*>(.*)/$2/is; # get rid of preceding text
|
---|
| 880 | if (defined($1)) { $delim = $1 }
|
---|
| 881 | $pre_tag = $bound_tag;
|
---|
| 882 |
|
---|
| 883 | if (defined($delim)) {
|
---|
| 884 | # we want to try and use the end tag of the previous delimiter
|
---|
| 885 | # (put it on the front of the list)
|
---|
| 886 | $pre_tag =~ s/(^|\|)($delim)($|\|)//i; # take it out
|
---|
| 887 | $pre_tag =~ s/\|\|/\|/i; # replace || with |
|
---|
| 888 | $pre_tag = $delim . "|" . $pre_tag; # put it on the front
|
---|
| 889 | }
|
---|
| 890 |
|
---|
| 891 | # get text before next image tag
|
---|
| 892 | $nt[1] =~ s/<\/?(?:$pre_tag)[^>]*>.*//is; # get rid of stuff after first delimiter
|
---|
| 893 |
|
---|
| 894 | # process related text
|
---|
| 895 | for $n (0..1) {
|
---|
| 896 | if ( defined($nt[$n]) ) {
|
---|
| 897 | $nt[$n] =~ s/<.*?>//gs; # strip all html tags
|
---|
| 898 | $nt[$n] =~ s/\s+/\ /gs; # remove extra whitespace
|
---|
| 899 | $nt[$n] =~ s/\&\w+\;//sg; # remove etc
|
---|
| 900 | # strip part-tags
|
---|
| 901 | if ( $n == 0 ) { $nt[0] =~ s/^.*>//s }
|
---|
| 902 | if ( $n == 1 ) { $nt[1] =~ s/<.*$//s }
|
---|
| 903 | } else { $nt[$n] = ""; } # handle nulls
|
---|
| 904 | }
|
---|
| 905 | return @nt;
|
---|
| 906 | }
|
---|
| 907 |
|
---|
| 908 | # this function is fall-back
|
---|
| 909 | # if tags aren't suitable.
|
---|
| 910 | #
|
---|
| 911 | # extracts a fixed length of characters
|
---|
| 912 | # either side of image tag (on word boundary)
|
---|
| 913 | sub extract_raw_neartext {
|
---|
| 914 | my ($self, $tag, $textref) = (@_);
|
---|
| 915 | my ($rawtext, $startpos, $fp);
|
---|
| 916 | my $imgs = \%{$self->{'imglist'}};
|
---|
| 917 | ($fp) = $tag =~ /([\w\\\/]+\.(?:gif|jpe?g|png))/is;
|
---|
| 918 | if (! $fp) { return " " };
|
---|
| 919 | # if the cached, plain-text version isn't there, then create it
|
---|
| 920 | $self->init_plaintext($textref) unless defined($self->{'plaintext'});
|
---|
| 921 |
|
---|
| 922 | # take the closest maxtext/2 characters
|
---|
| 923 | # either side of the tag (by word boundary)
|
---|
| 924 | return "" if ( ! exists $imgs->{$fp}{'rawpos'} );
|
---|
[10218] | 925 | $startpos = $imgs->{$fp}{'rawpos'} - (($self->{'max_near_text'} / 2) + 20);
|
---|
[2899] | 926 | if ( $startpos < 0 ) { $startpos = 0 }
|
---|
[10218] | 927 | $rawtext = substr $self->{'plaintext'}, $startpos, $self->{'max_near_text'} + 20;
|
---|
[2899] | 928 | $rawtext =~ s/\s\s/ /g;
|
---|
| 929 |
|
---|
| 930 | return $rawtext;
|
---|
| 931 | }
|
---|
| 932 |
|
---|
| 933 | # init plaintext variable for HTML-stripped version
|
---|
| 934 | # (for full text index/raw assoc text extraction)
|
---|
| 935 | sub init_plaintext {
|
---|
| 936 | my ($self, $textref) = (@_);
|
---|
| 937 | my ($page, $fp);
|
---|
| 938 | my $imgs = \%{$self->{'imglist'}};
|
---|
| 939 | $page = $$textref; # make a copy of original
|
---|
| 940 |
|
---|
| 941 | # strip tags around image filenames so they don't get zapped
|
---|
| 942 | $page =~ s/<\w+\s+.*?([\w\/\\]+\.(?:gif|jpe?g|png))[^>]*>/\"$1\"/gsi;
|
---|
| 943 | $page =~ s/<.*?>//gs;
|
---|
| 944 | $page =~ s/ / /gs;
|
---|
| 945 | $page =~ s/&/&/gs; #TODO: more &zzz; replacements (except <, $gt;)
|
---|
| 946 |
|
---|
| 947 | # get positions and strip images
|
---|
| 948 | while ( $page =~ /([^\s\'\"]+\.(jpe?g|gif|png))/ig ) {
|
---|
| 949 | $fp = $1;
|
---|
| 950 | if ( $imgs->{$fp}{'exists'} ) {
|
---|
| 951 | $imgs->{$fp}{'rawpos'} = pos $page;
|
---|
| 952 | }
|
---|
| 953 | $page =~ s/\"$fp\"//gs;
|
---|
| 954 | }
|
---|
| 955 | $self->{'plaintext'} = $page;
|
---|
| 956 | }
|
---|
| 957 |
|
---|
| 958 | # finds and filters images based on size
|
---|
| 959 | # (dimensions, height, filesize) and existence
|
---|
| 960 | #
|
---|
| 961 | # looks for image filenames (.jpg, .gif, etc)
|
---|
| 962 | # and checks for existence on disk
|
---|
| 963 | # (hence supports most JavaScript images)
|
---|
| 964 | sub get_img_list {
|
---|
| 965 | my $self = shift (@_);
|
---|
| 966 | my ($textref) = (@_);
|
---|
| 967 | my ($filepath, $relpath, $abspath, $pos, $num, $width, $height, $filesize);
|
---|
| 968 | my $imgs = \%{$self->{'imglist'}};
|
---|
| 969 | while ( $$textref =~ /([^\s\'\"]+\.(jpe?g|gif|png))/ig ) {
|
---|
| 970 | $filepath = $1;
|
---|
| 971 | $pos = pos $$textref;
|
---|
| 972 | next if ( $imgs->{$filepath}{'relpath'} );
|
---|
| 973 | $relpath = $filepath;
|
---|
| 974 | $relpath =~ s/^http\:\/\///; # remove http:// in case we have mirrored it
|
---|
| 975 | $relpath =~ s/\\/\//g; # replace \ with /
|
---|
| 976 | $relpath =~ s/^\.\///s; # make "./filepath" into "filepath"
|
---|
| 977 | $imgs->{$filepath}{'relpath'} = $relpath;
|
---|
| 978 | $abspath = "$self->{'htpath'}/$relpath";
|
---|
| 979 |
|
---|
| 980 | if (! -e $abspath) { next }
|
---|
| 981 |
|
---|
| 982 | # can't modify real filepath var because it
|
---|
| 983 | # then can't be located in the page for tag recognition later
|
---|
| 984 | ($width, $height) =
|
---|
| 985 | `identify $abspath -ping -format "%wx%h"` =~ /^(\d*)x(\d*)$/m;
|
---|
| 986 | if (! ($width && $height)) {
|
---|
| 987 | print STDERR "W3ImgPlug: ($abspath) 'identify' failed. Check ImageMagicK binaries are installed and working correctly\n"; next;
|
---|
| 988 | }
|
---|
| 989 | $filesize = (-s $abspath);
|
---|
[10218] | 990 | if ( $filesize >= $self->{'min_size'}
|
---|
| 991 | && ( $width >= $self->{'min_width'} )
|
---|
| 992 | && ( $height >= $self->{'min_height'} ) ) {
|
---|
[2899] | 993 |
|
---|
| 994 | $imgs->{$filepath}{'exists'} = 1;
|
---|
| 995 | $imgs->{$filepath}{'pos'} = $pos;
|
---|
| 996 | $imgs->{$filepath}{'width'} = $width;
|
---|
| 997 | $imgs->{$filepath}{'height'} = $height;
|
---|
| 998 | $imgs->{$filepath}{'filesize'} = $filesize;
|
---|
| 999 | } else {
|
---|
| 1000 | print {$self->{'outhandle'}} "W3ImgPlug: skipping $self->{'base_path'}/$relpath: $filesize, $width x $height\n"
|
---|
| 1001 | if $self->{'verbosity'} > 2;
|
---|
| 1002 | }
|
---|
| 1003 | }
|
---|
| 1004 | $num = 0;
|
---|
[10254] | 1005 | foreach my $i ( keys %{$imgs} ) {
|
---|
[2899] | 1006 | if ( $imgs->{$i}{'pos'} ) {
|
---|
| 1007 | $num++;
|
---|
| 1008 | } else { delete $imgs->{$i} }
|
---|
| 1009 | }
|
---|
| 1010 | return $num;
|
---|
| 1011 | }
|
---|
| 1012 |
|
---|
| 1013 | # make the text available to the read function
|
---|
| 1014 | # by making it an object variable
|
---|
| 1015 | sub read_file {
|
---|
| 1016 | my ($self, $filename, $encoding, $language, $textref) = @_;
|
---|
| 1017 | $self->SUPER::read_file($filename, $encoding, $language, $textref);
|
---|
| 1018 |
|
---|
| 1019 | # if HTMLplug has run through, then it will
|
---|
| 1020 | # have replaced references so we have to
|
---|
| 1021 | # make a copy of the text before processing
|
---|
| 1022 | if ( $self->{'index_pages'} ) {
|
---|
| 1023 | $self->{'text'} = $$textref;
|
---|
| 1024 | $self->{'textref'} = \($self->{'text'});
|
---|
| 1025 | } else {
|
---|
| 1026 | $self->{'textref'} = $textref;
|
---|
| 1027 | }
|
---|
| 1028 | $self->{'plaintext'} = undef;
|
---|
| 1029 | }
|
---|
| 1030 |
|
---|
| 1031 | # HTMLPlug only extracts meta-data if it is specified in plugin options
|
---|
| 1032 | # hence a special function to do it here
|
---|
| 1033 | sub get_meta_value {
|
---|
| 1034 | my ($self, $name, $textref) = @_;
|
---|
| 1035 | my ($value);
|
---|
| 1036 | $name = lc $name;
|
---|
| 1037 | if ($name eq "title") {
|
---|
| 1038 | ($value) = $$textref =~ /<title>(.*?)<\/title>/is
|
---|
| 1039 | } else {
|
---|
| 1040 | my $qm = "(?:\"|\')";
|
---|
| 1041 | ($value) = $$textref =~ /<meta name\s*=\s*$qm?$name$qm?\s+content\s*=\s*$qm?(.*?)$qm?\s*>/is
|
---|
| 1042 | }
|
---|
| 1043 | $value = "" unless $value;
|
---|
| 1044 | return $value;
|
---|
| 1045 | }
|
---|
| 1046 |
|
---|
| 1047 | # make filename an anchor reference
|
---|
| 1048 | # so we can go straight to the image
|
---|
| 1049 | # within the cached version of the source page
|
---|
| 1050 | # (augment's HTMLPlug sub)
|
---|
| 1051 | sub replace_images {
|
---|
| 1052 | my $self = shift (@_);
|
---|
| 1053 | my ($front, $link, $back, $base_dir,
|
---|
| 1054 | $file, $doc_obj, $section) = @_;
|
---|
[2996] | 1055 | $link =~ s/\"//g;
|
---|
[2899] | 1056 | my ($a_name) = $link;
|
---|
| 1057 | $a_name =~ s/[\/\\\:\&]/_/g;
|
---|
| 1058 | # keep a list so we don't repeat the same anchor
|
---|
| 1059 | if ( ! $self->{'imglist'}{$link}{'anchored'} ) {
|
---|
| 1060 | $front = "<a name=\"gsdl_$a_name\">$front";
|
---|
| 1061 | $back = "$back</a>";
|
---|
| 1062 | $self->{'imglist'}{$link}{'anchored'} = 1;
|
---|
| 1063 | }
|
---|
| 1064 | return $self->SUPER::replace_images($front, $link, $back, $base_dir,
|
---|
| 1065 | $file, $doc_obj, $section);
|
---|
| 1066 | }
|
---|
| 1067 |
|
---|
| 1068 | 1;
|
---|