root/gsdl/trunk/perllib/plugins/HTMLImagePlugin.pm @ 18327

Revision 18327, 37.1 KB (checked in by ak19, 11 years ago)

Extra parameter to new doc(): the renaming method to be used on the file (base64 or URL encoding).

  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# HTMLImagePlugin.pm -- Context-based image indexing plugin for HTML documents
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 2001 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27# DESCRIPTION:
28#
29#  Extracts images and associated text and metadata from
30#  web pages as individual documents for indexing. Thumbnails
31#  are created from each image for browsing purposes.
32#
33#  Options are available for configuring the aggressiveness of the
34#  associated text extraction mechanisms. A higher level of
35#  aggressiveness will extract more text and consequently
36#  may mean lower accuracy (precision); however, it may also
37#  retrieve more of the relevant images from the collection (recall).
38#  Lower levels of aggressiveness maybe result in slightly faster
39#  collection builds at the import stage.
40#
41#  HTMLImagePlugin is a subclass of HTMLPlug (i.e. it will index pages also
42#  if required). It can be used in place of HTMLPlugin to index both
43#  pages and their images.
44#
45# REQUIREMENTS:
46#   
47#  The ImageMagick image manipulation is used to create
48#  thumbnails and extract some image metadata. (Available
49#  from http://www.imagemagick.org/)
50#
51#  Unix:
52#    Many Linux distributions contain ImageMagick.
53#
54#  Windows:
55#    ImageMagick can be downloaded from the website above.
56#    Make sure the system path includes the ImageMagick binaries
57#    before using HTMLImagePlugin.
58#
59#    NOTE: NT/2000/XP contain a filesystem utility 'convert.exe'
60#    with the same name as the image conversion utility. The
61#    ImageMagick FAQ recommends renaming the filesystem
62#    utility (e.g. to 'fsconvert.exe') to avoid this clash.
63#
64# USAGE: 
65#
66#  An image document consists of metadata elements:
67#
68#   OriginalFilename, FilePath, Filename, FileExt, FileSize,
69#   Width, Height, URL, PageURL, ThumbURL, CacheURL, CachePageURL
70#   ImageText, PageTitle
71#
72#  Most of these are only useful in format strings (e.g. ThumbURL,
73#  Filename, URL, PageURL, CachePageURL).
74#
75#  ImageText, as the name suggests contains the indexable text.
76#  (unless using the -document_text plugin option)
77#
78#  Since image documents are made up of metadata elements
79#  alone, format strings are needed to display them properly.
80#  NOTE: The receptionist will only display results (e.g. thumbnails)
81#  in 4 columns if the format string begins with "<td><table>".
82#
83#  The example below takes the user to the image within the
84#  source HTML document rather than using a format string
85#  on DocumentText to display the image document itself.
86#
87#  Example collect.cfg:
88#
89#   ...
90#
91#   indexes document:ImageText document:text
92#   defaultindex document:ImageText
93
94#   collectionmeta .document:ImageText "images"
95#   collectionmeta .document:text "documents"
96#
97#   ...
98#
99#   plugin HTMLImagePlugin -index_pages -aggressiveness 6
100#
101#   ...
102
103#   format SearchVList '<td>{If}{[Title],[link][icon]&nbsp;[Title][[/link],
104#    <table><tr><td align="center"><a href="[CachePageURL]">
105#    <img src="[ThumbURL]"></a></td></tr><tr><td align="center">
106#    <a href="[CachePageURL]"><font size="-1">[OriginalFilename]</font></a>
107#    <br>[Width]x[Height]</td></tr></table>}</td>'
108#
109#   ...
110#
111 
112package HTMLImagePlugin;
113
114use HTMLPlugin;
115use ghtml;
116use unicode;
117use util;
118use strict; # 'subs';
119no strict 'refs'; # allow filehandles to be variables and viceversa
120
121sub BEGIN {
122    @HTMLImagePlugin::ISA = qw( HTMLPlugin );
123}
124
125my $aggressiveness_list =
126    [ { 'name' => "1",
127    'desc' => "{HTMLImagePlugin.aggressiveness.1}" },
128      { 'name' => "2",
129    'desc' => "{HTMLImagePlugin.aggressiveness.2}" },
130      { 'name' => "3",
131    'desc' => "{HTMLImagePlugin.aggressiveness.3}" },
132      { 'name' => "4",
133    'desc' => "{HTMLImagePlugin.aggressiveness.4}" },
134      { 'name' => "5",
135    'desc' => "{HTMLImagePlugin.aggressiveness.5}" },
136      { 'name' => "6",
137    'desc' => "{HTMLImagePlugin.aggressiveness.6}" },
138      { 'name' => "7",
139    'desc' => "{HTMLImagePlugin.aggressiveness.7}" },
140      { 'name' => "8",
141    'desc' => "{HTMLImagePlugin.aggressiveness.8}" },
142      { 'name' => "9",
143    'desc' => "{HTMLImagePlugin.aggressiveness.9}" } ];
144
145my $arguments =
146    [ { 'name' => "aggressiveness",
147    'desc' => "{HTMLImagePlugin.aggressiveness}",
148    'type' => "int",
149    'list' => $aggressiveness_list,
150    'deft' => "3",
151    'reqd' => "no" },
152      { 'name' => "index_pages",
153    'desc' => "{HTMLImagePlugin.index_pages}",
154    'type' => "flag",
155    'reqd' => "no" },
156      { 'name' => "no_cache_images",
157    'desc' => "{HTMLImagePlugin.no_cache_images}",
158    'type' => "flag",
159    'reqd' => "no" },
160      { 'name' => "min_size",
161    'desc' => "{HTMLImagePlugin.min_size}",
162    'type' => "int",
163    'deft' => "2000",
164    'reqd' => "no" },
165      { 'name' => "min_width",
166    'desc' => "{HTMLImagePlugin.min_width}",
167    'type' => "int",
168    'deft' => "50",
169    'reqd' => "no" },
170      { 'name' => "min_height",
171    'desc' => "{HTMLImagePlugin.min_height}",
172    'type' => "int",
173    'deft' => "50",
174    'reqd' => "no" },
175      { 'name' => "thumb_size",
176    'desc' => "{HTMLImagePlugin.thumb_size}",
177    'type' => "int",
178    'deft' => "100",
179    'reqd' => "no" },
180      { 'name' => "convert_params",
181    'desc' => "{HTMLImagePlugin.convert_params}",
182    'type' => "string",
183    'deft' => "",
184    'reqd' => "no" },
185      { 'name' => "min_near_text",
186    'desc' => "{HTMLImagePlugin.min_near_text}",
187    'type' => "int",
188    'deft' => "10",
189    'reqd' => "no" },
190      { 'name' => "max_near_text",
191    'desc' => "{HTMLImagePlugin.max_near_text}",
192    'type' => "int",
193    'deft' => "400",
194    'reqd' => "no" },
195      { 'name' => "smallpage_threshold",
196    'desc' => "{HTMLImagePlugin.smallpage_threshold}",
197    'type' => "int",
198    'deft' => "2048",
199    'reqd' => "no" },
200      { 'name' => "textrefs_threshold",
201    'desc' => "{HTMLImagePlugin.textrefs_threshold}",
202    'type' => "int",
203    'deft' => "2",
204    'reqd' => "no" },
205      { 'name' => "caption_length",
206    'desc' => "{HTMLImagePlugin.caption_length}",
207    'type' => "int",
208    'deft' => "80",
209    'reqd' => "no" },
210      { 'name' => "neartext_length",
211    'desc' => "{HTMLImagePlugin.neartext_length}",
212    'type' => "int",
213    'deft' => "300",
214    'reqd' => "no" },
215      { 'name' => "document_text",
216    'desc' => "{HTMLImagePlugin.document_text}",
217    'type' => "flag",
218    'reqd' => "no" } ];
219
220my $options = { 'name'     => "HTMLImagePlugin",
221        'desc'     => "{HTMLImagePlugin.desc}",
222        'abstract' => "no",
223        'inherits' => "yes",
224        'args'     => $arguments };
225
226sub new {
227    my ($class) = shift (@_);
228    my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
229    push(@$pluginlist, $class);
230
231    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
232    push(@{$hashArgOptLists->{"OptList"}},$options);
233
234    my $self = new HTMLPlugin($pluginlist, $inputargs, $hashArgOptLists);
235
236    # init class variables
237    $self->{'textref'} = undef; # init by read_file fn
238    $self->{'htdoc_obj'} = undef; # init by process fn
239    $self->{'htpath'} = undef; # init by process fn
240    $self->{'hturl'} = undef; # init by process fn
241    $self->{'plaintext'} = undef; # HTML stripped version - only init if needed by raw_neartext sub
242    $self->{'smallpage'} = 0; # set by process fn
243    $self->{'images_indexed'} = undef; # num of images indexed - if 1 or 2 then we know page is small
244    $self->{'initialised'} = undef; # flag (see set_extraction_options())
245
246    return bless $self, $class;
247}
248
249# if indexing pages, let HTMLPlugin do it's stuff
250# image extraction done through read()
251sub process {
252    my $self = shift(@_);
253    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
254    $self->{'imglist'} = ();
255    if ( $self->{'index_pages'} ) {
256    my $ok = $self->SUPER::process($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli);
257    if ( ! $ok ) { return $ok }
258    $self->{'htdoc_obj'} = $doc_obj;
259    }
260    # else use URL for referencing
261    #if ( $file =~ /(.*)[\/\\]/ ) { $self->{'htpath'} = $1; } else { $self->{'htpath'} = $file; }
262
263    $self->{'htpath'} = $base_dir if (-d $base_dir);
264    if ( $file =~ /(.*)[\/\\]/ ) { $self->{'htpath'} .= "/$1"; }
265    $self->{'htpath'} =~ s/\\/\//g;  # replace \ with /
266
267    $self->{'hturl'} = "http://$file";
268    $self->{'hturl'} =~ s/\\/\//g; # for windows
269    ($self->{'filename'}) = $file =~ /.*[\/\\](.*)/;
270    ($self->{'base_path'}) = $file =~ /(.*)[\/\\]/i;
271    if ( ( -s "$base_dir/$file") <= $self->{'smallpage_threshold'} ) {
272    $self->{'smallpage'} = 1;
273    } else { $self->{'smallpage'} = 0; }
274
275    if ( defined($self->{'initialised'}) ) { return 1; }
276    else {
277    $self->{'initialised'} = $self->set_extraction_options($base_dir =~ /^(.*?)\/import/i);
278    return $self->{'initialised'};
279    }
280}
281
282# get complex configuration options from configuration files
283# -- $GSDLCOLLECTION/etc/HTMLImagePlugin.cfg (tag sets for aggr 2+)
284# -- $GSDLHOME/etc/packages/phind/stopword/en/brown.sw (stopwords for aggr 5+)
285
286# If there's no HTMLImagePlugin.cfg file we'll use the following default values
287my $defaultcfg = '
288<delimitertagset>
289  <setname>Caption</setname>
290  <taggroup>font</taggroup>
291  <taggroup>tt</taggroup>
292  <taggroup>small</taggroup>
293  <taggroup>b</taggroup>
294  <taggroup>i</taggroup>
295  <taggroup>u</taggroup>
296  <taggroup>em</taggroup>
297  <taggroup>td</taggroup>
298  <taggroup>li</taggroup>
299  <taggroup>a</taggroup>
300  <taggroup>p</taggroup>
301  <taggroup>tr</taggroup>
302  <taggroup>center</taggroup>
303  <taggroup>div</taggroup>
304  <taggroup>caption</taggroup>
305  <taggroup>br</taggroup>
306  <taggroup>ul</taggroup>
307  <taggroup>ol</taggroup>
308  <taggroup>table</taggroup>
309  <taggroup>hr</taggroup>
310</delimitertagset>
311
312<delimitertagset>
313  <setname>Neartext</setname>
314  <taggroup>tr|hr|table|h\d|img|body</taggroup>
315  <taggroup>td|tr|hr|table|h\d|img|body</taggroup>
316  <taggroup>p|br|td|tr|hr|table|h\d|img|body</taggroup>
317  <taggroup>font|p|i|b|em|img</taggroup>
318</delimitertagset>
319';
320
321sub set_extraction_options() {
322    my ($self, $collpath) = @_;
323    my ($filepath);
324
325    print {$self->{'outhandle'}} "HTMLImagePlugin: Initialising\n"
326    if $self->{'verbosity'} > 1;
327    # etc/HTMLImagePlugin.cfg (XML)
328    # tag sets for captions and neartext
329    if ( $self->{'aggressiveness'} > 1 && $self->{'aggressiveness'} != 9 ) {
330    $self->{'delims'} = [];
331    $self->{'cdelims'} = [];
332    my ($cfg, @tagsets, $tagset, $type, @delims);
333
334    $filepath = "$collpath/etc/HTMLImagePlugin.cfg";
335    if ( open CFG, "<$filepath" ) {
336        while (<CFG>) { $cfg .= $_ }
337        close CFG;
338    } else {
339        $cfg = $defaultcfg;
340    }
341
342    (@tagsets) =
343        $cfg =~ /<delimitertagset>(.*?)<\/delimitertagset>/igs;
344    foreach $tagset ( @tagsets ) {
345        ($type) = $tagset =~ /<setname>(.*?)<\/setname>/i;
346        if ( lc($type) eq "caption" ) {
347        (@{$self->{'cdelims'}}) = $tagset =~ /<taggroup>(.*?)<\/taggroup>/igs;
348        }
349        elsif ( lc($type) eq "neartext" ) {
350        (@{$self->{'delims'}}) = $tagset =~ /<taggroup>(.*?)<\/taggroup>/igs;
351        }
352    }
353
354    # output a warning if there seem to be no delimiters
355    if ( scalar(@{$self->{'cdelims'}} == 0)) {
356        print {$self->{'outhandle'}} "HTMLImagePlugin: Warning: no caption delimiters found in $filepath\n";
357    }
358    if ( scalar(@{$self->{'delims'}} == 0)) {
359        print {$self->{'outhandle'}} "HTMLImagePlugin: Warning: no neartext delimiters found in $filepath\n";
360    }
361    }
362   
363    # get stop words for textual reference extraction
364    # TODO: warnings scroll off. Would be best to output them again at end of import
365    if ( $self->{'aggressiveness'} >=5 && $self->{'aggressiveness'} != 9 ) {
366    $self->{'stopwords'} = ();
367    $filepath = &util::filename_cat($ENV{'GSDLHOME'}, "etc", "packages", "phind", "stopword", "en", "brown.sw");
368    if ( open STOPWORDS, "<$filepath" ) {
369        while ( <STOPWORDS> ) {
370        chomp;
371        $self->{'stopwords'}{$_} = 1;
372        }
373        close STOPWORDS;
374    } else {
375        print {$self->{'outhandle'}} "HTMLImagePlugin: Warning: couldn't open stopwords file at $filepath ($!)\n";
376    }
377   
378    }
379
380    if ( $self->{'neartext_length'} > $self->{'max_near_text'} ) {
381    $self->{'max_near_text'} = $self->{'neartext_length'} * 1.33;
382    print {$self->{'outhandle'}} "HTMLImagePlugin: Warning: adjusted max_text to $self->{'max_near_text'}\n";
383    }
384    if ( $self->{'caption_length'} > $self->{'max_near_text'} ) {
385    $self->{'max_near_text'} = $self->{'caption_length'} * 1.33;
386    print {$self->{'outhandle'}} "HTMLImagePlugin: Warning: adjusted max_text to $self->{'max_near_text'}\n";
387    }
388
389    return 1;
390}
391
392# return number of files processed, undef if can't recognise, -1 if
393# cant process
394# Note that $base_dir might be "" and that $file might
395# include directories
396sub read {
397    my ($self, $pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = (@_);
398    my ($doc_obj, $section, $filepath, $imgtag, $pos, $context, $numdocs, $tndir, $imgs);
399    # forward normal read (runs HTMLPlugin if index_pages T)
400    my $ok =  $self->SUPER::read($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli);
401    if ( ! $ok ) { return $ok } # what is this returning??
402
403    my $outhandle = $self->{'outhandle'};
404    my $textref = $self->{'textref'};
405    my $htdoc_obj = $self->{'htdoc_obj'};
406    $numdocs = 0;
407    $base_dir =~ /(.*)\/.*/;
408    $tndir = "$1/archives/thumbnails"; # TODO: this path shouldn't be hardcoded?
409    &util::mk_all_dir($tndir) unless -e "$tndir";
410
411    $imgs = \%{$self->{'imglist'}};
412    my $nimgs = $self->get_img_list($textref);
413    $self->{'images_indexed'} = $nimgs;
414    if ( $nimgs > 0 ) {
415    my @fplist = (sort { $imgs->{$a}{'pos'} <=> $imgs->{$b}{'pos'} } keys %{$imgs});
416    my $i = 0;
417    foreach $filepath ( @fplist ) {
418        $pos = $imgs->{$filepath}{'pos'};
419        $context = substr ($$textref, $pos - 50, $pos + 50); # grab context (quicker)
420        ($imgtag) = ($context =~ /(<(?:img|a|body)\s[^>]*$filepath[^>]*>)/is );
421        if (! defined($imgtag)) { $imgtag = $filepath }
422        print $outhandle "HTMLImagePlugin: extracting $filepath\n"
423        if ( $self->{'verbosity'} > 1 );
424        $doc_obj = new doc ("", "indexed_doc", $self->{'file_rename_method'});
425        $section = $doc_obj->get_top_section();
426        my $prevpos = ( $i == 0 ? 0 : $imgs->{$fplist[$i - 1]}{'pos'});
427        my $nextpos = ( $i >= ($nimgs -1) ? -1 : $imgs->{$fplist[$i + 1]}{'pos'} );
428
429        $self->extract_image_info($imgtag, $filepath, $textref, $doc_obj, $section, $tndir, $prevpos, $nextpos);
430            $processor->process($doc_obj);
431        $numdocs++;
432        $i++;
433    }
434    return $numdocs;
435    } else {
436    print $outhandle "HTMLImagePlugin: No images from $file indexed\n"
437        if ( $self->{'verbosity'} > 2 );
438    return 1;
439    }
440   
441}
442
443# for every valid image tag
444# 1. extract related text and image metadata
445# 2. add this as document meta-data
446# 3. add assoc image(s) as files
447#
448sub extract_image_info {
449    my $self = shift (@_);
450    my ($tag, $id, $textref, $doc_obj, $section, $tndir, $prevpos, $nextpos) = (@_);
451    my ($filename, $orig_fp, $fn, $ext, $reltext, $relreltext, $crcid, $imgs,
452    $thumbfp, $pagetitle, $alttext, $filepath, $aggr);
453    $aggr = $self->{'aggressiveness'};
454    $imgs = \%{$self->{'imglist'}};
455    $filepath = $imgs->{$id}{'relpath'};
456    ($filename) = $filepath =~ /([^\/\\]+)$/s;
457    ($orig_fp) = "$self->{'base_path'}/$filepath";
458    $orig_fp =~ tr/+/ /;
459    $orig_fp =~ s/%([a-fA-F0-9][a-fA-F0-9])/pack("C", hex($1))/eg; # translate %2E to space, etc
460    $orig_fp =~ s/\\/\//g;
461    $filepath = "$self->{'htpath'}/$filepath";
462    my ($onlyfn) = $filename =~ /([^\\\/]*)$/;
463    ($fn, $ext) = $onlyfn =~ /(.*)\.(.*)/;
464    $fn = lc $fn; $ext = lc $ext;
465    ($reltext) = "<tr><td>GifComment</td><td>" . `identify $filepath -ping -format "%c"` . "</td></tr>\n"
466        if ($ext eq "gif");
467    $reltext .= "<tr><td>FilePath</td><td>$orig_fp</td></tr>\n";
468
469    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
470    $crcid = "$fn.$ext." . $self->{'next_crcid'}++;
471    } else { ($crcid) = `cksum $filepath` =~ /^(\d+)/; }
472    $thumbfp = "$tndir/tn_$crcid.jpg";
473    `convert -flatten -filter Hanning $self->{'convert_params'} -geometry "$self->{'thumb_size'}x$self->{'thumb_size'}>" $filepath $thumbfp` unless -e $thumbfp;
474    if ( ! (-e $thumbfp) ) {
475    print STDERR "HTMLImagePlugin: 'convert' failed. Check ImageMagicK binaries are installed and working correctly\n"; return 0;
476    }
477   
478    # shove in full text (tag stripped or unstripped) if settings require it
479    if ( $aggr == 10) {
480    $reltext = "<tr><td>AllPage</td><td>" . $$textref . "</td><tr>\n";   # level 10 (all text, verbatim)
481    } else {
482    $pagetitle = $self->get_meta_value("title", $textref);
483    ($alttext) = $tag =~ /\salt\s*=\s*(?:\"|\')(.+?)(?:\"|\')/is;
484    if ( defined($alttext) && length($alttext) > 1) {
485        $reltext .= "<tr><td>ALTtext</td><td>$alttext</td></tr>\n"; }
486    $reltext .= "<tr><td>SplitCapitalisation</td><td>" .
487        $self->split_filepath($orig_fp) . "</td></tr>\n";
488
489    # get caption/tag based near text (if appropriate)
490    if ( $aggr > 1 ) {
491        if ( $aggr >= 2 ) {
492        $reltext .=
493            $self->extract_caption_text($tag, $textref, $prevpos, $imgs->{$id}{'pos'}, $nextpos);
494        $relreltext = $reltext;
495        }
496        # repeat the filepath, alt-text, caption, etc
497        if ( $aggr == 8 ) {
498        $reltext .= $relreltext;
499        }
500        if ( $aggr >= 3 ) {
501        $reltext .=
502            $self->extract_near_text($tag, $textref, $prevpos, $imgs->{$id}{'pos'}, $nextpos);
503        }
504   
505        # get page metadata (if appropriate)
506        if ( $aggr >= 6 || ( $aggr >= 2 &&
507                             ( $self->{'images_indexed'} < 2 ||
508                               ($self->{'smallpage'} == 1 && $self->{'images_indexed'} < 6 )))) {   
509        $reltext .= $self->get_page_metadata($textref);
510        }
511        # textual references
512        if ( $aggr  == 5 || $aggr >= 7) {
513        if ( length($relreltext) > ($self->{'caption_length'} * 2) )  {
514            $reltext .= $self->get_textrefs($relreltext, $textref, $prevpos, $imgs->{$id}{'pos'}, $nextpos); }
515        else {
516            $reltext .= $self->get_textrefs($reltext, $textref, $prevpos, $imgs->{$id}{'pos'}, $nextpos);
517        }
518        }
519    } # aggr > 1
520    } # aggr != 10
521   
522    $doc_obj->set_OID($crcid);
523    $doc_obj->associate_file($thumbfp, "$fn.thumb.jpg", undef, $section);
524    $doc_obj->add_metadata($section, "OriginalFilename", $filename);
525    $doc_obj->add_metadata($section, "FilePath", $orig_fp);
526    $doc_obj->add_metadata($section, "Filename", $fn);
527    $doc_obj->add_metadata($section, "FileExt", $ext);
528    $doc_obj->add_metadata($section, "FileSize", $imgs->{$id}{'filesize'});
529    $doc_obj->add_metadata($section, "Width", $imgs->{$id}{'width'});
530    $doc_obj->add_metadata($section, "Height", $imgs->{$id}{'height'});
531    $doc_obj->add_metadata($section, "URL", "http://$orig_fp");
532    $doc_obj->add_metadata($section, "PageURL", $self->{'hturl'});
533    $doc_obj->add_metadata($section, "PageTitle", $pagetitle);
534    $doc_obj->add_metadata($section, "ThumbURL",
535               "_httpprefix_/collect/[collection]/index/assoc/[archivedir]/$fn.thumb.jpg");
536    $doc_obj->add_metadata($section, "FileFormat", "W3Img");
537
538    if ( $self->{'document_text'} ) {
539    $doc_obj->add_utf8_text($section, "<table border=1>\n$reltext</table>");
540    } else {
541    $doc_obj->add_metadata($section, "ImageText", "<table border=1>\n$reltext</table>\n");
542    }
543   
544    if ( $self->{'index_pages'} ) {
545    my ($cache_url) = "_httpdoc_&d=" . $self->{'htdoc_obj'}->get_OID();
546    if ( $imgs->{$id}{'anchored'} ) {
547        my $a_name = $id;
548        $a_name =~ s/[\/\\\:\&]/_/g;
549        $cache_url .=  "#gsdl_$a_name" ;
550    }
551    $doc_obj->add_utf8_metadata($section, "CachePageURL", $cache_url);
552    }
553    if ( ! $self->{'no_cache_images'} ) {
554    $onlyfn = lc $onlyfn;
555    $doc_obj->associate_file($filepath, $onlyfn, undef, $section);
556    $doc_obj->add_utf8_metadata($section, "CacheURL",
557                   "_httpprefix_/collect/[collection]/index/assoc/[archivedir]/$onlyfn");
558    }
559    return 1;
560}
561
562sub get_page_metadata {
563    my ($self, $textref) = (@_);
564    my (@rval);
565    $rval[0] = $self->get_meta_value("title", $textref);
566    $rval[1] = $self->get_meta_value("keywords", $textref);
567    $rval[2] = $self->get_meta_value("description", $textref);
568    $rval[3] = $self->{'filename'};
569
570    return wantarray ? @rval : "<tr><td>PageMeta</td><td>@rval</td></tr>\n" ;
571}
572
573# turns LargeCatFish into Large,Cat,Fish so MG sees the separate words
574sub split_filepath {
575    my ($self, $filepath) = (@_);
576    my (@words) = $filepath =~ /([A-Z][a-z]+)/g;
577    return join(',', @words);
578}
579
580# finds and extracts sentences
581# that seem to be on the same topic
582# as other related text (correlations)
583# and textual references (e.g. in figure 3 ...)
584sub get_textrefs {
585    my ($self, $reltext, $textref, $prevpos, $pos, $nextpos) = (@_);
586    my ($maxtext, $mintext, $startpos, $context_size, $context);
587
588    my (@relwords, @refwords, %sentences, @pagemeta);
589
590    # extract larger context
591    $maxtext = $self->{'max_near_text'};
592    $startpos = $pos - ($maxtext * 4);
593    $context_size = $maxtext*10;
594    if ($startpos < $prevpos ) { $startpos = $prevpos }
595    if ($nextpos != -1 && $context_size > ( $nextpos - $startpos )) { $context_size = ($nextpos - $startpos) }
596    $context = substr ( $$textref, $startpos, $context_size );
597    $context =~ s/<.*?>//gs;
598    $context =~ s/^.*>(.*)/$1/gs;
599    $context =~ s/(.*)<.*$/$1/gs;
600
601    # get page meta-data (if not already included)
602    if ( $self->{'aggressiveness'} == 5 && ! $self->{'smallpage'} ) {
603    @pagemeta = $self->get_page_metadata($textref);
604    foreach my $value ( @pagemeta ) {
605        $context .= "$value."; # make each into psuedo-sentence
606    }
607    }
608
609    # TODO: this list is not exhaustive
610    @refwords = ( '(?:is|are)? ?(?:show(?:s|n)|demonstrate(?:d|s)|explains|features) (?:in|by|below|above|here)',
611          '(?:see)? (?:figure|table)? (?:below|above)');   
612
613    # extract general references
614    foreach my $rw ( @refwords ) {
615    while ( $context =~ /[\.\?\!\,](.*?$rw\W.*?[\.\?\!\,])/ig ) {
616        my $sentence = $1;
617        $sentence =~ s/\s+/ /g;
618        $sentences{$sentence}+=2;
619    }
620    }
621    # extract specific (figure, table) references by number
622    my ($fignum) = $context =~ /[\.\?\!].*?(?:figure|table)s?[\-\_\ \.](\d+\w*)\W.*?[\.\?\!]/ig;
623    if ( $fignum ) {
624    foreach my $rw ( @refwords ) {
625        while ( $context =~ /[\.\?\!](.*?(figure|table)[\-\_\ \.]$fignum\W.*?[\.\?\!])/ig ) {
626        my $sentence = $1;
627        $sentence =~ s/\s+/ /g;
628        $sentences{$sentence}+=4;
629        }
630    }
631    }
632
633    # sentences with occurances of important words
634    @relwords = $reltext =~ /([a-zA-Z]{4,})/g; # take out small words
635    foreach my $word ( @relwords ) {
636    if ( $self->{'stopwords'}{$word} ) { next } # skip stop words
637    while ( $context =~ /([^\.\?\!]*?$word\W.*?[\.\?\!])/ig ) {
638        my $sentence = $1;
639        $sentence =~ s/\s+/ /g;
640        $sentences{$sentence}++;
641    }
642    }
643    foreach my $sentence ( keys %sentences ) {
644    if ($sentences{$sentence} < $self->{'textrefs_threshold'}) {
645        delete $sentences{$sentence};
646    }
647    }
648    my ($rval) = join "<br>\n", (keys %sentences);
649    if ( $rval && length($rval) > 5 ) {
650    return ( "<tr><td>TextualReferences</td><td>" . $rval . "</td></tr>\n") }
651    else { return "" }
652}
653
654# handles caption extraction
655# calling the extractor with different
656# tags and choosing the best candidate caption
657sub extract_caption_text {
658    my ($self, $tag, $textref, $prevpos, $pos, $nextpos) = (@_);
659    my (@neartext, $len, $hdelim, $mintext, $goodlen,
660    $startpos, $context, $context_size);
661   
662    $mintext = $self->{'min_near_text'};
663    $goodlen = $self->{'caption_length'};
664
665    # extract a context to extract near text from (faster)
666    $context_size = $self->{'max_near_text'}*3;
667    $startpos = $pos - ($context_size / 2);
668    if ($startpos < $prevpos ) { $startpos = $prevpos }
669    if ($nextpos != -1 && $context_size > ( $nextpos - $startpos ))
670    { $context_size = ($nextpos - $startpos) }
671
672    $context = substr ( $$textref, $startpos, $context_size );
673    $context =~ s/<!--.*?-->//gs;
674    $context =~ s/^.*-->(.*)/$1/gs;
675    $context =~ s/(.*)<!--.*$/$1/gs;
676
677    # try stepping through markup delimiter sets
678    # and selecting the best one
679    foreach $hdelim ( @{ $self->{'cdelims'} } ) {
680    @neartext = $self->extract_caption($tag, $hdelim, \$context);
681    $len = length(join("", @neartext));
682    last if ($len >= $mintext && $len <= $goodlen);
683    }
684    # reject if well over reasonable length
685    if ( $len > $goodlen ) {
686    @neartext = [];
687    }
688    $neartext[0] = " " if (! defined $neartext[0]);
689    $neartext[1] = " " if (! defined $neartext[1]);
690    return "<tr><td>Caption</td><td>" . (join ",",  @neartext) . "</td></tr>\n"; # TODO: the | is for testing purposes
691} # end extract_caption_text
692
693# the previous section header often gives a bit
694# of context to the section that the image is
695# in (invariably the header is before/above the image)
696# so extract the text of the closest header above the image
697#
698# this fn just gets all the headers above the image, within the context window
699sub get_prev_header {
700    my ($self, $pos, $textref) = (@_);
701    my ($rhtext);
702    while ( $$textref =~ /<h\d>(.*?)<\/h\d>/sig ) {
703    # only headers before image
704    if ((pos $$textref) < $pos) {
705        $rhtext .= "$1, ";
706    }
707    }
708    if ( $rhtext ) { return "Header($rhtext)" }
709    else { return "" }
710}
711
712# not the most robust tag stripping
713# regexps (see perl.com FAQ) but good enough
714#
715# used by caption & tag-based near text algorithms
716sub strip_tags {
717    my ( $self, $value ) = @_;
718    if ( ! defined($value) ) { $value = "" } # handle nulls
719    else {
720    $value =~ s/<.*?>//gs; # strip all html tags
721    $value =~ s/\s+/\ /g; # remove extra whitespace
722    $value =~ s/\&\w+\;//g; # remove &nbsp; etc
723    }
724    return $value;
725}
726
727# uses the given tag(s) to identify
728# the caption near to the image
729# (below, above or both below and above)
730sub extract_caption {
731    my ($self, $tag, $bound_tag, $contextref) = (@_);
732    my (@nt, $n, $etag, $gotcap);
733    return ("", "") if ( ! ($$contextref =~ /\Q$tag/) );
734
735    $nt[0] = $`;
736    $nt[1] = $';
737    $gotcap = 0;
738
739    # look before the image for a boundary tag
740    ($etag, $nt[0]) = $nt[0] =~ /<($bound_tag)[\s]?.*?>(.*?)$/is;
741    # if bound_tag too far from the image, then prob not caption
742    # (note: have to allow for tags, so multiply by 3
743    if ( $etag && length($nt[0]) < ($self->{'caption_length'} * 3) ) {
744    if ( $nt[0] =~ /<\/$etag>/si ) {
745        # the whole caption is above the image: <tag>text</tag><img>
746        ($nt[0]) =~ /<(?:$etag)[\s]?.*?>(.*?)<\/$etag>/is;
747        $nt[0] = $self->strip_tags($nt[0]);
748        if ( length($nt[0]) > $self->{'min_near_text'} ) {
749        $gotcap = 1;
750        $nt[1] = "";
751        }
752
753    } elsif ( $nt[1] =~ /<\/$etag>/si) {
754        # the caption tag covers image: <tag>text?<img>text?</tag>
755        ($nt[1]) = $nt[1] =~ /(.*?)<\/$etag>/si;
756        $nt[0] = $self->strip_tags($nt[0] . $nt[1]);
757        if ( length($nt[0]) > $self->{'min_near_text'} ) {
758        $gotcap = 2;
759        $nt[1] = "";
760        }
761    }
762    }
763    # else try below the image
764    if ( ! $gotcap ) {
765    # the caption is after the image: <img><tag>text</tag>
766    ($etag, $nt[1]) = $nt[1] =~ /^.*?<($bound_tag)[\s]?.*?>(.*)/is;
767    if ( $etag && $nt[1] =~ /<\/$etag>/s) {
768        ($nt[1]) = $nt[1] =~ /(.*?)<\/$etag>/si;
769        $gotcap = 3;
770        $nt[0] = "";
771        $nt[1] = $self->strip_tags($nt[1]);
772    }
773    }
774    if ( ! $gotcap ) { $nt[0] = $nt[1] = "" }
775    else {
776    # strip part-tags
777    $nt[0] =~ s/^.*>//s;
778    $nt[1] =~ s/<.*$//s;
779    }
780    my ($type);
781    if ( $gotcap == 0 ) { return ("nocaption", "") }
782    elsif ( $gotcap == 1 ) { $type = "captionabove:" }
783    elsif ( $gotcap == 2 ) { $type = "captioncovering:" }
784    elsif ( $gotcap == 3 ) { $type = "captionbelow:" }
785    return ($type, $nt[0], $nt[1]);
786}
787
788# tag-based near text
789#
790# tries different tag sets
791# and chooses the best one
792sub extract_near_text {
793    my ($self, $tag, $textref, $prevpos, $pos, $nextpos) = (@_);
794    my (@neartext, $len, $hdelim, $maxtext, $mintext, $goodlen,
795    @bestlen, @best, $startpos, $context, $context_size,
796    $dist, $bdist, $best1, $i, $nt);
797    $bestlen[0] = $bestlen[1] = 0; $bestlen[2] = $bdist = 999999;
798    $best[0] = $best[1] = $best[2] = "";
799    $maxtext = $self->{'max_near_text'};
800    $mintext = $self->{'min_near_text'};
801    $goodlen = $self->{'neartext_length'};
802
803    # extract a context to extract near text from (faster)
804    $context_size = $maxtext*4;
805    $startpos = $pos - ($context_size / 2);
806    if ($startpos < $prevpos ) { $startpos = $prevpos }
807    if ($nextpos != -1 && $context_size > ( $nextpos - $startpos ))
808    { $context_size = ($nextpos - $startpos) }
809    $context = substr ( $$textref, $startpos, $context_size );
810    $context =~ s/<!--.*?-->//gs;
811    $context =~ s/^.*-->(.*)/$1/gs;
812    $context =~ s/(.*)<!--.*$/$1/gs;
813
814    # try stepping through markup delimiter sets
815    # and selecting the best one
816    foreach $hdelim ( @{ $self->{'delims'} } ) {
817    @neartext = $self->extract_tagged_neartext($tag, $hdelim, \$context);
818    $nt = join("", @neartext);
819    $len = length($nt);
820    # Priorities:
821    # 1. Greater than mintext
822    # 2. Less than maxtext
823    # 3. Closest to goodlen
824    if ( $len <= $goodlen && $len > $bestlen[0] ) {
825        $bestlen[0] = $len;
826        $best[0] = $hdelim;
827    } elsif ( $len >= $maxtext && $len < $bestlen[2] ) {
828        $bestlen[2] = $len;
829        $best[2] = $hdelim;
830    } elsif ( $len >= $bestlen[0] && $len <= $bestlen[2] ) {
831        $dist = abs($goodlen - $len);
832        if ( $dist < $bdist ) {
833        $bestlen[1] = $len;
834        $best[1] = $hdelim;
835        $bdist = $dist;
836        }
837    }
838    }
839    $best1 = 2;
840    foreach $i ( 0..2 ) {
841    if ( $bestlen[$i] == 999999 ) { $bestlen[$i] = 0 }
842    $dist = abs($goodlen - $bestlen[$i]);
843    if ( $bestlen[$i] > $mintext && $dist <= $bdist ) {
844        $best1 = $i;
845        $bdist = $dist;
846    }
847    }
848    @neartext = $self->extract_tagged_neartext($tag, $best[$best1], \$context);
849    if ( $bestlen[$best1] > $maxtext ) {
850    # truncate on word boundary if too much text
851    my $hmax = $maxtext / 2;
852    ($neartext[0]) = $neartext[0] =~ /([^\s]*.{1,$hmax})$/s;
853    ($neartext[1]) = $neartext[1] =~ /^(.{1,$hmax}[^\s]*)/s;
854    } elsif ( $bestlen[$best1] < $mintext ) {
855    # use plain text extraction if tags failed (e.g. usable tag outside context)
856    print {$self->{'outhandle'}} "HTMLImagePlugin: Fallback to plain-text extraction for $tag\n"
857        if $self->{'verbosity'} > 2;
858    $neartext[0] = "<tr><td>RawNeartext</td><td>" . $self->extract_raw_neartext($tag, $textref) . "</td></tr>";
859    $neartext[1] = "";
860    }
861    # get previous header if available
862    $neartext[0] .= "<br>\n" .
863    $self->get_prev_header($pos, \$context) if ( $self->{'aggressiveness'} >= 4 );
864    $neartext[0] = " " if (! defined $neartext[0]);
865    $neartext[1] = " " if (! defined $neartext[1]);
866
867    return "<tr><td>NearText</td><td>" . (join "|",  @neartext) . "</td></tr>\n"; # TODO: the | is for testing purposes
868} # end extract_near_text
869
870# actually captures tag-based
871# near-text given a tag set
872sub extract_tagged_neartext {
873    my ($self, $tag, $bound_tag, $textref) = (@_);
874    return "" if ( ! ($$textref =~ /\Q$tag/) );
875    my (@nt, $delim, $pre_tag, $n);
876    $nt[0] = $`;
877    $nt[1] = $';
878
879    # get text after previous image tag
880    $nt[0] =~ s/.*<($bound_tag)[^>]*>(.*)/$2/is; # get rid of preceding text
881    if (defined($1)) { $delim = $1 }   
882    $pre_tag = $bound_tag;
883
884    if (defined($delim)) {
885    # we want to try and use the end tag of the previous delimiter
886    # (put it on the front of the list)
887    $pre_tag =~ s/(^|\|)($delim)($|\|)//i; # take it out
888    $pre_tag =~ s/\|\|/\|/i; # replace || with |
889    $pre_tag = $delim . "|" . $pre_tag; # put it on the front
890    }
891   
892    # get text before next image tag
893    $nt[1] =~ s/<\/?(?:$pre_tag)[^>]*>.*//is; # get rid of stuff after first delimiter
894
895    # process related text
896    for $n (0..1) {
897    if ( defined($nt[$n]) ) {
898        $nt[$n] =~ s/<.*?>//gs; # strip all html tags
899        $nt[$n] =~ s/\s+/\ /gs; # remove extra whitespace
900        $nt[$n] =~ s/\&\w+\;//sg; # remove &nbsp; etc
901        # strip part-tags
902        if ( $n == 0 ) { $nt[0] =~ s/^.*>//s }
903        if ( $n == 1 ) { $nt[1] =~ s/<.*$//s }
904    } else { $nt[$n] = ""; } # handle nulls
905    }
906    return @nt;
907}
908
909# this function is fall-back
910# if tags aren't suitable.
911#
912# extracts a fixed length of characters
913# either side of image tag (on word boundary)
914sub extract_raw_neartext {
915    my ($self, $tag, $textref) = (@_);
916    my ($rawtext, $startpos, $fp);
917    my $imgs = \%{$self->{'imglist'}};
918    ($fp) = $tag =~ /([\w\\\/]+\.(?:gif|jpe?g|png))/is;
919    if (! $fp) { return " " };
920    # if the cached, plain-text version isn't there, then create it
921    $self->init_plaintext($textref) unless defined($self->{'plaintext'});
922
923    # take the closest maxtext/2 characters
924    # either side of the tag (by word boundary)
925    return "" if ( ! exists $imgs->{$fp}{'rawpos'} );
926    $startpos = $imgs->{$fp}{'rawpos'} - (($self->{'max_near_text'} / 2) + 20);
927    if ( $startpos < 0 ) { $startpos = 0 }
928    $rawtext = substr $self->{'plaintext'}, $startpos, $self->{'max_near_text'} + 20;
929    $rawtext =~ s/\s\s/ /g;
930
931    return $rawtext;
932}
933
934# init plaintext variable for HTML-stripped version
935# (for full text index/raw assoc text extraction)
936sub init_plaintext {
937    my ($self, $textref) = (@_);
938    my ($page, $fp);
939    my $imgs = \%{$self->{'imglist'}};
940    $page = $$textref; # make a copy of original
941
942    # strip tags around image filenames so they don't get zapped
943    $page =~ s/<\w+\s+.*?([\w\/\\]+\.(?:gif|jpe?g|png))[^>]*>/\"$1\"/gsi;
944    $page =~ s/<.*?>//gs;
945    $page =~ s/&nbsp;/ /gs;
946    $page =~ s/&amp;/&/gs; #TODO: more &zzz; replacements (except &lt;, $gt;)
947
948    # get positions and strip images
949    while ( $page =~ /([^\s\'\"]+\.(jpe?g|gif|png))/ig ) {
950    $fp = $1;
951    if ( $imgs->{$fp}{'exists'} ) {
952        $imgs->{$fp}{'rawpos'} = pos $page;
953    }
954    $page =~ s/\"$fp\"//gs;
955    }
956    $self->{'plaintext'} = $page;
957}
958
959# finds and filters images based on size
960# (dimensions, height, filesize) and existence
961#
962# looks for image filenames (.jpg, .gif, etc)
963# and checks for existence on disk
964# (hence supports most JavaScript images)
965sub get_img_list {
966    my $self = shift (@_);
967    my ($textref) = (@_);
968    my ($filepath, $relpath, $abspath, $pos, $num, $width, $height, $filesize);
969    my $imgs = \%{$self->{'imglist'}};
970    while ( $$textref =~ /([^\s\'\"]+\.(jpe?g|gif|png))/ig ) {
971    $filepath = $1;
972    $pos = pos $$textref;
973    next if ( $imgs->{$filepath}{'relpath'} );
974        $relpath = $filepath;
975    $relpath =~ s/^http\:\/\///; # remove http:// in case we have mirrored it
976    $relpath =~ s/\\/\//g;  # replace \ with /
977    $relpath =~ s/^\.\///s; # make "./filepath" into "filepath"
978    $imgs->{$filepath}{'relpath'} = $relpath;
979    $abspath = "$self->{'htpath'}/$relpath";
980
981    if (! -e $abspath) { next }
982
983    # can't modify real filepath var because it
984    # then can't be located in the page for tag recognition later
985    ($width, $height) =
986        `identify $abspath -ping -format "%wx%h"` =~ /^(\d*)x(\d*)$/m;
987    if (! ($width && $height)) {
988        print STDERR "HTMLImagePlugin: ($abspath) 'identify' failed. Check ImageMagicK binaries are installed and working correctly\n"; next;
989    }
990    $filesize = (-s $abspath);
991    if ( $filesize >= $self->{'min_size'}
992        && ( $width >= $self->{'min_width'} )
993        && ( $height >= $self->{'min_height'} ) ) {
994       
995       $imgs->{$filepath}{'exists'} = 1;
996       $imgs->{$filepath}{'pos'} = $pos;
997       $imgs->{$filepath}{'width'} = $width;
998       $imgs->{$filepath}{'height'} = $height;
999       $imgs->{$filepath}{'filesize'} = $filesize;
1000       } else {
1001       print {$self->{'outhandle'}} "HTMLImagePlugin: skipping $self->{'base_path'}/$relpath: $filesize, $width x $height\n"
1002           if $self->{'verbosity'} > 2;
1003       }
1004    }
1005    $num = 0;
1006    foreach my $i ( keys %{$imgs} ) {
1007    if ( $imgs->{$i}{'pos'} ) {
1008        $num++;
1009    } else { delete $imgs->{$i} }
1010    }
1011    return $num;
1012}
1013
1014# make the text available to the read function
1015# by making it an object variable
1016sub read_file {
1017    my ($self, $filename, $encoding, $language, $textref) = @_;
1018    $self->SUPER::read_file($filename, $encoding, $language, $textref);
1019
1020    # if HTMLplug has run through, then it will
1021    # have replaced references so we have to
1022    # make a copy of the text before processing
1023    if ( $self->{'index_pages'} ) {
1024    $self->{'text'} = $$textref;
1025    $self->{'textref'} = \($self->{'text'});
1026    } else {
1027    $self->{'textref'} = $textref;
1028    }
1029    $self->{'plaintext'} = undef;
1030}
1031
1032# HTMLPlugin only extracts meta-data if it is specified in plugin options
1033# hence a special function to do it here
1034sub get_meta_value {
1035    my ($self, $name, $textref) = @_;
1036    my ($value);
1037    $name = lc $name;
1038    if ($name eq "title") {
1039    ($value) = $$textref =~ /<title>(.*?)<\/title>/is
1040    } else {
1041    my $qm = "(?:\"|\')";
1042    ($value) = $$textref =~ /<meta name\s*=\s*$qm?$name$qm?\s+content\s*=\s*$qm?(.*?)$qm?\s*>/is
1043    }
1044    $value = "" unless $value;
1045    return $value;
1046}
1047
1048# make filename an anchor reference
1049# so we can go straight to the image
1050# within the cached version of the source page
1051# (augment's HTMLPlugin sub)
1052sub replace_images {
1053    my $self = shift (@_);
1054    my ($front, $link, $back, $base_dir,
1055    $file, $doc_obj, $section) = @_;
1056    $link =~ s/\"//g;
1057    my ($a_name) = $link;
1058    $a_name =~ s/[\/\\\:\&]/_/g;
1059    # keep a list so we don't repeat the same anchor
1060    if ( ! $self->{'imglist'}{$link}{'anchored'} ) {
1061    $front = "<a name=\"gsdl_$a_name\">$front";
1062    $back = "$back</a>";
1063    $self->{'imglist'}{$link}{'anchored'} = 1;
1064    }
1065    return $self->SUPER::replace_images($front, $link, $back, $base_dir,
1066                     $file, $doc_obj, $section);
1067}
1068
10691;
Note: See TracBrowser for help on using the browser.