Changeset 2996


Ignore:
Timestamp:
2002-02-26T09:50:41+13:00 (22 years ago)
Author:
sjboddie
Message:

* empty log message *

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/perllib/plugins/W3ImgPlug.pm

    r2899 r2996  
    11###########################################################################
    22#
    3 # W3ImgPlug.pm -- Web image indexing plugin
     3# W3ImgPlug.pm -- Context-based image indexing plugin for HTML documents
    44#
    55# A component of the Greenstone digital library software
     
    2525###########################################################################
    2626
    27 #  Subclass of HTMLPlug, designed to
    28 #  extract images from HTML pagesimages
    29 #  with meta-data and associated text
    30 #  allowing indexing of images
     27# DESCRIPTION:
     28#
     29#  Extracts images and associated text and metadata from
     30#  web pages as individual documents for indexing. Thumbnails
     31#  are created from each image for browsing purposes.
     32#
     33#  Options are available for configuring the aggressiveness of the
     34#  associated text extraction mechanisms. A higher level of
     35#  aggressiveness will extract more text and consequently
     36#  may mean lower accuracy (precision); however, it may also
     37#  retrieve more of the relevant images from the collection (recall).
     38#  Lower levels of aggressiveness may result in slightly faster
     39#  collection builds at the import stage.
     40#
     41#  W3ImgPlug is a subclass of HTMLPlug (i.e. it will index pages also
     42#  if required). It can be used in place of HTMLPlug to index both
     43#  pages and their images.
     44#
     45# REQUIREMENTS:
    3146#   
    32 #  can be used in place of HTMLPlug with -cache_pages
    33 #  to index the images to complement the page index
    34 
    35 # TODO: have a gsdl/bin/imgconvert prog that is different for Win vs Linux
    36 #       and calls 'convert' with parameters?
    37 # TODO: windows doesn't have `cksum` utility. subst with one in bin/windows?
    38 #       (currently cludges filename/ext instead)
    39 
     47#  The ImageMagick image manipulation suite is used to create
     48#  thumbnails and extract some image metadata. (Available
     49#  from http://www.imagemagick.org/)
     50#
     51#  Unix:
     52#    Many Linux distributions contain ImageMagick.
     53#
     54#  Windows:
     55#    ImageMagick can be downloaded from the website above.
     56#    Make sure the system path includes the ImageMagick binaries
     57#    before using W3ImgPlug.
     58#
     59#    NOTE: NT/2000/XP contain a filesystem utility 'convert.exe'
     60#    with the same name as the image conversion utility. The
     61#    ImageMagick FAQ recommends renaming the filesystem
     62#    utility (e.g. to 'fsconvert.exe') to avoid this clash.
     63#
     64# USAGE: 
     65#
     66#  An image document consists of metadata elements:
     67#
     68#   OriginalFilename, FilePath, Filename, FileExt, FileSize,
     69#   Width, Height, URL, PageURL, ThumbURL, CacheURL, CachePageURL,
     70#   ImageText, PageTitle
     71#
     72#  Most of these are only useful in format strings (e.g. ThumbURL,
     73#  Filename, URL, PageURL, CachePageURL).
     74#
     75#  ImageText, as the name suggests, contains the indexable text.
     76#  (unless using the -document_text plugin option)
     77#
     78#  Since image documents are made up of metadata elements
     79#  alone, format strings are needed to display them properly.
     80#  NOTE: The receptionist will only display results (e.g. thumbnails)
     81#  in 4 columns if the format string begins with "<td><table>".
     82#
     83#  The example below takes the user to the image within the
     84#  source HTML document rather than using a format string
     85#  on DocumentText to display the image document itself.
     86#
     87#  Example collect.cfg:
     88#
     89#   ...
     90#
     91#   indexes document:ImageText document:text
     92#   defaultindex document:ImageText
     93
     94#   collectionmeta .document:ImageText "images"
     95#   collectionmeta .document:text "documents"
     96#
     97#   ...
     98#
     99#   plugin W3ImgPlug -index_pages -aggressiveness 6
     100#
     101#   ...
     102
     103#   format SearchVList '<td>{If}{[Title],[link][icon]&nbsp;[Title][/link],
     104#    <table><tr><td align="center"><a href="[CachePageURL]">
     105#    <img src="[ThumbURL]"></a></td></tr><tr><td align="center">
     106#    <a href="[CachePageURL]"><font size="-1">[OriginalFilename]</font></a>
     107#    <br>[Width]x[Height]</td></tr></table>}</td>'
     108#
     109#   ...
     110#
     111 
    40112package W3ImgPlug;
    41113
     
    151223    }
    152224    # else use URL for referencing
    153 #    if ( $file =~ /(.*)[\/\\]/ ) { $self->{'htpath'} = $1; } else { $self->{'htpath'} = $file; }
     225    #if ( $file =~ /(.*)[\/\\]/ ) { $self->{'htpath'} = $1; } else { $self->{'htpath'} = $file; }
    154226
    155227    $self->{'htpath'} = $base_dir if (-d $base_dir);
     
    173245
    174246# get complex configuration options from configuration files
    175 # -- $GSDLCOLLECTION/etc/W3ImgPlug.cfg
    176 # -- $GSDLHOME/etc/packages/phind/stopword/en/brown.sw
    177 # (only done once, at import start)
     247# -- $GSDLCOLLECTION/etc/W3ImgPlug.cfg (tag sets for aggr 2+)
     248# -- $GSDLHOME/etc/packages/phind/stopword/en/brown.sw (stopwords for aggr 5+)
    178249
    179250# If there's no W3ImgPlug.cfg file we'll use the following default values
     
    350421    $orig_fp =~ tr/+/ /;
    351422    $orig_fp =~ s/%([a-fA-F0-9][a-fA-F0-9])/pack("C", hex($1))/eg; # translate %2E to space, etc
     423    $orig_fp =~ s/\\/\//g;
    352424    $filepath = "$self->{'htpath'}/$filepath";
    353425    ($onlyfn) = $filename =~ /([^\\\/]*)$/;
     
    9441016    my ($front, $link, $back, $base_dir,
    9451017    $file, $doc_obj, $section) = @_;
     1018    $link =~ s/\"//g;
    9461019    my ($a_name) = $link;
    9471020    $a_name =~ s/[\/\\\:\&]/_/g;
Note: See TracChangeset for help on using the changeset viewer.