Changeset 10278


Ignore:
Timestamp:
2005-07-25T14:14:57+12:00 (19 years ago)
Author:
chi
Message:

A major modification to allow a secondary-plugin setting. With this modification, a secondary plugin can be
set based on the type of conversion given in -convert_to (originally only html and text, but now the user
may also specify jpg, gif, or png). This secondary plugin works alongside the document
plugin (e.g. PPTPlug, WordPlug, PSPlug, RTFPlug, ExcelPlug...).

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/perllib/plugins/ConvertToPlug.pm

    r10254 r10278  
    11###########################################################################
    22#
    3 # ConvertToPlug.pm -- plugin that inherits from HTML or TEXT Plug, depending
    4 #                     on plugin argument convert_to
     3# ConvertToPlug.pm -- plugin that inherits from BasPlug
    54#
    65# A component of the Greenstone digital library software
     
    2625###########################################################################
    2726
    28 # The plugin is inherited by such plugins as WordPlug and PDFPlug.
    29 # It facilitates the conversion of these document types to either HTML
    30 # or TEXT by setting up variable that instruct ConvertToBasPlug
    31 # how to work.
    32 
    33 # It works by dynamically inheriting HTMLPlug or TEXTPlug based on
    34 # the plugin argument 'convert_to'.  If the argument is not present,
    35 # the default is to inherit HTMLPlug.
     27# This plugin is inherited by such plugins as WordPlug, PPTPlug, PSPlug, RTFPlug
     28# and PDFPlug. It facilitates the conversion of these document types to either
     29# HTML, Text or auto (allowing the user to choose which format to convert to).
     30# It works by dynamically inheriting BasPlug and, based on the plugin type in
     31# secondary_plugins, deciding which format to 'convert_to'. If the argument is
     32# not present, the default is to inherit auto.
    3633
    3734
     
    3936
    4037use BasPlug;
     38use ghtml;
    4139use HTMLPlug;
    4240use TEXTPlug;
    43 use ghtml;
    44 
    45 use strict;
    46 no strict 'refs'; # allow filehandles to be variables and viceversa
     41use PagedImgPlug;
     42
     43#use strict;
     44#no strict 'refs'; # allow filehandles to be variables and viceversa
    4745
    4846sub BEGIN {
    49     @ConvertToPlug::ISA = ('HTMLPlug');
    50 #    @ISA = ('HTMLPlug', 'TEXTPlug');
    51 #    @ISA = ('BasPlug'); #, 'HTMLPlug', 'TEXTPlug');
     47    @ISA = ('BasPlug');
    5248}
    5349
    5450my $convert_to_list =
    55     [ { 'name' => "html",
     51    [ { 'name' => "auto",
     52    'desc' => "{ConvertToPlug.convert_to.auto}" },
     53      { 'name' => "html",
    5654    'desc' => "{ConvertToPlug.convert_to.html}" },
    5755      { 'name' => "text",
    58     'desc' => "{ConvertToPlug.convert_to.text}" } ];
     56    'desc' => "{ConvertToPlug.convert_to.text}" },
     57      { 'name' => "pagedimg-jpg",
     58    'desc' => "{ConvertToPlug.convert_to.pagedimg-jpg"},
     59      { 'name' => "pagedimg-gif",
     60    'desc' => "{ConvertToPlug.convert_to.pagedimg-gif"},
     61      { 'name' => "pagedimg-png",
     62    'desc' => "{ConvertToPlug.convert_to.pagedimg-png"},
     63      ];
    5964
    6065my $arguments =
     
    95100    if($inputargs->[$intCounter] eq "-convert_to")
    96101    {
    97         if($inputargs->[$intCounter+1] eq "text" || $inputargs->[$intCounter+1] eq "html")
     102        #if($inputargs->[$intCounter+1] eq "auto" || $inputargs->[$intCounter+1] =~ /pagedimg.*/i || $inputargs->[$intCounter+1] eq "text" || $inputargs->[$intCounter+1] eq "html")
     103        # if the setting is "auto" then refer to html for now
     104        if($inputargs->[$intCounter+1] =~ /pagedimg.*/i || $inputargs->[$intCounter+1] eq "text" || $inputargs->[$intCounter+1] eq "html")
    98105        {
    99106        return $inputargs->[$intCounter+1];
     
    103110    }
    104111    return "html";
     112}
     113
     114sub load_secondary_plugins
     115{
     116    my $self = shift (@_);
     117    my ($class,$plugin_options) = @_;
     118
     119    my @convert_to_list = split(",",$self->{'convert_to'});
     120   
     121    $secondary_plugins = {};
     122
     123    foreach my $convert_to (@convert_to_list) {
     124    # load in "convert_to" plugin package
     125    my $plugin_class = $convert_to."Plug";
     126    my $plugin_package = $plugin_class.".pm";
     127    require $plugin_package;
     128
     129    # call its constructor with extra options that we've worked out!
     130    my $arglist = $plugin_options->{$plugin_class};
     131    my $secondary_plugin = new $plugin_class([], \@$arglist);
     132    $secondary_plugins->{$plugin_class} = $secondary_plugin;
     133    }
     134    $self->{'secondary_plugins'} = $secondary_plugins;
    105135}
    106136
     
    127157    $self->{'convert_to'} = "TEXT";
    128158    $self->{'convert_to_ext'} = "txt";
    129     }
    130     else
    131     {
     159    my $text_options = [];
     160    push(@$text_options,"-metadata_fields","Title,GENERATOR");
     161    $secondary_plugin_options->{'TextPlug'} = $text_options;
     162    }
     163    elsif ($strConvertTo =~ /pagedimg.*/i){
     164    $self = (defined $hashArgOptLists)? new PagedImgPlug($pluginlist,$inputargs,$hashArgOptLists): new PagedImgPlug($pluginlist,$inputargs);
     165    $self->{'convert_to'} = "PagedImg";
     166    #$self->{'convert_to'} = $strConvertTo;
     167    my $convert_to_ext = $strConvertTo;
     168    $convert_to_ext =~ s/.*\-(.*)/$1/i;
     169    if ($convert_to_ext eq "gif"){
     170        $self->{'convert_to_ext'} = "gif";
     171    } elsif ($convert_to_ext eq "jpg"){
     172        $self->{'convert_to_ext'} = "jpg";
     173    } elsif ($convert_to_ext eq "png") {
     174        $self->{'convert_to_ext'} = "png";
     175    }
     176    my $pagedimg_options = [];
     177    push(@$pagedimg_options,"-metadata_fields","Title,GENERATOR");
     178    $secondary_plugin_options->{'PagedImgPlug'} = $pagedimg_options;
     179    } else {
     180    # HTML or auto
    132181    $self = (defined $hashArgOptLists)? new HTMLPlug($pluginlist,$inputargs,$hashArgOptLists): new HTMLPlug($pluginlist,$inputargs);
    133182    $self->{'convert_to'} = "HTML";
    134183    $self->{'convert_to_ext'} = "html";
    135 
    136     $self->{'rename_assoc_files'} = 1;
    137     $self->{'metadata_fields'} .= ",GENERATOR";
     184    my $html_options = [];
     185    push(@$html_options,"-rename_assoc_files","1");
     186    push(@$html_options,"-metadata_fields","Title,GENERATOR");
     187    $secondary_plugin_options->{'HTMLPlug'} = $html_options;
    138188    }
    139189
     
    141191}
    142192
    143 # we don't need to block anything, so override the one for HTMLPlug
    144 # files are converted in a temp dir and extra files not passed down the
    145 # plugin list
    146 sub get_default_block_exp {
    147     my $self = shift (@_);
    148 
    149     return "";
    150 }
    151 
    152 # Go straight to BasPlug and avoid the special case implemented by HTMLPlug
    153 sub store_block_files {
    154     return BasPlug::store_block_files(@_);
    155 }
     193
     194sub init {
     195    my $self = shift (@_);
     196    my ($verbosity, $outhandle, $failhandle) = @_;
     197
     198    $self->SUPER::init($verbosity,$outhandle,$failhandle);
     199
     200    my $secondary_plugins =  $self->{'secondary_plugins'};
     201
     202    foreach my $plug_name (keys %$secondary_plugins) {
     203    my $plugin = $secondary_plugins->{$plug_name};
     204    $plugin->init($verbosity,$outhandle,$failhandle);
     205    }
     206}
     207
     208sub deinit {
     209    # called only once, after all plugin passes have been done
     210
     211    my ($self) = @_;
     212
     213    my $secondary_plugins =  $self->{'secondary_plugins'};
     214
     215    foreach my $plug_name (keys %$secondary_plugins) {
     216    my $plugin = $secondary_plugins->{$plug_name};
     217    $plugin->deinit();
     218    }
     219}
     220
     221
     222sub convert_post_process
     223{
     224    # by default do no post processing
     225    return;
     226}
     227
    156228
    157229# Run conversion utility on the input file. 
     
    171243    my $convert_to = $self->{'convert_to'};
    172244    my $failhandle = $self->{'failhandle'};
     245    my $convert_to_ext = $self->{'convert_to_ext'};
    173246
    174247    # softlink to collection tmp dir
     
    198271    # Execute the conversion command and get the type of the result,
    199272    # making sure the converter gives us the appropriate output type
    200     my $output_type = lc($convert_to);
     273    my $output_type="";
     274    if ($convert_to =~ m/PagedImg/i) {
     275    $output_type = lc($convert_to)."-".lc($convert_to_ext);
     276    } else {
     277    $output_type = lc($convert_to);
     278    }
     279
    201280    my $cmd = "perl -S gsConvert.pl -verbose $verbosity ";
    202281    if (defined $self->{'convert_options'}) {
     
    211290
    212291    # remove symbolic link to original file
    213     &util::rm($tmp_filename);
     292    #&util::rm($tmp_filename);
    214293
    215294    # Check STDERR here
     
    233312    # store the *actual* output type and return the output filename
    234313    # it's possible we requested conversion to html, but only to text succeeded
    235 
    236314    $self->{'convert_to_ext'} = $output_type;
    237315    if ($output_type =~ /html/i) {
     
    239317    } elsif ($output_type =~ /te?xt/i) {
    240318    $self->{'converted_to'} = "TEXT";
    241     }
     319    } elsif ($output_type =~ /item/i){
     320    $self->{'converted_to'} = "PagedImg";
     321    }
     322   
    242323    my $output_filename = $tmp_filename;
    243 
    244     $output_filename =~ s/$suffix$/.$output_type/;
    245 
     324    if ($output_type =~ /item/i) {
     325    $output_filename = $tmp_dirname . "\\$tailname\\" . $tailname . ".$output_type";
     326    } else {
     327    $output_filename =~ s/$suffix$/.$output_type/;
     328    }
    246329    return $output_filename;
    247330}
     
    249332
    250333# Remove collection specific tmp directory and all its contents.
    251 
    252334sub cleanup_tmp_area {
    253335    my $self = shift (@_);
     
    255337    my $tmp_dirname
    256338    = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "tmp");
    257     &util::rm_r($tmp_dirname);
     339    #&util::rm_r($tmp_dirname);
    258340    &util::mk_dir($tmp_dirname);
    259341}
    260 
    261 
    262 
    263342
    264343# Override BasPlug read
    265344# We don't want to get language encoding stuff until after we've converted
    266 # our file to either TEXT or HTML.
     345# our file to either TEXT or HTML or PagedImage.
    267346sub read {
    268347    my $self = shift (@_);
     
    273352
    274353    my $outhandle = $self->{'outhandle'};
    275    
    276     my $filename = $file;
    277     $filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;
    278 
    279     if ($self->associate_with($file,$filename,$metadata)) {
    280     # a form of smart block
    281     $self->{'num_blocked'} ++;
    282     return 0; # blocked
    283     }
    284 
    285     if ($self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/) {
    286     $self->{'num_blocked'} ++;
    287     return 0;
    288     }
    289     if ($filename !~ /$self->{'process_exp'}/ || !-f $filename) {
    290         return undef;
    291     }
    292     $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
    293 
    294     # read in file ($text will be in utf8)
    295     my $text = "";
     354
     355    my ($block_status,$filename) = $self->read_block(@_);
     356    return $block_status if ((!defined $block_status) || ($block_status==0));
     357    $file = $self->read_tidy_file($file);
    296358
    297359    my $output_ext = $self->{'convert_to_ext'};
    298    
    299360    my $conv_filename = $self->tmp_area_convert_file($output_ext, $filename);
    300 
    301361    if ("$conv_filename" eq "") {return 0;} # allows continue on errors
    302362    if (! -e "$conv_filename") {return 0;} # allows continue on errors
    303363    $self->{'conv_filename'} = $conv_filename;
    304 
    305     # Do encoding stuff
    306     my ($language, $encoding) = $self->textcat_get_language_encoding ($conv_filename);
    307 
    308     &BasPlug::read_file($self, $conv_filename, $encoding, $language, \$text);
    309     if (!length ($text)) {
    310     my $plugin_name = ref ($self);
    311         print $outhandle "$plugin_name: ERROR: $file contains no text\n" if $self->{'verbosity'};
    312         return 0;
    313     }
    314 
    315     # if we converted to HTML, convert é and etc to utf-8.
    316     # this should really happen before language_extraction, but that means
    317     # modifying a file on disk...
    318     $text =~ s/&([^;]+);/&ghtml::getcharequiv($1,0)/ge;
    319 
    320     # create a new document
    321     #my $doc_obj = new doc ($conv_filename, "indexed_doc");
    322     # now we use the original filename here
    323     my $doc_obj = new doc($filename, "indexed_doc");
    324     $doc_obj->set_converted_filename($conv_filename);
    325     $doc_obj->set_OIDtype ($processor->{'OIDtype'});
    326     $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Language", $language);
    327     $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Encoding", $encoding);
     364   
     365    $self->convert_post_process($conv_filename);
     366   
     367    my $secondary_plugins =  $self->{'secondary_plugins'};
     368    my $num_secondary_plugins = scalar(keys %$secondary_plugins);
     369    if ($num_secondary_plugins == 0) {
     370    print $outhandle "Warning: No secondary plugin to use in conversion.  Skipping $file\n";
     371    return 0; # effectively block it
     372    }
     373
     374    my @plugin_names = keys %$secondary_plugins;
     375    my $plugin_name = shift @plugin_names;
     376   
     377    if ($num_secondary_plugins > 1) {
     378    print $outhandle "Warning: Multiple secondary plugins not supported yet!  Choosing $plugin_name\n.";
     379    }
     380   
     381    my $secondary_plugin = $secondary_plugins->{$plugin_name};
     382   
     383    # note: metadata is not carried on to the next level
     384    my ($rv,$doc_obj)
     385    = $secondary_plugin->read_into_doc_obj ($pluginfo,"", $conv_filename,
     386                        $metadata, $processor, $maxdocs, $total_count,
     387                        $gli);
     388
     389    if ((!defined $rv) || ($rv<1)) {
     390    # wasn't processed
     391    return $rv;
     392    }
     393   
     394    # Override previous gsdlsourcefilename set by secondary plugin
     395    my $collect_file = &util::filename_within_collection($filename);
     396    my $collect_conv_file = &util::filename_within_collection($conv_filename);
     397    $doc_obj->set_source_filename ($collect_file);
     398    $doc_obj->set_converted_filename($collect_conv_file);
     399
    328400    my ($filemeta) = $file =~ /([^\\\/]+)$/;
    329     $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Source", &ghtml::dmsafe($filemeta));
    330     if ($self->{'cover_image'}) {
    331     $self->associate_cover_image($doc_obj, $filename);
    332     }
    333     $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
    334     $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "FileSize", (-s $filename));
    335 
    336     # include any metadata passed in from previous plugins
    337     # note that this metadata is associated with the top level section
    338     $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);
     401    $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "Source", &ghtml::dmsafe($filemeta));
     402    $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
     403    $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "FileSize", (-s $filename));
     404   
    339405    # do plugin specific processing of doc_obj
    340     unless (defined ($self->process(\$text, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli))) {
     406    unless (defined ($self->process(undef, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli))) {
    341407    print STDERR "<ProcessingError n='$file'>\n" if ($gli);
    342408    return -1;
     
    348414    # process the document
    349415    $processor->process($doc_obj);
    350     $self->cleanup_tmp_area();
     416##    $self->cleanup_tmp_area();
    351417
    352418    $self->{'num_processed'} ++;
     
    356422
    357423
    358 # do plugin specific processing of doc_obj for HTML type
     424# do plugin specific processing of doc_obj for doc_ext type
    359425sub process_type {
    360426    my $self = shift (@_);
    361     my ($doc_ext, $textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
    362    
    363     my $conv_filename = $self->{'conv_filename'};
    364     my $tmp_dirname = File::Basename::dirname($conv_filename);
    365     my $tmp_tailname = File::Basename::basename($conv_filename);
    366    
    367     my $converted_to = $self->{'converted_to'};
    368     my $ret_val;   
    369 
    370     if ($converted_to eq "TEXT")
    371     {
    372 
    373     $ret_val = &TEXTPlug::process($self, $textref, $pluginfo,
    374                       $tmp_dirname, $tmp_tailname,
    375                       $metadata, $doc_obj);
    376     }
    377     else
    378     {
    379     $ret_val = &HTMLPlug::process($self, $textref, $pluginfo,
    380                       $tmp_dirname, $tmp_tailname,
    381                       $metadata, $doc_obj);
    382     }
    383 
     427    my ($doc_ext, $base_dir, $file, $doc_obj) = @_;
     428   
    384429    # associate original file with doc object
    385430    my $cursection = $doc_obj->get_top_section();
     
    413458    $doc_obj->add_utf8_metadata ($cursection, "/srclink", "</a>");
    414459
    415     return $ret_val;
     460    return 1;
    416461}
    417462
    4184631;
     464
     465
     466
     467
     468
     469
     470
Note: See TracChangeset for help on using the changeset viewer.