Ignore:
Timestamp:
2002-08-13T12:28:15+12:00 (22 years ago)
Author:
sjboddie
Message:

Added a -use_strings option to ConvertToPlug. By default, plugins derived
from ConvertToPlug (WordPlug, PDFPlug, etc.) now exclude documents that
cannot be converted correctly. They fall back to the simple Perl
strings-based text extraction only when the -use_strings option is
specified.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/perllib/plugins/ConvertToPlug.pm

    r3248 r3350  
    6161    print STDERR "   -convert_to (html|text) plugin converts to TEXT or HTML\n";
    6262    print STDERR "                           (default html)\n";
     63    print STDERR "   -use_strings            if set a simple strings function\n";
     64    print STDERR "                           will be called to extract text\n";
     65    print STDERR "                           if the conversion utility fails\n";
    6366}
    6467
     
    7174    $plugin_name =~ s/\.pm$//;
    7275
    73     my $generate_format;
    74     my $kea_arg;
     76    my $newargs = {};
    7577
    7678    if (!parsargv::parse($args, 
    77              q^extract_keyphrases^, \$kea_arg->{'kea'}, #with extra options
    78              q^extract_keyphrase_options/.*/^, \$kea_arg->{'kea_options'}, #no extra options
    79              q^convert_to/(html|text)/html^, \$generate_format,
     79             q^extract_keyphrases^, \$newargs->{'kea'}, #with extra options
     80             q^extract_keyphrase_options/.*/^, \$newargs->{'kea_options'}, #no extra options
     81             q^convert_to/(html|text)/html^, \$newargs->{'generate_format'},
     82             q^use_strings^, \$newargs->{'use_strings'},
    8083             "allow_extra_options")) {
    8184
     
    8689    }
    8790   
    88     return ($plugin_name,$generate_format, $kea_arg);
     91    return ($plugin_name, $newargs);
    8992}
    9093
     
    9699    # of the argument list.
    97100    my @arglist = @_;
    98     my ($plugin_name, $generate_format, $kea_arg) = $class->parse_args(\@_);
    99 
    100     if ($class eq "PDFPlug" && $generate_format eq "text" &&
     101    my ($plugin_name, $args) = $class->parse_args(\@_);
     102
     103    if ($class eq "PDFPlug" && $args->{'generate_format'} eq "text" &&
    101104    $ENV{'GSDLOS'} =~ /^windows$/i) {
    102105    print STDERR "Windows does not support pdf to text. PDFs will be converted to HTML instead\n";
    103     $generate_format = "html";
    104     }
    105 
    106     if ($generate_format eq "text")
     106    $args->{'generate_format'} = "html";
     107    }
     108
     109    if ($args->{'generate_format'} eq "text")
    107110    {
    108111    $self = new TEXTPlug ($class, @arglist);
     
    120123    }
    121124
    122     #if kea data to be extracted...
    123     $self->{'kea'} = 1 if($kea_arg->{'kea'});
    124     $self->{'kea_options'} = 1 if($kea_arg->{'kea_options'});
     125    foreach my $key (keys %$args) {
     126      $self->{$key} = $args->{$key};
     127    }
    125128 
    126129    return bless $self, $class;
     
    173176    # making sure the converter gives us the appropriate output type
    174177    my $output_type = lc($convert_to);
    175     my $cmd = "perl -S gsConvert.pl -verbose $verbosity -errlog \"$errlog\" -output $output_type \"$tmp_filename\"";
     178    my $cmd = "perl -S gsConvert.pl -verbose $verbosity ";
     179    if ($self->{'use_strings'}) {
     180      $cmd .= "-use_strings ";
     181    }
     182    $cmd .= "-errlog \"$errlog\" -output $output_type \"$tmp_filename\"";
    176183    $output_type = `$cmd`;
    177184
Note: See TracChangeset for help on using the changeset viewer.