Changeset 3350


Ignore:
Timestamp:
2002-08-13T12:28:15+12:00 (22 years ago)
Author:
sjboddie
Message:

Added -use_strings option to ConvertToPlug. The default behaviour for
plugins derived from ConvertToPlug (WordPlug, PDFPlug, etc.) is now to
exclude documents that can't be converted correctly. They will only fall
back to the Perl strings-style text extraction if the -use_strings option
is specified.

Location:
trunk/gsdl
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/bin/script/gsConvert.pl

    r3246 r3350  
    5757my $is_winnt_2000=eval {require Win32; return (Win32::IsWinNT()); return 0;};
    5858if (!defined($is_winnt_2000)) {$is_winnt_2000=0;}
     59
     60my $use_strings;
    5961
    6062sub print_usage
     
    6870    print STDERR "\t-output\thtml|text\n";
    6971    print STDERR "\t-timeout\t<max cpu seconds>\t(ulimit on unix systems)\n";
     72    print STDERR "\t-use_strings\t(use strings to extract text if conversion fails)\n";
    7073    exit(1);
    7174}
     
    8588             'output/(html|text)/', \$output_type,
    8689             'timeout/\d+/0',\$timeout,
    87              'verbose/\d+/0',   \$verbose))
     90             'verbose/\d+/0',   \$verbose,
     91             'use_strings', \$use_strings))
    8892    {
    8993    print_usage();
     
    469473        close FAILLOG if ($write_to_fail_log);
    470474    }
    471     print STDERR "Continuing...\n";
    472475    return 0; # we can try any_to_text
    473476    }
     
    954957    ($input_filename, $output_filestem) = @_;
    955958
     959    if (!$use_strings) {
     960      return 0;
     961    }
     962
    956963    open(IN, "<$input_filename") || return 0;
    957964    binmode(IN);
  • trunk/gsdl/perllib/plugins/ConvertToPlug.pm

    r3248 r3350  
    6161    print STDERR "   -convert_to (html|text) plugin converts to TEXT or HTML\n";
    6262    print STDERR "                           (default html)\n";
     63    print STDERR "   -use_strings            if set a simple strings function\n";
     64    print STDERR "                           will be called to extract text\n";
     65    print STDERR "                           if the conversion utility fails\n";
    6366}
    6467
     
    7174    $plugin_name =~ s/\.pm$//;
    7275
    73     my $generate_format;
    74     my $kea_arg;
     76    my $newargs = {};
    7577
    7678    if (!parsargv::parse($args, 
    77              q^extract_keyphrases^, \$kea_arg->{'kea'}, #with extra options
    78              q^extract_keyphrase_options/.*/^, \$kea_arg->{'kea_options'}, #no extra options
    79              q^convert_to/(html|text)/html^, \$generate_format,
     79             q^extract_keyphrases^, \$newargs->{'kea'}, #with extra options
     80             q^extract_keyphrase_options/.*/^, \$newargs->{'kea_options'}, #no extra options
     81             q^convert_to/(html|text)/html^, \$newargs->{'generate_format'},
     82             q^use_strings^, \$newargs->{'use_strings'},
    8083             "allow_extra_options")) {
    8184
     
    8689    }
    8790   
    88     return ($plugin_name,$generate_format, $kea_arg);
     91    return ($plugin_name, $newargs);
    8992}
    9093
     
    9699    # of the argument list.
    97100    my @arglist = @_;
    98     my ($plugin_name, $generate_format, $kea_arg) = $class->parse_args(\@_);
    99 
    100     if ($class eq "PDFPlug" && $generate_format eq "text" &&
     101    my ($plugin_name, $args) = $class->parse_args(\@_);
     102
     103    if ($class eq "PDFPlug" && $args->{'generate_format'} eq "text" &&
    101104    $ENV{'GSDLOS'} =~ /^windows$/i) {
    102105    print STDERR "Windows does not support pdf to text. PDFs will be converted to HTML instead\n";
    103     $generate_format = "html";
    104     }
    105 
    106     if ($generate_format eq "text")
     106    $args->{'generate_format'} = "html";
     107    }
     108
     109    if ($args->{'generate_format'} eq "text")
    107110    {
    108111    $self = new TEXTPlug ($class, @arglist);
     
    120123    }
    121124
    122     #if kea data to be extracted...
    123     $self->{'kea'} = 1 if($kea_arg->{'kea'});
    124     $self->{'kea_options'} = 1 if($kea_arg->{'kea_options'});
     125    foreach my $key (keys %$args) {
     126      $self->{$key} = $args->{$key};
     127    }
    125128 
    126129    return bless $self, $class;
     
    173176    # making sure the converter gives us the appropriate output type
    174177    my $output_type = lc($convert_to);
    175     my $cmd = "perl -S gsConvert.pl -verbose $verbosity -errlog \"$errlog\" -output $output_type \"$tmp_filename\"";
     178    my $cmd = "perl -S gsConvert.pl -verbose $verbosity ";
     179    if ($self->{'use_strings'}) {
     180      $cmd .= "-use_strings ";
     181    }
     182    $cmd .= "-errlog \"$errlog\" -output $output_type \"$tmp_filename\"";
    176183    $output_type = `$cmd`;
    177184
Note: See TracChangeset for help on using the changeset viewer.