Changeset 10282


Ignore:
Timestamp:
2005-07-25T14:27:31+12:00 (19 years ago)
Author:
chi
Message:

Modifications to allow gsConvert to run either an open-source converting program or VB scripting for
certain types of document (e.g. Word, PPT, etc.)

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/bin/script/gsConvert.pl

    r9482 r10282  
    6363my $pdf_zoom;
    6464my $pdf_ignore_images;
     65my $windows_scripting;
    6566
    6667sub print_usage
     
    7273    print STDERR "  options:\n\t-type\tdoc|dot|pdf|ps|ppt|rtf|xls\t(input file type)\n";
    7374    print STDERR "\t-errlog\t<filename>\t(append err messages)\n";
    74     print STDERR "\t-output\thtml|text\n";
     75    print STDERR "\t-output\tauto|html|text|pagedimg-jpg|pagedimg-gif|pagedimg-png\t(output file type)\n";
    7576    print STDERR "\t-timeout\t<max cpu seconds>\t(ulimit on unix systems)\n";
    7677    print STDERR "\t-use_strings\tuse strings to extract text if conversion fails\n";
     78    print STDERR "\t-windows_scripting\tuse windows script when converting Microsoft Word and PPT via VB script\n";
    7779    print STDERR "\t-pdf_complex\tuse complex output when converting PDF to HTML\n";
    7880    print STDERR "\t-pdf_nohidden\tDon't attempt to extract hidden text from PDF files\n";
     
    9698             'type/(doc|dot|pdf|ps|ppt|rtf|xls)/', \$input_type,
    9799             '/errlog/.*/', \$faillogfile,
    98              'output/(html|text)/', \$output_type,
     100             'output/(auto|html|text|pagedimg).*/', \$output_type,
    99101             'timeout/\d+/0',\$timeout,
    100              'verbose/\d+/0',   \$verbose,
     102             'verbose/\d+/0', \$verbose,
    101103             'use_strings', \$use_strings,
     104             'windows_scripting',\$windows_scripting,
    102105             'pdf_complex', \$pdf_complex,
    103106             'pdf_ignore_images', \$pdf_ignore_images,
     
    108111    print_usage();
    109112    }
    110 
     113   
     114 
    111115    # Make sure the input file exists and can be opened for reading
    112116    if (scalar(@ARGV!=1)) {
     
    133137    my $stored_dir = cwd();
    134138    chdir ($dirname) || die "Unable to change to directory $dirname";
    135    
    136139    # Select convert utility
    137140    if (!defined $input_type) {
     
    209212
    210213    my $success = 0;
     214    if (!$output_type || ($output_type =~ /html/i)){
     215    if ($windows_scripting) {
     216        print STDERR "***** Calling VB Script!\n";
     217        $success = &native_doc_to_html($input_filename, $output_filestem);
     218    }
     219    else {
     220        print STDERR "**** Calling wvWare\n";
     221        $success = &doc_to_html($input_filename, $output_filestem);   
     222    }
     223    if ($success) {
     224       return "html";
     225    }
     226    }
    211227
    212228    # Attempt specialised conversion to HTML
    213     if (!$output_type || ($output_type =~ /html/i)) {
    214     $success = &doc_to_html($input_filename, $output_filestem);
    215     if ($success) {
    216         return "html";
    217     }
    218     }
     229    #if (!$output_type || ($output_type =~ /html/i)) {
     230#   $success = &doc_to_html($input_filename, $output_filestem);
     231#   if ($success) {
     232#       return "html";
     233#   }
     234#    }
    219235
    220236    return &convertAnything($input_filename, $output_filestem, $output_type);
     
    313329    }
    314330    }
    315 
    316331    return "fail";
    317 
    318332}
    319333
     
    323337
    324338    my $success = 0;
    325 
    326     # Attempt conversion to HTML
    327     if (!$output_type || ($output_type =~ /html/i)) {
     339    my $ppt_convert_type = "";
     340    if (!$output_type || $windows_scripting ||($output_type !~ /html/i)){
     341    if ($output_type =~ /gif/i) {
     342        $ppt_convert_type = "-g";
     343    } elsif ($output_type =~ /jp?g/i){
     344        $ppt_convert_type = "-j";
     345    } elsif ($output_type =~ /png/i){
     346        $ppt_convert_type = "-p";
     347    }
     348    my $vbScript = &util::filename_cat($ENV{'GSDLHOME'}, "bin",
     349                       $ENV{'GSDLOS'}, "pptextract");
     350    $vbScript = "pptextract" if ($ENV{'GSDLOS'} =~ /^windows$/i);
     351           
     352    $cmd = "";
     353    #if ($timeout) {$cmd = "ulimit -t $timeout;";}
     354    #$cmd .= "$vbScript \"$input_filename\" \"$output_filestem.html\"";
     355    #$cmd .= "$vbScript $input_filename $output_filestem.html";
     356    # if the converting directory has already existed
     357    if (-d $output_filestem) {
     358        print STDERR "**The conversion directory has existed\n";
     359        return "item";
     360    } else {
     361        $cmd .=  "$vbScript $ppt_convert_type $input_filename $output_filestem";
     362        $cmd .= " 2>\"$output_filestem.err\""
     363        if ($ENV{'GSDLOS'} !~ /^windows$/i || $is_winnt_2000);
     364        if (system($cmd) !=0) {
     365        print STDERR "Powerpoint VB Scripting convert failed\n";
     366        } else {
     367        return "item";
     368        }
     369    }
     370    } else {
     371    # Attempt conversion to HTML
     372    #if (!$output_type || ($output_type =~ /html/i)) {
    328373    # formulate the command
    329374    $cmd = "";
     
    332377    $cmd .= " 2>\"$output_filestem.err\""
    333378        if ($ENV{'GSDLOS'} !~ /^windows$/i || $is_winnt_2000);
    334    
    335379   
    336380    # execute the command
     
    388432
    389433
    390 
    391 
    392434# Find the real type of a .doc file
    393435#
    394436# We seem to have a lot of files with a .doc extension that are .rtf
    395437# files or Word 5 files.  This function attempts to tell the difference.
    396 
    397438sub find_docfile_type {
    398439    ($input_filename) = @_;
     
    428469
    429470
    430 
    431471# Specific type-to-type conversions
    432472#
     
    438478
    439479# Attempt to convert a word document to html with the wv program
    440 
    441480sub doc_to_html {
    442481    ($input_filename, $output_filestem) = @_;
     
    461500    $cmd .= " 2> \"$output_filestem.err\""
    462501    if ($ENV{'GSDLOS'} !~ /^windows$/i || $is_winnt_2000);
    463 
     502   
    464503    # execute the command
    465504    $!=0;
     
    518557}
    519558
     559# Attempt to convert a word document to html with the word2html scripting program
     560sub native_doc_to_html {
     561    ($input_filename, $output_filestem) = @_;
     562
     563    my $vbScript = &util::filename_cat($ENV{'GSDLHOME'}, "bin",
     564                       $ENV{'GSDLOS'}, "word2html");
     565
     566    $vbScript = "word2html" if ($ENV{'GSDLOS'} =~ /^windows$/i);
     567
     568    my $cmd = "";
     569    if ($timeout) {$cmd = "ulimit -t $timeout;";}
     570    #$cmd .= "$vbScript \"$input_filename\" \"$output_filestem.html\"";
     571    $cmd .=  "$vbScript $input_filename $output_filestem.html";
     572   
     573    # redirecting STDERR
     574    $cmd .= " 2> \"$output_filestem.err\""
     575    if ($ENV {'GSDLOS'} !~ /^windows$/i || $is_winnt_2000);
     576   
     577    # execute the command
     578    $!=0;
     579    if (system($cmd)!=0)
     580    {
     581    print STDERR "Error executing word2Html converter:$!\n";
     582    if (-s "$output_filestem.err") {
     583        open (ERRFILE, "<$output_filestem.err");
     584       
     585        my $write_to_fail_log=0;
     586        if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile")))
     587        {$write_to_fail_log=1;}
     588
     589        my $line;
     590        while ($line=<ERRFILE>) {
     591        if ($line =~ /\w/) {
     592            print STDERR "$line";
     593            print FAILLOG "$line" if ($write_to_fail_log);
     594        }
     595        if ($line !~ m/startup error/) {next;}
     596        print STDERR " (given an invalid .DOC file?)\n";
     597        print FAILLOG " (given an invalid .DOC file?)\n"
     598        if ($write_to_fail_log);
     599       
     600        } # while ERRFILE
     601        close FAILLOG if ($write_to_fail_log);
     602    }
     603    return 0; # we can try any_to_text
     604    }
     605
     606    # Was the conversion successful?
     607    if (-s "$output_filestem.html") {
     608    open(TMP, "$output_filestem.html");
     609    $line = <TMP>;
     610    close(TMP);
     611    if ($line && $line =~ /html/) {
     612        &util::rm("$output_filestem.err") if -e "$output_filestem.err";
     613        return 1;
     614    }
     615    }
     616   
     617    # If here, an error of some sort occurred
     618    &util::rm("$output_filestem.html") if -e "$output_filestem.html";
     619    if (-e "$output_filestem.err") {
     620    if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile"))) {
     621        open (ERRLOG,"$output_filestem.err");
     622        while (<ERRLOG>) {print FAILLOG $_;}
     623        close FAILLOG;
     624        close ERRLOG;
     625    }
     626    &util::rm("$output_filestem.err");
     627    }
     628    return 0;
     629}
     630
     631
    520632
    521633# Attempt to convert an RTF document to html with rtftohtml
     
    528640    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    529641    $cmd .= "rtftohtml";
     642    #$cmd .= "rtf-converter";
    530643
    531644    $cmd .= " -o \"$output_filestem.html\" \"$input_filename\"";
     
    621734    {
    622735        print FAILLOG "Error - rtftohtml - couldn't extract text\n";
     736        #print FAILLOG "Error - rtf-converter - couldn't extract text\n";
    623737        print FAILLOG " (rtf file might be too recent):\n";
    624738        open (ERRLOG, "$output_filestem.err");
     
    687801        close FAILLOG;
    688802        }   
    689     &util::rm("$output_filestem.err");
     803        &util::rm("$output_filestem.err");
    690804    }
    691805    return 0;
    692806    }
    693 
     807   
    694808    &util::rm("$output_filestem.err") if (-e "$output_filestem.err");
    695809    &util::rm("$output_filestem.out") if (-e "$output_filestem.out");
     
    785899    $cmd .= " 2> $output_filestem.err";
    786900    $!=0;
    787 
    788901    my $retcode=system($cmd);
    789902    $retcode = $? >> 8;  # see man perlfunc - system for this...
Note: See TracChangeset for help on using the changeset viewer.