Changeset 22429

Show
Ignore:
Timestamp:
19.07.2010 13:28:14 (9 years ago)
Author:
davidb
Message:

Added support for using OpenOffice scripting through JODConverter.jar. Also added 'use strict' and then fixed up a variety of places that needed 'my' added.

Files:
1 modified

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/bin/script/gsConvert.pl

    r20933 r22429  
    4949} 
    5050 
     51use strict; 
     52 
    5153use parsargv; 
    5254use util; 
     
    6567my $pdf_allow_images_only; 
    6668my $windows_scripting; 
     69my $openoffice_scripting; 
    6770 
    6871sub print_usage 
     
    7275    print STDERR "              or text using third-party programs.\n\n"; 
    7376    print STDERR "  usage: $0 [options] filename\n"; 
    74     print STDERR "  options:\n\t-type\tdoc|dot|pdf|ps|ppt|rtf|xls\t(input file type)\n"; 
     77    if ($openoffice_scripting) { 
     78    print STDERR "  options:\n\t-type\tdoc|dot|docx|odf|pdf|ps|ppt|rtf|xls\t(input file type)\n"; 
     79    } 
     80    else { 
     81    print STDERR "  options:\n\t-type\tdoc|dot|pdf|ps|ppt|rtf|xls\t(input file type)\n"; 
     82    } 
    7583    print STDERR "\t-errlog\t<filename>\t(append err messages)\n"; 
    7684    print STDERR "\t-output\tauto|html|text|pagedimage_jpg|pagedimage_gif|pagedimage_png\t(output file type)\n"; 
    7785    print STDERR "\t-timeout\t<max cpu seconds>\t(ulimit on unix systems)\n"; 
    7886    print STDERR "\t-use_strings\tuse strings to extract text if conversion fails\n"; 
    79     print STDERR "\t-windows_scripting\tuse windows script when converting Microsoft Word and PPT via VB script\n"; 
     87    print STDERR "\t-windows_scripting\tuse windows script (if available) when converting Microsoft Word and PPT via VB script\n"; 
     88    print STDERR "\t-openoffice_scripting\tuse openoffice script (if available) when converting Microsoft Word and PPT via OpenOffice\n"; 
    8089    print STDERR "\t-pdf_complex\tuse complex output when converting PDF to HTML\n"; 
    8190    print STDERR "\t-pdf_nohidden\tDon't attempt to extract hidden text from PDF files\n"; 
     
    96105    my ($input_type,$output_type,$verbose); 
    97106 
     107     
     108    # scan for -openoffice_scripting as it affects the permissible 
     109    # values for -type 
     110 
     111    foreach my $a (@ARGV) { 
     112    if ($a =~ m/^-openoffice_scripting$/) { 
     113        $openoffice_scripting = 1; 
     114        last; 
     115    } 
     116    } 
     117 
     118    my $parse_type; 
     119    if ($openoffice_scripting) { 
     120    $parse_type = 'type/(doc|dot|docx|odf|pdf|ps|ppt|rtf|xls)/'; 
     121    } 
     122    else { 
     123    $parse_type = 'type/(doc|dot|pdf|ps|ppt|rtf|xls)/'; 
     124    } 
     125 
    98126    # read command-line arguments 
    99127    if (!parsargv::parse(\@ARGV, 
    100              'type/(doc|dot|pdf|ps|ppt|rtf|xls)/', \$input_type, 
     128             $parse_type, \$input_type, 
    101129             '/errlog/.*/', \$faillogfile, 
    102130             'output/(auto|html|text|pagedimage).*/', \$output_type, 
    103131             'timeout/\d+/0',\$timeout, 
    104132             'verbose/\d+/0', \$verbose, 
     133             'windows_scripting',\$windows_scripting, 
     134             'openoffice_scripting',\$openoffice_scripting, 
    105135             'use_strings', \$use_strings, 
    106              'windows_scripting',\$windows_scripting, 
    107136             'pdf_complex', \$pdf_complex, 
    108137             'pdf_ignore_images', \$pdf_ignore_images, 
     
    144173    print STDERR "Error: No filename extension or input type defined\n"; 
    145174    exit(1); 
     175    }  
     176    elsif ($openoffice_scripting && (($input_type eq "docx") || ($input_type eq "odf"))) { 
     177    print &convertDOC($input_filename, $output_filestem, $output_type); 
     178    print "\n"; 
    146179    }  
    147180    elsif ($input_type eq "doc" || $input_type eq "dot") { 
     
    158191    }  
    159192    elsif ($input_type eq "ps") { 
    160     print &convertPS($input_filename, $output_filestem, $output_type); 
     193    print &convertPS($dirname, $input_filename, $output_filestem, $output_type); 
    161194    print "\n"; 
    162195    }  
     
    195228 
    196229sub convertDOC { 
    197     ($input_filename, $output_filestem, $output_type) = @_; 
     230    my ($input_filename, $output_filestem, $output_type) = @_; 
     231 
     232    if (($openoffice_scripting) && ($input_filename =~ m/\.docx?$/i)) { 
     233    # Jump right in and process with Open Office 
     234        if (openoffice_doc_to_html($input_filename, $output_filestem)) { 
     235        return "html"; 
     236    } 
     237    else { 
     238        return "fail"; 
     239    } 
     240    } 
    198241 
    199242    # Many .doc files are not in fact word documents! 
     
    212255 
    213256sub convertWord678 { 
    214     ($input_filename, $output_filestem, $output_type) = @_; 
     257    my ($input_filename, $output_filestem, $output_type) = @_; 
    215258 
    216259    my $success = 0; 
     
    219262        $success = &native_doc_to_html($input_filename, $output_filestem); 
    220263    } 
     264    elsif ($openoffice_scripting) { 
     265        $success = &openoffice_doc_to_html($input_filename, $output_filestem); 
     266    } 
    221267    else { 
    222268        $success = &doc_to_html($input_filename, $output_filestem);     
     
    233279 
    234280sub convertRTF { 
    235     ($input_filename, $output_filestem, $output_type) = @_; 
     281    my ($input_filename, $output_filestem, $output_type) = @_; 
    236282 
    237283    my $success = 0; 
     
    242288    if ($windows_scripting) { 
    243289        $success = &native_doc_to_html($input_filename, $output_filestem); 
     290    } 
     291    elsif ($openoffice_scripting) { 
     292        $success = &openoffice_doc_to_html($input_filename, $output_filestem); 
    244293    } 
    245294    else { 
     
    261310 
    262311sub convertAnything { 
    263     ($input_filename, $output_filestem, $output_type) = @_; 
     312    my ($input_filename, $output_filestem, $output_type) = @_; 
    264313     
    265314    my $success = 0; 
     
    324373 
    325374sub convertPS { 
    326     ($input_filename, $output_filestem, $output_type) = @_; 
     375    my ($dirname,$input_filename, $output_filestem, $output_type) = @_; 
    327376 
    328377    my $success = 0; 
     
    365414    $vbScript = "pptextract" if ($ENV{'GSDLOS'} =~ m/^windows$/i); 
    366415             
    367     $cmd = ""; 
     416    my $cmd = ""; 
    368417    if ($timeout) {$cmd = "ulimit -t $timeout;";} 
    369     # if the converting directory has already existed 
     418    # if the converting directory already exists 
    370419    if (-d $output_filestem) { 
    371         print STDERR "**The conversion directory has existed\n"; 
     420        print STDERR "**The conversion directory already exists\n"; 
    372421        return "item"; 
    373422    } else { 
     
    385434    #if (!$output_type || ($output_type =~ m/html/i)) { 
    386435    # formulate the command 
    387     $cmd = ""; 
     436    my $cmd = ""; 
    388437    $cmd .= "perl -S ppttohtml.pl "; 
    389438    $cmd .= " \"$input_filename\" \"$output_filestem.html\""; 
     
    418467    if (!$output_type || ($output_type =~ m/html/i)) { 
    419468    # formulate the command 
    420     $cmd = ""; 
     469    my $cmd = ""; 
    421470    $cmd .= "perl -S xlstohtml.pl "; 
    422471    $cmd .= " \"$input_filename\" \"$output_filestem.html\""; 
     
    450499# files or Word 5 files.  This function attempts to tell the difference. 
    451500sub find_docfile_type { 
    452     ($input_filename) = @_; 
     501    my ($input_filename) = @_; 
    453502     
    454503    open(CHK, "<$input_filename"); 
     
    492541# Attempt to convert a word document to html with the wv program 
    493542sub doc_to_html { 
    494     ($input_filename, $output_filestem) = @_; 
     543    my ($input_filename, $output_filestem) = @_; 
    495544 
    496545    my $wvWare = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "wvWare"); 
     
    612661    if (-s "$output_filestem.html") { # if file has non-zero size (i.e. it has contents) 
    613662    open(TMP, "$output_filestem.html"); 
    614     $line = <TMP>; 
     663    my $line = <TMP>; 
    615664    close(TMP); 
    616665    if ($line && $line =~ m/DOCTYPE HTML/) { 
     
    750799# Attempt to convert a word document to html with the word2html scripting program 
    751800sub native_doc_to_html { 
    752     ($input_filename, $output_filestem) = @_; 
     801    my ($input_filename, $output_filestem) = @_; 
    753802 
    754803    my $vbScript = &util::filename_cat($ENV{'GSDLHOME'}, "bin", 
     
    757806    $vbScript = "word2html" if ($ENV{'GSDLOS'} =~ m/^windows$/i); 
    758807    if (-e "$output_filestem.html") { 
    759     print STDERR "*** The conversion file has existed\n"; 
     808    print STDERR "    The conversion file:\n"; 
     809    print STDERR "      $output_filestem.html\n"; 
     810    print STDERR "    ... already exists.  Skipping\n"; 
    760811    return 1; 
    761812    } 
     
    803854    if (-s "$output_filestem.html") { 
    804855    open(TMP, "$output_filestem.html"); 
    805     $line = <TMP>; 
     856    my $line = <TMP>; 
    806857    close(TMP); 
    807     if ($line && $line =~ m/html/) { 
     858    if ($line && $line =~ m/html/i) { 
    808859        &util::rm("$output_filestem.err") if -e "$output_filestem.err"; 
    809860        return 1; 
     
    825876} 
    826877 
     878# Attempt to convert a word document to html with the JODConverter scripting program 
     879sub openoffice_doc_to_html { 
     880    my ($input_filename, $output_filestem) = @_; 
     881 
     882    if (-e "$output_filestem.html") { 
     883    print STDERR "    The conversion file:\n"; 
     884    print STDERR "      $output_filestem.html\n"; 
     885    print STDERR "    ... skipping\n"; 
     886    return 1; 
     887    } 
     888 
     889    my $oo_script_dir = &util::filename_cat($ENV{'GEXT_OPENOFFICE'}, "bin", "script"); 
     890    my $oo2html = &util::filename_cat($oo_script_dir,"oo2html"); 
     891    if (!-e $oo2html) { 
     892    print STDERR "Error: Unable to find 'oo2html' in: \n"; 
     893    print STDERR "       $oo_script_dir\n"; 
     894    print STDERR "       Is the OpenOffice extension to Greenstone installed?\n"; 
     895    return 0; 
     896    } 
     897 
     898    my $cmd = ""; 
     899    if ($timeout) {$cmd = "ulimit -t $timeout;";} 
     900    $cmd .=  "$oo2html \"$input_filename\" \"$output_filestem.html\""; 
     901 
     902    # redirecting STDERR 
     903    $cmd .= " 2> \"$output_filestem.err\"" 
     904    if ($ENV {'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000); 
     905     
     906    # execute the command 
     907    $!=0; 
     908    if (system($cmd)!=0) 
     909    { 
     910    print STDERR "Error executing oo2html converter: $!\n"; 
     911    print STDERR "Command was: $cmd\n"; 
     912 
     913    if (-s "$output_filestem.err") { 
     914        open (ERRFILE, "<$output_filestem.err"); 
     915         
     916        my $write_to_fail_log=0; 
     917        if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile"))) 
     918        {$write_to_fail_log=1;} 
     919 
     920        my $line; 
     921        while ($line=<ERRFILE>) { 
     922        if ($line =~ m/\w/) { 
     923            print STDERR "$line"; 
     924            print FAILLOG "$line" if ($write_to_fail_log); 
     925        } 
     926        if ($line !~ m/startup error/) {next;} 
     927        print STDERR " (given an invalid .DOC file?)\n"; 
     928        print FAILLOG " (given an invalid .DOC file?)\n" 
     929        if ($write_to_fail_log); 
     930         
     931        } # while ERRFILE 
     932        close FAILLOG if ($write_to_fail_log); 
     933    } 
     934    return 0; # we can try any_to_text 
     935    } 
     936 
     937    # Was the conversion successful? 
     938    if (-s "$output_filestem.html") { 
     939    open(TMP, "$output_filestem.html"); 
     940    my $line = <TMP>; 
     941    close(TMP); 
     942    if ($line && $line =~ m/html/i) { 
     943        &util::rm("$output_filestem.err") if -e "$output_filestem.err"; 
     944        return 1; 
     945    } 
     946    } 
     947     
     948    # If here, an error of some sort occurred 
     949     
     950    &util::rm("$output_filestem.html") if -e "$output_filestem.html"; 
     951    if (-e "$output_filestem.err") { 
     952    if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile"))) { 
     953        open (ERRLOG,"$output_filestem.err"); 
     954        while (<ERRLOG>) {print FAILLOG $_;} 
     955        close FAILLOG; 
     956        close ERRLOG; 
     957    } 
     958    &util::rm("$output_filestem.err"); 
     959    } 
     960    return 0; 
     961} 
     962 
    827963# Attempt to convert an RTF document to html with rtftohtml 
    828  
    829964sub rtf_to_html { 
    830965    my ($input_filename, $output_filestem) = @_; 
    831966 
    832967    # formulate the command 
    833     $cmd = ""; 
     968    my $cmd = ""; 
    834969    if ($timeout) {$cmd = "ulimit -t $timeout;";} 
    835970    $cmd .= "rtftohtml"; 
     
    9031038        my $line; 
    9041039        while ($line=<TOC>) { 
    905         $line =~ s@</body></html>$@@ ; # only last line has this 
     1040        $line =~ s@</body></html>$@@i ; # only last line has this 
    9061041        # make link relative 
    907         $line =~ s@href=\"[^\#]+@href=\"@; 
     1042        $line =~ s@href=\"[^\#]+@href=\"@i; 
    9081043        print HTML $line; 
    9091044        } 
     
    9491084    my ($dirname, $input_filename, $output_filestem) = @_; 
    9501085 
    951     $cmd = ""; 
     1086    my $cmd = ""; 
    9521087    if ($timeout) {$cmd = "ulimit -t $timeout;";} 
    9531088    $cmd .= "perl -S pdftohtml.pl -zoom $pdf_zoom"; 
     
    10211156    } 
    10221157 
    1023     $cmd = ""; 
     1158    my $cmd = ""; 
    10241159    if ($timeout) {$cmd = "ulimit -t $timeout;";} 
    10251160    $output_type =~ s/.*\_(.*)/$1/i; 
     
    13161451 
    13171452sub any_to_html { 
    1318     ($input_filename, $output_filestem) = @_; 
     1453    my ($input_filename, $output_filestem) = @_; 
    13191454 
    13201455    # First generate a text file 
     
    13541489 
    13551490sub any_to_text { 
    1356     ($input_filename, $output_filestem) = @_; 
     1491    my ($input_filename, $output_filestem) = @_; 
    13571492 
    13581493    if (!$use_strings) {